1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35 */
36
37/*
38 * External virtual filesystem routines
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_subr.c 289513 2015-10-18 14:41:38Z trasz $");
43
44#include "opt_compat.h"
45#include "opt_ddb.h"
46#include "opt_watchdog.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/condvar.h>
53#include <sys/conf.h>
54#include <sys/dirent.h>
55#include <sys/event.h>
56#include <sys/eventhandler.h>
57#include <sys/extattr.h>
58#include <sys/file.h>
59#include <sys/fcntl.h>
60#include <sys/jail.h>
61#include <sys/kdb.h>
62#include <sys/kernel.h>
63#include <sys/kthread.h>
64#include <sys/lockf.h>
65#include <sys/malloc.h>
66#include <sys/mount.h>
67#include <sys/namei.h>
68#include <sys/pctrie.h>
69#include <sys/priv.h>
70#include <sys/reboot.h>
71#include <sys/rwlock.h>
72#include <sys/sched.h>
73#include <sys/sleepqueue.h>
74#include <sys/smp.h>
75#include <sys/stat.h>
76#include <sys/sysctl.h>
77#include <sys/syslog.h>
78#include <sys/vmmeter.h>
79#include <sys/vnode.h>
80#include <sys/watchdog.h>
81
82#include <machine/stdarg.h>
83
84#include <security/mac/mac_framework.h>
85
86#include <vm/vm.h>
87#include <vm/vm_object.h>
88#include <vm/vm_extern.h>
89#include <vm/pmap.h>
90#include <vm/vm_map.h>
91#include <vm/vm_page.h>
92#include <vm/vm_kern.h>
93#include <vm/uma.h>
94
95#ifdef DDB
96#include <ddb/ddb.h>
97#endif
98
99static void	delmntque(struct vnode *vp);
100static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
101		    int slpflag, int slptimeo);
102static void	syncer_shutdown(void *arg, int howto);
103static int	vtryrecycle(struct vnode *vp);
104static void	v_incr_usecount(struct vnode *);
105static void	v_decr_usecount(struct vnode *);
106static void	v_decr_useonly(struct vnode *);
107static void	v_upgrade_usecount(struct vnode *);
108static void	vnlru_free(int);
109static void	vgonel(struct vnode *);
110static void	vfs_knllock(void *arg);
111static void	vfs_knlunlock(void *arg);
112static void	vfs_knl_assert_locked(void *arg);
113static void	vfs_knl_assert_unlocked(void *arg);
114static void	destroy_vpollinfo(struct vpollinfo *vi);
115
116/*
117 * Number of vnodes in existence.  Increased whenever getnewvnode()
118 * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
119 */
120static unsigned long	numvnodes;
121
122SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
123    "Number of vnodes in existence");
124
125static u_long vnodes_created;
126SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
127    0, "Number of vnodes created by getnewvnode");
128
129/*
130 * Conversion tables for translating vnode types to inode formats
131 * and back.
132 */
133enum vtype iftovt_tab[16] = {
134	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
135	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
136};
137int vttoif_tab[10] = {
138	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
139	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
140};
141
142/*
143 * List of vnodes that are ready for recycling.
144 */
145static TAILQ_HEAD(freelst, vnode) vnode_free_list;
146
147/*
148 * Free vnode target.  Free vnodes may simply be files which have been stat'd
149 * but not read.  This is somewhat common, and a small cache of such files
150 * should be kept to avoid recreation costs.
151 */
152static u_long wantfreevnodes;
153SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, "");
154/* Number of vnodes in the free list. */
155static u_long freevnodes;
156SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0,
157    "Number of vnodes in the free list");
158
159static int vlru_allow_cache_src;
160SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
161    &vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
162
163static u_long recycles_count;
164SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
165    "Number of vnodes recycled to avoid exceeding kern.maxvnodes");
166
167/*
168 * Various variables used for debugging the new implementation of
169 * reassignbuf().
170 * XXX these are probably of (very) limited utility now.
171 */
172static int reassignbufcalls;
173SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
174    "Number of calls to reassignbuf");
175
176static u_long free_owe_inact;
177SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0,
178    "Number of times free vnodes kept on active list due to VFS "
179    "owing inactivation");
180
181/*
182 * Cache for the mount type id assigned to NFS.  This is used for
183 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
184 */
185int	nfs_mount_type = -1;
186
187/* To keep more than one thread at a time from running vfs_getnewfsid */
188static struct mtx mntid_mtx;
189
190/*
191 * Lock for any access to the following:
192 *	vnode_free_list
193 *	numvnodes
194 *	freevnodes
195 */
196static struct mtx vnode_free_list_mtx;
197
198/* Publicly exported FS */
199struct nfs_public nfs_pub;
200
201static uma_zone_t buf_trie_zone;
202
203/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
204static uma_zone_t vnode_zone;
205static uma_zone_t vnodepoll_zone;
206
207/*
208 * The workitem queue.
209 *
210 * It is useful to delay writes of file data and filesystem metadata
211 * for tens of seconds so that quickly created and deleted files need
212 * not waste disk bandwidth being created and removed. To realize this,
213 * we append vnodes to a "workitem" queue. When running with a soft
214 * updates implementation, most pending metadata dependencies should
215 * not wait for more than a few seconds. Thus, metadata updates on
216 * mounted block devices are delayed only about half as long as file
217 * data. Similarly, directory updates are more critical, so they are
218 * delayed only about a third as long as file data. Thus, there are
219 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
220 * one each second (driven off the filesystem syncer process). The
221 * syncer_delayno variable indicates the next queue that is to be processed.
222 * Items that need to be processed soon are placed in this queue:
223 *
224 *	syncer_workitem_pending[syncer_delayno]
225 *
226 * A delay of fifteen seconds is done by placing the request fifteen
227 * entries later in the queue:
228 *
229 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
230 *
231 */
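/*
 * For example, assuming the default SYNCER_MAXDELAY of 32, the hashinit()
 * call in vntblinit() below should set syncer_mask to 31 and leave
 * syncer_maxdelay at 32, so a 15 second delay lands in slot
 * (syncer_delayno + 15) & 31 and the queues wrap around every 32 seconds.
 */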
232static int syncer_delayno;
233static long syncer_mask;
234LIST_HEAD(synclist, bufobj);
235static struct synclist *syncer_workitem_pending;
236/*
237 * The sync_mtx protects:
238 *	bo->bo_synclist
239 *	sync_vnode_count
240 *	syncer_delayno
241 *	syncer_state
242 *	syncer_workitem_pending
243 *	syncer_worklist_len
244 *	rushjob
245 */
246static struct mtx sync_mtx;
247static struct cv sync_wakeup;
248
249#define SYNCER_MAXDELAY		32
250static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
251static int syncdelay = 30;		/* max time to delay syncing data */
252static int filedelay = 30;		/* time to delay syncing files */
253SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
254    "Time to delay syncing files (in seconds)");
255static int dirdelay = 29;		/* time to delay syncing directories */
256SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
257    "Time to delay syncing directories (in seconds)");
258static int metadelay = 28;		/* time to delay syncing metadata */
259SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
260    "Time to delay syncing metadata (in seconds)");
261static int rushjob;		/* number of slots to run ASAP */
262static int stat_rush_requests;	/* number of times I/O speeded up */
263SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
264    "Number of times I/O speeded up (rush requests)");
265
266/*
267 * When shutting down the syncer, run it at four times normal speed.
268 */
269#define SYNCER_SHUTDOWN_SPEEDUP		4
270static int sync_vnode_count;
271static int syncer_worklist_len;
272static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
273    syncer_state;
274
275/*
276 * Number of vnodes we want to exist at any one time.  This is mostly used
277 * to size hash tables in vnode-related code.  It is normally not used in
278 * getnewvnode(), as wantfreevnodes is normally nonzero.
279 *
280 * XXX desiredvnodes is historical cruft and should not exist.
281 */
282int desiredvnodes;
283
284static int
285sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
286{
287	int error, old_desiredvnodes;
288
289	old_desiredvnodes = desiredvnodes;
290	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
291		return (error);
292	if (old_desiredvnodes != desiredvnodes) {
293		vfs_hash_changesize(desiredvnodes);
294		cache_changesize(desiredvnodes);
295	}
296	return (0);
297}
298
299SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
300    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
301    sysctl_update_desiredvnodes, "I", "Maximum number of vnodes");
302SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
303    &wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
304static int vnlru_nowhere;
305SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
306    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
307
308/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
309static int vnsz2log;
310
311/*
312 * Support for the bufobj clean & dirty pctrie.
313 */
314static void *
315buf_trie_alloc(struct pctrie *ptree)
316{
317
318	return (uma_zalloc(buf_trie_zone, M_NOWAIT));
319}
320
321static void
322buf_trie_free(struct pctrie *ptree, void *node)
323{
324
325	uma_zfree(buf_trie_zone, node);
326}
327PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
328
329/*
330 * Initialize the vnode management data structures.
331 *
332 * Reevaluate the following cap on the number of vnodes after the physical
333 * memory size exceeds 512GB.  In the limit, as the physical memory size
334 * grows, the ratio of physical pages to vnodes approaches sixteen to one.
335 */
336#ifndef	MAXVNODES_MAX
337#define	MAXVNODES_MAX	(512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
338#endif
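/*
 * For example, assuming a 4 KB PAGE_SIZE, the cap above works out to
 * 512 * (2^30 / 4096 / 16) = 512 * 16384 = 8388608 vnodes, i.e. one
 * vnode for every sixteen physical pages of a 512 GB machine.
 */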
339static void
340vntblinit(void *dummy __unused)
341{
342	u_int i;
343	int physvnodes, virtvnodes;
344
345	/*
346	 * Desiredvnodes is a function of the physical memory size and the
347	 * kernel's heap size.  Generally speaking, it scales with the
348	 * physical memory size.  The ratio of desiredvnodes to physical pages
349	 * is one to four until desiredvnodes exceeds 98,304.  Thereafter, the
350	 * marginal ratio of desiredvnodes to physical pages is one to
351	 * sixteen.  However, desiredvnodes is limited by the kernel's heap
352	 * size.  The memory required by desiredvnodes vnodes and vm objects
353	 * may not exceed one seventh of the kernel's heap size.
354	 */
355	physvnodes = maxproc + cnt.v_page_count / 16 + 3 * min(98304 * 4,
356	    cnt.v_page_count) / 16;
357	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
358	    sizeof(struct vnode)));
359	desiredvnodes = min(physvnodes, virtvnodes);
360	if (desiredvnodes > MAXVNODES_MAX) {
361		if (bootverbose)
362			printf("Reducing kern.maxvnodes %d -> %d\n",
363			    desiredvnodes, MAXVNODES_MAX);
364		desiredvnodes = MAXVNODES_MAX;
365	}
366	wantfreevnodes = desiredvnodes / 4;
367	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
368	TAILQ_INIT(&vnode_free_list);
369	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
370	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
371	    NULL, NULL, UMA_ALIGN_PTR, 0);
372	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
373	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
374	/*
375	 * Preallocate enough nodes to support one node per buf so that
376	 * an insert cannot fail.  reassignbuf() callers cannot tolerate
377	 * an insertion failure.
378	 */
379	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
380	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
381	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
382	uma_prealloc(buf_trie_zone, nbuf);
383	/*
384	 * Initialize the filesystem syncer.
385	 */
386	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
387	    &syncer_mask);
388	syncer_maxdelay = syncer_mask + 1;
389	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
390	cv_init(&sync_wakeup, "syncer");
391	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
392		vnsz2log++;
393	vnsz2log--;
394}
395SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
396
397
398/*
399 * Mark a mount point as busy. Used to synchronize access and to delay
400 * unmounting. Note that mountlist_mtx is not released on failure.
401 *
402 * vfs_busy() is a custom lock, it can block the caller.
403 * vfs_busy() only sleeps if the unmount is active on the mount point.
404 * For a mountpoint mp, vfs_busy-enforced lock is before lock of any
405 * vnode belonging to mp.
406 *
407 * Lookup uses vfs_busy() to traverse mount points.
408 * root fs			var fs
409 * / vnode lock		A	/ vnode lock (/var)		D
410 * /var vnode lock	B	/log vnode lock(/var/log)	E
411 * vfs_busy lock	C	vfs_busy lock			F
412 *
413 * Within each file system, the lock order is C->A->B and F->D->E.
414 *
415 * When traversing across mounts, the system follows that lock order:
416 *
417 *        C->A->B
418 *              |
419 *              +->F->D->E
420 *
421 * The lookup() performed for namei("/var") illustrates the process:
422 *  VOP_LOOKUP() obtains B while A is held
423 *  vfs_busy() obtains a shared lock on F while A and B are held
424 *  vput() releases lock on B
425 *  vput() releases lock on A
426 *  VFS_ROOT() obtains lock on D while shared lock on F is held
427 *  vfs_unbusy() releases shared lock on F
428 *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
429 *    An attempt to lock A (instead of vp_crossmp) while D is held
430 *    would violate the global order, causing deadlocks.
431 *
432 * dounmount() locks B while F is drained.
433 */
434int
435vfs_busy(struct mount *mp, int flags)
436{
437
438	MPASS((flags & ~MBF_MASK) == 0);
439	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
440
441	MNT_ILOCK(mp);
442	MNT_REF(mp);
443	/*
444	 * If the mount point is currently being unmounted, sleep until the
445	 * mount point's fate is decided.  If the thread doing the unmounting
446	 * fails, it will clear the MNTK_UNMOUNT flag before waking us up,
447	 * indicating that this mount point has survived the unmount attempt
448	 * and vfs_busy should retry.  Otherwise the unmounting thread will set
449	 * the MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating that
450	 * the mount point is about to be really destroyed.  vfs_busy needs to
451	 * release its reference on the mount point in this case and return
452	 * with ENOENT, telling the caller that the mount point it tried to
453	 * busy is no longer valid.
454	 */
455	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
456		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
457			MNT_REL(mp);
458			MNT_IUNLOCK(mp);
459			CTR1(KTR_VFS, "%s: failed busying before sleeping",
460			    __func__);
461			return (ENOENT);
462		}
463		if (flags & MBF_MNTLSTLOCK)
464			mtx_unlock(&mountlist_mtx);
465		mp->mnt_kern_flag |= MNTK_MWAIT;
466		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
467		if (flags & MBF_MNTLSTLOCK)
468			mtx_lock(&mountlist_mtx);
469		MNT_ILOCK(mp);
470	}
471	if (flags & MBF_MNTLSTLOCK)
472		mtx_unlock(&mountlist_mtx);
473	mp->mnt_lockref++;
474	MNT_IUNLOCK(mp);
475	return (0);
476}
477
478/*
479 * Free a busy filesystem.
480 */
481void
482vfs_unbusy(struct mount *mp)
483{
484
485	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
486	MNT_ILOCK(mp);
487	MNT_REL(mp);
488	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
489	mp->mnt_lockref--;
490	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
491		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
492		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
493		mp->mnt_kern_flag &= ~MNTK_DRAINING;
494		wakeup(&mp->mnt_lockref);
495	}
496	MNT_IUNLOCK(mp);
497}
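/*
 * Illustrative sketch (not part of the original file): the canonical way to
 * walk the mount list while keeping each mount point busied, mirroring the
 * loop in vnlru_proc() below.  The callback name "examine_mount" is a
 * hypothetical placeholder.
 */
#if 0
static void
example_foreach_mount(void (*examine_mount)(struct mount *))
{
	struct mount *mp, *nmp;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
			/* Unmount in progress; mountlist_mtx is still held. */
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* vfs_busy() dropped mountlist_mtx on success. */
		examine_mount(mp);
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);
}
#endif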
498
499/*
500 * Lookup a mount point by filesystem identifier.
501 */
502struct mount *
503vfs_getvfs(fsid_t *fsid)
504{
505	struct mount *mp;
506
507	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
508	mtx_lock(&mountlist_mtx);
509	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
510		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
511		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
512			vfs_ref(mp);
513			mtx_unlock(&mountlist_mtx);
514			return (mp);
515		}
516	}
517	mtx_unlock(&mountlist_mtx);
518	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
519	return ((struct mount *) 0);
520}
521
522/*
523 * Lookup a mount point by filesystem identifier, busying it before
524 * returning.
525 *
526 * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
527 * cache for popular filesystem identifiers.  The cache is lockless, using
528 * the fact that struct mounts are never freed.  In the worst case we may
529 * get a pointer to an unmounted or even a different filesystem, so we
530 * have to check what we got, and take the slow path if so.
531 */
532struct mount *
533vfs_busyfs(fsid_t *fsid)
534{
535#define	FSID_CACHE_SIZE	256
536	typedef struct mount * volatile vmp_t;
537	static vmp_t cache[FSID_CACHE_SIZE];
538	struct mount *mp;
539	int error;
540	uint32_t hash;
541
542	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
543	hash = fsid->val[0] ^ fsid->val[1];
544	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
545	mp = cache[hash];
546	if (mp == NULL ||
547	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
548	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
549		goto slow;
550	if (vfs_busy(mp, 0) != 0) {
551		cache[hash] = NULL;
552		goto slow;
553	}
554	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
555	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
556		return (mp);
557	else
558		vfs_unbusy(mp);
559
560slow:
561	mtx_lock(&mountlist_mtx);
562	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
563		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
564		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
565			error = vfs_busy(mp, MBF_MNTLSTLOCK);
566			if (error) {
567				cache[hash] = NULL;
568				mtx_unlock(&mountlist_mtx);
569				return (NULL);
570			}
571			cache[hash] = mp;
572			return (mp);
573		}
574	}
575	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
576	mtx_unlock(&mountlist_mtx);
577	return ((struct mount *) 0);
578}
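/*
 * Illustrative sketch (not part of the original file): typical use of
 * vfs_busyfs() by a consumer handed an fsid (e.g. extracted from a file
 * handle) that needs the mount point pinned while operating on it.
 */
#if 0
static int
example_with_busied_fs(fsid_t *fsid)
{
	struct mount *mp;

	mp = vfs_busyfs(fsid);
	if (mp == NULL)
		return (ESTALE);
	/* ... the filesystem cannot be unmounted until vfs_unbusy() ... */
	vfs_unbusy(mp);
	return (0);
}
#endif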
579
580/*
581 * Check if a user can access privileged mount options.
582 */
583int
584vfs_suser(struct mount *mp, struct thread *td)
585{
586	int error;
587
588	/*
589	 * If the thread is jailed, but this is not a jail-friendly file
590	 * system, deny immediately.
591	 */
592	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
593		return (EPERM);
594
595	/*
596	 * If the file system was mounted outside the jail of the calling
597	 * thread, deny immediately.
598	 */
599	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
600		return (EPERM);
601
602	/*
603	 * If the file system supports delegated administration, we don't check
604	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
605	 * by the file system itself.
606	 * If this is not the user that did the original mount, we check for
607	 * the PRIV_VFS_MOUNT_OWNER privilege.
608	 */
609	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
610	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
611		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
612			return (error);
613	}
614	return (0);
615}
616
617/*
618 * Get a new unique fsid.  Try to make its val[0] unique, since this value
619 * will be used to create fake device numbers for stat().  Also try (but
620 * not so hard) to make its val[0] unique mod 2^16, as some emulators only
621 * support 16-bit device numbers.  We end up with unique val[0]'s for the
622 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
623 *
624 * Keep in mind that several mounts may be running in parallel.  Starting
625 * the search one past where the previous search terminated is both a
626 * micro-optimization and a defense against returning the same fsid to
627 * different mounts.
628 */
629void
630vfs_getnewfsid(struct mount *mp)
631{
632	static uint16_t mntid_base;
633	struct mount *nmp;
634	fsid_t tfsid;
635	int mtype;
636
637	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
638	mtx_lock(&mntid_mtx);
639	mtype = mp->mnt_vfc->vfc_typenum;
640	tfsid.val[1] = mtype;
641	mtype = (mtype & 0xFF) << 24;
642	for (;;) {
643		tfsid.val[0] = makedev(255,
644		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
645		mntid_base++;
646		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
647			break;
648		vfs_rel(nmp);
649	}
650	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
651	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
652	mtx_unlock(&mntid_mtx);
653}
654
655/*
656 * Knob to control the precision of file timestamps:
657 *
658 *   0 = seconds only; nanoseconds zeroed.
659 *   1 = seconds and nanoseconds, accurate within 1/HZ.
660 *   2 = seconds and nanoseconds, truncated to microseconds.
661 * >=3 = seconds and nanoseconds, maximum precision.
662 */
663enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
664
665static int timestamp_precision = TSP_USEC;
666SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
667    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
668    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to us, "
669    "3+: sec + ns (max. precision))");
670
671/*
672 * Get a current timestamp.
673 */
674void
675vfs_timestamp(struct timespec *tsp)
676{
677	struct timeval tv;
678
679	switch (timestamp_precision) {
680	case TSP_SEC:
681		tsp->tv_sec = time_second;
682		tsp->tv_nsec = 0;
683		break;
684	case TSP_HZ:
685		getnanotime(tsp);
686		break;
687	case TSP_USEC:
688		microtime(&tv);
689		TIMEVAL_TO_TIMESPEC(&tv, tsp);
690		break;
691	case TSP_NSEC:
692	default:
693		nanotime(tsp);
694		break;
695	}
696}
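/*
 * Illustrative sketch (not part of the original file): how a filesystem
 * typically consumes vfs_timestamp() when it latches new times for an
 * inode.  The "struct example_inode" type is a hypothetical placeholder.
 */
#if 0
struct example_inode {
	struct timespec	i_mtime;
	struct timespec	i_ctime;
};

static void
example_mark_modified(struct example_inode *ip)
{
	struct timespec ts;

	vfs_timestamp(&ts);
	/* Precision of ts follows the vfs.timestamp_precision knob. */
	ip->i_mtime = ts;
	ip->i_ctime = ts;
}
#endif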
697
698/*
699 * Set vnode attributes to VNOVAL
700 */
701void
702vattr_null(struct vattr *vap)
703{
704
705	vap->va_type = VNON;
706	vap->va_size = VNOVAL;
707	vap->va_bytes = VNOVAL;
708	vap->va_mode = VNOVAL;
709	vap->va_nlink = VNOVAL;
710	vap->va_uid = VNOVAL;
711	vap->va_gid = VNOVAL;
712	vap->va_fsid = VNOVAL;
713	vap->va_fileid = VNOVAL;
714	vap->va_blocksize = VNOVAL;
715	vap->va_rdev = VNOVAL;
716	vap->va_atime.tv_sec = VNOVAL;
717	vap->va_atime.tv_nsec = VNOVAL;
718	vap->va_mtime.tv_sec = VNOVAL;
719	vap->va_mtime.tv_nsec = VNOVAL;
720	vap->va_ctime.tv_sec = VNOVAL;
721	vap->va_ctime.tv_nsec = VNOVAL;
722	vap->va_birthtime.tv_sec = VNOVAL;
723	vap->va_birthtime.tv_nsec = VNOVAL;
724	vap->va_flags = VNOVAL;
725	vap->va_gen = VNOVAL;
726	vap->va_vaflags = 0;
727}
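/*
 * Illustrative sketch (not part of the original file): the usual pattern is
 * to clear all attributes to VNOVAL with VATTR_NULL() (a wrapper around
 * vattr_null()) and then fill in only the field being changed, so that
 * VOP_SETATTR() ignores the rest.  The helper below is hypothetical.
 */
#if 0
static int
example_set_size(struct vnode *vp, off_t length, struct ucred *cred)
{
	struct vattr va;

	ASSERT_VOP_ELOCKED(vp, "example_set_size");
	VATTR_NULL(&va);
	va.va_size = length;		/* The only attribute being changed. */
	return (VOP_SETATTR(vp, &va, cred));
}
#endif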
728
729/*
730 * This routine is called when we have too many vnodes.  It attempts
731 * to free <count> vnodes and will potentially free vnodes that still
732 * have VM backing store (VM backing store is typically the cause
733 * of a vnode blowout so we want to do this).  Therefore, this operation
734 * is not considered cheap.
735 *
736 * A number of conditions may prevent a vnode from being reclaimed.
737 * The buffer cache may have references on the vnode, a directory
738 * vnode may still have references due to the namei cache representing
739 * underlying files, or the vnode may be in active use.  It is not
740 * desirable to reuse such vnodes.  These conditions may cause the
741 * number of vnodes to reach some minimum value regardless of what
742 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
743 */
744static int
745vlrureclaim(struct mount *mp)
746{
747	struct vnode *vp;
748	int done;
749	int trigger;
750	int usevnodes;
751	int count;
752
753	/*
754	 * Calculate the trigger point, don't allow user
755	 * screwups to blow us up.   This prevents us from
756	 * recycling vnodes with lots of resident pages.  We
757	 * aren't trying to free memory, we are trying to
758	 * free vnodes.
759	 */
760	usevnodes = desiredvnodes;
761	if (usevnodes <= 0)
762		usevnodes = 1;
763	trigger = cnt.v_page_count * 2 / usevnodes;
764	done = 0;
765	vn_start_write(NULL, &mp, V_WAIT);
766	MNT_ILOCK(mp);
767	count = mp->mnt_nvnodelistsize / 10 + 1;
768	while (count != 0) {
769		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
770		while (vp != NULL && vp->v_type == VMARKER)
771			vp = TAILQ_NEXT(vp, v_nmntvnodes);
772		if (vp == NULL)
773			break;
774		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
775		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
776		--count;
777		if (!VI_TRYLOCK(vp))
778			goto next_iter;
779		/*
780		 * If it's been deconstructed already, it's still
781		 * referenced, or it exceeds the trigger, skip it.
782		 */
783		if (vp->v_usecount ||
784		    (!vlru_allow_cache_src &&
785			!LIST_EMPTY(&(vp)->v_cache_src)) ||
786		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
787		    vp->v_object->resident_page_count > trigger)) {
788			VI_UNLOCK(vp);
789			goto next_iter;
790		}
791		MNT_IUNLOCK(mp);
792		vholdl(vp);
793		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
794			vdrop(vp);
795			goto next_iter_mntunlocked;
796		}
797		VI_LOCK(vp);
798		/*
799		 * v_usecount may have been bumped after VOP_LOCK() dropped
800		 * the vnode interlock and before it was locked again.
801		 *
802		 * It is not necessary to recheck VI_DOOMED because it can
803		 * only be set by another thread that holds both the vnode
804		 * lock and vnode interlock.  If another thread has the
805		 * vnode lock before we get to VOP_LOCK() and obtains the
806		 * vnode interlock after VOP_LOCK() drops the vnode
807		 * interlock, the other thread will be unable to drop the
808		 * vnode lock before our VOP_LOCK() call fails.
809		 */
810		if (vp->v_usecount ||
811		    (!vlru_allow_cache_src &&
812			!LIST_EMPTY(&(vp)->v_cache_src)) ||
813		    (vp->v_object != NULL &&
814		    vp->v_object->resident_page_count > trigger)) {
815			VOP_UNLOCK(vp, LK_INTERLOCK);
816			vdrop(vp);
817			goto next_iter_mntunlocked;
818		}
819		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
820		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
821		atomic_add_long(&recycles_count, 1);
822		vgonel(vp);
823		VOP_UNLOCK(vp, 0);
824		vdropl(vp);
825		done++;
826next_iter_mntunlocked:
827		if (!should_yield())
828			goto relock_mnt;
829		goto yield;
830next_iter:
831		if (!should_yield())
832			continue;
833		MNT_IUNLOCK(mp);
834yield:
835		kern_yield(PRI_USER);
836relock_mnt:
837		MNT_ILOCK(mp);
838	}
839	MNT_IUNLOCK(mp);
840	vn_finished_write(mp);
841	return (done);
842}
843
844/*
845 * Attempt to keep the free list at wantfreevnodes length.
846 */
847static void
848vnlru_free(int count)
849{
850	struct vnode *vp;
851
852	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
853	for (; count > 0; count--) {
854		vp = TAILQ_FIRST(&vnode_free_list);
855		/*
856		 * The list can be modified while the free_list_mtx
857		 * has been dropped and vp could be NULL here.
858		 */
859		if (!vp)
860			break;
861		VNASSERT(vp->v_op != NULL, vp,
862		    ("vnlru_free: vnode already reclaimed."));
863		KASSERT((vp->v_iflag & VI_FREE) != 0,
864		    ("Removing vnode not on freelist"));
865		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
866		    ("Mangling active vnode"));
867		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
868		/*
869		 * Don't recycle if we can't get the interlock.
870		 */
871		if (!VI_TRYLOCK(vp)) {
872			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
873			continue;
874		}
875		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
876		    vp, ("vp inconsistent on freelist"));
877
878		/*
879		 * The clear of VI_FREE prevents activation of the
880		 * vnode.  There is no sense in putting the vnode on
881		 * the mount point active list, only to remove it
882		 * later during recycling.  Inline the relevant part
883		 * of vholdl(), to avoid triggering assertions or
884		 * activating.
885		 */
886		freevnodes--;
887		vp->v_iflag &= ~VI_FREE;
888		vp->v_holdcnt++;
889
890		mtx_unlock(&vnode_free_list_mtx);
891		VI_UNLOCK(vp);
892		vtryrecycle(vp);
893		 * If the recycle succeeded, this vdrop will actually free
894		 * the vnode.  If not, it will simply place it back on
895		 * the vnode.  If not it will simply place it back on
896		 * the free list.
897		 */
898		vdrop(vp);
899		mtx_lock(&vnode_free_list_mtx);
900	}
901}
902/*
903 * Attempt to recycle vnodes in a context that is always safe to block.
904 * Calling vlrurecycle() from the bowels of filesystem code has some
905 * interesting deadlock problems.
906 */
907static struct proc *vnlruproc;
908static int vnlruproc_sig;
909
910static void
911vnlru_proc(void)
912{
913	struct mount *mp, *nmp;
914	int done;
915	struct proc *p = vnlruproc;
916
917	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
918	    SHUTDOWN_PRI_FIRST);
919
920	for (;;) {
921		kproc_suspend_check(p);
922		mtx_lock(&vnode_free_list_mtx);
923		if (freevnodes > wantfreevnodes)
924			vnlru_free(freevnodes - wantfreevnodes);
925		if (numvnodes <= desiredvnodes * 9 / 10) {
926			vnlruproc_sig = 0;
927			wakeup(&vnlruproc_sig);
928			msleep(vnlruproc, &vnode_free_list_mtx,
929			    PVFS|PDROP, "vlruwt", hz);
930			continue;
931		}
932		mtx_unlock(&vnode_free_list_mtx);
933		done = 0;
934		mtx_lock(&mountlist_mtx);
935		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
936			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
937				nmp = TAILQ_NEXT(mp, mnt_list);
938				continue;
939			}
940			done += vlrureclaim(mp);
941			mtx_lock(&mountlist_mtx);
942			nmp = TAILQ_NEXT(mp, mnt_list);
943			vfs_unbusy(mp);
944		}
945		mtx_unlock(&mountlist_mtx);
946		if (done == 0) {
947#if 0
948			/* These messages are temporary debugging aids */
949			if (vnlru_nowhere < 5)
950				printf("vnlru process getting nowhere..\n");
951			else if (vnlru_nowhere == 5)
952				printf("vnlru process messages stopped.\n");
953#endif
954			vnlru_nowhere++;
955			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
956		} else
957			kern_yield(PRI_USER);
958	}
959}
960
961static struct kproc_desc vnlru_kp = {
962	"vnlru",
963	vnlru_proc,
964	&vnlruproc
965};
966SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
967    &vnlru_kp);
968
969/*
970 * Routines having to do with the management of the vnode table.
971 */
972
973/*
974 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
975 * before we actually vgone().  This function must be called with the vnode
976 * held to prevent the vnode from being returned to the free list midway
977 * through vgone().
978 */
979static int
980vtryrecycle(struct vnode *vp)
981{
982	struct mount *vnmp;
983
984	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
985	VNASSERT(vp->v_holdcnt, vp,
986	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
987	/*
988	 * This vnode may be found and locked via some other list; if so,
989	 * we can't recycle it yet.
990	 */
991	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
992		CTR2(KTR_VFS,
993		    "%s: impossible to recycle, vp %p lock is already held",
994		    __func__, vp);
995		return (EWOULDBLOCK);
996	}
997	/*
998	 * Don't recycle if its filesystem is being suspended.
999	 */
1000	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
1001		VOP_UNLOCK(vp, 0);
1002		CTR2(KTR_VFS,
1003		    "%s: impossible to recycle, cannot start the write for %p",
1004		    __func__, vp);
1005		return (EBUSY);
1006	}
1007	/*
1008	 * If we got this far, we need to acquire the interlock and see if
1009	 * anyone picked up this vnode from another list.  If not, we will
1010	 * mark it with DOOMED via vgonel() so that anyone who does find it
1011	 * will skip over it.
1012	 */
1013	VI_LOCK(vp);
1014	if (vp->v_usecount) {
1015		VOP_UNLOCK(vp, LK_INTERLOCK);
1016		vn_finished_write(vnmp);
1017		CTR2(KTR_VFS,
1018		    "%s: impossible to recycle, %p is already referenced",
1019		    __func__, vp);
1020		return (EBUSY);
1021	}
1022	if ((vp->v_iflag & VI_DOOMED) == 0) {
1023		atomic_add_long(&recycles_count, 1);
1024		vgonel(vp);
1025	}
1026	VOP_UNLOCK(vp, LK_INTERLOCK);
1027	vn_finished_write(vnmp);
1028	return (0);
1029}
1030
1031/*
1032 * Wait for available vnodes.
1033 */
1034static int
1035getnewvnode_wait(int suspended)
1036{
1037
1038	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
1039	if (numvnodes > desiredvnodes) {
1040		if (suspended) {
1041			/*
1042			 * The file system is being suspended; we cannot risk a
1043			 * deadlock here, so allocate a new vnode anyway.
1044			 */
1045			if (freevnodes > wantfreevnodes)
1046				vnlru_free(freevnodes - wantfreevnodes);
1047			return (0);
1048		}
1049		if (vnlruproc_sig == 0) {
1050			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
1051			wakeup(vnlruproc);
1052		}
1053		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
1054		    "vlruwk", hz);
1055	}
1056	return (numvnodes > desiredvnodes ? ENFILE : 0);
1057}
1058
1059void
1060getnewvnode_reserve(u_int count)
1061{
1062	struct thread *td;
1063
1064	td = curthread;
1065	/* First try to be quick and racy. */
1066	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
1067		td->td_vp_reserv += count;
1068		return;
1069	} else
1070		atomic_subtract_long(&numvnodes, count);
1071
1072	mtx_lock(&vnode_free_list_mtx);
1073	while (count > 0) {
1074		if (getnewvnode_wait(0) == 0) {
1075			count--;
1076			td->td_vp_reserv++;
1077			atomic_add_long(&numvnodes, 1);
1078		}
1079	}
1080	mtx_unlock(&vnode_free_list_mtx);
1081}
1082
1083void
1084getnewvnode_drop_reserve(void)
1085{
1086	struct thread *td;
1087
1088	td = curthread;
1089	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
1090	td->td_vp_reserv = 0;
1091}
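/*
 * Illustrative sketch (not part of the original file): a caller that must
 * allocate a vnode at a point where sleeping in getnewvnode_wait() would be
 * unsafe (e.g. with filesystem-internal locks held, as ZFS does) can bracket
 * the allocation with a reservation.  The "example_" names are hypothetical.
 */
#if 0
static int
example_alloc_reserved(struct mount *mp, struct vop_vector *vops,
    struct vnode **vpp)
{
	int error;

	getnewvnode_reserve(1);
	/* ... acquire locks that may not be held across a sleep ... */
	error = getnewvnode("example", mp, vops, vpp);
	/* ... release those locks ... */
	getnewvnode_drop_reserve();
	return (error);
}
#endif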
1092
1093/*
1094 * Return the next vnode from the free list.
1095 */
1096int
1097getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
1098    struct vnode **vpp)
1099{
1100	struct vnode *vp;
1101	struct bufobj *bo;
1102	struct thread *td;
1103	int error;
1104
1105	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1106	vp = NULL;
1107	td = curthread;
1108	if (td->td_vp_reserv > 0) {
1109		td->td_vp_reserv -= 1;
1110		goto alloc;
1111	}
1112	mtx_lock(&vnode_free_list_mtx);
1113	/*
1114	 * Lend our context to reclaim vnodes if they've exceeded the max.
1115	 */
1116	if (freevnodes > wantfreevnodes)
1117		vnlru_free(1);
1118	error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
1119	    MNTK_SUSPEND));
1120#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
1121	if (error != 0) {
1122		mtx_unlock(&vnode_free_list_mtx);
1123		return (error);
1124	}
1125#endif
1126	atomic_add_long(&numvnodes, 1);
1127	mtx_unlock(&vnode_free_list_mtx);
1128alloc:
1129	atomic_add_long(&vnodes_created, 1);
1130	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
1131	/*
1132	 * Setup locks.
1133	 */
1134	vp->v_vnlock = &vp->v_lock;
1135	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
1136	/*
1137	 * By default, don't allow shared locks unless filesystems
1138	 * opt-in.
1139	 */
1140	lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE | LK_IS_VNODE);
1141	/*
1142	 * Initialize bufobj.
1143	 */
1144	bo = &vp->v_bufobj;
1145	bo->__bo_vnode = vp;
1146	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
1147	bo->bo_ops = &buf_ops_bio;
1148	bo->bo_private = vp;
1149	TAILQ_INIT(&bo->bo_clean.bv_hd);
1150	TAILQ_INIT(&bo->bo_dirty.bv_hd);
1151	/*
1152	 * Initialize namecache.
1153	 */
1154	LIST_INIT(&vp->v_cache_src);
1155	TAILQ_INIT(&vp->v_cache_dst);
1156	/*
1157	 * Finalize various vnode identity bits.
1158	 */
1159	vp->v_type = VNON;
1160	vp->v_tag = tag;
1161	vp->v_op = vops;
1162	v_incr_usecount(vp);
1163	vp->v_data = NULL;
1164#ifdef MAC
1165	mac_vnode_init(vp);
1166	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1167		mac_vnode_associate_singlelabel(mp, vp);
1168	else if (mp == NULL && vops != &dead_vnodeops)
1169		printf("NULL mp in getnewvnode()\n");
1170#endif
1171	if (mp != NULL) {
1172		bo->bo_bsize = mp->mnt_stat.f_iosize;
1173		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1174			vp->v_vflag |= VV_NOKNOTE;
1175	}
1176	rangelock_init(&vp->v_rl);
1177
1178	/*
1179	 * For the filesystems which do not use vfs_hash_insert(),
1180	 * still initialize v_hash to have vfs_hash_index() useful.
1181	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1182	 * its own hashing.
1183	 */
1184	vp->v_hash = (uintptr_t)vp >> vnsz2log;
1185
1186	*vpp = vp;
1187	return (0);
1188}
1189
1190/*
1191 * Delete from old mount point vnode list, if on one.
1192 */
1193static void
1194delmntque(struct vnode *vp)
1195{
1196	struct mount *mp;
1197	int active;
1198
1199	mp = vp->v_mount;
1200	if (mp == NULL)
1201		return;
1202	MNT_ILOCK(mp);
1203	VI_LOCK(vp);
1204	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1205	    ("Active vnode list size %d > Vnode list size %d",
1206	     mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1207	active = vp->v_iflag & VI_ACTIVE;
1208	vp->v_iflag &= ~VI_ACTIVE;
1209	if (active) {
1210		mtx_lock(&vnode_free_list_mtx);
1211		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1212		mp->mnt_activevnodelistsize--;
1213		mtx_unlock(&vnode_free_list_mtx);
1214	}
1215	vp->v_mount = NULL;
1216	VI_UNLOCK(vp);
1217	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1218		("bad mount point vnode list size"));
1219	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1220	mp->mnt_nvnodelistsize--;
1221	MNT_REL(mp);
1222	MNT_IUNLOCK(mp);
1223}
1224
1225static void
1226insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1227{
1228
1229	vp->v_data = NULL;
1230	vp->v_op = &dead_vnodeops;
1231	vgone(vp);
1232	vput(vp);
1233}
1234
1235/*
1236 * Insert into list of vnodes for the new mount point, if available.
1237 */
1238int
1239insmntque1(struct vnode *vp, struct mount *mp,
1240	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1241{
1242
1243	KASSERT(vp->v_mount == NULL,
1244		("insmntque: vnode already on per mount vnode list"));
1245	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1246	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1247
1248	/*
1249	 * We acquire the vnode interlock early to ensure that the
1250	 * vnode cannot be recycled by another process releasing a
1251	 * holdcnt on it before we get it on both the vnode list
1252	 * and the active vnode list. The mount mutex protects only
1253	 * manipulation of the vnode list and the vnode freelist
1254	 * mutex protects only manipulation of the active vnode list.
1255	 * Hence the need to hold the vnode interlock throughout.
1256	 */
1257	MNT_ILOCK(mp);
1258	VI_LOCK(vp);
1259	if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1260	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1261	    mp->mnt_nvnodelistsize == 0)) &&
1262	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1263		VI_UNLOCK(vp);
1264		MNT_IUNLOCK(mp);
1265		if (dtr != NULL)
1266			dtr(vp, dtr_arg);
1267		return (EBUSY);
1268	}
1269	vp->v_mount = mp;
1270	MNT_REF(mp);
1271	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1272	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1273		("neg mount point vnode list size"));
1274	mp->mnt_nvnodelistsize++;
1275	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1276	    ("Activating already active vnode"));
1277	vp->v_iflag |= VI_ACTIVE;
1278	mtx_lock(&vnode_free_list_mtx);
1279	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1280	mp->mnt_activevnodelistsize++;
1281	mtx_unlock(&vnode_free_list_mtx);
1282	VI_UNLOCK(vp);
1283	MNT_IUNLOCK(mp);
1284	return (0);
1285}
1286
1287int
1288insmntque(struct vnode *vp, struct mount *mp)
1289{
1290
1291	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1292}
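/*
 * Illustrative sketch (not part of the original file): the common shape of a
 * filesystem allocating a fresh vnode and attaching it to its mount point,
 * as per-filesystem VFS_VGET()/create paths do.  The vop vector
 * "example_vnodeops" and the per-vnode data are hypothetical placeholders.
 */
#if 0
extern struct vop_vector example_vnodeops;

static int
example_new_vnode(struct mount *mp, void *data, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	error = getnewvnode("example", mp, &example_vnodeops, &vp);
	if (error != 0)
		return (error);
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	vp->v_data = data;
	vp->v_type = VREG;
	error = insmntque(vp, mp);
	if (error != 0) {
		/* insmntque_stddtr() already vgone'd and vput'd vp. */
		return (error);
	}
	*vpp = vp;
	return (0);
}
#endif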
1293
1294/*
1295 * Flush out and invalidate all buffers associated with a bufobj
1296 * Called with the underlying object locked.
1297 */
1298int
1299bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1300{
1301	int error;
1302
1303	BO_LOCK(bo);
1304	if (flags & V_SAVE) {
1305		error = bufobj_wwait(bo, slpflag, slptimeo);
1306		if (error) {
1307			BO_UNLOCK(bo);
1308			return (error);
1309		}
1310		if (bo->bo_dirty.bv_cnt > 0) {
1311			BO_UNLOCK(bo);
1312			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1313				return (error);
1314			/*
1315			 * XXX We could save a lock/unlock if this was only
1316			 * enabled under INVARIANTS
1317			 */
1318			BO_LOCK(bo);
1319			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1320				panic("vinvalbuf: dirty bufs");
1321		}
1322	}
1323	/*
1324	 * If you alter this loop please notice that interlock is dropped and
1325	 * reacquired in flushbuflist.  Special care is needed to ensure that
1326	 * no race conditions occur from this.
1327	 */
1328	do {
1329		error = flushbuflist(&bo->bo_clean,
1330		    flags, bo, slpflag, slptimeo);
1331		if (error == 0 && !(flags & V_CLEANONLY))
1332			error = flushbuflist(&bo->bo_dirty,
1333			    flags, bo, slpflag, slptimeo);
1334		if (error != 0 && error != EAGAIN) {
1335			BO_UNLOCK(bo);
1336			return (error);
1337		}
1338	} while (error != 0);
1339
1340	/*
1341	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1342	 * have write I/O in-progress but if there is a VM object then the
1343	 * VM object can also have read-I/O in-progress.
1344	 */
1345	do {
1346		bufobj_wwait(bo, 0, 0);
1347		BO_UNLOCK(bo);
1348		if (bo->bo_object != NULL) {
1349			VM_OBJECT_WLOCK(bo->bo_object);
1350			vm_object_pip_wait(bo->bo_object, "bovlbx");
1351			VM_OBJECT_WUNLOCK(bo->bo_object);
1352		}
1353		BO_LOCK(bo);
1354	} while (bo->bo_numoutput > 0);
1355	BO_UNLOCK(bo);
1356
1357	/*
1358	 * Destroy the copy in the VM cache, too.
1359	 */
1360	if (bo->bo_object != NULL &&
1361	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
1362		VM_OBJECT_WLOCK(bo->bo_object);
1363		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1364		    OBJPR_CLEANONLY : 0);
1365		VM_OBJECT_WUNLOCK(bo->bo_object);
1366	}
1367
1368#ifdef INVARIANTS
1369	BO_LOCK(bo);
1370	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
1371	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1372		panic("vinvalbuf: flush failed");
1373	BO_UNLOCK(bo);
1374#endif
1375	return (0);
1376}
1377
1378/*
1379 * Flush out and invalidate all buffers associated with a vnode.
1380 * Called with the underlying object locked.
1381 */
1382int
1383vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1384{
1385
1386	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1387	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1388	if (vp->v_object != NULL && vp->v_object->handle != vp)
1389		return (0);
1390	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1391}
1392
1393/*
1394 * Flush out buffers on the specified list.
1395 *
1396 */
1397static int
1398flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1399    int slptimeo)
1400{
1401	struct buf *bp, *nbp;
1402	int retval, error;
1403	daddr_t lblkno;
1404	b_xflags_t xflags;
1405
1406	ASSERT_BO_WLOCKED(bo);
1407
1408	retval = 0;
1409	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1410		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1411		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1412			continue;
1413		}
1414		lblkno = 0;
1415		xflags = 0;
1416		if (nbp != NULL) {
1417			lblkno = nbp->b_lblkno;
1418			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
1419		}
1420		retval = EAGAIN;
1421		error = BUF_TIMELOCK(bp,
1422		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
1423		    "flushbuf", slpflag, slptimeo);
1424		if (error) {
1425			BO_LOCK(bo);
1426			return (error != ENOLCK ? error : EAGAIN);
1427		}
1428		KASSERT(bp->b_bufobj == bo,
1429		    ("bp %p wrong b_bufobj %p should be %p",
1430		    bp, bp->b_bufobj, bo));
1431		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1432			BUF_UNLOCK(bp);
1433			BO_LOCK(bo);
1434			return (EAGAIN);
1435		}
1436		/*
1437		 * XXX Since there are no node locks for NFS, I
1438		 * believe there is a slight chance that a delayed
1439		 * write will occur while sleeping just above, so
1440		 * check for it.
1441		 */
1442		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1443		    (flags & V_SAVE)) {
1444			bremfree(bp);
1445			bp->b_flags |= B_ASYNC;
1446			bwrite(bp);
1447			BO_LOCK(bo);
1448			return (EAGAIN);	/* XXX: why not loop ? */
1449		}
1450		bremfree(bp);
1451		bp->b_flags |= (B_INVAL | B_RELBUF);
1452		bp->b_flags &= ~B_ASYNC;
1453		brelse(bp);
1454		BO_LOCK(bo);
1455		if (nbp != NULL &&
1456		    (nbp->b_bufobj != bo ||
1457		     nbp->b_lblkno != lblkno ||
1458		     (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1459			break;			/* nbp invalid */
1460	}
1461	return (retval);
1462}
1463
1464/*
1465 * Truncate a file's buffer and pages to a specified length.  This
1466 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1467 * sync activity.
1468 */
1469int
1470vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
1471{
1472	struct buf *bp, *nbp;
1473	int anyfreed;
1474	int trunclbn;
1475	struct bufobj *bo;
1476
1477	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1478	    vp, cred, blksize, (uintmax_t)length);
1479
1480	/*
1481	 * Round up to the *next* lbn.
1482	 */
1483	trunclbn = (length + blksize - 1) / blksize;
1484
1485	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1486restart:
1487	bo = &vp->v_bufobj;
1488	BO_LOCK(bo);
1489	anyfreed = 1;
1490	for (;anyfreed;) {
1491		anyfreed = 0;
1492		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1493			if (bp->b_lblkno < trunclbn)
1494				continue;
1495			if (BUF_LOCK(bp,
1496			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1497			    BO_LOCKPTR(bo)) == ENOLCK)
1498				goto restart;
1499
1500			bremfree(bp);
1501			bp->b_flags |= (B_INVAL | B_RELBUF);
1502			bp->b_flags &= ~B_ASYNC;
1503			brelse(bp);
1504			anyfreed = 1;
1505
1506			BO_LOCK(bo);
1507			if (nbp != NULL &&
1508			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1509			    (nbp->b_vp != vp) ||
1510			    (nbp->b_flags & B_DELWRI))) {
1511				BO_UNLOCK(bo);
1512				goto restart;
1513			}
1514		}
1515
1516		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1517			if (bp->b_lblkno < trunclbn)
1518				continue;
1519			if (BUF_LOCK(bp,
1520			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1521			    BO_LOCKPTR(bo)) == ENOLCK)
1522				goto restart;
1523			bremfree(bp);
1524			bp->b_flags |= (B_INVAL | B_RELBUF);
1525			bp->b_flags &= ~B_ASYNC;
1526			brelse(bp);
1527			anyfreed = 1;
1528
1529			BO_LOCK(bo);
1530			if (nbp != NULL &&
1531			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1532			    (nbp->b_vp != vp) ||
1533			    (nbp->b_flags & B_DELWRI) == 0)) {
1534				BO_UNLOCK(bo);
1535				goto restart;
1536			}
1537		}
1538	}
1539
1540	if (length > 0) {
1541restartsync:
1542		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1543			if (bp->b_lblkno > 0)
1544				continue;
1545			/*
1546			 * Since we hold the vnode lock this should only
1547			 * fail if we're racing with the buf daemon.
1548			 */
1549			if (BUF_LOCK(bp,
1550			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1551			    BO_LOCKPTR(bo)) == ENOLCK) {
1552				goto restart;
1553			}
1554			VNASSERT((bp->b_flags & B_DELWRI), vp,
1555			    ("buf(%p) on dirty queue without DELWRI", bp));
1556
1557			bremfree(bp);
1558			bawrite(bp);
1559			BO_LOCK(bo);
1560			goto restartsync;
1561		}
1562	}
1563
1564	bufobj_wwait(bo, 0, 0);
1565	BO_UNLOCK(bo);
1566	vnode_pager_setsize(vp, length);
1567
1568	return (0);
1569}
1570
1571static void
1572buf_vlist_remove(struct buf *bp)
1573{
1574	struct bufv *bv;
1575
1576	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1577	ASSERT_BO_WLOCKED(bp->b_bufobj);
1578	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1579	    (BX_VNDIRTY|BX_VNCLEAN),
1580	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1581	if (bp->b_xflags & BX_VNDIRTY)
1582		bv = &bp->b_bufobj->bo_dirty;
1583	else
1584		bv = &bp->b_bufobj->bo_clean;
1585	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
1586	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1587	bv->bv_cnt--;
1588	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1589}
1590
1591/*
1592 * Add the buffer to the sorted clean or dirty block list.
1593 *
1594 * NOTE: xflags is passed as a constant, optimizing this inline function!
1595 */
1596static void
1597buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1598{
1599	struct bufv *bv;
1600	struct buf *n;
1601	int error;
1602
1603	ASSERT_BO_WLOCKED(bo);
1604	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
1605	    ("dead bo %p", bo));
1606	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1607	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1608	bp->b_xflags |= xflags;
1609	if (xflags & BX_VNDIRTY)
1610		bv = &bo->bo_dirty;
1611	else
1612		bv = &bo->bo_clean;
1613
1614	/*
1615	 * Keep the list ordered.  Optimize empty list insertion.  Assume
1616	 * we tend to grow at the tail so lookup_le should usually be cheaper
1617	 * than _ge.
1618	 */
1619	if (bv->bv_cnt == 0 ||
1620	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
1621		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1622	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
1623		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
1624	else
1625		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
1626	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
1627	if (error)
1628		panic("buf_vlist_add:  Preallocated nodes insufficient.");
1629	bv->bv_cnt++;
1630}
1631
1632/*
1633 * Look up a buffer using the buffer tries.  Note that we specifically
1634 * avoid shadow buffers used in background bitmap writes.
1635 *
1636 * This code isn't quite as efficient as it could be because we are
1637 * maintaining two sorted lists and do not know which list the block
1638 * resides in.
1639 *
1640 * The clean trie is checked first, then the dirty trie; a buffer
1641 * attached to a bufobj is on exactly one of the two lists at any
1642 * given time, so at most two lookups are needed.
1643 */
1644struct buf *
1645gbincore(struct bufobj *bo, daddr_t lblkno)
1646{
1647	struct buf *bp;
1648
1649	ASSERT_BO_LOCKED(bo);
1650	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
1651	if (bp != NULL)
1652		return (bp);
1653	return (BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno));
1654}
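/*
 * Illustrative sketch (not part of the original file): gbincore() answers
 * "is this logical block already cached for this bufobj?" and must be
 * called with the bufobj lock held; real callers then lock the buffer
 * itself (typically via BUF_LOCK() with LK_INTERLOCK) before using it.
 */
#if 0
static int
example_block_is_cached(struct bufobj *bo, daddr_t lblkno)
{
	struct buf *bp;

	BO_LOCK(bo);
	bp = gbincore(bo, lblkno);
	BO_UNLOCK(bo);
	return (bp != NULL);
}
#endif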
1655
1656/*
1657 * Associate a buffer with a vnode.
1658 */
1659void
1660bgetvp(struct vnode *vp, struct buf *bp)
1661{
1662	struct bufobj *bo;
1663
1664	bo = &vp->v_bufobj;
1665	ASSERT_BO_WLOCKED(bo);
1666	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1667
1668	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1669	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1670	    ("bgetvp: bp already attached! %p", bp));
1671
1672	vhold(vp);
1673	bp->b_vp = vp;
1674	bp->b_bufobj = bo;
1675	/*
1676	 * Insert onto list for new vnode.
1677	 */
1678	buf_vlist_add(bp, bo, BX_VNCLEAN);
1679}
1680
1681/*
1682 * Disassociate a buffer from a vnode.
1683 */
1684void
1685brelvp(struct buf *bp)
1686{
1687	struct bufobj *bo;
1688	struct vnode *vp;
1689
1690	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1691	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1692
1693	/*
1694	 * Delete from old vnode list, if on one.
1695	 */
1696	vp = bp->b_vp;		/* XXX */
1697	bo = bp->b_bufobj;
1698	BO_LOCK(bo);
1699	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1700		buf_vlist_remove(bp);
1701	else
1702		panic("brelvp: Buffer %p not on queue.", bp);
1703	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1704		bo->bo_flag &= ~BO_ONWORKLST;
1705		mtx_lock(&sync_mtx);
1706		LIST_REMOVE(bo, bo_synclist);
1707		syncer_worklist_len--;
1708		mtx_unlock(&sync_mtx);
1709	}
1710	bp->b_vp = NULL;
1711	bp->b_bufobj = NULL;
1712	BO_UNLOCK(bo);
1713	vdrop(vp);
1714}
1715
1716/*
1717 * Add an item to the syncer work queue.
1718 */
1719static void
1720vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1721{
1722	int slot;
1723
1724	ASSERT_BO_WLOCKED(bo);
1725
1726	mtx_lock(&sync_mtx);
1727	if (bo->bo_flag & BO_ONWORKLST)
1728		LIST_REMOVE(bo, bo_synclist);
1729	else {
1730		bo->bo_flag |= BO_ONWORKLST;
1731		syncer_worklist_len++;
1732	}
1733
1734	if (delay > syncer_maxdelay - 2)
1735		delay = syncer_maxdelay - 2;
1736	slot = (syncer_delayno + delay) & syncer_mask;
1737
1738	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1739	mtx_unlock(&sync_mtx);
1740}
1741
1742static int
1743sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1744{
1745	int error, len;
1746
1747	mtx_lock(&sync_mtx);
1748	len = syncer_worklist_len - sync_vnode_count;
1749	mtx_unlock(&sync_mtx);
1750	error = SYSCTL_OUT(req, &len, sizeof(len));
1751	return (error);
1752}
1753
1754SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1755    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1756
1757static struct proc *updateproc;
1758static void sched_sync(void);
1759static struct kproc_desc up_kp = {
1760	"syncer",
1761	sched_sync,
1762	&updateproc
1763};
1764SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1765
1766static int
1767sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1768{
1769	struct vnode *vp;
1770	struct mount *mp;
1771
1772	*bo = LIST_FIRST(slp);
1773	if (*bo == NULL)
1774		return (0);
1775	vp = (*bo)->__bo_vnode;	/* XXX */
1776	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
1777		return (1);
1778	/*
1779	 * We use vhold in case the vnode does not
1780	 * successfully sync.  vhold prevents the vnode from
1781	 * going away when we unlock the sync_mtx so that
1782	 * we can acquire the vnode interlock.
1783	 */
1784	vholdl(vp);
1785	mtx_unlock(&sync_mtx);
1786	VI_UNLOCK(vp);
1787	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
1788		vdrop(vp);
1789		mtx_lock(&sync_mtx);
1790		return (*bo == LIST_FIRST(slp));
1791	}
1792	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1793	(void) VOP_FSYNC(vp, MNT_LAZY, td);
1794	VOP_UNLOCK(vp, 0);
1795	vn_finished_write(mp);
1796	BO_LOCK(*bo);
1797	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
1798		/*
1799		 * Put us back on the worklist.  The worklist
1800		 * routine will remove us from our current
1801		 * position and then add us back in at a later
1802		 * position.
1803		 */
1804		vn_syncer_add_to_worklist(*bo, syncdelay);
1805	}
1806	BO_UNLOCK(*bo);
1807	vdrop(vp);
1808	mtx_lock(&sync_mtx);
1809	return (0);
1810}
1811
1812static int first_printf = 1;
1813
1814/*
1815 * System filesystem synchronizer daemon.
1816 */
1817static void
1818sched_sync(void)
1819{
1820	struct synclist *next, *slp;
1821	struct bufobj *bo;
1822	long starttime;
1823	struct thread *td = curthread;
1824	int last_work_seen;
1825	int net_worklist_len;
1826	int syncer_final_iter;
1827	int error;
1828
1829	last_work_seen = 0;
1830	syncer_final_iter = 0;
1831	syncer_state = SYNCER_RUNNING;
1832	starttime = time_uptime;
1833	td->td_pflags |= TDP_NORUNNINGBUF;
1834
1835	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
1836	    SHUTDOWN_PRI_LAST);
1837
1838	mtx_lock(&sync_mtx);
1839	for (;;) {
1840		if (syncer_state == SYNCER_FINAL_DELAY &&
1841		    syncer_final_iter == 0) {
1842			mtx_unlock(&sync_mtx);
1843			kproc_suspend_check(td->td_proc);
1844			mtx_lock(&sync_mtx);
1845		}
1846		net_worklist_len = syncer_worklist_len - sync_vnode_count;
1847		if (syncer_state != SYNCER_RUNNING &&
1848		    starttime != time_uptime) {
1849			if (first_printf) {
1850				printf("\nSyncing disks, vnodes remaining...");
1851				first_printf = 0;
1852			}
1853			printf("%d ", net_worklist_len);
1854		}
1855		starttime = time_uptime;
1856
1857		/*
1858		 * Push files whose dirty time has expired.  Be careful
1859		 * of interrupt race on slp queue.
1860		 *
1861		 * Skip over empty worklist slots when shutting down.
1862		 */
1863		do {
1864			slp = &syncer_workitem_pending[syncer_delayno];
1865			syncer_delayno += 1;
1866			if (syncer_delayno == syncer_maxdelay)
1867				syncer_delayno = 0;
1868			next = &syncer_workitem_pending[syncer_delayno];
1869			/*
1870			 * If the worklist has wrapped since it
1871			 * was emptied of all but syncer vnodes,
1872			 * switch to the FINAL_DELAY state and run
1873			 * for one more second.
1874			 */
1875			if (syncer_state == SYNCER_SHUTTING_DOWN &&
1876			    net_worklist_len == 0 &&
1877			    last_work_seen == syncer_delayno) {
1878				syncer_state = SYNCER_FINAL_DELAY;
1879				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
1880			}
1881		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
1882		    syncer_worklist_len > 0);
1883
1884		/*
1885		 * Keep track of the last time there was anything
1886		 * on the worklist other than syncer vnodes.
1887		 * Return to the SHUTTING_DOWN state if any
1888		 * new work appears.
1889		 */
1890		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
1891			last_work_seen = syncer_delayno;
1892		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
1893			syncer_state = SYNCER_SHUTTING_DOWN;
1894		while (!LIST_EMPTY(slp)) {
1895			error = sync_vnode(slp, &bo, td);
1896			if (error == 1) {
1897				LIST_REMOVE(bo, bo_synclist);
1898				LIST_INSERT_HEAD(next, bo, bo_synclist);
1899				continue;
1900			}
1901
1902			if (first_printf == 0)
1903				wdog_kern_pat(WD_LASTVAL);
1904
1905		}
1906		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
1907			syncer_final_iter--;
1908		/*
1909		 * The variable rushjob allows the kernel to speed up the
1910		 * processing of the filesystem syncer process. A rushjob
1911		 * value of N tells the filesystem syncer to process the next
1912		 * N seconds worth of work on its queue ASAP. Currently rushjob
1913		 * is used by the soft update code to speed up the filesystem
1914		 * syncer process when the incore state is getting so far
1915		 * ahead of the disk that the kernel memory pool is being
1916		 * threatened with exhaustion.
1917		 */
1918		if (rushjob > 0) {
1919			rushjob -= 1;
1920			continue;
1921		}
1922		/*
1923		 * Just sleep for a short period of time between
1924		 * iterations when shutting down to allow some I/O
1925		 * to happen.
1926		 *
1927		 * If it has taken us less than a second to process the
1928		 * current work, then wait. Otherwise start right over
1929		 * again. We can still lose time if any single round
1930		 * takes more than two seconds, but it does not really
1931		 * matter as we are just trying to generally pace the
1932		 * filesystem activity.
1933		 */
1934		if (syncer_state != SYNCER_RUNNING ||
1935		    time_uptime == starttime) {
1936			thread_lock(td);
1937			sched_prio(td, PPAUSE);
1938			thread_unlock(td);
1939		}
1940		if (syncer_state != SYNCER_RUNNING)
1941			cv_timedwait(&sync_wakeup, &sync_mtx,
1942			    hz / SYNCER_SHUTDOWN_SPEEDUP);
1943		else if (time_uptime == starttime)
1944			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
1945	}
1946}
1947
1948/*
1949 * Request the syncer daemon to speed up its work.
1950 * We never push it to speed up more than half of its
1951 * normal turn time; otherwise it could take over the CPU.
1952 */
1953int
1954speedup_syncer(void)
1955{
1956	int ret = 0;
1957
1958	mtx_lock(&sync_mtx);
1959	if (rushjob < syncdelay / 2) {
1960		rushjob += 1;
1961		stat_rush_requests += 1;
1962		ret = 1;
1963	}
1964	mtx_unlock(&sync_mtx);
1965	cv_broadcast(&sync_wakeup);
1966	return (ret);
1967}
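/*
 * Illustrative sketch (not part of the original file): a subsystem that
 * dirties buffers faster than the syncer retires them (the soft updates
 * code, per the rushjob comment above) can nudge the syncer ahead of
 * schedule; the predicate below is hypothetical and only for
 * illustration:
 *
 *	if (dirty_backlog_is_high())
 *		(void)speedup_syncer();
 *
 * A return of 1 means rushjob was actually bumped; 0 means the syncer is
 * already being pushed at the capped rate of syncdelay / 2.
 */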
1968
1969/*
1970 * Tell the syncer to speed up its work and run though its work
1971 * list several times, then tell it to shut down.
1972 */
1973static void
1974syncer_shutdown(void *arg, int howto)
1975{
1976
1977	if (howto & RB_NOSYNC)
1978		return;
1979	mtx_lock(&sync_mtx);
1980	syncer_state = SYNCER_SHUTTING_DOWN;
1981	rushjob = 0;
1982	mtx_unlock(&sync_mtx);
1983	cv_broadcast(&sync_wakeup);
1984	kproc_shutdown(arg, howto);
1985}
1986
1987void
1988syncer_suspend(void)
1989{
1990
1991	syncer_shutdown(updateproc, 0);
1992}
1993
1994void
1995syncer_resume(void)
1996{
1997
1998	mtx_lock(&sync_mtx);
1999	first_printf = 1;
2000	syncer_state = SYNCER_RUNNING;
2001	mtx_unlock(&sync_mtx);
2002	cv_broadcast(&sync_wakeup);
2003	kproc_resume(updateproc);
2004}
2005
2006/*
2007 * Reassign a buffer from one vnode to another.
2008 * Used to assign file specific control information
2009 * (indirect blocks) to the vnode to which they belong.
2010 */
2011void
2012reassignbuf(struct buf *bp)
2013{
2014	struct vnode *vp;
2015	struct bufobj *bo;
2016	int delay;
2017#ifdef INVARIANTS
2018	struct bufv *bv;
2019#endif
2020
2021	vp = bp->b_vp;
2022	bo = bp->b_bufobj;
2023	++reassignbufcalls;
2024
2025	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2026	    bp, bp->b_vp, bp->b_flags);
2027	/*
2028	 * B_PAGING flagged buffers cannot be reassigned because their vp
2029	 * is not fully linked in.
2030	 */
2031	if (bp->b_flags & B_PAGING)
2032		panic("cannot reassign paging buffer");
2033
2034	/*
2035	 * Delete from old vnode list, if on one.
2036	 */
2037	BO_LOCK(bo);
2038	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2039		buf_vlist_remove(bp);
2040	else
2041		panic("reassignbuf: Buffer %p not on queue.", bp);
2042	/*
2043	 * If dirty, put on list of dirty buffers; otherwise insert onto list
2044	 * of clean buffers.
2045	 */
2046	if (bp->b_flags & B_DELWRI) {
2047		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
2048			switch (vp->v_type) {
2049			case VDIR:
2050				delay = dirdelay;
2051				break;
2052			case VCHR:
2053				delay = metadelay;
2054				break;
2055			default:
2056				delay = filedelay;
2057			}
2058			vn_syncer_add_to_worklist(bo, delay);
2059		}
2060		buf_vlist_add(bp, bo, BX_VNDIRTY);
2061	} else {
2062		buf_vlist_add(bp, bo, BX_VNCLEAN);
2063
2064		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2065			mtx_lock(&sync_mtx);
2066			LIST_REMOVE(bo, bo_synclist);
2067			syncer_worklist_len--;
2068			mtx_unlock(&sync_mtx);
2069			bo->bo_flag &= ~BO_ONWORKLST;
2070		}
2071	}
2072#ifdef INVARIANTS
2073	bv = &bo->bo_clean;
2074	bp = TAILQ_FIRST(&bv->bv_hd);
2075	KASSERT(bp == NULL || bp->b_bufobj == bo,
2076	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2077	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2078	KASSERT(bp == NULL || bp->b_bufobj == bo,
2079	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2080	bv = &bo->bo_dirty;
2081	bp = TAILQ_FIRST(&bv->bv_hd);
2082	KASSERT(bp == NULL || bp->b_bufobj == bo,
2083	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2084	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2085	KASSERT(bp == NULL || bp->b_bufobj == bo,
2086	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2087#endif
2088	BO_UNLOCK(bo);
2089}
2090
2091/*
2092 * Increment the use and hold counts on the vnode, taking care to reference
2093 * the driver's usecount if this is a chardev.  The vholdl() will remove
2094 * the vnode from the free list if it is presently free.  Requires the
2095 * vnode interlock and returns with it held.
2096 */
2097static void
2098v_incr_usecount(struct vnode *vp)
2099{
2100
2101	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2102	vholdl(vp);
2103	vp->v_usecount++;
2104	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2105		dev_lock();
2106		vp->v_rdev->si_usecount++;
2107		dev_unlock();
2108	}
2109}
2110
2111/*
2112 * Turn a holdcnt into a use+holdcnt such that only one call to
2113 * v_decr_usecount is needed.
2114 */
2115static void
2116v_upgrade_usecount(struct vnode *vp)
2117{
2118
2119	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2120	vp->v_usecount++;
2121	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2122		dev_lock();
2123		vp->v_rdev->si_usecount++;
2124		dev_unlock();
2125	}
2126}
2127
2128/*
2129 * Decrement the vnode use and hold count along with the driver's usecount
2130 * if this is a chardev.  The vdropl() below releases the vnode interlock
2131 * as it may free the vnode.
2132 */
2133static void
2134v_decr_usecount(struct vnode *vp)
2135{
2136
2137	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2138	VNASSERT(vp->v_usecount > 0, vp,
2139	    ("v_decr_usecount: negative usecount"));
2140	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2141	vp->v_usecount--;
2142	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2143		dev_lock();
2144		vp->v_rdev->si_usecount--;
2145		dev_unlock();
2146	}
2147	vdropl(vp);
2148}
2149
2150/*
2151 * Decrement only the use count and driver use count.  This is intended to
2152 * be paired with a follow-on vdropl() to release the remaining hold count.
2153 * In this way we may vgone() a vnode with a 0 usecount without risk of
2154 * having it end up on a free list because the hold count is kept above 0.
2155 */
2156static void
2157v_decr_useonly(struct vnode *vp)
2158{
2159
2160	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2161	VNASSERT(vp->v_usecount > 0, vp,
2162	    ("v_decr_useonly: negative usecount"));
2163	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2164	vp->v_usecount--;
2165	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2166		dev_lock();
2167		vp->v_rdev->si_usecount--;
2168		dev_unlock();
2169	}
2170}
2171
2172/*
2173 * Grab a particular vnode from the free list, increment its
2174 * reference count and lock it.  VI_DOOMED is set if the vnode
2175 * is being destroyed.  Only callers who specify LK_RETRY will
2176 * see doomed vnodes.  If inactive processing was delayed in
2177 * vput try to do it here.
2178 */
2179int
2180vget(struct vnode *vp, int flags, struct thread *td)
2181{
2182	int error;
2183
2184	error = 0;
2185	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2186	    ("vget: invalid lock operation"));
2187	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2188
2189	if ((flags & LK_INTERLOCK) == 0)
2190		VI_LOCK(vp);
2191	vholdl(vp);
2192	if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2193		vdrop(vp);
2194		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2195		    vp);
2196		return (error);
2197	}
2198	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2199		panic("vget: vn_lock failed to return ENOENT\n");
2200	VI_LOCK(vp);
2201	/* Upgrade our holdcnt to a usecount. */
2202	v_upgrade_usecount(vp);
2203	/*
2204	 * We don't guarantee that any particular close will
2205	 * trigger inactive processing, so just make a best effort
2206	 * here at preventing a reference to a removed file.  If
2207	 * we don't succeed, no harm is done.
2208	 */
2209	if (vp->v_iflag & VI_OWEINACT) {
2210		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2211		    (flags & LK_NOWAIT) == 0)
2212			vinactive(vp, td);
2213		vp->v_iflag &= ~VI_OWEINACT;
2214	}
2215	VI_UNLOCK(vp);
2216	return (0);
2217}
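/*
 * Illustrative sketch (not part of the original file): a typical caller
 * already holds the vnode interlock (hence LK_INTERLOCK) and wants a
 * locked, referenced vnode in one step; only LK_RETRY callers will ever
 * see a doomed vnode:
 *
 *	error = vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread);
 *	if (error != 0)
 *		return (error);
 *	... operate on the locked, referenced vnode ...
 *	vput(vp);
 *
 * Without LK_INTERLOCK, vget() acquires the interlock itself.
 */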
2218
2219/*
2220 * Increase the reference count of a vnode.
2221 */
2222void
2223vref(struct vnode *vp)
2224{
2225
2226	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2227	VI_LOCK(vp);
2228	v_incr_usecount(vp);
2229	VI_UNLOCK(vp);
2230}
2231
2232/*
2233 * Return reference count of a vnode.
2234 *
2235 * The results of this call are only guaranteed when some mechanism other
2236 * than the VI lock is used to stop other processes from gaining references
2237 * to the vnode.  This may be the case if the caller holds the only reference.
2238 * This is also useful when stale data is acceptable as race conditions may
2239 * be accounted for by some other means.
2240 */
2241int
2242vrefcnt(struct vnode *vp)
2243{
2244	int usecnt;
2245
2246	VI_LOCK(vp);
2247	usecnt = vp->v_usecount;
2248	VI_UNLOCK(vp);
2249
2250	return (usecnt);
2251}
2252
2253#define	VPUTX_VRELE	1
2254#define	VPUTX_VPUT	2
2255#define	VPUTX_VUNREF	3
2256
2257static void
2258vputx(struct vnode *vp, int func)
2259{
2260	int error;
2261
2262	KASSERT(vp != NULL, ("vputx: null vp"));
2263	if (func == VPUTX_VUNREF)
2264		ASSERT_VOP_LOCKED(vp, "vunref");
2265	else if (func == VPUTX_VPUT)
2266		ASSERT_VOP_LOCKED(vp, "vput");
2267	else
2268		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2269	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2270	VI_LOCK(vp);
2271
2272	/* Skip this v_writecount check if we're going to panic below. */
2273	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2274	    ("vputx: missed vn_close"));
2275	error = 0;
2276
2277	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2278	    vp->v_usecount == 1)) {
2279		if (func == VPUTX_VPUT)
2280			VOP_UNLOCK(vp, 0);
2281		v_decr_usecount(vp);
2282		return;
2283	}
2284
2285	if (vp->v_usecount != 1) {
2286		vprint("vputx: negative ref count", vp);
2287		panic("vputx: negative ref cnt");
2288	}
2289	CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2290	/*
2291	 * We want to hold the vnode until the inactive finishes to
2292	 * prevent vgone() races.  We drop the use count here and the
2293	 * hold count below when we're done.
2294	 */
2295	v_decr_useonly(vp);
2296	/*
2297	 * We must call VOP_INACTIVE with the node locked. Mark
2298	 * as VI_DOINGINACT to avoid recursion.
2299	 */
2300	vp->v_iflag |= VI_OWEINACT;
2301	switch (func) {
2302	case VPUTX_VRELE:
2303		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2304		VI_LOCK(vp);
2305		break;
2306	case VPUTX_VPUT:
2307		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2308			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2309			    LK_NOWAIT);
2310			VI_LOCK(vp);
2311		}
2312		break;
2313	case VPUTX_VUNREF:
2314		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2315			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
2316			VI_LOCK(vp);
2317		}
2318		break;
2319	}
2320	if (vp->v_usecount > 0)
2321		vp->v_iflag &= ~VI_OWEINACT;
2322	if (error == 0) {
2323		if (vp->v_iflag & VI_OWEINACT)
2324			vinactive(vp, curthread);
2325		if (func != VPUTX_VUNREF)
2326			VOP_UNLOCK(vp, 0);
2327	}
2328	vdropl(vp);
2329}
2330
2331/*
2332 * Vnode put/release.
2333 * If count drops to zero, call inactive routine and return to freelist.
2334 */
2335void
2336vrele(struct vnode *vp)
2337{
2338
2339	vputx(vp, VPUTX_VRELE);
2340}
2341
2342/*
2343 * Release an already locked vnode.  This gives the same effect as
2344 * unlock+vrele(), but takes less time and avoids releasing and
2345 * re-acquiring the lock (as vrele() acquires the lock internally).
2346 */
2347void
2348vput(struct vnode *vp)
2349{
2350
2351	vputx(vp, VPUTX_VPUT);
2352}
2353
2354/*
2355 * Release an exclusively locked vnode. Do not unlock the vnode lock.
2356 */
2357void
2358vunref(struct vnode *vp)
2359{
2360
2361	vputx(vp, VPUTX_VUNREF);
2362}
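/*
 * Illustrative sketch (not part of the original file): the three wrappers
 * above differ only in the lock state they expect and leave behind:
 *
 *	vrele(vp);	vnode unlocked before and after the call
 *	vput(vp);	vnode locked on entry, unlocked on return
 *	vunref(vp);	vnode locked on entry and still locked on return
 *
 * so a lookup that returned a locked vnode pairs naturally with vput(),
 * while a cached, unlocked reference is dropped with vrele().
 */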
2363
2364/*
2365 * Somebody doesn't want the vnode recycled.
2366 */
2367void
2368vhold(struct vnode *vp)
2369{
2370
2371	VI_LOCK(vp);
2372	vholdl(vp);
2373	VI_UNLOCK(vp);
2374}
2375
2376/*
2377 * Increase the hold count and activate if this is the first reference.
2378 */
2379void
2380vholdl(struct vnode *vp)
2381{
2382	struct mount *mp;
2383
2384	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2385#ifdef INVARIANTS
2386	/* getnewvnode() calls v_incr_usecount() without holding interlock. */
2387	if (vp->v_type != VNON || vp->v_data != NULL)
2388		ASSERT_VI_LOCKED(vp, "vholdl");
2389#endif
2390	vp->v_holdcnt++;
2391	if ((vp->v_iflag & VI_FREE) == 0)
2392		return;
2393	VNASSERT(vp->v_holdcnt == 1, vp, ("vholdl: wrong hold count"));
2394	VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
2395	/*
2396	 * Remove a vnode from the free list, mark it as in use,
2397	 * and put it on the active list.
2398	 */
2399	mtx_lock(&vnode_free_list_mtx);
2400	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
2401	freevnodes--;
2402	vp->v_iflag &= ~(VI_FREE|VI_AGE);
2403	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
2404	    ("Activating already active vnode"));
2405	vp->v_iflag |= VI_ACTIVE;
2406	mp = vp->v_mount;
2407	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
2408	mp->mnt_activevnodelistsize++;
2409	mtx_unlock(&vnode_free_list_mtx);
2410}
2411
2412/*
2413 * Note that there is one less who cares about this vnode.
2414 * vdrop() is the opposite of vhold().
2415 */
2416void
2417vdrop(struct vnode *vp)
2418{
2419
2420	VI_LOCK(vp);
2421	vdropl(vp);
2422}
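/*
 * Illustrative sketch (not part of the original file): a hold keeps the
 * vnode from being recycled or freed without granting a use count, which
 * is the usual pattern for code that must drop the interlock and sleep
 * while still referring to the vnode (sync_vnode() above is an in-file
 * example):
 *
 *	VI_LOCK(vp);
 *	vholdl(vp);
 *	VI_UNLOCK(vp);
 *	... block, e.g. wait for a lock or for I/O ...
 *	vdrop(vp);
 */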
2423
2424/*
2425 * Drop the hold count of the vnode.  If this is the last reference to
2426 * the vnode we place it on the free list unless it has been vgone'd
2427 * (marked VI_DOOMED) in which case we will free it.
2428 */
2429void
2430vdropl(struct vnode *vp)
2431{
2432	struct bufobj *bo;
2433	struct mount *mp;
2434	int active;
2435
2436	ASSERT_VI_LOCKED(vp, "vdropl");
2437	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2438	if (vp->v_holdcnt <= 0)
2439		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2440	vp->v_holdcnt--;
2441	if (vp->v_holdcnt > 0) {
2442		VI_UNLOCK(vp);
2443		return;
2444	}
2445	if ((vp->v_iflag & VI_DOOMED) == 0) {
2446		/*
2447		 * Mark a vnode as free: remove it from its active list
2448		 * and put it up for recycling on the freelist.
2449		 */
2450		VNASSERT(vp->v_op != NULL, vp,
2451		    ("vdropl: vnode already reclaimed."));
2452		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2453		    ("vnode already free"));
2454		VNASSERT(vp->v_holdcnt == 0, vp,
2455		    ("vdropl: freeing when we shouldn't"));
2456		active = vp->v_iflag & VI_ACTIVE;
2457		if ((vp->v_iflag & VI_OWEINACT) == 0) {
2458			vp->v_iflag &= ~VI_ACTIVE;
2459			mp = vp->v_mount;
2460			mtx_lock(&vnode_free_list_mtx);
2461			if (active) {
2462				TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
2463				    v_actfreelist);
2464				mp->mnt_activevnodelistsize--;
2465			}
2466			if (vp->v_iflag & VI_AGE) {
2467				TAILQ_INSERT_HEAD(&vnode_free_list, vp,
2468				    v_actfreelist);
2469			} else {
2470				TAILQ_INSERT_TAIL(&vnode_free_list, vp,
2471				    v_actfreelist);
2472			}
2473			freevnodes++;
2474			vp->v_iflag &= ~VI_AGE;
2475			vp->v_iflag |= VI_FREE;
2476			mtx_unlock(&vnode_free_list_mtx);
2477		} else {
2478			atomic_add_long(&free_owe_inact, 1);
2479		}
2480		VI_UNLOCK(vp);
2481		return;
2482	}
2483	/*
2484	 * The vnode has been marked for destruction, so free it.
2485	 */
2486	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2487	atomic_subtract_long(&numvnodes, 1);
2488	bo = &vp->v_bufobj;
2489	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2490	    ("cleaned vnode still on the free list."));
2491	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2492	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2493	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2494	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2495	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2496	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2497	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
2498	    ("clean blk trie not empty"));
2499	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2500	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
2501	    ("dirty blk trie not empty"));
2502	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2503	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2504	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2505	VI_UNLOCK(vp);
2506#ifdef MAC
2507	mac_vnode_destroy(vp);
2508#endif
2509	if (vp->v_pollinfo != NULL)
2510		destroy_vpollinfo(vp->v_pollinfo);
2511#ifdef INVARIANTS
2512	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2513	vp->v_op = NULL;
2514#endif
2515	rangelock_destroy(&vp->v_rl);
2516	lockdestroy(vp->v_vnlock);
2517	mtx_destroy(&vp->v_interlock);
2518	rw_destroy(BO_LOCKPTR(bo));
2519	uma_zfree(vnode_zone, vp);
2520}
2521
2522/*
2523 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2524 * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2525 * OWEINACT tracks whether a vnode missed a call to inactive due to a
2526 * failed lock upgrade.
2527 */
2528void
2529vinactive(struct vnode *vp, struct thread *td)
2530{
2531	struct vm_object *obj;
2532
2533	ASSERT_VOP_ELOCKED(vp, "vinactive");
2534	ASSERT_VI_LOCKED(vp, "vinactive");
2535	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2536	    ("vinactive: recursed on VI_DOINGINACT"));
2537	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2538	vp->v_iflag |= VI_DOINGINACT;
2539	vp->v_iflag &= ~VI_OWEINACT;
2540	VI_UNLOCK(vp);
2541	/*
2542	 * Before moving off the active list, we must be sure that any
2543	 * modified pages are on the vnode's dirty list since these will
2544	 * no longer be checked once the vnode is on the inactive list.
2545	 * Because the vnode vm object keeps a hold reference on the vnode
2546	 * if there is at least one resident non-cached page, the vnode
2547	 * cannot leave the active list without the page cleanup done.
2548	 */
2549	obj = vp->v_object;
2550	if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
2551		VM_OBJECT_WLOCK(obj);
2552		vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
2553		VM_OBJECT_WUNLOCK(obj);
2554	}
2555	VOP_INACTIVE(vp, td);
2556	VI_LOCK(vp);
2557	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2558	    ("vinactive: lost VI_DOINGINACT"));
2559	vp->v_iflag &= ~VI_DOINGINACT;
2560}
2561
2562/*
2563 * Remove any vnodes in the vnode table belonging to mount point mp.
2564 *
2565 * If FORCECLOSE is not specified, there should not be any active ones;
2566 * return an error if any are found (nb: this is a user error, not a
2567 * system error). If FORCECLOSE is specified, detach any active vnodes
2568 * that are found.
2569 *
2570 * If WRITECLOSE is set, only flush out regular file vnodes open for
2571 * writing.
2572 *
2573 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2574 *
2575 * `rootrefs' specifies the base reference count for the root vnode
2576 * of this filesystem. The root vnode is considered busy if its
2577 * v_usecount exceeds this value. On a successful return, vflush(, td)
2578 * will call vrele() on the root vnode exactly rootrefs times.
2579 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2580 * be zero.
2581 */
2582#ifdef DIAGNOSTIC
2583static int busyprt = 0;		/* print out busy vnodes */
2584SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2585#endif
2586
2587int
2588vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2589{
2590	struct vnode *vp, *mvp, *rootvp = NULL;
2591	struct vattr vattr;
2592	int busy = 0, error;
2593
2594	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2595	    rootrefs, flags);
2596	if (rootrefs > 0) {
2597		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2598		    ("vflush: bad args"));
2599		/*
2600		 * Get the filesystem root vnode. We can vput() it
2601		 * immediately, since with rootrefs > 0, it won't go away.
2602		 */
2603		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2604			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2605			    __func__, error);
2606			return (error);
2607		}
2608		vput(rootvp);
2609	}
2610loop:
2611	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2612		vholdl(vp);
2613		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2614		if (error) {
2615			vdrop(vp);
2616			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2617			goto loop;
2618		}
2619		/*
2620		 * Skip over a vnodes marked VV_SYSTEM.
2621		 * Skip over vnodes marked VV_SYSTEM.
2622		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2623			VOP_UNLOCK(vp, 0);
2624			vdrop(vp);
2625			continue;
2626		}
2627		/*
2628		 * If WRITECLOSE is set, flush out unlinked but still open
2629		 * files (even if open only for reading) and regular file
2630		 * vnodes open for writing.
2631		 */
2632		if (flags & WRITECLOSE) {
2633			if (vp->v_object != NULL) {
2634				VM_OBJECT_WLOCK(vp->v_object);
2635				vm_object_page_clean(vp->v_object, 0, 0, 0);
2636				VM_OBJECT_WUNLOCK(vp->v_object);
2637			}
2638			error = VOP_FSYNC(vp, MNT_WAIT, td);
2639			if (error != 0) {
2640				VOP_UNLOCK(vp, 0);
2641				vdrop(vp);
2642				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2643				return (error);
2644			}
2645			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2646			VI_LOCK(vp);
2647
2648			if ((vp->v_type == VNON ||
2649			    (error == 0 && vattr.va_nlink > 0)) &&
2650			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2651				VOP_UNLOCK(vp, 0);
2652				vdropl(vp);
2653				continue;
2654			}
2655		} else
2656			VI_LOCK(vp);
2657		/*
2658		 * With v_usecount == 0, all we need to do is clear out the
2659		 * vnode data structures and we are done.
2660		 *
2661		 * If FORCECLOSE is set, forcibly close the vnode.
2662		 */
2663		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2664			vgonel(vp);
2665		} else {
2666			busy++;
2667#ifdef DIAGNOSTIC
2668			if (busyprt)
2669				vprint("vflush: busy vnode", vp);
2670#endif
2671		}
2672		VOP_UNLOCK(vp, 0);
2673		vdropl(vp);
2674	}
2675	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2676		/*
2677		 * If just the root vnode is busy, and if its refcount
2678		 * is equal to `rootrefs', then go ahead and kill it.
2679		 */
2680		VI_LOCK(rootvp);
2681		KASSERT(busy > 0, ("vflush: not busy"));
2682		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2683		    ("vflush: usecount %d < rootrefs %d",
2684		     rootvp->v_usecount, rootrefs));
2685		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2686			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2687			vgone(rootvp);
2688			VOP_UNLOCK(rootvp, 0);
2689			busy = 0;
2690		} else
2691			VI_UNLOCK(rootvp);
2692	}
2693	if (busy) {
2694		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2695		    busy);
2696		return (EBUSY);
2697	}
2698	for (; rootrefs > 0; rootrefs--)
2699		vrele(rootvp);
2700	return (0);
2701}
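/*
 * Illustrative sketch (not part of the original file): a VFS_UNMOUNT
 * implementation that keeps no long-lived reference on its root vnode
 * passes rootrefs == 0 and adds FORCECLOSE only for forced unmounts:
 *
 *	flags = (mntflags & MNT_FORCE) != 0 ? FORCECLOSE : 0;
 *	error = vflush(mp, 0, flags, curthread);
 *	if (error != 0)
 *		return (error);
 *
 * A filesystem that does hold references on its root vnode passes that
 * count as rootrefs so the root is not mistaken for a busy vnode.
 */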
2702
2703/*
2704 * Recycle an unused vnode to the front of the free list.
2705 */
2706int
2707vrecycle(struct vnode *vp)
2708{
2709	int recycled;
2710
2711	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2712	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2713	recycled = 0;
2714	VI_LOCK(vp);
2715	if (vp->v_usecount == 0) {
2716		recycled = 1;
2717		vgonel(vp);
2718	}
2719	VI_UNLOCK(vp);
2720	return (recycled);
2721}
2722
2723/*
2724 * Eliminate all activity associated with a vnode
2725 * in preparation for reuse.
2726 */
2727void
2728vgone(struct vnode *vp)
2729{
2730	VI_LOCK(vp);
2731	vgonel(vp);
2732	VI_UNLOCK(vp);
2733}
2734
2735static void
2736notify_lowervp_vfs_dummy(struct mount *mp __unused,
2737    struct vnode *lowervp __unused)
2738{
2739}
2740
2741/*
2742 * Notify upper mounts about reclaimed or unlinked vnode.
2743 */
2744void
2745vfs_notify_upper(struct vnode *vp, int event)
2746{
2747	static struct vfsops vgonel_vfsops = {
2748		.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
2749		.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
2750	};
2751	struct mount *mp, *ump, *mmp;
2752
2753	mp = vp->v_mount;
2754	if (mp == NULL)
2755		return;
2756
2757	MNT_ILOCK(mp);
2758	if (TAILQ_EMPTY(&mp->mnt_uppers))
2759		goto unlock;
2760	MNT_IUNLOCK(mp);
2761	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
2762	mmp->mnt_op = &vgonel_vfsops;
2763	mmp->mnt_kern_flag |= MNTK_MARKER;
2764	MNT_ILOCK(mp);
2765	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
2766	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
2767		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
2768			ump = TAILQ_NEXT(ump, mnt_upper_link);
2769			continue;
2770		}
2771		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
2772		MNT_IUNLOCK(mp);
2773		switch (event) {
2774		case VFS_NOTIFY_UPPER_RECLAIM:
2775			VFS_RECLAIM_LOWERVP(ump, vp);
2776			break;
2777		case VFS_NOTIFY_UPPER_UNLINK:
2778			VFS_UNLINK_LOWERVP(ump, vp);
2779			break;
2780		default:
2781			KASSERT(0, ("invalid event %d", event));
2782			break;
2783		}
2784		MNT_ILOCK(mp);
2785		ump = TAILQ_NEXT(mmp, mnt_upper_link);
2786		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
2787	}
2788	free(mmp, M_TEMP);
2789	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
2790	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
2791		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
2792		wakeup(&mp->mnt_uppers);
2793	}
2794unlock:
2795	MNT_IUNLOCK(mp);
2796}
2797
2798/*
2799 * vgone, with the vp interlock held.
2800 */
2801static void
2802vgonel(struct vnode *vp)
2803{
2804	struct thread *td;
2805	int oweinact;
2806	int active;
2807	struct mount *mp;
2808
2809	ASSERT_VOP_ELOCKED(vp, "vgonel");
2810	ASSERT_VI_LOCKED(vp, "vgonel");
2811	VNASSERT(vp->v_holdcnt, vp,
2812	    ("vgonel: vp %p has no reference.", vp));
2813	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2814	td = curthread;
2815
2816	/*
2817	 * Don't vgonel if we're already doomed.
2818	 */
2819	if (vp->v_iflag & VI_DOOMED)
2820		return;
2821	vp->v_iflag |= VI_DOOMED;
2822
2823	/*
2824	 * Check to see if the vnode is in use.  If so, we have to call
2825	 * VOP_CLOSE() and VOP_INACTIVE().
2826	 */
2827	active = vp->v_usecount;
2828	oweinact = (vp->v_iflag & VI_OWEINACT);
2829	VI_UNLOCK(vp);
2830	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
2831
2832	/*
2833	 * If purging an active vnode, it must be closed and
2834	 * deactivated before being reclaimed.
2835	 */
2836	if (active)
2837		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
2838	if (oweinact || active) {
2839		VI_LOCK(vp);
2840		if ((vp->v_iflag & VI_DOINGINACT) == 0)
2841			vinactive(vp, td);
2842		VI_UNLOCK(vp);
2843	}
2844	if (vp->v_type == VSOCK)
2845		vfs_unp_reclaim(vp);
2846
2847	/*
2848	 * Clean out any buffers associated with the vnode.
2849	 * If the flush fails, just toss the buffers.
2850	 */
2851	mp = NULL;
2852	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
2853		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
2854	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
2855		while (vinvalbuf(vp, 0, 0, 0) != 0)
2856			;
2857	}
2858
2859	BO_LOCK(&vp->v_bufobj);
2860	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
2861	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
2862	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
2863	    vp->v_bufobj.bo_clean.bv_cnt == 0,
2864	    ("vp %p bufobj not invalidated", vp));
2865	vp->v_bufobj.bo_flag |= BO_DEAD;
2866	BO_UNLOCK(&vp->v_bufobj);
2867
2868	/*
2869	 * Reclaim the vnode.
2870	 */
2871	if (VOP_RECLAIM(vp, td))
2872		panic("vgone: cannot reclaim");
2873	if (mp != NULL)
2874		vn_finished_secondary_write(mp);
2875	VNASSERT(vp->v_object == NULL, vp,
2876	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
2877	/*
2878	 * Clear the advisory locks and wake up waiting threads.
2879	 */
2880	(void)VOP_ADVLOCKPURGE(vp);
2881	/*
2882	 * Delete from old mount point vnode list.
2883	 */
2884	delmntque(vp);
2885	cache_purge(vp);
2886	/*
2887	 * Done with purge, reset to the standard lock and invalidate
2888	 * the vnode.
2889	 */
2890	VI_LOCK(vp);
2891	vp->v_vnlock = &vp->v_lock;
2892	vp->v_op = &dead_vnodeops;
2893	vp->v_tag = "none";
2894	vp->v_type = VBAD;
2895}
2896
2897/*
2898 * Calculate the total number of references to a special device.
2899 */
2900int
2901vcount(struct vnode *vp)
2902{
2903	int count;
2904
2905	dev_lock();
2906	count = vp->v_rdev->si_usecount;
2907	dev_unlock();
2908	return (count);
2909}
2910
2911/*
2912 * Same as above, but using the struct cdev * as the argument.
2913 */
2914int
2915count_dev(struct cdev *dev)
2916{
2917	int count;
2918
2919	dev_lock();
2920	count = dev->si_usecount;
2921	dev_unlock();
2922	return(count);
2923}
2924
2925/*
2926 * Print out a description of a vnode.
2927 */
2928static char *typename[] =
2929{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
2930 "VMARKER"};
2931
2932void
2933vn_printf(struct vnode *vp, const char *fmt, ...)
2934{
2935	va_list ap;
2936	char buf[256], buf2[16];
2937	u_long flags;
2938
2939	va_start(ap, fmt);
2940	vprintf(fmt, ap);
2941	va_end(ap);
2942	printf("%p: ", (void *)vp);
2943	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
2944	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
2945	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
2946	buf[0] = '\0';
2947	buf[1] = '\0';
2948	if (vp->v_vflag & VV_ROOT)
2949		strlcat(buf, "|VV_ROOT", sizeof(buf));
2950	if (vp->v_vflag & VV_ISTTY)
2951		strlcat(buf, "|VV_ISTTY", sizeof(buf));
2952	if (vp->v_vflag & VV_NOSYNC)
2953		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
2954	if (vp->v_vflag & VV_ETERNALDEV)
2955		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
2956	if (vp->v_vflag & VV_CACHEDLABEL)
2957		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
2958	if (vp->v_vflag & VV_TEXT)
2959		strlcat(buf, "|VV_TEXT", sizeof(buf));
2960	if (vp->v_vflag & VV_COPYONWRITE)
2961		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
2962	if (vp->v_vflag & VV_SYSTEM)
2963		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
2964	if (vp->v_vflag & VV_PROCDEP)
2965		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
2966	if (vp->v_vflag & VV_NOKNOTE)
2967		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
2968	if (vp->v_vflag & VV_DELETED)
2969		strlcat(buf, "|VV_DELETED", sizeof(buf));
2970	if (vp->v_vflag & VV_MD)
2971		strlcat(buf, "|VV_MD", sizeof(buf));
2972	if (vp->v_vflag & VV_FORCEINSMQ)
2973		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
2974	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
2975	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
2976	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
2977	if (flags != 0) {
2978		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
2979		strlcat(buf, buf2, sizeof(buf));
2980	}
2981	if (vp->v_iflag & VI_MOUNT)
2982		strlcat(buf, "|VI_MOUNT", sizeof(buf));
2983	if (vp->v_iflag & VI_AGE)
2984		strlcat(buf, "|VI_AGE", sizeof(buf));
2985	if (vp->v_iflag & VI_DOOMED)
2986		strlcat(buf, "|VI_DOOMED", sizeof(buf));
2987	if (vp->v_iflag & VI_FREE)
2988		strlcat(buf, "|VI_FREE", sizeof(buf));
2989	if (vp->v_iflag & VI_ACTIVE)
2990		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
2991	if (vp->v_iflag & VI_DOINGINACT)
2992		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
2993	if (vp->v_iflag & VI_OWEINACT)
2994		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
2995	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
2996	    VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
2997	if (flags != 0) {
2998		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
2999		strlcat(buf, buf2, sizeof(buf));
3000	}
3001	printf("    flags (%s)\n", buf + 1);
3002	if (mtx_owned(VI_MTX(vp)))
3003		printf(" VI_LOCKed");
3004	if (vp->v_object != NULL)
3005		printf("    v_object %p ref %d pages %d "
3006		    "cleanbuf %d dirtybuf %d\n",
3007		    vp->v_object, vp->v_object->ref_count,
3008		    vp->v_object->resident_page_count,
3009		    vp->v_bufobj.bo_clean.bv_cnt,
3010		    vp->v_bufobj.bo_dirty.bv_cnt);
3011	printf("    ");
3012	lockmgr_printinfo(vp->v_vnlock);
3013	if (vp->v_data != NULL)
3014		VOP_PRINT(vp);
3015}
3016
3017#ifdef DDB
3018/*
3019 * List all of the locked vnodes in the system.
3020 * Called when debugging the kernel.
3021 */
3022DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
3023{
3024	struct mount *mp;
3025	struct vnode *vp;
3026
3027	/*
3028	 * Note: because this is DDB, we can't obey the locking semantics
3029	 * for these structures, which means we could catch an inconsistent
3030	 * state and dereference a nasty pointer.  Not much to be done
3031	 * about that.
3032	 */
3033	db_printf("Locked vnodes\n");
3034	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3035		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3036			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
3037				vprint("", vp);
3038		}
3039	}
3040}
3041
3042/*
3043 * Show details about the given vnode.
3044 */
3045DB_SHOW_COMMAND(vnode, db_show_vnode)
3046{
3047	struct vnode *vp;
3048
3049	if (!have_addr)
3050		return;
3051	vp = (struct vnode *)addr;
3052	vn_printf(vp, "vnode ");
3053}
3054
3055/*
3056 * Show details about the given mount point.
3057 */
3058DB_SHOW_COMMAND(mount, db_show_mount)
3059{
3060	struct mount *mp;
3061	struct vfsopt *opt;
3062	struct statfs *sp;
3063	struct vnode *vp;
3064	char buf[512];
3065	uint64_t mflags;
3066	u_int flags;
3067
3068	if (!have_addr) {
3069		/* No address given, print short info about all mount points. */
3070		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3071			db_printf("%p %s on %s (%s)\n", mp,
3072			    mp->mnt_stat.f_mntfromname,
3073			    mp->mnt_stat.f_mntonname,
3074			    mp->mnt_stat.f_fstypename);
3075			if (db_pager_quit)
3076				break;
3077		}
3078		db_printf("\nMore info: show mount <addr>\n");
3079		return;
3080	}
3081
3082	mp = (struct mount *)addr;
3083	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
3084	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
3085
3086	buf[0] = '\0';
3087	mflags = mp->mnt_flag;
3088#define	MNT_FLAG(flag)	do {						\
3089	if (mflags & (flag)) {						\
3090		if (buf[0] != '\0')					\
3091			strlcat(buf, ", ", sizeof(buf));		\
3092		strlcat(buf, (#flag) + 4, sizeof(buf));			\
3093		mflags &= ~(flag);					\
3094	}								\
3095} while (0)
3096	MNT_FLAG(MNT_RDONLY);
3097	MNT_FLAG(MNT_SYNCHRONOUS);
3098	MNT_FLAG(MNT_NOEXEC);
3099	MNT_FLAG(MNT_NOSUID);
3100	MNT_FLAG(MNT_NFS4ACLS);
3101	MNT_FLAG(MNT_UNION);
3102	MNT_FLAG(MNT_ASYNC);
3103	MNT_FLAG(MNT_SUIDDIR);
3104	MNT_FLAG(MNT_SOFTDEP);
3105	MNT_FLAG(MNT_NOSYMFOLLOW);
3106	MNT_FLAG(MNT_GJOURNAL);
3107	MNT_FLAG(MNT_MULTILABEL);
3108	MNT_FLAG(MNT_ACLS);
3109	MNT_FLAG(MNT_NOATIME);
3110	MNT_FLAG(MNT_NOCLUSTERR);
3111	MNT_FLAG(MNT_NOCLUSTERW);
3112	MNT_FLAG(MNT_SUJ);
3113	MNT_FLAG(MNT_EXRDONLY);
3114	MNT_FLAG(MNT_EXPORTED);
3115	MNT_FLAG(MNT_DEFEXPORTED);
3116	MNT_FLAG(MNT_EXPORTANON);
3117	MNT_FLAG(MNT_EXKERB);
3118	MNT_FLAG(MNT_EXPUBLIC);
3119	MNT_FLAG(MNT_LOCAL);
3120	MNT_FLAG(MNT_QUOTA);
3121	MNT_FLAG(MNT_ROOTFS);
3122	MNT_FLAG(MNT_USER);
3123	MNT_FLAG(MNT_IGNORE);
3124	MNT_FLAG(MNT_UPDATE);
3125	MNT_FLAG(MNT_DELEXPORT);
3126	MNT_FLAG(MNT_RELOAD);
3127	MNT_FLAG(MNT_FORCE);
3128	MNT_FLAG(MNT_SNAPSHOT);
3129	MNT_FLAG(MNT_BYFSID);
3130#undef MNT_FLAG
3131	if (mflags != 0) {
3132		if (buf[0] != '\0')
3133			strlcat(buf, ", ", sizeof(buf));
3134		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3135		    "0x%016jx", mflags);
3136	}
3137	db_printf("    mnt_flag = %s\n", buf);
3138
3139	buf[0] = '\0';
3140	flags = mp->mnt_kern_flag;
3141#define	MNT_KERN_FLAG(flag)	do {					\
3142	if (flags & (flag)) {						\
3143		if (buf[0] != '\0')					\
3144			strlcat(buf, ", ", sizeof(buf));		\
3145		strlcat(buf, (#flag) + 5, sizeof(buf));			\
3146		flags &= ~(flag);					\
3147	}								\
3148} while (0)
3149	MNT_KERN_FLAG(MNTK_UNMOUNTF);
3150	MNT_KERN_FLAG(MNTK_ASYNC);
3151	MNT_KERN_FLAG(MNTK_SOFTDEP);
3152	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
3153	MNT_KERN_FLAG(MNTK_DRAINING);
3154	MNT_KERN_FLAG(MNTK_REFEXPIRE);
3155	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3156	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3157	MNT_KERN_FLAG(MNTK_NO_IOPF);
3158	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3159	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3160	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3161	MNT_KERN_FLAG(MNTK_MARKER);
3162	MNT_KERN_FLAG(MNTK_USES_BCACHE);
3163	MNT_KERN_FLAG(MNTK_NOASYNC);
3164	MNT_KERN_FLAG(MNTK_UNMOUNT);
3165	MNT_KERN_FLAG(MNTK_MWAIT);
3166	MNT_KERN_FLAG(MNTK_SUSPEND);
3167	MNT_KERN_FLAG(MNTK_SUSPEND2);
3168	MNT_KERN_FLAG(MNTK_SUSPENDED);
3169	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
3170	MNT_KERN_FLAG(MNTK_NOKNOTE);
3171#undef MNT_KERN_FLAG
3172	if (flags != 0) {
3173		if (buf[0] != '\0')
3174			strlcat(buf, ", ", sizeof(buf));
3175		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3176		    "0x%08x", flags);
3177	}
3178	db_printf("    mnt_kern_flag = %s\n", buf);
3179
3180	db_printf("    mnt_opt = ");
3181	opt = TAILQ_FIRST(mp->mnt_opt);
3182	if (opt != NULL) {
3183		db_printf("%s", opt->name);
3184		opt = TAILQ_NEXT(opt, link);
3185		while (opt != NULL) {
3186			db_printf(", %s", opt->name);
3187			opt = TAILQ_NEXT(opt, link);
3188		}
3189	}
3190	db_printf("\n");
3191
3192	sp = &mp->mnt_stat;
3193	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
3194	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
3195	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
3196	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
3197	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
3198	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3199	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3200	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3201	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3202	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3203	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3204	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3205
3206	db_printf("    mnt_cred = { uid=%u ruid=%u",
3207	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3208	if (jailed(mp->mnt_cred))
3209		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3210	db_printf(" }\n");
3211	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
3212	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
3213	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3214	db_printf("    mnt_activevnodelistsize = %d\n",
3215	    mp->mnt_activevnodelistsize);
3216	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3217	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3218	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3219	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
3220	db_printf("    mnt_lockref = %d\n", mp->mnt_lockref);
3221	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3222	db_printf("    mnt_secondary_accwrites = %d\n",
3223	    mp->mnt_secondary_accwrites);
3224	db_printf("    mnt_gjprovider = %s\n",
3225	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3226
3227	db_printf("\n\nList of active vnodes\n");
3228	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
3229		if (vp->v_type != VMARKER) {
3230			vn_printf(vp, "vnode ");
3231			if (db_pager_quit)
3232				break;
3233		}
3234	}
3235	db_printf("\n\nList of inactive vnodes\n");
3236	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3237		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
3238			vn_printf(vp, "vnode ");
3239			if (db_pager_quit)
3240				break;
3241		}
3242	}
3243}
3244#endif	/* DDB */
3245
3246/*
3247 * Fill in a struct xvfsconf based on a struct vfsconf.
3248 */
3249static int
3250vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
3251{
3252	struct xvfsconf xvfsp;
3253
3254	bzero(&xvfsp, sizeof(xvfsp));
3255	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3256	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3257	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3258	xvfsp.vfc_flags = vfsp->vfc_flags;
3259	/*
3260	 * These are unused in userland; we keep them
3261	 * so as not to break binary compatibility.
3262	 */
3263	xvfsp.vfc_vfsops = NULL;
3264	xvfsp.vfc_next = NULL;
3265	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3266}
3267
3268#ifdef COMPAT_FREEBSD32
3269struct xvfsconf32 {
3270	uint32_t	vfc_vfsops;
3271	char		vfc_name[MFSNAMELEN];
3272	int32_t		vfc_typenum;
3273	int32_t		vfc_refcount;
3274	int32_t		vfc_flags;
3275	uint32_t	vfc_next;
3276};
3277
3278static int
3279vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
3280{
3281	struct xvfsconf32 xvfsp;
3282
3283	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3284	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3285	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3286	xvfsp.vfc_flags = vfsp->vfc_flags;
3287	xvfsp.vfc_vfsops = 0;
3288	xvfsp.vfc_next = 0;
3289	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3290}
3291#endif
3292
3293/*
3294 * Top level filesystem related information gathering.
3295 */
3296static int
3297sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3298{
3299	struct vfsconf *vfsp;
3300	int error;
3301
3302	error = 0;
3303	vfsconf_slock();
3304	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3305#ifdef COMPAT_FREEBSD32
3306		if (req->flags & SCTL_MASK32)
3307			error = vfsconf2x32(req, vfsp);
3308		else
3309#endif
3310			error = vfsconf2x(req, vfsp);
3311		if (error)
3312			break;
3313	}
3314	vfsconf_sunlock();
3315	return (error);
3316}
3317
3318SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
3319    CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
3320    "S,xvfsconf", "List of all configured filesystems");
3321
3322#ifndef BURN_BRIDGES
3323static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3324
3325static int
3326vfs_sysctl(SYSCTL_HANDLER_ARGS)
3327{
3328	int *name = (int *)arg1 - 1;	/* XXX */
3329	u_int namelen = arg2 + 1;	/* XXX */
3330	struct vfsconf *vfsp;
3331
3332	log(LOG_WARNING, "userland calling deprecated sysctl, "
3333	    "please rebuild world\n");
3334
3335#if 1 || defined(COMPAT_PRELITE2)
3336	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3337	if (namelen == 1)
3338		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3339#endif
3340
3341	switch (name[1]) {
3342	case VFS_MAXTYPENUM:
3343		if (namelen != 2)
3344			return (ENOTDIR);
3345		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3346	case VFS_CONF:
3347		if (namelen != 3)
3348			return (ENOTDIR);	/* overloaded */
3349		vfsconf_slock();
3350		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3351			if (vfsp->vfc_typenum == name[2])
3352				break;
3353		}
3354		vfsconf_sunlock();
3355		if (vfsp == NULL)
3356			return (EOPNOTSUPP);
3357#ifdef COMPAT_FREEBSD32
3358		if (req->flags & SCTL_MASK32)
3359			return (vfsconf2x32(req, vfsp));
3360		else
3361#endif
3362			return (vfsconf2x(req, vfsp));
3363	}
3364	return (EOPNOTSUPP);
3365}
3366
3367static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
3368    CTLFLAG_MPSAFE, vfs_sysctl,
3369    "Generic filesystem");
3370
3371#if 1 || defined(COMPAT_PRELITE2)
3372
3373static int
3374sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3375{
3376	int error;
3377	struct vfsconf *vfsp;
3378	struct ovfsconf ovfs;
3379
3380	vfsconf_slock();
3381	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3382		bzero(&ovfs, sizeof(ovfs));
3383		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3384		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3385		ovfs.vfc_index = vfsp->vfc_typenum;
3386		ovfs.vfc_refcount = vfsp->vfc_refcount;
3387		ovfs.vfc_flags = vfsp->vfc_flags;
3388		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3389		if (error != 0) {
3390			vfsconf_sunlock();
3391			return (error);
3392		}
3393	}
3394	vfsconf_sunlock();
3395	return (0);
3396}
3397
3398#endif /* 1 || COMPAT_PRELITE2 */
3399#endif /* !BURN_BRIDGES */
3400
3401#define KINFO_VNODESLOP		10
3402#ifdef notyet
3403/*
3404 * Dump vnode list (via sysctl).
3405 */
3406/* ARGSUSED */
3407static int
3408sysctl_vnode(SYSCTL_HANDLER_ARGS)
3409{
3410	struct xvnode *xvn;
3411	struct mount *mp;
3412	struct vnode *vp;
3413	int error, len, n;
3414
3415	/*
3416	 * Stale numvnodes access is not fatal here.
3417	 */
3418	req->lock = 0;
3419	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3420	if (!req->oldptr)
3421		/* Make an estimate */
3422		return (SYSCTL_OUT(req, 0, len));
3423
3424	error = sysctl_wire_old_buffer(req, 0);
3425	if (error != 0)
3426		return (error);
3427	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3428	n = 0;
3429	mtx_lock(&mountlist_mtx);
3430	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3431		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3432			continue;
3433		MNT_ILOCK(mp);
3434		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3435			if (n == len)
3436				break;
3437			vref(vp);
3438			xvn[n].xv_size = sizeof *xvn;
3439			xvn[n].xv_vnode = vp;
3440			xvn[n].xv_id = 0;	/* XXX compat */
3441#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3442			XV_COPY(usecount);
3443			XV_COPY(writecount);
3444			XV_COPY(holdcnt);
3445			XV_COPY(mount);
3446			XV_COPY(numoutput);
3447			XV_COPY(type);
3448#undef XV_COPY
3449			xvn[n].xv_flag = vp->v_vflag;
3450
3451			switch (vp->v_type) {
3452			case VREG:
3453			case VDIR:
3454			case VLNK:
3455				break;
3456			case VBLK:
3457			case VCHR:
3458				if (vp->v_rdev == NULL) {
3459					vrele(vp);
3460					continue;
3461				}
3462				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3463				break;
3464			case VSOCK:
3465				xvn[n].xv_socket = vp->v_socket;
3466				break;
3467			case VFIFO:
3468				xvn[n].xv_fifo = vp->v_fifoinfo;
3469				break;
3470			case VNON:
3471			case VBAD:
3472			default:
3473				/* shouldn't happen? */
3474				vrele(vp);
3475				continue;
3476			}
3477			vrele(vp);
3478			++n;
3479		}
3480		MNT_IUNLOCK(mp);
3481		mtx_lock(&mountlist_mtx);
3482		vfs_unbusy(mp);
3483		if (n == len)
3484			break;
3485	}
3486	mtx_unlock(&mountlist_mtx);
3487
3488	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3489	free(xvn, M_TEMP);
3490	return (error);
3491}
3492
3493SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
3494    CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
3495    "");
3496#endif
3497
3498/*
3499 * Unmount all filesystems. The list is traversed in reverse order
3500 * of mounting to avoid dependencies.
3501 */
3502void
3503vfs_unmountall(void)
3504{
3505	struct mount *mp;
3506	struct thread *td;
3507	int error;
3508
3509	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3510	td = curthread;
3511
3512	/*
3513	 * Since this only runs when rebooting, it is not interlocked.
3514	 */
3515	while(!TAILQ_EMPTY(&mountlist)) {
3516		mp = TAILQ_LAST(&mountlist, mntlist);
3517		vfs_ref(mp);
3518		error = dounmount(mp, MNT_FORCE, td);
3519		if (error != 0) {
3520			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3521			/*
3522			 * XXX: Due to the way in which we mount the root
3523			 * file system off of devfs, devfs will generate a
3524			 * "busy" warning when we try to unmount it before
3525			 * the root.  Don't print a warning as a result in
3526			 * order to avoid false positive errors that may
3527			 * cause needless upset.
3528			 */
3529			if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3530				printf("unmount of %s failed (",
3531				    mp->mnt_stat.f_mntonname);
3532				if (error == EBUSY)
3533					printf("BUSY)\n");
3534				else
3535					printf("%d)\n", error);
3536			}
3537		} else {
3538			/* The unmount has removed mp from the mountlist */
3539		}
3540	}
3541}
3542
3543/*
3544 * Perform msync on all vnodes under a mount point.
3545 * The mount point must be locked.
3546 */
3547void
3548vfs_msync(struct mount *mp, int flags)
3549{
3550	struct vnode *vp, *mvp;
3551	struct vm_object *obj;
3552
3553	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3554	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
3555		obj = vp->v_object;
3556		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3557		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3558			if (!vget(vp,
3559			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3560			    curthread)) {
3561				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3562					vput(vp);
3563					continue;
3564				}
3565
3566				obj = vp->v_object;
3567				if (obj != NULL) {
3568					VM_OBJECT_WLOCK(obj);
3569					vm_object_page_clean(obj, 0, 0,
3570					    flags == MNT_WAIT ?
3571					    OBJPC_SYNC : OBJPC_NOSYNC);
3572					VM_OBJECT_WUNLOCK(obj);
3573				}
3574				vput(vp);
3575			}
3576		} else
3577			VI_UNLOCK(vp);
3578	}
3579}
3580
3581static void
3582destroy_vpollinfo_free(struct vpollinfo *vi)
3583{
3584
3585	knlist_destroy(&vi->vpi_selinfo.si_note);
3586	mtx_destroy(&vi->vpi_lock);
3587	uma_zfree(vnodepoll_zone, vi);
3588}
3589
3590static void
3591destroy_vpollinfo(struct vpollinfo *vi)
3592{
3593
3594	knlist_clear(&vi->vpi_selinfo.si_note, 1);
3595	seldrain(&vi->vpi_selinfo);
3596	destroy_vpollinfo_free(vi);
3597}
3598
3599/*
3600 * Initialize the per-vnode helper structure to hold poll-related state.
3601 */
3602void
3603v_addpollinfo(struct vnode *vp)
3604{
3605	struct vpollinfo *vi;
3606
3607	if (vp->v_pollinfo != NULL)
3608		return;
3609	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3610	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3611	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3612	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3613	VI_LOCK(vp);
3614	if (vp->v_pollinfo != NULL) {
3615		VI_UNLOCK(vp);
3616		destroy_vpollinfo_free(vi);
3617		return;
3618	}
3619	vp->v_pollinfo = vi;
3620	VI_UNLOCK(vp);
3621}
3622
3623/*
3624 * Record a process's interest in events which might happen to
3625 * a vnode.  Because poll uses the historic select-style interface
3626 * internally, this routine serves as both the ``check for any
3627 * pending events'' and the ``record my interest in future events''
3628 * functions.  (These are done together, while the lock is held,
3629 * to avoid race conditions.)
3630 */
3631int
3632vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3633{
3634
3635	v_addpollinfo(vp);
3636	mtx_lock(&vp->v_pollinfo->vpi_lock);
3637	if (vp->v_pollinfo->vpi_revents & events) {
3638		/*
3639		 * This leaves events we are not interested
3640		 * in available for the other process which
3641		 * presumably had requested them
3642		 * (otherwise they would never have been
3643		 * recorded).
3644		 */
3645		events &= vp->v_pollinfo->vpi_revents;
3646		vp->v_pollinfo->vpi_revents &= ~events;
3647
3648		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3649		return (events);
3650	}
3651	vp->v_pollinfo->vpi_events |= events;
3652	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3653	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3654	return (0);
3655}
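/*
 * Illustrative sketch (not part of the original file): a filesystem's
 * VOP_POLL method can forward to this routine so that already-posted
 * revents are returned immediately and future interest is recorded under
 * vpi_lock; this is roughly what vop_stdpoll() does for events outside
 * the standard set.  examplefs_poll below is hypothetical:
 *
 *	static int
 *	examplefs_poll(struct vop_poll_args *ap)
 *	{
 *
 *		return (vn_pollrecord(ap->a_vp, ap->a_td, ap->a_events));
 *	}
 */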
3656
3657/*
3658 * Routine to create and manage a filesystem syncer vnode.
3659 */
3660#define sync_close ((int (*)(struct  vop_close_args *))nullop)
3661static int	sync_fsync(struct  vop_fsync_args *);
3662static int	sync_inactive(struct  vop_inactive_args *);
3663static int	sync_reclaim(struct  vop_reclaim_args *);
3664
3665static struct vop_vector sync_vnodeops = {
3666	.vop_bypass =	VOP_EOPNOTSUPP,
3667	.vop_close =	sync_close,		/* close */
3668	.vop_fsync =	sync_fsync,		/* fsync */
3669	.vop_inactive =	sync_inactive,	/* inactive */
3670	.vop_reclaim =	sync_reclaim,	/* reclaim */
3671	.vop_lock1 =	vop_stdlock,	/* lock */
3672	.vop_unlock =	vop_stdunlock,	/* unlock */
3673	.vop_islocked =	vop_stdislocked,	/* islocked */
3674};
3675
3676/*
3677 * Create a new filesystem syncer vnode for the specified mount point.
3678 */
3679void
3680vfs_allocate_syncvnode(struct mount *mp)
3681{
3682	struct vnode *vp;
3683	struct bufobj *bo;
3684	static long start, incr, next;
3685	int error;
3686
3687	/* Allocate a new vnode */
3688	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3689	if (error != 0)
3690		panic("vfs_allocate_syncvnode: getnewvnode() failed");
3691	vp->v_type = VNON;
3692	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3693	vp->v_vflag |= VV_FORCEINSMQ;
3694	error = insmntque(vp, mp);
3695	if (error != 0)
3696		panic("vfs_allocate_syncvnode: insmntque() failed");
3697	vp->v_vflag &= ~VV_FORCEINSMQ;
3698	VOP_UNLOCK(vp, 0);
3699	/*
3700	 * Place the vnode onto the syncer worklist. We attempt to
3701	 * scatter them about on the list so that they will go off
3702	 * at evenly distributed times even if all the filesystems
3703	 * are mounted at once.
3704	 */
3705	next += incr;
3706	if (next == 0 || next > syncer_maxdelay) {
3707		start /= 2;
3708		incr /= 2;
3709		if (start == 0) {
3710			start = syncer_maxdelay / 2;
3711			incr = syncer_maxdelay;
3712		}
3713		next = start;
3714	}
3715	bo = &vp->v_bufobj;
3716	BO_LOCK(bo);
3717	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3718	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3719	mtx_lock(&sync_mtx);
3720	sync_vnode_count++;
3721	if (mp->mnt_syncer == NULL) {
3722		mp->mnt_syncer = vp;
3723		vp = NULL;
3724	}
3725	mtx_unlock(&sync_mtx);
3726	BO_UNLOCK(bo);
3727	if (vp != NULL) {
3728		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3729		vgone(vp);
3730		vput(vp);
3731	}
3732}
3733
3734void
3735vfs_deallocate_syncvnode(struct mount *mp)
3736{
3737	struct vnode *vp;
3738
3739	mtx_lock(&sync_mtx);
3740	vp = mp->mnt_syncer;
3741	if (vp != NULL)
3742		mp->mnt_syncer = NULL;
3743	mtx_unlock(&sync_mtx);
3744	if (vp != NULL)
3745		vrele(vp);
3746}
3747
3748/*
3749 * Do a lazy sync of the filesystem.
3750 */
3751static int
3752sync_fsync(struct vop_fsync_args *ap)
3753{
3754	struct vnode *syncvp = ap->a_vp;
3755	struct mount *mp = syncvp->v_mount;
3756	int error, save;
3757	struct bufobj *bo;
3758
3759	/*
3760	 * We only need to do something if this is a lazy evaluation.
3761	 */
3762	if (ap->a_waitfor != MNT_LAZY)
3763		return (0);
3764
3765	/*
3766	 * Move ourselves to the back of the sync list.
3767	 */
3768	bo = &syncvp->v_bufobj;
3769	BO_LOCK(bo);
3770	vn_syncer_add_to_worklist(bo, syncdelay);
3771	BO_UNLOCK(bo);
3772
3773	/*
3774	 * Walk the list of vnodes pushing all that are dirty and
3775	 * not already on the sync list.
3776	 */
3777	if (vfs_busy(mp, MBF_NOWAIT) != 0)
3778		return (0);
3779	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
3780		vfs_unbusy(mp);
3781		return (0);
3782	}
3783	save = curthread_pflags_set(TDP_SYNCIO);
3784	vfs_msync(mp, MNT_NOWAIT);
3785	error = VFS_SYNC(mp, MNT_LAZY);
3786	curthread_pflags_restore(save);
3787	vn_finished_write(mp);
3788	vfs_unbusy(mp);
3789	return (error);
3790}
3791
3792/*
3793 * The syncer vnode is no longer referenced.
3794 */
3795static int
3796sync_inactive(struct vop_inactive_args *ap)
3797{
3798
3799	vgone(ap->a_vp);
3800	return (0);
3801}
3802
3803/*
3804 * The syncer vnode is no longer needed and is being decommissioned.
3805 *
3806 * Modifications to the worklist must be protected by sync_mtx.
3807 */
3808static int
3809sync_reclaim(struct vop_reclaim_args *ap)
3810{
3811	struct vnode *vp = ap->a_vp;
3812	struct bufobj *bo;
3813
3814	bo = &vp->v_bufobj;
3815	BO_LOCK(bo);
3816	mtx_lock(&sync_mtx);
3817	if (vp->v_mount->mnt_syncer == vp)
3818		vp->v_mount->mnt_syncer = NULL;
3819	if (bo->bo_flag & BO_ONWORKLST) {
3820		LIST_REMOVE(bo, bo_synclist);
3821		syncer_worklist_len--;
3822		sync_vnode_count--;
3823		bo->bo_flag &= ~BO_ONWORKLST;
3824	}
3825	mtx_unlock(&sync_mtx);
3826	BO_UNLOCK(bo);
3827
3828	return (0);
3829}
3830
3831/*
3832 * Check if vnode represents a disk device
3833 */
3834int
3835vn_isdisk(struct vnode *vp, int *errp)
3836{
3837	int error;
3838
3839	error = 0;
3840	dev_lock();
3841	if (vp->v_type != VCHR)
3842		error = ENOTBLK;
3843	else if (vp->v_rdev == NULL)
3844		error = ENXIO;
3845	else if (vp->v_rdev->si_devsw == NULL)
3846		error = ENXIO;
3847	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
3848		error = ENOTBLK;
3849	dev_unlock();
3850	if (errp != NULL)
3851		*errp = error;
3852	return (error == 0);
3853}
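
/*
 * Illustrative sketch (hypothetical mount-time check): a filesystem that
 * needs a disk device as backing store can use vn_isdisk() to reject
 * anything else; "devvp" stands in for the looked-up device vnode.
 */
#if 0
	int error;

	if (!vn_isdisk(devvp, &error))
		return (error);		/* ENOTBLK or ENXIO */
#endif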
3854
3855/*
3856 * Common filesystem object access control check routine.  Accepts a
3857 * vnode's type, "mode", uid and gid, requested access mode, credentials,
3858 * and optional call-by-reference privused argument allowing vaccess()
3859 * to indicate to the caller whether privilege was used to satisfy the
3860 * request (obsoleted).  Returns 0 on success, or an errno on failure.
3861 */
3862int
3863vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
3864    accmode_t accmode, struct ucred *cred, int *privused)
3865{
3866	accmode_t dac_granted;
3867	accmode_t priv_granted;
3868
3869	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
3870	    ("invalid bit in accmode"));
3871	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
3872	    ("VAPPEND without VWRITE"));
3873
3874	/*
3875	 * Look for a normal, non-privileged way to access the file/directory
3876	 * as requested.  If it exists, go with that.
3877	 */
3878
3879	if (privused != NULL)
3880		*privused = 0;
3881
3882	dac_granted = 0;
3883
3884	/* Check the owner. */
3885	if (cred->cr_uid == file_uid) {
3886		dac_granted |= VADMIN;
3887		if (file_mode & S_IXUSR)
3888			dac_granted |= VEXEC;
3889		if (file_mode & S_IRUSR)
3890			dac_granted |= VREAD;
3891		if (file_mode & S_IWUSR)
3892			dac_granted |= (VWRITE | VAPPEND);
3893
3894		if ((accmode & dac_granted) == accmode)
3895			return (0);
3896
3897		goto privcheck;
3898	}
3899
3900	/* Otherwise, check the groups (first match) */
3901	if (groupmember(file_gid, cred)) {
3902		if (file_mode & S_IXGRP)
3903			dac_granted |= VEXEC;
3904		if (file_mode & S_IRGRP)
3905			dac_granted |= VREAD;
3906		if (file_mode & S_IWGRP)
3907			dac_granted |= (VWRITE | VAPPEND);
3908
3909		if ((accmode & dac_granted) == accmode)
3910			return (0);
3911
3912		goto privcheck;
3913	}
3914
3915	/* Otherwise, check everyone else. */
3916	if (file_mode & S_IXOTH)
3917		dac_granted |= VEXEC;
3918	if (file_mode & S_IROTH)
3919		dac_granted |= VREAD;
3920	if (file_mode & S_IWOTH)
3921		dac_granted |= (VWRITE | VAPPEND);
3922	if ((accmode & dac_granted) == accmode)
3923		return (0);
3924
3925privcheck:
3926	/*
3927	 * Build a privilege mask to determine if the set of privileges
3928	 * satisfies the requirements when combined with the granted mask
3929	 * from above.  For each privilege, if the privilege is required,
3930	 * bitwise or the request type onto the priv_granted mask.
3931	 */
3932	priv_granted = 0;
3933
3934	if (type == VDIR) {
3935		/*
3936		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
3937		 * requests, instead of PRIV_VFS_EXEC.
3938		 */
3939		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3940		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
3941			priv_granted |= VEXEC;
3942	} else {
3943		/*
3944		 * Ensure that at least one execute bit is on. Otherwise,
3945		 * a privileged user will always succeed, and we don't want
3946		 * this to happen unless the file really is executable.
3947		 */
3948		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
3949		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
3950		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
3951			priv_granted |= VEXEC;
3952	}
3953
3954	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
3955	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
3956		priv_granted |= VREAD;
3957
3958	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
3959	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
3960		priv_granted |= (VWRITE | VAPPEND);
3961
3962	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
3963	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
3964		priv_granted |= VADMIN;
3965
3966	if ((accmode & (priv_granted | dac_granted)) == accmode) {
3967		/* XXX audit: privilege used */
3968		if (privused != NULL)
3969			*privused = 1;
3970		return (0);
3971	}
3972
3973	return ((accmode & VADMIN) ? EPERM : EACCES);
3974}
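
/*
 * Illustrative sketch: the typical caller is a filesystem's VOP_ACCESS
 * method, which feeds its on-disk ownership and mode bits to vaccess().
 * "myfs_node" and its fields are hypothetical stand-ins for a private
 * inode structure.
 */
#if 0
static int
myfs_access(struct vop_access_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct myfs_node *np = vp->v_data;

	return (vaccess(vp->v_type, np->n_mode, np->n_uid, np->n_gid,
	    ap->a_accmode, ap->a_cred, NULL));
}
#endif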
3975
3976/*
3977 * Credential check based on process requesting service, and per-attribute
3978 * permissions.
3979 */
3980int
3981extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
3982    struct thread *td, accmode_t accmode)
3983{
3984
3985	/*
3986	 * Kernel-invoked always succeeds.
3987	 */
3988	if (cred == NOCRED)
3989		return (0);
3990
3991	/*
3992	 * Do not allow privileged processes in jail to directly manipulate
3993	 * system attributes.
3994	 */
3995	switch (attrnamespace) {
3996	case EXTATTR_NAMESPACE_SYSTEM:
3997		/* Potentially should be: return (EPERM); */
3998		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
3999	case EXTATTR_NAMESPACE_USER:
4000		return (VOP_ACCESS(vp, accmode, cred, td));
4001	default:
4002		return (EPERM);
4003	}
4004}
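
/*
 * Illustrative sketch: an extended-attribute VOP would normally gate the
 * operation on extattr_check_cred() before touching backing storage,
 * passing VREAD for retrieval and VWRITE for modification.  The
 * surrounding function and its omitted body are hypothetical.
 */
#if 0
static int
myfs_getextattr(struct vop_getextattr_args *ap)
{
	int error;

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error != 0)
		return (error);
	/* ... copy the named attribute into ap->a_uio ... */
	return (0);
}
#endif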
4005
4006#ifdef DEBUG_VFS_LOCKS
4007/*
4008 * This only exists to suppress warnings from unlocked specfs accesses.  It is
4009 * no longer ok to have an unlocked VFS.
4010 */
4011#define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
4012	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
4013
4014int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
4015SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
4016    "Drop into debugger on lock violation");
4017
4018int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
4019SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
4020    0, "Check for interlock across VOPs");
4021
4022int vfs_badlock_print = 1;	/* Print lock violations. */
4023SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
4024    0, "Print lock violations");
4025
4026#ifdef KDB
4027int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
4028SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
4029    &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
4030#endif
4031
4032static void
4033vfs_badlock(const char *msg, const char *str, struct vnode *vp)
4034{
4035
4036#ifdef KDB
4037	if (vfs_badlock_backtrace)
4038		kdb_backtrace();
4039#endif
4040	if (vfs_badlock_print)
4041		printf("%s: %p %s\n", str, (void *)vp, msg);
4042	if (vfs_badlock_ddb)
4043		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4044}
4045
4046void
4047assert_vi_locked(struct vnode *vp, const char *str)
4048{
4049
4050	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
4051		vfs_badlock("interlock is not locked but should be", str, vp);
4052}
4053
4054void
4055assert_vi_unlocked(struct vnode *vp, const char *str)
4056{
4057
4058	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
4059		vfs_badlock("interlock is locked but should not be", str, vp);
4060}
4061
4062void
4063assert_vop_locked(struct vnode *vp, const char *str)
4064{
4065	int locked;
4066
4067	if (!IGNORE_LOCK(vp)) {
4068		locked = VOP_ISLOCKED(vp);
4069		if (locked == 0 || locked == LK_EXCLOTHER)
4070			vfs_badlock("is not locked but should be", str, vp);
4071	}
4072}
4073
4074void
4075assert_vop_unlocked(struct vnode *vp, const char *str)
4076{
4077
4078	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
4079		vfs_badlock("is locked but should not be", str, vp);
4080}
4081
4082void
4083assert_vop_elocked(struct vnode *vp, const char *str)
4084{
4085
4086	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
4087		vfs_badlock("is not exclusive locked but should be", str, vp);
4088}
4089
4090#if 0
4091void
4092assert_vop_elocked_other(struct vnode *vp, const char *str)
4093{
4094
4095	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
4096		vfs_badlock("is not exclusive locked by another thread",
4097		    str, vp);
4098}
4099
4100void
4101assert_vop_slocked(struct vnode *vp, const char *str)
4102{
4103
4104	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
4105		vfs_badlock("is not locked shared but should be", str, vp);
4106}
4107#endif /* 0 */
4108#endif /* DEBUG_VFS_LOCKS */
4109
4110void
4111vop_rename_fail(struct vop_rename_args *ap)
4112{
4113
4114	if (ap->a_tvp != NULL)
4115		vput(ap->a_tvp);
4116	if (ap->a_tdvp == ap->a_tvp)
4117		vrele(ap->a_tdvp);
4118	else
4119		vput(ap->a_tdvp);
4120	vrele(ap->a_fdvp);
4121	vrele(ap->a_fvp);
4122}
4123
4124void
4125vop_rename_pre(void *ap)
4126{
4127	struct vop_rename_args *a = ap;
4128
4129#ifdef DEBUG_VFS_LOCKS
4130	if (a->a_tvp)
4131		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4132	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4133	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4134	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4135
4136	/* Check the source (from). */
4137	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4138	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4139		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4140	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
4141		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
4142
4143	/* Check the target. */
4144	if (a->a_tvp)
4145		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
4146	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
4147#endif
4148	if (a->a_tdvp != a->a_fdvp)
4149		vhold(a->a_fdvp);
4150	if (a->a_tvp != a->a_fvp)
4151		vhold(a->a_fvp);
4152	vhold(a->a_tdvp);
4153	if (a->a_tvp)
4154		vhold(a->a_tvp);
4155}
4156
4157void
4158vop_strategy_pre(void *ap)
4159{
4160#ifdef DEBUG_VFS_LOCKS
4161	struct vop_strategy_args *a;
4162	struct buf *bp;
4163
4164	a = ap;
4165	bp = a->a_bp;
4166
4167	/*
4168	 * Cluster ops lock their component buffers but not the IO container.
4169	 */
4170	if ((bp->b_flags & B_CLUSTER) != 0)
4171		return;
4172
4173	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
4174		if (vfs_badlock_print)
4175			printf(
4176			    "VOP_STRATEGY: bp is not locked but should be\n");
4177		if (vfs_badlock_ddb)
4178			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4179	}
4180#endif
4181}
4182
4183void
4184vop_lock_pre(void *ap)
4185{
4186#ifdef DEBUG_VFS_LOCKS
4187	struct vop_lock1_args *a = ap;
4188
4189	if ((a->a_flags & LK_INTERLOCK) == 0)
4190		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4191	else
4192		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
4193#endif
4194}
4195
4196void
4197vop_lock_post(void *ap, int rc)
4198{
4199#ifdef DEBUG_VFS_LOCKS
4200	struct vop_lock1_args *a = ap;
4201
4202	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4203	if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
4204		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
4205#endif
4206}
4207
4208void
4209vop_unlock_pre(void *ap)
4210{
4211#ifdef DEBUG_VFS_LOCKS
4212	struct vop_unlock_args *a = ap;
4213
4214	if (a->a_flags & LK_INTERLOCK)
4215		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
4216	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
4217#endif
4218}
4219
4220void
4221vop_unlock_post(void *ap, int rc)
4222{
4223#ifdef DEBUG_VFS_LOCKS
4224	struct vop_unlock_args *a = ap;
4225
4226	if (a->a_flags & LK_INTERLOCK)
4227		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
4228#endif
4229}
4230
4231void
4232vop_create_post(void *ap, int rc)
4233{
4234	struct vop_create_args *a = ap;
4235
4236	if (!rc)
4237		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4238}
4239
4240void
4241vop_deleteextattr_post(void *ap, int rc)
4242{
4243	struct vop_deleteextattr_args *a = ap;
4244
4245	if (!rc)
4246		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4247}
4248
4249void
4250vop_link_post(void *ap, int rc)
4251{
4252	struct vop_link_args *a = ap;
4253
4254	if (!rc) {
4255		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4256		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4257	}
4258}
4259
4260void
4261vop_mkdir_post(void *ap, int rc)
4262{
4263	struct vop_mkdir_args *a = ap;
4264
4265	if (!rc)
4266		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4267}
4268
4269void
4270vop_mknod_post(void *ap, int rc)
4271{
4272	struct vop_mknod_args *a = ap;
4273
4274	if (!rc)
4275		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4276}
4277
4278void
4279vop_remove_post(void *ap, int rc)
4280{
4281	struct vop_remove_args *a = ap;
4282
4283	if (!rc) {
4284		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4285		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4286	}
4287}
4288
4289void
4290vop_rename_post(void *ap, int rc)
4291{
4292	struct vop_rename_args *a = ap;
4293
4294	if (!rc) {
4295		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4296		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4297		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4298		if (a->a_tvp)
4299			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4300	}
4301	if (a->a_tdvp != a->a_fdvp)
4302		vdrop(a->a_fdvp);
4303	if (a->a_tvp != a->a_fvp)
4304		vdrop(a->a_fvp);
4305	vdrop(a->a_tdvp);
4306	if (a->a_tvp)
4307		vdrop(a->a_tvp);
4308}
4309
4310void
4311vop_rmdir_post(void *ap, int rc)
4312{
4313	struct vop_rmdir_args *a = ap;
4314
4315	if (!rc) {
4316		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4317		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4318	}
4319}
4320
4321void
4322vop_setattr_post(void *ap, int rc)
4323{
4324	struct vop_setattr_args *a = ap;
4325
4326	if (!rc)
4327		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4328}
4329
4330void
4331vop_setextattr_post(void *ap, int rc)
4332{
4333	struct vop_setextattr_args *a = ap;
4334
4335	if (!rc)
4336		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4337}
4338
4339void
4340vop_symlink_post(void *ap, int rc)
4341{
4342	struct vop_symlink_args *a = ap;
4343
4344	if (!rc)
4345		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4346}
4347
4348static struct knlist fs_knlist;
4349
4350static void
4351vfs_event_init(void *arg)
4352{
4353	knlist_init_mtx(&fs_knlist, NULL);
4354}
4355/* XXX - correct order? */
4356SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4357
4358void
4359vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4360{
4361
4362	KNOTE_UNLOCKED(&fs_knlist, event);
4363}
4364
4365static int	filt_fsattach(struct knote *kn);
4366static void	filt_fsdetach(struct knote *kn);
4367static int	filt_fsevent(struct knote *kn, long hint);
4368
4369struct filterops fs_filtops = {
4370	.f_isfd = 0,
4371	.f_attach = filt_fsattach,
4372	.f_detach = filt_fsdetach,
4373	.f_event = filt_fsevent
4374};
4375
4376static int
4377filt_fsattach(struct knote *kn)
4378{
4379
4380	kn->kn_flags |= EV_CLEAR;
4381	knlist_add(&fs_knlist, kn, 0);
4382	return (0);
4383}
4384
4385static void
4386filt_fsdetach(struct knote *kn)
4387{
4388
4389	knlist_remove(&fs_knlist, kn, 0);
4390}
4391
4392static int
4393filt_fsevent(struct knote *kn, long hint)
4394{
4395
4396	kn->kn_fflags |= hint;
4397	return (kn->kn_fflags != 0);
4398}
4399
4400static int
4401sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4402{
4403	struct vfsidctl vc;
4404	int error;
4405	struct mount *mp;
4406
4407	error = SYSCTL_IN(req, &vc, sizeof(vc));
4408	if (error)
4409		return (error);
4410	if (vc.vc_vers != VFS_CTL_VERS1)
4411		return (EINVAL);
4412	mp = vfs_getvfs(&vc.vc_fsid);
4413	if (mp == NULL)
4414		return (ENOENT);
4415	/* ensure that a specific sysctl goes to the right filesystem. */
4416	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4417	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4418		vfs_rel(mp);
4419		return (EINVAL);
4420	}
4421	VCTLTOREQ(&vc, req);
4422	error = VFS_SYSCTL(mp, vc.vc_op, req);
4423	vfs_rel(mp);
4424	return (error);
4425}
4426
4427SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4428    NULL, 0, sysctl_vfs_ctl, "",
4429    "Sysctl by fsid");
4430
4431/*
4432 * Function to initialize a va_filerev field sensibly.
4433 * XXX: Wouldn't a random number make a lot more sense ??
4434 */
4435u_quad_t
4436init_va_filerev(void)
4437{
4438	struct bintime bt;
4439
4440	getbinuptime(&bt);
4441	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4442}
4443
4444static int	filt_vfsread(struct knote *kn, long hint);
4445static int	filt_vfswrite(struct knote *kn, long hint);
4446static int	filt_vfsvnode(struct knote *kn, long hint);
4447static void	filt_vfsdetach(struct knote *kn);
4448static struct filterops vfsread_filtops = {
4449	.f_isfd = 1,
4450	.f_detach = filt_vfsdetach,
4451	.f_event = filt_vfsread
4452};
4453static struct filterops vfswrite_filtops = {
4454	.f_isfd = 1,
4455	.f_detach = filt_vfsdetach,
4456	.f_event = filt_vfswrite
4457};
4458static struct filterops vfsvnode_filtops = {
4459	.f_isfd = 1,
4460	.f_detach = filt_vfsdetach,
4461	.f_event = filt_vfsvnode
4462};
4463
4464static void
4465vfs_knllock(void *arg)
4466{
4467	struct vnode *vp = arg;
4468
4469	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4470}
4471
4472static void
4473vfs_knlunlock(void *arg)
4474{
4475	struct vnode *vp = arg;
4476
4477	VOP_UNLOCK(vp, 0);
4478}
4479
4480static void
4481vfs_knl_assert_locked(void *arg)
4482{
4483#ifdef DEBUG_VFS_LOCKS
4484	struct vnode *vp = arg;
4485
4486	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
4487#endif
4488}
4489
4490static void
4491vfs_knl_assert_unlocked(void *arg)
4492{
4493#ifdef DEBUG_VFS_LOCKS
4494	struct vnode *vp = arg;
4495
4496	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
4497#endif
4498}
4499
4500int
4501vfs_kqfilter(struct vop_kqfilter_args *ap)
4502{
4503	struct vnode *vp = ap->a_vp;
4504	struct knote *kn = ap->a_kn;
4505	struct knlist *knl;
4506
4507	switch (kn->kn_filter) {
4508	case EVFILT_READ:
4509		kn->kn_fop = &vfsread_filtops;
4510		break;
4511	case EVFILT_WRITE:
4512		kn->kn_fop = &vfswrite_filtops;
4513		break;
4514	case EVFILT_VNODE:
4515		kn->kn_fop = &vfsvnode_filtops;
4516		break;
4517	default:
4518		return (EINVAL);
4519	}
4520
4521	kn->kn_hook = (caddr_t)vp;
4522
4523	v_addpollinfo(vp);
4524	if (vp->v_pollinfo == NULL)
4525		return (ENOMEM);
4526	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4527	vhold(vp);
4528	knlist_add(knl, kn, 0);
4529
4530	return (0);
4531}
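
/*
 * Illustrative user-space sketch of what vfs_kqfilter() serves: watching
 * a file for deletion, rename or attribute changes via EVFILT_VNODE.
 * Error handling is minimal and the program is not part of this file.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <err.h>
#include <fcntl.h>

int
main(int argc, char **argv)
{
	struct kevent kev;
	int fd, kq;

	if (argc != 2)
		errx(1, "usage: watch file");
	if ((kq = kqueue()) == -1 || (fd = open(argv[1], O_RDONLY)) == -1)
		err(1, "setup");
	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	    NOTE_DELETE | NOTE_RENAME | NOTE_ATTRIB, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)	/* register */
		err(1, "kevent");
	if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)	/* wait */
		warnx("vnode event, fflags 0x%x", (unsigned)kev.fflags);
	return (0);
}
#endif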
4532
4533/*
4534 * Detach knote from vnode
4535 */
4536static void
4537filt_vfsdetach(struct knote *kn)
4538{
4539	struct vnode *vp = (struct vnode *)kn->kn_hook;
4540
4541	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4542	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4543	vdrop(vp);
4544}
4545
4546/*ARGSUSED*/
4547static int
4548filt_vfsread(struct knote *kn, long hint)
4549{
4550	struct vnode *vp = (struct vnode *)kn->kn_hook;
4551	struct vattr va;
4552	int res;
4553
4554	/*
4555	 * filesystem is gone, so set the EOF flag and schedule
4556	 * the knote for deletion.
4557	 */
4558	if (hint == NOTE_REVOKE) {
4559		VI_LOCK(vp);
4560		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4561		VI_UNLOCK(vp);
4562		return (1);
4563	}
4564
4565	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4566		return (0);
4567
4568	VI_LOCK(vp);
4569	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4570	res = (kn->kn_data != 0);
4571	VI_UNLOCK(vp);
4572	return (res);
4573}
4574
4575/*ARGSUSED*/
4576static int
4577filt_vfswrite(struct knote *kn, long hint)
4578{
4579	struct vnode *vp = (struct vnode *)kn->kn_hook;
4580
4581	VI_LOCK(vp);
4582
4583	/*
4584	 * filesystem is gone, so set the EOF flag and schedule
4585	 * the knote for deletion.
4586	 */
4587	if (hint == NOTE_REVOKE)
4588		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4589
4590	kn->kn_data = 0;
4591	VI_UNLOCK(vp);
4592	return (1);
4593}
4594
4595static int
4596filt_vfsvnode(struct knote *kn, long hint)
4597{
4598	struct vnode *vp = (struct vnode *)kn->kn_hook;
4599	int res;
4600
4601	VI_LOCK(vp);
4602	if (kn->kn_sfflags & hint)
4603		kn->kn_fflags |= hint;
4604	if (hint == NOTE_REVOKE) {
4605		kn->kn_flags |= EV_EOF;
4606		VI_UNLOCK(vp);
4607		return (1);
4608	}
4609	res = (kn->kn_fflags != 0);
4610	VI_UNLOCK(vp);
4611	return (res);
4612}
4613
4614int
4615vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4616{
4617	int error;
4618
4619	if (dp->d_reclen > ap->a_uio->uio_resid)
4620		return (ENAMETOOLONG);
4621	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4622	if (error) {
4623		if (ap->a_ncookies != NULL) {
4624			if (ap->a_cookies != NULL)
4625				free(ap->a_cookies, M_TEMP);
4626			ap->a_cookies = NULL;
4627			*ap->a_ncookies = 0;
4628		}
4629		return (error);
4630	}
4631	if (ap->a_ncookies == NULL)
4632		return (0);
4633
4634	KASSERT(ap->a_cookies,
4635	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4636
4637	*ap->a_cookies = realloc(*ap->a_cookies,
4638	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4639	(*ap->a_cookies)[*ap->a_ncookies] = off;
4640	return (0);
4641}
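
/*
 * Illustrative sketch of the intended caller: a VOP_READDIR loop fills in
 * a struct dirent per entry and hands it to vfs_read_dirent(), treating
 * ENAMETOOLONG as "no room left in the user buffer" rather than a hard
 * error.  The entry source ("myfs_first", the "me_*" fields and "node")
 * is hypothetical.
 */
#if 0
	struct myfs_entry *me;
	struct dirent d;
	int error;

	error = 0;
	for (me = myfs_first(node, ap->a_uio->uio_offset); me != NULL;
	    me = me->me_next) {
		d.d_fileno = me->me_ino;
		d.d_type = me->me_type;
		d.d_namlen = strlen(me->me_name);
		strlcpy(d.d_name, me->me_name, sizeof(d.d_name));
		d.d_reclen = GENERIC_DIRSIZ(&d);
		/* me->me_off is this entry's seek cookie. */
		error = vfs_read_dirent(ap, &d, me->me_off);
		if (error != 0)
			break;
	}
#endif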
4642
4643/*
4644 * Mark for update the access time of the file if the filesystem
4645 * supports VOP_MARKATIME.  This functionality is used by execve and
4646 * mmap, so we want to avoid the I/O implied by directly setting
4647 * va_atime for the sake of efficiency.
4648 */
4649void
4650vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4651{
4652	struct mount *mp;
4653
4654	mp = vp->v_mount;
4655	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4656	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4657		(void)VOP_MARKATIME(vp);
4658}
4659
4660/*
4661 * The purpose of this routine is to remove granularity from accmode_t,
4662 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4663 * VADMIN and VAPPEND.
4664 *
4665 * If it returns 0, the caller is supposed to continue with the usual
4666 * access checks using 'accmode' as modified by this routine.  If it
4667 * returns a nonzero value, the caller is supposed to return that value
4668 * as errno.
4669 *
4670 * Note that after this routine runs, accmode may be zero.
4671 */
4672int
4673vfs_unixify_accmode(accmode_t *accmode)
4674{
4675	/*
4676	 * There is no way to specify explicit "deny" rule using
4677	 * file mode or POSIX.1e ACLs.
4678	 */
4679	if (*accmode & VEXPLICIT_DENY) {
4680		*accmode = 0;
4681		return (0);
4682	}
4683
4684	/*
4685	 * None of these can be translated into usual access bits.
4686	 * Also, the common case for NFSv4 ACLs is to not contain
4687	 * either of these bits. Caller should check for VWRITE
4688	 * on the containing directory instead.
4689	 */
4690	if (*accmode & (VDELETE_CHILD | VDELETE))
4691		return (EPERM);
4692
4693	if (*accmode & VADMIN_PERMS) {
4694		*accmode &= ~VADMIN_PERMS;
4695		*accmode |= VADMIN;
4696	}
4697
4698	/*
4699	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4700	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4701	 */
4702	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4703
4704	return (0);
4705}
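
/*
 * Illustrative sketch of the caller contract described above, roughly as
 * a default VOP_ACCESSX handler might apply it for a filesystem without
 * native NFSv4 ACL support:
 */
#if 0
	accmode_t accmode = ap->a_accmode;
	int error;

	error = vfs_unixify_accmode(&accmode);
	if (error != 0)
		return (error);
	if (accmode == 0)
		return (0);
	return (VOP_ACCESS(ap->a_vp, accmode, ap->a_cred, ap->a_td));
#endif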
4706
4707/*
4708 * These are helper functions for filesystems to traverse all
4709 * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4710 *
4711 * This interface replaces MNT_VNODE_FOREACH.
4712 */
4713
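/*
 * Illustrative sketch of a consumer: walk every vnode on a mount point,
 * skipping those that are not of interest.  The iterator hands back each
 * vnode with its interlock held; an early exit must go through
 * MNT_VNODE_FOREACH_ALL_ABORT() so the marker is cleaned up.
 */
#if 0
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
		if (vp->v_type != VREG) {
			VI_UNLOCK(vp);
			continue;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, curthread) != 0)
			continue;
		/* ... inspect or flush the vnode ... */
		vput(vp);
	}
#endif
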
4714MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4715
4716struct vnode *
4717__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4718{
4719	struct vnode *vp;
4720
4721	if (should_yield())
4722		kern_yield(PRI_USER);
4723	MNT_ILOCK(mp);
4724	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4725	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4726	while (vp != NULL && (vp->v_type == VMARKER ||
4727	    (vp->v_iflag & VI_DOOMED) != 0))
4728		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4729
4730	/* Check if we are done */
4731	if (vp == NULL) {
4732		__mnt_vnode_markerfree_all(mvp, mp);
4733		/* MNT_IUNLOCK(mp); -- done in above function */
4734		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4735		return (NULL);
4736	}
4737	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4738	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4739	VI_LOCK(vp);
4740	MNT_IUNLOCK(mp);
4741	return (vp);
4742}
4743
4744struct vnode *
4745__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4746{
4747	struct vnode *vp;
4748
4749	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4750	MNT_ILOCK(mp);
4751	MNT_REF(mp);
4752	(*mvp)->v_type = VMARKER;
4753
4754	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4755	while (vp != NULL && (vp->v_type == VMARKER ||
4756	    (vp->v_iflag & VI_DOOMED) != 0))
4757		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4758
4759	/* Check if we are done */
4760	if (vp == NULL) {
4761		MNT_REL(mp);
4762		MNT_IUNLOCK(mp);
4763		free(*mvp, M_VNODE_MARKER);
4764		*mvp = NULL;
4765		return (NULL);
4766	}
4767	(*mvp)->v_mount = mp;
4768	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4769	VI_LOCK(vp);
4770	MNT_IUNLOCK(mp);
4771	return (vp);
4772}
4773
4774
4775void
4776__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
4777{
4778
4779	if (*mvp == NULL) {
4780		MNT_IUNLOCK(mp);
4781		return;
4782	}
4783
4784	mtx_assert(MNT_MTX(mp), MA_OWNED);
4785
4786	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4787	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4788	MNT_REL(mp);
4789	MNT_IUNLOCK(mp);
4790	free(*mvp, M_VNODE_MARKER);
4791	*mvp = NULL;
4792}
4793
4794/*
4795 * These are helper functions for filesystems to traverse their
4796 * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
4797 */
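
/*
 * Illustrative sketch: the active-vnode walk looks the same from the
 * caller's side but only visits vnodes on the mount's active list; an
 * early exit must retire the marker with MNT_VNODE_FOREACH_ACTIVE_ABORT().
 * The "done" predicate is hypothetical.
 */
#if 0
	struct vnode *vp, *mvp;

	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
		if (done) {
			VI_UNLOCK(vp);
			MNT_VNODE_FOREACH_ACTIVE_ABORT(mp, mvp);
			break;
		}
		/* ... work on vp with its interlock held ... */
		VI_UNLOCK(vp);
	}
#endif
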
4798static void
4799mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4800{
4801
4802	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4803
4804	MNT_ILOCK(mp);
4805	MNT_REL(mp);
4806	MNT_IUNLOCK(mp);
4807	free(*mvp, M_VNODE_MARKER);
4808	*mvp = NULL;
4809}
4810
4811static struct vnode *
4812mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4813{
4814	struct vnode *vp, *nvp;
4815
4816	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
4817	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4818restart:
4819	vp = TAILQ_NEXT(*mvp, v_actfreelist);
4820	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4821	while (vp != NULL) {
4822		if (vp->v_type == VMARKER) {
4823			vp = TAILQ_NEXT(vp, v_actfreelist);
4824			continue;
4825		}
4826		if (!VI_TRYLOCK(vp)) {
4827			if (mp_ncpus == 1 || should_yield()) {
4828				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4829				mtx_unlock(&vnode_free_list_mtx);
4830				pause("vnacti", 1);
4831				mtx_lock(&vnode_free_list_mtx);
4832				goto restart;
4833			}
4834			continue;
4835		}
4836		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
4837		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
4838		    ("alien vnode on the active list %p %p", vp, mp));
4839		if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
4840			break;
4841		nvp = TAILQ_NEXT(vp, v_actfreelist);
4842		VI_UNLOCK(vp);
4843		vp = nvp;
4844	}
4845
4846	/* Check if we are done */
4847	if (vp == NULL) {
4848		mtx_unlock(&vnode_free_list_mtx);
4849		mnt_vnode_markerfree_active(mvp, mp);
4850		return (NULL);
4851	}
4852	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
4853	mtx_unlock(&vnode_free_list_mtx);
4854	ASSERT_VI_LOCKED(vp, "active iter");
4855	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
4856	return (vp);
4857}
4858
4859struct vnode *
4860__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
4861{
4862
4863	if (should_yield())
4864		kern_yield(PRI_USER);
4865	mtx_lock(&vnode_free_list_mtx);
4866	return (mnt_vnode_next_active(mvp, mp));
4867}
4868
4869struct vnode *
4870__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
4871{
4872	struct vnode *vp;
4873
4874	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4875	MNT_ILOCK(mp);
4876	MNT_REF(mp);
4877	MNT_IUNLOCK(mp);
4878	(*mvp)->v_type = VMARKER;
4879	(*mvp)->v_mount = mp;
4880
4881	mtx_lock(&vnode_free_list_mtx);
4882	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
4883	if (vp == NULL) {
4884		mtx_unlock(&vnode_free_list_mtx);
4885		mnt_vnode_markerfree_active(mvp, mp);
4886		return (NULL);
4887	}
4888	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
4889	return (mnt_vnode_next_active(mvp, mp));
4890}
4891
4892void
4893__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
4894{
4895
4896	if (*mvp == NULL)
4897		return;
4898
4899	mtx_lock(&vnode_free_list_mtx);
4900	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
4901	mtx_unlock(&vnode_free_list_mtx);
4902	mnt_vnode_markerfree_active(mvp, mp);
4903}
4904