vfs_subr.c revision 292895
1/*-
2 * Copyright (c) 1989, 1993
3 *	The Regents of the University of California.  All rights reserved.
4 * (c) UNIX System Laboratories, Inc.
5 * All or some portions of this file are derived from material licensed
6 * to the University of California by American Telephone and Telegraph
7 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
8 * the permission of UNIX System Laboratories, Inc.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 * 4. Neither the name of the University nor the names of its contributors
19 *    may be used to endorse or promote products derived from this software
20 *    without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 *	@(#)vfs_subr.c	8.31 (Berkeley) 5/26/95
35 */
36
37/*
38 * External virtual filesystem routines
39 */
40
41#include <sys/cdefs.h>
42__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_subr.c 292895 2015-12-30 00:04:33Z mckusick $");
43
44#include "opt_compat.h"
45#include "opt_ddb.h"
46#include "opt_watchdog.h"
47
48#include <sys/param.h>
49#include <sys/systm.h>
50#include <sys/bio.h>
51#include <sys/buf.h>
52#include <sys/condvar.h>
53#include <sys/conf.h>
54#include <sys/dirent.h>
55#include <sys/event.h>
56#include <sys/eventhandler.h>
57#include <sys/extattr.h>
58#include <sys/file.h>
59#include <sys/fcntl.h>
60#include <sys/jail.h>
61#include <sys/kdb.h>
62#include <sys/kernel.h>
63#include <sys/kthread.h>
64#include <sys/lockf.h>
65#include <sys/malloc.h>
66#include <sys/mount.h>
67#include <sys/namei.h>
68#include <sys/pctrie.h>
69#include <sys/priv.h>
70#include <sys/reboot.h>
71#include <sys/rwlock.h>
72#include <sys/sched.h>
73#include <sys/sleepqueue.h>
74#include <sys/smp.h>
75#include <sys/stat.h>
76#include <sys/sysctl.h>
77#include <sys/syslog.h>
78#include <sys/vmmeter.h>
79#include <sys/vnode.h>
80#include <sys/watchdog.h>
81
82#include <machine/stdarg.h>
83
84#include <security/mac/mac_framework.h>
85
86#include <vm/vm.h>
87#include <vm/vm_object.h>
88#include <vm/vm_extern.h>
89#include <vm/pmap.h>
90#include <vm/vm_map.h>
91#include <vm/vm_page.h>
92#include <vm/vm_kern.h>
93#include <vm/uma.h>
94
95#ifdef DDB
96#include <ddb/ddb.h>
97#endif
98
99static void	delmntque(struct vnode *vp);
100static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
101		    int slpflag, int slptimeo);
102static void	syncer_shutdown(void *arg, int howto);
103static int	vtryrecycle(struct vnode *vp);
104static void	v_incr_usecount(struct vnode *);
105static void	v_decr_usecount(struct vnode *);
106static void	v_decr_useonly(struct vnode *);
107static void	v_upgrade_usecount(struct vnode *);
108static void	vnlru_free(int);
109static void	vgonel(struct vnode *);
110static void	vfs_knllock(void *arg);
111static void	vfs_knlunlock(void *arg);
112static void	vfs_knl_assert_locked(void *arg);
113static void	vfs_knl_assert_unlocked(void *arg);
114static void	destroy_vpollinfo(struct vpollinfo *vi);
115
116/*
117 * Number of vnodes in existence.  Increased whenever getnewvnode()
118 * allocates a new vnode, decreased in vdropl() for VI_DOOMED vnode.
119 */
120static unsigned long	numvnodes;
121
122SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
123    "Number of vnodes in existence");
124
125static u_long vnodes_created;
126SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
127    0, "Number of vnodes created by getnewvnode");
128
129/*
130 * Conversion tables for conversion from vnode types to inode formats
131 * and back.
132 */
133enum vtype iftovt_tab[16] = {
134	VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
135	VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
136};
137int vttoif_tab[10] = {
138	0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
139	S_IFSOCK, S_IFIFO, S_IFMT, S_IFMT
140};
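/*
 * Illustrative sketch (not compiled here): how the tables above are
 * typically consumed.  The IFTOVT() and VTTOIF() macros in sys/vnode.h
 * index them in roughly this way; the helper names below are
 * placeholders.
 */
#if 0
static enum vtype
mode_to_vtype(mode_t mode)
{

	/* The S_IFMT file-type bits occupy the top nibble of the mode. */
	return (iftovt_tab[(mode & S_IFMT) >> 12]);
}

static int
vtype_to_ifmt(enum vtype type)
{

	return (vttoif_tab[(int)type]);
}
#endif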
141
142/*
143 * List of vnodes that are ready for recycling.
144 */
145static TAILQ_HEAD(freelst, vnode) vnode_free_list;
146
147/*
148 * "Free" vnode target.  Free vnodes are rarely completely free, but are
149 * just ones that are cheap to recycle.  Usually they are for files which
150 * have been stat'd but not read; these usually have inode and namecache
151 * data attached to them.  This target is the preferred minimum size of a
152 * sub-cache consisting mostly of such files. The system balances the size
153 * of this sub-cache with its complement to try to prevent either from
154 * thrashing while the other is relatively inactive.  The targets express
155 * a preference for the best balance.
156 *
157 * "Above" this target there are 2 further targets (watermarks) related
158 * to recycling of free vnodes.  In the best-operating case, the cache is
159 * exactly full, the free list has size between vlowat and vhiwat above the
160 * free target, and recycling from it and normal use maintains this state.
161 * Sometimes the free list is below vlowat or even empty, but this state
162 * is even better for immediate use provided the cache is not full.
163 * Otherwise, vnlru_proc() runs to reclaim enough vnodes (usually non-free
164 * ones) to reach one of these states.  The watermarks are currently hard-
165 * coded as 4% and 9% of the available space higher.  These and the default
166 * of 25% for wantfreevnodes are too large if the memory size is large.
167 * E.g., 9% of 75% of MAXVNODES is more than 566000 vnodes to reclaim
168 * whenever vnlru_proc() becomes active.
169 */
170static u_long wantfreevnodes;
171SYSCTL_ULONG(_vfs, OID_AUTO, wantfreevnodes, CTLFLAG_RW,
172    &wantfreevnodes, 0, "Target for minimum number of \"free\" vnodes");
173static u_long freevnodes;
174SYSCTL_ULONG(_vfs, OID_AUTO, freevnodes, CTLFLAG_RD,
175    &freevnodes, 0, "Number of \"free\" vnodes");
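/*
 * A worked example of the figure quoted above, using the MAXVNODES_MAX
 * default defined later in this file:
 *
 *	MAXVNODES_MAX = 512 * 1024 * 1024 / 64           = 8388608
 *	75% of that (gapvnodes, with wantfreevnodes 25%)  = 6291456
 *	 9% of that (roughly the vhiwat watermark)       ~=  566231
 *
 * so one activation of vnlru_proc() on a machine running at the vnode
 * cap may indeed have more than 566000 vnodes to reclaim.
 */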
176
177/*
178 * The vfs.vlru_allow_cache_src sysctl variable is no longer used but
179 * the sysctl remains to provide ABI compatibility. The new code frees
180 * namecache sources as the last chance to satisfy the highest watermark,
181 * instead of selecting the source vnodes randomly. This provides good
182 * enough behaviour to keep vn_fullpath() working in most situations.
183 * The filesystem layout with deep trees, where the deprecated knob was
184 * required, is thus handled automatically.
185 */
186static int vlru_allow_cache_src;
187SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
188    &vlru_allow_cache_src, 0, "Placeholder for API compatibility (unused)");
189
190static u_long recycles_count;
191SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
192    "Number of vnodes recycled to meet vnode cache targets");
193
194/*
195 * Various variables used for debugging the new implementation of
196 * reassignbuf().
197 * XXX these are probably of (very) limited utility now.
198 */
199static int reassignbufcalls;
200SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
201    "Number of calls to reassignbuf");
202
203static u_long free_owe_inact;
204SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0,
205    "Number of times free vnodes kept on active list due to VFS "
206    "owing inactivation");
207
208/*
209 * Cache for the mount type id assigned to NFS.  This is used for
210 * special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
211 */
212int	nfs_mount_type = -1;
213
214/* To keep more than one thread at a time from running vfs_getnewfsid */
215static struct mtx mntid_mtx;
216
217/*
218 * Lock for any access to the following:
219 *	vnode_free_list
220 *	numvnodes
221 *	freevnodes
222 */
223static struct mtx vnode_free_list_mtx;
224
225/* Publicly exported FS */
226struct nfs_public nfs_pub;
227
228static uma_zone_t buf_trie_zone;
229
230/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
231static uma_zone_t vnode_zone;
232static uma_zone_t vnodepoll_zone;
233
234/*
235 * The workitem queue.
236 *
237 * It is useful to delay writes of file data and filesystem metadata
238 * for tens of seconds so that quickly created and deleted files need
239 * not waste disk bandwidth being created and removed. To realize this,
240 * we append vnodes to a "workitem" queue. When running with a soft
241 * updates implementation, most pending metadata dependencies should
242 * not wait for more than a few seconds. Thus, metadata updates on block
243 * devices are delayed only about half the time that file data is delayed.
244 * Similarly, directory updates are more critical, so they are delayed
245 * only about a third of the time that file data is delayed. Thus, there are
246 * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
247 * one each second (driven off the filesystem syncer process). The
248 * syncer_delayno variable indicates the next queue that is to be processed.
249 * Items that need to be processed soon are placed in this queue:
250 *
251 *	syncer_workitem_pending[syncer_delayno]
252 *
253 * A delay of fifteen seconds is done by placing the request fifteen
254 * entries later in the queue:
255 *
256 *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
257 *
258 */
259static int syncer_delayno;
260static long syncer_mask;
261LIST_HEAD(synclist, bufobj);
262static struct synclist *syncer_workitem_pending;
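/*
 * Illustrative sketch (not compiled here): queueing a bufobj with a
 * given delay, in the spirit of vn_syncer_add_to_worklist() defined
 * later in this file.  The real routine also removes the bufobj from
 * any previous slot and maintains syncer_worklist_len.
 */
#if 0
static void
syncer_queue_sketch(struct bufobj *bo, int delay)
{
	int slot;

	mtx_lock(&sync_mtx);
	if (delay > syncer_maxdelay - 2)
		delay = syncer_maxdelay - 2;
	/* The per-second queues form a ring of syncer_mask + 1 slots. */
	slot = (syncer_delayno + delay) & syncer_mask;
	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
	mtx_unlock(&sync_mtx);
}
#endif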
263/*
264 * The sync_mtx protects:
265 *	bo->bo_synclist
266 *	sync_vnode_count
267 *	syncer_delayno
268 *	syncer_state
269 *	syncer_workitem_pending
270 *	syncer_worklist_len
271 *	rushjob
272 */
273static struct mtx sync_mtx;
274static struct cv sync_wakeup;
275
276#define SYNCER_MAXDELAY		32
277static int syncer_maxdelay = SYNCER_MAXDELAY;	/* maximum delay time */
278static int syncdelay = 30;		/* max time to delay syncing data */
279static int filedelay = 30;		/* time to delay syncing files */
280SYSCTL_INT(_kern, OID_AUTO, filedelay, CTLFLAG_RW, &filedelay, 0,
281    "Time to delay syncing files (in seconds)");
282static int dirdelay = 29;		/* time to delay syncing directories */
283SYSCTL_INT(_kern, OID_AUTO, dirdelay, CTLFLAG_RW, &dirdelay, 0,
284    "Time to delay syncing directories (in seconds)");
285static int metadelay = 28;		/* time to delay syncing metadata */
286SYSCTL_INT(_kern, OID_AUTO, metadelay, CTLFLAG_RW, &metadelay, 0,
287    "Time to delay syncing metadata (in seconds)");
288static int rushjob;		/* number of slots to run ASAP */
289static int stat_rush_requests;	/* number of times I/O speeded up */
290SYSCTL_INT(_debug, OID_AUTO, rush_requests, CTLFLAG_RW, &stat_rush_requests, 0,
291    "Number of times I/O speeded up (rush requests)");
292
293/*
294 * When shutting down the syncer, run it at four times normal speed.
295 */
296#define SYNCER_SHUTDOWN_SPEEDUP		4
297static int sync_vnode_count;
298static int syncer_worklist_len;
299static enum { SYNCER_RUNNING, SYNCER_SHUTTING_DOWN, SYNCER_FINAL_DELAY }
300    syncer_state;
301
302/* Target for maximum number of vnodes. */
303int desiredvnodes;
304static int gapvnodes;		/* gap between wanted and desired */
305static int vhiwat;		/* enough extras after expansion */
306static int vlowat;		/* minimal extras before expansion */
307static int vstir;		/* nonzero to stir non-free vnodes */
308static volatile int vsmalltrigger = 8;	/* pref to keep if > this many pages */
309
310static int
311sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
312{
313	int error, old_desiredvnodes;
314
315	old_desiredvnodes = desiredvnodes;
316	if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
317		return (error);
318	if (old_desiredvnodes != desiredvnodes) {
319		wantfreevnodes = desiredvnodes / 4;
320		/* XXX locking seems to be incomplete. */
321		vfs_hash_changesize(desiredvnodes);
322		cache_changesize(desiredvnodes);
323	}
324	return (0);
325}
326
327SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
328    CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
329    sysctl_update_desiredvnodes, "I", "Target for maximum number of vnodes");
330SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
331    &wantfreevnodes, 0, "Old name for vfs.wantfreevnodes (legacy)");
332static int vnlru_nowhere;
333SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
334    &vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
335
336/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
337static int vnsz2log;
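/*
 * vntblinit() below computes vnsz2log as floor(log2(sizeof(struct vnode))),
 * so that getnewvnode() can derive a cheap per-vnode hash seed:
 *
 *	vp->v_hash = (uintptr_t)vp >> vnsz2log;
 *
 * i.e., the vnode address with its low-order (largely redundant) bits
 * shifted away.
 */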
338
339/*
340 * Support for the bufobj clean & dirty pctrie.
341 */
342static void *
343buf_trie_alloc(struct pctrie *ptree)
344{
345
346	return uma_zalloc(buf_trie_zone, M_NOWAIT);
347}
348
349static void
350buf_trie_free(struct pctrie *ptree, void *node)
351{
352
353	uma_zfree(buf_trie_zone, node);
354}
355PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
356
357/*
358 * Initialize the vnode management data structures.
359 *
360 * Reevaluate the following cap on the number of vnodes after the physical
361 * memory size exceeds 512GB.  In the limit, as the physical memory size
362 * grows, the ratio of the memory size in KB to vnodes approaches 64:1.
363 */
364#ifndef	MAXVNODES_MAX
365#define	MAXVNODES_MAX	(512 * 1024 * 1024 / 64)	/* 8M */
366#endif
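/*
 * A worked check of the cap above: 512GB of physical memory is
 * 512 * 1024 * 1024 = 536870912 KB, and at the limiting ratio of 64KB of
 * memory per vnode that allows 536870912 / 64 = 8388608 vnodes, the "8M"
 * noted in the comment.
 */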
367
368/*
369 * Initialize a vnode as it first enters the zone.
370 */
371static int
372vnode_init(void *mem, int size, int flags)
373{
374	struct vnode *vp;
375	struct bufobj *bo;
376
377	vp = mem;
378	bzero(vp, size);
379	/*
380	 * Setup locks.
381	 */
382	vp->v_vnlock = &vp->v_lock;
383	mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
384	/*
385	 * By default, don't allow shared locks unless filesystems opt-in.
386	 */
387	lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
388	    LK_NOSHARE | LK_IS_VNODE);
389	/*
390	 * Initialize bufobj.
391	 */
392	bo = &vp->v_bufobj;
393	bo->__bo_vnode = vp;
394	rw_init(BO_LOCKPTR(bo), "bufobj interlock");
395	bo->bo_private = vp;
396	TAILQ_INIT(&bo->bo_clean.bv_hd);
397	TAILQ_INIT(&bo->bo_dirty.bv_hd);
398	/*
399	 * Initialize namecache.
400	 */
401	LIST_INIT(&vp->v_cache_src);
402	TAILQ_INIT(&vp->v_cache_dst);
403	/*
404	 * Initialize rangelocks.
405	 */
406	rangelock_init(&vp->v_rl);
407	return (0);
408}
409
410/*
411 * Free a vnode when it is cleared from the zone.
412 */
413static void
414vnode_fini(void *mem, int size)
415{
416	struct vnode *vp;
417	struct bufobj *bo;
418
419	vp = mem;
420	rangelock_destroy(&vp->v_rl);
421	lockdestroy(vp->v_vnlock);
422	mtx_destroy(&vp->v_interlock);
423	bo = &vp->v_bufobj;
424	rw_destroy(BO_LOCKPTR(bo));
425}
426
427static void
428vntblinit(void *dummy __unused)
429{
430	u_int i;
431	int physvnodes, virtvnodes;
432
433	/*
434	 * Desiredvnodes is a function of the physical memory size and the
435	 * kernel's heap size.  Generally speaking, it scales with the
436	 * physical memory size.  The ratio of desiredvnodes to the physical
437	 * memory size is 1:16 until desiredvnodes exceeds 98,304.
438	 * Thereafter, the
439	 * marginal ratio of desiredvnodes to the physical memory size is
440	 * 1:64.  However, desiredvnodes is limited by the kernel's heap
441	 * size.  The memory required by desiredvnodes vnodes and vm objects
442	 * must not exceed 1/7th of the kernel's heap size.
443	 */
444	physvnodes = maxproc + pgtok(cnt.v_page_count) / 64 +
445	    3 * min(98304 * 16, pgtok(cnt.v_page_count)) / 64;
446	virtvnodes = vm_kmem_size / (7 * (sizeof(struct vm_object) +
447	    sizeof(struct vnode)));
448	desiredvnodes = min(physvnodes, virtvnodes);
449	if (desiredvnodes > MAXVNODES_MAX) {
450		if (bootverbose)
451			printf("Reducing kern.maxvnodes %d -> %d\n",
452			    desiredvnodes, MAXVNODES_MAX);
453		desiredvnodes = MAXVNODES_MAX;
454	}
455	wantfreevnodes = desiredvnodes / 4;
456	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
457	TAILQ_INIT(&vnode_free_list);
458	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
459	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
460	    vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
461	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
462	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
463	/*
464	 * Preallocate enough nodes to support one per buf so that
465	 * we cannot fail an insert.  reassignbuf() callers cannot
466	 * tolerate insertion failure.
467	 */
468	buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
469	    NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
470	    UMA_ZONE_NOFREE | UMA_ZONE_VM);
471	uma_prealloc(buf_trie_zone, nbuf);
472	/*
473	 * Initialize the filesystem syncer.
474	 */
475	syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
476	    &syncer_mask);
477	syncer_maxdelay = syncer_mask + 1;
478	mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
479	cv_init(&sync_wakeup, "syncer");
480	for (i = 1; i <= sizeof(struct vnode); i <<= 1)
481		vnsz2log++;
482	vnsz2log--;
483}
484SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vntblinit, NULL);
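/*
 * A worked example of the physvnodes formula above, for a hypothetical
 * machine with 4GB of physical memory (pgtok(cnt.v_page_count) ~= 4194304
 * KB), ignoring the comparatively small maxproc term:
 *
 *	4194304 / 64                       =  65536
 *	3 * min(98304 * 16, 4194304) / 64  =  73728	(capped term)
 *	physvnodes                        ~= 139264
 *
 * Below the 98304 * 16 = 1572864 KB (1.5GB) cap, both terms scale with
 * memory and sum to pgtok / 16, the 1:16 ratio mentioned above; beyond
 * it only the first term keeps growing, giving the 1:64 marginal ratio.
 */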
485
486
487/*
488 * Mark a mount point as busy. Used to synchronize access and to delay
489 * unmounting.  Note that mountlist_mtx is not released on failure.
490 *
491 * vfs_busy() is a custom lock, it can block the caller.
492 * vfs_busy() only sleeps if the unmount is active on the mount point.
493 * For a mountpoint mp, the vfs_busy-enforced lock is ordered before the
494 * lock of any vnode belonging to mp.
495 *
496 * Lookup uses vfs_busy() to traverse mount points.
497 * root fs			var fs
498 * / vnode lock		A	/ vnode lock (/var)		D
499 * /var vnode lock	B	/log vnode lock(/var/log)	E
500 * vfs_busy lock	C	vfs_busy lock			F
501 *
502 * Within each file system, the lock order is C->A->B and F->D->E.
503 *
504 * When traversing across mounts, the system follows that lock order:
505 *
506 *        C->A->B
507 *              |
508 *              +->F->D->E
509 *
510 * The lookup() process for namei("/var") illustrates the process:
511 *  VOP_LOOKUP() obtains B while A is held
512 *  vfs_busy() obtains a shared lock on F while A and B are held
513 *  vput() releases lock on B
514 *  vput() releases lock on A
515 *  VFS_ROOT() obtains lock on D while shared lock on F is held
516 *  vfs_unbusy() releases shared lock on F
517 *  vn_lock() obtains lock on deadfs vnode vp_crossmp instead of A.
518 *    Attempting to lock A (instead of vp_crossmp) while D is held would
519 *    violate the global order, causing deadlocks.
520 *
521 * dounmount() locks B while F is drained.
522 */
523int
524vfs_busy(struct mount *mp, int flags)
525{
526
527	MPASS((flags & ~MBF_MASK) == 0);
528	CTR3(KTR_VFS, "%s: mp %p with flags %d", __func__, mp, flags);
529
530	MNT_ILOCK(mp);
531	MNT_REF(mp);
532	/*
533	 * If the mount point is currently being unmounted, sleep until the
534	 * mount point's fate is decided.  If the thread doing the unmounting
535	 * fails, it will clear the MNTK_UNMOUNT flag before waking us up,
536	 * indicating that this mount point has survived the unmount attempt
537	 * and vfs_busy should retry.  Otherwise the unmounting thread will
538	 * set the MNTK_REFEXPIRE flag in addition to MNTK_UNMOUNT, indicating
539	 * that the mount point is about to be really destroyed.  vfs_busy
540	 * needs to release its reference on the mount point in this case and
541	 * return with ENOENT, telling the caller that the mount point it
542	 * tried to busy is no longer valid.
543	 */
544	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
545		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
546			MNT_REL(mp);
547			MNT_IUNLOCK(mp);
548			CTR1(KTR_VFS, "%s: failed busying before sleeping",
549			    __func__);
550			return (ENOENT);
551		}
552		if (flags & MBF_MNTLSTLOCK)
553			mtx_unlock(&mountlist_mtx);
554		mp->mnt_kern_flag |= MNTK_MWAIT;
555		msleep(mp, MNT_MTX(mp), PVFS | PDROP, "vfs_busy", 0);
556		if (flags & MBF_MNTLSTLOCK)
557			mtx_lock(&mountlist_mtx);
558		MNT_ILOCK(mp);
559	}
560	if (flags & MBF_MNTLSTLOCK)
561		mtx_unlock(&mountlist_mtx);
562	mp->mnt_lockref++;
563	MNT_IUNLOCK(mp);
564	return (0);
565}
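/*
 * Illustrative sketch (not compiled here): the usual pattern for walking
 * the mount list with vfs_busy()/vfs_unbusy(), modelled on the loop in
 * vnlru_proc() later in this file.
 */
#if 0
static void
foreach_mount_sketch(void)
{
	struct mount *mp, *nmp;

	mtx_lock(&mountlist_mtx);
	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
			/* Skip mounts that are going away. */
			nmp = TAILQ_NEXT(mp, mnt_list);
			continue;
		}
		/* ... work on the busied mount; mountlist_mtx is dropped ... */
		mtx_lock(&mountlist_mtx);
		nmp = TAILQ_NEXT(mp, mnt_list);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);
}
#endif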
566
567/*
568 * Free a busy filesystem.
569 */
570void
571vfs_unbusy(struct mount *mp)
572{
573
574	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
575	MNT_ILOCK(mp);
576	MNT_REL(mp);
577	KASSERT(mp->mnt_lockref > 0, ("negative mnt_lockref"));
578	mp->mnt_lockref--;
579	if (mp->mnt_lockref == 0 && (mp->mnt_kern_flag & MNTK_DRAINING) != 0) {
580		MPASS(mp->mnt_kern_flag & MNTK_UNMOUNT);
581		CTR1(KTR_VFS, "%s: waking up waiters", __func__);
582		mp->mnt_kern_flag &= ~MNTK_DRAINING;
583		wakeup(&mp->mnt_lockref);
584	}
585	MNT_IUNLOCK(mp);
586}
587
588/*
589 * Lookup a mount point by filesystem identifier.
590 */
591struct mount *
592vfs_getvfs(fsid_t *fsid)
593{
594	struct mount *mp;
595
596	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
597	mtx_lock(&mountlist_mtx);
598	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
599		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
600		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
601			vfs_ref(mp);
602			mtx_unlock(&mountlist_mtx);
603			return (mp);
604		}
605	}
606	mtx_unlock(&mountlist_mtx);
607	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
608	return ((struct mount *) 0);
609}
610
611/*
612 * Lookup a mount point by filesystem identifier, busying it before
613 * returning.
614 *
615 * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
616 * cache for popular filesystem identifiers.  The cache is lockless, using
617 * the fact that struct mount is never freed.  In the worst case we may
618 * get a pointer to an unmounted or even a different filesystem, so we
619 * have to check what we got, and take the slow path if so.
620 */
621struct mount *
622vfs_busyfs(fsid_t *fsid)
623{
624#define	FSID_CACHE_SIZE	256
625	typedef struct mount * volatile vmp_t;
626	static vmp_t cache[FSID_CACHE_SIZE];
627	struct mount *mp;
628	int error;
629	uint32_t hash;
630
631	CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
632	hash = fsid->val[0] ^ fsid->val[1];
633	hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
634	mp = cache[hash];
635	if (mp == NULL ||
636	    mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
637	    mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
638		goto slow;
639	if (vfs_busy(mp, 0) != 0) {
640		cache[hash] = NULL;
641		goto slow;
642	}
643	if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
644	    mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
645		return (mp);
646	else
647	    vfs_unbusy(mp);
648
649slow:
650	mtx_lock(&mountlist_mtx);
651	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
652		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
653		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
654			error = vfs_busy(mp, MBF_MNTLSTLOCK);
655			if (error) {
656				cache[hash] = NULL;
657				mtx_unlock(&mountlist_mtx);
658				return (NULL);
659			}
660			cache[hash] = mp;
661			return (mp);
662		}
663	}
664	CTR2(KTR_VFS, "%s: lookup failed for %p id", __func__, fsid);
665	mtx_unlock(&mountlist_mtx);
666	return ((struct mount *) 0);
667}
668
669/*
670 * Check if a user can access privileged mount options.
671 */
672int
673vfs_suser(struct mount *mp, struct thread *td)
674{
675	int error;
676
677	/*
678	 * If the thread is jailed, but this is not a jail-friendly file
679	 * system, deny immediately.
680	 */
681	if (!(mp->mnt_vfc->vfc_flags & VFCF_JAIL) && jailed(td->td_ucred))
682		return (EPERM);
683
684	/*
685	 * If the file system was mounted outside the jail of the calling
686	 * thread, deny immediately.
687	 */
688	if (prison_check(td->td_ucred, mp->mnt_cred) != 0)
689		return (EPERM);
690
691	/*
692	 * If file system supports delegated administration, we don't check
693	 * for the PRIV_VFS_MOUNT_OWNER privilege - it will be better verified
694	 * by the file system itself.
695	 * If this is not the user that did original mount, we check for
696	 * the PRIV_VFS_MOUNT_OWNER privilege.
697	 */
698	if (!(mp->mnt_vfc->vfc_flags & VFCF_DELEGADMIN) &&
699	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
700		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
701			return (error);
702	}
703	return (0);
704}
705
706/*
707 * Get a new unique fsid.  Try to make its val[0] unique, since this value
708 * will be used to create fake device numbers for stat().  Also try (but
709 * not so hard) to make its val[0] unique mod 2^16, since some emulators only
710 * support 16-bit device numbers.  We end up with unique val[0]'s for the
711 * first 2^16 calls and unique val[0]'s mod 2^16 for the first 2^8 calls.
712 *
713 * Keep in mind that several mounts may be running in parallel.  Starting
714 * the search one past where the previous search terminated is both a
715 * micro-optimization and a defense against returning the same fsid to
716 * different mounts.
717 */
718void
719vfs_getnewfsid(struct mount *mp)
720{
721	static uint16_t mntid_base;
722	struct mount *nmp;
723	fsid_t tfsid;
724	int mtype;
725
726	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
727	mtx_lock(&mntid_mtx);
728	mtype = mp->mnt_vfc->vfc_typenum;
729	tfsid.val[1] = mtype;
730	mtype = (mtype & 0xFF) << 24;
731	for (;;) {
732		tfsid.val[0] = makedev(255,
733		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
734		mntid_base++;
735		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
736			break;
737		vfs_rel(nmp);
738	}
739	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
740	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
741	mtx_unlock(&mntid_mtx);
742}
743
744/*
745 * Knob to control the precision of file timestamps:
746 *
747 *   0 = seconds only; nanoseconds zeroed.
748 *   1 = seconds and nanoseconds, accurate within 1/HZ.
749 *   2 = seconds and nanoseconds, truncated to microseconds.
750 * >=3 = seconds and nanoseconds, maximum precision.
751 */
752enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
753
754static int timestamp_precision = TSP_USEC;
755SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
756    &timestamp_precision, 0, "File timestamp precision (0: seconds, "
757    "1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, "
758    "3+: sec + ns (max. precision))");
759
760/*
761 * Get a current timestamp.
762 */
763void
764vfs_timestamp(struct timespec *tsp)
765{
766	struct timeval tv;
767
768	switch (timestamp_precision) {
769	case TSP_SEC:
770		tsp->tv_sec = time_second;
771		tsp->tv_nsec = 0;
772		break;
773	case TSP_HZ:
774		getnanotime(tsp);
775		break;
776	case TSP_USEC:
777		microtime(&tv);
778		TIMEVAL_TO_TIMESPEC(&tv, tsp);
779		break;
780	case TSP_NSEC:
781	default:
782		nanotime(tsp);
783		break;
784	}
785}
786
787/*
788 * Set vnode attributes to VNOVAL
789 */
790void
791vattr_null(struct vattr *vap)
792{
793
794	vap->va_type = VNON;
795	vap->va_size = VNOVAL;
796	vap->va_bytes = VNOVAL;
797	vap->va_mode = VNOVAL;
798	vap->va_nlink = VNOVAL;
799	vap->va_uid = VNOVAL;
800	vap->va_gid = VNOVAL;
801	vap->va_fsid = VNOVAL;
802	vap->va_fileid = VNOVAL;
803	vap->va_blocksize = VNOVAL;
804	vap->va_rdev = VNOVAL;
805	vap->va_atime.tv_sec = VNOVAL;
806	vap->va_atime.tv_nsec = VNOVAL;
807	vap->va_mtime.tv_sec = VNOVAL;
808	vap->va_mtime.tv_nsec = VNOVAL;
809	vap->va_ctime.tv_sec = VNOVAL;
810	vap->va_ctime.tv_nsec = VNOVAL;
811	vap->va_birthtime.tv_sec = VNOVAL;
812	vap->va_birthtime.tv_nsec = VNOVAL;
813	vap->va_flags = VNOVAL;
814	vap->va_gen = VNOVAL;
815	vap->va_vaflags = 0;
816}
817
818/*
819 * This routine is called when we have too many vnodes.  It attempts
820 * to recycle a portion of them and will potentially free vnodes that still
821 * have VM backing store (VM backing store is typically the cause
822 * of a vnode blowout so we want to do this).  Therefore, this operation
823 * is not considered cheap.
824 *
825 * A number of conditions may prevent a vnode from being reclaimed:
826 * the buffer cache may have references on the vnode, a directory
827 * vnode may still have references due to the namei cache representing
828 * underlying files, or the vnode may be in active use.  It is not
829 * desirable to reuse such vnodes.  These conditions may cause the
830 * number of vnodes to reach some minimum value regardless of what
831 * you set kern.maxvnodes to.  Do not set kern.maxvnodes too low.
832 */
833static int
834vlrureclaim(struct mount *mp, int reclaim_nc_src, int trigger)
835{
836	struct vnode *vp;
837	int count, done, target;
838
839	done = 0;
840	vn_start_write(NULL, &mp, V_WAIT);
841	MNT_ILOCK(mp);
842	count = mp->mnt_nvnodelistsize;
843	target = count * (int64_t)gapvnodes / imax(desiredvnodes, 1);
844	target = target / 10 + 1;
845	while (count != 0 && done < target) {
846		vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
847		while (vp != NULL && vp->v_type == VMARKER)
848			vp = TAILQ_NEXT(vp, v_nmntvnodes);
849		if (vp == NULL)
850			break;
851		/*
852		 * XXX LRU is completely broken for non-free vnodes.  First
853		 * by calling here in mountpoint order, then by moving
854		 * unselected vnodes to the end here, and most grossly by
855		 * removing the vlruvp() function that was supposed to
856		 * maintain the order.  (This function was born broken
857		 * since syncer problems prevented it doing anything.)  The
858		 * order is closer to LRC (C = Created).
859		 *
860		 * LRU reclaiming of vnodes seems to have last worked in
861		 * FreeBSD-3 where LRU wasn't mentioned under any spelling.
862		 * Then there was no hold count, and inactive vnodes were
863		 * simply put on the free list in LRU order.  The separate
864		 * lists also break LRU.  We prefer to reclaim from the
865		 * free list for technical reasons.  This tends to thrash
866		 * the free list to keep very unrecently used held vnodes.
867		 * The problem is mitigated by keeping the free list large.
868		 */
869		TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
870		TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
871		--count;
872		if (!VI_TRYLOCK(vp))
873			goto next_iter;
874		/*
875		 * If it's been deconstructed already, it's still
876		 * referenced, or it exceeds the trigger, skip it.
877		 * Also skip free vnodes.  We are trying to make space
878		 * to expand the free list, not reduce it.
879		 */
880		if (vp->v_usecount ||
881		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
882		    ((vp->v_iflag & VI_FREE) != 0) ||
883		    (vp->v_iflag & VI_DOOMED) != 0 || (vp->v_object != NULL &&
884		    vp->v_object->resident_page_count > trigger)) {
885			VI_UNLOCK(vp);
886			goto next_iter;
887		}
888		MNT_IUNLOCK(mp);
889		vholdl(vp);
890		if (VOP_LOCK(vp, LK_INTERLOCK|LK_EXCLUSIVE|LK_NOWAIT)) {
891			vdrop(vp);
892			goto next_iter_mntunlocked;
893		}
894		VI_LOCK(vp);
895		/*
896		 * v_usecount may have been bumped after VOP_LOCK() dropped
897		 * the vnode interlock and before it was locked again.
898		 *
899		 * It is not necessary to recheck VI_DOOMED because it can
900		 * only be set by another thread that holds both the vnode
901		 * lock and vnode interlock.  If another thread has the
902		 * vnode lock before we get to VOP_LOCK() and obtains the
903		 * vnode interlock after VOP_LOCK() drops the vnode
904		 * interlock, the other thread will be unable to drop the
905		 * vnode lock before our VOP_LOCK() call fails.
906		 */
907		if (vp->v_usecount ||
908		    (!reclaim_nc_src && !LIST_EMPTY(&vp->v_cache_src)) ||
909		    (vp->v_iflag & VI_FREE) != 0 ||
910		    (vp->v_object != NULL &&
911		    vp->v_object->resident_page_count > trigger)) {
912			VOP_UNLOCK(vp, LK_INTERLOCK);
913			vdrop(vp);
914			goto next_iter_mntunlocked;
915		}
916		KASSERT((vp->v_iflag & VI_DOOMED) == 0,
917		    ("VI_DOOMED unexpectedly detected in vlrureclaim()"));
918		atomic_add_long(&recycles_count, 1);
919		vgonel(vp);
920		VOP_UNLOCK(vp, 0);
921		vdropl(vp);
922		done++;
923next_iter_mntunlocked:
924		if (!should_yield())
925			goto relock_mnt;
926		goto yield;
927next_iter:
928		if (!should_yield())
929			continue;
930		MNT_IUNLOCK(mp);
931yield:
932		kern_yield(PRI_USER);
933relock_mnt:
934		MNT_ILOCK(mp);
935	}
936	MNT_IUNLOCK(mp);
937	vn_finished_write(mp);
938	return done;
939}
940
941/*
942 * Attempt to reduce the free list by the requested amount.
943 */
944static void
945vnlru_free(int count)
946{
947	struct vnode *vp;
948
949	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
950	for (; count > 0; count--) {
951		vp = TAILQ_FIRST(&vnode_free_list);
952		/*
953		 * The list can be modified while the free_list_mtx
954		 * has been dropped and vp could be NULL here.
955		 */
956		if (!vp)
957			break;
958		VNASSERT(vp->v_op != NULL, vp,
959		    ("vnlru_free: vnode already reclaimed."));
960		KASSERT((vp->v_iflag & VI_FREE) != 0,
961		    ("Removing vnode not on freelist"));
962		KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
963		    ("Mangling active vnode"));
964		TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
965		/*
966		 * Don't recycle if we can't get the interlock.
967		 */
968		if (!VI_TRYLOCK(vp)) {
969			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
970			continue;
971		}
972		VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
973		    vp, ("vp inconsistent on freelist"));
974
975		/*
976		 * The clear of VI_FREE prevents activation of the
977		 * vnode.  There is no sense in putting the vnode on
978		 * the mount point active list, only to remove it
979		 * later during recycling.  Inline the relevant part
980		 * of vholdl(), to avoid triggering assertions or
981		 * activating.
982		 */
983		freevnodes--;
984		vp->v_iflag &= ~VI_FREE;
985		vp->v_holdcnt++;
986
987		mtx_unlock(&vnode_free_list_mtx);
988		VI_UNLOCK(vp);
989		vtryrecycle(vp);
990		/*
991		 * If the recycle succeeded, this vdrop will actually free
992		 * the vnode.  If not it will simply place it back on
993		 * the free list.
994		 */
995		vdrop(vp);
996		mtx_lock(&vnode_free_list_mtx);
997	}
998}
999
1000/* XXX some names and initialization are bad for limits and watermarks. */
1001static int
1002vspace(void)
1003{
1004	int space;
1005
1006	gapvnodes = imax(desiredvnodes - wantfreevnodes, 100);
1007	vhiwat = gapvnodes / 11; /* 9% -- just under the 10% in vlrureclaim() */
1008	vlowat = vhiwat / 2;
1009	if (numvnodes > desiredvnodes)
1010		return (0);
1011	space = desiredvnodes - numvnodes;
1012	if (freevnodes > wantfreevnodes)
1013		space += freevnodes - wantfreevnodes;
1014	return (space);
1015}
1016
1017/*
1018 * Attempt to recycle vnodes in a context that is always safe to block.
1019 * Calling vlrureclaim() from the bowels of filesystem code has some
1020 * interesting deadlock problems.
1021 */
1022static struct proc *vnlruproc;
1023static int vnlruproc_sig;
1024
1025static void
1026vnlru_proc(void)
1027{
1028	struct mount *mp, *nmp;
1029	unsigned long ofreevnodes, onumvnodes;
1030	int done, force, reclaim_nc_src, trigger, usevnodes;
1031
1032	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, vnlruproc,
1033	    SHUTDOWN_PRI_FIRST);
1034
1035	force = 0;
1036	for (;;) {
1037		kproc_suspend_check(vnlruproc);
1038		mtx_lock(&vnode_free_list_mtx);
1039		/*
1040		 * If numvnodes is too large (due to desiredvnodes being
1041		 * adjusted using its sysctl, or emergency growth), first
1042		 * try to reduce it by discarding from the free list.
1043		 */
1044		if (numvnodes > desiredvnodes && freevnodes > 0)
1045			vnlru_free(ulmin(numvnodes - desiredvnodes,
1046			    freevnodes));
1047		/*
1048		 * Sleep if the vnode cache is in a good state.  This is
1049		 * when it is not over-full and has space for about a 4%
1050		 * or 9% expansion (by growing its size or inexcessively
1051		 * reducing its free list).  Otherwise, try to reclaim
1052		 * space for a 10% expansion.
1053		 */
1054		if (vstir && force == 0) {
1055			force = 1;
1056			vstir = 0;
1057		}
1058		if (vspace() >= vlowat && force == 0) {
1059			vnlruproc_sig = 0;
1060			wakeup(&vnlruproc_sig);
1061			msleep(vnlruproc, &vnode_free_list_mtx,
1062			    PVFS|PDROP, "vlruwt", hz);
1063			continue;
1064		}
1065		mtx_unlock(&vnode_free_list_mtx);
1066		done = 0;
1067		ofreevnodes = freevnodes;
1068		onumvnodes = numvnodes;
1069		/*
1070		 * Calculate parameters for recycling.  These are the same
1071		 * throughout the loop to give some semblance of fairness.
1072		 * The trigger point is to avoid recycling vnodes with lots
1073		 * of resident pages.  We aren't trying to free memory; we
1074		 * are trying to recycle or at least free vnodes.
1075		 */
1076		if (numvnodes <= desiredvnodes)
1077			usevnodes = numvnodes - freevnodes;
1078		else
1079			usevnodes = numvnodes;
1080		if (usevnodes <= 0)
1081			usevnodes = 1;
1082		/*
1083		 * The trigger value is chosen to give a conservatively
1084		 * large value to ensure that it alone doesn't prevent
1085		 * making progress.  The value can easily be so large that
1086		 * it is effectively infinite in some congested and
1087		 * misconfigured cases, and this is necessary.  Normally
1088		 * it is about 8 to 100 (pages), which is quite large.
1089		 */
1090		trigger = cnt.v_page_count * 2 / usevnodes;
1091		if (force < 2)
1092			trigger = vsmalltrigger;
1093		reclaim_nc_src = force >= 3;
1094		mtx_lock(&mountlist_mtx);
1095		for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
1096			if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK)) {
1097				nmp = TAILQ_NEXT(mp, mnt_list);
1098				continue;
1099			}
1100			done += vlrureclaim(mp, reclaim_nc_src, trigger);
1101			mtx_lock(&mountlist_mtx);
1102			nmp = TAILQ_NEXT(mp, mnt_list);
1103			vfs_unbusy(mp);
1104		}
1105		mtx_unlock(&mountlist_mtx);
1106		if (onumvnodes > desiredvnodes && numvnodes <= desiredvnodes)
1107			uma_reclaim();
1108		if (done == 0) {
1109			if (force == 0 || force == 1) {
1110				force = 2;
1111				continue;
1112			}
1113			if (force == 2) {
1114				force = 3;
1115				continue;
1116			}
1117			force = 0;
1118			vnlru_nowhere++;
1119			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
1120		} else
1121			kern_yield(PRI_USER);
1122		/*
1123		 * After becoming active to expand above low water, keep
1124		 * active until above high water.
1125		 */
1126		force = vspace() < vhiwat;
1127	}
1128}
1129
1130static struct kproc_desc vnlru_kp = {
1131	"vnlru",
1132	vnlru_proc,
1133	&vnlruproc
1134};
1135SYSINIT(vnlru, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start,
1136    &vnlru_kp);
1137
1138/*
1139 * Routines having to do with the management of the vnode table.
1140 */
1141
1142/*
1143 * Try to recycle a freed vnode.  We abort if anyone picks up a reference
1144 * before we actually vgone().  This function must be called with the vnode
1145 * held to prevent the vnode from being returned to the free list midway
1146 * through vgone().
1147 */
1148static int
1149vtryrecycle(struct vnode *vp)
1150{
1151	struct mount *vnmp;
1152
1153	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
1154	VNASSERT(vp->v_holdcnt, vp,
1155	    ("vtryrecycle: Recycling vp %p without a reference.", vp));
1156	/*
1157	 * This vnode may be found and locked via some other list; if so we
1158	 * can't recycle it yet.
1159	 */
1160	if (VOP_LOCK(vp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
1161		CTR2(KTR_VFS,
1162		    "%s: impossible to recycle, vp %p lock is already held",
1163		    __func__, vp);
1164		return (EWOULDBLOCK);
1165	}
1166	/*
1167	 * Don't recycle if its filesystem is being suspended.
1168	 */
1169	if (vn_start_write(vp, &vnmp, V_NOWAIT) != 0) {
1170		VOP_UNLOCK(vp, 0);
1171		CTR2(KTR_VFS,
1172		    "%s: impossible to recycle, cannot start the write for %p",
1173		    __func__, vp);
1174		return (EBUSY);
1175	}
1176	/*
1177	 * If we got this far, we need to acquire the interlock and see if
1178	 * anyone picked up this vnode from another list.  If not, we will
1179	 * mark it with DOOMED via vgonel() so that anyone who does find it
1180	 * will skip over it.
1181	 */
1182	VI_LOCK(vp);
1183	if (vp->v_usecount) {
1184		VOP_UNLOCK(vp, LK_INTERLOCK);
1185		vn_finished_write(vnmp);
1186		CTR2(KTR_VFS,
1187		    "%s: impossible to recycle, %p is already referenced",
1188		    __func__, vp);
1189		return (EBUSY);
1190	}
1191	if ((vp->v_iflag & VI_DOOMED) == 0) {
1192		atomic_add_long(&recycles_count, 1);
1193		vgonel(vp);
1194	}
1195	VOP_UNLOCK(vp, LK_INTERLOCK);
1196	vn_finished_write(vnmp);
1197	return (0);
1198}
1199
1200static void
1201vcheckspace(void)
1202{
1203
1204	if (vspace() < vlowat && vnlruproc_sig == 0) {
1205		vnlruproc_sig = 1;
1206		wakeup(vnlruproc);
1207	}
1208}
1209
1210/*
1211 * Wait if necessary for space for a new vnode.
1212 */
1213static int
1214getnewvnode_wait(int suspended)
1215{
1216
1217	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
1218	if (numvnodes >= desiredvnodes) {
1219		if (suspended) {
1220			/*
1221			 * The file system is being suspended.  We cannot
1222			 * risk a deadlock here, so allow allocation of
1223			 * another vnode even if this would give too many.
1224			 */
1225			return (0);
1226		}
1227		if (vnlruproc_sig == 0) {
1228			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
1229			wakeup(vnlruproc);
1230		}
1231		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
1232		    "vlruwk", hz);
1233	}
1234	/* Post-adjust like the pre-adjust in getnewvnode(). */
1235	if (numvnodes + 1 > desiredvnodes && freevnodes > 1)
1236		vnlru_free(1);
1237	return (numvnodes >= desiredvnodes ? ENFILE : 0);
1238}
1239
1240/*
1241 * This hack is fragile, and probably not needed any more now that the
1242 * watermark handling works.
1243 */
1244void
1245getnewvnode_reserve(u_int count)
1246{
1247	struct thread *td;
1248
1249	/* Pre-adjust like the pre-adjust in getnewvnode(), with any count. */
1250	/* XXX no longer so quick, but this part is not racy. */
1251	mtx_lock(&vnode_free_list_mtx);
1252	if (numvnodes + count > desiredvnodes && freevnodes > wantfreevnodes)
1253		vnlru_free(ulmin(numvnodes + count - desiredvnodes,
1254		    freevnodes - wantfreevnodes));
1255	mtx_unlock(&vnode_free_list_mtx);
1256
1257	td = curthread;
1258	/* First try to be quick and racy. */
1259	if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
1260		td->td_vp_reserv += count;
1261		vcheckspace();	/* XXX no longer so quick, but more racy */
1262		return;
1263	} else
1264		atomic_subtract_long(&numvnodes, count);
1265
1266	mtx_lock(&vnode_free_list_mtx);
1267	while (count > 0) {
1268		if (getnewvnode_wait(0) == 0) {
1269			count--;
1270			td->td_vp_reserv++;
1271			atomic_add_long(&numvnodes, 1);
1272		}
1273	}
1274	vcheckspace();
1275	mtx_unlock(&vnode_free_list_mtx);
1276}
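/*
 * Illustrative sketch (not compiled here): the intended pairing of
 * getnewvnode_reserve() with getnewvnode_drop_reserve() below.
 * "somefs" and somefs_vnodeops are placeholders for a real filesystem's
 * tag and vop vector.
 */
#if 0
static int
somefs_alloc_vnode_sketch(struct mount *mp, struct vnode **vpp)
{
	int error;

	/*
	 * Reserve up front so that the later getnewvnode() call does not
	 * have to wait for vnode cache space while filesystem locks or
	 * transactions are held.
	 */
	getnewvnode_reserve(1);
	/* ... acquire filesystem locks, start a transaction, ... */
	error = getnewvnode("somefs", mp, &somefs_vnodeops, vpp);
	/* ... */
	getnewvnode_drop_reserve();
	return (error);
}
#endif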
1277
1278/*
1279 * This hack is fragile, especially if desiredvnodes or wantfreevnodes are
1280 * misconfigured or changed significantly.  Reducing desiredvnodes below
1281 * the reserved amount should cause bizarre behaviour like reducing it
1282 * below the number of active vnodes -- the system will try to reduce
1283 * numvnodes to match, but should fail, so the subtraction below should
1284 * not overflow.
1285 */
1286void
1287getnewvnode_drop_reserve(void)
1288{
1289	struct thread *td;
1290
1291	td = curthread;
1292	atomic_subtract_long(&numvnodes, td->td_vp_reserv);
1293	td->td_vp_reserv = 0;
1294}
1295
1296/*
1297 * Return the next vnode from the free list.
1298 */
1299int
1300getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
1301    struct vnode **vpp)
1302{
1303	struct vnode *vp;
1304	struct thread *td;
1305	struct lock_object *lo;
1306	static int cyclecount;
1307	int error;
1308
1309	CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
1310	vp = NULL;
1311	td = curthread;
1312	if (td->td_vp_reserv > 0) {
1313		td->td_vp_reserv -= 1;
1314		goto alloc;
1315	}
1316	mtx_lock(&vnode_free_list_mtx);
1317	if (numvnodes < desiredvnodes)
1318		cyclecount = 0;
1319	else if (cyclecount++ >= freevnodes) {
1320		cyclecount = 0;
1321		vstir = 1;
1322	}
1323	/*
1324	 * Grow the vnode cache if it will not be above its target max
1325	 * after growing.  Otherwise, if the free list is nonempty, try
1326	 * to reclaim 1 item from it before growing the cache (possibly
1327	 * above its target max if the reclamation failed or is delayed).
1328	 * Otherwise, wait for some space.  In all cases, schedule
1329	 * vnlru_proc() if we are getting short of space.  The watermarks
1330	 * should be chosen so that we never wait or even reclaim from
1331	 * the free list to below its target minimum.
1332	 */
1333	if (numvnodes + 1 <= desiredvnodes)
1334		;
1335	else if (freevnodes > 0)
1336		vnlru_free(1);
1337	else {
1338		error = getnewvnode_wait(mp != NULL && (mp->mnt_kern_flag &
1339		    MNTK_SUSPEND));
1340#if 0	/* XXX Not all VFS_VGET/ffs_vget callers check returns. */
1341		if (error != 0) {
1342			mtx_unlock(&vnode_free_list_mtx);
1343			return (error);
1344		}
1345#endif
1346	}
1347	vcheckspace();
1348	atomic_add_long(&numvnodes, 1);
1349	mtx_unlock(&vnode_free_list_mtx);
1350alloc:
1351	atomic_add_long(&vnodes_created, 1);
1352	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
1353	/*
1354	 * Locks are given the generic name "vnode" when created.
1355	 * Follow the historic practice of using the filesystem
1356	 * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
1357	 *
1358	 * Locks live in a witness group keyed on their name. Thus,
1359	 * when a lock is renamed, it must also move from the witness
1360	 * group of its old name to the witness group of its new name.
1361	 *
1362	 * The change only needs to be made when the vnode moves
1363	 * from one filesystem type to another. We ensure that each
1364	 * filesystem use a single static name pointer for its tag so
1365	 * that we can compare pointers rather than doing a strcmp().
1366	 */
1367	lo = &vp->v_vnlock->lock_object;
1368	if (lo->lo_name != tag) {
1369		lo->lo_name = tag;
1370		WITNESS_DESTROY(lo);
1371		WITNESS_INIT(lo, tag);
1372	}
1373	/*
1374	 * By default, don't allow shared locks unless filesystems opt-in.
1375	 */
1376	vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
1377	/*
1378	 * Finalize various vnode identity bits.
1379	 */
1380	KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
1381	KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
1382	KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
1383	vp->v_type = VNON;
1384	vp->v_tag = tag;
1385	vp->v_op = vops;
1386	v_incr_usecount(vp);
1387	vp->v_bufobj.bo_ops = &buf_ops_bio;
1388#ifdef MAC
1389	mac_vnode_init(vp);
1390	if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
1391		mac_vnode_associate_singlelabel(mp, vp);
1392	else if (mp == NULL && vops != &dead_vnodeops)
1393		printf("NULL mp in getnewvnode()\n");
1394#endif
1395	if (mp != NULL) {
1396		vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
1397		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
1398			vp->v_vflag |= VV_NOKNOTE;
1399	}
1400
1401	/*
1402	 * For the filesystems which do not use vfs_hash_insert(),
1403	 * still initialize v_hash so that vfs_hash_index() stays useful.
1404	 * E.g., nullfs uses vfs_hash_index() on the lower vnode for
1405	 * its own hashing.
1406	 */
1407	vp->v_hash = (uintptr_t)vp >> vnsz2log;
1408
1409	*vpp = vp;
1410	return (0);
1411}
1412
1413/*
1414 * Delete from old mount point vnode list, if on one.
1415 */
1416static void
1417delmntque(struct vnode *vp)
1418{
1419	struct mount *mp;
1420	int active;
1421
1422	mp = vp->v_mount;
1423	if (mp == NULL)
1424		return;
1425	MNT_ILOCK(mp);
1426	VI_LOCK(vp);
1427	KASSERT(mp->mnt_activevnodelistsize <= mp->mnt_nvnodelistsize,
1428	    ("Active vnode list size %d > Vnode list size %d",
1429	     mp->mnt_activevnodelistsize, mp->mnt_nvnodelistsize));
1430	active = vp->v_iflag & VI_ACTIVE;
1431	vp->v_iflag &= ~VI_ACTIVE;
1432	if (active) {
1433		mtx_lock(&vnode_free_list_mtx);
1434		TAILQ_REMOVE(&mp->mnt_activevnodelist, vp, v_actfreelist);
1435		mp->mnt_activevnodelistsize--;
1436		mtx_unlock(&vnode_free_list_mtx);
1437	}
1438	vp->v_mount = NULL;
1439	VI_UNLOCK(vp);
1440	VNASSERT(mp->mnt_nvnodelistsize > 0, vp,
1441		("bad mount point vnode list size"));
1442	TAILQ_REMOVE(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1443	mp->mnt_nvnodelistsize--;
1444	MNT_REL(mp);
1445	MNT_IUNLOCK(mp);
1446}
1447
1448static void
1449insmntque_stddtr(struct vnode *vp, void *dtr_arg)
1450{
1451
1452	vp->v_data = NULL;
1453	vp->v_op = &dead_vnodeops;
1454	vgone(vp);
1455	vput(vp);
1456}
1457
1458/*
1459 * Insert into list of vnodes for the new mount point, if available.
1460 */
1461int
1462insmntque1(struct vnode *vp, struct mount *mp,
1463	void (*dtr)(struct vnode *, void *), void *dtr_arg)
1464{
1465
1466	KASSERT(vp->v_mount == NULL,
1467		("insmntque: vnode already on per mount vnode list"));
1468	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
1469	ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
1470
1471	/*
1472	 * We acquire the vnode interlock early to ensure that the
1473	 * vnode cannot be recycled by another process releasing a
1474	 * holdcnt on it before we get it on both the vnode list
1475	 * and the active vnode list. The mount mutex protects only
1476	 * manipulation of the vnode list and the vnode freelist
1477	 * mutex protects only manipulation of the active vnode list.
1478	 * Hence the need to hold the vnode interlock throughout.
1479	 */
1480	MNT_ILOCK(mp);
1481	VI_LOCK(vp);
1482	if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
1483	    ((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
1484	    mp->mnt_nvnodelistsize == 0)) &&
1485	    (vp->v_vflag & VV_FORCEINSMQ) == 0) {
1486		VI_UNLOCK(vp);
1487		MNT_IUNLOCK(mp);
1488		if (dtr != NULL)
1489			dtr(vp, dtr_arg);
1490		return (EBUSY);
1491	}
1492	vp->v_mount = mp;
1493	MNT_REF(mp);
1494	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
1495	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
1496		("neg mount point vnode list size"));
1497	mp->mnt_nvnodelistsize++;
1498	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
1499	    ("Activating already active vnode"));
1500	vp->v_iflag |= VI_ACTIVE;
1501	mtx_lock(&vnode_free_list_mtx);
1502	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
1503	mp->mnt_activevnodelistsize++;
1504	mtx_unlock(&vnode_free_list_mtx);
1505	VI_UNLOCK(vp);
1506	MNT_IUNLOCK(mp);
1507	return (0);
1508}
1509
1510int
1511insmntque(struct vnode *vp, struct mount *mp)
1512{
1513
1514	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
1515}
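/*
 * Illustrative sketch (not compiled here): the common getnewvnode() +
 * insmntque() sequence in a filesystem's vnode-allocation path.
 * "myfs" and myfs_vnodeops are placeholders; real callers also attach
 * v_data and typically use the vfs_hash interface as well.
 */
#if 0
static int
myfs_new_vnode_sketch(struct mount *mp, struct vnode **vpp)
{
	struct vnode *vp;
	int error;

	error = getnewvnode("myfs", mp, &myfs_vnodeops, &vp);
	if (error != 0)
		return (error);
	/* insmntque() requires the vnode to be exclusively locked. */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	/* ... set up vp->v_data, vp->v_type, etc. ... */
	error = insmntque(vp, mp);
	if (error != 0) {
		/* insmntque_stddtr() has already vgone'd and vput'd vp. */
		return (error);
	}
	*vpp = vp;
	return (0);
}
#endif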
1516
1517/*
1518 * Flush out and invalidate all buffers associated with a bufobj
1519 * Called with the underlying object locked.
1520 */
1521int
1522bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo)
1523{
1524	int error;
1525
1526	BO_LOCK(bo);
1527	if (flags & V_SAVE) {
1528		error = bufobj_wwait(bo, slpflag, slptimeo);
1529		if (error) {
1530			BO_UNLOCK(bo);
1531			return (error);
1532		}
1533		if (bo->bo_dirty.bv_cnt > 0) {
1534			BO_UNLOCK(bo);
1535			if ((error = BO_SYNC(bo, MNT_WAIT)) != 0)
1536				return (error);
1537			/*
1538			 * XXX We could save a lock/unlock if this was only
1539			 * enabled under INVARIANTS
1540			 */
1541			BO_LOCK(bo);
1542			if (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)
1543				panic("vinvalbuf: dirty bufs");
1544		}
1545	}
1546	/*
1547	 * If you alter this loop please notice that interlock is dropped and
1548	 * reacquired in flushbuflist.  Special care is needed to ensure that
1549	 * no race conditions occur from this.
1550	 */
1551	do {
1552		error = flushbuflist(&bo->bo_clean,
1553		    flags, bo, slpflag, slptimeo);
1554		if (error == 0 && !(flags & V_CLEANONLY))
1555			error = flushbuflist(&bo->bo_dirty,
1556			    flags, bo, slpflag, slptimeo);
1557		if (error != 0 && error != EAGAIN) {
1558			BO_UNLOCK(bo);
1559			return (error);
1560		}
1561	} while (error != 0);
1562
1563	/*
1564	 * Wait for I/O to complete.  XXX needs cleaning up.  The vnode can
1565	 * have write I/O in-progress but if there is a VM object then the
1566	 * VM object can also have read-I/O in-progress.
1567	 */
1568	do {
1569		bufobj_wwait(bo, 0, 0);
1570		BO_UNLOCK(bo);
1571		if (bo->bo_object != NULL) {
1572			VM_OBJECT_WLOCK(bo->bo_object);
1573			vm_object_pip_wait(bo->bo_object, "bovlbx");
1574			VM_OBJECT_WUNLOCK(bo->bo_object);
1575		}
1576		BO_LOCK(bo);
1577	} while (bo->bo_numoutput > 0);
1578	BO_UNLOCK(bo);
1579
1580	/*
1581	 * Destroy the copy in the VM cache, too.
1582	 */
1583	if (bo->bo_object != NULL &&
1584	    (flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
1585		VM_OBJECT_WLOCK(bo->bo_object);
1586		vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
1587		    OBJPR_CLEANONLY : 0);
1588		VM_OBJECT_WUNLOCK(bo->bo_object);
1589	}
1590
1591#ifdef INVARIANTS
1592	BO_LOCK(bo);
1593	if ((flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0 &&
1594	    (bo->bo_dirty.bv_cnt > 0 || bo->bo_clean.bv_cnt > 0))
1595		panic("vinvalbuf: flush failed");
1596	BO_UNLOCK(bo);
1597#endif
1598	return (0);
1599}
1600
1601/*
1602 * Flush out and invalidate all buffers associated with a vnode.
1603 * Called with the underlying object locked.
1604 */
1605int
1606vinvalbuf(struct vnode *vp, int flags, int slpflag, int slptimeo)
1607{
1608
1609	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
1610	ASSERT_VOP_LOCKED(vp, "vinvalbuf");
1611	if (vp->v_object != NULL && vp->v_object->handle != vp)
1612		return (0);
1613	return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
1614}
1615
1616/*
1617 * Flush out buffers on the specified list.
1618 *
1619 */
1620static int
1621flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
1622    int slptimeo)
1623{
1624	struct buf *bp, *nbp;
1625	int retval, error;
1626	daddr_t lblkno;
1627	b_xflags_t xflags;
1628
1629	ASSERT_BO_WLOCKED(bo);
1630
1631	retval = 0;
1632	TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
1633		if (((flags & V_NORMAL) && (bp->b_xflags & BX_ALTDATA)) ||
1634		    ((flags & V_ALT) && (bp->b_xflags & BX_ALTDATA) == 0)) {
1635			continue;
1636		}
1637		lblkno = 0;
1638		xflags = 0;
1639		if (nbp != NULL) {
1640			lblkno = nbp->b_lblkno;
1641			xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
1642		}
1643		retval = EAGAIN;
1644		error = BUF_TIMELOCK(bp,
1645		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
1646		    "flushbuf", slpflag, slptimeo);
1647		if (error) {
1648			BO_LOCK(bo);
1649			return (error != ENOLCK ? error : EAGAIN);
1650		}
1651		KASSERT(bp->b_bufobj == bo,
1652		    ("bp %p wrong b_bufobj %p should be %p",
1653		    bp, bp->b_bufobj, bo));
1654		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
1655			BUF_UNLOCK(bp);
1656			BO_LOCK(bo);
1657			return (EAGAIN);
1658		}
1659		/*
1660		 * XXX Since there are no node locks for NFS, I
1661		 * believe there is a slight chance that a delayed
1662		 * write will occur while sleeping just above, so
1663		 * check for it.
1664		 */
1665		if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
1666		    (flags & V_SAVE)) {
1667			bremfree(bp);
1668			bp->b_flags |= B_ASYNC;
1669			bwrite(bp);
1670			BO_LOCK(bo);
1671			return (EAGAIN);	/* XXX: why not loop ? */
1672		}
1673		bremfree(bp);
1674		bp->b_flags |= (B_INVAL | B_RELBUF);
1675		bp->b_flags &= ~B_ASYNC;
1676		brelse(bp);
1677		BO_LOCK(bo);
1678		if (nbp != NULL &&
1679		    (nbp->b_bufobj != bo ||
1680		     nbp->b_lblkno != lblkno ||
1681		     (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
1682			break;			/* nbp invalid */
1683	}
1684	return (retval);
1685}
1686
1687/*
1688 * Truncate a file's buffer and pages to a specified length.  This
1689 * is in lieu of the old vinvalbuf mechanism, which performed unneeded
1690 * sync activity.
1691 */
1692int
1693vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
1694{
1695	struct buf *bp, *nbp;
1696	int anyfreed;
1697	int trunclbn;
1698	struct bufobj *bo;
1699
1700	CTR5(KTR_VFS, "%s: vp %p with cred %p and block %d:%ju", __func__,
1701	    vp, cred, blksize, (uintmax_t)length);
1702
1703	/*
1704	 * Round up to the *next* lbn.
1705	 */
1706	trunclbn = (length + blksize - 1) / blksize;
1707
1708	ASSERT_VOP_LOCKED(vp, "vtruncbuf");
1709restart:
1710	bo = &vp->v_bufobj;
1711	BO_LOCK(bo);
1712	anyfreed = 1;
1713	for (;anyfreed;) {
1714		anyfreed = 0;
1715		TAILQ_FOREACH_SAFE(bp, &bo->bo_clean.bv_hd, b_bobufs, nbp) {
1716			if (bp->b_lblkno < trunclbn)
1717				continue;
1718			if (BUF_LOCK(bp,
1719			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1720			    BO_LOCKPTR(bo)) == ENOLCK)
1721				goto restart;
1722
1723			bremfree(bp);
1724			bp->b_flags |= (B_INVAL | B_RELBUF);
1725			bp->b_flags &= ~B_ASYNC;
1726			brelse(bp);
1727			anyfreed = 1;
1728
1729			BO_LOCK(bo);
1730			if (nbp != NULL &&
1731			    (((nbp->b_xflags & BX_VNCLEAN) == 0) ||
1732			    (nbp->b_vp != vp) ||
1733			    (nbp->b_flags & B_DELWRI))) {
1734				BO_UNLOCK(bo);
1735				goto restart;
1736			}
1737		}
1738
1739		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1740			if (bp->b_lblkno < trunclbn)
1741				continue;
1742			if (BUF_LOCK(bp,
1743			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1744			    BO_LOCKPTR(bo)) == ENOLCK)
1745				goto restart;
1746			bremfree(bp);
1747			bp->b_flags |= (B_INVAL | B_RELBUF);
1748			bp->b_flags &= ~B_ASYNC;
1749			brelse(bp);
1750			anyfreed = 1;
1751
1752			BO_LOCK(bo);
1753			if (nbp != NULL &&
1754			    (((nbp->b_xflags & BX_VNDIRTY) == 0) ||
1755			    (nbp->b_vp != vp) ||
1756			    (nbp->b_flags & B_DELWRI) == 0)) {
1757				BO_UNLOCK(bo);
1758				goto restart;
1759			}
1760		}
1761	}
1762
1763	if (length > 0) {
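		/*
		 * Write out, asynchronously, any remaining dirty buffers
		 * at or below logical block zero; negative block numbers
		 * are used for filesystem metadata such as indirect
		 * blocks.  Buffers at or beyond trunclbn were already
		 * invalidated above.
		 */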
1764restartsync:
1765		TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
1766			if (bp->b_lblkno > 0)
1767				continue;
1768			/*
1769			 * Since we hold the vnode lock this should only
1770			 * fail if we're racing with the buf daemon.
1771			 */
1772			if (BUF_LOCK(bp,
1773			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
1774			    BO_LOCKPTR(bo)) == ENOLCK) {
1775				goto restart;
1776			}
1777			VNASSERT((bp->b_flags & B_DELWRI), vp,
1778			    ("buf(%p) on dirty queue without DELWRI", bp));
1779
1780			bremfree(bp);
1781			bawrite(bp);
1782			BO_LOCK(bo);
1783			goto restartsync;
1784		}
1785	}
1786
1787	bufobj_wwait(bo, 0, 0);
1788	BO_UNLOCK(bo);
1789	vnode_pager_setsize(vp, length);
1790
1791	return (0);
1792}
1793
1794static void
1795buf_vlist_remove(struct buf *bp)
1796{
1797	struct bufv *bv;
1798
1799	KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
1800	ASSERT_BO_WLOCKED(bp->b_bufobj);
1801	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
1802	    (BX_VNDIRTY|BX_VNCLEAN),
1803	    ("buf_vlist_remove: Buf %p is on two lists", bp));
1804	if (bp->b_xflags & BX_VNDIRTY)
1805		bv = &bp->b_bufobj->bo_dirty;
1806	else
1807		bv = &bp->b_bufobj->bo_clean;
1808	BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
1809	TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
1810	bv->bv_cnt--;
1811	bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
1812}
1813
1814/*
1815 * Add the buffer to the sorted clean or dirty block list.
1816 *
1817 * NOTE: xflags is passed as a constant, optimizing this inline function!
1818 */
1819static void
1820buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
1821{
1822	struct bufv *bv;
1823	struct buf *n;
1824	int error;
1825
1826	ASSERT_BO_WLOCKED(bo);
1827	KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
1828	    ("dead bo %p", bo));
1829	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
1830	    ("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
1831	bp->b_xflags |= xflags;
1832	if (xflags & BX_VNDIRTY)
1833		bv = &bo->bo_dirty;
1834	else
1835		bv = &bo->bo_clean;
1836
1837	/*
1838	 * Keep the list ordered.  Optimize empty list insertion.  Assume
1839	 * we tend to grow at the tail so lookup_le should usually be cheaper
1840	 * than _ge.
1841	 */
1842	if (bv->bv_cnt == 0 ||
1843	    bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
1844		TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
1845	else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
1846		TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
1847	else
1848		TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
1849	error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
1850	if (error)
1851		panic("buf_vlist_add:  Preallocated nodes insufficient.");
1852	bv->bv_cnt++;
1853}
1854
1855/*
1856 * Look up a buffer by logical block number using the buffer tries.  Note
1857 * that we specifically avoid shadow buffers used in background bitmap
1858 * writes.
1859 *
1860 * The block may reside on either the clean or the dirty list and we do
1861 * not know which, so both tries may need to be searched; the clean trie
1862 * is checked first.
1863 */
1867struct buf *
1868gbincore(struct bufobj *bo, daddr_t lblkno)
1869{
1870	struct buf *bp;
1871
1872	ASSERT_BO_LOCKED(bo);
1873	bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
1874	if (bp != NULL)
1875		return (bp);
1876	return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
1877}
1878
1879/*
1880 * Associate a buffer with a vnode.
1881 */
1882void
1883bgetvp(struct vnode *vp, struct buf *bp)
1884{
1885	struct bufobj *bo;
1886
1887	bo = &vp->v_bufobj;
1888	ASSERT_BO_WLOCKED(bo);
1889	VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
1890
1891	CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
1892	VNASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0, vp,
1893	    ("bgetvp: bp already attached! %p", bp));
1894
1895	vhold(vp);
1896	bp->b_vp = vp;
1897	bp->b_bufobj = bo;
1898	/*
1899	 * Insert onto list for new vnode.
1900	 */
1901	buf_vlist_add(bp, bo, BX_VNCLEAN);
1902}
1903
1904/*
1905 * Disassociate a buffer from a vnode.
1906 */
1907void
1908brelvp(struct buf *bp)
1909{
1910	struct bufobj *bo;
1911	struct vnode *vp;
1912
1913	CTR3(KTR_BUF, "brelvp(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
1914	KASSERT(bp->b_vp != NULL, ("brelvp: NULL"));
1915
1916	/*
1917	 * Delete from old vnode list, if on one.
1918	 */
1919	vp = bp->b_vp;		/* XXX */
1920	bo = bp->b_bufobj;
1921	BO_LOCK(bo);
1922	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
1923		buf_vlist_remove(bp);
1924	else
1925		panic("brelvp: Buffer %p not on queue.", bp);
1926	if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
1927		bo->bo_flag &= ~BO_ONWORKLST;
1928		mtx_lock(&sync_mtx);
1929		LIST_REMOVE(bo, bo_synclist);
1930		syncer_worklist_len--;
1931		mtx_unlock(&sync_mtx);
1932	}
1933	bp->b_vp = NULL;
1934	bp->b_bufobj = NULL;
1935	BO_UNLOCK(bo);
1936	vdrop(vp);
1937}
1938
1939/*
1940 * Add an item to the syncer work queue.
1941 */
1942static void
1943vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
1944{
1945	int slot;
1946
1947	ASSERT_BO_WLOCKED(bo);
1948
1949	mtx_lock(&sync_mtx);
1950	if (bo->bo_flag & BO_ONWORKLST)
1951		LIST_REMOVE(bo, bo_synclist);
1952	else {
1953		bo->bo_flag |= BO_ONWORKLST;
1954		syncer_worklist_len++;
1955	}
1956
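	/*
	 * Clamp the requested delay and pick the ring slot that the
	 * syncer will reach that many seconds from now (syncer_mask is
	 * the ring-size mask, a power of two minus one).
	 */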
1957	if (delay > syncer_maxdelay - 2)
1958		delay = syncer_maxdelay - 2;
1959	slot = (syncer_delayno + delay) & syncer_mask;
1960
1961	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
1962	mtx_unlock(&sync_mtx);
1963}
1964
1965static int
1966sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
1967{
1968	int error, len;
1969
1970	mtx_lock(&sync_mtx);
1971	len = syncer_worklist_len - sync_vnode_count;
1972	mtx_unlock(&sync_mtx);
1973	error = SYSCTL_OUT(req, &len, sizeof(len));
1974	return (error);
1975}
1976
1977SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
1978    sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
1979
1980static struct proc *updateproc;
1981static void sched_sync(void);
1982static struct kproc_desc up_kp = {
1983	"syncer",
1984	sched_sync,
1985	&updateproc
1986};
1987SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp);
1988
1989static int
1990sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
1991{
1992	struct vnode *vp;
1993	struct mount *mp;
1994
1995	*bo = LIST_FIRST(slp);
1996	if (*bo == NULL)
1997		return (0);
1998	vp = (*bo)->__bo_vnode;	/* XXX */
1999	if (VOP_ISLOCKED(vp) != 0 || VI_TRYLOCK(vp) == 0)
2000		return (1);
2001	/*
2002	 * We use vhold in case the vnode does not
2003	 * successfully sync.  vhold prevents the vnode from
2004	 * going away when we unlock the sync_mtx so that
2005	 * we can acquire the vnode interlock.
2006	 */
2007	vholdl(vp);
2008	mtx_unlock(&sync_mtx);
2009	VI_UNLOCK(vp);
2010	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
2011		vdrop(vp);
2012		mtx_lock(&sync_mtx);
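		/*
		 * Ask the caller to move the bufobj to the next slot only
		 * if it is still at the head of this one; otherwise it
		 * has already been requeued while sync_mtx was dropped.
		 */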
2013		return (*bo == LIST_FIRST(slp));
2014	}
2015	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2016	(void) VOP_FSYNC(vp, MNT_LAZY, td);
2017	VOP_UNLOCK(vp, 0);
2018	vn_finished_write(mp);
2019	BO_LOCK(*bo);
2020	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
2021		/*
2022		 * Put us back on the worklist.  The worklist
2023		 * routine will remove us from our current
2024		 * position and then add us back in at a later
2025		 * position.
2026		 */
2027		vn_syncer_add_to_worklist(*bo, syncdelay);
2028	}
2029	BO_UNLOCK(*bo);
2030	vdrop(vp);
2031	mtx_lock(&sync_mtx);
2032	return (0);
2033}
2034
2035static int first_printf = 1;
2036
2037/*
2038 * System filesystem synchronizer daemon.
2039 */
2040static void
2041sched_sync(void)
2042{
2043	struct synclist *next, *slp;
2044	struct bufobj *bo;
2045	long starttime;
2046	struct thread *td = curthread;
2047	int last_work_seen;
2048	int net_worklist_len;
2049	int syncer_final_iter;
2050	int error;
2051
2052	last_work_seen = 0;
2053	syncer_final_iter = 0;
2054	syncer_state = SYNCER_RUNNING;
2055	starttime = time_uptime;
2056	td->td_pflags |= TDP_NORUNNINGBUF;
2057
2058	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
2059	    SHUTDOWN_PRI_LAST);
2060
2061	mtx_lock(&sync_mtx);
2062	for (;;) {
2063		if (syncer_state == SYNCER_FINAL_DELAY &&
2064		    syncer_final_iter == 0) {
2065			mtx_unlock(&sync_mtx);
2066			kproc_suspend_check(td->td_proc);
2067			mtx_lock(&sync_mtx);
2068		}
2069		net_worklist_len = syncer_worklist_len - sync_vnode_count;
2070		if (syncer_state != SYNCER_RUNNING &&
2071		    starttime != time_uptime) {
2072			if (first_printf) {
2073				printf("\nSyncing disks, vnodes remaining...");
2074				first_printf = 0;
2075			}
2076			printf("%d ", net_worklist_len);
2077		}
2078		starttime = time_uptime;
2079
2080		/*
2081		 * Push files whose dirty time has expired.  Be careful
2082		 * of interrupt race on slp queue.
2083		 *
2084		 * Skip over empty worklist slots when shutting down.
2085		 */
2086		do {
2087			slp = &syncer_workitem_pending[syncer_delayno];
2088			syncer_delayno += 1;
2089			if (syncer_delayno == syncer_maxdelay)
2090				syncer_delayno = 0;
2091			next = &syncer_workitem_pending[syncer_delayno];
2092			/*
2093			 * If the worklist has wrapped since it
2094			 * was emptied of all but syncer vnodes,
2095			 * switch to the FINAL_DELAY state and run
2096			 * for one more second.
2097			 */
2098			if (syncer_state == SYNCER_SHUTTING_DOWN &&
2099			    net_worklist_len == 0 &&
2100			    last_work_seen == syncer_delayno) {
2101				syncer_state = SYNCER_FINAL_DELAY;
2102				syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
2103			}
2104		} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
2105		    syncer_worklist_len > 0);
2106
2107		/*
2108		 * Keep track of the last time there was anything
2109		 * on the worklist other than syncer vnodes.
2110		 * Return to the SHUTTING_DOWN state if any
2111		 * new work appears.
2112		 */
2113		if (net_worklist_len > 0 || syncer_state == SYNCER_RUNNING)
2114			last_work_seen = syncer_delayno;
2115		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
2116			syncer_state = SYNCER_SHUTTING_DOWN;
2117		while (!LIST_EMPTY(slp)) {
2118			error = sync_vnode(slp, &bo, td);
2119			if (error == 1) {
2120				LIST_REMOVE(bo, bo_synclist);
2121				LIST_INSERT_HEAD(next, bo, bo_synclist);
2122				continue;
2123			}
2124
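			/*
			 * Once the "Syncing disks" message has been
			 * printed, pat the watchdog on each pass so a
			 * slow flush does not trigger a reset.
			 */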
2125			if (first_printf == 0)
2126				wdog_kern_pat(WD_LASTVAL);
2127
2128		}
2129		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
2130			syncer_final_iter--;
2131		/*
2132		 * The variable rushjob allows the kernel to speed up the
2133		 * processing of the filesystem syncer process. A rushjob
2134		 * value of N tells the filesystem syncer to process the next
2135		 * N seconds worth of work on its queue ASAP. Currently rushjob
2136		 * is used by the soft update code to speed up the filesystem
2137		 * syncer process when the incore state is getting so far
2138		 * ahead of the disk that the kernel memory pool is being
2139		 * threatened with exhaustion.
2140		 */
2141		if (rushjob > 0) {
2142			rushjob -= 1;
2143			continue;
2144		}
2145		/*
2146		 * Just sleep for a short period of time between
2147		 * iterations when shutting down to allow some I/O
2148		 * to happen.
2149		 *
2150		 * If it has taken us less than a second to process the
2151		 * current work, then wait. Otherwise start right over
2152		 * again. We can still lose time if any single round
2153		 * takes more than two seconds, but it does not really
2154		 * matter as we are just trying to generally pace the
2155		 * filesystem activity.
2156		 */
2157		if (syncer_state != SYNCER_RUNNING ||
2158		    time_uptime == starttime) {
2159			thread_lock(td);
2160			sched_prio(td, PPAUSE);
2161			thread_unlock(td);
2162		}
2163		if (syncer_state != SYNCER_RUNNING)
2164			cv_timedwait(&sync_wakeup, &sync_mtx,
2165			    hz / SYNCER_SHUTDOWN_SPEEDUP);
2166		else if (time_uptime == starttime)
2167			cv_timedwait(&sync_wakeup, &sync_mtx, hz);
2168	}
2169}
2170
2171/*
2172 * Request the syncer daemon to speed up its work.
2173 * We never push it to speed up more than half of its
2174 * normal turn time, otherwise it could take over the cpu.
2175 */
2176int
2177speedup_syncer(void)
2178{
2179	int ret = 0;
2180
2181	mtx_lock(&sync_mtx);
2182	if (rushjob < syncdelay / 2) {
2183		rushjob += 1;
2184		stat_rush_requests += 1;
2185		ret = 1;
2186	}
2187	mtx_unlock(&sync_mtx);
2188	cv_broadcast(&sync_wakeup);
2189	return (ret);
2190}
2191
2192/*
2193 * Tell the syncer to speed up its work and run through its work
2194 * list several times, then tell it to shut down.
2195 */
2196static void
2197syncer_shutdown(void *arg, int howto)
2198{
2199
2200	if (howto & RB_NOSYNC)
2201		return;
2202	mtx_lock(&sync_mtx);
2203	syncer_state = SYNCER_SHUTTING_DOWN;
2204	rushjob = 0;
2205	mtx_unlock(&sync_mtx);
2206	cv_broadcast(&sync_wakeup);
2207	kproc_shutdown(arg, howto);
2208}
2209
2210void
2211syncer_suspend(void)
2212{
2213
2214	syncer_shutdown(updateproc, 0);
2215}
2216
2217void
2218syncer_resume(void)
2219{
2220
2221	mtx_lock(&sync_mtx);
2222	first_printf = 1;
2223	syncer_state = SYNCER_RUNNING;
2224	mtx_unlock(&sync_mtx);
2225	cv_broadcast(&sync_wakeup);
2226	kproc_resume(updateproc);
2227}
2228
2229/*
2230 * Reassign a buffer from one vnode to another.
2231 * Used to assign file specific control information
2232 * (indirect blocks) to the vnode to which they belong.
2233 */
2234void
2235reassignbuf(struct buf *bp)
2236{
2237	struct vnode *vp;
2238	struct bufobj *bo;
2239	int delay;
2240#ifdef INVARIANTS
2241	struct bufv *bv;
2242#endif
2243
2244	vp = bp->b_vp;
2245	bo = bp->b_bufobj;
2246	++reassignbufcalls;
2247
2248	CTR3(KTR_BUF, "reassignbuf(%p) vp %p flags %X",
2249	    bp, bp->b_vp, bp->b_flags);
2250	/*
2251	 * B_PAGING flagged buffers cannot be reassigned because their vp
2252	 * is not fully linked in.
2253	 */
2254	if (bp->b_flags & B_PAGING)
2255		panic("cannot reassign paging buffer");
2256
2257	/*
2258	 * Delete from old vnode list, if on one.
2259	 */
2260	BO_LOCK(bo);
2261	if (bp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN))
2262		buf_vlist_remove(bp);
2263	else
2264		panic("reassignbuf: Buffer %p not on queue.", bp);
2265	/*
2266	 * If dirty, put on list of dirty buffers; otherwise insert onto list
2267	 * of clean buffers.
2268	 */
2269	if (bp->b_flags & B_DELWRI) {
2270		if ((bo->bo_flag & BO_ONWORKLST) == 0) {
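			/*
			 * Not yet on the syncer worklist; schedule it
			 * with a delay chosen by vnode type.  With the
			 * default tunables metadata is synced soonest,
			 * then directories, then regular file data.
			 */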
2271			switch (vp->v_type) {
2272			case VDIR:
2273				delay = dirdelay;
2274				break;
2275			case VCHR:
2276				delay = metadelay;
2277				break;
2278			default:
2279				delay = filedelay;
2280			}
2281			vn_syncer_add_to_worklist(bo, delay);
2282		}
2283		buf_vlist_add(bp, bo, BX_VNDIRTY);
2284	} else {
2285		buf_vlist_add(bp, bo, BX_VNCLEAN);
2286
2287		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
2288			mtx_lock(&sync_mtx);
2289			LIST_REMOVE(bo, bo_synclist);
2290			syncer_worklist_len--;
2291			mtx_unlock(&sync_mtx);
2292			bo->bo_flag &= ~BO_ONWORKLST;
2293		}
2294	}
2295#ifdef INVARIANTS
2296	bv = &bo->bo_clean;
2297	bp = TAILQ_FIRST(&bv->bv_hd);
2298	KASSERT(bp == NULL || bp->b_bufobj == bo,
2299	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2300	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2301	KASSERT(bp == NULL || bp->b_bufobj == bo,
2302	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2303	bv = &bo->bo_dirty;
2304	bp = TAILQ_FIRST(&bv->bv_hd);
2305	KASSERT(bp == NULL || bp->b_bufobj == bo,
2306	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2307	bp = TAILQ_LAST(&bv->bv_hd, buflists);
2308	KASSERT(bp == NULL || bp->b_bufobj == bo,
2309	    ("bp %p wrong b_bufobj %p should be %p", bp, bp->b_bufobj, bo));
2310#endif
2311	BO_UNLOCK(bo);
2312}
2313
2314/*
2315 * Increment the use and hold counts on the vnode, taking care to reference
2316 * the driver's usecount if this is a chardev.  The vholdl() will remove
2317 * the vnode from the free list if it is presently free.  Requires the
2318 * vnode interlock and returns with it held.
2319 */
2320static void
2321v_incr_usecount(struct vnode *vp)
2322{
2323
2324	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2325	vholdl(vp);
2326	vp->v_usecount++;
2327	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2328		dev_lock();
2329		vp->v_rdev->si_usecount++;
2330		dev_unlock();
2331	}
2332}
2333
2334/*
2335 * Turn a holdcnt into a use+holdcnt such that only one call to
2336 * v_decr_usecount is needed.
2337 */
2338static void
2339v_upgrade_usecount(struct vnode *vp)
2340{
2341
2342	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2343	vp->v_usecount++;
2344	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2345		dev_lock();
2346		vp->v_rdev->si_usecount++;
2347		dev_unlock();
2348	}
2349}
2350
2351/*
2352 * Decrement the vnode use and hold count along with the driver's usecount
2353 * if this is a chardev.  The vdropl() below releases the vnode interlock
2354 * as it may free the vnode.
2355 */
2356static void
2357v_decr_usecount(struct vnode *vp)
2358{
2359
2360	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2361	VNASSERT(vp->v_usecount > 0, vp,
2362	    ("v_decr_usecount: negative usecount"));
2363	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2364	vp->v_usecount--;
2365	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2366		dev_lock();
2367		vp->v_rdev->si_usecount--;
2368		dev_unlock();
2369	}
2370	vdropl(vp);
2371}
2372
2373/*
2374 * Decrement only the use count and driver use count.  This is intended to
2375 * be paired with a follow on vdropl() to release the remaining hold count.
2376 * In this way we may vgone() a vnode with a 0 usecount without risk of
2377 * having it end up on a free list because the hold count is kept above 0.
2378 */
2379static void
2380v_decr_useonly(struct vnode *vp)
2381{
2382
2383	ASSERT_VI_LOCKED(vp, __FUNCTION__);
2384	VNASSERT(vp->v_usecount > 0, vp,
2385	    ("v_decr_useonly: negative usecount"));
2386	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2387	vp->v_usecount--;
2388	if (vp->v_type == VCHR && vp->v_rdev != NULL) {
2389		dev_lock();
2390		vp->v_rdev->si_usecount--;
2391		dev_unlock();
2392	}
2393}
2394
2395/*
2396 * Grab a particular vnode from the free list, increment its
2397 * reference count and lock it.  VI_DOOMED is set if the vnode
2398 * is being destroyed.  Only callers who specify LK_RETRY will
2399 * see doomed vnodes.  If inactive processing was delayed in
2400 * vput(), try to do it here.
2401 */
2402int
2403vget(struct vnode *vp, int flags, struct thread *td)
2404{
2405	int error;
2406
2407	error = 0;
2408	VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
2409	    ("vget: invalid lock operation"));
2410	CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
2411
2412	if ((flags & LK_INTERLOCK) == 0)
2413		VI_LOCK(vp);
2414	vholdl(vp);
2415	if ((error = vn_lock(vp, flags | LK_INTERLOCK)) != 0) {
2416		vdrop(vp);
2417		CTR2(KTR_VFS, "%s: impossible to lock vnode %p", __func__,
2418		    vp);
2419		return (error);
2420	}
2421	if (vp->v_iflag & VI_DOOMED && (flags & LK_RETRY) == 0)
2422		panic("vget: vn_lock failed to return ENOENT\n");
2423	VI_LOCK(vp);
2424	/* Upgrade our holdcnt to a usecount. */
2425	v_upgrade_usecount(vp);
2426	/*
2427	 * We don't guarantee that any particular close will
2428	 * trigger inactive processing so just make a best effort
2429	 * here at preventing a reference to a removed file.  If
2430	 * we don't succeed no harm is done.
2431	 */
2432	if (vp->v_iflag & VI_OWEINACT) {
2433		if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE &&
2434		    (flags & LK_NOWAIT) == 0)
2435			vinactive(vp, td);
2436		vp->v_iflag &= ~VI_OWEINACT;
2437	}
2438	VI_UNLOCK(vp);
2439	return (0);
2440}
2441
2442/*
2443 * Increase the reference count of a vnode.
2444 */
2445void
2446vref(struct vnode *vp)
2447{
2448
2449	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2450	VI_LOCK(vp);
2451	v_incr_usecount(vp);
2452	VI_UNLOCK(vp);
2453}
2454
2455/*
2456 * Return reference count of a vnode.
2457 *
2458 * The results of this call are only guaranteed when some mechanism other
2459 * than the VI lock is used to stop other processes from gaining references
2460 * to the vnode.  This may be the case if the caller holds the only reference.
2461 * This is also useful when stale data is acceptable as race conditions may
2462 * be accounted for by some other means.
2463 */
2464int
2465vrefcnt(struct vnode *vp)
2466{
2467	int usecnt;
2468
2469	VI_LOCK(vp);
2470	usecnt = vp->v_usecount;
2471	VI_UNLOCK(vp);
2472
2473	return (usecnt);
2474}
2475
2476#define	VPUTX_VRELE	1
2477#define	VPUTX_VPUT	2
2478#define	VPUTX_VUNREF	3
2479
2480static void
2481vputx(struct vnode *vp, int func)
2482{
2483	int error;
2484
2485	KASSERT(vp != NULL, ("vputx: null vp"));
2486	if (func == VPUTX_VUNREF)
2487		ASSERT_VOP_LOCKED(vp, "vunref");
2488	else if (func == VPUTX_VPUT)
2489		ASSERT_VOP_LOCKED(vp, "vput");
2490	else
2491		KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
2492	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2493	VI_LOCK(vp);
2494
2495	/* Skip this v_writecount check if we're going to panic below. */
2496	VNASSERT(vp->v_writecount < vp->v_usecount || vp->v_usecount < 1, vp,
2497	    ("vputx: missed vn_close"));
2498	error = 0;
2499
2500	if (vp->v_usecount > 1 || ((vp->v_iflag & VI_DOINGINACT) &&
2501	    vp->v_usecount == 1)) {
2502		if (func == VPUTX_VPUT)
2503			VOP_UNLOCK(vp, 0);
2504		v_decr_usecount(vp);
2505		return;
2506	}
2507
2508	if (vp->v_usecount != 1) {
2509		vprint("vputx: negative ref count", vp);
2510		panic("vputx: negative ref cnt");
2511	}
2512	CTR2(KTR_VFS, "%s: return vnode %p to the freelist", __func__, vp);
2513	/*
2514	 * We want to hold the vnode until the inactive finishes to
2515	 * prevent vgone() races.  We drop the use count here and the
2516	 * hold count below when we're done.
2517	 */
2518	v_decr_useonly(vp);
2519	/*
2520	 * We must call VOP_INACTIVE with the node locked. Mark
2521	 * as VI_DOINGINACT to avoid recursion.
2522	 */
2523	vp->v_iflag |= VI_OWEINACT;
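	/*
	 * Acquire or upgrade to the exclusive vnode lock required by
	 * VOP_INACTIVE().  vrele() is called without the vnode lock and
	 * must take it here; vput() and vunref() already hold the lock
	 * and only attempt a non-sleeping upgrade if it is shared.  If
	 * the lock cannot be obtained the inactive call is deferred;
	 * VI_OWEINACT records the missed call.
	 */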
2524	switch (func) {
2525	case VPUTX_VRELE:
2526		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2527		VI_LOCK(vp);
2528		break;
2529	case VPUTX_VPUT:
2530		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2531			error = VOP_LOCK(vp, LK_UPGRADE | LK_INTERLOCK |
2532			    LK_NOWAIT);
2533			VI_LOCK(vp);
2534		}
2535		break;
2536	case VPUTX_VUNREF:
2537		if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
2538			error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
2539			VI_LOCK(vp);
2540		}
2541		break;
2542	}
2543	if (vp->v_usecount > 0)
2544		vp->v_iflag &= ~VI_OWEINACT;
2545	if (error == 0) {
2546		if (vp->v_iflag & VI_OWEINACT)
2547			vinactive(vp, curthread);
2548		if (func != VPUTX_VUNREF)
2549			VOP_UNLOCK(vp, 0);
2550	}
2551	vdropl(vp);
2552}
2553
2554/*
2555 * Vnode put/release.
2556 * If count drops to zero, call inactive routine and return to freelist.
2557 */
2558void
2559vrele(struct vnode *vp)
2560{
2561
2562	vputx(vp, VPUTX_VRELE);
2563}
2564
2565/*
2566 * Release an already locked vnode.  This gives the same effect as
2567 * unlock+vrele(), but takes less time and avoids releasing and
2568 * re-acquiring the lock (as vrele() acquires the lock internally).
2569 */
2570void
2571vput(struct vnode *vp)
2572{
2573
2574	vputx(vp, VPUTX_VPUT);
2575}
2576
2577/*
2578 * Release an exclusively locked vnode. Do not unlock the vnode lock.
2579 */
2580void
2581vunref(struct vnode *vp)
2582{
2583
2584	vputx(vp, VPUTX_VUNREF);
2585}
2586
2587/*
2588 * Somebody doesn't want the vnode recycled.
2589 */
2590void
2591vhold(struct vnode *vp)
2592{
2593
2594	VI_LOCK(vp);
2595	vholdl(vp);
2596	VI_UNLOCK(vp);
2597}
2598
2599/*
2600 * Increase the hold count and activate if this is the first reference.
2601 */
2602void
2603vholdl(struct vnode *vp)
2604{
2605	struct mount *mp;
2606
2607	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2608#ifdef INVARIANTS
2609	/* getnewvnode() calls v_incr_usecount() without holding interlock. */
2610	if (vp->v_type != VNON || vp->v_data != NULL)
2611		ASSERT_VI_LOCKED(vp, "vholdl");
2612#endif
2613	vp->v_holdcnt++;
2614	if ((vp->v_iflag & VI_FREE) == 0)
2615		return;
2616	VNASSERT(vp->v_holdcnt == 1, vp, ("vholdl: wrong hold count"));
2617	VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
2618	/*
2619	 * Remove a vnode from the free list, mark it as in use,
2620	 * and put it on the active list.
2621	 */
2622	mtx_lock(&vnode_free_list_mtx);
2623	TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
2624	freevnodes--;
2625	vp->v_iflag &= ~VI_FREE;
2626	KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
2627	    ("Activating already active vnode"));
2628	vp->v_iflag |= VI_ACTIVE;
2629	mp = vp->v_mount;
2630	TAILQ_INSERT_HEAD(&mp->mnt_activevnodelist, vp, v_actfreelist);
2631	mp->mnt_activevnodelistsize++;
2632	mtx_unlock(&vnode_free_list_mtx);
2633}
2634
2635/*
2636 * Note that there is one less who cares about this vnode.
2637 * vdrop() is the opposite of vhold().
2638 */
2639void
2640vdrop(struct vnode *vp)
2641{
2642
2643	VI_LOCK(vp);
2644	vdropl(vp);
2645}
2646
2647/*
2648 * Drop the hold count of the vnode.  If this is the last reference to
2649 * the vnode we place it on the free list unless it has been vgone'd
2650 * (marked VI_DOOMED) in which case we will free it.
2651 *
2652 * Because the vnode vm object keeps a hold reference on the vnode if
2653 * there is at least one resident non-cached page, the vnode cannot
2654 * leave the active list without the page cleanup done.
2655 */
2656void
2657vdropl(struct vnode *vp)
2658{
2659	struct bufobj *bo;
2660	struct mount *mp;
2661	int active;
2662
2663	ASSERT_VI_LOCKED(vp, "vdropl");
2664	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2665	if (vp->v_holdcnt <= 0)
2666		panic("vdrop: holdcnt %d", vp->v_holdcnt);
2667	vp->v_holdcnt--;
2668	if (vp->v_holdcnt > 0) {
2669		VI_UNLOCK(vp);
2670		return;
2671	}
2672	if ((vp->v_iflag & VI_DOOMED) == 0) {
2673		/*
2674		 * Mark a vnode as free: remove it from its active list
2675		 * and put it up for recycling on the freelist.
2676		 */
2677		VNASSERT(vp->v_op != NULL, vp,
2678		    ("vdropl: vnode already reclaimed."));
2679		VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2680		    ("vnode already free"));
2681		VNASSERT(vp->v_holdcnt == 0, vp,
2682		    ("vdropl: freeing when we shouldn't"));
2683		active = vp->v_iflag & VI_ACTIVE;
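		/*
		 * If the vnode still owes an inactive call, do not move
		 * it to the free list yet; just account for it in the
		 * free_owe_inact statistic.
		 */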
2684		if ((vp->v_iflag & VI_OWEINACT) == 0) {
2685			vp->v_iflag &= ~VI_ACTIVE;
2686			mp = vp->v_mount;
2687			mtx_lock(&vnode_free_list_mtx);
2688			if (active) {
2689				TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
2690				    v_actfreelist);
2691				mp->mnt_activevnodelistsize--;
2692			}
2693			TAILQ_INSERT_TAIL(&vnode_free_list, vp,
2694			    v_actfreelist);
2695			freevnodes++;
2696			vp->v_iflag |= VI_FREE;
2697			mtx_unlock(&vnode_free_list_mtx);
2698		} else {
2699			atomic_add_long(&free_owe_inact, 1);
2700		}
2701		VI_UNLOCK(vp);
2702		return;
2703	}
2704	/*
2705	 * The vnode has been marked for destruction, so free it.
2706	 *
2707	 * The vnode will be returned to the zone where it will
2708	 * normally remain until it is needed for another vnode. We
2709	 * need to cleanup (or verify that the cleanup has already
2710	 * been done) any residual data left from its current use
2711	 * so as not to contaminate the freshly allocated vnode.
2712	 */
2713	CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
2714	atomic_subtract_long(&numvnodes, 1);
2715	bo = &vp->v_bufobj;
2716	VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
2717	    ("cleaned vnode still on the free list."));
2718	VNASSERT(vp->v_data == NULL, vp, ("cleaned vnode isn't"));
2719	VNASSERT(vp->v_holdcnt == 0, vp, ("Non-zero hold count"));
2720	VNASSERT(vp->v_usecount == 0, vp, ("Non-zero use count"));
2721	VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
2722	VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
2723	VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
2724	VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
2725	    ("clean blk trie not empty"));
2726	VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
2727	VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
2728	    ("dirty blk trie not empty"));
2729	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
2730	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
2731	VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
2732	VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
2733	    ("Dangling rangelock waiters"));
2734	VI_UNLOCK(vp);
2735#ifdef MAC
2736	mac_vnode_destroy(vp);
2737#endif
2738	if (vp->v_pollinfo != NULL) {
2739		destroy_vpollinfo(vp->v_pollinfo);
2740		vp->v_pollinfo = NULL;
2741	}
2742#ifdef INVARIANTS
2743	/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
2744	vp->v_op = NULL;
2745#endif
2746	bzero(&vp->v_un, sizeof(vp->v_un));
2747	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
2748	vp->v_iflag = 0;
2749	vp->v_vflag = 0;
2750	bo->bo_flag = 0;
2751	uma_zfree(vnode_zone, vp);
2752}
2753
2754/*
2755 * Call VOP_INACTIVE on the vnode and manage the DOINGINACT and OWEINACT
2756 * flags.  DOINGINACT prevents us from recursing in calls to vinactive.
2757 * OWEINACT tracks whether a vnode missed a call to inactive due to a
2758 * failed lock upgrade.
2759 */
2760void
2761vinactive(struct vnode *vp, struct thread *td)
2762{
2763	struct vm_object *obj;
2764
2765	ASSERT_VOP_ELOCKED(vp, "vinactive");
2766	ASSERT_VI_LOCKED(vp, "vinactive");
2767	VNASSERT((vp->v_iflag & VI_DOINGINACT) == 0, vp,
2768	    ("vinactive: recursed on VI_DOINGINACT"));
2769	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2770	vp->v_iflag |= VI_DOINGINACT;
2771	vp->v_iflag &= ~VI_OWEINACT;
2772	VI_UNLOCK(vp);
2773	/*
2774	 * Before moving off the active list, we must be sure that any
2775	 * modified pages are converted into the vnode's dirty
2776	 * buffers, since these will no longer be checked once the
2777	 * vnode is on the inactive list.
2778	 *
2779	 * The write-out of the dirty pages is asynchronous.  At the
2780	 * point that VOP_INACTIVE() is called, there could still be
2781	 * pending I/O and dirty pages in the object.
2782	 */
2783	obj = vp->v_object;
2784	if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
2785		VM_OBJECT_WLOCK(obj);
2786		vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
2787		VM_OBJECT_WUNLOCK(obj);
2788	}
2789	VOP_INACTIVE(vp, td);
2790	VI_LOCK(vp);
2791	VNASSERT(vp->v_iflag & VI_DOINGINACT, vp,
2792	    ("vinactive: lost VI_DOINGINACT"));
2793	vp->v_iflag &= ~VI_DOINGINACT;
2794}
2795
2796/*
2797 * Remove any vnodes in the vnode table belonging to mount point mp.
2798 *
2799 * If FORCECLOSE is not specified, there should not be any active ones,
2800 * return error if any are found (nb: this is a user error, not a
2801 * system error). If FORCECLOSE is specified, detach any active vnodes
2802 * that are found.
2803 *
2804 * If WRITECLOSE is set, only flush out regular file vnodes open for
2805 * writing.
2806 *
2807 * SKIPSYSTEM causes any vnodes marked VV_SYSTEM to be skipped.
2808 *
2809 * `rootrefs' specifies the base reference count for the root vnode
2810 * of this filesystem. The root vnode is considered busy if its
2811 * v_usecount exceeds this value. On a successful return, vflush()
2812 * will call vrele() on the root vnode exactly rootrefs times.
2813 * If the SKIPSYSTEM or WRITECLOSE flags are specified, rootrefs must
2814 * be zero.
2815 */
2816#ifdef DIAGNOSTIC
2817static int busyprt = 0;		/* print out busy vnodes */
2818SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, "Print out busy vnodes");
2819#endif
2820
2821int
2822vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
2823{
2824	struct vnode *vp, *mvp, *rootvp = NULL;
2825	struct vattr vattr;
2826	int busy = 0, error;
2827
2828	CTR4(KTR_VFS, "%s: mp %p with rootrefs %d and flags %d", __func__, mp,
2829	    rootrefs, flags);
2830	if (rootrefs > 0) {
2831		KASSERT((flags & (SKIPSYSTEM | WRITECLOSE)) == 0,
2832		    ("vflush: bad args"));
2833		/*
2834		 * Get the filesystem root vnode. We can vput() it
2835		 * immediately, since with rootrefs > 0, it won't go away.
2836		 */
2837		if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rootvp)) != 0) {
2838			CTR2(KTR_VFS, "%s: vfs_root lookup failed with %d",
2839			    __func__, error);
2840			return (error);
2841		}
2842		vput(rootvp);
2843	}
2844loop:
2845	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2846		vholdl(vp);
2847		error = vn_lock(vp, LK_INTERLOCK | LK_EXCLUSIVE);
2848		if (error) {
2849			vdrop(vp);
2850			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2851			goto loop;
2852		}
2853		/*
2854		 * Skip over vnodes marked VV_SYSTEM.
2855		 */
2856		if ((flags & SKIPSYSTEM) && (vp->v_vflag & VV_SYSTEM)) {
2857			VOP_UNLOCK(vp, 0);
2858			vdrop(vp);
2859			continue;
2860		}
2861		/*
2862		 * If WRITECLOSE is set, flush out unlinked but still open
2863		 * files (even if open only for reading) and regular file
2864		 * vnodes open for writing.
2865		 */
2866		if (flags & WRITECLOSE) {
2867			if (vp->v_object != NULL) {
2868				VM_OBJECT_WLOCK(vp->v_object);
2869				vm_object_page_clean(vp->v_object, 0, 0, 0);
2870				VM_OBJECT_WUNLOCK(vp->v_object);
2871			}
2872			error = VOP_FSYNC(vp, MNT_WAIT, td);
2873			if (error != 0) {
2874				VOP_UNLOCK(vp, 0);
2875				vdrop(vp);
2876				MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2877				return (error);
2878			}
2879			error = VOP_GETATTR(vp, &vattr, td->td_ucred);
2880			VI_LOCK(vp);
2881
2882			if ((vp->v_type == VNON ||
2883			    (error == 0 && vattr.va_nlink > 0)) &&
2884			    (vp->v_writecount == 0 || vp->v_type != VREG)) {
2885				VOP_UNLOCK(vp, 0);
2886				vdropl(vp);
2887				continue;
2888			}
2889		} else
2890			VI_LOCK(vp);
2891		/*
2892		 * With v_usecount == 0, all we need to do is clear out the
2893		 * vnode data structures and we are done.
2894		 *
2895		 * If FORCECLOSE is set, forcibly close the vnode.
2896		 */
2897		if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
2898			vgonel(vp);
2899		} else {
2900			busy++;
2901#ifdef DIAGNOSTIC
2902			if (busyprt)
2903				vprint("vflush: busy vnode", vp);
2904#endif
2905		}
2906		VOP_UNLOCK(vp, 0);
2907		vdropl(vp);
2908	}
2909	if (rootrefs > 0 && (flags & FORCECLOSE) == 0) {
2910		/*
2911		 * If just the root vnode is busy, and if its refcount
2912		 * is equal to `rootrefs', then go ahead and kill it.
2913		 */
2914		VI_LOCK(rootvp);
2915		KASSERT(busy > 0, ("vflush: not busy"));
2916		VNASSERT(rootvp->v_usecount >= rootrefs, rootvp,
2917		    ("vflush: usecount %d < rootrefs %d",
2918		     rootvp->v_usecount, rootrefs));
2919		if (busy == 1 && rootvp->v_usecount == rootrefs) {
2920			VOP_LOCK(rootvp, LK_EXCLUSIVE|LK_INTERLOCK);
2921			vgone(rootvp);
2922			VOP_UNLOCK(rootvp, 0);
2923			busy = 0;
2924		} else
2925			VI_UNLOCK(rootvp);
2926	}
2927	if (busy) {
2928		CTR2(KTR_VFS, "%s: failing as %d vnodes are busy", __func__,
2929		    busy);
2930		return (EBUSY);
2931	}
2932	for (; rootrefs > 0; rootrefs--)
2933		vrele(rootvp);
2934	return (0);
2935}
2936
2937/*
2938 * Recycle an unused vnode to the front of the free list.
2939 */
2940int
2941vrecycle(struct vnode *vp)
2942{
2943	int recycled;
2944
2945	ASSERT_VOP_ELOCKED(vp, "vrecycle");
2946	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
2947	recycled = 0;
2948	VI_LOCK(vp);
2949	if (vp->v_usecount == 0) {
2950		recycled = 1;
2951		vgonel(vp);
2952	}
2953	VI_UNLOCK(vp);
2954	return (recycled);
2955}
2956
2957/*
2958 * Eliminate all activity associated with a vnode
2959 * in preparation for reuse.
2960 */
2961void
2962vgone(struct vnode *vp)
2963{
2964	VI_LOCK(vp);
2965	vgonel(vp);
2966	VI_UNLOCK(vp);
2967}
2968
2969static void
2970notify_lowervp_vfs_dummy(struct mount *mp __unused,
2971    struct vnode *lowervp __unused)
2972{
2973}
2974
2975/*
2976 * Notify upper mounts about reclaimed or unlinked vnode.
2977 */
2978void
2979vfs_notify_upper(struct vnode *vp, int event)
2980{
2981	static struct vfsops vgonel_vfsops = {
2982		.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
2983		.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
2984	};
2985	struct mount *mp, *ump, *mmp;
2986
2987	mp = vp->v_mount;
2988	if (mp == NULL)
2989		return;
2990
2991	MNT_ILOCK(mp);
2992	if (TAILQ_EMPTY(&mp->mnt_uppers))
2993		goto unlock;
2994	MNT_IUNLOCK(mp);
2995	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
2996	mmp->mnt_op = &vgonel_vfsops;
2997	mmp->mnt_kern_flag |= MNTK_MARKER;
2998	MNT_ILOCK(mp);
2999	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
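	/*
	 * Walk the list of upper mounts, dropping the mount interlock
	 * around each notification.  The marker entry mmp is inserted
	 * after the current upper mount before unlocking so that the
	 * iteration can resume from it once the interlock is reacquired;
	 * other markers encountered on the list are skipped.
	 */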
3000	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
3001		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
3002			ump = TAILQ_NEXT(ump, mnt_upper_link);
3003			continue;
3004		}
3005		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
3006		MNT_IUNLOCK(mp);
3007		switch (event) {
3008		case VFS_NOTIFY_UPPER_RECLAIM:
3009			VFS_RECLAIM_LOWERVP(ump, vp);
3010			break;
3011		case VFS_NOTIFY_UPPER_UNLINK:
3012			VFS_UNLINK_LOWERVP(ump, vp);
3013			break;
3014		default:
3015			KASSERT(0, ("invalid event %d", event));
3016			break;
3017		}
3018		MNT_ILOCK(mp);
3019		ump = TAILQ_NEXT(mmp, mnt_upper_link);
3020		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
3021	}
3022	free(mmp, M_TEMP);
3023	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
3024	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
3025		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
3026		wakeup(&mp->mnt_uppers);
3027	}
3028unlock:
3029	MNT_IUNLOCK(mp);
3030}
3031
3032/*
3033 * vgone, with the vp interlock held.
3034 */
3035static void
3036vgonel(struct vnode *vp)
3037{
3038	struct thread *td;
3039	int oweinact;
3040	int active;
3041	struct mount *mp;
3042
3043	ASSERT_VOP_ELOCKED(vp, "vgonel");
3044	ASSERT_VI_LOCKED(vp, "vgonel");
3045	VNASSERT(vp->v_holdcnt, vp,
3046	    ("vgonel: vp %p has no reference.", vp));
3047	CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
3048	td = curthread;
3049
3050	/*
3051	 * Don't vgonel if we're already doomed.
3052	 */
3053	if (vp->v_iflag & VI_DOOMED)
3054		return;
3055	vp->v_iflag |= VI_DOOMED;
3056
3057	/*
3058	 * Check to see if the vnode is in use.  If so, we have to call
3059	 * VOP_CLOSE() and VOP_INACTIVE().
3060	 */
3061	active = vp->v_usecount;
3062	oweinact = (vp->v_iflag & VI_OWEINACT);
3063	VI_UNLOCK(vp);
3064	vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
3065
3066	/*
3067	 * If purging an active vnode, it must be closed and
3068	 * deactivated before being reclaimed.
3069	 */
3070	if (active)
3071		VOP_CLOSE(vp, FNONBLOCK, NOCRED, td);
3072	if (oweinact || active) {
3073		VI_LOCK(vp);
3074		if ((vp->v_iflag & VI_DOINGINACT) == 0)
3075			vinactive(vp, td);
3076		VI_UNLOCK(vp);
3077	}
3078	if (vp->v_type == VSOCK)
3079		vfs_unp_reclaim(vp);
3080
3081	/*
3082	 * Clean out any buffers associated with the vnode.
3083	 * If the flush fails, just toss the buffers.
3084	 */
3085	mp = NULL;
3086	if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
3087		(void) vn_start_secondary_write(vp, &mp, V_WAIT);
3088	if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
3089		while (vinvalbuf(vp, 0, 0, 0) != 0)
3090			;
3091	}
3092
3093	BO_LOCK(&vp->v_bufobj);
3094	KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
3095	    vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
3096	    TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
3097	    vp->v_bufobj.bo_clean.bv_cnt == 0,
3098	    ("vp %p bufobj not invalidated", vp));
3099	vp->v_bufobj.bo_flag |= BO_DEAD;
3100	BO_UNLOCK(&vp->v_bufobj);
3101
3102	/*
3103	 * Reclaim the vnode.
3104	 */
3105	if (VOP_RECLAIM(vp, td))
3106		panic("vgone: cannot reclaim");
3107	if (mp != NULL)
3108		vn_finished_secondary_write(mp);
3109	VNASSERT(vp->v_object == NULL, vp,
3110	    ("vop_reclaim left v_object vp=%p, tag=%s", vp, vp->v_tag));
3111	/*
3112	 * Clear the advisory locks and wake up waiting threads.
3113	 */
3114	(void)VOP_ADVLOCKPURGE(vp);
3115	vp->v_lockf = NULL;
3116	/*
3117	 * Delete from old mount point vnode list.
3118	 */
3119	delmntque(vp);
3120	cache_purge(vp);
3121	/*
3122	 * Done with purge, reset to the standard lock and invalidate
3123	 * the vnode.
3124	 */
3125	VI_LOCK(vp);
3126	vp->v_vnlock = &vp->v_lock;
3127	vp->v_op = &dead_vnodeops;
3128	vp->v_tag = "none";
3129	vp->v_type = VBAD;
3130}
3131
3132/*
3133 * Calculate the total number of references to a special device.
3134 */
3135int
3136vcount(struct vnode *vp)
3137{
3138	int count;
3139
3140	dev_lock();
3141	count = vp->v_rdev->si_usecount;
3142	dev_unlock();
3143	return (count);
3144}
3145
3146/*
3147 * Same as above, but using the struct cdev * as the argument.
3148 */
3149int
3150count_dev(struct cdev *dev)
3151{
3152	int count;
3153
3154	dev_lock();
3155	count = dev->si_usecount;
3156	dev_unlock();
3157	return(count);
3158}
3159
3160/*
3161 * Print out a description of a vnode.
3162 */
3163static char *typename[] =
3164{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD",
3165 "VMARKER"};
3166
3167void
3168vn_printf(struct vnode *vp, const char *fmt, ...)
3169{
3170	va_list ap;
3171	char buf[256], buf2[16];
3172	u_long flags;
3173
3174	va_start(ap, fmt);
3175	vprintf(fmt, ap);
3176	va_end(ap);
3177	printf("%p: ", (void *)vp);
3178	printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
3179	printf("    usecount %d, writecount %d, refcount %d mountedhere %p\n",
3180	    vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
3181	buf[0] = '\0';
3182	buf[1] = '\0';
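	/*
	 * Each flag is appended with a leading '|'; printing from
	 * buf + 1 below skips the extra leading separator.
	 */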
3183	if (vp->v_vflag & VV_ROOT)
3184		strlcat(buf, "|VV_ROOT", sizeof(buf));
3185	if (vp->v_vflag & VV_ISTTY)
3186		strlcat(buf, "|VV_ISTTY", sizeof(buf));
3187	if (vp->v_vflag & VV_NOSYNC)
3188		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
3189	if (vp->v_vflag & VV_ETERNALDEV)
3190		strlcat(buf, "|VV_ETERNALDEV", sizeof(buf));
3191	if (vp->v_vflag & VV_CACHEDLABEL)
3192		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
3193	if (vp->v_vflag & VV_TEXT)
3194		strlcat(buf, "|VV_TEXT", sizeof(buf));
3195	if (vp->v_vflag & VV_COPYONWRITE)
3196		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
3197	if (vp->v_vflag & VV_SYSTEM)
3198		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
3199	if (vp->v_vflag & VV_PROCDEP)
3200		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
3201	if (vp->v_vflag & VV_NOKNOTE)
3202		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
3203	if (vp->v_vflag & VV_DELETED)
3204		strlcat(buf, "|VV_DELETED", sizeof(buf));
3205	if (vp->v_vflag & VV_MD)
3206		strlcat(buf, "|VV_MD", sizeof(buf));
3207	if (vp->v_vflag & VV_FORCEINSMQ)
3208		strlcat(buf, "|VV_FORCEINSMQ", sizeof(buf));
3209	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC | VV_ETERNALDEV |
3210	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
3211	    VV_NOKNOTE | VV_DELETED | VV_MD | VV_FORCEINSMQ);
3212	if (flags != 0) {
3213		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
3214		strlcat(buf, buf2, sizeof(buf));
3215	}
3216	if (vp->v_iflag & VI_MOUNT)
3217		strlcat(buf, "|VI_MOUNT", sizeof(buf));
3218	if (vp->v_iflag & VI_DOOMED)
3219		strlcat(buf, "|VI_DOOMED", sizeof(buf));
3220	if (vp->v_iflag & VI_FREE)
3221		strlcat(buf, "|VI_FREE", sizeof(buf));
3222	if (vp->v_iflag & VI_ACTIVE)
3223		strlcat(buf, "|VI_ACTIVE", sizeof(buf));
3224	if (vp->v_iflag & VI_DOINGINACT)
3225		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
3226	if (vp->v_iflag & VI_OWEINACT)
3227		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
3228	flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE |
3229	    VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
3230	if (flags != 0) {
3231		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
3232		strlcat(buf, buf2, sizeof(buf));
3233	}
3234	printf("    flags (%s)\n", buf + 1);
3235	if (mtx_owned(VI_MTX(vp)))
3236		printf(" VI_LOCKed");
3237	if (vp->v_object != NULL)
3238		printf("    v_object %p ref %d pages %d "
3239		    "cleanbuf %d dirtybuf %d\n",
3240		    vp->v_object, vp->v_object->ref_count,
3241		    vp->v_object->resident_page_count,
3242		    vp->v_bufobj.bo_clean.bv_cnt,
3243		    vp->v_bufobj.bo_dirty.bv_cnt);
3244	printf("    ");
3245	lockmgr_printinfo(vp->v_vnlock);
3246	if (vp->v_data != NULL)
3247		VOP_PRINT(vp);
3248}
3249
3250#ifdef DDB
3251/*
3252 * List all of the locked vnodes in the system.
3253 * Called when debugging the kernel.
3254 */
3255DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
3256{
3257	struct mount *mp;
3258	struct vnode *vp;
3259
3260	/*
3261	 * Note: because this is DDB, we can't obey the locking semantics
3262	 * for these structures, which means we could catch an inconsistent
3263	 * state and dereference a nasty pointer.  Not much to be done
3264	 * about that.
3265	 */
3266	db_printf("Locked vnodes\n");
3267	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3268		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3269			if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
3270				vprint("", vp);
3271		}
3272	}
3273}
3274
3275/*
3276 * Show details about the given vnode.
3277 */
3278DB_SHOW_COMMAND(vnode, db_show_vnode)
3279{
3280	struct vnode *vp;
3281
3282	if (!have_addr)
3283		return;
3284	vp = (struct vnode *)addr;
3285	vn_printf(vp, "vnode ");
3286}
3287
3288/*
3289 * Show details about the given mount point.
3290 */
3291DB_SHOW_COMMAND(mount, db_show_mount)
3292{
3293	struct mount *mp;
3294	struct vfsopt *opt;
3295	struct statfs *sp;
3296	struct vnode *vp;
3297	char buf[512];
3298	uint64_t mflags;
3299	u_int flags;
3300
3301	if (!have_addr) {
3302		/* No address given, print short info about all mount points. */
3303		TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3304			db_printf("%p %s on %s (%s)\n", mp,
3305			    mp->mnt_stat.f_mntfromname,
3306			    mp->mnt_stat.f_mntonname,
3307			    mp->mnt_stat.f_fstypename);
3308			if (db_pager_quit)
3309				break;
3310		}
3311		db_printf("\nMore info: show mount <addr>\n");
3312		return;
3313	}
3314
3315	mp = (struct mount *)addr;
3316	db_printf("%p %s on %s (%s)\n", mp, mp->mnt_stat.f_mntfromname,
3317	    mp->mnt_stat.f_mntonname, mp->mnt_stat.f_fstypename);
3318
3319	buf[0] = '\0';
3320	mflags = mp->mnt_flag;
3321#define	MNT_FLAG(flag)	do {						\
3322	if (mflags & (flag)) {						\
3323		if (buf[0] != '\0')					\
3324			strlcat(buf, ", ", sizeof(buf));		\
3325		strlcat(buf, (#flag) + 4, sizeof(buf));			\
3326		mflags &= ~(flag);					\
3327	}								\
3328} while (0)
3329	MNT_FLAG(MNT_RDONLY);
3330	MNT_FLAG(MNT_SYNCHRONOUS);
3331	MNT_FLAG(MNT_NOEXEC);
3332	MNT_FLAG(MNT_NOSUID);
3333	MNT_FLAG(MNT_NFS4ACLS);
3334	MNT_FLAG(MNT_UNION);
3335	MNT_FLAG(MNT_ASYNC);
3336	MNT_FLAG(MNT_SUIDDIR);
3337	MNT_FLAG(MNT_SOFTDEP);
3338	MNT_FLAG(MNT_NOSYMFOLLOW);
3339	MNT_FLAG(MNT_GJOURNAL);
3340	MNT_FLAG(MNT_MULTILABEL);
3341	MNT_FLAG(MNT_ACLS);
3342	MNT_FLAG(MNT_NOATIME);
3343	MNT_FLAG(MNT_NOCLUSTERR);
3344	MNT_FLAG(MNT_NOCLUSTERW);
3345	MNT_FLAG(MNT_SUJ);
3346	MNT_FLAG(MNT_EXRDONLY);
3347	MNT_FLAG(MNT_EXPORTED);
3348	MNT_FLAG(MNT_DEFEXPORTED);
3349	MNT_FLAG(MNT_EXPORTANON);
3350	MNT_FLAG(MNT_EXKERB);
3351	MNT_FLAG(MNT_EXPUBLIC);
3352	MNT_FLAG(MNT_LOCAL);
3353	MNT_FLAG(MNT_QUOTA);
3354	MNT_FLAG(MNT_ROOTFS);
3355	MNT_FLAG(MNT_USER);
3356	MNT_FLAG(MNT_IGNORE);
3357	MNT_FLAG(MNT_UPDATE);
3358	MNT_FLAG(MNT_DELEXPORT);
3359	MNT_FLAG(MNT_RELOAD);
3360	MNT_FLAG(MNT_FORCE);
3361	MNT_FLAG(MNT_SNAPSHOT);
3362	MNT_FLAG(MNT_BYFSID);
3363#undef MNT_FLAG
3364	if (mflags != 0) {
3365		if (buf[0] != '\0')
3366			strlcat(buf, ", ", sizeof(buf));
3367		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3368		    "0x%016jx", mflags);
3369	}
3370	db_printf("    mnt_flag = %s\n", buf);
3371
3372	buf[0] = '\0';
3373	flags = mp->mnt_kern_flag;
3374#define	MNT_KERN_FLAG(flag)	do {					\
3375	if (flags & (flag)) {						\
3376		if (buf[0] != '\0')					\
3377			strlcat(buf, ", ", sizeof(buf));		\
3378		strlcat(buf, (#flag) + 5, sizeof(buf));			\
3379		flags &= ~(flag);					\
3380	}								\
3381} while (0)
3382	MNT_KERN_FLAG(MNTK_UNMOUNTF);
3383	MNT_KERN_FLAG(MNTK_ASYNC);
3384	MNT_KERN_FLAG(MNTK_SOFTDEP);
3385	MNT_KERN_FLAG(MNTK_NOINSMNTQ);
3386	MNT_KERN_FLAG(MNTK_DRAINING);
3387	MNT_KERN_FLAG(MNTK_REFEXPIRE);
3388	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
3389	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
3390	MNT_KERN_FLAG(MNTK_NO_IOPF);
3391	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
3392	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
3393	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
3394	MNT_KERN_FLAG(MNTK_MARKER);
3395	MNT_KERN_FLAG(MNTK_USES_BCACHE);
3396	MNT_KERN_FLAG(MNTK_NOASYNC);
3397	MNT_KERN_FLAG(MNTK_UNMOUNT);
3398	MNT_KERN_FLAG(MNTK_MWAIT);
3399	MNT_KERN_FLAG(MNTK_SUSPEND);
3400	MNT_KERN_FLAG(MNTK_SUSPEND2);
3401	MNT_KERN_FLAG(MNTK_SUSPENDED);
3402	MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
3403	MNT_KERN_FLAG(MNTK_NOKNOTE);
3404#undef MNT_KERN_FLAG
3405	if (flags != 0) {
3406		if (buf[0] != '\0')
3407			strlcat(buf, ", ", sizeof(buf));
3408		snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf),
3409		    "0x%08x", flags);
3410	}
3411	db_printf("    mnt_kern_flag = %s\n", buf);
3412
3413	db_printf("    mnt_opt = ");
3414	opt = TAILQ_FIRST(mp->mnt_opt);
3415	if (opt != NULL) {
3416		db_printf("%s", opt->name);
3417		opt = TAILQ_NEXT(opt, link);
3418		while (opt != NULL) {
3419			db_printf(", %s", opt->name);
3420			opt = TAILQ_NEXT(opt, link);
3421		}
3422	}
3423	db_printf("\n");
3424
3425	sp = &mp->mnt_stat;
3426	db_printf("    mnt_stat = { version=%u type=%u flags=0x%016jx "
3427	    "bsize=%ju iosize=%ju blocks=%ju bfree=%ju bavail=%jd files=%ju "
3428	    "ffree=%jd syncwrites=%ju asyncwrites=%ju syncreads=%ju "
3429	    "asyncreads=%ju namemax=%u owner=%u fsid=[%d, %d] }\n",
3430	    (u_int)sp->f_version, (u_int)sp->f_type, (uintmax_t)sp->f_flags,
3431	    (uintmax_t)sp->f_bsize, (uintmax_t)sp->f_iosize,
3432	    (uintmax_t)sp->f_blocks, (uintmax_t)sp->f_bfree,
3433	    (intmax_t)sp->f_bavail, (uintmax_t)sp->f_files,
3434	    (intmax_t)sp->f_ffree, (uintmax_t)sp->f_syncwrites,
3435	    (uintmax_t)sp->f_asyncwrites, (uintmax_t)sp->f_syncreads,
3436	    (uintmax_t)sp->f_asyncreads, (u_int)sp->f_namemax,
3437	    (u_int)sp->f_owner, (int)sp->f_fsid.val[0], (int)sp->f_fsid.val[1]);
3438
3439	db_printf("    mnt_cred = { uid=%u ruid=%u",
3440	    (u_int)mp->mnt_cred->cr_uid, (u_int)mp->mnt_cred->cr_ruid);
3441	if (jailed(mp->mnt_cred))
3442		db_printf(", jail=%d", mp->mnt_cred->cr_prison->pr_id);
3443	db_printf(" }\n");
3444	db_printf("    mnt_ref = %d\n", mp->mnt_ref);
3445	db_printf("    mnt_gen = %d\n", mp->mnt_gen);
3446	db_printf("    mnt_nvnodelistsize = %d\n", mp->mnt_nvnodelistsize);
3447	db_printf("    mnt_activevnodelistsize = %d\n",
3448	    mp->mnt_activevnodelistsize);
3449	db_printf("    mnt_writeopcount = %d\n", mp->mnt_writeopcount);
3450	db_printf("    mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
3451	db_printf("    mnt_iosize_max = %d\n", mp->mnt_iosize_max);
3452	db_printf("    mnt_hashseed = %u\n", mp->mnt_hashseed);
3453	db_printf("    mnt_lockref = %d\n", mp->mnt_lockref);
3454	db_printf("    mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
3455	db_printf("    mnt_secondary_accwrites = %d\n",
3456	    mp->mnt_secondary_accwrites);
3457	db_printf("    mnt_gjprovider = %s\n",
3458	    mp->mnt_gjprovider != NULL ? mp->mnt_gjprovider : "NULL");
3459
3460	db_printf("\n\nList of active vnodes\n");
3461	TAILQ_FOREACH(vp, &mp->mnt_activevnodelist, v_actfreelist) {
3462		if (vp->v_type != VMARKER) {
3463			vn_printf(vp, "vnode ");
3464			if (db_pager_quit)
3465				break;
3466		}
3467	}
3468	db_printf("\n\nList of inactive vnodes\n");
3469	TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3470		if (vp->v_type != VMARKER && (vp->v_iflag & VI_ACTIVE) == 0) {
3471			vn_printf(vp, "vnode ");
3472			if (db_pager_quit)
3473				break;
3474		}
3475	}
3476}
3477#endif	/* DDB */
3478
3479/*
3480 * Fill in a struct xvfsconf based on a struct vfsconf.
3481 */
3482static int
3483vfsconf2x(struct sysctl_req *req, struct vfsconf *vfsp)
3484{
3485	struct xvfsconf xvfsp;
3486
3487	bzero(&xvfsp, sizeof(xvfsp));
3488	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3489	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3490	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3491	xvfsp.vfc_flags = vfsp->vfc_flags;
3492	/*
3493	 * These are unused in userland; we keep them
3494	 * to preserve binary compatibility.
3495	 */
3496	xvfsp.vfc_vfsops = NULL;
3497	xvfsp.vfc_next = NULL;
3498	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3499}
3500
3501#ifdef COMPAT_FREEBSD32
3502struct xvfsconf32 {
3503	uint32_t	vfc_vfsops;
3504	char		vfc_name[MFSNAMELEN];
3505	int32_t		vfc_typenum;
3506	int32_t		vfc_refcount;
3507	int32_t		vfc_flags;
3508	uint32_t	vfc_next;
3509};
3510
3511static int
3512vfsconf2x32(struct sysctl_req *req, struct vfsconf *vfsp)
3513{
3514	struct xvfsconf32 xvfsp;
3515
3516	strcpy(xvfsp.vfc_name, vfsp->vfc_name);
3517	xvfsp.vfc_typenum = vfsp->vfc_typenum;
3518	xvfsp.vfc_refcount = vfsp->vfc_refcount;
3519	xvfsp.vfc_flags = vfsp->vfc_flags;
3520	xvfsp.vfc_vfsops = 0;
3521	xvfsp.vfc_next = 0;
3522	return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
3523}
3524#endif
3525
3526/*
3527 * Top level filesystem related information gathering.
3528 */
3529static int
3530sysctl_vfs_conflist(SYSCTL_HANDLER_ARGS)
3531{
3532	struct vfsconf *vfsp;
3533	int error;
3534
3535	error = 0;
3536	vfsconf_slock();
3537	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3538#ifdef COMPAT_FREEBSD32
3539		if (req->flags & SCTL_MASK32)
3540			error = vfsconf2x32(req, vfsp);
3541		else
3542#endif
3543			error = vfsconf2x(req, vfsp);
3544		if (error)
3545			break;
3546	}
3547	vfsconf_sunlock();
3548	return (error);
3549}
3550
3551SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
3552    CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
3553    "S,xvfsconf", "List of all configured filesystems");
3554
3555#ifndef BURN_BRIDGES
3556static int	sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS);
3557
3558static int
3559vfs_sysctl(SYSCTL_HANDLER_ARGS)
3560{
3561	int *name = (int *)arg1 - 1;	/* XXX */
3562	u_int namelen = arg2 + 1;	/* XXX */
3563	struct vfsconf *vfsp;
3564
3565	log(LOG_WARNING, "userland calling deprecated sysctl, "
3566	    "please rebuild world\n");
3567
3568#if 1 || defined(COMPAT_PRELITE2)
3569	/* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
3570	if (namelen == 1)
3571		return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
3572#endif
3573
3574	switch (name[1]) {
3575	case VFS_MAXTYPENUM:
3576		if (namelen != 2)
3577			return (ENOTDIR);
3578		return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
3579	case VFS_CONF:
3580		if (namelen != 3)
3581			return (ENOTDIR);	/* overloaded */
3582		vfsconf_slock();
3583		TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3584			if (vfsp->vfc_typenum == name[2])
3585				break;
3586		}
3587		vfsconf_sunlock();
3588		if (vfsp == NULL)
3589			return (EOPNOTSUPP);
3590#ifdef COMPAT_FREEBSD32
3591		if (req->flags & SCTL_MASK32)
3592			return (vfsconf2x32(req, vfsp));
3593		else
3594#endif
3595			return (vfsconf2x(req, vfsp));
3596	}
3597	return (EOPNOTSUPP);
3598}
3599
3600static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
3601    CTLFLAG_MPSAFE, vfs_sysctl,
3602    "Generic filesystem");
3603
3604#if 1 || defined(COMPAT_PRELITE2)
3605
3606static int
3607sysctl_ovfs_conf(SYSCTL_HANDLER_ARGS)
3608{
3609	int error;
3610	struct vfsconf *vfsp;
3611	struct ovfsconf ovfs;
3612
3613	vfsconf_slock();
3614	TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
3615		bzero(&ovfs, sizeof(ovfs));
3616		ovfs.vfc_vfsops = vfsp->vfc_vfsops;	/* XXX used as flag */
3617		strcpy(ovfs.vfc_name, vfsp->vfc_name);
3618		ovfs.vfc_index = vfsp->vfc_typenum;
3619		ovfs.vfc_refcount = vfsp->vfc_refcount;
3620		ovfs.vfc_flags = vfsp->vfc_flags;
3621		error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
3622		if (error != 0) {
3623			vfsconf_sunlock();
3624			return (error);
3625		}
3626	}
3627	vfsconf_sunlock();
3628	return (0);
3629}
3630
3631#endif /* 1 || COMPAT_PRELITE2 */
3632#endif /* !BURN_BRIDGES */
3633
3634#define KINFO_VNODESLOP		10
3635#ifdef notyet
3636/*
3637 * Dump vnode list (via sysctl).
3638 */
3639/* ARGSUSED */
3640static int
3641sysctl_vnode(SYSCTL_HANDLER_ARGS)
3642{
3643	struct xvnode *xvn;
3644	struct mount *mp;
3645	struct vnode *vp;
3646	int error, len, n;
3647
3648	/*
3649	 * Stale numvnodes access is not fatal here.
3650	 */
3651	req->lock = 0;
3652	len = (numvnodes + KINFO_VNODESLOP) * sizeof *xvn;
3653	if (!req->oldptr)
3654		/* Make an estimate */
3655		return (SYSCTL_OUT(req, 0, len));
3656
3657	error = sysctl_wire_old_buffer(req, 0);
3658	if (error != 0)
3659		return (error);
3660	xvn = malloc(len, M_TEMP, M_ZERO | M_WAITOK);
3661	n = 0;
3662	mtx_lock(&mountlist_mtx);
3663	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
3664		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
3665			continue;
3666		MNT_ILOCK(mp);
3667		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
3668			if (n == len)
3669				break;
3670			vref(vp);
3671			xvn[n].xv_size = sizeof *xvn;
3672			xvn[n].xv_vnode = vp;
3673			xvn[n].xv_id = 0;	/* XXX compat */
3674#define XV_COPY(field) xvn[n].xv_##field = vp->v_##field
3675			XV_COPY(usecount);
3676			XV_COPY(writecount);
3677			XV_COPY(holdcnt);
3678			XV_COPY(mount);
3679			XV_COPY(numoutput);
3680			XV_COPY(type);
3681#undef XV_COPY
3682			xvn[n].xv_flag = vp->v_vflag;
3683
3684			switch (vp->v_type) {
3685			case VREG:
3686			case VDIR:
3687			case VLNK:
3688				break;
3689			case VBLK:
3690			case VCHR:
3691				if (vp->v_rdev == NULL) {
3692					vrele(vp);
3693					continue;
3694				}
3695				xvn[n].xv_dev = dev2udev(vp->v_rdev);
3696				break;
3697			case VSOCK:
3698				xvn[n].xv_socket = vp->v_socket;
3699				break;
3700			case VFIFO:
3701				xvn[n].xv_fifo = vp->v_fifoinfo;
3702				break;
3703			case VNON:
3704			case VBAD:
3705			default:
3706				/* shouldn't happen? */
3707				vrele(vp);
3708				continue;
3709			}
3710			vrele(vp);
3711			++n;
3712		}
3713		MNT_IUNLOCK(mp);
3714		mtx_lock(&mountlist_mtx);
3715		vfs_unbusy(mp);
3716		if (n == len)
3717			break;
3718	}
3719	mtx_unlock(&mountlist_mtx);
3720
3721	error = SYSCTL_OUT(req, xvn, n * sizeof *xvn);
3722	free(xvn, M_TEMP);
3723	return (error);
3724}
3725
3726SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
3727    CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
3728    "");
3729#endif
3730
3731/*
3732 * Unmount all filesystems. The list is traversed in reverse order
3733 * of mounting to avoid dependencies.
3734 */
3735void
3736vfs_unmountall(void)
3737{
3738	struct mount *mp;
3739	struct thread *td;
3740	int error;
3741
3742	CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
3743	td = curthread;
3744
3745	/*
3746	 * Since this only runs when rebooting, it is not interlocked.
3747	 */
3748	while (!TAILQ_EMPTY(&mountlist)) {
3749		mp = TAILQ_LAST(&mountlist, mntlist);
3750		vfs_ref(mp);
3751		error = dounmount(mp, MNT_FORCE, td);
3752		if (error != 0) {
3753			TAILQ_REMOVE(&mountlist, mp, mnt_list);
3754			/*
3755			 * XXX: Due to the way in which we mount the root
3756			 * file system off of devfs, devfs will generate a
3757			 * "busy" warning when we try to unmount it before
3758			 * the root.  Don't print a warning as a result in
3759			 * order to avoid false positive errors that may
3760			 * cause needless upset.
3761			 */
3762			if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
3763				printf("unmount of %s failed (",
3764				    mp->mnt_stat.f_mntonname);
3765				if (error == EBUSY)
3766					printf("BUSY)\n");
3767				else
3768					printf("%d)\n", error);
3769			}
3770		} else {
3771			/* The unmount has removed mp from the mountlist */
3772		}
3773	}
3774}
3775
3776/*
3777 * Perform msync on all vnodes under a mount point.
3778 * The mount point must be locked.
3779 */
3780void
3781vfs_msync(struct mount *mp, int flags)
3782{
3783	struct vnode *vp, *mvp;
3784	struct vm_object *obj;
3785
3786	CTR2(KTR_VFS, "%s: mp %p", __func__, mp);
3787	MNT_VNODE_FOREACH_ACTIVE(vp, mp, mvp) {
3788		obj = vp->v_object;
3789		if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0 &&
3790		    (flags == MNT_WAIT || VOP_ISLOCKED(vp) == 0)) {
3791			if (!vget(vp,
3792			    LK_EXCLUSIVE | LK_RETRY | LK_INTERLOCK,
3793			    curthread)) {
3794				if (vp->v_vflag & VV_NOSYNC) {	/* unlinked */
3795					vput(vp);
3796					continue;
3797				}
3798
3799				obj = vp->v_object;
3800				if (obj != NULL) {
3801					VM_OBJECT_WLOCK(obj);
3802					vm_object_page_clean(obj, 0, 0,
3803					    flags == MNT_WAIT ?
3804					    OBJPC_SYNC : OBJPC_NOSYNC);
3805					VM_OBJECT_WUNLOCK(obj);
3806				}
3807				vput(vp);
3808			}
3809		} else
3810			VI_UNLOCK(vp);
3811	}
3812}
3813
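/*
 * Free the knlist, mutex and storage backing a vpollinfo that is not
 * (or is no longer) reachable through a vnode.
 */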
3814static void
3815destroy_vpollinfo_free(struct vpollinfo *vi)
3816{
3817
3818	knlist_destroy(&vi->vpi_selinfo.si_note);
3819	mtx_destroy(&vi->vpi_lock);
3820	uma_zfree(vnodepoll_zone, vi);
3821}
3822
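/*
 * Tear down a vnode's poll state: detach any remaining knotes, drain
 * pending selects and free the structure.
 */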
3823static void
3824destroy_vpollinfo(struct vpollinfo *vi)
3825{
3826
3827	knlist_clear(&vi->vpi_selinfo.si_note, 1);
3828	seldrain(&vi->vpi_selinfo);
3829	destroy_vpollinfo_free(vi);
3830}
3831
3832/*
3833 * Initialize per-vnode helper structure to hold poll-related state.
3834 */
3835void
3836v_addpollinfo(struct vnode *vp)
3837{
3838	struct vpollinfo *vi;
3839
3840	if (vp->v_pollinfo != NULL)
3841		return;
3842	vi = uma_zalloc(vnodepoll_zone, M_WAITOK);
3843	mtx_init(&vi->vpi_lock, "vnode pollinfo", NULL, MTX_DEF);
3844	knlist_init(&vi->vpi_selinfo.si_note, vp, vfs_knllock,
3845	    vfs_knlunlock, vfs_knl_assert_locked, vfs_knl_assert_unlocked);
3846	VI_LOCK(vp);
3847	if (vp->v_pollinfo != NULL) {
3848		VI_UNLOCK(vp);
3849		destroy_vpollinfo_free(vi);
3850		return;
3851	}
3852	vp->v_pollinfo = vi;
3853	VI_UNLOCK(vp);
3854}
3855
3856/*
3857 * Record a process's interest in events which might happen to
3858 * a vnode.  Because poll uses the historic select-style interface
3859 * internally, this routine serves as both the ``check for any
3860 * pending events'' and the ``record my interest in future events''
3861 * functions.  (These are done together, while the lock is held,
3862 * to avoid race conditions.)
3863 */
3864int
3865vn_pollrecord(struct vnode *vp, struct thread *td, int events)
3866{
3867
3868	v_addpollinfo(vp);
3869	mtx_lock(&vp->v_pollinfo->vpi_lock);
3870	if (vp->v_pollinfo->vpi_revents & events) {
3871		/*
3872		 * This leaves events we are not interested
3873		 * presumably had requested them
3874		 * which presumably had requested them
3875		 * (otherwise they would never have been
3876		 * recorded).
3877		 */
3878		events &= vp->v_pollinfo->vpi_revents;
3879		vp->v_pollinfo->vpi_revents &= ~events;
3880
3881		mtx_unlock(&vp->v_pollinfo->vpi_lock);
3882		return (events);
3883	}
3884	vp->v_pollinfo->vpi_events |= events;
3885	selrecord(td, &vp->v_pollinfo->vpi_selinfo);
3886	mtx_unlock(&vp->v_pollinfo->vpi_lock);
3887	return (0);
3888}
3889
3890/*
3891 * Routine to create and manage a filesystem syncer vnode.
3892 */
3893#define sync_close ((int (*)(struct  vop_close_args *))nullop)
3894static int	sync_fsync(struct  vop_fsync_args *);
3895static int	sync_inactive(struct  vop_inactive_args *);
3896static int	sync_reclaim(struct  vop_reclaim_args *);
3897
3898static struct vop_vector sync_vnodeops = {
3899	.vop_bypass =	VOP_EOPNOTSUPP,
3900	.vop_close =	sync_close,		/* close */
3901	.vop_fsync =	sync_fsync,		/* fsync */
3902	.vop_inactive =	sync_inactive,	/* inactive */
3903	.vop_reclaim =	sync_reclaim,	/* reclaim */
3904	.vop_lock1 =	vop_stdlock,	/* lock */
3905	.vop_unlock =	vop_stdunlock,	/* unlock */
3906	.vop_islocked =	vop_stdislocked,	/* islocked */
3907};
3908
3909/*
3910 * Create a new filesystem syncer vnode for the specified mount point.
3911 */
3912void
3913vfs_allocate_syncvnode(struct mount *mp)
3914{
3915	struct vnode *vp;
3916	struct bufobj *bo;
3917	static long start, incr, next;
3918	int error;
3919
3920	/* Allocate a new vnode */
3921	error = getnewvnode("syncer", mp, &sync_vnodeops, &vp);
3922	if (error != 0)
3923		panic("vfs_allocate_syncvnode: getnewvnode() failed");
3924	vp->v_type = VNON;
3925	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3926	vp->v_vflag |= VV_FORCEINSMQ;
3927	error = insmntque(vp, mp);
3928	if (error != 0)
3929		panic("vfs_allocate_syncvnode: insmntque() failed");
3930	vp->v_vflag &= ~VV_FORCEINSMQ;
3931	VOP_UNLOCK(vp, 0);
3932	/*
3933	 * Place the vnode onto the syncer worklist. We attempt to
3934	 * scatter them about on the list so that they will go off
3935	 * at evenly distributed times even if all the filesystems
3936	 * are mounted at once.
3937	 */
3938	next += incr;
3939	if (next == 0 || next > syncer_maxdelay) {
3940		start /= 2;
3941		incr /= 2;
3942		if (start == 0) {
3943			start = syncer_maxdelay / 2;
3944			incr = syncer_maxdelay;
3945		}
3946		next = start;
3947	}
3948	bo = &vp->v_bufobj;
3949	BO_LOCK(bo);
3950	vn_syncer_add_to_worklist(bo, syncdelay > 0 ? next % syncdelay : 0);
3951	/* XXX - vn_syncer_add_to_worklist() also grabs and drops sync_mtx. */
3952	mtx_lock(&sync_mtx);
3953	sync_vnode_count++;
3954	if (mp->mnt_syncer == NULL) {
3955		mp->mnt_syncer = vp;
3956		vp = NULL;
3957	}
3958	mtx_unlock(&sync_mtx);
3959	BO_UNLOCK(bo);
3960	if (vp != NULL) {
3961		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3962		vgone(vp);
3963		vput(vp);
3964	}
3965}
3966
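/*
 * Detach the syncer vnode from its mount point, if one was allocated,
 * and release it.
 */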
3967void
3968vfs_deallocate_syncvnode(struct mount *mp)
3969{
3970	struct vnode *vp;
3971
3972	mtx_lock(&sync_mtx);
3973	vp = mp->mnt_syncer;
3974	if (vp != NULL)
3975		mp->mnt_syncer = NULL;
3976	mtx_unlock(&sync_mtx);
3977	if (vp != NULL)
3978		vrele(vp);
3979}
3980
3981/*
3982 * Do a lazy sync of the filesystem.
3983 */
3984static int
3985sync_fsync(struct vop_fsync_args *ap)
3986{
3987	struct vnode *syncvp = ap->a_vp;
3988	struct mount *mp = syncvp->v_mount;
3989	int error, save;
3990	struct bufobj *bo;
3991
3992	/*
3993	 * We only need to do something if this is a lazy evaluation.
3994	 */
3995	if (ap->a_waitfor != MNT_LAZY)
3996		return (0);
3997
3998	/*
3999	 * Move ourselves to the back of the sync list.
4000	 */
4001	bo = &syncvp->v_bufobj;
4002	BO_LOCK(bo);
4003	vn_syncer_add_to_worklist(bo, syncdelay);
4004	BO_UNLOCK(bo);
4005
4006	/*
4007	 * Walk the list of vnodes pushing all that are dirty and
4008	 * not already on the sync list.
4009	 */
4010	if (vfs_busy(mp, MBF_NOWAIT) != 0)
4011		return (0);
4012	if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
4013		vfs_unbusy(mp);
4014		return (0);
4015	}
4016	save = curthread_pflags_set(TDP_SYNCIO);
4017	vfs_msync(mp, MNT_NOWAIT);
4018	error = VFS_SYNC(mp, MNT_LAZY);
4019	curthread_pflags_restore(save);
4020	vn_finished_write(mp);
4021	vfs_unbusy(mp);
4022	return (error);
4023}
4024
4025/*
4026 * The syncer vnode is no longer referenced.
4027 */
4028static int
4029sync_inactive(struct vop_inactive_args *ap)
4030{
4031
4032	vgone(ap->a_vp);
4033	return (0);
4034}
4035
4036/*
4037 * The syncer vnode is no longer needed and is being decommissioned.
4038 *
4039 * Modifications to the worklist must be protected by sync_mtx.
4040 */
4041static int
4042sync_reclaim(struct vop_reclaim_args *ap)
4043{
4044	struct vnode *vp = ap->a_vp;
4045	struct bufobj *bo;
4046
4047	bo = &vp->v_bufobj;
4048	BO_LOCK(bo);
4049	mtx_lock(&sync_mtx);
4050	if (vp->v_mount->mnt_syncer == vp)
4051		vp->v_mount->mnt_syncer = NULL;
4052	if (bo->bo_flag & BO_ONWORKLST) {
4053		LIST_REMOVE(bo, bo_synclist);
4054		syncer_worklist_len--;
4055		sync_vnode_count--;
4056		bo->bo_flag &= ~BO_ONWORKLST;
4057	}
4058	mtx_unlock(&sync_mtx);
4059	BO_UNLOCK(bo);
4060
4061	return (0);
4062}
4063
4064/*
4065 * Check if vnode represents a disk device
4066 */
4067int
4068vn_isdisk(struct vnode *vp, int *errp)
4069{
4070	int error;
4071
4072	if (vp->v_type != VCHR) {
4073		error = ENOTBLK;
4074		goto out;
4075	}
4076	error = 0;
4077	dev_lock();
4078	if (vp->v_rdev == NULL)
4079		error = ENXIO;
4080	else if (vp->v_rdev->si_devsw == NULL)
4081		error = ENXIO;
4082	else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
4083		error = ENOTBLK;
4084	dev_unlock();
4085out:
4086	if (errp != NULL)
4087		*errp = error;
4088	return (error == 0);
4089}
4090
4091/*
4092 * Common filesystem object access control check routine.  Accepts a
4093 * vnode's type, "mode", uid and gid, requested access mode, credentials,
4094 * and optional call-by-reference privused argument allowing vaccess()
4095 * to indicate to the caller whether privilege was used to satisfy the
4096 * request (obsoleted).  Returns 0 on success, or an errno on failure.
4097 */
4098int
4099vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
4100    accmode_t accmode, struct ucred *cred, int *privused)
4101{
4102	accmode_t dac_granted;
4103	accmode_t priv_granted;
4104
4105	KASSERT((accmode & ~(VEXEC | VWRITE | VREAD | VADMIN | VAPPEND)) == 0,
4106	    ("invalid bit in accmode"));
4107	KASSERT((accmode & VAPPEND) == 0 || (accmode & VWRITE),
4108	    ("VAPPEND without VWRITE"));
4109
4110	/*
4111	 * Look for a normal, non-privileged way to access the file/directory
4112	 * as requested.  If it exists, go with that.
4113	 */
4114
4115	if (privused != NULL)
4116		*privused = 0;
4117
4118	dac_granted = 0;
4119
4120	/* Check the owner. */
4121	if (cred->cr_uid == file_uid) {
4122		dac_granted |= VADMIN;
4123		if (file_mode & S_IXUSR)
4124			dac_granted |= VEXEC;
4125		if (file_mode & S_IRUSR)
4126			dac_granted |= VREAD;
4127		if (file_mode & S_IWUSR)
4128			dac_granted |= (VWRITE | VAPPEND);
4129
4130		if ((accmode & dac_granted) == accmode)
4131			return (0);
4132
4133		goto privcheck;
4134	}
4135
4136	/* Otherwise, check the groups (first match) */
4137	if (groupmember(file_gid, cred)) {
4138		if (file_mode & S_IXGRP)
4139			dac_granted |= VEXEC;
4140		if (file_mode & S_IRGRP)
4141			dac_granted |= VREAD;
4142		if (file_mode & S_IWGRP)
4143			dac_granted |= (VWRITE | VAPPEND);
4144
4145		if ((accmode & dac_granted) == accmode)
4146			return (0);
4147
4148		goto privcheck;
4149	}
4150
4151	/* Otherwise, check everyone else. */
4152	if (file_mode & S_IXOTH)
4153		dac_granted |= VEXEC;
4154	if (file_mode & S_IROTH)
4155		dac_granted |= VREAD;
4156	if (file_mode & S_IWOTH)
4157		dac_granted |= (VWRITE | VAPPEND);
4158	if ((accmode & dac_granted) == accmode)
4159		return (0);
4160
4161privcheck:
4162	/*
4163	 * Build a privilege mask to determine if the set of privileges
4164	 * satisfies the requirements when combined with the granted mask
4165	 * from above.  For each privilege, if the privilege is required,
4166	 * bitwise or the request type onto the priv_granted mask.
4167	 */
4168	priv_granted = 0;
4169
4170	if (type == VDIR) {
4171		/*
4172		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
4173		 * requests, instead of PRIV_VFS_EXEC.
4174		 */
4175		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
4176		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
4177			priv_granted |= VEXEC;
4178	} else {
4179		/*
4180		 * Ensure that at least one execute bit is on. Otherwise,
4181		 * a privileged user will always succeed, and we don't want
4182		 * this to happen unless the file really is executable.
4183		 */
4184		if ((accmode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
4185		    (file_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) != 0 &&
4186		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
4187			priv_granted |= VEXEC;
4188	}
4189
4190	if ((accmode & VREAD) && ((dac_granted & VREAD) == 0) &&
4191	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
4192		priv_granted |= VREAD;
4193
4194	if ((accmode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
4195	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
4196		priv_granted |= (VWRITE | VAPPEND);
4197
4198	if ((accmode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
4199	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
4200		priv_granted |= VADMIN;
4201
4202	if ((accmode & (priv_granted | dac_granted)) == accmode) {
4203		/* XXX audit: privilege used */
4204		if (privused != NULL)
4205			*privused = 1;
4206		return (0);
4207	}
4208
4209	return ((accmode & VADMIN) ? EPERM : EACCES);
4210}
4211
4212/*
4213 * Credential check based on process requesting service, and per-attribute
4214 * permissions.
4215 */
4216int
4217extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
4218    struct thread *td, accmode_t accmode)
4219{
4220
4221	/*
4222	 * Kernel-invoked operations always succeed.
4223	 */
4224	if (cred == NOCRED)
4225		return (0);
4226
4227	/*
4228	 * Do not allow privileged processes in jail to directly manipulate
4229	 * system attributes.
4230	 */
4231	switch (attrnamespace) {
4232	case EXTATTR_NAMESPACE_SYSTEM:
4233		/* Potentially should be: return (EPERM); */
4234		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
4235	case EXTATTR_NAMESPACE_USER:
4236		return (VOP_ACCESS(vp, accmode, cred, td));
4237	default:
4238		return (EPERM);
4239	}
4240}
4241
4242#ifdef DEBUG_VFS_LOCKS
4243/*
4244 * This only exists to suppress warnings from unlocked specfs accesses.  It is
4245 * no longer ok to have an unlocked VFS.
4246 */
4247#define	IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL ||		\
4248	(vp)->v_type == VCHR ||	(vp)->v_type == VBAD)
4249
4250int vfs_badlock_ddb = 1;	/* Drop into debugger on violation. */
4251SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_ddb, CTLFLAG_RW, &vfs_badlock_ddb, 0,
4252    "Drop into debugger on lock violation");
4253
4254int vfs_badlock_mutex = 1;	/* Check for interlock across VOPs. */
4255SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_mutex, CTLFLAG_RW, &vfs_badlock_mutex,
4256    0, "Check for interlock across VOPs");
4257
4258int vfs_badlock_print = 1;	/* Print lock violations. */
4259SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_print, CTLFLAG_RW, &vfs_badlock_print,
4260    0, "Print lock violations");
4261
4262#ifdef KDB
4263int vfs_badlock_backtrace = 1;	/* Print backtrace at lock violations. */
4264SYSCTL_INT(_debug, OID_AUTO, vfs_badlock_backtrace, CTLFLAG_RW,
4265    &vfs_badlock_backtrace, 0, "Print backtrace at lock violations");
4266#endif
4267
4268static void
4269vfs_badlock(const char *msg, const char *str, struct vnode *vp)
4270{
4271
4272#ifdef KDB
4273	if (vfs_badlock_backtrace)
4274		kdb_backtrace();
4275#endif
4276	if (vfs_badlock_print)
4277		printf("%s: %p %s\n", str, (void *)vp, msg);
4278	if (vfs_badlock_ddb)
4279		kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4280}
4281
4282void
4283assert_vi_locked(struct vnode *vp, const char *str)
4284{
4285
4286	if (vfs_badlock_mutex && !mtx_owned(VI_MTX(vp)))
4287		vfs_badlock("interlock is not locked but should be", str, vp);
4288}
4289
4290void
4291assert_vi_unlocked(struct vnode *vp, const char *str)
4292{
4293
4294	if (vfs_badlock_mutex && mtx_owned(VI_MTX(vp)))
4295		vfs_badlock("interlock is locked but should not be", str, vp);
4296}
4297
4298void
4299assert_vop_locked(struct vnode *vp, const char *str)
4300{
4301	int locked;
4302
4303	if (!IGNORE_LOCK(vp)) {
4304		locked = VOP_ISLOCKED(vp);
4305		if (locked == 0 || locked == LK_EXCLOTHER)
4306			vfs_badlock("is not locked but should be", str, vp);
4307	}
4308}
4309
4310void
4311assert_vop_unlocked(struct vnode *vp, const char *str)
4312{
4313
4314	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) == LK_EXCLUSIVE)
4315		vfs_badlock("is locked but should not be", str, vp);
4316}
4317
4318void
4319assert_vop_elocked(struct vnode *vp, const char *str)
4320{
4321
4322	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
4323		vfs_badlock("is not exclusive locked but should be", str, vp);
4324}
4325
4326#if 0
4327void
4328assert_vop_elocked_other(struct vnode *vp, const char *str)
4329{
4330
4331	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_EXCLOTHER)
4332		vfs_badlock("is not exclusive locked by another thread",
4333		    str, vp);
4334}
4335
4336void
4337assert_vop_slocked(struct vnode *vp, const char *str)
4338{
4339
4340	if (!IGNORE_LOCK(vp) && VOP_ISLOCKED(vp) != LK_SHARED)
4341		vfs_badlock("is not locked shared but should be", str, vp);
4342}
4343#endif /* 0 */
4344#endif /* DEBUG_VFS_LOCKS */
4345
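/*
 * Release the vnode locks and references that were handed to
 * VOP_RENAME when the rename is being abandoned.
 */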
4346void
4347vop_rename_fail(struct vop_rename_args *ap)
4348{
4349
4350	if (ap->a_tvp != NULL)
4351		vput(ap->a_tvp);
4352	if (ap->a_tdvp == ap->a_tvp)
4353		vrele(ap->a_tdvp);
4354	else
4355		vput(ap->a_tdvp);
4356	vrele(ap->a_fdvp);
4357	vrele(ap->a_fvp);
4358}
4359
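/*
 * Pre-VOP_RENAME checks: assert (under DEBUG_VFS_LOCKS) the expected
 * lock state -- source vnodes unlocked, target vnodes locked -- and
 * take hold counts on all four vnodes; vop_rename_post() drops them.
 */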
4360void
4361vop_rename_pre(void *ap)
4362{
4363	struct vop_rename_args *a = ap;
4364
4365#ifdef DEBUG_VFS_LOCKS
4366	if (a->a_tvp)
4367		ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
4368	ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
4369	ASSERT_VI_UNLOCKED(a->a_fvp, "VOP_RENAME");
4370	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
4371
4372	/* Check the source (from). */
4373	if (a->a_tdvp->v_vnlock != a->a_fdvp->v_vnlock &&
4374	    (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fdvp->v_vnlock))
4375		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
4376	if (a->a_tvp == NULL || a->a_tvp->v_vnlock != a->a_fvp->v_vnlock)
4377		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
4378
4379	/* Check the target. */
4380	if (a->a_tvp)
4381		ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
4382	ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
4383#endif
4384	if (a->a_tdvp != a->a_fdvp)
4385		vhold(a->a_fdvp);
4386	if (a->a_tvp != a->a_fvp)
4387		vhold(a->a_fvp);
4388	vhold(a->a_tdvp);
4389	if (a->a_tvp)
4390		vhold(a->a_tvp);
4391}
4392
4393void
4394vop_strategy_pre(void *ap)
4395{
4396#ifdef DEBUG_VFS_LOCKS
4397	struct vop_strategy_args *a;
4398	struct buf *bp;
4399
4400	a = ap;
4401	bp = a->a_bp;
4402
4403	/*
4404	 * Cluster ops lock their component buffers but not the IO container.
4405	 */
4406	if ((bp->b_flags & B_CLUSTER) != 0)
4407		return;
4408
4409	if (panicstr == NULL && !BUF_ISLOCKED(bp)) {
4410		if (vfs_badlock_print)
4411			printf(
4412			    "VOP_STRATEGY: bp is not locked but should be\n");
4413		if (vfs_badlock_ddb)
4414			kdb_enter(KDB_WHY_VFSLOCK, "lock violation");
4415	}
4416#endif
4417}
4418
4419void
4420vop_lock_pre(void *ap)
4421{
4422#ifdef DEBUG_VFS_LOCKS
4423	struct vop_lock1_args *a = ap;
4424
4425	if ((a->a_flags & LK_INTERLOCK) == 0)
4426		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4427	else
4428		ASSERT_VI_LOCKED(a->a_vp, "VOP_LOCK");
4429#endif
4430}
4431
4432void
4433vop_lock_post(void *ap, int rc)
4434{
4435#ifdef DEBUG_VFS_LOCKS
4436	struct vop_lock1_args *a = ap;
4437
4438	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
4439	if (rc == 0 && (a->a_flags & LK_EXCLOTHER) == 0)
4440		ASSERT_VOP_LOCKED(a->a_vp, "VOP_LOCK");
4441#endif
4442}
4443
4444void
4445vop_unlock_pre(void *ap)
4446{
4447#ifdef DEBUG_VFS_LOCKS
4448	struct vop_unlock_args *a = ap;
4449
4450	if (a->a_flags & LK_INTERLOCK)
4451		ASSERT_VI_LOCKED(a->a_vp, "VOP_UNLOCK");
4452	ASSERT_VOP_LOCKED(a->a_vp, "VOP_UNLOCK");
4453#endif
4454}
4455
4456void
4457vop_unlock_post(void *ap, int rc)
4458{
4459#ifdef DEBUG_VFS_LOCKS
4460	struct vop_unlock_args *a = ap;
4461
4462	if (a->a_flags & LK_INTERLOCK)
4463		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_UNLOCK");
4464#endif
4465}
4466
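/*
 * Post-VOP hooks: on success, post the appropriate kevent notes on the
 * vnodes involved so that kqueue watchers on them are notified.
 */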
4467void
4468vop_create_post(void *ap, int rc)
4469{
4470	struct vop_create_args *a = ap;
4471
4472	if (!rc)
4473		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4474}
4475
4476void
4477vop_deleteextattr_post(void *ap, int rc)
4478{
4479	struct vop_deleteextattr_args *a = ap;
4480
4481	if (!rc)
4482		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4483}
4484
4485void
4486vop_link_post(void *ap, int rc)
4487{
4488	struct vop_link_args *a = ap;
4489
4490	if (!rc) {
4491		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
4492		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
4493	}
4494}
4495
4496void
4497vop_mkdir_post(void *ap, int rc)
4498{
4499	struct vop_mkdir_args *a = ap;
4500
4501	if (!rc)
4502		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4503}
4504
4505void
4506vop_mknod_post(void *ap, int rc)
4507{
4508	struct vop_mknod_args *a = ap;
4509
4510	if (!rc)
4511		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4512}
4513
4514void
4515vop_remove_post(void *ap, int rc)
4516{
4517	struct vop_remove_args *a = ap;
4518
4519	if (!rc) {
4520		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4521		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4522	}
4523}
4524
4525void
4526vop_rename_post(void *ap, int rc)
4527{
4528	struct vop_rename_args *a = ap;
4529
4530	if (!rc) {
4531		VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
4532		VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
4533		VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
4534		if (a->a_tvp)
4535			VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
4536	}
4537	if (a->a_tdvp != a->a_fdvp)
4538		vdrop(a->a_fdvp);
4539	if (a->a_tvp != a->a_fvp)
4540		vdrop(a->a_fvp);
4541	vdrop(a->a_tdvp);
4542	if (a->a_tvp)
4543		vdrop(a->a_tvp);
4544}
4545
4546void
4547vop_rmdir_post(void *ap, int rc)
4548{
4549	struct vop_rmdir_args *a = ap;
4550
4551	if (!rc) {
4552		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE | NOTE_LINK);
4553		VFS_KNOTE_LOCKED(a->a_vp, NOTE_DELETE);
4554	}
4555}
4556
4557void
4558vop_setattr_post(void *ap, int rc)
4559{
4560	struct vop_setattr_args *a = ap;
4561
4562	if (!rc)
4563		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4564}
4565
4566void
4567vop_setextattr_post(void *ap, int rc)
4568{
4569	struct vop_setextattr_args *a = ap;
4570
4571	if (!rc)
4572		VFS_KNOTE_LOCKED(a->a_vp, NOTE_ATTRIB);
4573}
4574
4575void
4576vop_symlink_post(void *ap, int rc)
4577{
4578	struct vop_symlink_args *a = ap;
4579
4580	if (!rc)
4581		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
4582}
4583
4584static struct knlist fs_knlist;
4585
4586static void
4587vfs_event_init(void *arg)
4588{
4589	knlist_init_mtx(&fs_knlist, NULL);
4590}
4591/* XXX - correct order? */
4592SYSINIT(vfs_knlist, SI_SUB_VFS, SI_ORDER_ANY, vfs_event_init, NULL);
4593
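/*
 * Signal a filesystem-level event to any knotes attached to fs_knlist.
 */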
4594void
4595vfs_event_signal(fsid_t *fsid, uint32_t event, intptr_t data __unused)
4596{
4597
4598	KNOTE_UNLOCKED(&fs_knlist, event);
4599}
4600
4601static int	filt_fsattach(struct knote *kn);
4602static void	filt_fsdetach(struct knote *kn);
4603static int	filt_fsevent(struct knote *kn, long hint);
4604
4605struct filterops fs_filtops = {
4606	.f_isfd = 0,
4607	.f_attach = filt_fsattach,
4608	.f_detach = filt_fsdetach,
4609	.f_event = filt_fsevent
4610};
4611
4612static int
4613filt_fsattach(struct knote *kn)
4614{
4615
4616	kn->kn_flags |= EV_CLEAR;
4617	knlist_add(&fs_knlist, kn, 0);
4618	return (0);
4619}
4620
4621static void
4622filt_fsdetach(struct knote *kn)
4623{
4624
4625	knlist_remove(&fs_knlist, kn, 0);
4626}
4627
4628static int
4629filt_fsevent(struct knote *kn, long hint)
4630{
4631
4632	kn->kn_fflags |= hint;
4633	return (kn->kn_fflags != 0);
4634}
4635
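/*
 * Handler for the vfs.ctl sysctl: locate the mount point named by the
 * fsid in the request, verify that the filesystem type matches and
 * pass the operation down via VFS_SYSCTL().
 */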
4636static int
4637sysctl_vfs_ctl(SYSCTL_HANDLER_ARGS)
4638{
4639	struct vfsidctl vc;
4640	int error;
4641	struct mount *mp;
4642
4643	error = SYSCTL_IN(req, &vc, sizeof(vc));
4644	if (error)
4645		return (error);
4646	if (vc.vc_vers != VFS_CTL_VERS1)
4647		return (EINVAL);
4648	mp = vfs_getvfs(&vc.vc_fsid);
4649	if (mp == NULL)
4650		return (ENOENT);
4651	/* ensure that a specific sysctl goes to the right filesystem. */
4652	if (strcmp(vc.vc_fstypename, "*") != 0 &&
4653	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
4654		vfs_rel(mp);
4655		return (EINVAL);
4656	}
4657	VCTLTOREQ(&vc, req);
4658	error = VFS_SYSCTL(mp, vc.vc_op, req);
4659	vfs_rel(mp);
4660	return (error);
4661}
4662
4663SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLTYPE_OPAQUE | CTLFLAG_WR,
4664    NULL, 0, sysctl_vfs_ctl, "",
4665    "Sysctl by fsid");
4666
4667/*
4668 * Function to initialize a va_filerev field sensibly.
4669 * XXX: Wouldn't a random number make a lot more sense ??
4670 */
4671u_quad_t
4672init_va_filerev(void)
4673{
4674	struct bintime bt;
4675
4676	getbinuptime(&bt);
4677	return (((u_quad_t)bt.sec << 32LL) | (bt.frac >> 32LL));
4678}
4679
4680static int	filt_vfsread(struct knote *kn, long hint);
4681static int	filt_vfswrite(struct knote *kn, long hint);
4682static int	filt_vfsvnode(struct knote *kn, long hint);
4683static void	filt_vfsdetach(struct knote *kn);
4684static struct filterops vfsread_filtops = {
4685	.f_isfd = 1,
4686	.f_detach = filt_vfsdetach,
4687	.f_event = filt_vfsread
4688};
4689static struct filterops vfswrite_filtops = {
4690	.f_isfd = 1,
4691	.f_detach = filt_vfsdetach,
4692	.f_event = filt_vfswrite
4693};
4694static struct filterops vfsvnode_filtops = {
4695	.f_isfd = 1,
4696	.f_detach = filt_vfsdetach,
4697	.f_event = filt_vfsvnode
4698};
4699
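/*
 * Lock, unlock and assertion callbacks for the per-vnode knlist set up
 * in v_addpollinfo(); the vnode lock itself protects the note list.
 */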
4700static void
4701vfs_knllock(void *arg)
4702{
4703	struct vnode *vp = arg;
4704
4705	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
4706}
4707
4708static void
4709vfs_knlunlock(void *arg)
4710{
4711	struct vnode *vp = arg;
4712
4713	VOP_UNLOCK(vp, 0);
4714}
4715
4716static void
4717vfs_knl_assert_locked(void *arg)
4718{
4719#ifdef DEBUG_VFS_LOCKS
4720	struct vnode *vp = arg;
4721
4722	ASSERT_VOP_LOCKED(vp, "vfs_knl_assert_locked");
4723#endif
4724}
4725
4726static void
4727vfs_knl_assert_unlocked(void *arg)
4728{
4729#ifdef DEBUG_VFS_LOCKS
4730	struct vnode *vp = arg;
4731
4732	ASSERT_VOP_UNLOCKED(vp, "vfs_knl_assert_unlocked");
4733#endif
4734}
4735
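/*
 * Attach a knote to a vnode: select the filter implementation, make
 * sure the vnode has poll state allocated and link the knote onto the
 * vnode's note list while holding the vnode.
 */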
4736int
4737vfs_kqfilter(struct vop_kqfilter_args *ap)
4738{
4739	struct vnode *vp = ap->a_vp;
4740	struct knote *kn = ap->a_kn;
4741	struct knlist *knl;
4742
4743	switch (kn->kn_filter) {
4744	case EVFILT_READ:
4745		kn->kn_fop = &vfsread_filtops;
4746		break;
4747	case EVFILT_WRITE:
4748		kn->kn_fop = &vfswrite_filtops;
4749		break;
4750	case EVFILT_VNODE:
4751		kn->kn_fop = &vfsvnode_filtops;
4752		break;
4753	default:
4754		return (EINVAL);
4755	}
4756
4757	kn->kn_hook = (caddr_t)vp;
4758
4759	v_addpollinfo(vp);
4760	if (vp->v_pollinfo == NULL)
4761		return (ENOMEM);
4762	knl = &vp->v_pollinfo->vpi_selinfo.si_note;
4763	vhold(vp);
4764	knlist_add(knl, kn, 0);
4765
4766	return (0);
4767}
4768
4769/*
4770 * Detach knote from vnode
4771 */
4772static void
4773filt_vfsdetach(struct knote *kn)
4774{
4775	struct vnode *vp = (struct vnode *)kn->kn_hook;
4776
4777	KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
4778	knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
4779	vdrop(vp);
4780}
4781
4782/*ARGSUSED*/
4783static int
4784filt_vfsread(struct knote *kn, long hint)
4785{
4786	struct vnode *vp = (struct vnode *)kn->kn_hook;
4787	struct vattr va;
4788	int res;
4789
4790	/*
4791	 * filesystem is gone, so set the EOF flag and schedule
4792	 * the knote for deletion.
4793	 */
4794	if (hint == NOTE_REVOKE) {
4795		VI_LOCK(vp);
4796		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4797		VI_UNLOCK(vp);
4798		return (1);
4799	}
4800
4801	if (VOP_GETATTR(vp, &va, curthread->td_ucred))
4802		return (0);
4803
4804	VI_LOCK(vp);
4805	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
4806	res = (kn->kn_data != 0);
4807	VI_UNLOCK(vp);
4808	return (res);
4809}
4810
4811/*ARGSUSED*/
4812static int
4813filt_vfswrite(struct knote *kn, long hint)
4814{
4815	struct vnode *vp = (struct vnode *)kn->kn_hook;
4816
4817	VI_LOCK(vp);
4818
4819	/*
4820	 * filesystem is gone, so set the EOF flag and schedule
4821	 * the knote for deletion.
4822	 */
4823	if (hint == NOTE_REVOKE)
4824		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
4825
4826	kn->kn_data = 0;
4827	VI_UNLOCK(vp);
4828	return (1);
4829}
4830
4831static int
4832filt_vfsvnode(struct knote *kn, long hint)
4833{
4834	struct vnode *vp = (struct vnode *)kn->kn_hook;
4835	int res;
4836
4837	VI_LOCK(vp);
4838	if (kn->kn_sfflags & hint)
4839		kn->kn_fflags |= hint;
4840	if (hint == NOTE_REVOKE) {
4841		kn->kn_flags |= EV_EOF;
4842		VI_UNLOCK(vp);
4843		return (1);
4844	}
4845	res = (kn->kn_fflags != 0);
4846	VI_UNLOCK(vp);
4847	return (res);
4848}
4849
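/*
 * Helper for VOP_READDIR implementations: copy a single dirent out to
 * the caller and, if cookies were requested, append the directory
 * offset for the entry.
 */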
4850int
4851vfs_read_dirent(struct vop_readdir_args *ap, struct dirent *dp, off_t off)
4852{
4853	int error;
4854
4855	if (dp->d_reclen > ap->a_uio->uio_resid)
4856		return (ENAMETOOLONG);
4857	error = uiomove(dp, dp->d_reclen, ap->a_uio);
4858	if (error) {
4859		if (ap->a_ncookies != NULL) {
4860			if (ap->a_cookies != NULL)
4861				free(ap->a_cookies, M_TEMP);
4862			ap->a_cookies = NULL;
4863			*ap->a_ncookies = 0;
4864		}
4865		return (error);
4866	}
4867	if (ap->a_ncookies == NULL)
4868		return (0);
4869
4870	KASSERT(ap->a_cookies,
4871	    ("NULL ap->a_cookies value with non-NULL ap->a_ncookies!"));
4872
4873	*ap->a_cookies = realloc(*ap->a_cookies,
4874	    (*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
4875	(*ap->a_cookies)[*ap->a_ncookies] = off;
4876	return (0);
4877}
4878
4879/*
4880 * Mark for update the access time of the file if the filesystem
4881 * supports VOP_MARKATIME.  This functionality is used by execve and
4882 * mmap, so we want to avoid the I/O implied by directly setting
4883 * va_atime for the sake of efficiency.
4884 */
4885void
4886vfs_mark_atime(struct vnode *vp, struct ucred *cred)
4887{
4888	struct mount *mp;
4889
4890	mp = vp->v_mount;
4891	ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
4892	if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
4893		(void)VOP_MARKATIME(vp);
4894}
4895
4896/*
4897 * The purpose of this routine is to remove granularity from accmode_t,
4898 * reducing it into standard unix access bits - VEXEC, VREAD, VWRITE,
4899 * VADMIN and VAPPEND.
4900 *
4901 * If it returns 0, the caller is supposed to continue with the usual
4902 * access checks using 'accmode' as modified by this routine.  If it
4903 * returns a nonzero value, the caller is supposed to return that value
4904 * as errno.
4905 *
4906 * Note that after this routine runs, accmode may be zero.
4907 */
4908int
4909vfs_unixify_accmode(accmode_t *accmode)
4910{
4911	/*
4912	 * There is no way to specify explicit "deny" rule using
4913	 * file mode or POSIX.1e ACLs.
4914	 */
4915	if (*accmode & VEXPLICIT_DENY) {
4916		*accmode = 0;
4917		return (0);
4918	}
4919
4920	/*
4921	 * None of these can be translated into usual access bits.
4922	 * Also, the common case for NFSv4 ACLs is to not contain
4923	 * either of these bits. Caller should check for VWRITE
4924	 * on the containing directory instead.
4925	 */
4926	if (*accmode & (VDELETE_CHILD | VDELETE))
4927		return (EPERM);
4928
4929	if (*accmode & VADMIN_PERMS) {
4930		*accmode &= ~VADMIN_PERMS;
4931		*accmode |= VADMIN;
4932	}
4933
4934	/*
4935	 * There is no way to deny VREAD_ATTRIBUTES, VREAD_ACL
4936	 * or VSYNCHRONIZE using file mode or POSIX.1e ACL.
4937	 */
4938	*accmode &= ~(VSTAT_PERMS | VSYNCHRONIZE);
4939
4940	return (0);
4941}
4942
4943/*
4944 * These are helper functions for filesystems to traverse all
4945 * their vnodes.  See MNT_VNODE_FOREACH_ALL() in sys/mount.h.
4946 *
4947 * This interface replaces MNT_VNODE_FOREACH.
4948 */
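/*
 * Typical iteration (a sketch, modeled on the MNT_VNODE_FOREACH_ACTIVE
 * loop in vfs_msync() above); each vnode is returned with its
 * interlock held, so the loop body must drop it or pass it to vget()
 * with LK_INTERLOCK, and an early break must free the marker with
 * MNT_VNODE_FOREACH_ALL_ABORT():
 *
 *	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
 *		if (vp->v_type != VREG) {
 *			VI_UNLOCK(vp);
 *			continue;
 *		}
 *		... work on vp ...
 *	}
 */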
4949
4950MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
4951
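/*
 * Return the next vnode on the mount's vnode list, skipping markers
 * and doomed vnodes; the marker is moved past the vnode returned and
 * the vnode interlock is held on return.
 */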
4952struct vnode *
4953__mnt_vnode_next_all(struct vnode **mvp, struct mount *mp)
4954{
4955	struct vnode *vp;
4956
4957	if (should_yield())
4958		kern_yield(PRI_USER);
4959	MNT_ILOCK(mp);
4960	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
4961	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
4962	while (vp != NULL && (vp->v_type == VMARKER ||
4963	    (vp->v_iflag & VI_DOOMED) != 0))
4964		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4965
4966	/* Check if we are done */
4967	if (vp == NULL) {
4968		__mnt_vnode_markerfree_all(mvp, mp);
4969		/* MNT_IUNLOCK(mp); -- done in above function */
4970		mtx_assert(MNT_MTX(mp), MA_NOTOWNED);
4971		return (NULL);
4972	}
4973	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
4974	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
4975	VI_LOCK(vp);
4976	MNT_IUNLOCK(mp);
4977	return (vp);
4978}
4979
4980struct vnode *
4981__mnt_vnode_first_all(struct vnode **mvp, struct mount *mp)
4982{
4983	struct vnode *vp;
4984
4985	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
4986	MNT_ILOCK(mp);
4987	MNT_REF(mp);
4988	(*mvp)->v_type = VMARKER;
4989
4990	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
4991	while (vp != NULL && (vp->v_type == VMARKER ||
4992	    (vp->v_iflag & VI_DOOMED) != 0))
4993		vp = TAILQ_NEXT(vp, v_nmntvnodes);
4994
4995	/* Check if we are done */
4996	if (vp == NULL) {
4997		MNT_REL(mp);
4998		MNT_IUNLOCK(mp);
4999		free(*mvp, M_VNODE_MARKER);
5000		*mvp = NULL;
5001		return (NULL);
5002	}
5003	(*mvp)->v_mount = mp;
5004	TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
5005	VI_LOCK(vp);
5006	MNT_IUNLOCK(mp);
5007	return (vp);
5008}
5009
5010
5011void
5012__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
5013{
5014
5015	if (*mvp == NULL) {
5016		MNT_IUNLOCK(mp);
5017		return;
5018	}
5019
5020	mtx_assert(MNT_MTX(mp), MA_OWNED);
5021
5022	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
5023	TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
5024	MNT_REL(mp);
5025	MNT_IUNLOCK(mp);
5026	free(*mvp, M_VNODE_MARKER);
5027	*mvp = NULL;
5028}
5029
5030/*
5031 * These are helper functions for filesystems to traverse their
5032 * active vnodes.  See MNT_VNODE_FOREACH_ACTIVE() in sys/mount.h
5033 */
5034static void
5035mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
5036{
5037
5038	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
5039
5040	MNT_ILOCK(mp);
5041	MNT_REL(mp);
5042	MNT_IUNLOCK(mp);
5043	free(*mvp, M_VNODE_MARKER);
5044	*mvp = NULL;
5045}
5046
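/*
 * Advance the marker along the active vnode list and return the next
 * vnode whose interlock could be acquired; when the trylock keeps
 * failing the iterator pauses briefly rather than spinning against the
 * lock owner.
 */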
5047static struct vnode *
5048mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
5049{
5050	struct vnode *vp, *nvp;
5051
5052	mtx_assert(&vnode_free_list_mtx, MA_OWNED);
5053	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
5054restart:
5055	vp = TAILQ_NEXT(*mvp, v_actfreelist);
5056	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
5057	while (vp != NULL) {
5058		if (vp->v_type == VMARKER) {
5059			vp = TAILQ_NEXT(vp, v_actfreelist);
5060			continue;
5061		}
5062		if (!VI_TRYLOCK(vp)) {
5063			if (mp_ncpus == 1 || should_yield()) {
5064				TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
5065				mtx_unlock(&vnode_free_list_mtx);
5066				pause("vnacti", 1);
5067				mtx_lock(&vnode_free_list_mtx);
5068				goto restart;
5069			}
5070			continue;
5071		}
5072		KASSERT(vp->v_type != VMARKER, ("locked marker %p", vp));
5073		KASSERT(vp->v_mount == mp || vp->v_mount == NULL,
5074		    ("alien vnode on the active list %p %p", vp, mp));
5075		if (vp->v_mount == mp && (vp->v_iflag & VI_DOOMED) == 0)
5076			break;
5077		nvp = TAILQ_NEXT(vp, v_actfreelist);
5078		VI_UNLOCK(vp);
5079		vp = nvp;
5080	}
5081
5082	/* Check if we are done */
5083	if (vp == NULL) {
5084		mtx_unlock(&vnode_free_list_mtx);
5085		mnt_vnode_markerfree_active(mvp, mp);
5086		return (NULL);
5087	}
5088	TAILQ_INSERT_AFTER(&mp->mnt_activevnodelist, vp, *mvp, v_actfreelist);
5089	mtx_unlock(&vnode_free_list_mtx);
5090	ASSERT_VI_LOCKED(vp, "active iter");
5091	KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
5092	return (vp);
5093}
5094
5095struct vnode *
5096__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
5097{
5098
5099	if (should_yield())
5100		kern_yield(PRI_USER);
5101	mtx_lock(&vnode_free_list_mtx);
5102	return (mnt_vnode_next_active(mvp, mp));
5103}
5104
5105struct vnode *
5106__mnt_vnode_first_active(struct vnode **mvp, struct mount *mp)
5107{
5108	struct vnode *vp;
5109
5110	*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
5111	MNT_ILOCK(mp);
5112	MNT_REF(mp);
5113	MNT_IUNLOCK(mp);
5114	(*mvp)->v_type = VMARKER;
5115	(*mvp)->v_mount = mp;
5116
5117	mtx_lock(&vnode_free_list_mtx);
5118	vp = TAILQ_FIRST(&mp->mnt_activevnodelist);
5119	if (vp == NULL) {
5120		mtx_unlock(&vnode_free_list_mtx);
5121		mnt_vnode_markerfree_active(mvp, mp);
5122		return (NULL);
5123	}
5124	TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
5125	return (mnt_vnode_next_active(mvp, mp));
5126}
5127
5128void
5129__mnt_vnode_markerfree_active(struct vnode **mvp, struct mount *mp)
5130{
5131
5132	if (*mvp == NULL)
5133		return;
5134
5135	mtx_lock(&vnode_free_list_mtx);
5136	TAILQ_REMOVE(&mp->mnt_activevnodelist, *mvp, v_actfreelist);
5137	mtx_unlock(&vnode_free_list_mtx);
5138	mnt_vnode_markerfree_active(mvp, mp);
5139}
5140