/*-
 * SPDX-License-Identifier: (BSD-2-Clause AND BSD-3-Clause)
 *
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 */

#include <sys/cdefs.h>
#include "opt_directio.h"
#include "opt_ffs.h"
#include "opt_ufs.h"

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/dir.h>
#ifdef UFS_DIRHASH
#include <ufs/ufs/dirhash.h>
#endif

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>

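/*
 * Evaluates to true when "ptr" is suitably aligned for an object of type
 * "s"; used below to verify that the in-memory extended attribute area may
 * be walked as a sequence of struct extattr records.
 */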
#define	ALIGNED_TO(ptr, s)	\
	(((uintptr_t)(ptr) & (_Alignof(s) - 1)) == 0)

#ifdef DIRECTIO
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fdatasync_t	ffs_fdatasync;
static vop_fsync_t	ffs_fsync;
static vop_getpages_t	ffs_getpages;
static vop_getpages_async_t	ffs_getpages_async;
static vop_lock1_t	ffs_lock;
#ifdef INVARIANTS
static vop_unlock_t	ffs_unlock_debug;
#endif
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;
static vop_vput_pair_t	ffs_vput_pair;

vop_fplookup_vexec_t ufs_fplookup_vexec;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	ffs_getpages_async,
	.vop_lock1 =		ffs_lock,
#ifdef INVARIANTS
	.vop_unlock =		ffs_unlock_debug,
#endif
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
	.vop_vput_pair =	ffs_vput_pair,
	.vop_fplookup_vexec =	ufs_fplookup_vexec,
	.vop_fplookup_symlink =	VOP_EAGAIN,
};
VFS_VOP_VECTOR_REGISTER(ffs_vnodeops1);

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_lock1 =		ffs_lock,
#ifdef INVARIANTS
	.vop_unlock =		ffs_unlock_debug,
#endif
	.vop_vptofh =		ffs_vptofh,
	.vop_fplookup_vexec =   VOP_EAGAIN,
	.vop_fplookup_symlink = VOP_EAGAIN,
};
VFS_VOP_VECTOR_REGISTER(ffs_fifoops1);

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_getpages =		ffs_getpages,
	.vop_getpages_async =	ffs_getpages_async,
	.vop_lock1 =		ffs_lock,
#ifdef INVARIANTS
	.vop_unlock =		ffs_unlock_debug,
#endif
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
	.vop_vput_pair =	ffs_vput_pair,
	.vop_fplookup_vexec =	ufs_fplookup_vexec,
	.vop_fplookup_symlink =	VOP_EAGAIN,
};
VFS_VOP_VECTOR_REGISTER(ffs_vnodeops2);

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_fdatasync =	ffs_fdatasync,
	.vop_lock1 =		ffs_lock,
#ifdef INVARIANTS
	.vop_unlock =		ffs_unlock_debug,
#endif
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
	.vop_fplookup_vexec =   VOP_EAGAIN,
	.vop_fplookup_symlink = VOP_EAGAIN,
};
VFS_VOP_VECTOR_REGISTER(ffs_fifoops2);
220
221/*
222 * Synch an open file.
223 */
224/* ARGSUSED */
225static int
226ffs_fsync(struct vop_fsync_args *ap)
227{
228	struct vnode *vp;
229	struct bufobj *bo;
230	int error;
231
232	vp = ap->a_vp;
233	bo = &vp->v_bufobj;
234retry:
235	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
236	if (error)
237		return (error);
238	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
239		error = softdep_fsync(vp);
240		if (error)
241			return (error);
242
243		/*
244		 * The softdep_fsync() function may drop vp lock,
245		 * allowing for dirty buffers to reappear on the
246		 * bo_dirty list. Recheck and resync as needed.
247		 */
248		BO_LOCK(bo);
249		if ((vp->v_type == VREG || vp->v_type == VDIR) &&
250		    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
251			BO_UNLOCK(bo);
252			goto retry;
253		}
254		BO_UNLOCK(bo);
255	}
256	if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), 0))
257		return (ENXIO);
258	return (0);
259}
260
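/*
 * Flush all dirty buffers associated with a vnode, making repeated passes
 * when soft dependencies or block-device I/O keep redirtying buffers.
 * MNT_WAIT requests a fully synchronous flush; DATA_ONLY and NO_INO_UPDT
 * in "flags" limit how much metadata is pushed.  ERELOOKUP is returned if
 * the vnode lock had to be dropped and reacquired along the way.
 */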
261int
262ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
263{
264	struct inode *ip;
265	struct bufobj *bo;
266	struct ufsmount *ump;
267	struct buf *bp, *nbp;
268	ufs_lbn_t lbn;
269	int error, passes, wflag;
270	bool still_dirty, unlocked, wait;
271
272	ip = VTOI(vp);
273	bo = &vp->v_bufobj;
274	ump = VFSTOUFS(vp->v_mount);
275#ifdef WITNESS
276	wflag = IS_SNAPSHOT(ip) ? LK_NOWITNESS : 0;
277#else
278	wflag = 0;
279#endif
280
281	/*
282	 * When doing MNT_WAIT we must first flush all dependencies
283	 * on the inode.
284	 */
285	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
286	    (error = softdep_sync_metadata(vp)) != 0) {
287		if (ffs_fsfail_cleanup(ump, error))
288			error = 0;
289		return (error);
290	}
291
292	/*
293	 * Flush all dirty buffers associated with a vnode.
294	 */
295	error = 0;
296	passes = 0;
297	wait = false;	/* Always do an async pass first. */
298	unlocked = false;
299	lbn = lblkno(ITOFS(ip), (ip->i_size + ITOFS(ip)->fs_bsize - 1));
300	BO_LOCK(bo);
301loop:
302	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
303		bp->b_vflags &= ~BV_SCANNED;
304	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
305		/*
306		 * Reasons to skip this buffer: it has already been considered
307		 * on this pass, the buffer has dependencies that will cause
308		 * it to be redirtied and it has not already been deferred,
309		 * or it is already being written.
310		 */
311		if ((bp->b_vflags & BV_SCANNED) != 0)
312			continue;
313		bp->b_vflags |= BV_SCANNED;
314		/*
315		 * Flush indirects in order, if requested.
316		 *
317		 * Note that if only datasync is requested, we can
318		 * skip indirect blocks when softupdates are not
319		 * active.  Otherwise we must flush them with data,
320		 * since dependencies prevent data block writes.
321		 */
322		if (waitfor == MNT_WAIT && bp->b_lblkno <= -UFS_NDADDR &&
323		    (lbn_level(bp->b_lblkno) >= passes ||
324		    ((flags & DATA_ONLY) != 0 && !DOINGSOFTDEP(vp))))
325			continue;
326		if (bp->b_lblkno > lbn)
327			panic("ffs_syncvnode: syncing truncated data.");
328		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
329			BO_UNLOCK(bo);
330		} else if (wait) {
331			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
332			    LK_INTERLOCK | wflag, BO_LOCKPTR(bo)) != 0) {
333				BO_LOCK(bo);
334				bp->b_vflags &= ~BV_SCANNED;
335				goto next_locked;
336			}
337		} else
338			continue;
339		if ((bp->b_flags & B_DELWRI) == 0)
340			panic("ffs_fsync: not dirty");
341		/*
342		 * Check for dependencies and potentially complete them.
343		 */
344		if (!LIST_EMPTY(&bp->b_dep) &&
345		    (error = softdep_sync_buf(vp, bp,
346		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
347			/*
348			 * Lock order conflict, buffer was already unlocked,
349			 * and vnode possibly unlocked.
350			 */
351			if (error == ERELOOKUP) {
352				if (vp->v_data == NULL)
353					return (EBADF);
354				unlocked = true;
355				if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
356				    (error = softdep_sync_metadata(vp)) != 0) {
357					if (ffs_fsfail_cleanup(ump, error))
358						error = 0;
359					return (unlocked && error == 0 ?
360					    ERELOOKUP : error);
361				}
362				/* Re-evaluate inode size */
363				lbn = lblkno(ITOFS(ip), (ip->i_size +
364				    ITOFS(ip)->fs_bsize - 1));
365				goto next;
366			}
367			/* I/O error. */
368			if (error != EBUSY) {
369				BUF_UNLOCK(bp);
370				return (error);
371			}
372			/* If we deferred once, don't defer again. */
373		    	if ((bp->b_flags & B_DEFERRED) == 0) {
374				bp->b_flags |= B_DEFERRED;
375				BUF_UNLOCK(bp);
376				goto next;
377			}
378		}
379		if (wait) {
380			bremfree(bp);
381			error = bwrite(bp);
382			if (ffs_fsfail_cleanup(ump, error))
383				error = 0;
384			if (error != 0)
385				return (error);
386		} else if ((bp->b_flags & B_CLUSTEROK)) {
387			(void) vfs_bio_awrite(bp);
388		} else {
389			bremfree(bp);
390			(void) bawrite(bp);
391		}
392next:
393		/*
394		 * Since we may have slept during the I/O, we need
395		 * to start from a known point.
396		 */
397		BO_LOCK(bo);
398next_locked:
399		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
400	}
401	if (waitfor != MNT_WAIT) {
402		BO_UNLOCK(bo);
403		if ((flags & NO_INO_UPDT) != 0)
404			return (unlocked ? ERELOOKUP : 0);
405		error = ffs_update(vp, 0);
406		if (error == 0 && unlocked)
407			error = ERELOOKUP;
408		return (error);
409	}
410	/* Drain IO to see if we're done. */
411	bufobj_wwait(bo, 0, 0);
412	/*
413	 * Block devices associated with filesystems may have new I/O
414	 * requests posted for them even if the vnode is locked, so no
415	 * amount of trying will get them clean.  We make several passes
416	 * as a best effort.
417	 *
418	 * Regular files may need multiple passes to flush all dependency
419	 * work as it is possible that we must write once per indirect
420	 * level, once for the leaf, and once for the inode and each of
421	 * these will be done with one sync and one async pass.
422	 */
423	if (bo->bo_dirty.bv_cnt > 0) {
424		if ((flags & DATA_ONLY) == 0) {
425			still_dirty = true;
426		} else {
427			/*
428			 * For data-only sync, dirty indirect buffers
429			 * are ignored.
430			 */
431			still_dirty = false;
432			TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
433				if (bp->b_lblkno > -UFS_NDADDR) {
434					still_dirty = true;
435					break;
436				}
437			}
438		}
439
440		if (still_dirty) {
441			/* Write the inode after sync passes to flush deps. */
442			if (wait && DOINGSOFTDEP(vp) &&
443			    (flags & NO_INO_UPDT) == 0) {
444				BO_UNLOCK(bo);
445				ffs_update(vp, 1);
446				BO_LOCK(bo);
447			}
448			/* switch between sync/async. */
449			wait = !wait;
450			if (wait || ++passes < UFS_NIADDR + 2)
451				goto loop;
452		}
453	}
454	BO_UNLOCK(bo);
455	error = 0;
456	if ((flags & DATA_ONLY) == 0) {
457		if ((flags & NO_INO_UPDT) == 0)
458			error = ffs_update(vp, 1);
459		if (DOINGSUJ(vp))
460			softdep_journal_fsync(VTOI(vp));
461	} else if ((ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)) != 0) {
462		error = ffs_update(vp, 1);
463	}
464	if (error == 0 && unlocked)
465		error = ERELOOKUP;
466	if (error == 0)
467		ip->i_flag &= ~IN_NEEDSYNC;
468	return (error);
469}
470
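/*
 * fdatasync(2) support: only file data and the metadata needed to read it
 * back must reach stable storage, so request a DATA_ONLY pass of
 * ffs_syncvnode().
 */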
471static int
472ffs_fdatasync(struct vop_fdatasync_args *ap)
473{
474
475	return (ffs_syncvnode(ap->a_vp, MNT_WAIT, DATA_ONLY));
476}
477
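/*
 * Acquire a vnode lock, retrying when the vnode switched its private lock
 * (for example between a snapshot lock and a regular vnode lock) while the
 * caller slept waiting for the old one.
 */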
478static int
479ffs_lock(
480	struct vop_lock1_args /* {
481		struct vnode *a_vp;
482		int a_flags;
483		char *file;
484		int line;
485	} */ *ap)
486{
487#if !defined(NO_FFS_SNAPSHOT) || defined(DIAGNOSTIC)
488	struct vnode *vp = ap->a_vp;
489#endif	/* !NO_FFS_SNAPSHOT || DIAGNOSTIC */
490#ifdef DIAGNOSTIC
491	struct inode *ip;
492#endif	/* DIAGNOSTIC */
493	int result;
494#ifndef NO_FFS_SNAPSHOT
495	int flags;
496	struct lock *lkp;
497
	/*
	 * Adaptive spinning mixed with SU leads to trouble.  Use a giant
	 * hammer, and only when LK_NODDLKTREAT is set; currently this means
	 * it is only used during path lookup.
	 */
503	if ((ap->a_flags & LK_NODDLKTREAT) != 0)
504		ap->a_flags |= LK_ADAPTIVE;
505	switch (ap->a_flags & LK_TYPE_MASK) {
506	case LK_SHARED:
507	case LK_UPGRADE:
508	case LK_EXCLUSIVE:
509		flags = ap->a_flags;
510		for (;;) {
511#ifdef DEBUG_VFS_LOCKS
512			VNPASS(vp->v_holdcnt != 0, vp);
513#endif	/* DEBUG_VFS_LOCKS */
514			lkp = vp->v_vnlock;
515			result = lockmgr_lock_flags(lkp, flags,
516			    &VI_MTX(vp)->lock_object, ap->a_file, ap->a_line);
517			if (lkp == vp->v_vnlock || result != 0)
518				break;
519			/*
520			 * Apparent success, except that the vnode
521			 * mutated between snapshot file vnode and
522			 * regular file vnode while this process
523			 * slept.  The lock currently held is not the
524			 * right lock.  Release it, and try to get the
525			 * new lock.
526			 */
527			lockmgr_unlock(lkp);
528			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
529			    (LK_INTERLOCK | LK_NOWAIT))
530				return (EBUSY);
531			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
532				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
533			flags &= ~LK_INTERLOCK;
534		}
535#ifdef DIAGNOSTIC
536		switch (ap->a_flags & LK_TYPE_MASK) {
537		case LK_UPGRADE:
538		case LK_EXCLUSIVE:
539			if (result == 0 && vp->v_vnlock->lk_recurse == 0) {
540				ip = VTOI(vp);
541				if (ip != NULL)
542					ip->i_lock_gen++;
543			}
544		}
545#endif	/* DIAGNOSTIC */
546		break;
547	default:
548#ifdef DIAGNOSTIC
549		if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) {
550			ip = VTOI(vp);
551			if (ip != NULL)
552				ufs_unlock_tracker(ip);
553		}
554#endif	/* DIAGNOSTIC */
555		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
556		break;
557	}
558#else	/* NO_FFS_SNAPSHOT */
559	/*
560	 * See above for an explanation.
561	 */
562	if ((ap->a_flags & LK_NODDLKTREAT) != 0)
563		ap->a_flags |= LK_ADAPTIVE;
564#ifdef DIAGNOSTIC
565	if ((ap->a_flags & LK_TYPE_MASK) == LK_DOWNGRADE) {
566		ip = VTOI(vp);
567		if (ip != NULL)
568			ufs_unlock_tracker(ip);
569	}
570#endif	/* DIAGNOSTIC */
	result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
572#endif	/* NO_FFS_SNAPSHOT */
573#ifdef DIAGNOSTIC
574	switch (ap->a_flags & LK_TYPE_MASK) {
575	case LK_UPGRADE:
576	case LK_EXCLUSIVE:
577		if (result == 0 && vp->v_vnlock->lk_recurse == 0) {
578			ip = VTOI(vp);
579			if (ip != NULL)
580				ip->i_lock_gen++;
581		}
582	}
583#endif	/* DIAGNOSTIC */
584	return (result);
585}
586
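/*
 * INVARIANTS-only unlock path: assert that an inode with lazily posted
 * flags is on the per-mount lazy vnode list and that a directory is not
 * fully unlocked while an IN_ENDOFF truncation is still pending.
 */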
587#ifdef INVARIANTS
588static int
589ffs_unlock_debug(struct vop_unlock_args *ap)
590{
591	struct vnode *vp;
592	struct inode *ip;
593
594	vp = ap->a_vp;
595	ip = VTOI(vp);
596	if (ip->i_flag & UFS_INODE_FLAG_LAZY_MASK_ASSERTABLE) {
597		if ((vp->v_mflag & VMP_LAZYLIST) == 0) {
598			VI_LOCK(vp);
599			VNASSERT((vp->v_mflag & VMP_LAZYLIST), vp,
600			    ("%s: modified vnode (%x) not on lazy list",
601			    __func__, ip->i_flag));
602			VI_UNLOCK(vp);
603		}
604	}
605	KASSERT(vp->v_type != VDIR || vp->v_vnlock->lk_recurse != 0 ||
606	    (ip->i_flag & IN_ENDOFF) == 0,
607	    ("ufs dir vp %p ip %p flags %#x", vp, ip, ip->i_flag));
608#ifdef DIAGNOSTIC
609	if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE && ip != NULL &&
610	    vp->v_vnlock->lk_recurse == 0)
611		ufs_unlock_tracker(ip);
612#endif
613	return (VOP_UNLOCK_APV(&ufs_vnodeops, ap));
614}
615#endif
616
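/*
 * Satisfy a read from a hole: when bread_gb() returns EJUSTRETURN because
 * GB_NOSPARSE found no allocated block, copy zeroes from the preallocated
 * zero_region rather than instantiating a buffer.
 */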
617static int
618ffs_read_hole(struct uio *uio, long xfersize, long *size)
619{
620	ssize_t saved_resid, tlen;
621	int error;
622
623	while (xfersize > 0) {
624		tlen = min(xfersize, ZERO_REGION_SIZE);
625		saved_resid = uio->uio_resid;
626		error = vn_io_fault_uiomove(__DECONST(void *, zero_region),
627		    tlen, uio);
628		if (error != 0)
629			return (error);
630		tlen = saved_resid - uio->uio_resid;
631		xfersize -= tlen;
632		*size -= tlen;
633	}
634	return (0);
635}
636
637/*
638 * Vnode op for reading.
639 */
640static int
641ffs_read(
642	struct vop_read_args /* {
643		struct vnode *a_vp;
644		struct uio *a_uio;
645		int a_ioflag;
646		struct ucred *a_cred;
647	} */ *ap)
648{
649	struct vnode *vp;
650	struct inode *ip;
651	struct uio *uio;
652	struct fs *fs;
653	struct buf *bp;
654	ufs_lbn_t lbn, nextlbn;
655	off_t bytesinfile;
656	long size, xfersize, blkoffset;
657	ssize_t orig_resid;
658	int bflag, error, ioflag, seqcount;
659
660	vp = ap->a_vp;
661	uio = ap->a_uio;
662	ioflag = ap->a_ioflag;
663	if (ap->a_ioflag & IO_EXT)
664#ifdef notyet
665		return (ffs_extread(vp, uio, ioflag));
666#else
667		panic("ffs_read+IO_EXT");
668#endif
669#ifdef DIRECTIO
670	if ((ioflag & IO_DIRECT) != 0) {
671		int workdone;
672
673		error = ffs_rawread(vp, uio, &workdone);
674		if (error != 0 || workdone != 0)
675			return error;
676	}
677#endif
678
679	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
680	ip = VTOI(vp);
681
682#ifdef INVARIANTS
683	if (uio->uio_rw != UIO_READ)
684		panic("ffs_read: mode");
685
686	if (vp->v_type == VLNK) {
687		if ((int)ip->i_size < VFSTOUFS(vp->v_mount)->um_maxsymlinklen)
688			panic("ffs_read: short symlink");
689	} else if (vp->v_type != VREG && vp->v_type != VDIR)
690		panic("ffs_read: type %d",  vp->v_type);
691#endif
692	orig_resid = uio->uio_resid;
693	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
694	if (orig_resid == 0)
695		return (0);
696	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
697	fs = ITOFS(ip);
698	if (uio->uio_offset < ip->i_size &&
699	    uio->uio_offset >= fs->fs_maxfilesize)
700		return (EOVERFLOW);
701
702	bflag = GB_UNMAPPED | (uio->uio_segflg == UIO_NOCOPY ? 0 : GB_NOSPARSE);
703#ifdef WITNESS
704	bflag |= IS_SNAPSHOT(ip) ? GB_NOWITNESS : 0;
705#endif
706	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
707		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
708			break;
709		lbn = lblkno(fs, uio->uio_offset);
710		nextlbn = lbn + 1;
711
		/*
		 * Size of the buffer.  The buffer representing the end of the
		 * file is rounded up to the size of the block type (fragment
		 * or full block, as appropriate).
		 */
718		size = blksize(fs, ip, lbn);
719		blkoffset = blkoff(fs, uio->uio_offset);
720
721		/*
722		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of data before
724		 * our startpoint (duh!)
725		 */
726		xfersize = fs->fs_bsize - blkoffset;
727
728		/*
729		 * But if we actually want less than the block,
730		 * or the file doesn't have a whole block more of data,
731		 * then use the lesser number.
732		 */
733		if (uio->uio_resid < xfersize)
734			xfersize = uio->uio_resid;
735		if (bytesinfile < xfersize)
736			xfersize = bytesinfile;
737
738		if (lblktosize(fs, nextlbn) >= ip->i_size) {
739			/*
740			 * Don't do readahead if this is the end of the file.
741			 */
742			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
743		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
744			/*
745			 * Otherwise if we are allowed to cluster,
746			 * grab as much as we can.
747			 *
748			 * XXX  This may not be a win if we are not
749			 * doing sequential access.
750			 */
751			error = cluster_read(vp, ip->i_size, lbn,
752			    size, NOCRED, blkoffset + uio->uio_resid,
753			    seqcount, bflag, &bp);
754		} else if (seqcount > 1) {
755			/*
756			 * If we are NOT allowed to cluster, then
757			 * if we appear to be acting sequentially,
758			 * fire off a request for a readahead
759			 * as well as a read. Note that the 4th and 5th
760			 * arguments point to arrays of the size specified in
761			 * the 6th argument.
762			 */
763			int nextsize = blksize(fs, ip, nextlbn);
764			error = breadn_flags(vp, lbn, lbn, size, &nextlbn,
765			    &nextsize, 1, NOCRED, bflag, NULL, &bp);
766		} else {
767			/*
768			 * Failing all of the above, just read what the
769			 * user asked for. Interestingly, the same as
770			 * the first option above.
771			 */
772			error = bread_gb(vp, lbn, size, NOCRED, bflag, &bp);
773		}
774		if (error == EJUSTRETURN) {
775			error = ffs_read_hole(uio, xfersize, &size);
776			if (error == 0)
777				continue;
778		}
779		if (error != 0) {
780			brelse(bp);
781			bp = NULL;
782			break;
783		}
784
785		/*
786		 * We should only get non-zero b_resid when an I/O error
787		 * has occurred, which should cause us to break above.
788		 * However, if the short read did not cause an error,
789		 * then we want to ensure that we do not uiomove bad
790		 * or uninitialized data.
791		 */
792		size -= bp->b_resid;
793		if (size < xfersize) {
794			if (size == 0)
795				break;
796			xfersize = size;
797		}
798
799		if (buf_mapped(bp)) {
800			error = vn_io_fault_uiomove((char *)bp->b_data +
801			    blkoffset, (int)xfersize, uio);
802		} else {
803			error = vn_io_fault_pgmove(bp->b_pages,
804			    blkoffset + (bp->b_offset & PAGE_MASK),
805			    (int)xfersize, uio);
806		}
807		if (error)
808			break;
809
810		vfs_bio_brelse(bp, ioflag);
811	}
812
	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal completion
	 * has not set a new value into it; so it must have come from a
	 * 'break' statement.
	 */
819	if (bp != NULL)
820		vfs_bio_brelse(bp, ioflag);
821
822	if ((error == 0 || uio->uio_resid != orig_resid) &&
823	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
824		UFS_INODE_SET_FLAG_SHARED(ip, IN_ACCESS);
825	return (error);
826}
827
828/*
829 * Vnode op for writing.
830 */
831static int
832ffs_write(
833	struct vop_write_args /* {
834		struct vnode *a_vp;
835		struct uio *a_uio;
836		int a_ioflag;
837		struct ucred *a_cred;
838	} */ *ap)
839{
840	struct vnode *vp;
841	struct uio *uio;
842	struct inode *ip;
843	struct fs *fs;
844	struct buf *bp;
845	ufs_lbn_t lbn;
846	off_t osize;
847	ssize_t resid, r;
848	int seqcount;
849	int blkoffset, error, flags, ioflag, size, xfersize;
850
851	vp = ap->a_vp;
852	if (DOINGSUJ(vp))
853		softdep_prealloc(vp, MNT_WAIT);
854	if (vp->v_data == NULL)
855		return (EBADF);
856
857	uio = ap->a_uio;
858	ioflag = ap->a_ioflag;
859	if (ap->a_ioflag & IO_EXT)
860#ifdef notyet
861		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
862#else
863		panic("ffs_write+IO_EXT");
864#endif
865
866	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
867	ip = VTOI(vp);
868
869#ifdef INVARIANTS
870	if (uio->uio_rw != UIO_WRITE)
871		panic("ffs_write: mode");
872#endif
873
874	switch (vp->v_type) {
875	case VREG:
876		if (ioflag & IO_APPEND)
877			uio->uio_offset = ip->i_size;
878		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
879			return (EPERM);
880		/* FALLTHROUGH */
881	case VLNK:
882		break;
883	case VDIR:
884		panic("ffs_write: dir write");
885		break;
886	default:
887		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
888			(int)uio->uio_offset,
889			(int)uio->uio_resid
890		);
891	}
892
893	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
894	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
895	fs = ITOFS(ip);
896
897	/*
898	 * Maybe this should be above the vnode op call, but so long as
899	 * file servers have no limits, I don't think it matters.
900	 */
901	error = vn_rlimit_fsizex(vp, uio, fs->fs_maxfilesize, &r,
902	    uio->uio_td);
903	if (error != 0) {
904		vn_rlimit_fsizex_res(uio, r);
905		return (error);
906	}
907
908	resid = uio->uio_resid;
909	osize = ip->i_size;
910	if (seqcount > BA_SEQMAX)
911		flags = BA_SEQMAX << BA_SEQSHIFT;
912	else
913		flags = seqcount << BA_SEQSHIFT;
914	if (ioflag & IO_SYNC)
915		flags |= IO_SYNC;
916	flags |= BA_UNMAPPED;
917
918	for (error = 0; uio->uio_resid > 0;) {
919		lbn = lblkno(fs, uio->uio_offset);
920		blkoffset = blkoff(fs, uio->uio_offset);
921		xfersize = fs->fs_bsize - blkoffset;
922		if (uio->uio_resid < xfersize)
923			xfersize = uio->uio_resid;
924		if (uio->uio_offset + xfersize > ip->i_size)
925			vnode_pager_setsize(vp, uio->uio_offset + xfersize);
926
927		/*
928		 * We must perform a read-before-write if the transfer size
929		 * does not cover the entire buffer.
930		 */
931		if (fs->fs_bsize > xfersize)
932			flags |= BA_CLRBUF;
933		else
934			flags &= ~BA_CLRBUF;
935/* XXX is uio->uio_offset the right thing here? */
936		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
937		    ap->a_cred, flags, &bp);
938		if (error != 0) {
939			vnode_pager_setsize(vp, ip->i_size);
940			break;
941		}
942		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
943			bp->b_flags |= B_NOCACHE;
944
945		if (uio->uio_offset + xfersize > ip->i_size) {
946			ip->i_size = uio->uio_offset + xfersize;
947			DIP_SET(ip, i_size, ip->i_size);
948			UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
949		}
950
951		size = blksize(fs, ip, lbn) - bp->b_resid;
952		if (size < xfersize)
953			xfersize = size;
954
955		if (buf_mapped(bp)) {
956			error = vn_io_fault_uiomove((char *)bp->b_data +
957			    blkoffset, (int)xfersize, uio);
958		} else {
959			error = vn_io_fault_pgmove(bp->b_pages,
960			    blkoffset + (bp->b_offset & PAGE_MASK),
961			    (int)xfersize, uio);
962		}
963		/*
964		 * If the buffer is not already filled and we encounter an
965		 * error while trying to fill it, we have to clear out any
966		 * garbage data from the pages instantiated for the buffer.
967		 * If we do not, a failed uiomove() during a write can leave
968		 * the prior contents of the pages exposed to a userland mmap.
969		 *
970		 * Note that we need only clear buffers with a transfer size
971		 * equal to the block size because buffers with a shorter
972		 * transfer size were cleared above by the call to UFS_BALLOC()
973		 * with the BA_CLRBUF flag set.
974		 *
975		 * If the source region for uiomove identically mmaps the
976		 * buffer, uiomove() performed the NOP copy, and the buffer
977		 * content remains valid because the page fault handler
978		 * validated the pages.
979		 */
980		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
981		    fs->fs_bsize == xfersize) {
982			if (error == EFAULT && LIST_EMPTY(&bp->b_dep)) {
983				bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
984				brelse(bp);
985				break;
986			} else {
987				vfs_bio_clrbuf(bp);
988			}
989		}
990
991		vfs_bio_set_flags(bp, ioflag);
992
993		/*
994		 * If IO_SYNC each buffer is written synchronously.  Otherwise
995		 * if we have a severe page deficiency write the buffer
996		 * asynchronously.  Otherwise try to cluster, and if that
997		 * doesn't do it then either do an async write (if O_DIRECT),
998		 * or a delayed write (if not).
999		 */
1000		if (ioflag & IO_SYNC) {
1001			(void)bwrite(bp);
1002		} else if (vm_page_count_severe() ||
1003			    buf_dirty_count_severe() ||
1004			    (ioflag & IO_ASYNC)) {
1005			bp->b_flags |= B_CLUSTEROK;
1006			bawrite(bp);
1007		} else if (xfersize + blkoffset == fs->fs_bsize) {
1008			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
1009				bp->b_flags |= B_CLUSTEROK;
1010				cluster_write(vp, &ip->i_clusterw, bp,
1011				    ip->i_size, seqcount, GB_UNMAPPED);
1012			} else {
1013				bawrite(bp);
1014			}
1015		} else if (ioflag & IO_DIRECT) {
1016			bp->b_flags |= B_CLUSTEROK;
1017			bawrite(bp);
1018		} else {
1019			bp->b_flags |= B_CLUSTEROK;
1020			bdwrite(bp);
1021		}
1022		if (error || xfersize == 0)
1023			break;
1024		UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
1025	}
1026	/*
	 * If we successfully wrote any data, and we are not the superuser,
1028	 * we clear the setuid and setgid bits as a precaution against
1029	 * tampering.
1030	 */
1031	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
1032	    ap->a_cred) {
1033		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID)) {
1034			vn_seqc_write_begin(vp);
1035			UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID));
1036			DIP_SET(ip, i_mode, ip->i_mode);
1037			vn_seqc_write_end(vp);
1038		}
1039	}
1040	if (error) {
1041		if (ioflag & IO_UNIT) {
1042			(void)ffs_truncate(vp, osize,
1043			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
1044			uio->uio_offset -= resid - uio->uio_resid;
1045			uio->uio_resid = resid;
1046		}
1047	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC)) {
1048		if (!(ioflag & IO_DATASYNC) ||
1049		    (ip->i_flags & (IN_SIZEMOD | IN_IBLKDATA)))
1050			error = ffs_update(vp, 1);
1051		if (ffs_fsfail_cleanup(VFSTOUFS(vp->v_mount), error))
1052			error = ENXIO;
1053	}
1054	vn_rlimit_fsizex_res(uio, r);
1055	return (error);
1056}
1057
1058/*
1059 * Extended attribute area reading.
1060 */
1061static int
1062ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
1063{
1064	struct inode *ip;
1065	struct ufs2_dinode *dp;
1066	struct fs *fs;
1067	struct buf *bp;
1068	ufs_lbn_t lbn, nextlbn;
1069	off_t bytesinfile;
1070	long size, xfersize, blkoffset;
1071	ssize_t orig_resid;
1072	int error;
1073
1074	ip = VTOI(vp);
1075	fs = ITOFS(ip);
1076	dp = ip->i_din2;
1077
1078#ifdef INVARIANTS
1079	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
1080		panic("ffs_extread: mode");
1081
1082#endif
1083	orig_resid = uio->uio_resid;
1084	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
1085	if (orig_resid == 0)
1086		return (0);
1087	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));
1088
1089	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
1090		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
1091			break;
1092		lbn = lblkno(fs, uio->uio_offset);
1093		nextlbn = lbn + 1;
1094
		/*
		 * Size of the buffer.  The buffer representing the end of the
		 * file is rounded up to the size of the block type (fragment
		 * or full block, as appropriate).
		 */
1101		size = sblksize(fs, dp->di_extsize, lbn);
1102		blkoffset = blkoff(fs, uio->uio_offset);
1103
1104		/*
1105		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of data before
1107		 * our startpoint (duh!)
1108		 */
1109		xfersize = fs->fs_bsize - blkoffset;
1110
1111		/*
1112		 * But if we actually want less than the block,
1113		 * or the file doesn't have a whole block more of data,
1114		 * then use the lesser number.
1115		 */
1116		if (uio->uio_resid < xfersize)
1117			xfersize = uio->uio_resid;
1118		if (bytesinfile < xfersize)
1119			xfersize = bytesinfile;
1120
1121		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
1122			/*
1123			 * Don't do readahead if this is the end of the info.
1124			 */
1125			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
1126		} else {
1127			/*
1128			 * If we have a second block, then
1129			 * fire off a request for a readahead
1130			 * as well as a read. Note that the 4th and 5th
1131			 * arguments point to arrays of the size specified in
1132			 * the 6th argument.
1133			 */
1134			int nextsize = sblksize(fs, dp->di_extsize, nextlbn);
1135			nextlbn = -1 - nextlbn;
1136			error = breadn(vp, -1 - lbn,
1137			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
1138		}
1139		if (error) {
1140			brelse(bp);
1141			bp = NULL;
1142			break;
1143		}
1144
1145		/*
1146		 * We should only get non-zero b_resid when an I/O error
1147		 * has occurred, which should cause us to break above.
1148		 * However, if the short read did not cause an error,
1149		 * then we want to ensure that we do not uiomove bad
1150		 * or uninitialized data.
1151		 */
1152		size -= bp->b_resid;
1153		if (size < xfersize) {
1154			if (size == 0)
1155				break;
1156			xfersize = size;
1157		}
1158
1159		error = uiomove((char *)bp->b_data + blkoffset,
1160					(int)xfersize, uio);
1161		if (error)
1162			break;
1163		vfs_bio_brelse(bp, ioflag);
1164	}
1165
	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal completion
	 * has not set a new value into it; so it must have come from a
	 * 'break' statement.
	 */
1172	if (bp != NULL)
1173		vfs_bio_brelse(bp, ioflag);
1174	return (error);
1175}
1176
1177/*
1178 * Extended attribute area writing.
1179 */
1180static int
1181ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
1182{
1183	struct inode *ip;
1184	struct ufs2_dinode *dp;
1185	struct fs *fs;
1186	struct buf *bp;
1187	ufs_lbn_t lbn;
1188	off_t osize;
1189	ssize_t resid;
1190	int blkoffset, error, flags, size, xfersize;
1191
1192	ip = VTOI(vp);
1193	fs = ITOFS(ip);
1194	dp = ip->i_din2;
1195
1196#ifdef INVARIANTS
1197	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
1198		panic("ffs_extwrite: mode");
1199#endif
1200
1201	if (ioflag & IO_APPEND)
1202		uio->uio_offset = dp->di_extsize;
1203	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
1204	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
1205	if ((uoff_t)uio->uio_offset + uio->uio_resid >
1206	    UFS_NXADDR * fs->fs_bsize)
1207		return (EFBIG);
1208
1209	resid = uio->uio_resid;
1210	osize = dp->di_extsize;
1211	flags = IO_EXT;
1212	if (ioflag & IO_SYNC)
1213		flags |= IO_SYNC;
1214
1215	for (error = 0; uio->uio_resid > 0;) {
1216		lbn = lblkno(fs, uio->uio_offset);
1217		blkoffset = blkoff(fs, uio->uio_offset);
1218		xfersize = fs->fs_bsize - blkoffset;
1219		if (uio->uio_resid < xfersize)
1220			xfersize = uio->uio_resid;
1221
1222		/*
1223		 * We must perform a read-before-write if the transfer size
1224		 * does not cover the entire buffer.
1225		 */
1226		if (fs->fs_bsize > xfersize)
1227			flags |= BA_CLRBUF;
1228		else
1229			flags &= ~BA_CLRBUF;
1230		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
1231		    ucred, flags, &bp);
1232		if (error != 0)
1233			break;
1234		/*
1235		 * If the buffer is not valid we have to clear out any
1236		 * garbage data from the pages instantiated for the buffer.
1237		 * If we do not, a failed uiomove() during a write can leave
1238		 * the prior contents of the pages exposed to a userland
1239		 * mmap().  XXX deal with uiomove() errors a better way.
1240		 */
1241		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
1242			vfs_bio_clrbuf(bp);
1243
1244		if (uio->uio_offset + xfersize > dp->di_extsize) {
1245			dp->di_extsize = uio->uio_offset + xfersize;
1246			UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
1247		}
1248
1249		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
1250		if (size < xfersize)
1251			xfersize = size;
1252
1253		error =
1254		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
1255
1256		vfs_bio_set_flags(bp, ioflag);
1257
1258		/*
1259		 * If IO_SYNC each buffer is written synchronously.  Otherwise
1260		 * if we have a severe page deficiency write the buffer
1261		 * asynchronously.  Otherwise try to cluster, and if that
1262		 * doesn't do it then either do an async write (if O_DIRECT),
1263		 * or a delayed write (if not).
1264		 */
1265		if (ioflag & IO_SYNC) {
1266			(void)bwrite(bp);
1267		} else if (vm_page_count_severe() ||
1268			    buf_dirty_count_severe() ||
1269			    xfersize + blkoffset == fs->fs_bsize ||
1270			    (ioflag & (IO_ASYNC | IO_DIRECT)))
1271			bawrite(bp);
1272		else
1273			bdwrite(bp);
1274		if (error || xfersize == 0)
1275			break;
1276		UFS_INODE_SET_FLAG(ip, IN_CHANGE);
1277	}
1278	/*
	 * If we successfully wrote any data, and we are not the superuser,
1280	 * we clear the setuid and setgid bits as a precaution against
1281	 * tampering.
1282	 */
1283	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
1284		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID)) {
1285			vn_seqc_write_begin(vp);
1286			UFS_INODE_SET_MODE(ip, ip->i_mode & ~(ISUID | ISGID));
1287			dp->di_mode = ip->i_mode;
1288			vn_seqc_write_end(vp);
1289		}
1290	}
1291	if (error) {
1292		if (ioflag & IO_UNIT) {
1293			(void)ffs_truncate(vp, osize,
1294			    IO_EXT | (ioflag&IO_SYNC), ucred);
1295			uio->uio_offset -= resid - uio->uio_resid;
1296			uio->uio_resid = resid;
1297		}
1298	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
1299		error = ffs_update(vp, 1);
1300	return (error);
1301}
1302
/*
 * Helper used by the extended attribute vnode operations.
 *
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
1309static int
1310ffs_findextattr(uint8_t *ptr, uint64_t length, int nspace, const char *name,
1311    struct extattr **eapp, uint8_t **eac)
1312{
1313	struct extattr *eap, *eaend;
1314	size_t nlen;
1315
1316	nlen = strlen(name);
1317	KASSERT(ALIGNED_TO(ptr, struct extattr), ("unaligned"));
1318	eap = (struct extattr *)ptr;
1319	eaend = (struct extattr *)(ptr + length);
1320	for (; eap < eaend; eap = EXTATTR_NEXT(eap)) {
1321		KASSERT(EXTATTR_NEXT(eap) <= eaend,
1322		    ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
1323		if (eap->ea_namespace != nspace || eap->ea_namelength != nlen
1324		    || memcmp(eap->ea_name, name, nlen) != 0)
1325			continue;
1326		if (eapp != NULL)
1327			*eapp = eap;
1328		if (eac != NULL)
1329			*eac = EXTATTR_CONTENT(eap);
1330		return (EXTATTR_CONTENT_SIZE(eap));
1331	}
1332	return (-1);
1333}
1334
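/*
 * Read the inode's extended attribute area from disk into a malloc(9)'d
 * buffer, validating each record and trimming a zeroed-out tail.  On
 * success the buffer is returned via *p and its length is stored in
 * ip->i_ea_len.
 */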
1335static int
1336ffs_rdextattr(uint8_t **p, struct vnode *vp, struct thread *td)
1337{
1338	const struct extattr *eap, *eaend, *eapnext;
1339	struct inode *ip;
1340	struct ufs2_dinode *dp;
1341	struct fs *fs;
1342	struct uio luio;
1343	struct iovec liovec;
1344	uint64_t easize;
1345	int error;
1346	uint8_t *eae;
1347
1348	ip = VTOI(vp);
1349	fs = ITOFS(ip);
1350	dp = ip->i_din2;
1351	easize = dp->di_extsize;
1352	if ((uoff_t)easize > UFS_NXADDR * fs->fs_bsize)
1353		return (EFBIG);
1354
1355	eae = malloc(easize, M_TEMP, M_WAITOK);
1356
1357	liovec.iov_base = eae;
1358	liovec.iov_len = easize;
1359	luio.uio_iov = &liovec;
1360	luio.uio_iovcnt = 1;
1361	luio.uio_offset = 0;
1362	luio.uio_resid = easize;
1363	luio.uio_segflg = UIO_SYSSPACE;
1364	luio.uio_rw = UIO_READ;
1365	luio.uio_td = td;
1366
1367	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
1368	if (error) {
1369		free(eae, M_TEMP);
1370		return (error);
1371	}
1372	/* Validate disk xattrfile contents. */
1373	for (eap = (void *)eae, eaend = (void *)(eae + easize); eap < eaend;
1374	    eap = eapnext) {
1375		/* Detect zeroed out tail */
1376		if (eap->ea_length < sizeof(*eap) || eap->ea_length == 0) {
1377			easize = (const uint8_t *)eap - eae;
1378			break;
1379		}
1380
1381		eapnext = EXTATTR_NEXT(eap);
1382		/* Bogusly long entry. */
1383		if (eapnext > eaend) {
1384			free(eae, M_TEMP);
1385			return (EINTEGRITY);
1386		}
1387	}
1388	ip->i_ea_len = easize;
1389	*p = eae;
1390	return (0);
1391}
1392
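/*
 * Serialize access to the in-memory extended attribute area.  The lock is
 * a flag (IN_EA_LOCKED) protected by the vnode interlock; contending
 * threads set IN_EA_LOCKWAIT and sleep on i_ea_refs.
 */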
1393static void
1394ffs_lock_ea(struct vnode *vp)
1395{
1396	struct inode *ip;
1397
1398	ip = VTOI(vp);
1399	VI_LOCK(vp);
1400	while (ip->i_flag & IN_EA_LOCKED) {
1401		UFS_INODE_SET_FLAG(ip, IN_EA_LOCKWAIT);
1402		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
1403		    0);
1404	}
1405	UFS_INODE_SET_FLAG(ip, IN_EA_LOCKED);
1406	VI_UNLOCK(vp);
1407}
1408
1409static void
1410ffs_unlock_ea(struct vnode *vp)
1411{
1412	struct inode *ip;
1413
1414	ip = VTOI(vp);
1415	VI_LOCK(vp);
1416	if (ip->i_flag & IN_EA_LOCKWAIT)
1417		wakeup(&ip->i_ea_refs);
1418	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
1419	VI_UNLOCK(vp);
1420}
1421
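/*
 * Start an extended attribute transaction: load the EA area into memory on
 * first use, otherwise just take another reference on the cached copy.
 */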
1422static int
1423ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
1424{
1425	struct inode *ip;
1426	int error;
1427
1428	ip = VTOI(vp);
1429
1430	ffs_lock_ea(vp);
1431	if (ip->i_ea_area != NULL) {
1432		ip->i_ea_refs++;
1433		ffs_unlock_ea(vp);
1434		return (0);
1435	}
1436	error = ffs_rdextattr(&ip->i_ea_area, vp, td);
1437	if (error) {
1438		ffs_unlock_ea(vp);
1439		return (error);
1440	}
1441	ip->i_ea_error = 0;
1442	ip->i_ea_refs++;
1443	ffs_unlock_ea(vp);
1444	return (0);
1445}
1446
1447/*
1448 * Vnode extattr transaction commit/abort
1449 */
1450static int
1451ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred, struct thread *td)
1452{
1453	struct inode *ip;
1454	struct uio luio;
1455	struct iovec *liovec;
1456	struct ufs2_dinode *dp;
1457	size_t ea_len, tlen;
1458	int error, i, lcnt;
1459	bool truncate;
1460
1461	ip = VTOI(vp);
1462
1463	ffs_lock_ea(vp);
1464	if (ip->i_ea_area == NULL) {
1465		ffs_unlock_ea(vp);
1466		return (EINVAL);
1467	}
1468	dp = ip->i_din2;
1469	error = ip->i_ea_error;
1470	truncate = false;
1471	if (commit && error == 0) {
1472		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
1473		if (cred == NOCRED)
1474			cred =  vp->v_mount->mnt_cred;
1475
1476		ea_len = MAX(ip->i_ea_len, dp->di_extsize);
1477		for (lcnt = 1, tlen = ea_len - ip->i_ea_len; tlen > 0;) {
1478			tlen -= MIN(ZERO_REGION_SIZE, tlen);
1479			lcnt++;
1480		}
1481
1482		liovec = __builtin_alloca(lcnt * sizeof(struct iovec));
1483		luio.uio_iovcnt = lcnt;
1484
1485		liovec[0].iov_base = ip->i_ea_area;
1486		liovec[0].iov_len = ip->i_ea_len;
1487		for (i = 1, tlen = ea_len - ip->i_ea_len; i < lcnt; i++) {
1488			liovec[i].iov_base = __DECONST(void *, zero_region);
1489			liovec[i].iov_len = MIN(ZERO_REGION_SIZE, tlen);
1490			tlen -= liovec[i].iov_len;
1491		}
1492		MPASS(tlen == 0);
1493
1494		luio.uio_iov = liovec;
1495		luio.uio_offset = 0;
1496		luio.uio_resid = ea_len;
1497		luio.uio_segflg = UIO_SYSSPACE;
1498		luio.uio_rw = UIO_WRITE;
1499		luio.uio_td = td;
1500		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
1501		if (error == 0 && ip->i_ea_len == 0)
1502			truncate = true;
1503	}
1504	if (--ip->i_ea_refs == 0) {
1505		free(ip->i_ea_area, M_TEMP);
1506		ip->i_ea_area = NULL;
1507		ip->i_ea_len = 0;
1508		ip->i_ea_error = 0;
1509	}
1510	ffs_unlock_ea(vp);
1511
1512	if (truncate)
1513		ffs_truncate(vp, 0, IO_EXT, cred);
1514	return (error);
1515}
1516
1517/*
1518 * Vnode extattr strategy routine for fifos.
1519 *
 * We need to check for a read or write of the extended attributes.
1521 * Otherwise we just fall through and do the usual thing.
1522 */
1523static int
1524ffsext_strategy(
1525	struct vop_strategy_args /* {
1526		struct vnodeop_desc *a_desc;
1527		struct vnode *a_vp;
1528		struct buf *a_bp;
1529	} */ *ap)
1530{
1531	struct vnode *vp;
1532	daddr_t lbn;
1533
1534	vp = ap->a_vp;
1535	lbn = ap->a_bp->b_lblkno;
1536	if (I_IS_UFS2(VTOI(vp)) && lbn < 0 && lbn >= -UFS_NXADDR)
1537		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
1538	if (vp->v_type == VFIFO)
1539		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
1540	panic("spec nodes went here");
1541}
1542
1543/*
 * Vnode extattr transaction start.
1545 */
1546static int
1547ffs_openextattr(
1548	struct vop_openextattr_args /* {
1549		struct vnodeop_desc *a_desc;
1550		struct vnode *a_vp;
1551		IN struct ucred *a_cred;
1552		IN struct thread *a_td;
1553	} */ *ap)
1554{
1555
1556	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1557		return (EOPNOTSUPP);
1558
1559	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
1560}
1561
1562/*
1563 * Vnode extattr transaction commit/abort
1564 */
1565static int
1566ffs_closeextattr(
1567	struct vop_closeextattr_args /* {
1568		struct vnodeop_desc *a_desc;
1569		struct vnode *a_vp;
1570		int a_commit;
1571		IN struct ucred *a_cred;
1572		IN struct thread *a_td;
1573	} */ *ap)
1574{
1575	struct vnode *vp;
1576
1577	vp = ap->a_vp;
1578	if (vp->v_type == VCHR || vp->v_type == VBLK)
1579		return (EOPNOTSUPP);
1580	if (ap->a_commit && (vp->v_mount->mnt_flag & MNT_RDONLY) != 0)
1581		return (EROFS);
1582
1583	if (ap->a_commit && DOINGSUJ(vp)) {
1584		ASSERT_VOP_ELOCKED(vp, "ffs_closeextattr commit");
1585		softdep_prealloc(vp, MNT_WAIT);
1586		if (vp->v_data == NULL)
1587			return (EBADF);
1588	}
1589	return (ffs_close_ea(vp, ap->a_commit, ap->a_cred, ap->a_td));
1590}
1591
1592/*
1593 * Vnode operation to remove a named attribute.
1594 */
1595static int
1596ffs_deleteextattr(
1597	struct vop_deleteextattr_args /* {
1598		IN struct vnode *a_vp;
1599		IN int a_attrnamespace;
1600		IN const char *a_name;
1601		IN struct ucred *a_cred;
1602		IN struct thread *a_td;
1603	} */ *ap)
1604{
1605	struct vnode *vp;
1606	struct inode *ip;
1607	struct extattr *eap;
1608	uint32_t ul;
1609	int olen, error, i, easize;
1610	uint8_t *eae;
1611	void *tmp;
1612
1613	vp = ap->a_vp;
1614	ip = VTOI(vp);
1615
1616	if (vp->v_type == VCHR || vp->v_type == VBLK)
1617		return (EOPNOTSUPP);
1618	if (strlen(ap->a_name) == 0)
1619		return (EINVAL);
1620	if (vp->v_mount->mnt_flag & MNT_RDONLY)
1621		return (EROFS);
1622
1623	error = extattr_check_cred(vp, ap->a_attrnamespace,
1624	    ap->a_cred, ap->a_td, VWRITE);
1625	if (error) {
1626		/*
1627		 * ffs_lock_ea is not needed there, because the vnode
1628		 * must be exclusively locked.
1629		 */
1630		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1631			ip->i_ea_error = error;
1632		return (error);
1633	}
1634
1635	if (DOINGSUJ(vp)) {
1636		ASSERT_VOP_ELOCKED(vp, "ffs_deleteextattr");
1637		softdep_prealloc(vp, MNT_WAIT);
1638		if (vp->v_data == NULL)
1639			return (EBADF);
1640	}
1641
1642	error = ffs_open_ea(vp, ap->a_cred, ap->a_td);
1643	if (error)
1644		return (error);
1645
1646	/* CEM: delete could be done in-place instead */
1647	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
1648	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1649	easize = ip->i_ea_len;
1650
1651	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1652	    &eap, NULL);
1653	if (olen == -1) {
1654		/* delete but nonexistent */
1655		free(eae, M_TEMP);
1656		ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1657		return (ENOATTR);
1658	}
1659	ul = eap->ea_length;
1660	i = (uint8_t *)EXTATTR_NEXT(eap) - eae;
1661	bcopy(EXTATTR_NEXT(eap), eap, easize - i);
1662	easize -= ul;
1663
1664	tmp = ip->i_ea_area;
1665	ip->i_ea_area = eae;
1666	ip->i_ea_len = easize;
1667	free(tmp, M_TEMP);
1668	error = ffs_close_ea(vp, 1, ap->a_cred, ap->a_td);
1669	return (error);
1670}
1671
1672/*
1673 * Vnode operation to retrieve a named extended attribute.
1674 */
1675static int
1676ffs_getextattr(
1677	struct vop_getextattr_args /* {
1678		IN struct vnode *a_vp;
1679		IN int a_attrnamespace;
1680		IN const char *a_name;
1681		INOUT struct uio *a_uio;
1682		OUT size_t *a_size;
1683		IN struct ucred *a_cred;
1684		IN struct thread *a_td;
1685	} */ *ap)
1686{
1687	struct inode *ip;
1688	uint8_t *eae, *p;
1689	unsigned easize;
1690	int error, ealen;
1691
1692	ip = VTOI(ap->a_vp);
1693
1694	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1695		return (EOPNOTSUPP);
1696
1697	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1698	    ap->a_cred, ap->a_td, VREAD);
1699	if (error)
1700		return (error);
1701
1702	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1703	if (error)
1704		return (error);
1705
1706	eae = ip->i_ea_area;
1707	easize = ip->i_ea_len;
1708
1709	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1710	    NULL, &p);
1711	if (ealen >= 0) {
1712		error = 0;
1713		if (ap->a_size != NULL)
1714			*ap->a_size = ealen;
1715		else if (ap->a_uio != NULL)
1716			error = uiomove(p, ealen, ap->a_uio);
1717	} else
1718		error = ENOATTR;
1719
1720	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1721	return (error);
1722}
1723
1724/*
1725 * Vnode operation to retrieve extended attributes on a vnode.
1726 */
1727static int
1728ffs_listextattr(
1729	struct vop_listextattr_args /* {
1730		IN struct vnode *a_vp;
1731		IN int a_attrnamespace;
1732		INOUT struct uio *a_uio;
1733		OUT size_t *a_size;
1734		IN struct ucred *a_cred;
1735		IN struct thread *a_td;
1736	} */ *ap)
1737{
1738	struct inode *ip;
1739	struct extattr *eap, *eaend;
1740	int error, ealen;
1741
1742	ip = VTOI(ap->a_vp);
1743
1744	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
1745		return (EOPNOTSUPP);
1746
1747	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
1748	    ap->a_cred, ap->a_td, VREAD);
1749	if (error)
1750		return (error);
1751
1752	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
1753	if (error)
1754		return (error);
1755
1756	error = 0;
1757	if (ap->a_size != NULL)
1758		*ap->a_size = 0;
1759
1760	KASSERT(ALIGNED_TO(ip->i_ea_area, struct extattr), ("unaligned"));
1761	eap = (struct extattr *)ip->i_ea_area;
1762	eaend = (struct extattr *)(ip->i_ea_area + ip->i_ea_len);
1763	for (; error == 0 && eap < eaend; eap = EXTATTR_NEXT(eap)) {
1764		KASSERT(EXTATTR_NEXT(eap) <= eaend,
1765		    ("extattr next %p beyond %p", EXTATTR_NEXT(eap), eaend));
1766		if (eap->ea_namespace != ap->a_attrnamespace)
1767			continue;
1768
1769		ealen = eap->ea_namelength;
1770		if (ap->a_size != NULL)
1771			*ap->a_size += ealen + 1;
1772		else if (ap->a_uio != NULL)
1773			error = uiomove(&eap->ea_namelength, ealen + 1,
1774			    ap->a_uio);
1775	}
1776
1777	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
1778	return (error);
1779}
1780
1781/*
1782 * Vnode operation to set a named attribute.
1783 */
1784static int
1785ffs_setextattr(
1786	struct vop_setextattr_args /* {
1787		IN struct vnode *a_vp;
1788		IN int a_attrnamespace;
1789		IN const char *a_name;
1790		INOUT struct uio *a_uio;
1791		IN struct ucred *a_cred;
1792		IN struct thread *a_td;
1793	} */ *ap)
1794{
1795	struct vnode *vp;
1796	struct inode *ip;
1797	struct fs *fs;
1798	struct extattr *eap;
1799	uint32_t ealength, ul;
1800	ssize_t ealen;
1801	int olen, eapad1, eapad2, error, i, easize;
1802	uint8_t *eae;
1803	void *tmp;
1804
1805	vp = ap->a_vp;
1806	ip = VTOI(vp);
1807	fs = ITOFS(ip);
1808
1809	if (vp->v_type == VCHR || vp->v_type == VBLK)
1810		return (EOPNOTSUPP);
1811	if (strlen(ap->a_name) == 0)
1812		return (EINVAL);
1813
1814	/* XXX Now unsupported API to delete EAs using NULL uio. */
1815	if (ap->a_uio == NULL)
1816		return (EOPNOTSUPP);
1817
1818	if (vp->v_mount->mnt_flag & MNT_RDONLY)
1819		return (EROFS);
1820
1821	ealen = ap->a_uio->uio_resid;
1822	if (ealen < 0 || ealen > lblktosize(fs, UFS_NXADDR))
1823		return (EINVAL);
1824
1825	error = extattr_check_cred(vp, ap->a_attrnamespace,
1826	    ap->a_cred, ap->a_td, VWRITE);
1827	if (error) {
1828		/*
1829		 * ffs_lock_ea is not needed there, because the vnode
1830		 * must be exclusively locked.
1831		 */
1832		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1833			ip->i_ea_error = error;
1834		return (error);
1835	}
1836
1837	if (DOINGSUJ(vp)) {
		ASSERT_VOP_ELOCKED(vp, "ffs_setextattr");
1839		softdep_prealloc(vp, MNT_WAIT);
1840		if (vp->v_data == NULL)
1841			return (EBADF);
1842	}
1843
1844	error = ffs_open_ea(vp, ap->a_cred, ap->a_td);
1845	if (error)
1846		return (error);
1847
1848	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
1849	eapad1 = roundup2(ealength, 8) - ealength;
1850	eapad2 = roundup2(ealen, 8) - ealen;
1851	ealength += eapad1 + ealen + eapad2;
1852
1853	/*
1854	 * CEM: rewrites of the same size or smaller could be done in-place
1855	 * instead.  (We don't acquire any fine-grained locks in here either,
1856	 * so we could also do bigger writes in-place.)
1857	 */
1858	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
1859	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
1860	easize = ip->i_ea_len;
1861
1862	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
1863	    &eap, NULL);
	if (olen == -1) {
1865		/* new, append at end */
1866		KASSERT(ALIGNED_TO(eae + easize, struct extattr),
1867		    ("unaligned"));
1868		eap = (struct extattr *)(eae + easize);
1869		easize += ealength;
1870	} else {
1871		ul = eap->ea_length;
1872		i = (uint8_t *)EXTATTR_NEXT(eap) - eae;
1873		if (ul != ealength) {
1874			bcopy(EXTATTR_NEXT(eap), (uint8_t *)eap + ealength,
1875			    easize - i);
1876			easize += (ealength - ul);
1877		}
1878	}
1879	if (easize > lblktosize(fs, UFS_NXADDR)) {
1880		free(eae, M_TEMP);
1881		ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1882		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1883			ip->i_ea_error = ENOSPC;
1884		return (ENOSPC);
1885	}
1886	eap->ea_length = ealength;
1887	eap->ea_namespace = ap->a_attrnamespace;
1888	eap->ea_contentpadlen = eapad2;
1889	eap->ea_namelength = strlen(ap->a_name);
1890	memcpy(eap->ea_name, ap->a_name, strlen(ap->a_name));
1891	bzero(&eap->ea_name[strlen(ap->a_name)], eapad1);
1892	error = uiomove(EXTATTR_CONTENT(eap), ealen, ap->a_uio);
1893	if (error) {
1894		free(eae, M_TEMP);
1895		ffs_close_ea(vp, 0, ap->a_cred, ap->a_td);
1896		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
1897			ip->i_ea_error = error;
1898		return (error);
1899	}
1900	bzero((uint8_t *)EXTATTR_CONTENT(eap) + ealen, eapad2);
1901
1902	tmp = ip->i_ea_area;
1903	ip->i_ea_area = eae;
1904	ip->i_ea_len = easize;
1905	free(tmp, M_TEMP);
1906	error = ffs_close_ea(vp, 1, ap->a_cred, ap->a_td);
1907	return (error);
1908}
1909
1910/*
1911 * Vnode pointer to File handle
1912 */
1913static int
1914ffs_vptofh(
1915	struct vop_vptofh_args /* {
1916		IN struct vnode *a_vp;
1917		IN struct fid *a_fhp;
1918	} */ *ap)
1919{
1920	struct inode *ip;
1921	struct ufid *ufhp;
1922
1923	ip = VTOI(ap->a_vp);
1924	ufhp = (struct ufid *)ap->a_fhp;
1925	ufhp->ufid_len = sizeof(struct ufid);
1926	ufhp->ufid_ino = ip->i_number;
1927	ufhp->ufid_gen = ip->i_gen;
1928	return (0);
1929}
1930
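/*
 * The buffer pager is the default; the knob below is both a loader tunable
 * and a read-write sysctl, so the legacy bmap-based pager can be selected
 * at runtime, e.g. with "sysctl vfs.ffs.use_buf_pager=0".
 */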
1931SYSCTL_DECL(_vfs_ffs);
1932static int use_buf_pager = 1;
1933SYSCTL_INT(_vfs_ffs, OID_AUTO, use_buf_pager, CTLFLAG_RWTUN, &use_buf_pager, 0,
1934    "Always use buffer pager instead of bmap");
1935
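/*
 * Callbacks handed to vfs_bio_getpages(): translate a file offset to a
 * logical block number and report that block's size for the buffer pager.
 */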
1936static daddr_t
1937ffs_gbp_getblkno(struct vnode *vp, vm_ooffset_t off)
1938{
1939
1940	return (lblkno(VFSTOUFS(vp->v_mount)->um_fs, off));
1941}
1942
1943static int
1944ffs_gbp_getblksz(struct vnode *vp, daddr_t lbn, long *sz)
1945{
1946
1947	*sz = blksize(VFSTOUFS(vp->v_mount)->um_fs, VTOI(vp), lbn);
1948	return (0);
1949}
1950
1951static int
1952ffs_getpages(struct vop_getpages_args *ap)
1953{
1954	struct vnode *vp;
1955	struct ufsmount *um;
1956
1957	vp = ap->a_vp;
1958	um = VFSTOUFS(vp->v_mount);
1959
1960	if (!use_buf_pager && um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE)
1961		return (vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1962		    ap->a_rbehind, ap->a_rahead, NULL, NULL));
1963	return (vfs_bio_getpages(vp, ap->a_m, ap->a_count, ap->a_rbehind,
1964	    ap->a_rahead, ffs_gbp_getblkno, ffs_gbp_getblksz));
1965}
1966
1967static int
1968ffs_getpages_async(struct vop_getpages_async_args *ap)
1969{
1970	struct vnode *vp;
1971	struct ufsmount *um;
1972	bool do_iodone;
1973	int error;
1974
1975	vp = ap->a_vp;
1976	um = VFSTOUFS(vp->v_mount);
1977	do_iodone = true;
1978
1979	if (um->um_devvp->v_bufobj.bo_bsize <= PAGE_SIZE) {
1980		error = vnode_pager_generic_getpages(vp, ap->a_m, ap->a_count,
1981		    ap->a_rbehind, ap->a_rahead, ap->a_iodone, ap->a_arg);
1982		if (error == 0)
1983			do_iodone = false;
1984	} else {
1985		error = vfs_bio_getpages(vp, ap->a_m, ap->a_count,
1986		    ap->a_rbehind, ap->a_rahead, ffs_gbp_getblkno,
1987		    ffs_gbp_getblksz);
1988	}
1989	if (do_iodone && ap->a_iodone != NULL)
1990		ap->a_iodone(ap->a_arg, ap->a_m, ap->a_count, error);
1991
1992	return (error);
1993}
1994
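/*
 * Release the directory vnode after an operation, and optionally the child
 * vnode as well.  Work deferred on the directory (an IN_ENDOFF truncation
 * to its true end or an IN_NEEDSYNC sync) is carried out here, once no
 * other vnode locks are held.
 */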
1995static int
1996ffs_vput_pair(struct vop_vput_pair_args *ap)
1997{
1998	struct mount *mp;
1999	struct vnode *dvp, *vp, *vp1, **vpp;
2000	struct inode *dp, *ip;
2001	ino_t ip_ino;
2002	uint64_t ip_gen;
2003	int error, vp_locked;
2004
2005	dvp = ap->a_dvp;
2006	dp = VTOI(dvp);
2007	vpp = ap->a_vpp;
2008	vp = vpp != NULL ? *vpp : NULL;
2009
2010	if ((dp->i_flag & (IN_NEEDSYNC | IN_ENDOFF)) == 0) {
2011		vput(dvp);
2012		if (vp != NULL && ap->a_unlock_vp)
2013			vput(vp);
2014		return (0);
2015	}
2016
2017	mp = dvp->v_mount;
2018	if (vp != NULL) {
2019		if (ap->a_unlock_vp) {
2020			vput(vp);
2021		} else {
2022			MPASS(vp->v_type != VNON);
2023			vp_locked = VOP_ISLOCKED(vp);
2024			ip = VTOI(vp);
2025			ip_ino = ip->i_number;
2026			ip_gen = ip->i_gen;
2027			VOP_UNLOCK(vp);
2028		}
2029	}
2030
	/*
	 * If compaction or fsync was requested, do it here in ffs_vput_pair()
	 * now that other locks are no longer held.
	 */
2035	if ((dp->i_flag & IN_ENDOFF) != 0) {
2036		VNASSERT(I_ENDOFF(dp) != 0 && I_ENDOFF(dp) < dp->i_size, dvp,
2037		    ("IN_ENDOFF set but I_ENDOFF() is not"));
2038		dp->i_flag &= ~IN_ENDOFF;
2039		error = UFS_TRUNCATE(dvp, (off_t)I_ENDOFF(dp), IO_NORMAL |
2040		    (DOINGASYNC(dvp) ? 0 : IO_SYNC), curthread->td_ucred);
2041		if (error != 0 && error != ERELOOKUP) {
2042			if (!ffs_fsfail_cleanup(VFSTOUFS(mp), error)) {
2043				vn_printf(dvp,
2044				    "IN_ENDOFF: failed to truncate, "
2045				    "error %d\n", error);
2046			}
2047#ifdef UFS_DIRHASH
2048			ufsdirhash_free(dp);
2049#endif
2050		}
2051		SET_I_ENDOFF(dp, 0);
2052	}
2053	if ((dp->i_flag & IN_NEEDSYNC) != 0) {
2054		do {
2055			error = ffs_syncvnode(dvp, MNT_WAIT, 0);
2056		} while (error == ERELOOKUP);
2057	}
2058
2059	vput(dvp);
2060
2061	if (vp == NULL || ap->a_unlock_vp)
2062		return (0);
2063	MPASS(mp != NULL);
2064
2065	/*
2066	 * It is possible that vp is reclaimed at this point. Only
2067	 * routines that call us with a_unlock_vp == false can find
2068	 * that their vp has been reclaimed. There are three areas
2069	 * that are affected:
2070	 * 1) vn_open_cred() - later VOPs could fail, but
2071	 *    dead_open() returns 0 to simulate successful open.
2072	 * 2) ffs_snapshot() - creation of snapshot fails with EBADF.
2073	 * 3) NFS server (several places) - code is prepared to detect
2074	 *    and respond to dead vnodes by returning ESTALE.
2075	 */
2076	VOP_LOCK(vp, vp_locked | LK_RETRY);
2077	if (IS_UFS(vp))
2078		return (0);
2079
	/*
	 * Try harder to recover from a reclaimed vp if the reclaim was not
	 * caused by the underlying inode being cleared.  We saved the inode
	 * number and generation, so we can try to reinstantiate exactly the
	 * same version of the inode.  If this fails, return the original
	 * doomed vnode and let the caller handle the consequences.
	 *
	 * Note that callers must keep a write started around VOP_VPUT_PAIR()
	 * calls, so it is safe to use mp without busying it.
	 */
2092	VOP_UNLOCK(vp);
2093	error = ffs_inotovp(mp, ip_ino, ip_gen, LK_EXCLUSIVE, &vp1,
2094	    FFSV_REPLACE_DOOMED);
2095	if (error != 0) {
2096		VOP_LOCK(vp, vp_locked | LK_RETRY);
2097	} else {
2098		vrele(vp);
2099		*vpp = vp1;
2100	}
2101	return (error);
2102}
2103