/*-
 * Copyright (c) 2002, 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * from: $FreeBSD: .../ufs/ufs_readwrite.c,v 1.96 2002/08/12 09:22:11 phk ...
 *	@(#)ffs_vnops.c	8.15 (Berkeley) 5/14/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/extattr.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/priv.h>
#include <sys/rwlock.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>

#include <ufs/ufs/extattr.h>
#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
#include "opt_directio.h"
#include "opt_ffs.h"

#ifdef DIRECTIO
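/*
 * ffs_rawread() is implemented in ffs_rawread.c and is compiled in only
 * when the DIRECTIO option is configured; it services IO_DIRECT reads
 * without going through the buffer cache.
 */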
extern int	ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif
static vop_fsync_t	ffs_fsync;
static vop_lock1_t	ffs_lock;
static vop_getpages_t	ffs_getpages;
static vop_read_t	ffs_read;
static vop_write_t	ffs_write;
static int	ffs_extread(struct vnode *vp, struct uio *uio, int ioflag);
static int	ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag,
		    struct ucred *cred);
static vop_strategy_t	ffsext_strategy;
static vop_closeextattr_t	ffs_closeextattr;
static vop_deleteextattr_t	ffs_deleteextattr;
static vop_getextattr_t	ffs_getextattr;
static vop_listextattr_t	ffs_listextattr;
static vop_openextattr_t	ffs_openextattr;
static vop_setextattr_t	ffs_setextattr;
static vop_vptofh_t	ffs_vptofh;

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops1 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops1 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_reallocblks =	ffs_reallocblks, /* XXX: really ??? */
	.vop_vptofh =		ffs_vptofh,
};

/* Global vfs data structures for ufs. */
struct vop_vector ffs_vnodeops2 = {
	.vop_default =		&ufs_vnodeops,
	.vop_fsync =		ffs_fsync,
	.vop_getpages =		ffs_getpages,
	.vop_lock1 =		ffs_lock,
	.vop_read =		ffs_read,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_write =		ffs_write,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

struct vop_vector ffs_fifoops2 = {
	.vop_default =		&ufs_fifoops,
	.vop_fsync =		ffs_fsync,
	.vop_lock1 =		ffs_lock,
	.vop_reallocblks =	ffs_reallocblks,
	.vop_strategy =		ffsext_strategy,
	.vop_closeextattr =	ffs_closeextattr,
	.vop_deleteextattr =	ffs_deleteextattr,
	.vop_getextattr =	ffs_getextattr,
	.vop_listextattr =	ffs_listextattr,
	.vop_openextattr =	ffs_openextattr,
	.vop_setextattr =	ffs_setextattr,
	.vop_vptofh =		ffs_vptofh,
};

/*
 * Synch an open file.
 */
/* ARGSUSED */
static int
ffs_fsync(struct vop_fsync_args *ap)
{
	struct vnode *vp;
	struct bufobj *bo;
	int error;

	vp = ap->a_vp;
	bo = &vp->v_bufobj;
retry:
	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
	if (error)
		return (error);
	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
		error = softdep_fsync(vp);
		if (error)
			return (error);

		/*
		 * The softdep_fsync() function may drop vp lock,
		 * allowing for dirty buffers to reappear on the
		 * bo_dirty list. Recheck and resync as needed.
		 */
		BO_LOCK(bo);
		if ((vp->v_type == VREG || vp->v_type == VDIR) &&
		    (bo->bo_numoutput > 0 || bo->bo_dirty.bv_cnt > 0)) {
			BO_UNLOCK(bo);
			goto retry;
		}
		BO_UNLOCK(bo);
	}
	return (0);
}

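/*
 * Flush all dirty buffers associated with a vnode and, unless NO_INO_UPDT
 * is given, write the inode itself.  With waitfor == MNT_WAIT the flush is
 * synchronous and makes repeated passes until the dependency work settles;
 * otherwise a single async pass is made.
 */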
int
ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
{
	struct inode *ip;
	struct bufobj *bo;
	struct buf *bp;
	struct buf *nbp;
	ufs_lbn_t lbn;
	int error, wait, passes;

	ip = VTOI(vp);
	ip->i_flag &= ~IN_NEEDSYNC;
	bo = &vp->v_bufobj;

	/*
	 * When doing MNT_WAIT we must first flush all dependencies
	 * on the inode.
	 */
	if (DOINGSOFTDEP(vp) && waitfor == MNT_WAIT &&
	    (error = softdep_sync_metadata(vp)) != 0)
		return (error);

	/*
	 * Flush all dirty buffers associated with a vnode.
	 */
	error = 0;
	passes = 0;
	wait = 0;	/* Always do an async pass first. */
	lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
	BO_LOCK(bo);
loop:
	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
		bp->b_vflags &= ~BV_SCANNED;
	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
		/*
		 * Reasons to skip this buffer: it has already been considered
		 * on this pass, the buffer has dependencies that will cause
		 * it to be redirtied and it has not already been deferred,
		 * or it is already being written.
		 */
		if ((bp->b_vflags & BV_SCANNED) != 0)
			continue;
		bp->b_vflags |= BV_SCANNED;
		/* Flush indirects in order. */
		if (waitfor == MNT_WAIT && bp->b_lblkno <= -NDADDR &&
		    lbn_level(bp->b_lblkno) >= passes)
			continue;
		if (bp->b_lblkno > lbn)
			panic("ffs_syncvnode: syncing truncated data.");
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0) {
			BO_UNLOCK(bo);
		} else if (wait != 0) {
			if (BUF_LOCK(bp,
			    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
			    BO_LOCKPTR(bo)) != 0) {
				bp->b_vflags &= ~BV_SCANNED;
				goto next;
			}
		} else
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			panic("ffs_fsync: not dirty");
		/*
		 * Check for dependencies and potentially complete them.
		 */
		if (!LIST_EMPTY(&bp->b_dep) &&
		    (error = softdep_sync_buf(vp, bp,
		    wait ? MNT_WAIT : MNT_NOWAIT)) != 0) {
			/* I/O error. */
			if (error != EBUSY) {
				BUF_UNLOCK(bp);
				return (error);
			}
			/* If we deferred once, don't defer again. */
			if ((bp->b_flags & B_DEFERRED) == 0) {
				bp->b_flags |= B_DEFERRED;
				BUF_UNLOCK(bp);
				goto next;
			}
		}
		if (wait) {
			bremfree(bp);
			if ((error = bwrite(bp)) != 0)
				return (error);
		} else if ((bp->b_flags & B_CLUSTEROK)) {
			(void) vfs_bio_awrite(bp);
		} else {
			bremfree(bp);
			(void) bawrite(bp);
		}
next:
		/*
		 * Since we may have slept during the I/O, we need
		 * to start from a known point.
		 */
		BO_LOCK(bo);
		nbp = TAILQ_FIRST(&bo->bo_dirty.bv_hd);
	}
	if (waitfor != MNT_WAIT) {
		BO_UNLOCK(bo);
		if ((flags & NO_INO_UPDT) != 0)
			return (0);
		else
			return (ffs_update(vp, 0));
	}
	/* Drain IO to see if we're done. */
	bufobj_wwait(bo, 0, 0);
	/*
	 * Block devices associated with filesystems may have new I/O
	 * requests posted for them even if the vnode is locked, so no
	 * amount of trying will get them clean.  We make several passes
	 * as a best effort.
	 *
	 * Regular files may need multiple passes to flush all dependency
	 * work as it is possible that we must write once per indirect
	 * level, once for the leaf, and once for the inode and each of
	 * these will be done with one sync and one async pass.
	 */
	if (bo->bo_dirty.bv_cnt > 0) {
		/* Write the inode after sync passes to flush deps. */
		if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) {
			BO_UNLOCK(bo);
			ffs_update(vp, 1);
			BO_LOCK(bo);
		}
		/* switch between sync/async. */
		wait = !wait;
		if (wait == 1 || ++passes < NIADDR + 2)
			goto loop;
#ifdef INVARIANTS
		if (!vn_isdisk(vp, NULL))
			vprint("ffs_fsync: dirty", vp);
#endif
	}
	BO_UNLOCK(bo);
	error = 0;
	if ((flags & NO_INO_UPDT) == 0)
		error = ffs_update(vp, 1);
	if (DOINGSUJ(vp))
		softdep_journal_fsync(VTOI(vp));
	return (error);
}

static int
ffs_lock(ap)
	struct vop_lock1_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct thread *a_td;
		char *file;
		int line;
	} */ *ap;
{
#ifndef NO_FFS_SNAPSHOT
	struct vnode *vp;
	int flags;
	struct lock *lkp;
	int result;

	switch (ap->a_flags & LK_TYPE_MASK) {
	case LK_SHARED:
	case LK_UPGRADE:
	case LK_EXCLUSIVE:
		vp = ap->a_vp;
		flags = ap->a_flags;
		for (;;) {
#ifdef DEBUG_VFS_LOCKS
			KASSERT(vp->v_holdcnt != 0,
			    ("ffs_lock %p: zero hold count", vp));
#endif
			lkp = vp->v_vnlock;
			result = _lockmgr_args(lkp, flags, VI_MTX(vp),
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if (lkp == vp->v_vnlock || result != 0)
				break;
			/*
			 * Apparent success, except that the vnode
			 * mutated between snapshot file vnode and
			 * regular file vnode while this process
			 * slept.  The lock currently held is not the
			 * right lock.  Release it, and try to get the
			 * new lock.
			 */
			(void) _lockmgr_args(lkp, LK_RELEASE, NULL,
			    LK_WMESG_DEFAULT, LK_PRIO_DEFAULT, LK_TIMO_DEFAULT,
			    ap->a_file, ap->a_line);
			if ((flags & (LK_INTERLOCK | LK_NOWAIT)) ==
			    (LK_INTERLOCK | LK_NOWAIT))
				return (EBUSY);
			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
			flags &= ~LK_INTERLOCK;
		}
		break;
	default:
		result = VOP_LOCK1_APV(&ufs_vnodeops, ap);
	}
	return (result);
#else
	return (VOP_LOCK1_APV(&ufs_vnodeops, ap));
#endif
}

/*
 * Vnode op for reading.
 */
static int
ffs_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct inode *ip;
	struct uio *uio;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;
	int seqcount;
	int ioflag;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extread(vp, uio, ioflag));
#else
		panic("ffs_read+IO_EXT");
#endif
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error != 0 || workdone != 0)
			return (error);
	}
#endif

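	/*
	 * The upper layers encode a sequential-access heuristic in the
	 * high bits of a_ioflag; recover it for the cluster code below.
	 */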
	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ)
		panic("ffs_read: mode");

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("ffs_read: short symlink");
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("ffs_read: type %d", vp->v_type);
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_read: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_read: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if (uio->uio_offset < ip->i_size &&
	    uio->uio_offset >= fs->fs_maxfilesize)
		return (EOVERFLOW);

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = blksize(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread_gb(vp, lbn, size, NOCRED,
			    GB_UNMAPPED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, blkoffset + uio->uio_resid,
			    seqcount, GB_UNMAPPED, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = blksize(fs, ip, nextlbn);
			error = breadn_flags(vp, lbn, size, &nextlbn,
			    &nextsize, 1, NOCRED, GB_UNMAPPED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for. Interestingly, the same as
			 * the first option above.
			 */
			error = bread_gb(vp, lbn, size, NOCRED,
			    GB_UNMAPPED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		if ((bp->b_flags & B_UNMAPPED) == 0) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		   (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever made the request take
			 * care of freeing it. We just queue it onto
			 * another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		   (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0 &&
	    (ip->i_flag & IN_ACCESS) == 0) {
		VI_LOCK(vp);
		ip->i_flag |= IN_ACCESS;
		VI_UNLOCK(vp);
	}
	return (error);
}

/*
 * Vnode op for writing.
 */
static int
ffs_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	struct vnode *vp;
	struct uio *uio;
	struct inode *ip;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int seqcount;
	int blkoffset, error, flags, ioflag, size, xfersize;

	vp = ap->a_vp;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
	if (ap->a_ioflag & IO_EXT)
#ifdef notyet
		return (ffs_extwrite(vp, uio, ioflag, ap->a_cred));
#else
		panic("ffs_write+IO_EXT");
#endif

	seqcount = ap->a_ioflag >> IO_SEQSHIFT;
	ip = VTOI(vp);

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE)
		panic("ffs_write: mode");
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size)
			return (EPERM);
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("ffs_write: dir write");
		break;
	default:
		panic("ffs_write: type %p %d (%d,%d)", vp, (int)vp->v_type,
			(int)uio->uio_offset,
			(int)uio->uio_resid
		);
	}

	KASSERT(uio->uio_resid >= 0, ("ffs_write: uio->uio_resid < 0"));
	KASSERT(uio->uio_offset >= 0, ("ffs_write: uio->uio_offset < 0"));
	fs = ip->i_fs;
	if ((uoff_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize)
		return (EFBIG);
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	if (vn_rlimit_fsize(vp, uio, uio->uio_td))
		return (EFBIG);

	resid = uio->uio_resid;
	osize = ip->i_size;
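	/*
	 * Re-encode the sequential-write hint, capped at BA_SEQMAX, into
	 * the flags passed down to UFS_BALLOC().
	 */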
	if (seqcount > BA_SEQMAX)
		flags = BA_SEQMAX << BA_SEQSHIFT;
	else
		flags = seqcount << BA_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;
	flags |= BA_UNMAPPED;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
/* XXX is uio->uio_offset the right thing here? */
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0) {
			vnode_pager_setsize(vp, ip->i_size);
			break;
		}
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if ((ioflag & (IO_SYNC|IO_INVAL)) == (IO_SYNC|IO_INVAL))
			bp->b_flags |= B_NOCACHE;

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			DIP_SET(ip, i_size, ip->i_size);
		}

		size = blksize(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		if ((bp->b_flags & B_UNMAPPED) == 0) {
			error = vn_io_fault_uiomove((char *)bp->b_data +
			    blkoffset, (int)xfersize, uio);
		} else {
			error = vn_io_fault_pgmove(bp->b_pages, blkoffset,
			    (int)xfersize, uio);
		}
		/*
		 * If the buffer is not already filled and we encounter an
		 * error while trying to fill it, we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland mmap.
		 *
		 * Note that we need only clear buffers with a transfer size
		 * equal to the block size because buffers with a shorter
		 * transfer size were cleared above by the call to UFS_BALLOC()
		 * with the BA_CLRBUF flag set.
		 *
		 * If the source region for uiomove identically mmaps the
		 * buffer, uiomove() performed the NOP copy, and the buffer
		 * content remains valid because the page fault handler
		 * validated the pages.
		 */
		if (error != 0 && (bp->b_flags & B_CACHE) == 0 &&
		    fs->fs_bsize == xfersize)
			vfs_bio_clrbuf(bp);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		   (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(vp, bp, ip->i_size, seqcount,
				    GB_UNMAPPED);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid &&
	    ap->a_cred) {
		if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			DIP_SET(ip, i_mode, ip->i_mode);
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_NORMAL | (ioflag & IO_SYNC), ap->a_cred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * get page routine
 */
static int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	int i;
	vm_page_t mreq;
	int pcount;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];

	/*
	 * If ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	VM_OBJECT_WLOCK(mreq->object);
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_lock(ap->a_m[i]);
				vm_page_free(ap->a_m[i]);
				vm_page_unlock(ap->a_m[i]);
			}
		}
		VM_OBJECT_WUNLOCK(mreq->object);
		return (VM_PAGER_OK);
	}
	VM_OBJECT_WUNLOCK(mreq->object);

	return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
	    ap->a_count, ap->a_reqpage));
}

/*
 * Extended attribute area reading.
 */
static int
ffs_extread(struct vnode *vp, struct uio *uio, int ioflag)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	ssize_t orig_resid;
	int error;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_READ || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extread: mode");
#endif
	orig_resid = uio->uio_resid;
	KASSERT(orig_resid >= 0, ("ffs_extread: uio->uio_resid < 0"));
	if (orig_resid == 0)
		return (0);
	KASSERT(uio->uio_offset >= 0, ("ffs_extread: uio->uio_offset < 0"));

	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = dp->di_extsize - uio->uio_offset) <= 0)
			break;
		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = sblksize(fs, dp->di_extsize, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block less the amount of the data before
		 * our startpoint (duh!)
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

		if (lblktosize(fs, nextlbn) >= dp->di_extsize) {
			/*
			 * Don't do readahead if this is the end of the info.
			 */
			error = bread(vp, -1 - lbn, size, NOCRED, &bp);
		} else {
			/*
			 * If we have a second block, then
			 * fire off a request for a readahead
			 * as well as a read. Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			u_int nextsize = sblksize(fs, dp->di_extsize, nextlbn);

			nextlbn = -1 - nextlbn;
			error = breadn(vp, -1 - lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 */
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

		error = uiomove((char *)bp->b_data + blkoffset,
		    (int)xfersize, uio);
		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		   (LIST_EMPTY(&bp->b_dep))) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf, mark it available
			 * for freeing.  For non-direct VMIO reads, the VM
			 * has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever made the request take
			 * care of freeing it. We just queue it onto
			 * another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * This can only happen in the case of an error, because the loop
	 * above resets bp to NULL on each iteration and on normal
	 * completion has not set a new value into it, so it must have
	 * come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		   (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}
	return (error);
}

/*
 * Extended attribute area writing.
 */
static int
ffs_extwrite(struct vnode *vp, struct uio *uio, int ioflag, struct ucred *ucred)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct buf *bp;
	ufs_lbn_t lbn;
	off_t osize;
	ssize_t resid;
	int blkoffset, error, flags, size, xfersize;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;

#ifdef INVARIANTS
	if (uio->uio_rw != UIO_WRITE || fs->fs_magic != FS_UFS2_MAGIC)
		panic("ffs_extwrite: mode");
#endif

	if (ioflag & IO_APPEND)
		uio->uio_offset = dp->di_extsize;
	KASSERT(uio->uio_offset >= 0, ("ffs_extwrite: uio->uio_offset < 0"));
	KASSERT(uio->uio_resid >= 0, ("ffs_extwrite: uio->uio_resid < 0"));
	if ((uoff_t)uio->uio_offset + uio->uio_resid > NXADDR * fs->fs_bsize)
		return (EFBIG);

	resid = uio->uio_resid;
	osize = dp->di_extsize;
	flags = IO_EXT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= IO_SYNC;

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

		/*
		 * We must perform a read-before-write if the transfer size
		 * does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= BA_CLRBUF;
		else
			flags &= ~BA_CLRBUF;
		error = UFS_BALLOC(vp, uio->uio_offset, xfersize,
		    ucred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid we have to clear out any
		 * garbage data from the pages instantiated for the buffer.
		 * If we do not, a failed uiomove() during a write can leave
		 * the prior contents of the pages exposed to a userland
		 * mmap().  XXX deal with uiomove() errors a better way.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		if (uio->uio_offset + xfersize > dp->di_extsize)
			dp->di_extsize = uio->uio_offset + xfersize;

		size = sblksize(fs, dp->di_extsize, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		   (LIST_EMPTY(&bp->b_dep))) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC each buffer is written synchronously.  Otherwise
		 * if we have a severe page deficiency write the buffer
		 * asynchronously.  Otherwise try to cluster, and if that
		 * doesn't do it then either do an async write (if O_DIRECT),
		 * or a delayed write (if not).
		 */
		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
			    buf_dirty_count_severe() ||
			    xfersize + blkoffset == fs->fs_bsize ||
			    (ioflag & (IO_ASYNC | IO_DIRECT)))
			bawrite(bp);
		else
			bdwrite(bp);
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE;
	}
	/*
	 * If we successfully wrote any data, and we are not the superuser
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if ((ip->i_mode & (ISUID | ISGID)) && resid > uio->uio_resid && ucred) {
		if (priv_check_cred(ucred, PRIV_VFS_RETAINSUGID, 0)) {
			ip->i_mode &= ~(ISUID | ISGID);
			dp->di_mode = ip->i_mode;
		}
	}
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)ffs_truncate(vp, osize,
			    IO_EXT | (ioflag & IO_SYNC), ucred);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = ffs_update(vp, 1);
	return (error);
}

/*
 * Locate a particular EA (nspace:name) in the area (ptr:length), and return
 * the length of the EA, and possibly the pointer to the entry and to the data.
 */
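/*
 * An EA area record, as constructed by ffs_setextattr() below, is laid
 * out as:
 *
 *	length		4 bytes, covering the entire record
 *	namespace	1 byte
 *	content pad	1 byte (eapad2)
 *	name length	1 byte
 *	name		name length bytes
 *	pad1		pads the header out to an 8 byte boundary
 *	content		the attribute data itself
 *	pad2		pads the content out to an 8 byte boundary
 */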
static int
ffs_findextattr(u_char *ptr, u_int length, int nspace, const char *name,
    u_char **eap, u_char **eac)
{
	u_char *p, *pe, *pn, *p0;
	int eapad1, eapad2, ealength, ealen, nlen;
	uint32_t ul;

	pe = ptr + length;
	nlen = strlen(name);

	for (p = ptr; p < pe; p = pn) {
		p0 = p;
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		/* make sure this entry is complete */
		if (pn > pe)
			break;
		p += sizeof(uint32_t);
		if (*p != nspace)
			continue;
		p++;
		eapad2 = *p++;
		if (*p != nlen)
			continue;
		p++;
		if (bcmp(p, name, nlen))
			continue;
		ealength = sizeof(uint32_t) + 3 + nlen;
		eapad1 = 8 - (ealength % 8);
		if (eapad1 == 8)
			eapad1 = 0;
		ealength += eapad1;
		ealen = ul - ealength - eapad2;
		p += nlen + eapad1;
		if (eap != NULL)
			*eap = p0;
		if (eac != NULL)
			*eac = p;
		return (ealen);
	}
	return (-1);
}

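/*
 * Read the extended attribute area into a freshly allocated buffer,
 * reserving "extra" bytes at the end for the caller; on success *p
 * points at the buffer, which the caller must eventually free.
 */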
static int
ffs_rdextattr(u_char **p, struct vnode *vp, struct thread *td, int extra)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	struct fs *fs;
	struct uio luio;
	struct iovec liovec;
	u_int easize;
	int error;
	u_char *eae;

	ip = VTOI(vp);
	fs = ip->i_fs;
	dp = ip->i_din2;
	easize = dp->di_extsize;
	if ((uoff_t)easize + extra > NXADDR * fs->fs_bsize)
		return (EFBIG);

	eae = malloc(easize + extra, M_TEMP, M_WAITOK);

	liovec.iov_base = eae;
	liovec.iov_len = easize;
	luio.uio_iov = &liovec;
	luio.uio_iovcnt = 1;
	luio.uio_offset = 0;
	luio.uio_resid = easize;
	luio.uio_segflg = UIO_SYSSPACE;
	luio.uio_rw = UIO_READ;
	luio.uio_td = td;

	error = ffs_extread(vp, &luio, IO_EXT | IO_SYNC);
	if (error) {
		free(eae, M_TEMP);
		return (error);
	}
	*p = eae;
	return (0);
}

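/*
 * The IN_EA_LOCKED flag serializes access to the in-core EA area
 * (i_ea_area, i_ea_len, i_ea_refs); waiters sleep on i_ea_refs until
 * the current holder drops the lock.
 */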
static void
ffs_lock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	while (ip->i_flag & IN_EA_LOCKED) {
		ip->i_flag |= IN_EA_LOCKWAIT;
		msleep(&ip->i_ea_refs, &vp->v_interlock, PINOD + 2, "ufs_ea",
		    0);
	}
	ip->i_flag |= IN_EA_LOCKED;
	VI_UNLOCK(vp);
}

static void
ffs_unlock_ea(struct vnode *vp)
{
	struct inode *ip;

	ip = VTOI(vp);
	VI_LOCK(vp);
	if (ip->i_flag & IN_EA_LOCKWAIT)
		wakeup(&ip->i_ea_refs);
	ip->i_flag &= ~(IN_EA_LOCKED | IN_EA_LOCKWAIT);
	VI_UNLOCK(vp);
}

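/*
 * Start an EA transaction: read the EA area into memory on first use,
 * otherwise just take another reference on the cached copy.
 */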
static int
ffs_open_ea(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct inode *ip;
	struct ufs2_dinode *dp;
	int error;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area != NULL) {
		ip->i_ea_refs++;
		ffs_unlock_ea(vp);
		return (0);
	}
	dp = ip->i_din2;
	error = ffs_rdextattr(&ip->i_ea_area, vp, td, 0);
	if (error) {
		ffs_unlock_ea(vp);
		return (error);
	}
	ip->i_ea_len = dp->di_extsize;
	ip->i_ea_error = 0;
	ip->i_ea_refs++;
	ffs_unlock_ea(vp);
	return (0);
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_close_ea(struct vnode *vp, int commit, struct ucred *cred,
    struct thread *td)
{
	struct inode *ip;
	struct uio luio;
	struct iovec liovec;
	int error;
	struct ufs2_dinode *dp;

	ip = VTOI(vp);

	ffs_lock_ea(vp);
	if (ip->i_ea_area == NULL) {
		ffs_unlock_ea(vp);
		return (EINVAL);
	}
	dp = ip->i_din2;
	error = ip->i_ea_error;
	if (commit && error == 0) {
		ASSERT_VOP_ELOCKED(vp, "ffs_close_ea commit");
		if (cred == NOCRED)
			cred = vp->v_mount->mnt_cred;
		liovec.iov_base = ip->i_ea_area;
		liovec.iov_len = ip->i_ea_len;
		luio.uio_iov = &liovec;
		luio.uio_iovcnt = 1;
		luio.uio_offset = 0;
		luio.uio_resid = ip->i_ea_len;
		luio.uio_segflg = UIO_SYSSPACE;
		luio.uio_rw = UIO_WRITE;
		luio.uio_td = td;
		/* XXX: I'm not happy about truncating to zero size */
		if (ip->i_ea_len < dp->di_extsize)
			error = ffs_truncate(vp, 0, IO_EXT, cred);
		error = ffs_extwrite(vp, &luio, IO_EXT | IO_SYNC, cred);
	}
	if (--ip->i_ea_refs == 0) {
		free(ip->i_ea_area, M_TEMP);
		ip->i_ea_area = NULL;
		ip->i_ea_len = 0;
		ip->i_ea_error = 0;
	}
	ffs_unlock_ea(vp);
	return (error);
}

/*
 * Vnode extattr strategy routine for fifos.
 *
 * We need to check for a read or write of the external attributes.
 * Otherwise we just fall through and do the usual thing.
 */
static int
ffsext_strategy(struct vop_strategy_args *ap)
/*
struct vop_strategy_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	struct buf *a_bp;
};
*/
{
	struct vnode *vp;
	daddr_t lbn;

	vp = ap->a_vp;
	lbn = ap->a_bp->b_lblkno;
	if (VTOI(vp)->i_fs->fs_magic == FS_UFS2_MAGIC &&
	    lbn < 0 && lbn >= -NXADDR)
		return (VOP_STRATEGY_APV(&ufs_vnodeops, ap));
	if (vp->v_type == VFIFO)
		return (VOP_STRATEGY_APV(&ufs_fifoops, ap));
	panic("spec nodes went here");
}

/*
 * Vnode extattr transaction open.
 */
static int
ffs_openextattr(struct vop_openextattr_args *ap)
/*
struct vop_openextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	return (ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td));
}

/*
 * Vnode extattr transaction commit/abort
 */
static int
ffs_closeextattr(struct vop_closeextattr_args *ap)
/*
struct vop_closeextattr_args {
	struct vnodeop_desc *a_desc;
	struct vnode *a_vp;
	int a_commit;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (ap->a_commit && (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	return (ffs_close_ea(ap->a_vp, ap->a_commit, ap->a_cred, ap->a_td));
}

/*
 * Vnode operation to remove a named attribute.
 */
static int
ffs_deleteextattr(struct vop_deleteextattr_args *ap)
/*
vop_deleteextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	int ealen, olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = eapad1 = ealen = eapad2 = 0;

	eae = malloc(ip->i_ea_len, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    &p, NULL);
	if (olen == -1) {
		/* delete but nonexistent */
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		return (ENOATTR);
	}
	bcopy(p, &ul, sizeof ul);
	i = p - eae + ul;
	if (ul != ealength) {
		bcopy(p + ul, p + ealength, easize - i);
		easize += (ealength - ul);
	}
	if (easize > NXADDR * fs->fs_bsize) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve a named extended attribute.
 */
static int
ffs_getextattr(struct vop_getextattr_args *ap)
/*
vop_getextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	u_char *eae, *p;
	unsigned easize;
	int error, ealen;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	ealen = ffs_findextattr(eae, easize, ap->a_attrnamespace, ap->a_name,
	    NULL, &p);
	if (ealen >= 0) {
		error = 0;
		if (ap->a_size != NULL)
			*ap->a_size = ealen;
		else if (ap->a_uio != NULL)
			error = uiomove(p, ealen, ap->a_uio);
	} else
		error = ENOATTR;

	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to retrieve extended attributes on a vnode.
 */
static int
ffs_listextattr(struct vop_listextattr_args *ap)
/*
vop_listextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	INOUT struct uio *a_uio;
	OUT size_t *a_size;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	u_char *eae, *p, *pe, *pn;
	unsigned easize;
	uint32_t ul;
	int error, ealen;

	ip = VTOI(ap->a_vp);

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VREAD);
	if (error)
		return (error);

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);
	eae = ip->i_ea_area;
	easize = ip->i_ea_len;

	error = 0;
	if (ap->a_size != NULL)
		*ap->a_size = 0;
	pe = eae + easize;
	for (p = eae; error == 0 && p < pe; p = pn) {
		bcopy(p, &ul, sizeof(ul));
		pn = p + ul;
		if (pn > pe)
			break;
		p += sizeof(ul);
		if (*p++ != ap->a_attrnamespace)
			continue;
		p++;	/* pad2 */
		ealen = *p;
		if (ap->a_size != NULL) {
			*ap->a_size += ealen + 1;
		} else if (ap->a_uio != NULL) {
			error = uiomove(p, ealen + 1, ap->a_uio);
		}
	}
	ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode operation to set a named attribute.
 */
static int
ffs_setextattr(struct vop_setextattr_args *ap)
/*
vop_setextattr {
	IN struct vnode *a_vp;
	IN int a_attrnamespace;
	IN const char *a_name;
	INOUT struct uio *a_uio;
	IN struct ucred *a_cred;
	IN struct thread *a_td;
};
*/
{
	struct inode *ip;
	struct fs *fs;
	uint32_t ealength, ul;
	ssize_t ealen;
	int olen, eapad1, eapad2, error, i, easize;
	u_char *eae, *p;

	ip = VTOI(ap->a_vp);
	fs = ip->i_fs;

	if (ap->a_vp->v_type == VCHR || ap->a_vp->v_type == VBLK)
		return (EOPNOTSUPP);

	if (strlen(ap->a_name) == 0)
		return (EINVAL);

	/* XXX Now unsupported API to delete EAs using NULL uio. */
	if (ap->a_uio == NULL)
		return (EOPNOTSUPP);

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);

	ealen = ap->a_uio->uio_resid;
	if (ealen < 0 || ealen > lblktosize(fs, NXADDR))
		return (EINVAL);

	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
	    ap->a_cred, ap->a_td, VWRITE);
	if (error) {
		/*
		 * ffs_lock_ea is not needed here, because the vnode
		 * must be exclusively locked.
		 */
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}

	error = ffs_open_ea(ap->a_vp, ap->a_cred, ap->a_td);
	if (error)
		return (error);

	ealength = sizeof(uint32_t) + 3 + strlen(ap->a_name);
	eapad1 = 8 - (ealength % 8);
	if (eapad1 == 8)
		eapad1 = 0;
	eapad2 = 8 - (ealen % 8);
	if (eapad2 == 8)
		eapad2 = 0;
	ealength += eapad1 + ealen + eapad2;

	eae = malloc(ip->i_ea_len + ealength, M_TEMP, M_WAITOK);
	bcopy(ip->i_ea_area, eae, ip->i_ea_len);
	easize = ip->i_ea_len;

	olen = ffs_findextattr(eae, easize,
	    ap->a_attrnamespace, ap->a_name, &p, NULL);
	if (olen == -1) {
		/* new, append at end */
		p = eae + easize;
		easize += ealength;
	} else {
		bcopy(p, &ul, sizeof ul);
		i = p - eae + ul;
		if (ul != ealength) {
			bcopy(p + ul, p + ealength, easize - i);
			easize += (ealength - ul);
		}
	}
	if (easize > lblktosize(fs, NXADDR)) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = ENOSPC;
		return (ENOSPC);
	}
	bcopy(&ealength, p, sizeof(ealength));
	p += sizeof(ealength);
	*p++ = ap->a_attrnamespace;
	*p++ = eapad2;
	*p++ = strlen(ap->a_name);
	strcpy(p, ap->a_name);
	p += strlen(ap->a_name);
	bzero(p, eapad1);
	p += eapad1;
	error = uiomove(p, ealen, ap->a_uio);
	if (error) {
		free(eae, M_TEMP);
		ffs_close_ea(ap->a_vp, 0, ap->a_cred, ap->a_td);
		if (ip->i_ea_area != NULL && ip->i_ea_error == 0)
			ip->i_ea_error = error;
		return (error);
	}
	p += ealen;
	bzero(p, eapad2);

	p = ip->i_ea_area;
	ip->i_ea_area = eae;
	ip->i_ea_len = easize;
	free(p, M_TEMP);
	error = ffs_close_ea(ap->a_vp, 1, ap->a_cred, ap->a_td);
	return (error);
}

/*
 * Vnode pointer to File handle
 */
static int
ffs_vptofh(struct vop_vptofh_args *ap)
/*
vop_vptofh {
	IN struct vnode *a_vp;
	IN struct fid *a_fhp;
};
*/
{
	struct inode *ip;
	struct ufid *ufhp;

	ip = VTOI(ap->a_vp);
	ufhp = (struct ufid *)ap->a_fhp;
	ufhp->ufid_len = sizeof(struct ufid);
	ufhp->ufid_ino = ip->i_number;
	ufhp->ufid_gen = ip->i_gen;
	return (0);
}