/*-
 * Copyright (c) 2002 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Marshall
 * Kirk McKusick and Network Associates Laboratories, the Security
 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
 * research program
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
 */
61
#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_balloc.c 304667 2016-08-23 07:51:00Z kib $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/vnode.h>

#include <ufs/ufs/quota.h>
#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufs_extern.h>
#include <ufs/ufs/extattr.h>
#include <ufs/ufs/ufsmount.h>

#include <ufs/ffs/fs.h>
#include <ufs/ffs/ffs_extern.h>
81
82/*
83 * Balloc defines the structure of filesystem storage
84 * by allocating the physical blocks on a device given
85 * the inode and the logical block number in a file.
86 * This is the allocation strategy for UFS1. Below is
87 * the allocation strategy for UFS2.
88 */
89int
90ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
91    struct ucred *cred, int flags, struct buf **bpp)
92{
93	struct inode *ip;
94	struct ufs1_dinode *dp;
95	ufs_lbn_t lbn, lastlbn;
96	struct fs *fs;
97	ufs1_daddr_t nb;
98	struct buf *bp, *nbp;
99	struct ufsmount *ump;
100	struct indir indirs[NIADDR + 2];
101	int deallocated, osize, nsize, num, i, error;
102	ufs2_daddr_t newb;
103	ufs1_daddr_t *bap, pref;
104	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
105	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
106	int unwindidx = -1;
107	int saved_inbdflush;
108	static struct timeval lastfail;
109	static int curfail;
110	int gbflags, reclaimed;
111
112	ip = VTOI(vp);
113	dp = ip->i_din1;
114	fs = ip->i_fs;
115	ump = ip->i_ump;
116	lbn = lblkno(fs, startoffset);
117	size = blkoff(fs, startoffset) + size;
118	reclaimed = 0;
119	if (size > fs->fs_bsize)
120		panic("ffs_balloc_ufs1: blk too big");
121	*bpp = NULL;
122	if (flags & IO_EXT)
123		return (EOPNOTSUPP);
124	if (lbn < 0)
125		return (EFBIG);
126	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
127
128	if (DOINGSOFTDEP(vp))
129		softdep_prealloc(vp, MNT_WAIT);
130	/*
131	 * If the next write will extend the file into a new block,
132	 * and the file is currently composed of a fragment
133	 * this fragment has to be extended to be a full block.
134	 */
135	lastlbn = lblkno(fs, ip->i_size);
136	if (lastlbn < NDADDR && lastlbn < lbn) {
137		nb = lastlbn;
138		osize = blksize(fs, ip, nb);
139		if (osize < fs->fs_bsize && osize > 0) {
140			UFS_LOCK(ump);
141			error = ffs_realloccg(ip, nb, dp->di_db[nb],
142			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
143			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
144			   cred, &bp);
145			if (error)
146				return (error);
147			if (DOINGSOFTDEP(vp))
148				softdep_setup_allocdirect(ip, nb,
149				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
150				    fs->fs_bsize, osize, bp);
151			ip->i_size = smalllblktosize(fs, nb + 1);
152			dp->di_size = ip->i_size;
153			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
154			ip->i_flag |= IN_CHANGE | IN_UPDATE;
155			if (flags & IO_SYNC)
156				bwrite(bp);
157			else
158				bawrite(bp);
159		}
160	}
161	/*
162	 * The first NDADDR blocks are direct blocks
163	 */
164	if (lbn < NDADDR) {
165		if (flags & BA_METAONLY)
166			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
167		nb = dp->di_db[lbn];
168		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
169			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
170			if (error) {
171				brelse(bp);
172				return (error);
173			}
174			bp->b_blkno = fsbtodb(fs, nb);
175			*bpp = bp;
176			return (0);
177		}
178		if (nb != 0) {
179			/*
180			 * Consider need to reallocate a fragment.
181			 */
182			osize = fragroundup(fs, blkoff(fs, ip->i_size));
183			nsize = fragroundup(fs, size);
184			if (nsize <= osize) {
185				error = bread(vp, lbn, osize, NOCRED, &bp);
186				if (error) {
187					brelse(bp);
188					return (error);
189				}
190				bp->b_blkno = fsbtodb(fs, nb);
191			} else {
192				UFS_LOCK(ump);
193				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
194				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
195				    &dp->di_db[0]), osize, nsize, flags,
196				    cred, &bp);
197				if (error)
198					return (error);
199				if (DOINGSOFTDEP(vp))
200					softdep_setup_allocdirect(ip, lbn,
201					    dbtofsb(fs, bp->b_blkno), nb,
202					    nsize, osize, bp);
203			}
204		} else {
205			if (ip->i_size < smalllblktosize(fs, lbn + 1))
206				nsize = fragroundup(fs, size);
207			else
208				nsize = fs->fs_bsize;
209			UFS_LOCK(ump);
210			error = ffs_alloc(ip, lbn,
211			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
212			    nsize, flags, cred, &newb);
213			if (error)
214				return (error);
215			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
216			bp->b_blkno = fsbtodb(fs, newb);
217			if (flags & BA_CLRBUF)
218				vfs_bio_clrbuf(bp);
219			if (DOINGSOFTDEP(vp))
220				softdep_setup_allocdirect(ip, lbn, newb, 0,
221				    nsize, 0, bp);
222		}
223		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
224		ip->i_flag |= IN_CHANGE | IN_UPDATE;
225		*bpp = bp;
226		return (0);
227	}
228	/*
229	 * Determine the number of levels of indirection.
230	 */
231	pref = 0;
232	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
233		return(error);
234#ifdef INVARIANTS
235	if (num < 1)
236		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
237#endif
238	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
239	/*
240	 * Fetch the first indirect block allocating if necessary.
241	 */
242	--num;
243	nb = dp->di_ib[indirs[0].in_off];
244	allocib = NULL;
245	allocblk = allociblk;
246	lbns_remfree = lbns;
247	if (nb == 0) {
248		UFS_LOCK(ump);
249		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
250		    (ufs1_daddr_t *)0);
251		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
252		    flags, cred, &newb)) != 0) {
253			curthread_pflags_restore(saved_inbdflush);
254			return (error);
255		}
256		pref = newb + fs->fs_frag;
257		nb = newb;
258		MPASS(allocblk < allociblk + nitems(allociblk));
259		MPASS(lbns_remfree < lbns + nitems(lbns));
260		*allocblk++ = nb;
261		*lbns_remfree++ = indirs[1].in_lbn;
262		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
263		bp->b_blkno = fsbtodb(fs, nb);
264		vfs_bio_clrbuf(bp);
265		if (DOINGSOFTDEP(vp)) {
266			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
267			    newb, 0, fs->fs_bsize, 0, bp);
268			bdwrite(bp);
269		} else {
270			/*
271			 * Write synchronously so that indirect blocks
272			 * never point at garbage.
273			 */
274			if (DOINGASYNC(vp))
275				bdwrite(bp);
276			else if ((error = bwrite(bp)) != 0)
277				goto fail;
278		}
279		allocib = &dp->di_ib[indirs[0].in_off];
280		*allocib = nb;
281		ip->i_flag |= IN_CHANGE | IN_UPDATE;
282	}
283	/*
284	 * Fetch through the indirect blocks, allocating as necessary.
285	 */
286retry:
287	for (i = 1;;) {
288		error = bread(vp,
289		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
290		if (error) {
291			brelse(bp);
292			goto fail;
293		}
294		bap = (ufs1_daddr_t *)bp->b_data;
295		nb = bap[indirs[i].in_off];
296		if (i == num)
297			break;
298		i += 1;
299		if (nb != 0) {
300			bqrelse(bp);
301			continue;
302		}
303		UFS_LOCK(ump);
304		/*
305		 * If parent indirect has just been allocated, try to cluster
306		 * immediately following it.
307		 */
308		if (pref == 0)
309			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
310			    (ufs1_daddr_t *)0);
311		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
312		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
313			brelse(bp);
314			if (++reclaimed == 1) {
315				UFS_LOCK(ump);
316				softdep_request_cleanup(fs, vp, cred,
317				    FLUSH_BLOCKS_WAIT);
318				UFS_UNLOCK(ump);
319				goto retry;
320			}
321			if (ppsratecheck(&lastfail, &curfail, 1)) {
322				ffs_fserr(fs, ip->i_number, "filesystem full");
323				uprintf("\n%s: write failed, filesystem "
324				    "is full\n", fs->fs_fsmnt);
325			}
326			goto fail;
327		}
328		pref = newb + fs->fs_frag;
329		nb = newb;
330		MPASS(allocblk < allociblk + nitems(allociblk));
331		MPASS(lbns_remfree < lbns + nitems(lbns));
332		*allocblk++ = nb;
333		*lbns_remfree++ = indirs[i].in_lbn;
334		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
335		nbp->b_blkno = fsbtodb(fs, nb);
336		vfs_bio_clrbuf(nbp);
337		if (DOINGSOFTDEP(vp)) {
338			softdep_setup_allocindir_meta(nbp, ip, bp,
339			    indirs[i - 1].in_off, nb);
340			bdwrite(nbp);
341		} else {
342			/*
343			 * Write synchronously so that indirect blocks
344			 * never point at garbage.
345			 */
346			if ((error = bwrite(nbp)) != 0) {
347				brelse(bp);
348				goto fail;
349			}
350		}
351		bap[indirs[i - 1].in_off] = nb;
352		if (allocib == NULL && unwindidx < 0)
353			unwindidx = i - 1;
354		/*
355		 * If required, write synchronously, otherwise use
356		 * delayed write.
357		 */
358		if (flags & IO_SYNC) {
359			bwrite(bp);
360		} else {
361			if (bp->b_bufsize == fs->fs_bsize)
362				bp->b_flags |= B_CLUSTEROK;
363			bdwrite(bp);
364		}
365	}
366	/*
367	 * If asked only for the indirect block, then return it.
368	 */
369	if (flags & BA_METAONLY) {
370		curthread_pflags_restore(saved_inbdflush);
371		*bpp = bp;
372		return (0);
373	}
374	/*
375	 * Get the data block, allocating if necessary.
376	 */
377	if (nb == 0) {
378		UFS_LOCK(ump);
379		/*
380		 * If allocating metadata at the front of the cylinder
381		 * group and parent indirect block has just been allocated,
382		 * then cluster next to it if it is the first indirect in
383		 * the file. Otherwise it has been allocated in the metadata
384		 * area, so we want to find our own place out in the data area.
385		 */
386		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
387			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
388			    &bap[0]);
389		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
390		    flags | IO_BUFLOCKED, cred, &newb);
391		if (error) {
392			brelse(bp);
393			if (++reclaimed == 1) {
394				UFS_LOCK(ump);
395				softdep_request_cleanup(fs, vp, cred,
396				    FLUSH_BLOCKS_WAIT);
397				UFS_UNLOCK(ump);
398				goto retry;
399			}
400			if (ppsratecheck(&lastfail, &curfail, 1)) {
401				ffs_fserr(fs, ip->i_number, "filesystem full");
402				uprintf("\n%s: write failed, filesystem "
403				    "is full\n", fs->fs_fsmnt);
404			}
405			goto fail;
406		}
407		nb = newb;
408		MPASS(allocblk < allociblk + nitems(allociblk));
409		MPASS(lbns_remfree < lbns + nitems(lbns));
410		*allocblk++ = nb;
411		*lbns_remfree++ = lbn;
412		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
413		nbp->b_blkno = fsbtodb(fs, nb);
414		if (flags & BA_CLRBUF)
415			vfs_bio_clrbuf(nbp);
416		if (DOINGSOFTDEP(vp))
417			softdep_setup_allocindir_page(ip, lbn, bp,
418			    indirs[i].in_off, nb, 0, nbp);
419		bap[indirs[i].in_off] = nb;
420		/*
421		 * If required, write synchronously, otherwise use
422		 * delayed write.
423		 */
424		if (flags & IO_SYNC) {
425			bwrite(bp);
426		} else {
427			if (bp->b_bufsize == fs->fs_bsize)
428				bp->b_flags |= B_CLUSTEROK;
429			bdwrite(bp);
430		}
431		curthread_pflags_restore(saved_inbdflush);
432		*bpp = nbp;
433		return (0);
434	}
435	brelse(bp);
436	if (flags & BA_CLRBUF) {
437		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
438		if (seqcount != 0 &&
439		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
440		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
441			error = cluster_read(vp, ip->i_size, lbn,
442			    (int)fs->fs_bsize, NOCRED,
443			    MAXBSIZE, seqcount, gbflags, &nbp);
444		} else {
445			error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
446			    gbflags, &nbp);
447		}
448		if (error) {
449			brelse(nbp);
450			goto fail;
451		}
452	} else {
453		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
454		nbp->b_blkno = fsbtodb(fs, nb);
455	}
456	curthread_pflags_restore(saved_inbdflush);
457	*bpp = nbp;
458	return (0);
459fail:
460	curthread_pflags_restore(saved_inbdflush);
461	/*
462	 * If we have failed to allocate any blocks, simply return the error.
463	 * This is the usual case and avoids the need to fsync the file.
464	 */
465	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
466		return (error);
467	/*
468	 * If we have failed part way through block allocation, we
469	 * have to deallocate any indirect blocks that we have allocated.
470	 * We have to fsync the file before we start to get rid of all
471	 * of its dependencies so that we do not leave them dangling.
472	 * We have to sync it at the end so that the soft updates code
473	 * does not find any untracked changes. Although this is really
474	 * slow, running out of disk space is not expected to be a common
475	 * occurrence. The error return from fsync is ignored as we already
476	 * have an error to return to the user.
477	 *
478	 * XXX Still have to journal the free below
479	 */
480	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
481	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
482	     blkp < allocblk; blkp++, lbns_remfree++) {
483		/*
484		 * We shall not leave the freed blocks on the vnode
485		 * buffer object lists.
486		 */
487		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
488		if (bp != NULL) {
489			bp->b_flags |= (B_INVAL | B_RELBUF);
490			bp->b_flags &= ~B_ASYNC;
491			brelse(bp);
492		}
493		deallocated += fs->fs_bsize;
494	}
495	if (allocib != NULL) {
496		*allocib = 0;
497	} else if (unwindidx >= 0) {
498		int r;
499
500		r = bread(vp, indirs[unwindidx].in_lbn,
501		    (int)fs->fs_bsize, NOCRED, &bp);
502		if (r) {
503			panic("Could not unwind indirect block, error %d", r);
504			brelse(bp);
505		} else {
506			bap = (ufs1_daddr_t *)bp->b_data;
507			bap[indirs[unwindidx].in_off] = 0;
508			if (flags & IO_SYNC) {
509				bwrite(bp);
510			} else {
511				if (bp->b_bufsize == fs->fs_bsize)
512					bp->b_flags |= B_CLUSTEROK;
513				bdwrite(bp);
514			}
515		}
516	}
517	if (deallocated) {
518#ifdef QUOTA
519		/*
520		 * Restore user's disk quota because allocation failed.
521		 */
522		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
523#endif
524		dp->di_blocks -= btodb(deallocated);
525		ip->i_flag |= IN_CHANGE | IN_UPDATE;
526	}
527	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
528	/*
529	 * After the buffers are invalidated and on-disk pointers are
530	 * cleared, free the blocks.
531	 */
532	for (blkp = allociblk; blkp < allocblk; blkp++) {
533		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
534		    ip->i_number, vp->v_type, NULL);
535	}
536	return (error);
537}
538
539/*
540 * Balloc defines the structure of file system storage
541 * by allocating the physical blocks on a device given
542 * the inode and the logical block number in a file.
543 * This is the allocation strategy for UFS2. Above is
544 * the allocation strategy for UFS1.
545 */
546int
547ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
548    struct ucred *cred, int flags, struct buf **bpp)
549{
550	struct inode *ip;
551	struct ufs2_dinode *dp;
552	ufs_lbn_t lbn, lastlbn;
553	struct fs *fs;
554	struct buf *bp, *nbp;
555	struct ufsmount *ump;
556	struct indir indirs[NIADDR + 2];
557	ufs2_daddr_t nb, newb, *bap, pref;
558	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
559	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
560	int deallocated, osize, nsize, num, i, error;
561	int unwindidx = -1;
562	int saved_inbdflush;
563	static struct timeval lastfail;
564	static int curfail;
565	int gbflags, reclaimed;
566
567	ip = VTOI(vp);
568	dp = ip->i_din2;
569	fs = ip->i_fs;
570	ump = ip->i_ump;
571	lbn = lblkno(fs, startoffset);
572	size = blkoff(fs, startoffset) + size;
573	reclaimed = 0;
574	if (size > fs->fs_bsize)
575		panic("ffs_balloc_ufs2: blk too big");
576	*bpp = NULL;
577	if (lbn < 0)
578		return (EFBIG);
579	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
580
581	if (DOINGSOFTDEP(vp))
582		softdep_prealloc(vp, MNT_WAIT);
583
584	/*
585	 * Check for allocating external data.
586	 */
587	if (flags & IO_EXT) {
588		if (lbn >= NXADDR)
589			return (EFBIG);
590		/*
591		 * If the next write will extend the data into a new block,
592		 * and the data is currently composed of a fragment
593		 * this fragment has to be extended to be a full block.
594		 */
595		lastlbn = lblkno(fs, dp->di_extsize);
596		if (lastlbn < lbn) {
597			nb = lastlbn;
598			osize = sblksize(fs, dp->di_extsize, nb);
599			if (osize < fs->fs_bsize && osize > 0) {
600				UFS_LOCK(ump);
601				error = ffs_realloccg(ip, -1 - nb,
602				    dp->di_extb[nb],
603				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
604				    &dp->di_extb[0]), osize,
605				    (int)fs->fs_bsize, flags, cred, &bp);
606				if (error)
607					return (error);
608				if (DOINGSOFTDEP(vp))
609					softdep_setup_allocext(ip, nb,
610					    dbtofsb(fs, bp->b_blkno),
611					    dp->di_extb[nb],
612					    fs->fs_bsize, osize, bp);
613				dp->di_extsize = smalllblktosize(fs, nb + 1);
614				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
615				bp->b_xflags |= BX_ALTDATA;
616				ip->i_flag |= IN_CHANGE;
617				if (flags & IO_SYNC)
618					bwrite(bp);
619				else
620					bawrite(bp);
621			}
622		}
623		/*
624		 * All blocks are direct blocks
625		 */
626		if (flags & BA_METAONLY)
627			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
628		nb = dp->di_extb[lbn];
629		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
630			error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
631			    gbflags, &bp);
632			if (error) {
633				brelse(bp);
634				return (error);
635			}
636			bp->b_blkno = fsbtodb(fs, nb);
637			bp->b_xflags |= BX_ALTDATA;
638			*bpp = bp;
639			return (0);
640		}
641		if (nb != 0) {
642			/*
643			 * Consider need to reallocate a fragment.
644			 */
645			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
646			nsize = fragroundup(fs, size);
647			if (nsize <= osize) {
648				error = bread_gb(vp, -1 - lbn, osize, NOCRED,
649				    gbflags, &bp);
650				if (error) {
651					brelse(bp);
652					return (error);
653				}
654				bp->b_blkno = fsbtodb(fs, nb);
655				bp->b_xflags |= BX_ALTDATA;
656			} else {
657				UFS_LOCK(ump);
658				error = ffs_realloccg(ip, -1 - lbn,
659				    dp->di_extb[lbn],
660				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
661				    &dp->di_extb[0]), osize, nsize, flags,
662				    cred, &bp);
663				if (error)
664					return (error);
665				bp->b_xflags |= BX_ALTDATA;
666				if (DOINGSOFTDEP(vp))
667					softdep_setup_allocext(ip, lbn,
668					    dbtofsb(fs, bp->b_blkno), nb,
669					    nsize, osize, bp);
670			}
671		} else {
672			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
673				nsize = fragroundup(fs, size);
674			else
675				nsize = fs->fs_bsize;
676			UFS_LOCK(ump);
677			error = ffs_alloc(ip, lbn,
678			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
679			   nsize, flags, cred, &newb);
680			if (error)
681				return (error);
682			bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
683			bp->b_blkno = fsbtodb(fs, newb);
684			bp->b_xflags |= BX_ALTDATA;
685			if (flags & BA_CLRBUF)
686				vfs_bio_clrbuf(bp);
687			if (DOINGSOFTDEP(vp))
688				softdep_setup_allocext(ip, lbn, newb, 0,
689				    nsize, 0, bp);
690		}
691		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
692		ip->i_flag |= IN_CHANGE;
693		*bpp = bp;
694		return (0);
695	}
696	/*
697	 * If the next write will extend the file into a new block,
698	 * and the file is currently composed of a fragment
699	 * this fragment has to be extended to be a full block.
700	 */
701	lastlbn = lblkno(fs, ip->i_size);
702	if (lastlbn < NDADDR && lastlbn < lbn) {
703		nb = lastlbn;
704		osize = blksize(fs, ip, nb);
705		if (osize < fs->fs_bsize && osize > 0) {
706			UFS_LOCK(ump);
707			error = ffs_realloccg(ip, nb, dp->di_db[nb],
708			    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
709			    &dp->di_db[0]), osize, (int)fs->fs_bsize,
710			    flags, cred, &bp);
711			if (error)
712				return (error);
713			if (DOINGSOFTDEP(vp))
714				softdep_setup_allocdirect(ip, nb,
715				    dbtofsb(fs, bp->b_blkno),
716				    dp->di_db[nb],
717				    fs->fs_bsize, osize, bp);
718			ip->i_size = smalllblktosize(fs, nb + 1);
719			dp->di_size = ip->i_size;
720			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
721			ip->i_flag |= IN_CHANGE | IN_UPDATE;
722			if (flags & IO_SYNC)
723				bwrite(bp);
724			else
725				bawrite(bp);
726		}
727	}
728	/*
729	 * The first NDADDR blocks are direct blocks
730	 */
731	if (lbn < NDADDR) {
732		if (flags & BA_METAONLY)
733			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
734		nb = dp->di_db[lbn];
735		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
736			error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
737			    gbflags, &bp);
738			if (error) {
739				brelse(bp);
740				return (error);
741			}
742			bp->b_blkno = fsbtodb(fs, nb);
743			*bpp = bp;
744			return (0);
745		}
746		if (nb != 0) {
747			/*
748			 * Consider need to reallocate a fragment.
749			 */
750			osize = fragroundup(fs, blkoff(fs, ip->i_size));
751			nsize = fragroundup(fs, size);
752			if (nsize <= osize) {
753				error = bread_gb(vp, lbn, osize, NOCRED,
754				    gbflags, &bp);
755				if (error) {
756					brelse(bp);
757					return (error);
758				}
759				bp->b_blkno = fsbtodb(fs, nb);
760			} else {
761				UFS_LOCK(ump);
762				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
763				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
764				    &dp->di_db[0]), osize, nsize, flags,
765				    cred, &bp);
766				if (error)
767					return (error);
768				if (DOINGSOFTDEP(vp))
769					softdep_setup_allocdirect(ip, lbn,
770					    dbtofsb(fs, bp->b_blkno), nb,
771					    nsize, osize, bp);
772			}
773		} else {
774			if (ip->i_size < smalllblktosize(fs, lbn + 1))
775				nsize = fragroundup(fs, size);
776			else
777				nsize = fs->fs_bsize;
778			UFS_LOCK(ump);
779			error = ffs_alloc(ip, lbn,
780			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
781				&dp->di_db[0]), nsize, flags, cred, &newb);
782			if (error)
783				return (error);
784			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
785			bp->b_blkno = fsbtodb(fs, newb);
786			if (flags & BA_CLRBUF)
787				vfs_bio_clrbuf(bp);
788			if (DOINGSOFTDEP(vp))
789				softdep_setup_allocdirect(ip, lbn, newb, 0,
790				    nsize, 0, bp);
791		}
792		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
793		ip->i_flag |= IN_CHANGE | IN_UPDATE;
794		*bpp = bp;
795		return (0);
796	}
797	/*
798	 * Determine the number of levels of indirection.
799	 */
800	pref = 0;
801	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
802		return(error);
803#ifdef INVARIANTS
804	if (num < 1)
805		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
806#endif
807	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
808	/*
809	 * Fetch the first indirect block allocating if necessary.
810	 */
811	--num;
812	nb = dp->di_ib[indirs[0].in_off];
813	allocib = NULL;
814	allocblk = allociblk;
815	lbns_remfree = lbns;
816	if (nb == 0) {
817		UFS_LOCK(ump);
818		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
819		    (ufs2_daddr_t *)0);
820		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
821		    flags, cred, &newb)) != 0) {
822			curthread_pflags_restore(saved_inbdflush);
823			return (error);
824		}
825		pref = newb + fs->fs_frag;
826		nb = newb;
827		MPASS(allocblk < allociblk + nitems(allociblk));
828		MPASS(lbns_remfree < lbns + nitems(lbns));
829		*allocblk++ = nb;
830		*lbns_remfree++ = indirs[1].in_lbn;
831		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
832		    GB_UNMAPPED);
833		bp->b_blkno = fsbtodb(fs, nb);
834		vfs_bio_clrbuf(bp);
835		if (DOINGSOFTDEP(vp)) {
836			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
837			    newb, 0, fs->fs_bsize, 0, bp);
838			bdwrite(bp);
839		} else {
840			/*
841			 * Write synchronously so that indirect blocks
842			 * never point at garbage.
843			 */
844			if (DOINGASYNC(vp))
845				bdwrite(bp);
846			else if ((error = bwrite(bp)) != 0)
847				goto fail;
848		}
849		allocib = &dp->di_ib[indirs[0].in_off];
850		*allocib = nb;
851		ip->i_flag |= IN_CHANGE | IN_UPDATE;
852	}
853	/*
854	 * Fetch through the indirect blocks, allocating as necessary.
855	 */
856retry:
857	for (i = 1;;) {
858		error = bread(vp,
859		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
860		if (error) {
861			brelse(bp);
862			goto fail;
863		}
864		bap = (ufs2_daddr_t *)bp->b_data;
865		nb = bap[indirs[i].in_off];
866		if (i == num)
867			break;
868		i += 1;
869		if (nb != 0) {
870			bqrelse(bp);
871			continue;
872		}
873		UFS_LOCK(ump);
874		/*
875		 * If parent indirect has just been allocated, try to cluster
876		 * immediately following it.
877		 */
878		if (pref == 0)
879			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
880			    (ufs2_daddr_t *)0);
881		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
882		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
883			brelse(bp);
884			if (++reclaimed == 1) {
885				UFS_LOCK(ump);
886				softdep_request_cleanup(fs, vp, cred,
887				    FLUSH_BLOCKS_WAIT);
888				UFS_UNLOCK(ump);
889				goto retry;
890			}
891			if (ppsratecheck(&lastfail, &curfail, 1)) {
892				ffs_fserr(fs, ip->i_number, "filesystem full");
893				uprintf("\n%s: write failed, filesystem "
894				    "is full\n", fs->fs_fsmnt);
895			}
896			goto fail;
897		}
898		pref = newb + fs->fs_frag;
899		nb = newb;
900		MPASS(allocblk < allociblk + nitems(allociblk));
901		MPASS(lbns_remfree < lbns + nitems(lbns));
902		*allocblk++ = nb;
903		*lbns_remfree++ = indirs[i].in_lbn;
904		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
905		    GB_UNMAPPED);
906		nbp->b_blkno = fsbtodb(fs, nb);
907		vfs_bio_clrbuf(nbp);
908		if (DOINGSOFTDEP(vp)) {
909			softdep_setup_allocindir_meta(nbp, ip, bp,
910			    indirs[i - 1].in_off, nb);
911			bdwrite(nbp);
912		} else {
913			/*
914			 * Write synchronously so that indirect blocks
915			 * never point at garbage.
916			 */
917			if ((error = bwrite(nbp)) != 0) {
918				brelse(bp);
919				goto fail;
920			}
921		}
922		bap[indirs[i - 1].in_off] = nb;
923		if (allocib == NULL && unwindidx < 0)
924			unwindidx = i - 1;
925		/*
926		 * If required, write synchronously, otherwise use
927		 * delayed write.
928		 */
929		if (flags & IO_SYNC) {
930			bwrite(bp);
931		} else {
932			if (bp->b_bufsize == fs->fs_bsize)
933				bp->b_flags |= B_CLUSTEROK;
934			bdwrite(bp);
935		}
936	}
937	/*
938	 * If asked only for the indirect block, then return it.
939	 */
940	if (flags & BA_METAONLY) {
941		curthread_pflags_restore(saved_inbdflush);
942		*bpp = bp;
943		return (0);
944	}
945	/*
946	 * Get the data block, allocating if necessary.
947	 */
948	if (nb == 0) {
949		UFS_LOCK(ump);
950		/*
951		 * If allocating metadata at the front of the cylinder
952		 * group and parent indirect block has just been allocated,
953		 * then cluster next to it if it is the first indirect in
954		 * the file. Otherwise it has been allocated in the metadata
955		 * area, so we want to find our own place out in the data area.
956		 */
957		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
958			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
959			    &bap[0]);
960		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
961		    flags | IO_BUFLOCKED, cred, &newb);
962		if (error) {
963			brelse(bp);
964			if (++reclaimed == 1) {
965				UFS_LOCK(ump);
966				softdep_request_cleanup(fs, vp, cred,
967				    FLUSH_BLOCKS_WAIT);
968				UFS_UNLOCK(ump);
969				goto retry;
970			}
971			if (ppsratecheck(&lastfail, &curfail, 1)) {
972				ffs_fserr(fs, ip->i_number, "filesystem full");
973				uprintf("\n%s: write failed, filesystem "
974				    "is full\n", fs->fs_fsmnt);
975			}
976			goto fail;
977		}
978		nb = newb;
979		MPASS(allocblk < allociblk + nitems(allociblk));
980		MPASS(lbns_remfree < lbns + nitems(lbns));
981		*allocblk++ = nb;
982		*lbns_remfree++ = lbn;
983		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
984		nbp->b_blkno = fsbtodb(fs, nb);
985		if (flags & BA_CLRBUF)
986			vfs_bio_clrbuf(nbp);
987		if (DOINGSOFTDEP(vp))
988			softdep_setup_allocindir_page(ip, lbn, bp,
989			    indirs[i].in_off, nb, 0, nbp);
990		bap[indirs[i].in_off] = nb;
991		/*
992		 * If required, write synchronously, otherwise use
993		 * delayed write.
994		 */
995		if (flags & IO_SYNC) {
996			bwrite(bp);
997		} else {
998			if (bp->b_bufsize == fs->fs_bsize)
999				bp->b_flags |= B_CLUSTEROK;
1000			bdwrite(bp);
1001		}
1002		curthread_pflags_restore(saved_inbdflush);
1003		*bpp = nbp;
1004		return (0);
1005	}
1006	brelse(bp);
1007	/*
1008	 * If requested clear invalid portions of the buffer.  If we
1009	 * have to do a read-before-write (typical if BA_CLRBUF is set),
1010	 * try to do some read-ahead in the sequential case to reduce
1011	 * the number of I/O transactions.
1012	 */
1013	if (flags & BA_CLRBUF) {
1014		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
1015		if (seqcount != 0 &&
1016		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
1017		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
1018			error = cluster_read(vp, ip->i_size, lbn,
1019			    (int)fs->fs_bsize, NOCRED,
1020			    MAXBSIZE, seqcount, gbflags, &nbp);
1021		} else {
1022			error = bread_gb(vp, lbn, (int)fs->fs_bsize,
1023			    NOCRED, gbflags, &nbp);
1024		}
1025		if (error) {
1026			brelse(nbp);
1027			goto fail;
1028		}
1029	} else {
1030		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
1031		nbp->b_blkno = fsbtodb(fs, nb);
1032	}
1033	curthread_pflags_restore(saved_inbdflush);
1034	*bpp = nbp;
1035	return (0);
1036fail:
1037	curthread_pflags_restore(saved_inbdflush);
1038	/*
1039	 * If we have failed to allocate any blocks, simply return the error.
1040	 * This is the usual case and avoids the need to fsync the file.
1041	 */
1042	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
1043		return (error);
1044	/*
1045	 * If we have failed part way through block allocation, we
1046	 * have to deallocate any indirect blocks that we have allocated.
1047	 * We have to fsync the file before we start to get rid of all
1048	 * of its dependencies so that we do not leave them dangling.
1049	 * We have to sync it at the end so that the soft updates code
1050	 * does not find any untracked changes. Although this is really
1051	 * slow, running out of disk space is not expected to be a common
1052	 * occurrence. The error return from fsync is ignored as we already
1053	 * have an error to return to the user.
1054	 *
1055	 * XXX Still have to journal the free below
1056	 */
1057	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
1058	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
1059	     blkp < allocblk; blkp++, lbns_remfree++) {
1060		/*
1061		 * We shall not leave the freed blocks on the vnode
1062		 * buffer object lists.
1063		 */
1064		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0, GB_NOCREAT);
1065		if (bp != NULL) {
1066			bp->b_flags |= (B_INVAL | B_RELBUF);
1067			bp->b_flags &= ~B_ASYNC;
1068			brelse(bp);
1069		}
1070		deallocated += fs->fs_bsize;
1071	}
1072	if (allocib != NULL) {
1073		*allocib = 0;
1074	} else if (unwindidx >= 0) {
1075		int r;
1076
1077		r = bread(vp, indirs[unwindidx].in_lbn,
1078		    (int)fs->fs_bsize, NOCRED, &bp);
1079		if (r) {
1080			panic("Could not unwind indirect block, error %d", r);
1081			brelse(bp);
1082		} else {
1083			bap = (ufs2_daddr_t *)bp->b_data;
1084			bap[indirs[unwindidx].in_off] = 0;
1085			if (flags & IO_SYNC) {
1086				bwrite(bp);
1087			} else {
1088				if (bp->b_bufsize == fs->fs_bsize)
1089					bp->b_flags |= B_CLUSTEROK;
1090				bdwrite(bp);
1091			}
1092		}
1093	}
1094	if (deallocated) {
1095#ifdef QUOTA
1096		/*
1097		 * Restore user's disk quota because allocation failed.
1098		 */
1099		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
1100#endif
1101		dp->di_blocks -= btodb(deallocated);
1102		ip->i_flag |= IN_CHANGE | IN_UPDATE;
1103	}
1104	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
1105	/*
1106	 * After the buffers are invalidated and on-disk pointers are
1107	 * cleared, free the blocks.
1108	 */
1109	for (blkp = allociblk; blkp < allocblk; blkp++) {
1110		ffs_blkfree(ump, fs, ip->i_devvp, *blkp, fs->fs_bsize,
1111		    ip->i_number, vp->v_type, NULL);
1112	}
1113	return (error);
1114}
1115