/* ffs_balloc.c revision 331017 */
1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 2002 Networks Associates Technology, Inc.
5 * All rights reserved.
6 *
7 * This software was developed for the FreeBSD Project by Marshall
8 * Kirk McKusick and Network Associates Laboratories, the Security
9 * Research Division of Network Associates, Inc. under DARPA/SPAWAR
10 * contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA CHATS
11 * research program
12 *
13 * Redistribution and use in source and binary forms, with or without
14 * modification, are permitted provided that the following conditions
15 * are met:
16 * 1. Redistributions of source code must retain the above copyright
17 *    notice, this list of conditions and the following disclaimer.
18 * 2. Redistributions in binary form must reproduce the above copyright
19 *    notice, this list of conditions and the following disclaimer in the
20 *    documentation and/or other materials provided with the distribution.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * Copyright (c) 1982, 1986, 1989, 1993
35 *	The Regents of the University of California.  All rights reserved.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 *    notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 *    notice, this list of conditions and the following disclaimer in the
44 *    documentation and/or other materials provided with the distribution.
45 * 4. Neither the name of the University nor the names of its contributors
46 *    may be used to endorse or promote products derived from this software
47 *    without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 *	@(#)ffs_balloc.c	8.8 (Berkeley) 6/16/95
62 */
63
64#include <sys/cdefs.h>
65__FBSDID("$FreeBSD: stable/11/sys/ufs/ffs/ffs_balloc.c 331017 2018-03-15 19:08:33Z kevans $");
66
67#include <sys/param.h>
68#include <sys/systm.h>
69#include <sys/bio.h>
70#include <sys/buf.h>
71#include <sys/lock.h>
72#include <sys/mount.h>
73#include <sys/vnode.h>
74#include <sys/vmmeter.h>
75
76#include <ufs/ufs/quota.h>
77#include <ufs/ufs/inode.h>
78#include <ufs/ufs/ufs_extern.h>
79#include <ufs/ufs/extattr.h>
80#include <ufs/ufs/ufsmount.h>
81
82#include <ufs/ffs/fs.h>
83#include <ufs/ffs/ffs_extern.h>
84
85/*
86 * Balloc defines the structure of filesystem storage
87 * by allocating the physical blocks on a device given
88 * the inode and the logical block number in a file.
89 * This is the allocation strategy for UFS1. Below is
90 * the allocation strategy for UFS2.
91 */
/*
 * On success, returns 0 with *bpp pointing to a buffer for the requested
 * logical block.  On failure, returns an errno (EOPNOTSUPP for IO_EXT
 * requests, EFBIG for a negative block number, or an allocation/read
 * error) after unwinding any blocks allocated along the way.
 */
int
ffs_balloc_ufs1(struct vnode *vp, off_t startoffset, int size,
    struct ucred *cred, int flags, struct buf **bpp)
{
	struct inode *ip;
	struct ufs1_dinode *dp;
	ufs_lbn_t lbn, lastlbn;
	struct fs *fs;
	ufs1_daddr_t nb;
	struct buf *bp, *nbp;
	struct ufsmount *ump;
	struct indir indirs[NIADDR + 2];
	int deallocated, osize, nsize, num, i, error;
	ufs2_daddr_t newb;
	ufs1_daddr_t *bap, pref;
	/*
	 * allociblk[] records every block allocated by this call and
	 * lbns[] the corresponding logical block numbers, so that a
	 * partial allocation can be undone in the "fail:" path below.
	 */
	ufs1_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
	int unwindidx = -1;
	int saved_inbdflush;
	static struct timeval lastfail;
	static int curfail;
	int gbflags, reclaimed;

	ip = VTOI(vp);
	dp = ip->i_din1;
	fs = ITOFS(ip);
	ump = ITOUMP(ip);
	lbn = lblkno(fs, startoffset);
	/* Bytes needed within the block: in-block offset plus request. */
	size = blkoff(fs, startoffset) + size;
	reclaimed = 0;
	if (size > fs->fs_bsize)
		panic("ffs_balloc_ufs1: blk too big");
	*bpp = NULL;
	/* UFS1 inodes have no external attribute area. */
	if (flags & IO_EXT)
		return (EOPNOTSUPP);
	if (lbn < 0)
		return (EFBIG);
	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;

	if (DOINGSOFTDEP(vp))
		softdep_prealloc(vp, MNT_WAIT);
	/*
	 * If the next write will extend the file into a new block,
	 * and the file is currently composed of a fragment
	 * this fragment has to be extended to be a full block.
	 */
	lastlbn = lblkno(fs, ip->i_size);
	if (lastlbn < NDADDR && lastlbn < lbn) {
		nb = lastlbn;
		osize = blksize(fs, ip, nb);
		if (osize < fs->fs_bsize && osize > 0) {
			/*
			 * The ump lock is taken here and released inside
			 * ffs_realloccg() (no unlock appears on any path
			 * in this function after a successful call).
			 */
			UFS_LOCK(ump);
			error = ffs_realloccg(ip, nb, dp->di_db[nb],
			   ffs_blkpref_ufs1(ip, lastlbn, (int)nb,
			   &dp->di_db[0]), osize, (int)fs->fs_bsize, flags,
			   cred, &bp);
			if (error)
				return (error);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, nb,
				    dbtofsb(fs, bp->b_blkno), dp->di_db[nb],
				    fs->fs_bsize, osize, bp);
			ip->i_size = smalllblktosize(fs, nb + 1);
			dp->di_size = ip->i_size;
			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
			ip->i_flag |= IN_CHANGE | IN_UPDATE;
			if (flags & IO_SYNC)
				bwrite(bp);
			else
				bawrite(bp);
		}
	}
	/*
	 * The first NDADDR blocks are direct blocks
	 */
	if (lbn < NDADDR) {
		if (flags & BA_METAONLY)
			panic("ffs_balloc_ufs1: BA_METAONLY for direct block");
		nb = dp->di_db[lbn];
		/*
		 * Block already allocated and fully covered by the file
		 * size: just read it in and return it.
		 */
		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
			error = bread(vp, lbn, fs->fs_bsize, NOCRED, &bp);
			if (error) {
				brelse(bp);
				return (error);
			}
			bp->b_blkno = fsbtodb(fs, nb);
			*bpp = bp;
			return (0);
		}
		if (nb != 0) {
			/*
			 * Consider need to reallocate a fragment.
			 */
			osize = fragroundup(fs, blkoff(fs, ip->i_size));
			nsize = fragroundup(fs, size);
			if (nsize <= osize) {
				/* Existing fragment is big enough. */
				error = bread(vp, lbn, osize, NOCRED, &bp);
				if (error) {
					brelse(bp);
					return (error);
				}
				bp->b_blkno = fsbtodb(fs, nb);
			} else {
				/* Grow the fragment in place or move it. */
				UFS_LOCK(ump);
				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
				    ffs_blkpref_ufs1(ip, lbn, (int)lbn,
				    &dp->di_db[0]), osize, nsize, flags,
				    cred, &bp);
				if (error)
					return (error);
				if (DOINGSOFTDEP(vp))
					softdep_setup_allocdirect(ip, lbn,
					    dbtofsb(fs, bp->b_blkno), nb,
					    nsize, osize, bp);
			}
		} else {
			/*
			 * No block yet: allocate a fragment if this block
			 * holds the end of file, else a full block.
			 */
			if (ip->i_size < smalllblktosize(fs, lbn + 1))
				nsize = fragroundup(fs, size);
			else
				nsize = fs->fs_bsize;
			UFS_LOCK(ump);
			error = ffs_alloc(ip, lbn,
			    ffs_blkpref_ufs1(ip, lbn, (int)lbn, &dp->di_db[0]),
			    nsize, flags, cred, &newb);
			if (error)
				return (error);
			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
			bp->b_blkno = fsbtodb(fs, newb);
			if (flags & BA_CLRBUF)
				vfs_bio_clrbuf(bp);
			if (DOINGSOFTDEP(vp))
				softdep_setup_allocdirect(ip, lbn, newb, 0,
				    nsize, 0, bp);
		}
		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
		*bpp = bp;
		return (0);
	}
	/*
	 * Determine the number of levels of indirection.
	 */
	pref = 0;
	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
		return(error);
#ifdef INVARIANTS
	if (num < 1)
		panic ("ffs_balloc_ufs1: ufs_getlbns returned indirect block");
#endif
	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
	/*
	 * Fetch the first indirect block allocating if necessary.
	 */
	--num;
	nb = dp->di_ib[indirs[0].in_off];
	allocib = NULL;
	allocblk = allociblk;
	lbns_remfree = lbns;
	if (nb == 0) {
		UFS_LOCK(ump);
		pref = ffs_blkpref_ufs1(ip, lbn, -indirs[0].in_off - 1,
		    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags, cred, &newb)) != 0) {
			curthread_pflags_restore(saved_inbdflush);
			return (error);
		}
		/* Prefer to cluster the next allocation right after this. */
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[1].in_lbn;
		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0, gbflags);
		bp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(bp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
			    newb, 0, fs->fs_bsize, 0, bp);
			bdwrite(bp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if (DOINGASYNC(vp))
				bdwrite(bp);
			else if ((error = bwrite(bp)) != 0)
				goto fail;
		}
		/* Record the inode slot to clear if we must unwind. */
		allocib = &dp->di_ib[indirs[0].in_off];
		*allocib = nb;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * Fetch through the indirect blocks, allocating as necessary.
	 */
retry:
	for (i = 1;;) {
		error = bread(vp,
		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
		if (error) {
			brelse(bp);
			goto fail;
		}
		bap = (ufs1_daddr_t *)bp->b_data;
		nb = bap[indirs[i].in_off];
		if (i == num)
			break;
		i += 1;
		if (nb != 0) {
			bqrelse(bp);
			continue;
		}
		UFS_LOCK(ump);
		/*
		 * If parent indirect has just been allocated, try to cluster
		 * immediately following it.
		 */
		if (pref == 0)
			pref = ffs_blkpref_ufs1(ip, lbn, i - num - 1,
			    (ufs1_daddr_t *)0);
		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
			brelse(bp);
			/*
			 * One-shot reclaim: flush soft-updates dependencies
			 * to free space, then retry the allocation once.
			 */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		pref = newb + fs->fs_frag;
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = indirs[i].in_lbn;
		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0, 0);
		nbp->b_blkno = fsbtodb(fs, nb);
		vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp)) {
			softdep_setup_allocindir_meta(nbp, ip, bp,
			    indirs[i - 1].in_off, nb);
			bdwrite(nbp);
		} else {
			/*
			 * Write synchronously so that indirect blocks
			 * never point at garbage.
			 */
			if ((error = bwrite(nbp)) != 0) {
				brelse(bp);
				goto fail;
			}
		}
		/* Hook the new indirect block into its parent. */
		bap[indirs[i - 1].in_off] = nb;
		if (allocib == NULL && unwindidx < 0)
			unwindidx = i - 1;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}
	/*
	 * If asked only for the indirect block, then return it.
	 */
	if (flags & BA_METAONLY) {
		curthread_pflags_restore(saved_inbdflush);
		*bpp = bp;
		return (0);
	}
	/*
	 * Get the data block, allocating if necessary.
	 */
	if (nb == 0) {
		UFS_LOCK(ump);
		/*
		 * If allocating metadata at the front of the cylinder
		 * group and parent indirect block has just been allocated,
		 * then cluster next to it if it is the first indirect in
		 * the file. Otherwise it has been allocated in the metadata
		 * area, so we want to find our own place out in the data area.
		 */
		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
			pref = ffs_blkpref_ufs1(ip, lbn, indirs[i].in_off,
			    &bap[0]);
		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
		    flags | IO_BUFLOCKED, cred, &newb);
		if (error) {
			brelse(bp);
			/* Same one-shot reclaim-and-retry as above. */
			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
				UFS_LOCK(ump);
				softdep_request_cleanup(fs, vp, cred,
				    FLUSH_BLOCKS_WAIT);
				UFS_UNLOCK(ump);
				goto retry;
			}
			if (ppsratecheck(&lastfail, &curfail, 1)) {
				ffs_fserr(fs, ip->i_number, "filesystem full");
				uprintf("\n%s: write failed, filesystem "
				    "is full\n", fs->fs_fsmnt);
			}
			goto fail;
		}
		nb = newb;
		MPASS(allocblk < allociblk + nitems(allociblk));
		MPASS(lbns_remfree < lbns + nitems(lbns));
		*allocblk++ = nb;
		*lbns_remfree++ = lbn;
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
		if (flags & BA_CLRBUF)
			vfs_bio_clrbuf(nbp);
		if (DOINGSOFTDEP(vp))
			softdep_setup_allocindir_page(ip, lbn, bp,
			    indirs[i].in_off, nb, 0, nbp);
		bap[indirs[i].in_off] = nb;
		/*
		 * If required, write synchronously, otherwise use
		 * delayed write.
		 */
		if (flags & IO_SYNC) {
			bwrite(bp);
		} else {
			if (bp->b_bufsize == fs->fs_bsize)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		curthread_pflags_restore(saved_inbdflush);
		*bpp = nbp;
		return (0);
	}
	brelse(bp);
	/*
	 * Data block already exists.  If BA_CLRBUF we must read it (the
	 * caller will only partially overwrite it); cluster the read in
	 * the sequential case.  Otherwise just get an empty buffer.
	 */
	if (flags & BA_CLRBUF) {
		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
		if (seqcount != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
			error = cluster_read(vp, ip->i_size, lbn,
			    (int)fs->fs_bsize, NOCRED,
			    MAXBSIZE, seqcount, gbflags, &nbp);
		} else {
			error = bread_gb(vp, lbn, (int)fs->fs_bsize, NOCRED,
			    gbflags, &nbp);
		}
		if (error) {
			brelse(nbp);
			goto fail;
		}
	} else {
		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
		nbp->b_blkno = fsbtodb(fs, nb);
	}
	curthread_pflags_restore(saved_inbdflush);
	*bpp = nbp;
	return (0);
fail:
	curthread_pflags_restore(saved_inbdflush);
	/*
	 * If we have failed to allocate any blocks, simply return the error.
	 * This is the usual case and avoids the need to fsync the file.
	 */
	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
		return (error);
	/*
	 * If we have failed part way through block allocation, we
	 * have to deallocate any indirect blocks that we have allocated.
	 * We have to fsync the file before we start to get rid of all
	 * of its dependencies so that we do not leave them dangling.
	 * We have to sync it at the end so that the soft updates code
	 * does not find any untracked changes. Although this is really
	 * slow, running out of disk space is not expected to be a common
	 * occurrence. The error return from fsync is ignored as we already
	 * have an error to return to the user.
	 *
	 * XXX Still have to journal the free below
	 */
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
	     blkp < allocblk; blkp++, lbns_remfree++) {
		/*
		 * We shall not leave the freed blocks on the vnode
		 * buffer object lists.
		 */
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
			    ("mismatch1 l %jd %jd b %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
			    (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp)));
			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
			bp->b_flags &= ~(B_ASYNC | B_CACHE);
			brelse(bp);
		}
		deallocated += fs->fs_bsize;
	}
	if (allocib != NULL) {
		/* Clear the inode's pointer to the first new indirect. */
		*allocib = 0;
	} else if (unwindidx >= 0) {
		int r;

		/* Clear the parent indirect's pointer instead. */
		r = bread(vp, indirs[unwindidx].in_lbn,
		    (int)fs->fs_bsize, NOCRED, &bp);
		if (r) {
			panic("Could not unwind indirect block, error %d", r);
			/* NOTE(review): brelse() is unreachable after panic(). */
			brelse(bp);
		} else {
			bap = (ufs1_daddr_t *)bp->b_data;
			bap[indirs[unwindidx].in_off] = 0;
			if (flags & IO_SYNC) {
				bwrite(bp);
			} else {
				if (bp->b_bufsize == fs->fs_bsize)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		}
	}
	if (deallocated) {
#ifdef QUOTA
		/*
		 * Restore user's disk quota because allocation failed.
		 */
		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
#endif
		dp->di_blocks -= btodb(deallocated);
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
	/*
	 * After the buffers are invalidated and on-disk pointers are
	 * cleared, free the blocks.
	 */
	for (blkp = allociblk; blkp < allocblk; blkp++) {
#ifdef INVARIANTS
		/* All buffers for the freed blocks must be gone by now. */
		if (blkp == allociblk)
			lbns_remfree = lbns;
		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
		    GB_NOCREAT | GB_UNMAPPED);
		if (bp != NULL) {
			panic("zombie1 %jd %ju %ju",
			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
			    (uintmax_t)fsbtodb(fs, *blkp));
		}
		lbns_remfree++;
#endif
		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
		    ip->i_number, vp->v_type, NULL);
	}
	return (error);
}
559
560/*
561 * Balloc defines the structure of file system storage
562 * by allocating the physical blocks on a device given
563 * the inode and the logical block number in a file.
564 * This is the allocation strategy for UFS2. Above is
565 * the allocation strategy for UFS1.
566 */
567int
568ffs_balloc_ufs2(struct vnode *vp, off_t startoffset, int size,
569    struct ucred *cred, int flags, struct buf **bpp)
570{
571	struct inode *ip;
572	struct ufs2_dinode *dp;
573	ufs_lbn_t lbn, lastlbn;
574	struct fs *fs;
575	struct buf *bp, *nbp;
576	struct ufsmount *ump;
577	struct indir indirs[NIADDR + 2];
578	ufs2_daddr_t nb, newb, *bap, pref;
579	ufs2_daddr_t *allocib, *blkp, *allocblk, allociblk[NIADDR + 1];
580	ufs2_daddr_t *lbns_remfree, lbns[NIADDR + 1];
581	int deallocated, osize, nsize, num, i, error;
582	int unwindidx = -1;
583	int saved_inbdflush;
584	static struct timeval lastfail;
585	static int curfail;
586	int gbflags, reclaimed;
587
588	ip = VTOI(vp);
589	dp = ip->i_din2;
590	fs = ITOFS(ip);
591	ump = ITOUMP(ip);
592	lbn = lblkno(fs, startoffset);
593	size = blkoff(fs, startoffset) + size;
594	reclaimed = 0;
595	if (size > fs->fs_bsize)
596		panic("ffs_balloc_ufs2: blk too big");
597	*bpp = NULL;
598	if (lbn < 0)
599		return (EFBIG);
600	gbflags = (flags & BA_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
601
602	if (DOINGSOFTDEP(vp))
603		softdep_prealloc(vp, MNT_WAIT);
604
605	/*
606	 * Check for allocating external data.
607	 */
608	if (flags & IO_EXT) {
609		if (lbn >= NXADDR)
610			return (EFBIG);
611		/*
612		 * If the next write will extend the data into a new block,
613		 * and the data is currently composed of a fragment
614		 * this fragment has to be extended to be a full block.
615		 */
616		lastlbn = lblkno(fs, dp->di_extsize);
617		if (lastlbn < lbn) {
618			nb = lastlbn;
619			osize = sblksize(fs, dp->di_extsize, nb);
620			if (osize < fs->fs_bsize && osize > 0) {
621				UFS_LOCK(ump);
622				error = ffs_realloccg(ip, -1 - nb,
623				    dp->di_extb[nb],
624				    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
625				    &dp->di_extb[0]), osize,
626				    (int)fs->fs_bsize, flags, cred, &bp);
627				if (error)
628					return (error);
629				if (DOINGSOFTDEP(vp))
630					softdep_setup_allocext(ip, nb,
631					    dbtofsb(fs, bp->b_blkno),
632					    dp->di_extb[nb],
633					    fs->fs_bsize, osize, bp);
634				dp->di_extsize = smalllblktosize(fs, nb + 1);
635				dp->di_extb[nb] = dbtofsb(fs, bp->b_blkno);
636				bp->b_xflags |= BX_ALTDATA;
637				ip->i_flag |= IN_CHANGE;
638				if (flags & IO_SYNC)
639					bwrite(bp);
640				else
641					bawrite(bp);
642			}
643		}
644		/*
645		 * All blocks are direct blocks
646		 */
647		if (flags & BA_METAONLY)
648			panic("ffs_balloc_ufs2: BA_METAONLY for ext block");
649		nb = dp->di_extb[lbn];
650		if (nb != 0 && dp->di_extsize >= smalllblktosize(fs, lbn + 1)) {
651			error = bread_gb(vp, -1 - lbn, fs->fs_bsize, NOCRED,
652			    gbflags, &bp);
653			if (error) {
654				brelse(bp);
655				return (error);
656			}
657			bp->b_blkno = fsbtodb(fs, nb);
658			bp->b_xflags |= BX_ALTDATA;
659			*bpp = bp;
660			return (0);
661		}
662		if (nb != 0) {
663			/*
664			 * Consider need to reallocate a fragment.
665			 */
666			osize = fragroundup(fs, blkoff(fs, dp->di_extsize));
667			nsize = fragroundup(fs, size);
668			if (nsize <= osize) {
669				error = bread_gb(vp, -1 - lbn, osize, NOCRED,
670				    gbflags, &bp);
671				if (error) {
672					brelse(bp);
673					return (error);
674				}
675				bp->b_blkno = fsbtodb(fs, nb);
676				bp->b_xflags |= BX_ALTDATA;
677			} else {
678				UFS_LOCK(ump);
679				error = ffs_realloccg(ip, -1 - lbn,
680				    dp->di_extb[lbn],
681				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
682				    &dp->di_extb[0]), osize, nsize, flags,
683				    cred, &bp);
684				if (error)
685					return (error);
686				bp->b_xflags |= BX_ALTDATA;
687				if (DOINGSOFTDEP(vp))
688					softdep_setup_allocext(ip, lbn,
689					    dbtofsb(fs, bp->b_blkno), nb,
690					    nsize, osize, bp);
691			}
692		} else {
693			if (dp->di_extsize < smalllblktosize(fs, lbn + 1))
694				nsize = fragroundup(fs, size);
695			else
696				nsize = fs->fs_bsize;
697			UFS_LOCK(ump);
698			error = ffs_alloc(ip, lbn,
699			   ffs_blkpref_ufs2(ip, lbn, (int)lbn, &dp->di_extb[0]),
700			   nsize, flags, cred, &newb);
701			if (error)
702				return (error);
703			bp = getblk(vp, -1 - lbn, nsize, 0, 0, gbflags);
704			bp->b_blkno = fsbtodb(fs, newb);
705			bp->b_xflags |= BX_ALTDATA;
706			if (flags & BA_CLRBUF)
707				vfs_bio_clrbuf(bp);
708			if (DOINGSOFTDEP(vp))
709				softdep_setup_allocext(ip, lbn, newb, 0,
710				    nsize, 0, bp);
711		}
712		dp->di_extb[lbn] = dbtofsb(fs, bp->b_blkno);
713		ip->i_flag |= IN_CHANGE;
714		*bpp = bp;
715		return (0);
716	}
717	/*
718	 * If the next write will extend the file into a new block,
719	 * and the file is currently composed of a fragment
720	 * this fragment has to be extended to be a full block.
721	 */
722	lastlbn = lblkno(fs, ip->i_size);
723	if (lastlbn < NDADDR && lastlbn < lbn) {
724		nb = lastlbn;
725		osize = blksize(fs, ip, nb);
726		if (osize < fs->fs_bsize && osize > 0) {
727			UFS_LOCK(ump);
728			error = ffs_realloccg(ip, nb, dp->di_db[nb],
729			    ffs_blkpref_ufs2(ip, lastlbn, (int)nb,
730			    &dp->di_db[0]), osize, (int)fs->fs_bsize,
731			    flags, cred, &bp);
732			if (error)
733				return (error);
734			if (DOINGSOFTDEP(vp))
735				softdep_setup_allocdirect(ip, nb,
736				    dbtofsb(fs, bp->b_blkno),
737				    dp->di_db[nb],
738				    fs->fs_bsize, osize, bp);
739			ip->i_size = smalllblktosize(fs, nb + 1);
740			dp->di_size = ip->i_size;
741			dp->di_db[nb] = dbtofsb(fs, bp->b_blkno);
742			ip->i_flag |= IN_CHANGE | IN_UPDATE;
743			if (flags & IO_SYNC)
744				bwrite(bp);
745			else
746				bawrite(bp);
747		}
748	}
749	/*
750	 * The first NDADDR blocks are direct blocks
751	 */
752	if (lbn < NDADDR) {
753		if (flags & BA_METAONLY)
754			panic("ffs_balloc_ufs2: BA_METAONLY for direct block");
755		nb = dp->di_db[lbn];
756		if (nb != 0 && ip->i_size >= smalllblktosize(fs, lbn + 1)) {
757			error = bread_gb(vp, lbn, fs->fs_bsize, NOCRED,
758			    gbflags, &bp);
759			if (error) {
760				brelse(bp);
761				return (error);
762			}
763			bp->b_blkno = fsbtodb(fs, nb);
764			*bpp = bp;
765			return (0);
766		}
767		if (nb != 0) {
768			/*
769			 * Consider need to reallocate a fragment.
770			 */
771			osize = fragroundup(fs, blkoff(fs, ip->i_size));
772			nsize = fragroundup(fs, size);
773			if (nsize <= osize) {
774				error = bread_gb(vp, lbn, osize, NOCRED,
775				    gbflags, &bp);
776				if (error) {
777					brelse(bp);
778					return (error);
779				}
780				bp->b_blkno = fsbtodb(fs, nb);
781			} else {
782				UFS_LOCK(ump);
783				error = ffs_realloccg(ip, lbn, dp->di_db[lbn],
784				    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
785				    &dp->di_db[0]), osize, nsize, flags,
786				    cred, &bp);
787				if (error)
788					return (error);
789				if (DOINGSOFTDEP(vp))
790					softdep_setup_allocdirect(ip, lbn,
791					    dbtofsb(fs, bp->b_blkno), nb,
792					    nsize, osize, bp);
793			}
794		} else {
795			if (ip->i_size < smalllblktosize(fs, lbn + 1))
796				nsize = fragroundup(fs, size);
797			else
798				nsize = fs->fs_bsize;
799			UFS_LOCK(ump);
800			error = ffs_alloc(ip, lbn,
801			    ffs_blkpref_ufs2(ip, lbn, (int)lbn,
802				&dp->di_db[0]), nsize, flags, cred, &newb);
803			if (error)
804				return (error);
805			bp = getblk(vp, lbn, nsize, 0, 0, gbflags);
806			bp->b_blkno = fsbtodb(fs, newb);
807			if (flags & BA_CLRBUF)
808				vfs_bio_clrbuf(bp);
809			if (DOINGSOFTDEP(vp))
810				softdep_setup_allocdirect(ip, lbn, newb, 0,
811				    nsize, 0, bp);
812		}
813		dp->di_db[lbn] = dbtofsb(fs, bp->b_blkno);
814		ip->i_flag |= IN_CHANGE | IN_UPDATE;
815		*bpp = bp;
816		return (0);
817	}
818	/*
819	 * Determine the number of levels of indirection.
820	 */
821	pref = 0;
822	if ((error = ufs_getlbns(vp, lbn, indirs, &num)) != 0)
823		return(error);
824#ifdef INVARIANTS
825	if (num < 1)
826		panic ("ffs_balloc_ufs2: ufs_getlbns returned indirect block");
827#endif
828	saved_inbdflush = curthread_pflags_set(TDP_INBDFLUSH);
829	/*
830	 * Fetch the first indirect block allocating if necessary.
831	 */
832	--num;
833	nb = dp->di_ib[indirs[0].in_off];
834	allocib = NULL;
835	allocblk = allociblk;
836	lbns_remfree = lbns;
837	if (nb == 0) {
838		UFS_LOCK(ump);
839		pref = ffs_blkpref_ufs2(ip, lbn, -indirs[0].in_off - 1,
840		    (ufs2_daddr_t *)0);
841		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
842		    flags, cred, &newb)) != 0) {
843			curthread_pflags_restore(saved_inbdflush);
844			return (error);
845		}
846		pref = newb + fs->fs_frag;
847		nb = newb;
848		MPASS(allocblk < allociblk + nitems(allociblk));
849		MPASS(lbns_remfree < lbns + nitems(lbns));
850		*allocblk++ = nb;
851		*lbns_remfree++ = indirs[1].in_lbn;
852		bp = getblk(vp, indirs[1].in_lbn, fs->fs_bsize, 0, 0,
853		    GB_UNMAPPED);
854		bp->b_blkno = fsbtodb(fs, nb);
855		vfs_bio_clrbuf(bp);
856		if (DOINGSOFTDEP(vp)) {
857			softdep_setup_allocdirect(ip, NDADDR + indirs[0].in_off,
858			    newb, 0, fs->fs_bsize, 0, bp);
859			bdwrite(bp);
860		} else {
861			/*
862			 * Write synchronously so that indirect blocks
863			 * never point at garbage.
864			 */
865			if (DOINGASYNC(vp))
866				bdwrite(bp);
867			else if ((error = bwrite(bp)) != 0)
868				goto fail;
869		}
870		allocib = &dp->di_ib[indirs[0].in_off];
871		*allocib = nb;
872		ip->i_flag |= IN_CHANGE | IN_UPDATE;
873	}
874	/*
875	 * Fetch through the indirect blocks, allocating as necessary.
876	 */
877retry:
878	for (i = 1;;) {
879		error = bread(vp,
880		    indirs[i].in_lbn, (int)fs->fs_bsize, NOCRED, &bp);
881		if (error) {
882			brelse(bp);
883			goto fail;
884		}
885		bap = (ufs2_daddr_t *)bp->b_data;
886		nb = bap[indirs[i].in_off];
887		if (i == num)
888			break;
889		i += 1;
890		if (nb != 0) {
891			bqrelse(bp);
892			continue;
893		}
894		UFS_LOCK(ump);
895		/*
896		 * If parent indirect has just been allocated, try to cluster
897		 * immediately following it.
898		 */
899		if (pref == 0)
900			pref = ffs_blkpref_ufs2(ip, lbn, i - num - 1,
901			    (ufs2_daddr_t *)0);
902		if ((error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
903		    flags | IO_BUFLOCKED, cred, &newb)) != 0) {
904			brelse(bp);
905			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
906				UFS_LOCK(ump);
907				softdep_request_cleanup(fs, vp, cred,
908				    FLUSH_BLOCKS_WAIT);
909				UFS_UNLOCK(ump);
910				goto retry;
911			}
912			if (ppsratecheck(&lastfail, &curfail, 1)) {
913				ffs_fserr(fs, ip->i_number, "filesystem full");
914				uprintf("\n%s: write failed, filesystem "
915				    "is full\n", fs->fs_fsmnt);
916			}
917			goto fail;
918		}
919		pref = newb + fs->fs_frag;
920		nb = newb;
921		MPASS(allocblk < allociblk + nitems(allociblk));
922		MPASS(lbns_remfree < lbns + nitems(lbns));
923		*allocblk++ = nb;
924		*lbns_remfree++ = indirs[i].in_lbn;
925		nbp = getblk(vp, indirs[i].in_lbn, fs->fs_bsize, 0, 0,
926		    GB_UNMAPPED);
927		nbp->b_blkno = fsbtodb(fs, nb);
928		vfs_bio_clrbuf(nbp);
929		if (DOINGSOFTDEP(vp)) {
930			softdep_setup_allocindir_meta(nbp, ip, bp,
931			    indirs[i - 1].in_off, nb);
932			bdwrite(nbp);
933		} else {
934			/*
935			 * Write synchronously so that indirect blocks
936			 * never point at garbage.
937			 */
938			if ((error = bwrite(nbp)) != 0) {
939				brelse(bp);
940				goto fail;
941			}
942		}
943		bap[indirs[i - 1].in_off] = nb;
944		if (allocib == NULL && unwindidx < 0)
945			unwindidx = i - 1;
946		/*
947		 * If required, write synchronously, otherwise use
948		 * delayed write.
949		 */
950		if (flags & IO_SYNC) {
951			bwrite(bp);
952		} else {
953			if (bp->b_bufsize == fs->fs_bsize)
954				bp->b_flags |= B_CLUSTEROK;
955			bdwrite(bp);
956		}
957	}
958	/*
959	 * If asked only for the indirect block, then return it.
960	 */
961	if (flags & BA_METAONLY) {
962		curthread_pflags_restore(saved_inbdflush);
963		*bpp = bp;
964		return (0);
965	}
966	/*
967	 * Get the data block, allocating if necessary.
968	 */
969	if (nb == 0) {
970		UFS_LOCK(ump);
971		/*
972		 * If allocating metadata at the front of the cylinder
973		 * group and parent indirect block has just been allocated,
974		 * then cluster next to it if it is the first indirect in
975		 * the file. Otherwise it has been allocated in the metadata
976		 * area, so we want to find our own place out in the data area.
977		 */
978		if (pref == 0 || (lbn > NDADDR && fs->fs_metaspace != 0))
979			pref = ffs_blkpref_ufs2(ip, lbn, indirs[i].in_off,
980			    &bap[0]);
981		error = ffs_alloc(ip, lbn, pref, (int)fs->fs_bsize,
982		    flags | IO_BUFLOCKED, cred, &newb);
983		if (error) {
984			brelse(bp);
985			if (DOINGSOFTDEP(vp) && ++reclaimed == 1) {
986				UFS_LOCK(ump);
987				softdep_request_cleanup(fs, vp, cred,
988				    FLUSH_BLOCKS_WAIT);
989				UFS_UNLOCK(ump);
990				goto retry;
991			}
992			if (ppsratecheck(&lastfail, &curfail, 1)) {
993				ffs_fserr(fs, ip->i_number, "filesystem full");
994				uprintf("\n%s: write failed, filesystem "
995				    "is full\n", fs->fs_fsmnt);
996			}
997			goto fail;
998		}
999		nb = newb;
1000		MPASS(allocblk < allociblk + nitems(allociblk));
1001		MPASS(lbns_remfree < lbns + nitems(lbns));
1002		*allocblk++ = nb;
1003		*lbns_remfree++ = lbn;
1004		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
1005		nbp->b_blkno = fsbtodb(fs, nb);
1006		if (flags & BA_CLRBUF)
1007			vfs_bio_clrbuf(nbp);
1008		if (DOINGSOFTDEP(vp))
1009			softdep_setup_allocindir_page(ip, lbn, bp,
1010			    indirs[i].in_off, nb, 0, nbp);
1011		bap[indirs[i].in_off] = nb;
1012		/*
1013		 * If required, write synchronously, otherwise use
1014		 * delayed write.
1015		 */
1016		if (flags & IO_SYNC) {
1017			bwrite(bp);
1018		} else {
1019			if (bp->b_bufsize == fs->fs_bsize)
1020				bp->b_flags |= B_CLUSTEROK;
1021			bdwrite(bp);
1022		}
1023		curthread_pflags_restore(saved_inbdflush);
1024		*bpp = nbp;
1025		return (0);
1026	}
1027	brelse(bp);
1028	/*
1029	 * If requested clear invalid portions of the buffer.  If we
1030	 * have to do a read-before-write (typical if BA_CLRBUF is set),
1031	 * try to do some read-ahead in the sequential case to reduce
1032	 * the number of I/O transactions.
1033	 */
1034	if (flags & BA_CLRBUF) {
1035		int seqcount = (flags & BA_SEQMASK) >> BA_SEQSHIFT;
1036		if (seqcount != 0 &&
1037		    (vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0 &&
1038		    !(vm_page_count_severe() || buf_dirty_count_severe())) {
1039			error = cluster_read(vp, ip->i_size, lbn,
1040			    (int)fs->fs_bsize, NOCRED,
1041			    MAXBSIZE, seqcount, gbflags, &nbp);
1042		} else {
1043			error = bread_gb(vp, lbn, (int)fs->fs_bsize,
1044			    NOCRED, gbflags, &nbp);
1045		}
1046		if (error) {
1047			brelse(nbp);
1048			goto fail;
1049		}
1050	} else {
1051		nbp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
1052		nbp->b_blkno = fsbtodb(fs, nb);
1053	}
1054	curthread_pflags_restore(saved_inbdflush);
1055	*bpp = nbp;
1056	return (0);
1057fail:
1058	curthread_pflags_restore(saved_inbdflush);
1059	/*
1060	 * If we have failed to allocate any blocks, simply return the error.
1061	 * This is the usual case and avoids the need to fsync the file.
1062	 */
1063	if (allocblk == allociblk && allocib == NULL && unwindidx == -1)
1064		return (error);
1065	/*
1066	 * If we have failed part way through block allocation, we
1067	 * have to deallocate any indirect blocks that we have allocated.
1068	 * We have to fsync the file before we start to get rid of all
1069	 * of its dependencies so that we do not leave them dangling.
1070	 * We have to sync it at the end so that the soft updates code
1071	 * does not find any untracked changes. Although this is really
1072	 * slow, running out of disk space is not expected to be a common
1073	 * occurrence. The error return from fsync is ignored as we already
1074	 * have an error to return to the user.
1075	 *
1076	 * XXX Still have to journal the free below
1077	 */
1078	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
1079	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
1080	     blkp < allocblk; blkp++, lbns_remfree++) {
1081		/*
1082		 * We shall not leave the freed blocks on the vnode
1083		 * buffer object lists.
1084		 */
1085		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
1086		    GB_NOCREAT | GB_UNMAPPED);
1087		if (bp != NULL) {
1088			KASSERT(bp->b_blkno == fsbtodb(fs, *blkp),
1089			    ("mismatch2 l %jd %jd b %ju %ju",
1090			    (intmax_t)bp->b_lblkno, (uintmax_t)*lbns_remfree,
1091			    (uintmax_t)bp->b_blkno,
1092			    (uintmax_t)fsbtodb(fs, *blkp)));
1093			bp->b_flags |= B_INVAL | B_RELBUF | B_NOCACHE;
1094			bp->b_flags &= ~(B_ASYNC | B_CACHE);
1095			brelse(bp);
1096		}
1097		deallocated += fs->fs_bsize;
1098	}
1099	if (allocib != NULL) {
1100		*allocib = 0;
1101	} else if (unwindidx >= 0) {
1102		int r;
1103
1104		r = bread(vp, indirs[unwindidx].in_lbn,
1105		    (int)fs->fs_bsize, NOCRED, &bp);
1106		if (r) {
1107			panic("Could not unwind indirect block, error %d", r);
1108			brelse(bp);
1109		} else {
1110			bap = (ufs2_daddr_t *)bp->b_data;
1111			bap[indirs[unwindidx].in_off] = 0;
1112			if (flags & IO_SYNC) {
1113				bwrite(bp);
1114			} else {
1115				if (bp->b_bufsize == fs->fs_bsize)
1116					bp->b_flags |= B_CLUSTEROK;
1117				bdwrite(bp);
1118			}
1119		}
1120	}
1121	if (deallocated) {
1122#ifdef QUOTA
1123		/*
1124		 * Restore user's disk quota because allocation failed.
1125		 */
1126		(void) chkdq(ip, -btodb(deallocated), cred, FORCE);
1127#endif
1128		dp->di_blocks -= btodb(deallocated);
1129		ip->i_flag |= IN_CHANGE | IN_UPDATE;
1130	}
1131	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
1132	/*
1133	 * After the buffers are invalidated and on-disk pointers are
1134	 * cleared, free the blocks.
1135	 */
1136	for (blkp = allociblk; blkp < allocblk; blkp++) {
1137#ifdef INVARIANTS
1138		if (blkp == allociblk)
1139			lbns_remfree = lbns;
1140		bp = getblk(vp, *lbns_remfree, fs->fs_bsize, 0, 0,
1141		    GB_NOCREAT | GB_UNMAPPED);
1142		if (bp != NULL) {
1143			panic("zombie2 %jd %ju %ju",
1144			    (intmax_t)bp->b_lblkno, (uintmax_t)bp->b_blkno,
1145			    (uintmax_t)fsbtodb(fs, *blkp));
1146		}
1147		lbns_remfree++;
1148#endif
1149		ffs_blkfree(ump, fs, ump->um_devvp, *blkp, fs->fs_bsize,
1150		    ip->i_number, vp->v_type, NULL);
1151	}
1152	return (error);
1153}
1154