ext2_alloc.c revision 325750
11592Srgrimes/*-
250476Speter *  modified for Lites 1.1
31592Srgrimes *
4321267Sngie *  Aug 1995, Godmar Back (gback@cs.utah.edu)
564567Sgshapiro *  University of Utah, Department of Computer Science
638099Speter */
71592Srgrimes/*-
864567Sgshapiro * Copyright (c) 1982, 1986, 1989, 1993
974814Sru *	The Regents of the University of California.  All rights reserved.
1090798Sgshapiro *
11201380Sed * Redistribution and use in source and binary forms, with or without
12201380Sed * modification, are permitted provided that the following conditions
1390164Skris * are met:
141592Srgrimes * 1. Redistributions of source code must retain the above copyright
15147225Sdes *    notice, this list of conditions and the following disclaimer.
16147225Sdes * 2. Redistributions in binary form must reproduce the above copyright
1764567Sgshapiro *    notice, this list of conditions and the following disclaimer in the
1890798Sgshapiro *    documentation and/or other materials provided with the distribution.
1990798Sgshapiro * 4. Neither the name of the University nor the names of its contributors
2064567Sgshapiro *    may be used to endorse or promote products derived from this software
2190798Sgshapiro *    without specific prior written permission.
2290798Sgshapiro *
2390798Sgshapiro * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
2465970Sgshapiro * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
2565970Sgshapiro * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
2666961Sgshapiro * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
2766961Sgshapiro * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2865970Sgshapiro * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2965970Sgshapiro * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
3090798Sgshapiro * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
3190798Sgshapiro * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
3290798Sgshapiro * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
331592Srgrimes * SUCH DAMAGE.
34 *
35 *	@(#)ffs_alloc.c	8.8 (Berkeley) 2/21/94
36 * $FreeBSD: stable/10/sys/fs/ext2fs/ext2_alloc.c 325750 2017-11-13 02:31:25Z pfg $
37 */
38
39#include <sys/param.h>
40#include <sys/systm.h>
41#include <sys/conf.h>
42#include <sys/vnode.h>
43#include <sys/stat.h>
44#include <sys/mount.h>
45#include <sys/sysctl.h>
46#include <sys/syslog.h>
47#include <sys/buf.h>
48
49#include <fs/ext2fs/fs.h>
50#include <fs/ext2fs/inode.h>
51#include <fs/ext2fs/ext2_mount.h>
52#include <fs/ext2fs/ext2fs.h>
53#include <fs/ext2fs/ext2_extern.h>
54
55static daddr_t	ext2_alloccg(struct inode *, int, daddr_t, int);
56static daddr_t	ext2_clusteralloc(struct inode *, int, daddr_t, int);
57static u_long	ext2_dirpref(struct inode *);
58static void	ext2_fserr(struct m_ext2fs *, uid_t, char *);
59static u_long	ext2_hashalloc(struct inode *, int, long, int,
60				daddr_t (*)(struct inode *, int, daddr_t,
61						int));
62static daddr_t	ext2_nodealloccg(struct inode *, int, daddr_t, int);
63static daddr_t  ext2_mapsearch(struct m_ext2fs *, char *, daddr_t);
64
65/*
66 * Allocate a block in the filesystem.
67 *
68 * A preference may be optionally specified. If a preference is given
69 * the following hierarchy is used to allocate a block:
70 *   1) allocate the requested block.
71 *   2) allocate a rotationally optimal block in the same cylinder.
72 *   3) allocate a block in the same cylinder group.
73 *   4) quadradically rehash into other cylinder groups, until an
74 *        available block is located.
75 * If no block preference is given the following hierarchy is used
76 * to allocate a block:
77 *   1) allocate a block in the cylinder group that contains the
78 *        inode for the file.
79 *   2) quadradically rehash into other cylinder groups, until an
80 *        available block is located.
81 */
82int
83ext2_alloc(struct inode *ip, daddr_t lbn, e4fs_daddr_t bpref, int size,
84    struct ucred *cred, e4fs_daddr_t *bnp)
85{
86	struct m_ext2fs *fs;
87	struct ext2mount *ump;
88	int32_t bno;
89	int cg;
90
91	*bnp = 0;
92	fs = ip->i_e2fs;
93	ump = ip->i_ump;
94	mtx_assert(EXT2_MTX(ump), MA_OWNED);
95#ifdef INVARIANTS
96	if ((u_int)size > fs->e2fs_bsize || blkoff(fs, size) != 0) {
97		vn_printf(ip->i_devvp, "bsize = %lu, size = %d, fs = %s\n",
98		    (long unsigned int)fs->e2fs_bsize, size, fs->e2fs_fsmnt);
99		panic("ext2_alloc: bad size");
100	}
101	if (cred == NOCRED)
102		panic("ext2_alloc: missing credential");
103#endif		/* INVARIANTS */
104	if (size == fs->e2fs_bsize && fs->e2fs->e2fs_fbcount == 0)
105		goto nospace;
106	if (cred->cr_uid != 0 &&
107	    fs->e2fs->e2fs_fbcount < fs->e2fs->e2fs_rbcount)
108		goto nospace;
109	if (bpref >= fs->e2fs->e2fs_bcount)
110		bpref = 0;
111	if (bpref == 0)
112		cg = ino_to_cg(fs, ip->i_number);
113	else
114		cg = dtog(fs, bpref);
115	bno = (daddr_t)ext2_hashalloc(ip, cg, bpref, fs->e2fs_bsize,
116	    ext2_alloccg);
117	if (bno > 0) {
118		/* set next_alloc fields as done in block_getblk */
119		ip->i_next_alloc_block = lbn;
120		ip->i_next_alloc_goal = bno;
121
122		ip->i_blocks += btodb(fs->e2fs_bsize);
123		ip->i_flag |= IN_CHANGE | IN_UPDATE;
124		*bnp = bno;
125		return (0);
126	}
127nospace:
128	EXT2_UNLOCK(ump);
129	ext2_fserr(fs, cred->cr_uid, "filesystem full");
130	uprintf("\n%s: write failed, filesystem is full\n", fs->e2fs_fsmnt);
131	return (ENOSPC);
132}
133
134/*
135 * Reallocate a sequence of blocks into a contiguous sequence of blocks.
136 *
137 * The vnode and an array of buffer pointers for a range of sequential
138 * logical blocks to be made contiguous is given. The allocator attempts
139 * to find a range of sequential blocks starting as close as possible to
140 * an fs_rotdelay offset from the end of the allocation for the logical
141 * block immediately preceding the current range. If successful, the
142 * physical block numbers in the buffer pointers and in the inode are
143 * changed to reflect the new allocation. If unsuccessful, the allocation
144 * is left unchanged. The success in doing the reallocation is returned.
145 * Note that the error return is not reflected back to the user. Rather
146 * the previous block allocation will be used.
147 */
148
149static SYSCTL_NODE(_vfs, OID_AUTO, ext2fs, CTLFLAG_RW, 0, "EXT2FS filesystem");
150
151static int doasyncfree = 1;
152
153SYSCTL_INT(_vfs_ext2fs, OID_AUTO, doasyncfree, CTLFLAG_RW, &doasyncfree, 0,
154    "Use asychronous writes to update block pointers when freeing blocks");
155
156static int doreallocblks = 0;
157
158SYSCTL_INT(_vfs_ext2fs, OID_AUTO, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "");
159
160int
161ext2_reallocblks(struct vop_reallocblks_args *ap)
162{
163	struct m_ext2fs *fs;
164	struct inode *ip;
165	struct vnode *vp;
166	struct buf *sbp, *ebp;
167	uint32_t *bap, *sbap, *ebap;
168	struct ext2mount *ump;
169	struct cluster_save *buflist;
170	struct indir start_ap[NIADDR + 1], end_ap[NIADDR + 1], *idp;
171	e2fs_lbn_t start_lbn, end_lbn;
172	int soff;
173	e2fs_daddr_t newblk, blkno;
174	int i, len, start_lvl, end_lvl, pref, ssize;
175
176	if (doreallocblks == 0)
177		return (ENOSPC);
178
179	vp = ap->a_vp;
180	ip = VTOI(vp);
181	fs = ip->i_e2fs;
182	ump = ip->i_ump;
183
184	if (fs->e2fs_contigsumsize <= 0)
185		return (ENOSPC);
186
187	buflist = ap->a_buflist;
188	len = buflist->bs_nchildren;
189	start_lbn = buflist->bs_children[0]->b_lblkno;
190	end_lbn = start_lbn + len - 1;
191#ifdef INVARIANTS
192	for (i = 1; i < len; i++)
193		if (buflist->bs_children[i]->b_lblkno != start_lbn + i)
194			panic("ext2_reallocblks: non-cluster");
195#endif
196	/*
197	 * If the cluster crosses the boundary for the first indirect
198	 * block, leave space for the indirect block. Indirect blocks
199	 * are initially laid out in a position after the last direct
200	 * block. Block reallocation would usually destroy locality by
201	 * moving the indirect block out of the way to make room for
202	 * data blocks if we didn't compensate here. We should also do
203	 * this for other indirect block boundaries, but it is only
204	 * important for the first one.
205	 */
206	if (start_lbn < NDADDR && end_lbn >= NDADDR)
207		return (ENOSPC);
208	/*
209	 * If the latest allocation is in a new cylinder group, assume that
210	 * the filesystem has decided to move and do not force it back to
211	 * the previous cylinder group.
212	 */
213	if (dtog(fs, dbtofsb(fs, buflist->bs_children[0]->b_blkno)) !=
214	    dtog(fs, dbtofsb(fs, buflist->bs_children[len - 1]->b_blkno)))
215		return (ENOSPC);
216	if (ext2_getlbns(vp, start_lbn, start_ap, &start_lvl) ||
217	    ext2_getlbns(vp, end_lbn, end_ap, &end_lvl))
218		return (ENOSPC);
219	/*
220	 * Get the starting offset and block map for the first block.
221	 */
222	if (start_lvl == 0) {
223		sbap = &ip->i_db[0];
224		soff = start_lbn;
225	} else {
226		idp = &start_ap[start_lvl - 1];
227		if (bread(vp, idp->in_lbn, (int)fs->e2fs_bsize, NOCRED, &sbp)) {
228			brelse(sbp);
229			return (ENOSPC);
230		}
231		sbap = (u_int *)sbp->b_data;
232		soff = idp->in_off;
233	}
234	/*
235	 * If the block range spans two block maps, get the second map.
236	 */
237	ebap = NULL;
238	if (end_lvl == 0 || (idp = &end_ap[end_lvl - 1])->in_off + 1 >= len) {
239		ssize = len;
240	} else {
241#ifdef INVARIANTS
242		if (start_ap[start_lvl - 1].in_lbn == idp->in_lbn)
243			panic("ext2_reallocblks: start == end");
244#endif
245		ssize = len - (idp->in_off + 1);
246		if (bread(vp, idp->in_lbn, (int)fs->e2fs_bsize, NOCRED, &ebp))
247			goto fail;
248		ebap = (u_int *)ebp->b_data;
249	}
250	/*
251	 * Find the preferred location for the cluster.
252	 */
253	EXT2_LOCK(ump);
254	pref = ext2_blkpref(ip, start_lbn, soff, sbap, 0);
255	/*
256	 * Search the block map looking for an allocation of the desired size.
257	 */
258	if ((newblk = (e2fs_daddr_t)ext2_hashalloc(ip, dtog(fs, pref), pref,
259	    len, ext2_clusteralloc)) == 0) {
260		EXT2_UNLOCK(ump);
261		goto fail;
262	}
263	/*
264	 * We have found a new contiguous block.
265	 *
266	 * First we have to replace the old block pointers with the new
267	 * block pointers in the inode and indirect blocks associated
268	 * with the file.
269	 */
270#ifdef DEBUG
271	printf("realloc: ino %d, lbns %jd-%jd\n\told:", ip->i_number,
272	    (intmax_t)start_lbn, (intmax_t)end_lbn);
273#endif	/* DEBUG */
274	blkno = newblk;
275	for (bap = &sbap[soff], i = 0; i < len; i++, blkno += fs->e2fs_fpb) {
276		if (i == ssize) {
277			bap = ebap;
278			soff = -i;
279		}
280#ifdef INVARIANTS
281		if (buflist->bs_children[i]->b_blkno != fsbtodb(fs, *bap))
282			panic("ext2_reallocblks: alloc mismatch");
283#endif
284#ifdef DEBUG
285		printf(" %d,", *bap);
286#endif	/* DEBUG */
287		*bap++ = blkno;
288	}
289	/*
290	 * Next we must write out the modified inode and indirect blocks.
291	 * For strict correctness, the writes should be synchronous since
292	 * the old block values may have been written to disk. In practise
293	 * they are almost never written, but if we are concerned about
294	 * strict correctness, the `doasyncfree' flag should be set to zero.
295	 *
296	 * The test on `doasyncfree' should be changed to test a flag
297	 * that shows whether the associated buffers and inodes have
298	 * been written. The flag should be set when the cluster is
299	 * started and cleared whenever the buffer or inode is flushed.
300	 * We can then check below to see if it is set, and do the
301	 * synchronous write only when it has been cleared.
302	 */
303	if (sbap != &ip->i_db[0]) {
304		if (doasyncfree)
305			bdwrite(sbp);
306		else
307			bwrite(sbp);
308	} else {
309		ip->i_flag |= IN_CHANGE | IN_UPDATE;
310		if (!doasyncfree)
311			ext2_update(vp, 1);
312	}
313	if (ssize < len) {
314		if (doasyncfree)
315			bdwrite(ebp);
316		else
317			bwrite(ebp);
318	}
319	/*
320	 * Last, free the old blocks and assign the new blocks to the buffers.
321	 */
322#ifdef DEBUG
323	printf("\n\tnew:");
324#endif	/* DEBUG */
325	for (blkno = newblk, i = 0; i < len; i++, blkno += fs->e2fs_fpb) {
326		ext2_blkfree(ip, dbtofsb(fs, buflist->bs_children[i]->b_blkno),
327		    fs->e2fs_bsize);
328		buflist->bs_children[i]->b_blkno = fsbtodb(fs, blkno);
329#ifdef DEBUG
330		printf(" %d,", blkno);
331#endif	/* DEBUG */
332	}
333#ifdef DEBUG
334	printf("\n");
335#endif	/* DEBUG */
336	return (0);
337
338fail:
339	if (ssize < len)
340		brelse(ebp);
341	if (sbap != &ip->i_db[0])
342		brelse(sbp);
343	return (ENOSPC);
344}
345
346/*
347 * Allocate an inode in the filesystem.
348 *
349 */
350int
351ext2_valloc(struct vnode *pvp, int mode, struct ucred *cred, struct vnode **vpp)
352{
353	struct timespec ts;
354	struct inode *pip;
355	struct m_ext2fs *fs;
356	struct inode *ip;
357	struct ext2mount *ump;
358	ino_t ino, ipref;
359	int i, error, cg;
360
361	*vpp = NULL;
362	pip = VTOI(pvp);
363	fs = pip->i_e2fs;
364	ump = pip->i_ump;
365
366	EXT2_LOCK(ump);
367	if (fs->e2fs->e2fs_ficount == 0)
368		goto noinodes;
369	/*
370	 * If it is a directory then obtain a cylinder group based on
371	 * ext2_dirpref else obtain it using ino_to_cg. The preferred inode is
372	 * always the next inode.
373	 */
374	if ((mode & IFMT) == IFDIR) {
375		cg = ext2_dirpref(pip);
376		if (fs->e2fs_contigdirs[cg] < 255)
377			fs->e2fs_contigdirs[cg]++;
378	} else {
379		cg = ino_to_cg(fs, pip->i_number);
380		if (fs->e2fs_contigdirs[cg] > 0)
381			fs->e2fs_contigdirs[cg]--;
382	}
383	ipref = cg * fs->e2fs->e2fs_ipg + 1;
384	ino = (ino_t)ext2_hashalloc(pip, cg, (long)ipref, mode, ext2_nodealloccg);
385
386	if (ino == 0)
387		goto noinodes;
388	error = VFS_VGET(pvp->v_mount, ino, LK_EXCLUSIVE, vpp);
389	if (error) {
390		ext2_vfree(pvp, ino, mode);
391		return (error);
392	}
393	ip = VTOI(*vpp);
394
395	/*
396	 * The question is whether using VGET was such good idea at all:
397	 * Linux doesn't read the old inode in when it is allocating a
398	 * new one. I will set at least i_size and i_blocks to zero.
399	 */
400	ip->i_flag = 0;
401	ip->i_size = 0;
402	ip->i_blocks = 0;
403	ip->i_mode = 0;
404	ip->i_flags = 0;
405	/* now we want to make sure that the block pointers are zeroed out */
406	for (i = 0; i < NDADDR; i++)
407		ip->i_db[i] = 0;
408	for (i = 0; i < NIADDR; i++)
409		ip->i_ib[i] = 0;
410
411	/*
412	 * Set up a new generation number for this inode.
413	 */
414	ip->i_gen = arc4random();
415
416	vfs_timestamp(&ts);
417	ip->i_birthtime = ts.tv_sec;
418	ip->i_birthnsec = ts.tv_nsec;
419
420/*
421printf("ext2_valloc: allocated inode %d\n", ino);
422*/
423	return (0);
424noinodes:
425	EXT2_UNLOCK(ump);
426	ext2_fserr(fs, cred->cr_uid, "out of inodes");
427	uprintf("\n%s: create/symlink failed, no inodes free\n", fs->e2fs_fsmnt);
428	return (ENOSPC);
429}
430
431/*
432 * Find a cylinder to place a directory.
433 *
434 * The policy implemented by this algorithm is to allocate a
435 * directory inode in the same cylinder group as its parent
436 * directory, but also to reserve space for its files inodes
437 * and data. Restrict the number of directories which may be
438 * allocated one after another in the same cylinder group
439 * without intervening allocation of files.
440 *
441 * If we allocate a first level directory then force allocation
442 * in another cylinder group.
443 *
444 */
445static u_long
446ext2_dirpref(struct inode *pip)
447{
448	struct m_ext2fs *fs;
449	int cg, prefcg, cgsize;
450	u_int avgifree, avgbfree, avgndir, curdirsize;
451	u_int minifree, minbfree, maxndir;
452	u_int mincg, minndir;
453	u_int dirsize, maxcontigdirs;
454
455	mtx_assert(EXT2_MTX(pip->i_ump), MA_OWNED);
456	fs = pip->i_e2fs;
457
458	avgifree = fs->e2fs->e2fs_ficount / fs->e2fs_gcount;
459	avgbfree = fs->e2fs->e2fs_fbcount / fs->e2fs_gcount;
460	avgndir = fs->e2fs_total_dir / fs->e2fs_gcount;
461
462	/*
463	 * Force allocation in another cg if creating a first level dir.
464	 */
465	ASSERT_VOP_LOCKED(ITOV(pip), "ext2fs_dirpref");
466	if (ITOV(pip)->v_vflag & VV_ROOT) {
467		prefcg = arc4random() % fs->e2fs_gcount;
468		mincg = prefcg;
469		minndir = fs->e2fs_ipg;
470		for (cg = prefcg; cg < fs->e2fs_gcount; cg++)
471			if (fs->e2fs_gd[cg].ext2bgd_ndirs < minndir &&
472			    fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree &&
473			    fs->e2fs_gd[cg].ext2bgd_nbfree >= avgbfree) {
474				mincg = cg;
475				minndir = fs->e2fs_gd[cg].ext2bgd_ndirs;
476			}
477		for (cg = 0; cg < prefcg; cg++)
478			if (fs->e2fs_gd[cg].ext2bgd_ndirs < minndir &&
479			    fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree &&
480			    fs->e2fs_gd[cg].ext2bgd_nbfree >= avgbfree) {
481				mincg = cg;
482				minndir = fs->e2fs_gd[cg].ext2bgd_ndirs;
483			}
484		return (mincg);
485	}
486	/*
487	 * Count various limits which used for
488	 * optimal allocation of a directory inode.
489	 */
490	maxndir = min(avgndir + fs->e2fs_ipg / 16, fs->e2fs_ipg);
491	minifree = avgifree - avgifree / 4;
492	if (minifree < 1)
493		minifree = 1;
494	minbfree = avgbfree - avgbfree / 4;
495	if (minbfree < 1)
496		minbfree = 1;
497	cgsize = fs->e2fs_fsize * fs->e2fs_fpg;
498	dirsize = AVGDIRSIZE;
499	curdirsize = avgndir ? (cgsize - avgbfree * fs->e2fs_bsize) / avgndir : 0;
500	if (dirsize < curdirsize)
501		dirsize = curdirsize;
502	maxcontigdirs = min((avgbfree * fs->e2fs_bsize) / dirsize, 255);
503	maxcontigdirs = min(maxcontigdirs, fs->e2fs_ipg / AFPDIR);
504	if (maxcontigdirs == 0)
505		maxcontigdirs = 1;
506
507	/*
508	 * Limit number of dirs in one cg and reserve space for
509	 * regular files, but only if we have no deficit in
510	 * inodes or space.
511	 */
512	prefcg = ino_to_cg(fs, pip->i_number);
513	for (cg = prefcg; cg < fs->e2fs_gcount; cg++)
514		if (fs->e2fs_gd[cg].ext2bgd_ndirs < maxndir &&
515		    fs->e2fs_gd[cg].ext2bgd_nifree >= minifree &&
516		    fs->e2fs_gd[cg].ext2bgd_nbfree >= minbfree) {
517			if (fs->e2fs_contigdirs[cg] < maxcontigdirs)
518				return (cg);
519		}
520	for (cg = 0; cg < prefcg; cg++)
521		if (fs->e2fs_gd[cg].ext2bgd_ndirs < maxndir &&
522		    fs->e2fs_gd[cg].ext2bgd_nifree >= minifree &&
523		    fs->e2fs_gd[cg].ext2bgd_nbfree >= minbfree) {
524			if (fs->e2fs_contigdirs[cg] < maxcontigdirs)
525				return (cg);
526		}
527	/*
528	 * This is a backstop when we have deficit in space.
529	 */
530	for (cg = prefcg; cg < fs->e2fs_gcount; cg++)
531		if (fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree)
532			return (cg);
533	for (cg = 0; cg < prefcg; cg++)
534		if (fs->e2fs_gd[cg].ext2bgd_nifree >= avgifree)
535			break;
536	return (cg);
537}
538
539/*
540 * Select the desired position for the next block in a file.
541 *
542 * we try to mimic what Remy does in inode_getblk/block_getblk
543 *
544 * we note: blocknr == 0 means that we're about to allocate either
545 * a direct block or a pointer block at the first level of indirection
546 * (In other words, stuff that will go in i_db[] or i_ib[])
547 *
548 * blocknr != 0 means that we're allocating a block that is none
549 * of the above. Then, blocknr tells us the number of the block
550 * that will hold the pointer
551 */
552e4fs_daddr_t
553ext2_blkpref(struct inode *ip, e2fs_lbn_t lbn, int indx, e2fs_daddr_t *bap,
554    e2fs_daddr_t blocknr)
555{
556	int tmp;
557
558	mtx_assert(EXT2_MTX(ip->i_ump), MA_OWNED);
559
560	/*
561	 * If the next block is actually what we thought it is, then set the
562	 * goal to what we thought it should be.
563	 */
564	if (ip->i_next_alloc_block == lbn && ip->i_next_alloc_goal != 0)
565		return ip->i_next_alloc_goal;
566
567	/*
568	 * Now check whether we were provided with an array that basically
569	 * tells us previous blocks to which we want to stay close.
570	 */
571	if (bap)
572		for (tmp = indx - 1; tmp >= 0; tmp--)
573			if (bap[tmp])
574				return bap[tmp];
575
576	/*
577	 * Else lets fall back to the blocknr or, if there is none, follow
578	 * the rule that a block should be allocated near its inode.
579	 */
580	return blocknr ? blocknr :
581	    (e2fs_daddr_t)(ip->i_block_group *
582	    EXT2_BLOCKS_PER_GROUP(ip->i_e2fs)) +
583	    ip->i_e2fs->e2fs->e2fs_first_dblock;
584}
585
586/*
587 * Implement the cylinder overflow algorithm.
588 *
589 * The policy implemented by this algorithm is:
590 *   1) allocate the block in its requested cylinder group.
591 *   2) quadradically rehash on the cylinder group number.
592 *   3) brute force search for a free block.
593 */
594static u_long
595ext2_hashalloc(struct inode *ip, int cg, long pref, int size,
596    daddr_t (*allocator) (struct inode *, int, daddr_t, int))
597{
598	struct m_ext2fs *fs;
599	ino_t result;
600	int i, icg = cg;
601
602	mtx_assert(EXT2_MTX(ip->i_ump), MA_OWNED);
603	fs = ip->i_e2fs;
604	/*
605	 * 1: preferred cylinder group
606	 */
607	result = (*allocator)(ip, cg, pref, size);
608	if (result)
609		return (result);
610	/*
611	 * 2: quadratic rehash
612	 */
613	for (i = 1; i < fs->e2fs_gcount; i *= 2) {
614		cg += i;
615		if (cg >= fs->e2fs_gcount)
616			cg -= fs->e2fs_gcount;
617		result = (*allocator)(ip, cg, 0, size);
618		if (result)
619			return (result);
620	}
621	/*
622	 * 3: brute force search
623	 * Note that we start at i == 2, since 0 was checked initially,
624	 * and 1 is always checked in the quadratic rehash.
625	 */
626	cg = (icg + 2) % fs->e2fs_gcount;
627	for (i = 2; i < fs->e2fs_gcount; i++) {
628		result = (*allocator)(ip, cg, 0, size);
629		if (result)
630			return (result);
631		cg++;
632		if (cg == fs->e2fs_gcount)
633			cg = 0;
634	}
635	return (0);
636}
637
638/*
639 * Determine whether a block can be allocated.
640 *
641 * Check to see if a block of the appropriate size is available,
642 * and if it is, allocate it.
643 */
644static daddr_t
645ext2_alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
646{
647	struct m_ext2fs *fs;
648	struct buf *bp;
649	struct ext2mount *ump;
650	daddr_t bno, runstart, runlen;
651	int bit, loc, end, error, start;
652	char *bbp;
653	/* XXX ondisk32 */
654	fs = ip->i_e2fs;
655	ump = ip->i_ump;
656	if (fs->e2fs_gd[cg].ext2bgd_nbfree == 0)
657		return (0);
658	EXT2_UNLOCK(ump);
659	error = bread(ip->i_devvp, fsbtodb(fs,
660	    fs->e2fs_gd[cg].ext2bgd_b_bitmap),
661	    (int)fs->e2fs_bsize, NOCRED, &bp);
662	if (error) {
663		brelse(bp);
664		EXT2_LOCK(ump);
665		return (0);
666	}
667	if (fs->e2fs_gd[cg].ext2bgd_nbfree == 0) {
668		/*
669		 * Another thread allocated the last block in this
670		 * group while we were waiting for the buffer.
671		 */
672		brelse(bp);
673		EXT2_LOCK(ump);
674		return (0);
675	}
676	bbp = (char *)bp->b_data;
677
678	if (dtog(fs, bpref) != cg)
679		bpref = 0;
680	if (bpref != 0) {
681		bpref = dtogd(fs, bpref);
682		/*
683		 * if the requested block is available, use it
684		 */
685		if (isclr(bbp, bpref)) {
686			bno = bpref;
687			goto gotit;
688		}
689	}
690	/*
691	 * no blocks in the requested cylinder, so take next
692	 * available one in this cylinder group.
693	 * first try to get 8 contigous blocks, then fall back to a single
694	 * block.
695	 */
696	if (bpref)
697		start = dtogd(fs, bpref) / NBBY;
698	else
699		start = 0;
700	end = howmany(fs->e2fs->e2fs_fpg, NBBY) - start;
701retry:
702	runlen = 0;
703	runstart = 0;
704	for (loc = start; loc < end; loc++) {
705		if (bbp[loc] == (char)0xff) {
706			runlen = 0;
707			continue;
708		}
709
710		/* Start of a run, find the number of high clear bits. */
711		if (runlen == 0) {
712			bit = fls(bbp[loc]);
713			runlen = NBBY - bit;
714			runstart = loc * NBBY + bit;
715		} else if (bbp[loc] == 0) {
716			/* Continue a run. */
717			runlen += NBBY;
718		} else {
719			/*
720			 * Finish the current run.  If it isn't long
721			 * enough, start a new one.
722			 */
723			bit = ffs(bbp[loc]) - 1;
724			runlen += bit;
725			if (runlen >= 8) {
726				bno = runstart;
727				goto gotit;
728			}
729
730			/* Run was too short, start a new one. */
731			bit = fls(bbp[loc]);
732			runlen = NBBY - bit;
733			runstart = loc * NBBY + bit;
734		}
735
736		/* If the current run is long enough, use it. */
737		if (runlen >= 8) {
738			bno = runstart;
739			goto gotit;
740		}
741	}
742	if (start != 0) {
743		end = start;
744		start = 0;
745		goto retry;
746	}
747	bno = ext2_mapsearch(fs, bbp, bpref);
748	if (bno < 0) {
749		brelse(bp);
750		EXT2_LOCK(ump);
751		return (0);
752	}
753gotit:
754#ifdef INVARIANTS
755	if (isset(bbp, bno)) {
756		printf("ext2fs_alloccgblk: cg=%d bno=%jd fs=%s\n",
757		    cg, (intmax_t)bno, fs->e2fs_fsmnt);
758		panic("ext2fs_alloccg: dup alloc");
759	}
760#endif
761	setbit(bbp, bno);
762	EXT2_LOCK(ump);
763	ext2_clusteracct(fs, bbp, cg, bno, -1);
764	fs->e2fs->e2fs_fbcount--;
765	fs->e2fs_gd[cg].ext2bgd_nbfree--;
766	fs->e2fs_fmod = 1;
767	EXT2_UNLOCK(ump);
768	bdwrite(bp);
769	return (cg * fs->e2fs->e2fs_fpg + fs->e2fs->e2fs_first_dblock + bno);
770}
771
772/*
773 * Determine whether a cluster can be allocated.
774 */
775static daddr_t
776ext2_clusteralloc(struct inode *ip, int cg, daddr_t bpref, int len)
777{
778	struct m_ext2fs *fs;
779	struct ext2mount *ump;
780	struct buf *bp;
781	char *bbp;
782	int bit, error, got, i, loc, run;
783	int32_t *lp;
784	daddr_t bno;
785
786	fs = ip->i_e2fs;
787	ump = ip->i_ump;
788
789	if (fs->e2fs_maxcluster[cg] < len)
790		return (0);
791
792	EXT2_UNLOCK(ump);
793	error = bread(ip->i_devvp,
794	    fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_b_bitmap),
795	    (int)fs->e2fs_bsize, NOCRED, &bp);
796	if (error)
797		goto fail_lock;
798
799	bbp = (char *)bp->b_data;
800	EXT2_LOCK(ump);
801	/*
802	 * Check to see if a cluster of the needed size (or bigger) is
803	 * available in this cylinder group.
804	 */
805	lp = &fs->e2fs_clustersum[cg].cs_sum[len];
806	for (i = len; i <= fs->e2fs_contigsumsize; i++)
807		if (*lp++ > 0)
808			break;
809	if (i > fs->e2fs_contigsumsize) {
810		/*
811		 * Update the cluster summary information to reflect
812		 * the true maximum-sized cluster so that future cluster
813		 * allocation requests can avoid reading the bitmap only
814		 * to find no cluster.
815		 */
816		lp = &fs->e2fs_clustersum[cg].cs_sum[len - 1];
817		for (i = len - 1; i > 0; i--)
818			if (*lp-- > 0)
819				break;
820		fs->e2fs_maxcluster[cg] = i;
821		goto fail;
822	}
823	EXT2_UNLOCK(ump);
824
825	/* Search the bitmap to find a big enough cluster like in FFS. */
826	if (dtog(fs, bpref) != cg)
827		bpref = 0;
828	if (bpref != 0)
829		bpref = dtogd(fs, bpref);
830	loc = bpref / NBBY;
831	bit = 1 << (bpref % NBBY);
832	for (run = 0, got = bpref; got < fs->e2fs->e2fs_fpg; got++) {
833		if ((bbp[loc] & bit) != 0)
834			run = 0;
835		else {
836			run++;
837			if (run == len)
838				break;
839		}
840		if ((got & (NBBY - 1)) != (NBBY - 1))
841			bit <<= 1;
842		else {
843			loc++;
844			bit = 1;
845		}
846	}
847
848	if (got >= fs->e2fs->e2fs_fpg)
849		goto fail_lock;
850
851	/* Allocate the cluster that we found. */
852	for (i = 1; i < len; i++)
853		if (!isclr(bbp, got - run + i))
854			panic("ext2_clusteralloc: map mismatch");
855
856	bno = got - run + 1;
857	if (bno >= fs->e2fs->e2fs_fpg)
858		panic("ext2_clusteralloc: allocated out of group");
859
860	EXT2_LOCK(ump);
861	for (i = 0; i < len; i += fs->e2fs_fpb) {
862		setbit(bbp, bno + i);
863		ext2_clusteracct(fs, bbp, cg, bno + i, -1);
864		fs->e2fs->e2fs_fbcount--;
865		fs->e2fs_gd[cg].ext2bgd_nbfree--;
866	}
867	fs->e2fs_fmod = 1;
868	EXT2_UNLOCK(ump);
869
870	bdwrite(bp);
871	return (cg * fs->e2fs->e2fs_fpg + fs->e2fs->e2fs_first_dblock + bno);
872
873fail_lock:
874	EXT2_LOCK(ump);
875fail:
876	brelse(bp);
877	return (0);
878}
879
880/*
881 * Determine whether an inode can be allocated.
882 *
883 * Check to see if an inode is available, and if it is,
884 * allocate it using tode in the specified cylinder group.
885 */
886static daddr_t
887ext2_nodealloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
888{
889	struct m_ext2fs *fs;
890	struct buf *bp;
891	struct ext2mount *ump;
892	int error, start, len;
893	char *ibp, *loc;
894
895	ipref--;	/* to avoid a lot of (ipref -1) */
896	if (ipref == -1)
897		ipref = 0;
898	fs = ip->i_e2fs;
899	ump = ip->i_ump;
900	if (fs->e2fs_gd[cg].ext2bgd_nifree == 0)
901		return (0);
902	EXT2_UNLOCK(ump);
903	error = bread(ip->i_devvp, fsbtodb(fs,
904	    fs->e2fs_gd[cg].ext2bgd_i_bitmap),
905	    (int)fs->e2fs_bsize, NOCRED, &bp);
906	if (error) {
907		brelse(bp);
908		EXT2_LOCK(ump);
909		return (0);
910	}
911	if (fs->e2fs_gd[cg].ext2bgd_nifree == 0) {
912		/*
913		 * Another thread allocated the last i-node in this
914		 * group while we were waiting for the buffer.
915		 */
916		brelse(bp);
917		EXT2_LOCK(ump);
918		return (0);
919	}
920	ibp = (char *)bp->b_data;
921	if (ipref) {
922		ipref %= fs->e2fs->e2fs_ipg;
923		if (isclr(ibp, ipref))
924			goto gotit;
925	}
926	start = ipref / NBBY;
927	len = howmany(fs->e2fs->e2fs_ipg - ipref, NBBY);
928	loc = memcchr(&ibp[start], 0xff, len);
929	if (loc == NULL) {
930		len = start + 1;
931		start = 0;
932		loc = memcchr(&ibp[start], 0xff, len);
933		if (loc == NULL) {
934			printf("cg = %d, ipref = %lld, fs = %s\n",
935			    cg, (long long)ipref, fs->e2fs_fsmnt);
936			panic("ext2fs_nodealloccg: map corrupted");
937			/* NOTREACHED */
938		}
939	}
940	ipref = (loc - ibp) * NBBY + ffs(~*loc) - 1;
941gotit:
942	setbit(ibp, ipref);
943	EXT2_LOCK(ump);
944	fs->e2fs_gd[cg].ext2bgd_nifree--;
945	fs->e2fs->e2fs_ficount--;
946	fs->e2fs_fmod = 1;
947	if ((mode & IFMT) == IFDIR) {
948		fs->e2fs_gd[cg].ext2bgd_ndirs++;
949		fs->e2fs_total_dir++;
950	}
951	EXT2_UNLOCK(ump);
952	bdwrite(bp);
953	return (cg * fs->e2fs->e2fs_ipg + ipref + 1);
954}
955
956/*
957 * Free a block or fragment.
958 *
959 */
960void
961ext2_blkfree(struct inode *ip, e4fs_daddr_t bno, long size)
962{
963	struct m_ext2fs *fs;
964	struct buf *bp;
965	struct ext2mount *ump;
966	int cg, error;
967	char *bbp;
968
969	fs = ip->i_e2fs;
970	ump = ip->i_ump;
971	cg = dtog(fs, bno);
972	if ((u_int)bno >= fs->e2fs->e2fs_bcount) {
973		printf("bad block %lld, ino %llu\n", (long long)bno,
974		    (unsigned long long)ip->i_number);
975		ext2_fserr(fs, ip->i_uid, "bad block");
976		return;
977	}
978	error = bread(ip->i_devvp,
979	    fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_b_bitmap),
980	    (int)fs->e2fs_bsize, NOCRED, &bp);
981	if (error) {
982		brelse(bp);
983		return;
984	}
985	bbp = (char *)bp->b_data;
986	bno = dtogd(fs, bno);
987	if (isclr(bbp, bno)) {
988		printf("block = %lld, fs = %s\n",
989		    (long long)bno, fs->e2fs_fsmnt);
990		panic("ext2_blkfree: freeing free block");
991	}
992	clrbit(bbp, bno);
993	EXT2_LOCK(ump);
994	ext2_clusteracct(fs, bbp, cg, bno, 1);
995	fs->e2fs->e2fs_fbcount++;
996	fs->e2fs_gd[cg].ext2bgd_nbfree++;
997	fs->e2fs_fmod = 1;
998	EXT2_UNLOCK(ump);
999	bdwrite(bp);
1000}
1001
1002/*
1003 * Free an inode.
1004 *
1005 */
1006int
1007ext2_vfree(struct vnode *pvp, ino_t ino, int mode)
1008{
1009	struct m_ext2fs *fs;
1010	struct inode *pip;
1011	struct buf *bp;
1012	struct ext2mount *ump;
1013	int error, cg;
1014	char *ibp;
1015
1016	pip = VTOI(pvp);
1017	fs = pip->i_e2fs;
1018	ump = pip->i_ump;
1019	if ((u_int)ino > fs->e2fs_ipg * fs->e2fs_gcount)
1020		panic("ext2_vfree: range: devvp = %p, ino = %ju, fs = %s",
1021		    pip->i_devvp, (uintmax_t)ino, fs->e2fs_fsmnt);
1022
1023	cg = ino_to_cg(fs, ino);
1024	error = bread(pip->i_devvp,
1025	    fsbtodb(fs, fs->e2fs_gd[cg].ext2bgd_i_bitmap),
1026	    (int)fs->e2fs_bsize, NOCRED, &bp);
1027	if (error) {
1028		brelse(bp);
1029		return (0);
1030	}
1031	ibp = (char *)bp->b_data;
1032	ino = (ino - 1) % fs->e2fs->e2fs_ipg;
1033	if (isclr(ibp, ino)) {
1034		printf("ino = %llu, fs = %s\n",
1035		    (unsigned long long)ino, fs->e2fs_fsmnt);
1036		if (fs->e2fs_ronly == 0)
1037			panic("ext2_vfree: freeing free inode");
1038	}
1039	clrbit(ibp, ino);
1040	EXT2_LOCK(ump);
1041	fs->e2fs->e2fs_ficount++;
1042	fs->e2fs_gd[cg].ext2bgd_nifree++;
1043	if ((mode & IFMT) == IFDIR) {
1044		fs->e2fs_gd[cg].ext2bgd_ndirs--;
1045		fs->e2fs_total_dir--;
1046	}
1047	fs->e2fs_fmod = 1;
1048	EXT2_UNLOCK(ump);
1049	bdwrite(bp);
1050	return (0);
1051}
1052
1053/*
1054 * Find a block in the specified cylinder group.
1055 *
1056 * It is a panic if a request is made to find a block if none are
1057 * available.
1058 */
1059static daddr_t
1060ext2_mapsearch(struct m_ext2fs *fs, char *bbp, daddr_t bpref)
1061{
1062	char *loc;
1063	int start, len;
1064
1065	/*
1066	 * find the fragment by searching through the free block
1067	 * map for an appropriate bit pattern
1068	 */
1069	if (bpref)
1070		start = dtogd(fs, bpref) / NBBY;
1071	else
1072		start = 0;
1073	len = howmany(fs->e2fs->e2fs_fpg, NBBY) - start;
1074	loc = memcchr(&bbp[start], 0xff, len);
1075	if (loc == NULL) {
1076		len = start + 1;
1077		start = 0;
1078		loc = memcchr(&bbp[start], 0xff, len);
1079		if (loc == NULL) {
1080			printf("start = %d, len = %d, fs = %s\n",
1081			    start, len, fs->e2fs_fsmnt);
1082			panic("ext2_mapsearch: map corrupted");
1083			/* NOTREACHED */
1084		}
1085	}
1086	return ((loc - bbp) * NBBY + ffs(~*loc) - 1);
1087}
1088
1089/*
1090 * Fserr prints the name of a filesystem with an error diagnostic.
1091 *
1092 * The form of the error message is:
1093 *	fs: error message
1094 */
1095static void
1096ext2_fserr(struct m_ext2fs *fs, uid_t uid, char *cp)
1097{
1098
1099	log(LOG_ERR, "uid %u on %s: %s\n", uid, fs->e2fs_fsmnt, cp);
1100}
1101
1102int
1103cg_has_sb(int i)
1104{
1105	int a3, a5, a7;
1106
1107	if (i == 0 || i == 1)
1108		return 1;
1109	for (a3 = 3, a5 = 5, a7 = 7;
1110	    a3 <= i || a5 <= i || a7 <= i;
1111	    a3 *= 3, a5 *= 5, a7 *= 7)
1112		if (i == a3 || i == a5 || i == a7)
1113			return 1;
1114	return 0;
1115}
1116