ffs_snapshot.c revision 262779
1/*-
2 * Copyright 2000 Marshall Kirk McKusick. All Rights Reserved.
3 *
4 * Further information about snapshots can be obtained from:
5 *
6 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
7 *	1614 Oxford Street		mckusick@mckusick.com
8 *	Berkeley, CA 94709-1608		+1-510-843-9542
9 *	USA
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 *
15 * 1. Redistributions of source code must retain the above copyright
16 *    notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 *    notice, this list of conditions and the following disclaimer in the
19 *    documentation and/or other materials provided with the distribution.
20 *
21 * THIS SOFTWARE IS PROVIDED BY MARSHALL KIRK MCKUSICK ``AS IS'' AND ANY
22 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 * DISCLAIMED.  IN NO EVENT SHALL MARSHALL KIRK MCKUSICK BE LIABLE FOR
25 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 * SUCH DAMAGE.
32 *
33 *	@(#)ffs_snapshot.c	8.11 (McKusick) 7/23/00
34 */
35
36#include <sys/cdefs.h>
37__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_snapshot.c 262779 2014-03-05 04:23:19Z pfg $");
38
39#include "opt_quota.h"
40
41#include <sys/param.h>
42#include <sys/kernel.h>
43#include <sys/systm.h>
44#include <sys/conf.h>
45#include <sys/bio.h>
46#include <sys/buf.h>
47#include <sys/fcntl.h>
48#include <sys/proc.h>
49#include <sys/namei.h>
50#include <sys/sched.h>
51#include <sys/stat.h>
52#include <sys/malloc.h>
53#include <sys/mount.h>
54#include <sys/resource.h>
55#include <sys/resourcevar.h>
56#include <sys/rwlock.h>
57#include <sys/vnode.h>
58
59#include <geom/geom.h>
60
61#include <ufs/ufs/extattr.h>
62#include <ufs/ufs/quota.h>
63#include <ufs/ufs/ufsmount.h>
64#include <ufs/ufs/inode.h>
65#include <ufs/ufs/ufs_extern.h>
66
67#include <ufs/ffs/fs.h>
68#include <ufs/ffs/ffs_extern.h>
69
70#define KERNCRED thread0.td_ucred
71#define DEBUG 1
72
73#include "opt_ffs.h"
74
75#ifdef NO_FFS_SNAPSHOT
/*
 * Stub compiled when the kernel is configured with NO_FFS_SNAPSHOT:
 * snapshot creation is unavailable, so every request is rejected
 * with EINVAL.
 */
int
ffs_snapshot(struct mount *mp, char *snapfile)
{

	return (EINVAL);
}
83
84int
85ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd)
86	struct fs *fs;
87	struct vnode *devvp;
88	ufs2_daddr_t bno;
89	long size;
90	ino_t inum;
91	enum vtype vtype;
92	struct workhead *wkhd;
93{
94	return (EINVAL);
95}
96
/*
 * Stub compiled when the kernel is configured with NO_FFS_SNAPSHOT:
 * there is no snapshot state to tear down, so this does nothing.
 */
void
ffs_snapremove(struct vnode *vp)
{
}
102
/*
 * Stub compiled when the kernel is configured with NO_FFS_SNAPSHOT:
 * does nothing.
 */
void
ffs_snapshot_mount(struct mount *mp)
{
}
108
/*
 * Stub compiled when the kernel is configured with NO_FFS_SNAPSHOT:
 * does nothing.
 */
void
ffs_snapshot_unmount(struct mount *mp)
{
}
114
/*
 * Stub compiled when the kernel is configured with NO_FFS_SNAPSHOT:
 * does nothing.
 */
void
ffs_snapgone(struct inode *ip)
{
}
120
/*
 * Stub compiled when the kernel is configured with NO_FFS_SNAPSHOT:
 * ignores both arguments and always returns EINVAL.
 */
int
ffs_copyonwrite(struct vnode *devvp, struct buf *bp)
{

	return (EINVAL);
}
128
/*
 * Stub compiled when the kernel is configured with NO_FFS_SNAPSHOT:
 * does nothing.
 */
void
ffs_sync_snap(struct mount *mp, int waitfor)
{
}
135
136#else
/* Register the "ffs_snapshot" kernel feature with its description string. */
FEATURE(ffs_snapshot, "FFS snapshot support");

/*
 * List of struct snapdata, together with the mutex ("snapdata free list")
 * that is initialized for it at boot via MTX_SYSINIT.
 */
LIST_HEAD(, snapdata) snapfree;
static struct mtx snapfree_lock;
MTX_SYSINIT(ffs_snapfree, &snapfree_lock, "snapdata free list", MTX_DEF);

/*
 * Forward declarations of the file-local helpers.  The expunge/accounting
 * routines exist in parallel UFS1 and UFS2 flavours that differ only in
 * the width of the block pointers (ufs1_daddr_t vs. ufs2_daddr_t) they
 * operate on.
 */
static int cgaccount(int, struct vnode *, struct buf *, int);
/* UFS1 variants. */
static int expunge_ufs1(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int, int);
static int indiracct_ufs1(struct vnode *, struct vnode *, int,
    ufs1_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs1(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
    struct fs *, ufs_lbn_t, int);
/* UFS2 variants. */
static int expunge_ufs2(struct vnode *, struct inode *, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int, int);
static int indiracct_ufs2(struct vnode *, struct vnode *, int,
    ufs2_daddr_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, ufs_lbn_t, struct fs *,
    int (*)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *, struct fs *,
    ufs_lbn_t, int), int);
static int fullacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int snapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
static int mapacct_ufs2(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
    struct fs *, ufs_lbn_t, int);
/* Helpers shared by both filesystem formats. */
static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
static void try_free_snapdata(struct vnode *devvp);
static struct snapdata *ffs_snapdata_acquire(struct vnode *devvp);
static int ffs_bp_snapblk(struct vnode *, struct buf *);
174
/*
 * To ensure the consistency of snapshots across crashes, we must
 * synchronously write out copied blocks before allowing the
 * originals to be modified. Because of the rather severe speed
 * penalty that this imposes, the code normally only ensures
 * persistence for the filesystem metadata contained within a
 * snapshot. Setting the following flag allows this crash
 * persistence to be enabled for file contents.
 */
int dopersistence = 0;

/*
 * Run-time tunables, all exported read-write under the "debug" sysctl
 * tree.  They only exist when DEBUG is defined (this file defines it
 * unconditionally near its top).
 */
#ifdef DEBUG
#include <sys/sysctl.h>
/* debug.dopersistence: run-time access to the flag described above. */
SYSCTL_INT(_debug, OID_AUTO, dopersistence, CTLFLAG_RW, &dopersistence, 0, "");
/*
 * debug.snapdebug: when non-zero, ffs_snapshot() vprint()s every busy
 * vnode it examines while the filesystem is suspended.
 */
static int snapdebug = 0;
SYSCTL_INT(_debug, OID_AUTO, snapdebug, CTLFLAG_RW, &snapdebug, 0, "");
/*
 * debug.collectsnapstats: when non-zero, ffs_snapshot() measures how long
 * the filesystem stayed suspended and prints it together with the number
 * of cylinder groups that had to be copied again.
 */
int collectsnapstats = 0;
SYSCTL_INT(_debug, OID_AUTO, collectsnapstats, CTLFLAG_RW, &collectsnapstats,
	0, "");
#endif /* DEBUG */
195
196/*
197 * Create a snapshot file and initialize it for the filesystem.
198 */
199int
200ffs_snapshot(mp, snapfile)
201	struct mount *mp;
202	char *snapfile;
203{
204	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
205	int error, cg, snaploc;
206	int i, size, len, loc;
207	ufs2_daddr_t blockno;
208	uint64_t flag;
209	struct timespec starttime = {0, 0}, endtime;
210	char saved_nice = 0;
211	long redo = 0, snaplistsize = 0;
212	int32_t *lp;
213	void *space;
214	struct fs *copy_fs = NULL, *fs;
215	struct thread *td = curthread;
216	struct inode *ip, *xp;
217	struct buf *bp, *nbp, *ibp;
218	struct nameidata nd;
219	struct mount *wrtmp;
220	struct vattr vat;
221	struct vnode *vp, *xvp, *mvp, *devvp;
222	struct uio auio;
223	struct iovec aiov;
224	struct snapdata *sn;
225	struct ufsmount *ump;
226
227	ump = VFSTOUFS(mp);
228	fs = ump->um_fs;
229	sn = NULL;
230	/*
231	 * At the moment, journaled soft updates cannot support
232	 * taking snapshots.
233	 */
234	if (MOUNTEDSUJ(mp)) {
235		vfs_mount_error(mp, "%s: Snapshots are not yet supported when "
236		    "running with journaled soft updates", fs->fs_fsmnt);
237		return (EOPNOTSUPP);
238	}
239	MNT_ILOCK(mp);
240	flag = mp->mnt_flag;
241	MNT_IUNLOCK(mp);
242	/*
243	 * Need to serialize access to snapshot code per filesystem.
244	 */
245	/*
246	 * Assign a snapshot slot in the superblock.
247	 */
248	UFS_LOCK(ump);
249	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
250		if (fs->fs_snapinum[snaploc] == 0)
251			break;
252	UFS_UNLOCK(ump);
253	if (snaploc == FSMAXSNAP)
254		return (ENOSPC);
255	/*
256	 * Create the snapshot file.
257	 */
258restart:
259	NDINIT(&nd, CREATE, LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, snapfile, td);
260	if ((error = namei(&nd)) != 0)
261		return (error);
262	if (nd.ni_vp != NULL) {
263		vput(nd.ni_vp);
264		error = EEXIST;
265	}
266	if (nd.ni_dvp->v_mount != mp)
267		error = EXDEV;
268	if (error) {
269		NDFREE(&nd, NDF_ONLY_PNBUF);
270		if (nd.ni_dvp == nd.ni_vp)
271			vrele(nd.ni_dvp);
272		else
273			vput(nd.ni_dvp);
274		return (error);
275	}
276	VATTR_NULL(&vat);
277	vat.va_type = VREG;
278	vat.va_mode = S_IRUSR;
279	vat.va_vaflags |= VA_EXCLUSIVE;
280	if (VOP_GETWRITEMOUNT(nd.ni_dvp, &wrtmp))
281		wrtmp = NULL;
282	if (wrtmp != mp)
283		panic("ffs_snapshot: mount mismatch");
284	vfs_rel(wrtmp);
285	if (vn_start_write(NULL, &wrtmp, V_NOWAIT) != 0) {
286		NDFREE(&nd, NDF_ONLY_PNBUF);
287		vput(nd.ni_dvp);
288		if ((error = vn_start_write(NULL, &wrtmp,
289		    V_XSLEEP | PCATCH)) != 0)
290			return (error);
291		goto restart;
292	}
293	error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vat);
294	VOP_UNLOCK(nd.ni_dvp, 0);
295	if (error) {
296		NDFREE(&nd, NDF_ONLY_PNBUF);
297		vn_finished_write(wrtmp);
298		vrele(nd.ni_dvp);
299		return (error);
300	}
301	vp = nd.ni_vp;
302	vp->v_vflag |= VV_SYSTEM;
303	ip = VTOI(vp);
304	devvp = ip->i_devvp;
305	/*
306	 * Allocate and copy the last block contents so as to be able
307	 * to set size to that of the filesystem.
308	 */
309	numblks = howmany(fs->fs_size, fs->fs_frag);
310	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
311	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
312	if (error)
313		goto out;
314	ip->i_size = lblktosize(fs, (off_t)numblks);
315	DIP_SET(ip, i_size, ip->i_size);
316	ip->i_flag |= IN_CHANGE | IN_UPDATE;
317	error = readblock(vp, bp, numblks - 1);
318	bawrite(bp);
319	if (error != 0)
320		goto out;
321	/*
322	 * Preallocate critical data structures so that we can copy
323	 * them in without further allocation after we suspend all
324	 * operations on the filesystem. We would like to just release
325	 * the allocated buffers without writing them since they will
326	 * be filled in below once we are ready to go, but this upsets
327	 * the soft update code, so we go ahead and write the new buffers.
328	 *
329	 * Allocate all indirect blocks and mark all of them as not
330	 * needing to be copied.
331	 */
332	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
333		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
334		    fs->fs_bsize, td->td_ucred, BA_METAONLY, &ibp);
335		if (error)
336			goto out;
337		bawrite(ibp);
338	}
339	/*
340	 * Allocate copies for the superblock and its summary information.
341	 */
342	error = UFS_BALLOC(vp, fs->fs_sblockloc, fs->fs_sbsize, KERNCRED,
343	    0, &nbp);
344	if (error)
345		goto out;
346	bawrite(nbp);
347	blkno = fragstoblks(fs, fs->fs_csaddr);
348	len = howmany(fs->fs_cssize, fs->fs_bsize);
349	for (loc = 0; loc < len; loc++) {
350		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(blkno + loc)),
351		    fs->fs_bsize, KERNCRED, 0, &nbp);
352		if (error)
353			goto out;
354		bawrite(nbp);
355	}
356	/*
357	 * Allocate all cylinder group blocks.
358	 */
359	for (cg = 0; cg < fs->fs_ncg; cg++) {
360		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
361		    fs->fs_bsize, KERNCRED, 0, &nbp);
362		if (error)
363			goto out;
364		bawrite(nbp);
365		if (cg % 10 == 0)
366			ffs_syncvnode(vp, MNT_WAIT, 0);
367	}
368	/*
369	 * Copy all the cylinder group maps. Although the
370	 * filesystem is still active, we hope that only a few
371	 * cylinder groups will change between now and when we
372	 * suspend operations. Thus, we will be able to quickly
373	 * touch up the few cylinder groups that changed during
374	 * the suspension period.
375	 */
376	len = howmany(fs->fs_ncg, NBBY);
377	space = malloc(len, M_DEVBUF, M_WAITOK|M_ZERO);
378	UFS_LOCK(ump);
379	fs->fs_active = space;
380	UFS_UNLOCK(ump);
381	for (cg = 0; cg < fs->fs_ncg; cg++) {
382		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
383		    fs->fs_bsize, KERNCRED, 0, &nbp);
384		if (error)
385			goto out;
386		error = cgaccount(cg, vp, nbp, 1);
387		bawrite(nbp);
388		if (cg % 10 == 0)
389			ffs_syncvnode(vp, MNT_WAIT, 0);
390		if (error)
391			goto out;
392	}
393	/*
394	 * Change inode to snapshot type file.
395	 */
396	ip->i_flags |= SF_SNAPSHOT;
397	DIP_SET(ip, i_flags, ip->i_flags);
398	ip->i_flag |= IN_CHANGE | IN_UPDATE;
399	/*
400	 * Ensure that the snapshot is completely on disk.
401	 * Since we have marked it as a snapshot it is safe to
402	 * unlock it as no process will be allowed to write to it.
403	 */
404	if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
405		goto out;
406	VOP_UNLOCK(vp, 0);
407	/*
408	 * All allocations are done, so we can now snapshot the system.
409	 *
410	 * Recind nice scheduling while running with the filesystem suspended.
411	 */
412	if (td->td_proc->p_nice > 0) {
413		struct proc *p;
414
415		p = td->td_proc;
416		PROC_LOCK(p);
417		saved_nice = p->p_nice;
418		sched_nice(p, 0);
419		PROC_UNLOCK(p);
420	}
421	/*
422	 * Suspend operation on filesystem.
423	 */
424	for (;;) {
425		vn_finished_write(wrtmp);
426		if ((error = vfs_write_suspend(vp->v_mount, 0)) != 0) {
427			vn_start_write(NULL, &wrtmp, V_WAIT);
428			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
429			goto out;
430		}
431		if (mp->mnt_kern_flag & MNTK_SUSPENDED)
432			break;
433		vn_start_write(NULL, &wrtmp, V_WAIT);
434	}
435	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
436	if (ip->i_effnlink == 0) {
437		error = ENOENT;		/* Snapshot file unlinked */
438		goto out1;
439	}
440	if (collectsnapstats)
441		nanotime(&starttime);
442
443	/* The last block might have changed.  Copy it again to be sure. */
444	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(numblks - 1)),
445	    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
446	if (error != 0)
447		goto out1;
448	error = readblock(vp, bp, numblks - 1);
449	bp->b_flags |= B_VALIDSUSPWRT;
450	bawrite(bp);
451	if (error != 0)
452		goto out1;
453	/*
454	 * First, copy all the cylinder group maps that have changed.
455	 */
456	for (cg = 0; cg < fs->fs_ncg; cg++) {
457		if ((ACTIVECGNUM(fs, cg) & ACTIVECGOFF(cg)) != 0)
458			continue;
459		redo++;
460		error = UFS_BALLOC(vp, lfragtosize(fs, cgtod(fs, cg)),
461		    fs->fs_bsize, KERNCRED, 0, &nbp);
462		if (error)
463			goto out1;
464		error = cgaccount(cg, vp, nbp, 2);
465		bawrite(nbp);
466		if (error)
467			goto out1;
468	}
469	/*
470	 * Grab a copy of the superblock and its summary information.
471	 * We delay writing it until the suspension is released below.
472	 */
473	copy_fs = malloc((u_long)fs->fs_bsize, M_UFSMNT, M_WAITOK);
474	bcopy(fs, copy_fs, fs->fs_sbsize);
475	if ((fs->fs_flags & (FS_UNCLEAN | FS_NEEDSFSCK)) == 0)
476		copy_fs->fs_clean = 1;
477	size = fs->fs_bsize < SBLOCKSIZE ? fs->fs_bsize : SBLOCKSIZE;
478	if (fs->fs_sbsize < size)
479		bzero(&((char *)copy_fs)[fs->fs_sbsize],
480		    size - fs->fs_sbsize);
481	size = blkroundup(fs, fs->fs_cssize);
482	if (fs->fs_contigsumsize > 0)
483		size += fs->fs_ncg * sizeof(int32_t);
484	space = malloc((u_long)size, M_UFSMNT, M_WAITOK);
485	copy_fs->fs_csp = space;
486	bcopy(fs->fs_csp, copy_fs->fs_csp, fs->fs_cssize);
487	space = (char *)space + fs->fs_cssize;
488	loc = howmany(fs->fs_cssize, fs->fs_fsize);
489	i = fs->fs_frag - loc % fs->fs_frag;
490	len = (i == fs->fs_frag) ? 0 : i * fs->fs_fsize;
491	if (len > 0) {
492		if ((error = bread(devvp, fsbtodb(fs, fs->fs_csaddr + loc),
493		    len, KERNCRED, &bp)) != 0) {
494			brelse(bp);
495			free(copy_fs->fs_csp, M_UFSMNT);
496			free(copy_fs, M_UFSMNT);
497			copy_fs = NULL;
498			goto out1;
499		}
500		bcopy(bp->b_data, space, (u_int)len);
501		space = (char *)space + len;
502		bp->b_flags |= B_INVAL | B_NOCACHE;
503		brelse(bp);
504	}
505	if (fs->fs_contigsumsize > 0) {
506		copy_fs->fs_maxcluster = lp = space;
507		for (i = 0; i < fs->fs_ncg; i++)
508			*lp++ = fs->fs_contigsumsize;
509	}
510	/*
511	 * We must check for active files that have been unlinked
512	 * (e.g., with a zero link count). We have to expunge all
513	 * trace of these files from the snapshot so that they are
514	 * not reclaimed prematurely by fsck or unnecessarily dumped.
515	 * We turn off the MNTK_SUSPENDED flag to avoid a panic from
516	 * spec_strategy about writing on a suspended filesystem.
517	 * Note that we skip unlinked snapshot files as they will
518	 * be handled separately below.
519	 *
520	 * We also calculate the needed size for the snapshot list.
521	 */
522	snaplistsize = fs->fs_ncg + howmany(fs->fs_cssize, fs->fs_bsize) +
523	    FSMAXSNAP + 1 /* superblock */ + 1 /* last block */ + 1 /* size */;
524	MNT_ILOCK(mp);
525	mp->mnt_kern_flag &= ~MNTK_SUSPENDED;
526	MNT_IUNLOCK(mp);
527loop:
528	MNT_VNODE_FOREACH_ALL(xvp, mp, mvp) {
529		if ((xvp->v_usecount == 0 &&
530		     (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) ||
531		    xvp->v_type == VNON ||
532		    IS_SNAPSHOT(VTOI(xvp))) {
533			VI_UNLOCK(xvp);
534			continue;
535		}
536		/*
537		 * We can skip parent directory vnode because it must have
538		 * this snapshot file in it.
539		 */
540		if (xvp == nd.ni_dvp) {
541			VI_UNLOCK(xvp);
542			continue;
543		}
544		vholdl(xvp);
545		if (vn_lock(xvp, LK_EXCLUSIVE | LK_INTERLOCK) != 0) {
546			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
547			vdrop(xvp);
548			goto loop;
549		}
550		VI_LOCK(xvp);
551		if (xvp->v_usecount == 0 &&
552		    (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) {
553			VI_UNLOCK(xvp);
554			VOP_UNLOCK(xvp, 0);
555			vdrop(xvp);
556			continue;
557		}
558		VI_UNLOCK(xvp);
559		if (snapdebug)
560			vprint("ffs_snapshot: busy vnode", xvp);
561		if (VOP_GETATTR(xvp, &vat, td->td_ucred) == 0 &&
562		    vat.va_nlink > 0) {
563			VOP_UNLOCK(xvp, 0);
564			vdrop(xvp);
565			continue;
566		}
567		xp = VTOI(xvp);
568		if (ffs_checkfreefile(copy_fs, vp, xp->i_number)) {
569			VOP_UNLOCK(xvp, 0);
570			vdrop(xvp);
571			continue;
572		}
573		/*
574		 * If there is a fragment, clear it here.
575		 */
576		blkno = 0;
577		loc = howmany(xp->i_size, fs->fs_bsize) - 1;
578		if (loc < NDADDR) {
579			len = fragroundup(fs, blkoff(fs, xp->i_size));
580			if (len != 0 && len < fs->fs_bsize) {
581				ffs_blkfree(ump, copy_fs, vp,
582				    DIP(xp, i_db[loc]), len, xp->i_number,
583				    xvp->v_type, NULL);
584				blkno = DIP(xp, i_db[loc]);
585				DIP_SET(xp, i_db[loc], 0);
586			}
587		}
588		snaplistsize += 1;
589		if (xp->i_ump->um_fstype == UFS1)
590			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
591			    BLK_NOCOPY, 1);
592		else
593			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
594			    BLK_NOCOPY, 1);
595		if (blkno)
596			DIP_SET(xp, i_db[loc], blkno);
597		if (!error)
598			error = ffs_freefile(ump, copy_fs, vp, xp->i_number,
599			    xp->i_mode, NULL);
600		VOP_UNLOCK(xvp, 0);
601		vdrop(xvp);
602		if (error) {
603			free(copy_fs->fs_csp, M_UFSMNT);
604			free(copy_fs, M_UFSMNT);
605			copy_fs = NULL;
606			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
607			goto out1;
608		}
609	}
610	/*
611	 * Erase the journal file from the snapshot.
612	 */
613	if (fs->fs_flags & FS_SUJ) {
614		error = softdep_journal_lookup(mp, &xvp);
615		if (error) {
616			free(copy_fs->fs_csp, M_UFSMNT);
617			free(copy_fs, M_UFSMNT);
618			copy_fs = NULL;
619			goto out1;
620		}
621		xp = VTOI(xvp);
622		if (xp->i_ump->um_fstype == UFS1)
623			error = expunge_ufs1(vp, xp, copy_fs, fullacct_ufs1,
624			    BLK_NOCOPY, 0);
625		else
626			error = expunge_ufs2(vp, xp, copy_fs, fullacct_ufs2,
627			    BLK_NOCOPY, 0);
628		vput(xvp);
629	}
630	/*
631	 * Acquire a lock on the snapdata structure, creating it if necessary.
632	 */
633	sn = ffs_snapdata_acquire(devvp);
634	/*
635	 * Change vnode to use shared snapshot lock instead of the original
636	 * private lock.
637	 */
638	vp->v_vnlock = &sn->sn_lock;
639	lockmgr(&vp->v_lock, LK_RELEASE, NULL);
640	xp = TAILQ_FIRST(&sn->sn_head);
641	/*
642	 * If this is the first snapshot on this filesystem, then we need
643	 * to allocate the space for the list of preallocated snapshot blocks.
644	 * This list will be refined below, but this preliminary one will
645	 * keep us out of deadlock until the full one is ready.
646	 */
647	if (xp == NULL) {
648		snapblklist = malloc(snaplistsize * sizeof(daddr_t),
649		    M_UFSMNT, M_WAITOK);
650		blkp = &snapblklist[1];
651		*blkp++ = lblkno(fs, fs->fs_sblockloc);
652		blkno = fragstoblks(fs, fs->fs_csaddr);
653		for (cg = 0; cg < fs->fs_ncg; cg++) {
654			if (fragstoblks(fs, cgtod(fs, cg) > blkno))
655				break;
656			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
657		}
658		len = howmany(fs->fs_cssize, fs->fs_bsize);
659		for (loc = 0; loc < len; loc++)
660			*blkp++ = blkno + loc;
661		for (; cg < fs->fs_ncg; cg++)
662			*blkp++ = fragstoblks(fs, cgtod(fs, cg));
663		snapblklist[0] = blkp - snapblklist;
664		VI_LOCK(devvp);
665		if (sn->sn_blklist != NULL)
666			panic("ffs_snapshot: non-empty list");
667		sn->sn_blklist = snapblklist;
668		sn->sn_listsize = blkp - snapblklist;
669		VI_UNLOCK(devvp);
670	}
671	/*
672	 * Record snapshot inode. Since this is the newest snapshot,
673	 * it must be placed at the end of the list.
674	 */
675	VI_LOCK(devvp);
676	fs->fs_snapinum[snaploc] = ip->i_number;
677	if (ip->i_nextsnap.tqe_prev != 0)
678		panic("ffs_snapshot: %ju already on list",
679		    (uintmax_t)ip->i_number);
680	TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
681	devvp->v_vflag |= VV_COPYONWRITE;
682	VI_UNLOCK(devvp);
683	ASSERT_VOP_LOCKED(vp, "ffs_snapshot vp");
684out1:
685	KASSERT((sn != NULL && copy_fs != NULL && error == 0) ||
686		(sn == NULL && copy_fs == NULL && error != 0),
687		("email phk@ and mckusick@"));
688	/*
689	 * Resume operation on filesystem.
690	 */
691	vfs_write_resume(vp->v_mount, VR_START_WRITE | VR_NO_SUSPCLR);
692	if (collectsnapstats && starttime.tv_sec > 0) {
693		nanotime(&endtime);
694		timespecsub(&endtime, &starttime);
695		printf("%s: suspended %ld.%03ld sec, redo %ld of %d\n",
696		    vp->v_mount->mnt_stat.f_mntonname, (long)endtime.tv_sec,
697		    endtime.tv_nsec / 1000000, redo, fs->fs_ncg);
698	}
699	if (copy_fs == NULL)
700		goto out;
701	/*
702	 * Copy allocation information from all the snapshots in
703	 * this snapshot and then expunge them from its view.
704	 */
705	TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap) {
706		if (xp == ip)
707			break;
708		if (xp->i_ump->um_fstype == UFS1)
709			error = expunge_ufs1(vp, xp, fs, snapacct_ufs1,
710			    BLK_SNAP, 0);
711		else
712			error = expunge_ufs2(vp, xp, fs, snapacct_ufs2,
713			    BLK_SNAP, 0);
714		if (error == 0 && xp->i_effnlink == 0) {
715			error = ffs_freefile(ump,
716					     copy_fs,
717					     vp,
718					     xp->i_number,
719					     xp->i_mode, NULL);
720		}
721		if (error) {
722			fs->fs_snapinum[snaploc] = 0;
723			goto done;
724		}
725	}
726	/*
727	 * Allocate space for the full list of preallocated snapshot blocks.
728	 */
729	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
730	    M_UFSMNT, M_WAITOK);
731	ip->i_snapblklist = &snapblklist[1];
732	/*
733	 * Expunge the blocks used by the snapshots from the set of
734	 * blocks marked as used in the snapshot bitmaps. Also, collect
735	 * the list of allocated blocks in i_snapblklist.
736	 */
737	if (ip->i_ump->um_fstype == UFS1)
738		error = expunge_ufs1(vp, ip, copy_fs, mapacct_ufs1,
739		    BLK_SNAP, 0);
740	else
741		error = expunge_ufs2(vp, ip, copy_fs, mapacct_ufs2,
742		    BLK_SNAP, 0);
743	if (error) {
744		fs->fs_snapinum[snaploc] = 0;
745		free(snapblklist, M_UFSMNT);
746		goto done;
747	}
748	if (snaplistsize < ip->i_snapblklist - snapblklist)
749		panic("ffs_snapshot: list too small");
750	snaplistsize = ip->i_snapblklist - snapblklist;
751	snapblklist[0] = snaplistsize;
752	ip->i_snapblklist = 0;
753	/*
754	 * Write out the list of allocated blocks to the end of the snapshot.
755	 */
756	auio.uio_iov = &aiov;
757	auio.uio_iovcnt = 1;
758	aiov.iov_base = (void *)snapblklist;
759	aiov.iov_len = snaplistsize * sizeof(daddr_t);
760	auio.uio_resid = aiov.iov_len;
761	auio.uio_offset = ip->i_size;
762	auio.uio_segflg = UIO_SYSSPACE;
763	auio.uio_rw = UIO_WRITE;
764	auio.uio_td = td;
765	if ((error = VOP_WRITE(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
766		fs->fs_snapinum[snaploc] = 0;
767		free(snapblklist, M_UFSMNT);
768		goto done;
769	}
770	/*
771	 * Write the superblock and its summary information
772	 * to the snapshot.
773	 */
774	blkno = fragstoblks(fs, fs->fs_csaddr);
775	len = howmany(fs->fs_cssize, fs->fs_bsize);
776	space = copy_fs->fs_csp;
777	for (loc = 0; loc < len; loc++) {
778		error = bread(vp, blkno + loc, fs->fs_bsize, KERNCRED, &nbp);
779		if (error) {
780			brelse(nbp);
781			fs->fs_snapinum[snaploc] = 0;
782			free(snapblklist, M_UFSMNT);
783			goto done;
784		}
785		bcopy(space, nbp->b_data, fs->fs_bsize);
786		space = (char *)space + fs->fs_bsize;
787		bawrite(nbp);
788	}
789	error = bread(vp, lblkno(fs, fs->fs_sblockloc), fs->fs_bsize,
790	    KERNCRED, &nbp);
791	if (error) {
792		brelse(nbp);
793	} else {
794		loc = blkoff(fs, fs->fs_sblockloc);
795		bcopy((char *)copy_fs, &nbp->b_data[loc], (u_int)fs->fs_sbsize);
796		bawrite(nbp);
797	}
798	/*
799	 * As this is the newest list, it is the most inclusive, so
800	 * should replace the previous list.
801	 */
802	VI_LOCK(devvp);
803	space = sn->sn_blklist;
804	sn->sn_blklist = snapblklist;
805	sn->sn_listsize = snaplistsize;
806	VI_UNLOCK(devvp);
807	if (space != NULL)
808		free(space, M_UFSMNT);
809	/*
810	 * Preallocate all the direct blocks in the snapshot inode so
811	 * that we never have to write the inode itself to commit an
812	 * update to the contents of the snapshot. Note that once
813	 * created, the size of the snapshot will never change, so
814	 * there will never be a need to write the inode except to
815	 * update the non-integrity-critical time fields and
816	 * allocated-block count.
817	 */
818	for (blockno = 0; blockno < NDADDR; blockno++) {
819		if (DIP(ip, i_db[blockno]) != 0)
820			continue;
821		error = UFS_BALLOC(vp, lblktosize(fs, blockno),
822		    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
823		if (error)
824			break;
825		error = readblock(vp, bp, blockno);
826		bawrite(bp);
827		if (error != 0)
828			break;
829	}
830done:
831	free(copy_fs->fs_csp, M_UFSMNT);
832	free(copy_fs, M_UFSMNT);
833	copy_fs = NULL;
834out:
835	NDFREE(&nd, NDF_ONLY_PNBUF);
836	if (saved_nice > 0) {
837		struct proc *p;
838
839		p = td->td_proc;
840		PROC_LOCK(p);
841		sched_nice(td->td_proc, saved_nice);
842		PROC_UNLOCK(td->td_proc);
843	}
844	UFS_LOCK(ump);
845	if (fs->fs_active != 0) {
846		free(fs->fs_active, M_DEVBUF);
847		fs->fs_active = 0;
848	}
849	UFS_UNLOCK(ump);
850	MNT_ILOCK(mp);
851	mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) | (flag & ~MNT_QUOTA);
852	MNT_IUNLOCK(mp);
853	if (error)
854		(void) ffs_truncate(vp, (off_t)0, 0, NOCRED);
855	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
856	if (error)
857		vput(vp);
858	else
859		VOP_UNLOCK(vp, 0);
860	vrele(nd.ni_dvp);
861	vn_finished_write(wrtmp);
862	process_deferred_inactive(mp);
863	return (error);
864}
865
866/*
867 * Copy a cylinder group map. All the unallocated blocks are marked
868 * BLK_NOCOPY so that the snapshot knows that it need not copy them
869 * if they are later written. If passno is one, then this is a first
870 * pass, so only setting needs to be done. If passno is 2, then this
871 * is a revision to a previous pass which must be undone as the
872 * replacement pass is done.
873 */
874static int
875cgaccount(cg, vp, nbp, passno)
876	int cg;
877	struct vnode *vp;
878	struct buf *nbp;
879	int passno;
880{
881	struct buf *bp, *ibp;
882	struct inode *ip;
883	struct cg *cgp;
884	struct fs *fs;
885	ufs2_daddr_t base, numblks;
886	int error, len, loc, indiroff;
887
888	ip = VTOI(vp);
889	fs = ip->i_fs;
890	error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
891		(int)fs->fs_cgsize, KERNCRED, &bp);
892	if (error) {
893		brelse(bp);
894		return (error);
895	}
896	cgp = (struct cg *)bp->b_data;
897	if (!cg_chkmagic(cgp)) {
898		brelse(bp);
899		return (EIO);
900	}
901	UFS_LOCK(ip->i_ump);
902	ACTIVESET(fs, cg);
903	/*
904	 * Recomputation of summary information might not have been performed
905	 * at mount time.  Sync up summary information for current cylinder
906	 * group while data is in memory to ensure that result of background
907	 * fsck is slightly more consistent.
908	 */
909	fs->fs_cs(fs, cg) = cgp->cg_cs;
910	UFS_UNLOCK(ip->i_ump);
911	bcopy(bp->b_data, nbp->b_data, fs->fs_cgsize);
912	if (fs->fs_cgsize < fs->fs_bsize)
913		bzero(&nbp->b_data[fs->fs_cgsize],
914		    fs->fs_bsize - fs->fs_cgsize);
915	cgp = (struct cg *)nbp->b_data;
916	bqrelse(bp);
917	if (passno == 2)
918		nbp->b_flags |= B_VALIDSUSPWRT;
919	numblks = howmany(fs->fs_size, fs->fs_frag);
920	len = howmany(fs->fs_fpg, fs->fs_frag);
921	base = cgbase(fs, cg) / fs->fs_frag;
922	if (base + len >= numblks)
923		len = numblks - base - 1;
924	loc = 0;
925	if (base < NDADDR) {
926		for ( ; loc < NDADDR; loc++) {
927			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
928				DIP_SET(ip, i_db[loc], BLK_NOCOPY);
929			else if (passno == 2 && DIP(ip, i_db[loc])== BLK_NOCOPY)
930				DIP_SET(ip, i_db[loc], 0);
931			else if (passno == 1 && DIP(ip, i_db[loc])== BLK_NOCOPY)
932				panic("ffs_snapshot: lost direct block");
933		}
934	}
935	error = UFS_BALLOC(vp, lblktosize(fs, (off_t)(base + loc)),
936	    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
937	if (error) {
938		return (error);
939	}
940	indiroff = (base + loc - NDADDR) % NINDIR(fs);
941	for ( ; loc < len; loc++, indiroff++) {
942		if (indiroff >= NINDIR(fs)) {
943			if (passno == 2)
944				ibp->b_flags |= B_VALIDSUSPWRT;
945			bawrite(ibp);
946			error = UFS_BALLOC(vp,
947			    lblktosize(fs, (off_t)(base + loc)),
948			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
949			if (error) {
950				return (error);
951			}
952			indiroff = 0;
953		}
954		if (ip->i_ump->um_fstype == UFS1) {
955			if (ffs_isblock(fs, cg_blksfree(cgp), loc))
956				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
957				    BLK_NOCOPY;
958			else if (passno == 2 && ((ufs1_daddr_t *)(ibp->b_data))
959			    [indiroff] == BLK_NOCOPY)
960				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = 0;
961			else if (passno == 1 && ((ufs1_daddr_t *)(ibp->b_data))
962			    [indiroff] == BLK_NOCOPY)
963				panic("ffs_snapshot: lost indirect block");
964			continue;
965		}
966		if (ffs_isblock(fs, cg_blksfree(cgp), loc))
967			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = BLK_NOCOPY;
968		else if (passno == 2 &&
969		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
970			((ufs2_daddr_t *)(ibp->b_data))[indiroff] = 0;
971		else if (passno == 1 &&
972		    ((ufs2_daddr_t *)(ibp->b_data)) [indiroff] == BLK_NOCOPY)
973			panic("ffs_snapshot: lost indirect block");
974	}
975	if (passno == 2)
976		ibp->b_flags |= B_VALIDSUSPWRT;
977	bdwrite(ibp);
978	return (0);
979}
980
981/*
982 * Before expunging a snapshot inode, note all the
983 * blocks that it claims with BLK_SNAP so that fsck will
984 * be able to account for those blocks properly and so
985 * that this snapshot knows that it need not copy them
986 * if the other snapshot holding them is freed. This code
987 * is reproduced once each for UFS1 and UFS2.
988 */
989static int
990expunge_ufs1(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
991	struct vnode *snapvp;
992	struct inode *cancelip;
993	struct fs *fs;
994	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
995	    struct fs *, ufs_lbn_t, int);
996	int expungetype;
997	int clearmode;
998{
999	int i, error, indiroff;
1000	ufs_lbn_t lbn, rlbn;
1001	ufs2_daddr_t len, blkno, numblks, blksperindir;
1002	struct ufs1_dinode *dip;
1003	struct thread *td = curthread;
1004	struct buf *bp;
1005
1006	/*
1007	 * Prepare to expunge the inode. If its inode block has not
1008	 * yet been copied, then allocate and fill the copy.
1009	 */
1010	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1011	blkno = 0;
1012	if (lbn < NDADDR) {
1013		blkno = VTOI(snapvp)->i_din1->di_db[lbn];
1014	} else {
1015		if (DOINGSOFTDEP(snapvp))
1016			softdep_prealloc(snapvp, MNT_WAIT);
1017		td->td_pflags |= TDP_COWINPROGRESS;
1018		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
1019		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
1020		td->td_pflags &= ~TDP_COWINPROGRESS;
1021		if (error)
1022			return (error);
1023		indiroff = (lbn - NDADDR) % NINDIR(fs);
1024		blkno = ((ufs1_daddr_t *)(bp->b_data))[indiroff];
1025		bqrelse(bp);
1026	}
1027	if (blkno != 0) {
1028		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
1029			return (error);
1030	} else {
1031		error = ffs_balloc_ufs1(snapvp, lblktosize(fs, (off_t)lbn),
1032		    fs->fs_bsize, KERNCRED, 0, &bp);
1033		if (error)
1034			return (error);
1035		if ((error = readblock(snapvp, bp, lbn)) != 0)
1036			return (error);
1037	}
1038	/*
1039	 * Set a snapshot inode to be a zero length file, regular files
1040	 * or unlinked snapshots to be completely unallocated.
1041	 */
1042	dip = (struct ufs1_dinode *)bp->b_data +
1043	    ino_to_fsbo(fs, cancelip->i_number);
1044	if (clearmode || cancelip->i_effnlink == 0)
1045		dip->di_mode = 0;
1046	dip->di_size = 0;
1047	dip->di_blocks = 0;
1048	dip->di_flags &= ~SF_SNAPSHOT;
1049	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs1_daddr_t));
1050	bdwrite(bp);
1051	/*
1052	 * Now go through and expunge all the blocks in the file
1053	 * using the function requested.
1054	 */
1055	numblks = howmany(cancelip->i_size, fs->fs_bsize);
1056	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_db[0],
1057	    &cancelip->i_din1->di_db[NDADDR], fs, 0, expungetype)))
1058		return (error);
1059	if ((error = (*acctfunc)(snapvp, &cancelip->i_din1->di_ib[0],
1060	    &cancelip->i_din1->di_ib[NIADDR], fs, -1, expungetype)))
1061		return (error);
1062	blksperindir = 1;
1063	lbn = -NDADDR;
1064	len = numblks - NDADDR;
1065	rlbn = NDADDR;
1066	for (i = 0; len > 0 && i < NIADDR; i++) {
1067		error = indiracct_ufs1(snapvp, ITOV(cancelip), i,
1068		    cancelip->i_din1->di_ib[i], lbn, rlbn, len,
1069		    blksperindir, fs, acctfunc, expungetype);
1070		if (error)
1071			return (error);
1072		blksperindir *= NINDIR(fs);
1073		lbn -= blksperindir + 1;
1074		len -= blksperindir;
1075		rlbn += blksperindir;
1076	}
1077	return (0);
1078}
1079
1080/*
1081 * Descend an indirect block chain for vnode cancelvp accounting for all
1082 * its indirect blocks in snapvp.
1083 */
1084static int
1085indiracct_ufs1(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
1086	    blksperindir, fs, acctfunc, expungetype)
1087	struct vnode *snapvp;
1088	struct vnode *cancelvp;
1089	int level;
1090	ufs1_daddr_t blkno;
1091	ufs_lbn_t lbn;
1092	ufs_lbn_t rlbn;
1093	ufs_lbn_t remblks;
1094	ufs_lbn_t blksperindir;
1095	struct fs *fs;
1096	int (*acctfunc)(struct vnode *, ufs1_daddr_t *, ufs1_daddr_t *,
1097	    struct fs *, ufs_lbn_t, int);
1098	int expungetype;
1099{
1100	int error, num, i;
1101	ufs_lbn_t subblksperindir;
1102	struct indir indirs[NIADDR + 2];
1103	ufs1_daddr_t last, *bap;
1104	struct buf *bp;
1105
1106	if (blkno == 0) {
1107		if (expungetype == BLK_NOCOPY)
1108			return (0);
1109		panic("indiracct_ufs1: missing indir");
1110	}
1111	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1112		return (error);
1113	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1114		panic("indiracct_ufs1: botched params");
1115	/*
1116	 * We have to expand bread here since it will deadlock looking
1117	 * up the block number for any blocks that are not in the cache.
1118	 */
1119	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
1120	bp->b_blkno = fsbtodb(fs, blkno);
1121	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
1122	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
1123		brelse(bp);
1124		return (error);
1125	}
1126	/*
1127	 * Account for the block pointers in this indirect block.
1128	 */
1129	last = howmany(remblks, blksperindir);
1130	if (last > NINDIR(fs))
1131		last = NINDIR(fs);
1132	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
1133	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
1134	bqrelse(bp);
1135	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
1136	    level == 0 ? rlbn : -1, expungetype);
1137	if (error || level == 0)
1138		goto out;
1139	/*
1140	 * Account for the block pointers in each of the indirect blocks
1141	 * in the levels below us.
1142	 */
1143	subblksperindir = blksperindir / NINDIR(fs);
1144	for (lbn++, level--, i = 0; i < last; i++) {
1145		error = indiracct_ufs1(snapvp, cancelvp, level, bap[i], lbn,
1146		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
1147		if (error)
1148			goto out;
1149		rlbn += blksperindir;
1150		lbn -= blksperindir;
1151		remblks -= blksperindir;
1152	}
1153out:
1154	free(bap, M_DEVBUF);
1155	return (error);
1156}
1157
1158/*
1159 * Do both snap accounting and map accounting.
1160 */
1161static int
1162fullacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)
1163	struct vnode *vp;
1164	ufs1_daddr_t *oldblkp, *lastblkp;
1165	struct fs *fs;
1166	ufs_lbn_t lblkno;
1167	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
1168{
1169	int error;
1170
1171	if ((error = snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1172		return (error);
1173	return (mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1174}
1175
1176/*
1177 * Identify a set of blocks allocated in a snapshot inode.
1178 */
1179static int
1180snapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1181	struct vnode *vp;
1182	ufs1_daddr_t *oldblkp, *lastblkp;
1183	struct fs *fs;
1184	ufs_lbn_t lblkno;
1185	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
1186{
1187	struct inode *ip = VTOI(vp);
1188	ufs1_daddr_t blkno, *blkp;
1189	ufs_lbn_t lbn;
1190	struct buf *ibp;
1191	int error;
1192
1193	for ( ; oldblkp < lastblkp; oldblkp++) {
1194		blkno = *oldblkp;
1195		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1196			continue;
1197		lbn = fragstoblks(fs, blkno);
1198		if (lbn < NDADDR) {
1199			blkp = &ip->i_din1->di_db[lbn];
1200			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1201		} else {
1202			error = ffs_balloc_ufs1(vp, lblktosize(fs, (off_t)lbn),
1203			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1204			if (error)
1205				return (error);
1206			blkp = &((ufs1_daddr_t *)(ibp->b_data))
1207			    [(lbn - NDADDR) % NINDIR(fs)];
1208		}
1209		/*
1210		 * If we are expunging a snapshot vnode and we
1211		 * find a block marked BLK_NOCOPY, then it is
1212		 * one that has been allocated to this snapshot after
1213		 * we took our current snapshot and can be ignored.
1214		 */
1215		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
1216			if (lbn >= NDADDR)
1217				brelse(ibp);
1218		} else {
1219			if (*blkp != 0)
1220				panic("snapacct_ufs1: bad block");
1221			*blkp = expungetype;
1222			if (lbn >= NDADDR)
1223				bdwrite(ibp);
1224		}
1225	}
1226	return (0);
1227}
1228
1229/*
1230 * Account for a set of blocks allocated in a snapshot inode.
1231 */
1232static int
1233mapacct_ufs1(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1234	struct vnode *vp;
1235	ufs1_daddr_t *oldblkp, *lastblkp;
1236	struct fs *fs;
1237	ufs_lbn_t lblkno;
1238	int expungetype;
1239{
1240	ufs1_daddr_t blkno;
1241	struct inode *ip;
1242	ino_t inum;
1243	int acctit;
1244
1245	ip = VTOI(vp);
1246	inum = ip->i_number;
1247	if (lblkno == -1)
1248		acctit = 0;
1249	else
1250		acctit = 1;
1251	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1252		blkno = *oldblkp;
1253		if (blkno == 0 || blkno == BLK_NOCOPY)
1254			continue;
1255		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1256			*ip->i_snapblklist++ = lblkno;
1257		if (blkno == BLK_SNAP)
1258			blkno = blkstofrags(fs, lblkno);
1259		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum,
1260		    vp->v_type, NULL);
1261	}
1262	return (0);
1263}
1264
1265/*
1266 * Before expunging a snapshot inode, note all the
1267 * blocks that it claims with BLK_SNAP so that fsck will
1268 * be able to account for those blocks properly and so
1269 * that this snapshot knows that it need not copy them
1270 * if the other snapshot holding them is freed. This code
1271 * is reproduced once each for UFS1 and UFS2.
1272 */
1273static int
1274expunge_ufs2(snapvp, cancelip, fs, acctfunc, expungetype, clearmode)
1275	struct vnode *snapvp;
1276	struct inode *cancelip;
1277	struct fs *fs;
1278	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
1279	    struct fs *, ufs_lbn_t, int);
1280	int expungetype;
1281	int clearmode;
1282{
1283	int i, error, indiroff;
1284	ufs_lbn_t lbn, rlbn;
1285	ufs2_daddr_t len, blkno, numblks, blksperindir;
1286	struct ufs2_dinode *dip;
1287	struct thread *td = curthread;
1288	struct buf *bp;
1289
1290	/*
1291	 * Prepare to expunge the inode. If its inode block has not
1292	 * yet been copied, then allocate and fill the copy.
1293	 */
1294	lbn = fragstoblks(fs, ino_to_fsba(fs, cancelip->i_number));
1295	blkno = 0;
1296	if (lbn < NDADDR) {
1297		blkno = VTOI(snapvp)->i_din2->di_db[lbn];
1298	} else {
1299		if (DOINGSOFTDEP(snapvp))
1300			softdep_prealloc(snapvp, MNT_WAIT);
1301		td->td_pflags |= TDP_COWINPROGRESS;
1302		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
1303		   fs->fs_bsize, KERNCRED, BA_METAONLY, &bp);
1304		td->td_pflags &= ~TDP_COWINPROGRESS;
1305		if (error)
1306			return (error);
1307		indiroff = (lbn - NDADDR) % NINDIR(fs);
1308		blkno = ((ufs2_daddr_t *)(bp->b_data))[indiroff];
1309		bqrelse(bp);
1310	}
1311	if (blkno != 0) {
1312		if ((error = bread(snapvp, lbn, fs->fs_bsize, KERNCRED, &bp)))
1313			return (error);
1314	} else {
1315		error = ffs_balloc_ufs2(snapvp, lblktosize(fs, (off_t)lbn),
1316		    fs->fs_bsize, KERNCRED, 0, &bp);
1317		if (error)
1318			return (error);
1319		if ((error = readblock(snapvp, bp, lbn)) != 0)
1320			return (error);
1321	}
1322	/*
1323	 * Set a snapshot inode to be a zero length file, regular files
1324	 * to be completely unallocated.
1325	 */
1326	dip = (struct ufs2_dinode *)bp->b_data +
1327	    ino_to_fsbo(fs, cancelip->i_number);
1328	if (clearmode || cancelip->i_effnlink == 0)
1329		dip->di_mode = 0;
1330	dip->di_size = 0;
1331	dip->di_blocks = 0;
1332	dip->di_flags &= ~SF_SNAPSHOT;
1333	bzero(&dip->di_db[0], (NDADDR + NIADDR) * sizeof(ufs2_daddr_t));
1334	bdwrite(bp);
1335	/*
1336	 * Now go through and expunge all the blocks in the file
1337	 * using the function requested.
1338	 */
1339	numblks = howmany(cancelip->i_size, fs->fs_bsize);
1340	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_db[0],
1341	    &cancelip->i_din2->di_db[NDADDR], fs, 0, expungetype)))
1342		return (error);
1343	if ((error = (*acctfunc)(snapvp, &cancelip->i_din2->di_ib[0],
1344	    &cancelip->i_din2->di_ib[NIADDR], fs, -1, expungetype)))
1345		return (error);
1346	blksperindir = 1;
1347	lbn = -NDADDR;
1348	len = numblks - NDADDR;
1349	rlbn = NDADDR;
1350	for (i = 0; len > 0 && i < NIADDR; i++) {
1351		error = indiracct_ufs2(snapvp, ITOV(cancelip), i,
1352		    cancelip->i_din2->di_ib[i], lbn, rlbn, len,
1353		    blksperindir, fs, acctfunc, expungetype);
1354		if (error)
1355			return (error);
1356		blksperindir *= NINDIR(fs);
1357		lbn -= blksperindir + 1;
1358		len -= blksperindir;
1359		rlbn += blksperindir;
1360	}
1361	return (0);
1362}
1363
1364/*
1365 * Descend an indirect block chain for vnode cancelvp accounting for all
1366 * its indirect blocks in snapvp.
1367 */
1368static int
1369indiracct_ufs2(snapvp, cancelvp, level, blkno, lbn, rlbn, remblks,
1370	    blksperindir, fs, acctfunc, expungetype)
1371	struct vnode *snapvp;
1372	struct vnode *cancelvp;
1373	int level;
1374	ufs2_daddr_t blkno;
1375	ufs_lbn_t lbn;
1376	ufs_lbn_t rlbn;
1377	ufs_lbn_t remblks;
1378	ufs_lbn_t blksperindir;
1379	struct fs *fs;
1380	int (*acctfunc)(struct vnode *, ufs2_daddr_t *, ufs2_daddr_t *,
1381	    struct fs *, ufs_lbn_t, int);
1382	int expungetype;
1383{
1384	int error, num, i;
1385	ufs_lbn_t subblksperindir;
1386	struct indir indirs[NIADDR + 2];
1387	ufs2_daddr_t last, *bap;
1388	struct buf *bp;
1389
1390	if (blkno == 0) {
1391		if (expungetype == BLK_NOCOPY)
1392			return (0);
1393		panic("indiracct_ufs2: missing indir");
1394	}
1395	if ((error = ufs_getlbns(cancelvp, rlbn, indirs, &num)) != 0)
1396		return (error);
1397	if (lbn != indirs[num - 1 - level].in_lbn || num < 2)
1398		panic("indiracct_ufs2: botched params");
1399	/*
1400	 * We have to expand bread here since it will deadlock looking
1401	 * up the block number for any blocks that are not in the cache.
1402	 */
1403	bp = getblk(cancelvp, lbn, fs->fs_bsize, 0, 0, 0);
1404	bp->b_blkno = fsbtodb(fs, blkno);
1405	if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0 &&
1406	    (error = readblock(cancelvp, bp, fragstoblks(fs, blkno)))) {
1407		brelse(bp);
1408		return (error);
1409	}
1410	/*
1411	 * Account for the block pointers in this indirect block.
1412	 */
1413	last = howmany(remblks, blksperindir);
1414	if (last > NINDIR(fs))
1415		last = NINDIR(fs);
1416	bap = malloc(fs->fs_bsize, M_DEVBUF, M_WAITOK);
1417	bcopy(bp->b_data, (caddr_t)bap, fs->fs_bsize);
1418	bqrelse(bp);
1419	error = (*acctfunc)(snapvp, &bap[0], &bap[last], fs,
1420	    level == 0 ? rlbn : -1, expungetype);
1421	if (error || level == 0)
1422		goto out;
1423	/*
1424	 * Account for the block pointers in each of the indirect blocks
1425	 * in the levels below us.
1426	 */
1427	subblksperindir = blksperindir / NINDIR(fs);
1428	for (lbn++, level--, i = 0; i < last; i++) {
1429		error = indiracct_ufs2(snapvp, cancelvp, level, bap[i], lbn,
1430		    rlbn, remblks, subblksperindir, fs, acctfunc, expungetype);
1431		if (error)
1432			goto out;
1433		rlbn += blksperindir;
1434		lbn -= blksperindir;
1435		remblks -= blksperindir;
1436	}
1437out:
1438	free(bap, M_DEVBUF);
1439	return (error);
1440}
1441
1442/*
1443 * Do both snap accounting and map accounting.
1444 */
1445static int
1446fullacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)
1447	struct vnode *vp;
1448	ufs2_daddr_t *oldblkp, *lastblkp;
1449	struct fs *fs;
1450	ufs_lbn_t lblkno;
1451	int exptype;	/* BLK_SNAP or BLK_NOCOPY */
1452{
1453	int error;
1454
1455	if ((error = snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype)))
1456		return (error);
1457	return (mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, exptype));
1458}
1459
1460/*
1461 * Identify a set of blocks allocated in a snapshot inode.
1462 */
1463static int
1464snapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1465	struct vnode *vp;
1466	ufs2_daddr_t *oldblkp, *lastblkp;
1467	struct fs *fs;
1468	ufs_lbn_t lblkno;
1469	int expungetype;	/* BLK_SNAP or BLK_NOCOPY */
1470{
1471	struct inode *ip = VTOI(vp);
1472	ufs2_daddr_t blkno, *blkp;
1473	ufs_lbn_t lbn;
1474	struct buf *ibp;
1475	int error;
1476
1477	for ( ; oldblkp < lastblkp; oldblkp++) {
1478		blkno = *oldblkp;
1479		if (blkno == 0 || blkno == BLK_NOCOPY || blkno == BLK_SNAP)
1480			continue;
1481		lbn = fragstoblks(fs, blkno);
1482		if (lbn < NDADDR) {
1483			blkp = &ip->i_din2->di_db[lbn];
1484			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1485		} else {
1486			error = ffs_balloc_ufs2(vp, lblktosize(fs, (off_t)lbn),
1487			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1488			if (error)
1489				return (error);
1490			blkp = &((ufs2_daddr_t *)(ibp->b_data))
1491			    [(lbn - NDADDR) % NINDIR(fs)];
1492		}
1493		/*
1494		 * If we are expunging a snapshot vnode and we
1495		 * find a block marked BLK_NOCOPY, then it is
1496		 * one that has been allocated to this snapshot after
1497		 * we took our current snapshot and can be ignored.
1498		 */
1499		if (expungetype == BLK_SNAP && *blkp == BLK_NOCOPY) {
1500			if (lbn >= NDADDR)
1501				brelse(ibp);
1502		} else {
1503			if (*blkp != 0)
1504				panic("snapacct_ufs2: bad block");
1505			*blkp = expungetype;
1506			if (lbn >= NDADDR)
1507				bdwrite(ibp);
1508		}
1509	}
1510	return (0);
1511}
1512
1513/*
1514 * Account for a set of blocks allocated in a snapshot inode.
1515 */
1516static int
1517mapacct_ufs2(vp, oldblkp, lastblkp, fs, lblkno, expungetype)
1518	struct vnode *vp;
1519	ufs2_daddr_t *oldblkp, *lastblkp;
1520	struct fs *fs;
1521	ufs_lbn_t lblkno;
1522	int expungetype;
1523{
1524	ufs2_daddr_t blkno;
1525	struct inode *ip;
1526	ino_t inum;
1527	int acctit;
1528
1529	ip = VTOI(vp);
1530	inum = ip->i_number;
1531	if (lblkno == -1)
1532		acctit = 0;
1533	else
1534		acctit = 1;
1535	for ( ; oldblkp < lastblkp; oldblkp++, lblkno++) {
1536		blkno = *oldblkp;
1537		if (blkno == 0 || blkno == BLK_NOCOPY)
1538			continue;
1539		if (acctit && expungetype == BLK_SNAP && blkno != BLK_SNAP)
1540			*ip->i_snapblklist++ = lblkno;
1541		if (blkno == BLK_SNAP)
1542			blkno = blkstofrags(fs, lblkno);
1543		ffs_blkfree(ip->i_ump, fs, vp, blkno, fs->fs_bsize, inum,
1544		    vp->v_type, NULL);
1545	}
1546	return (0);
1547}
1548
1549/*
1550 * Decrement extra reference on snapshot when last name is removed.
1551 * It will not be freed until the last open reference goes away.
1552 */
1553void
1554ffs_snapgone(ip)
1555	struct inode *ip;
1556{
1557	struct inode *xp;
1558	struct fs *fs;
1559	int snaploc;
1560	struct snapdata *sn;
1561	struct ufsmount *ump;
1562
1563	/*
1564	 * Find snapshot in incore list.
1565	 */
1566	xp = NULL;
1567	sn = ip->i_devvp->v_rdev->si_snapdata;
1568	if (sn != NULL)
1569		TAILQ_FOREACH(xp, &sn->sn_head, i_nextsnap)
1570			if (xp == ip)
1571				break;
1572	if (xp != NULL)
1573		vrele(ITOV(ip));
1574	else if (snapdebug)
1575		printf("ffs_snapgone: lost snapshot vnode %ju\n",
1576		    (uintmax_t)ip->i_number);
1577	/*
1578	 * Delete snapshot inode from superblock. Keep list dense.
1579	 */
1580	fs = ip->i_fs;
1581	ump = ip->i_ump;
1582	UFS_LOCK(ump);
1583	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++)
1584		if (fs->fs_snapinum[snaploc] == ip->i_number)
1585			break;
1586	if (snaploc < FSMAXSNAP) {
1587		for (snaploc++; snaploc < FSMAXSNAP; snaploc++) {
1588			if (fs->fs_snapinum[snaploc] == 0)
1589				break;
1590			fs->fs_snapinum[snaploc - 1] = fs->fs_snapinum[snaploc];
1591		}
1592		fs->fs_snapinum[snaploc - 1] = 0;
1593	}
1594	UFS_UNLOCK(ump);
1595}
1596
1597/*
1598 * Prepare a snapshot file for being removed.
1599 */
1600void
1601ffs_snapremove(vp)
1602	struct vnode *vp;
1603{
1604	struct inode *ip;
1605	struct vnode *devvp;
1606	struct buf *ibp;
1607	struct fs *fs;
1608	ufs2_daddr_t numblks, blkno, dblk;
1609	int error, loc, last;
1610	struct snapdata *sn;
1611
1612	ip = VTOI(vp);
1613	fs = ip->i_fs;
1614	devvp = ip->i_devvp;
1615	/*
1616	 * If active, delete from incore list (this snapshot may
1617	 * already have been in the process of being deleted, so
1618	 * would not have been active).
1619	 *
1620	 * Clear copy-on-write flag if last snapshot.
1621	 */
1622	VI_LOCK(devvp);
1623	if (ip->i_nextsnap.tqe_prev != 0) {
1624		sn = devvp->v_rdev->si_snapdata;
1625		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
1626		ip->i_nextsnap.tqe_prev = 0;
1627		VI_UNLOCK(devvp);
1628		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
1629		KASSERT(vp->v_vnlock == &sn->sn_lock,
1630			("ffs_snapremove: lost lock mutation"));
1631		vp->v_vnlock = &vp->v_lock;
1632		VI_LOCK(devvp);
1633		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
1634		try_free_snapdata(devvp);
1635	} else
1636		VI_UNLOCK(devvp);
1637	/*
1638	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
1639	 * snapshots that want them (see ffs_snapblkfree below).
1640	 */
1641	for (blkno = 1; blkno < NDADDR; blkno++) {
1642		dblk = DIP(ip, i_db[blkno]);
1643		if (dblk == 0)
1644			continue;
1645		if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1646			DIP_SET(ip, i_db[blkno], 0);
1647		else if ((dblk == blkstofrags(fs, blkno) &&
1648		     ffs_snapblkfree(fs, ip->i_devvp, dblk, fs->fs_bsize,
1649		     ip->i_number, vp->v_type, NULL))) {
1650			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) -
1651			    btodb(fs->fs_bsize));
1652			DIP_SET(ip, i_db[blkno], 0);
1653		}
1654	}
1655	numblks = howmany(ip->i_size, fs->fs_bsize);
1656	for (blkno = NDADDR; blkno < numblks; blkno += NINDIR(fs)) {
1657		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)blkno),
1658		    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1659		if (error)
1660			continue;
1661		if (fs->fs_size - blkno > NINDIR(fs))
1662			last = NINDIR(fs);
1663		else
1664			last = fs->fs_size - blkno;
1665		for (loc = 0; loc < last; loc++) {
1666			if (ip->i_ump->um_fstype == UFS1) {
1667				dblk = ((ufs1_daddr_t *)(ibp->b_data))[loc];
1668				if (dblk == 0)
1669					continue;
1670				if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1671					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
1672				else if ((dblk == blkstofrags(fs, blkno) &&
1673				     ffs_snapblkfree(fs, ip->i_devvp, dblk,
1674				     fs->fs_bsize, ip->i_number, vp->v_type,
1675				     NULL))) {
1676					ip->i_din1->di_blocks -=
1677					    btodb(fs->fs_bsize);
1678					((ufs1_daddr_t *)(ibp->b_data))[loc]= 0;
1679				}
1680				continue;
1681			}
1682			dblk = ((ufs2_daddr_t *)(ibp->b_data))[loc];
1683			if (dblk == 0)
1684				continue;
1685			if (dblk == BLK_NOCOPY || dblk == BLK_SNAP)
1686				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
1687			else if ((dblk == blkstofrags(fs, blkno) &&
1688			     ffs_snapblkfree(fs, ip->i_devvp, dblk,
1689			     fs->fs_bsize, ip->i_number, vp->v_type, NULL))) {
1690				ip->i_din2->di_blocks -= btodb(fs->fs_bsize);
1691				((ufs2_daddr_t *)(ibp->b_data))[loc] = 0;
1692			}
1693		}
1694		bawrite(ibp);
1695	}
1696	/*
1697	 * Clear snapshot flag and drop reference.
1698	 */
1699	ip->i_flags &= ~SF_SNAPSHOT;
1700	DIP_SET(ip, i_flags, ip->i_flags);
1701	ip->i_flag |= IN_CHANGE | IN_UPDATE;
1702	/*
1703	 * The dirtied indirects must be written out before
1704	 * softdep_setup_freeblocks() is called.  Otherwise indir_trunc()
1705	 * may find indirect pointers using the magic BLK_* values.
1706	 */
1707	if (DOINGSOFTDEP(vp))
1708		ffs_syncvnode(vp, MNT_WAIT, 0);
1709#ifdef QUOTA
1710	/*
1711	 * Reenable disk quotas for ex-snapshot file.
1712	 */
1713	if (!getinoquota(ip))
1714		(void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
1715#endif
1716}
1717
1718/*
1719 * Notification that a block is being freed. Return zero if the free
1720 * should be allowed to proceed. Return non-zero if the snapshot file
1721 * wants to claim the block. The block will be claimed if it is an
1722 * uncopied part of one of the snapshots. It will be freed if it is
1723 * either a BLK_NOCOPY or has already been copied in all of the snapshots.
1724 * If a fragment is being freed, then all snapshots that care about
1725 * it must make a copy since a snapshot file can only claim full sized
1726 * blocks. Note that if more than one snapshot file maps the block,
1727 * we can pick one at random to claim it. Since none of the snapshots
1728 * can change, we are assurred that they will all see the same unmodified
1729 * image. When deleting a snapshot file (see ffs_snapremove above), we
1730 * must push any of these claimed blocks to one of the other snapshots
1731 * that maps it. These claimed blocks are easily identified as they will
1732 * have a block number equal to their logical block number within the
1733 * snapshot. A copied block can never have this property because they
1734 * must always have been allocated from a BLK_NOCOPY location.
1735 */
1736int
1737ffs_snapblkfree(fs, devvp, bno, size, inum, vtype, wkhd)
1738	struct fs *fs;
1739	struct vnode *devvp;
1740	ufs2_daddr_t bno;
1741	long size;
1742	ino_t inum;
1743	enum vtype vtype;
1744	struct workhead *wkhd;
1745{
1746	struct buf *ibp, *cbp, *savedcbp = NULL;
1747	struct thread *td = curthread;
1748	struct inode *ip;
1749	struct vnode *vp = NULL;
1750	ufs_lbn_t lbn;
1751	ufs2_daddr_t blkno;
1752	int indiroff = 0, error = 0, claimedblk = 0;
1753	struct snapdata *sn;
1754
1755	lbn = fragstoblks(fs, bno);
1756retry:
1757	VI_LOCK(devvp);
1758	sn = devvp->v_rdev->si_snapdata;
1759	if (sn == NULL) {
1760		VI_UNLOCK(devvp);
1761		return (0);
1762	}
1763	if (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
1764	    VI_MTX(devvp)) != 0)
1765		goto retry;
1766	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
1767		vp = ITOV(ip);
1768		if (DOINGSOFTDEP(vp))
1769			softdep_prealloc(vp, MNT_WAIT);
1770		/*
1771		 * Lookup block being written.
1772		 */
1773		if (lbn < NDADDR) {
1774			blkno = DIP(ip, i_db[lbn]);
1775		} else {
1776			td->td_pflags |= TDP_COWINPROGRESS;
1777			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1778			    fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
1779			td->td_pflags &= ~TDP_COWINPROGRESS;
1780			if (error)
1781				break;
1782			indiroff = (lbn - NDADDR) % NINDIR(fs);
1783			if (ip->i_ump->um_fstype == UFS1)
1784				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
1785			else
1786				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
1787		}
1788		/*
1789		 * Check to see if block needs to be copied.
1790		 */
1791		if (blkno == 0) {
1792			/*
1793			 * A block that we map is being freed. If it has not
1794			 * been claimed yet, we will claim or copy it (below).
1795			 */
1796			claimedblk = 1;
1797		} else if (blkno == BLK_SNAP) {
1798			/*
1799			 * No previous snapshot claimed the block,
1800			 * so it will be freed and become a BLK_NOCOPY
1801			 * (don't care) for us.
1802			 */
1803			if (claimedblk)
1804				panic("snapblkfree: inconsistent block type");
1805			if (lbn < NDADDR) {
1806				DIP_SET(ip, i_db[lbn], BLK_NOCOPY);
1807				ip->i_flag |= IN_CHANGE | IN_UPDATE;
1808			} else if (ip->i_ump->um_fstype == UFS1) {
1809				((ufs1_daddr_t *)(ibp->b_data))[indiroff] =
1810				    BLK_NOCOPY;
1811				bdwrite(ibp);
1812			} else {
1813				((ufs2_daddr_t *)(ibp->b_data))[indiroff] =
1814				    BLK_NOCOPY;
1815				bdwrite(ibp);
1816			}
1817			continue;
1818		} else /* BLK_NOCOPY or default */ {
1819			/*
1820			 * If the snapshot has already copied the block
1821			 * (default), or does not care about the block,
1822			 * it is not needed.
1823			 */
1824			if (lbn >= NDADDR)
1825				bqrelse(ibp);
1826			continue;
1827		}
1828		/*
1829		 * If this is a full size block, we will just grab it
1830		 * and assign it to the snapshot inode. Otherwise we
1831		 * will proceed to copy it. See explanation for this
1832		 * routine as to why only a single snapshot needs to
1833		 * claim this block.
1834		 */
1835		if (size == fs->fs_bsize) {
1836#ifdef DEBUG
1837			if (snapdebug)
1838				printf("%s %ju lbn %jd from inum %ju\n",
1839				    "Grabonremove: snapino",
1840				    (uintmax_t)ip->i_number,
1841				    (intmax_t)lbn, (uintmax_t)inum);
1842#endif
1843			/*
1844			 * If journaling is tracking this write we must add
1845			 * the work to the inode or indirect being written.
1846			 */
1847			if (wkhd != NULL) {
1848				if (lbn < NDADDR)
1849					softdep_inode_append(ip,
1850					    curthread->td_ucred, wkhd);
1851				else
1852					softdep_buf_append(ibp, wkhd);
1853			}
1854			if (lbn < NDADDR) {
1855				DIP_SET(ip, i_db[lbn], bno);
1856			} else if (ip->i_ump->um_fstype == UFS1) {
1857				((ufs1_daddr_t *)(ibp->b_data))[indiroff] = bno;
1858				bdwrite(ibp);
1859			} else {
1860				((ufs2_daddr_t *)(ibp->b_data))[indiroff] = bno;
1861				bdwrite(ibp);
1862			}
1863			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) + btodb(size));
1864			ip->i_flag |= IN_CHANGE | IN_UPDATE;
1865			lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
1866			return (1);
1867		}
1868		if (lbn >= NDADDR)
1869			bqrelse(ibp);
1870		/*
1871		 * Allocate the block into which to do the copy. Note that this
1872		 * allocation will never require any additional allocations for
1873		 * the snapshot inode.
1874		 */
1875		td->td_pflags |= TDP_COWINPROGRESS;
1876		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
1877		    fs->fs_bsize, KERNCRED, 0, &cbp);
1878		td->td_pflags &= ~TDP_COWINPROGRESS;
1879		if (error)
1880			break;
1881#ifdef DEBUG
1882		if (snapdebug)
1883			printf("%s%ju lbn %jd %s %ju size %ld to blkno %jd\n",
1884			    "Copyonremove: snapino ", (uintmax_t)ip->i_number,
1885			    (intmax_t)lbn, "for inum", (uintmax_t)inum, size,
1886			    (intmax_t)cbp->b_blkno);
1887#endif
1888		/*
1889		 * If we have already read the old block contents, then
1890		 * simply copy them to the new block. Note that we need
1891		 * to synchronously write snapshots that have not been
1892		 * unlinked, and hence will be visible after a crash,
1893		 * to ensure their integrity. At a minimum we ensure the
1894		 * integrity of the filesystem metadata, but use the
1895		 * dopersistence sysctl-setable flag to decide on the
1896		 * persistence needed for file content data.
1897		 */
1898		if (savedcbp != 0) {
1899			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
1900			bawrite(cbp);
1901			if ((vtype == VDIR || dopersistence) &&
1902			    ip->i_effnlink > 0)
1903				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
1904			continue;
1905		}
1906		/*
1907		 * Otherwise, read the old block contents into the buffer.
1908		 */
1909		if ((error = readblock(vp, cbp, lbn)) != 0) {
1910			bzero(cbp->b_data, fs->fs_bsize);
1911			bawrite(cbp);
1912			if ((vtype == VDIR || dopersistence) &&
1913			    ip->i_effnlink > 0)
1914				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
1915			break;
1916		}
1917		savedcbp = cbp;
1918	}
1919	/*
1920	 * Note that we need to synchronously write snapshots that
1921	 * have not been unlinked, and hence will be visible after
1922	 * a crash, to ensure their integrity. At a minimum we
1923	 * ensure the integrity of the filesystem metadata, but
1924	 * use the dopersistence sysctl-setable flag to decide on
1925	 * the persistence needed for file content data.
1926	 */
1927	if (savedcbp) {
1928		vp = savedcbp->b_vp;
1929		bawrite(savedcbp);
1930		if ((vtype == VDIR || dopersistence) &&
1931		    VTOI(vp)->i_effnlink > 0)
1932			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
1933	}
1934	/*
1935	 * If we have been unable to allocate a block in which to do
1936	 * the copy, then return non-zero so that the fragment will
1937	 * not be freed. Although space will be lost, the snapshot
1938	 * will stay consistent.
1939	 */
1940	if (error != 0 && wkhd != NULL)
1941		softdep_freework(wkhd);
1942	lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
1943	return (error);
1944}
1945
1946/*
1947 * Associate snapshot files when mounting.
1948 */
1949void
1950ffs_snapshot_mount(mp)
1951	struct mount *mp;
1952{
1953	struct ufsmount *ump = VFSTOUFS(mp);
1954	struct vnode *devvp = ump->um_devvp;
1955	struct fs *fs = ump->um_fs;
1956	struct thread *td = curthread;
1957	struct snapdata *sn;
1958	struct vnode *vp;
1959	struct vnode *lastvp;
1960	struct inode *ip;
1961	struct uio auio;
1962	struct iovec aiov;
1963	void *snapblklist;
1964	char *reason;
1965	daddr_t snaplistsize;
1966	int error, snaploc, loc;
1967
1968	/*
1969	 * XXX The following needs to be set before ffs_truncate or
1970	 * VOP_READ can be called.
1971	 */
1972	mp->mnt_stat.f_iosize = fs->fs_bsize;
1973	/*
1974	 * Process each snapshot listed in the superblock.
1975	 */
1976	vp = NULL;
1977	lastvp = NULL;
1978	sn = NULL;
1979	for (snaploc = 0; snaploc < FSMAXSNAP; snaploc++) {
1980		if (fs->fs_snapinum[snaploc] == 0)
1981			break;
1982		if ((error = ffs_vget(mp, fs->fs_snapinum[snaploc],
1983		    LK_EXCLUSIVE, &vp)) != 0){
1984			printf("ffs_snapshot_mount: vget failed %d\n", error);
1985			continue;
1986		}
1987		ip = VTOI(vp);
1988		if (!IS_SNAPSHOT(ip) || ip->i_size ==
1989		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
1990			if (!IS_SNAPSHOT(ip)) {
1991				reason = "non-snapshot";
1992			} else {
1993				reason = "old format snapshot";
1994				(void)ffs_truncate(vp, (off_t)0, 0, NOCRED);
1995				(void)ffs_syncvnode(vp, MNT_WAIT, 0);
1996			}
1997			printf("ffs_snapshot_mount: %s inode %d\n",
1998			    reason, fs->fs_snapinum[snaploc]);
1999			vput(vp);
2000			vp = NULL;
2001			for (loc = snaploc + 1; loc < FSMAXSNAP; loc++) {
2002				if (fs->fs_snapinum[loc] == 0)
2003					break;
2004				fs->fs_snapinum[loc - 1] = fs->fs_snapinum[loc];
2005			}
2006			fs->fs_snapinum[loc - 1] = 0;
2007			snaploc--;
2008			continue;
2009		}
2010		/*
2011		 * Acquire a lock on the snapdata structure, creating it if
2012		 * necessary.
2013		 */
2014		sn = ffs_snapdata_acquire(devvp);
2015		/*
2016		 * Change vnode to use shared snapshot lock instead of the
2017		 * original private lock.
2018		 */
2019		vp->v_vnlock = &sn->sn_lock;
2020		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
2021		/*
2022		 * Link it onto the active snapshot list.
2023		 */
2024		VI_LOCK(devvp);
2025		if (ip->i_nextsnap.tqe_prev != 0)
2026			panic("ffs_snapshot_mount: %ju already on list",
2027			    (uintmax_t)ip->i_number);
2028		else
2029			TAILQ_INSERT_TAIL(&sn->sn_head, ip, i_nextsnap);
2030		vp->v_vflag |= VV_SYSTEM;
2031		VI_UNLOCK(devvp);
2032		VOP_UNLOCK(vp, 0);
2033		lastvp = vp;
2034	}
2035	vp = lastvp;
2036	/*
2037	 * No usable snapshots found.
2038	 */
2039	if (sn == NULL || vp == NULL)
2040		return;
2041	/*
2042	 * Allocate the space for the block hints list. We always want to
2043	 * use the list from the newest snapshot.
2044	 */
2045	auio.uio_iov = &aiov;
2046	auio.uio_iovcnt = 1;
2047	aiov.iov_base = (void *)&snaplistsize;
2048	aiov.iov_len = sizeof(snaplistsize);
2049	auio.uio_resid = aiov.iov_len;
2050	auio.uio_offset =
2051	    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag));
2052	auio.uio_segflg = UIO_SYSSPACE;
2053	auio.uio_rw = UIO_READ;
2054	auio.uio_td = td;
2055	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
2056	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
2057		printf("ffs_snapshot_mount: read_1 failed %d\n", error);
2058		VOP_UNLOCK(vp, 0);
2059		return;
2060	}
2061	snapblklist = malloc(snaplistsize * sizeof(daddr_t),
2062	    M_UFSMNT, M_WAITOK);
2063	auio.uio_iovcnt = 1;
2064	aiov.iov_base = snapblklist;
2065	aiov.iov_len = snaplistsize * sizeof (daddr_t);
2066	auio.uio_resid = aiov.iov_len;
2067	auio.uio_offset -= sizeof(snaplistsize);
2068	if ((error = VOP_READ(vp, &auio, IO_UNIT, td->td_ucred)) != 0) {
2069		printf("ffs_snapshot_mount: read_2 failed %d\n", error);
2070		VOP_UNLOCK(vp, 0);
2071		free(snapblklist, M_UFSMNT);
2072		return;
2073	}
2074	VOP_UNLOCK(vp, 0);
2075	VI_LOCK(devvp);
2076	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_mount");
2077	sn->sn_listsize = snaplistsize;
2078	sn->sn_blklist = (daddr_t *)snapblklist;
2079	devvp->v_vflag |= VV_COPYONWRITE;
2080	VI_UNLOCK(devvp);
2081}
2082
2083/*
2084 * Disassociate snapshot files when unmounting.
2085 */
2086void
2087ffs_snapshot_unmount(mp)
2088	struct mount *mp;
2089{
2090	struct vnode *devvp = VFSTOUFS(mp)->um_devvp;
2091	struct snapdata *sn;
2092	struct inode *xp;
2093	struct vnode *vp;
2094
2095	VI_LOCK(devvp);
2096	sn = devvp->v_rdev->si_snapdata;
2097	while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
2098		vp = ITOV(xp);
2099		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
2100		xp->i_nextsnap.tqe_prev = 0;
2101		lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE,
2102		    VI_MTX(devvp));
2103		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);
2104		KASSERT(vp->v_vnlock == &sn->sn_lock,
2105		("ffs_snapshot_unmount: lost lock mutation"));
2106		vp->v_vnlock = &vp->v_lock;
2107		lockmgr(&vp->v_lock, LK_RELEASE, NULL);
2108		lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
2109		if (xp->i_effnlink > 0)
2110			vrele(vp);
2111		VI_LOCK(devvp);
2112		sn = devvp->v_rdev->si_snapdata;
2113	}
2114	try_free_snapdata(devvp);
2115	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
2116}
2117
2118/*
2119 * Check the buffer block to be belong to device buffer that shall be
2120 * locked after snaplk. devvp shall be locked on entry, and will be
2121 * leaved locked upon exit.
2122 */
2123static int
2124ffs_bp_snapblk(devvp, bp)
2125	struct vnode *devvp;
2126	struct buf *bp;
2127{
2128	struct snapdata *sn;
2129	struct fs *fs;
2130	ufs2_daddr_t lbn, *snapblklist;
2131	int lower, upper, mid;
2132
2133	ASSERT_VI_LOCKED(devvp, "ffs_bp_snapblk");
2134	KASSERT(devvp->v_type == VCHR, ("Not a device %p", devvp));
2135	sn = devvp->v_rdev->si_snapdata;
2136	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) == NULL)
2137		return (0);
2138	fs = TAILQ_FIRST(&sn->sn_head)->i_fs;
2139	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
2140	snapblklist = sn->sn_blklist;
2141	upper = sn->sn_listsize - 1;
2142	lower = 1;
2143	while (lower <= upper) {
2144		mid = (lower + upper) / 2;
2145		if (snapblklist[mid] == lbn)
2146			break;
2147		if (snapblklist[mid] < lbn)
2148			lower = mid + 1;
2149		else
2150			upper = mid - 1;
2151	}
2152	if (lower <= upper)
2153		return (1);
2154	return (0);
2155}
2156
2157void
2158ffs_bdflush(bo, bp)
2159	struct bufobj *bo;
2160	struct buf *bp;
2161{
2162	struct thread *td;
2163	struct vnode *vp, *devvp;
2164	struct buf *nbp;
2165	int bp_bdskip;
2166
2167	if (bo->bo_dirty.bv_cnt <= dirtybufthresh)
2168		return;
2169
2170	td = curthread;
2171	vp = bp->b_vp;
2172	devvp = bo->__bo_vnode;
2173	KASSERT(vp == devvp, ("devvp != vp %p %p", bo, bp));
2174
2175	VI_LOCK(devvp);
2176	bp_bdskip = ffs_bp_snapblk(devvp, bp);
2177	if (bp_bdskip)
2178		bdwriteskip++;
2179	VI_UNLOCK(devvp);
2180	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10 && !bp_bdskip) {
2181		(void) VOP_FSYNC(vp, MNT_NOWAIT, td);
2182		altbufferflushes++;
2183	} else {
2184		BO_LOCK(bo);
2185		/*
2186		 * Try to find a buffer to flush.
2187		 */
2188		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
2189			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
2190			    BUF_LOCK(nbp,
2191				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
2192				continue;
2193			if (bp == nbp)
2194				panic("bdwrite: found ourselves");
2195			BO_UNLOCK(bo);
2196			/*
2197			 * Don't countdeps with the bo lock
2198			 * held.
2199			 */
2200			if (buf_countdeps(nbp, 0)) {
2201				BO_LOCK(bo);
2202				BUF_UNLOCK(nbp);
2203				continue;
2204			}
2205			if (bp_bdskip) {
2206				VI_LOCK(devvp);
2207				if (!ffs_bp_snapblk(vp, nbp)) {
2208					VI_UNLOCK(devvp);
2209					BO_LOCK(bo);
2210					BUF_UNLOCK(nbp);
2211					continue;
2212				}
2213				VI_UNLOCK(devvp);
2214			}
2215			if (nbp->b_flags & B_CLUSTEROK) {
2216				vfs_bio_awrite(nbp);
2217			} else {
2218				bremfree(nbp);
2219				bawrite(nbp);
2220			}
2221			dirtybufferflushes++;
2222			break;
2223		}
2224		if (nbp == NULL)
2225			BO_UNLOCK(bo);
2226	}
2227}
2228
2229/*
2230 * Check for need to copy block that is about to be written,
2231 * copying the block if necessary.
2232 */
2233int
2234ffs_copyonwrite(devvp, bp)
2235	struct vnode *devvp;
2236	struct buf *bp;
2237{
2238	struct snapdata *sn;
2239	struct buf *ibp, *cbp, *savedcbp = NULL;
2240	struct thread *td = curthread;
2241	struct fs *fs;
2242	struct inode *ip;
2243	struct vnode *vp = NULL;
2244	ufs2_daddr_t lbn, blkno, *snapblklist;
2245	int lower, upper, mid, indiroff, error = 0;
2246	int launched_async_io, prev_norunningbuf;
2247	long saved_runningbufspace;
2248
2249	if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp)))
2250		return (0);		/* Update on a snapshot file */
2251	if (td->td_pflags & TDP_COWINPROGRESS)
2252		panic("ffs_copyonwrite: recursive call");
2253	/*
2254	 * First check to see if it is in the preallocated list.
2255	 * By doing this check we avoid several potential deadlocks.
2256	 */
2257	VI_LOCK(devvp);
2258	sn = devvp->v_rdev->si_snapdata;
2259	if (sn == NULL ||
2260	    TAILQ_EMPTY(&sn->sn_head)) {
2261		VI_UNLOCK(devvp);
2262		return (0);		/* No snapshot */
2263	}
2264	ip = TAILQ_FIRST(&sn->sn_head);
2265	fs = ip->i_fs;
2266	lbn = fragstoblks(fs, dbtofsb(fs, bp->b_blkno));
2267	snapblklist = sn->sn_blklist;
2268	upper = sn->sn_listsize - 1;
2269	lower = 1;
2270	while (lower <= upper) {
2271		mid = (lower + upper) / 2;
2272		if (snapblklist[mid] == lbn)
2273			break;
2274		if (snapblklist[mid] < lbn)
2275			lower = mid + 1;
2276		else
2277			upper = mid - 1;
2278	}
2279	if (lower <= upper) {
2280		VI_UNLOCK(devvp);
2281		return (0);
2282	}
2283	launched_async_io = 0;
2284	prev_norunningbuf = td->td_pflags & TDP_NORUNNINGBUF;
2285	/*
2286	 * Since I/O on bp isn't yet in progress and it may be blocked
2287	 * for a long time waiting on snaplk, back it out of
2288	 * runningbufspace, possibly waking other threads waiting for space.
2289	 */
2290	saved_runningbufspace = bp->b_runningbufspace;
2291	if (saved_runningbufspace != 0)
2292		runningbufwakeup(bp);
2293	/*
2294	 * Not in the precomputed list, so check the snapshots.
2295	 */
2296	while (lockmgr(&sn->sn_lock, LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
2297	    VI_MTX(devvp)) != 0) {
2298		VI_LOCK(devvp);
2299		sn = devvp->v_rdev->si_snapdata;
2300		if (sn == NULL ||
2301		    TAILQ_EMPTY(&sn->sn_head)) {
2302			VI_UNLOCK(devvp);
2303			if (saved_runningbufspace != 0) {
2304				bp->b_runningbufspace = saved_runningbufspace;
2305				atomic_add_long(&runningbufspace,
2306					       bp->b_runningbufspace);
2307			}
2308			return (0);		/* Snapshot gone */
2309		}
2310	}
2311	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
2312		vp = ITOV(ip);
2313		if (DOINGSOFTDEP(vp))
2314			softdep_prealloc(vp, MNT_WAIT);
2315		/*
2316		 * We ensure that everything of our own that needs to be
2317		 * copied will be done at the time that ffs_snapshot is
2318		 * called. Thus we can skip the check here which can
2319		 * deadlock in doing the lookup in UFS_BALLOC.
2320		 */
2321		if (bp->b_vp == vp)
2322			continue;
2323		/*
2324		 * Check to see if block needs to be copied. We do not have
2325		 * to hold the snapshot lock while doing this lookup as it
2326		 * will never require any additional allocations for the
2327		 * snapshot inode.
2328		 */
2329		if (lbn < NDADDR) {
2330			blkno = DIP(ip, i_db[lbn]);
2331		} else {
2332			td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
2333			error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
2334			   fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
2335			td->td_pflags &= ~TDP_COWINPROGRESS;
2336			if (error)
2337				break;
2338			indiroff = (lbn - NDADDR) % NINDIR(fs);
2339			if (ip->i_ump->um_fstype == UFS1)
2340				blkno=((ufs1_daddr_t *)(ibp->b_data))[indiroff];
2341			else
2342				blkno=((ufs2_daddr_t *)(ibp->b_data))[indiroff];
2343			bqrelse(ibp);
2344		}
2345#ifdef INVARIANTS
2346		if (blkno == BLK_SNAP && bp->b_lblkno >= 0)
2347			panic("ffs_copyonwrite: bad copy block");
2348#endif
2349		if (blkno != 0)
2350			continue;
2351		/*
2352		 * Allocate the block into which to do the copy. Since
2353		 * multiple processes may all try to copy the same block,
2354		 * we have to recheck our need to do a copy if we sleep
2355		 * waiting for the lock.
2356		 *
2357		 * Because all snapshots on a filesystem share a single
2358		 * lock, we ensure that we will never be in competition
2359		 * with another process to allocate a block.
2360		 */
2361		td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
2362		error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
2363		    fs->fs_bsize, KERNCRED, 0, &cbp);
2364		td->td_pflags &= ~TDP_COWINPROGRESS;
2365		if (error)
2366			break;
2367#ifdef DEBUG
2368		if (snapdebug) {
2369			printf("Copyonwrite: snapino %ju lbn %jd for ",
2370			    (uintmax_t)ip->i_number, (intmax_t)lbn);
2371			if (bp->b_vp == devvp)
2372				printf("fs metadata");
2373			else
2374				printf("inum %ju",
2375				    (uintmax_t)VTOI(bp->b_vp)->i_number);
2376			printf(" lblkno %jd to blkno %jd\n",
2377			    (intmax_t)bp->b_lblkno, (intmax_t)cbp->b_blkno);
2378		}
2379#endif
2380		/*
2381		 * If we have already read the old block contents, then
2382		 * simply copy them to the new block. Note that we need
2383		 * to synchronously write snapshots that have not been
2384		 * unlinked, and hence will be visible after a crash,
2385		 * to ensure their integrity. At a minimum we ensure the
2386		 * integrity of the filesystem metadata, but use the
2387		 * dopersistence sysctl-setable flag to decide on the
2388		 * persistence needed for file content data.
2389		 */
2390		if (savedcbp != 0) {
2391			bcopy(savedcbp->b_data, cbp->b_data, fs->fs_bsize);
2392			bawrite(cbp);
2393			if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
2394			    dopersistence) && ip->i_effnlink > 0)
2395				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
2396			else
2397				launched_async_io = 1;
2398			continue;
2399		}
2400		/*
2401		 * Otherwise, read the old block contents into the buffer.
2402		 */
2403		if ((error = readblock(vp, cbp, lbn)) != 0) {
2404			bzero(cbp->b_data, fs->fs_bsize);
2405			bawrite(cbp);
2406			if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
2407			    dopersistence) && ip->i_effnlink > 0)
2408				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
2409			else
2410				launched_async_io = 1;
2411			break;
2412		}
2413		savedcbp = cbp;
2414	}
2415	/*
2416	 * Note that we need to synchronously write snapshots that
2417	 * have not been unlinked, and hence will be visible after
2418	 * a crash, to ensure their integrity. At a minimum we
2419	 * ensure the integrity of the filesystem metadata, but
2420	 * use the dopersistence sysctl-setable flag to decide on
2421	 * the persistence needed for file content data.
2422	 */
2423	if (savedcbp) {
2424		vp = savedcbp->b_vp;
2425		bawrite(savedcbp);
2426		if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
2427		    dopersistence) && VTOI(vp)->i_effnlink > 0)
2428			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
2429		else
2430			launched_async_io = 1;
2431	}
2432	lockmgr(vp->v_vnlock, LK_RELEASE, NULL);
2433	td->td_pflags = (td->td_pflags & ~TDP_NORUNNINGBUF) |
2434		prev_norunningbuf;
2435	if (launched_async_io && (td->td_pflags & TDP_NORUNNINGBUF) == 0)
2436		waitrunningbufspace();
2437	/*
2438	 * I/O on bp will now be started, so count it in runningbufspace.
2439	 */
2440	if (saved_runningbufspace != 0) {
2441		bp->b_runningbufspace = saved_runningbufspace;
2442		atomic_add_long(&runningbufspace, bp->b_runningbufspace);
2443	}
2444	return (error);
2445}
2446
2447/*
2448 * sync snapshots to force freework records waiting on snapshots to claim
2449 * blocks to free.
2450 */
2451void
2452ffs_sync_snap(mp, waitfor)
2453	struct mount *mp;
2454	int waitfor;
2455{
2456	struct snapdata *sn;
2457	struct vnode *devvp;
2458	struct vnode *vp;
2459	struct inode *ip;
2460
2461	devvp = VFSTOUFS(mp)->um_devvp;
2462	if ((devvp->v_vflag & VV_COPYONWRITE) == 0)
2463		return;
2464	for (;;) {
2465		VI_LOCK(devvp);
2466		sn = devvp->v_rdev->si_snapdata;
2467		if (sn == NULL) {
2468			VI_UNLOCK(devvp);
2469			return;
2470		}
2471		if (lockmgr(&sn->sn_lock,
2472		    LK_INTERLOCK | LK_EXCLUSIVE | LK_SLEEPFAIL,
2473		    VI_MTX(devvp)) == 0)
2474			break;
2475	}
2476	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
2477		vp = ITOV(ip);
2478		ffs_syncvnode(vp, waitfor, NO_INO_UPDT);
2479	}
2480	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
2481}
2482
2483/*
2484 * Read the specified block into the given buffer.
2485 * Much of this boiler-plate comes from bwrite().
2486 */
2487static int
2488readblock(vp, bp, lbn)
2489	struct vnode *vp;
2490	struct buf *bp;
2491	ufs2_daddr_t lbn;
2492{
2493	struct inode *ip = VTOI(vp);
2494	struct bio *bip;
2495
2496	bip = g_alloc_bio();
2497	bip->bio_cmd = BIO_READ;
2498	bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
2499	bip->bio_data = bp->b_data;
2500	bip->bio_length = bp->b_bcount;
2501	bip->bio_done = NULL;
2502
2503	g_io_request(bip, ip->i_devvp->v_bufobj.bo_private);
2504	bp->b_error = biowait(bip, "snaprdb");
2505	g_destroy_bio(bip);
2506	return (bp->b_error);
2507}
2508
2509#endif
2510
2511/*
2512 * Process file deletes that were deferred by ufs_inactive() due to
2513 * the file system being suspended. Transfer IN_LAZYACCESS into
2514 * IN_MODIFIED for vnodes that were accessed during suspension.
2515 */
2516void
2517process_deferred_inactive(struct mount *mp)
2518{
2519	struct vnode *vp, *mvp;
2520	struct inode *ip;
2521	struct thread *td;
2522	int error;
2523
2524	td = curthread;
2525	(void) vn_start_secondary_write(NULL, &mp, V_WAIT);
2526 loop:
2527	MNT_VNODE_FOREACH_ALL(vp, mp, mvp) {
2528		/*
2529		 * IN_LAZYACCESS is checked here without holding any
2530		 * vnode lock, but this flag is set only while holding
2531		 * vnode interlock.
2532		 */
2533		if (vp->v_type == VNON ||
2534		    ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
2535		    ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) {
2536			VI_UNLOCK(vp);
2537			continue;
2538		}
2539		vholdl(vp);
2540		error = vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK);
2541		if (error != 0) {
2542			vdrop(vp);
2543			if (error == ENOENT)
2544				continue;	/* vnode recycled */
2545			MNT_VNODE_FOREACH_ALL_ABORT(mp, mvp);
2546			goto loop;
2547		}
2548		ip = VTOI(vp);
2549		if ((ip->i_flag & IN_LAZYACCESS) != 0) {
2550			ip->i_flag &= ~IN_LAZYACCESS;
2551			ip->i_flag |= IN_MODIFIED;
2552		}
2553		VI_LOCK(vp);
2554		if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) {
2555			VI_UNLOCK(vp);
2556			VOP_UNLOCK(vp, 0);
2557			vdrop(vp);
2558			continue;
2559		}
2560		vinactive(vp, td);
2561		VNASSERT((vp->v_iflag & VI_OWEINACT) == 0, vp,
2562			 ("process_deferred_inactive: got VI_OWEINACT"));
2563		VI_UNLOCK(vp);
2564		VOP_UNLOCK(vp, 0);
2565		vdrop(vp);
2566	}
2567	vn_finished_secondary_write(mp);
2568}
2569
2570#ifndef NO_FFS_SNAPSHOT
2571
2572static struct snapdata *
2573ffs_snapdata_alloc(void)
2574{
2575	struct snapdata *sn;
2576
2577	/*
2578	 * Fetch a snapdata from the free list if there is one available.
2579	 */
2580	mtx_lock(&snapfree_lock);
2581	sn = LIST_FIRST(&snapfree);
2582	if (sn != NULL)
2583		LIST_REMOVE(sn, sn_link);
2584	mtx_unlock(&snapfree_lock);
2585	if (sn != NULL)
2586		return (sn);
2587	/*
2588 	 * If there were no free snapdatas allocate one.
2589	 */
2590	sn = malloc(sizeof *sn, M_UFSMNT, M_WAITOK | M_ZERO);
2591	TAILQ_INIT(&sn->sn_head);
2592	lockinit(&sn->sn_lock, PVFS, "snaplk", VLKTIMEOUT,
2593	    LK_CANRECURSE | LK_NOSHARE);
2594	return (sn);
2595}
2596
2597/*
2598 * The snapdata is never freed because we can not be certain that
2599 * there are no threads sleeping on the snap lock.  Persisting
2600 * them permanently avoids costly synchronization in ffs_lock().
2601 */
2602static void
2603ffs_snapdata_free(struct snapdata *sn)
2604{
2605	mtx_lock(&snapfree_lock);
2606	LIST_INSERT_HEAD(&snapfree, sn, sn_link);
2607	mtx_unlock(&snapfree_lock);
2608}
2609
2610/* Try to free snapdata associated with devvp */
2611static void
2612try_free_snapdata(struct vnode *devvp)
2613{
2614	struct snapdata *sn;
2615	ufs2_daddr_t *snapblklist;
2616
2617	ASSERT_VI_LOCKED(devvp, "try_free_snapdata");
2618	sn = devvp->v_rdev->si_snapdata;
2619
2620	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
2621	    (devvp->v_vflag & VV_COPYONWRITE) == 0) {
2622		VI_UNLOCK(devvp);
2623		return;
2624	}
2625
2626	devvp->v_rdev->si_snapdata = NULL;
2627	devvp->v_vflag &= ~VV_COPYONWRITE;
2628	lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp));
2629	snapblklist = sn->sn_blklist;
2630	sn->sn_blklist = NULL;
2631	sn->sn_listsize = 0;
2632	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
2633	if (snapblklist != NULL)
2634		free(snapblklist, M_UFSMNT);
2635	ffs_snapdata_free(sn);
2636}
2637
2638static struct snapdata *
2639ffs_snapdata_acquire(struct vnode *devvp)
2640{
2641	struct snapdata *nsn;
2642	struct snapdata *sn;
2643
2644	/*
2645	 * Allocate a free snapdata.  This is done before acquiring the
2646	 * devvp lock to avoid allocation while the devvp interlock is
2647	 * held.
2648	 */
2649	nsn = ffs_snapdata_alloc();
2650	/*
2651	 * If there snapshots already exist on this filesystem grab a
2652	 * reference to the shared lock.  Otherwise this is the first
2653	 * snapshot on this filesystem and we need to use our
2654	 * pre-allocated snapdata.
2655	 */
2656	VI_LOCK(devvp);
2657	if (devvp->v_rdev->si_snapdata == NULL) {
2658		devvp->v_rdev->si_snapdata = nsn;
2659		nsn = NULL;
2660	}
2661	sn = devvp->v_rdev->si_snapdata;
2662	/*
2663	 * Acquire the snapshot lock.
2664	 */
2665	lockmgr(&sn->sn_lock,
2666	    LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY, VI_MTX(devvp));
2667	/*
2668	 * Free any unused snapdata.
2669	 */
2670	if (nsn != NULL)
2671		ffs_snapdata_free(nsn);
2672
2673	return (sn);
2674}
2675
2676#endif
2677