ffs_softdep.c revision 284199
1/*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick.
3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4 * All rights reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14 *	1614 Oxford Street		mckusick@mckusick.com
15 *	Berkeley, CA 94709-1608		+1-510-843-9542
16 *	USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40 */
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_softdep.c 284199 2015-06-10 02:04:02Z kib $");
44
45#include "opt_ffs.h"
46#include "opt_quota.h"
47#include "opt_ddb.h"
48
49/*
50 * For now we want the safety net that the DEBUG flag provides.
51 */
52#ifndef DEBUG
53#define DEBUG
54#endif
55
56#include <sys/param.h>
57#include <sys/kernel.h>
58#include <sys/systm.h>
59#include <sys/bio.h>
60#include <sys/buf.h>
61#include <sys/kdb.h>
62#include <sys/kthread.h>
63#include <sys/ktr.h>
64#include <sys/limits.h>
65#include <sys/lock.h>
66#include <sys/malloc.h>
67#include <sys/mount.h>
68#include <sys/mutex.h>
69#include <sys/namei.h>
70#include <sys/priv.h>
71#include <sys/proc.h>
72#include <sys/rwlock.h>
73#include <sys/stat.h>
74#include <sys/sysctl.h>
75#include <sys/syslog.h>
76#include <sys/vnode.h>
77#include <sys/conf.h>
78
79#include <ufs/ufs/dir.h>
80#include <ufs/ufs/extattr.h>
81#include <ufs/ufs/quota.h>
82#include <ufs/ufs/inode.h>
83#include <ufs/ufs/ufsmount.h>
84#include <ufs/ffs/fs.h>
85#include <ufs/ffs/softdep.h>
86#include <ufs/ffs/ffs_extern.h>
87#include <ufs/ufs/ufs_extern.h>
88
89#include <vm/vm.h>
90#include <vm/vm_extern.h>
91#include <vm/vm_object.h>
92
93#include <geom/geom.h>
94
95#include <ddb/ddb.h>
96
97#define	KTR_SUJ	0	/* Define to KTR_SPARE. */
98
99#ifndef SOFTUPDATES
100
101int
102softdep_flushfiles(oldmnt, flags, td)
103	struct mount *oldmnt;
104	int flags;
105	struct thread *td;
106{
107
108	panic("softdep_flushfiles called");
109}
110
111int
112softdep_mount(devvp, mp, fs, cred)
113	struct vnode *devvp;
114	struct mount *mp;
115	struct fs *fs;
116	struct ucred *cred;
117{
118
119	return (0);
120}
121
122void
123softdep_initialize()
124{
125
126	return;
127}
128
129void
130softdep_uninitialize()
131{
132
133	return;
134}
135
136void
137softdep_unmount(mp)
138	struct mount *mp;
139{
140
141	panic("softdep_unmount called");
142}
143
144void
145softdep_setup_sbupdate(ump, fs, bp)
146	struct ufsmount *ump;
147	struct fs *fs;
148	struct buf *bp;
149{
150
151	panic("softdep_setup_sbupdate called");
152}
153
154void
155softdep_setup_inomapdep(bp, ip, newinum, mode)
156	struct buf *bp;
157	struct inode *ip;
158	ino_t newinum;
159	int mode;
160{
161
162	panic("softdep_setup_inomapdep called");
163}
164
165void
166softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
167	struct buf *bp;
168	struct mount *mp;
169	ufs2_daddr_t newblkno;
170	int frags;
171	int oldfrags;
172{
173
174	panic("softdep_setup_blkmapdep called");
175}
176
177void
178softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
179	struct inode *ip;
180	ufs_lbn_t lbn;
181	ufs2_daddr_t newblkno;
182	ufs2_daddr_t oldblkno;
183	long newsize;
184	long oldsize;
185	struct buf *bp;
186{
187
188	panic("softdep_setup_allocdirect called");
189}
190
191void
192softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
193	struct inode *ip;
194	ufs_lbn_t lbn;
195	ufs2_daddr_t newblkno;
196	ufs2_daddr_t oldblkno;
197	long newsize;
198	long oldsize;
199	struct buf *bp;
200{
201
202	panic("softdep_setup_allocext called");
203}
204
205void
206softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
207	struct inode *ip;
208	ufs_lbn_t lbn;
209	struct buf *bp;
210	int ptrno;
211	ufs2_daddr_t newblkno;
212	ufs2_daddr_t oldblkno;
213	struct buf *nbp;
214{
215
216	panic("softdep_setup_allocindir_page called");
217}
218
219void
220softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
221	struct buf *nbp;
222	struct inode *ip;
223	struct buf *bp;
224	int ptrno;
225	ufs2_daddr_t newblkno;
226{
227
228	panic("softdep_setup_allocindir_meta called");
229}
230
231void
232softdep_journal_freeblocks(ip, cred, length, flags)
233	struct inode *ip;
234	struct ucred *cred;
235	off_t length;
236	int flags;
237{
238
239	panic("softdep_journal_freeblocks called");
240}
241
242void
243softdep_journal_fsync(ip)
244	struct inode *ip;
245{
246
247	panic("softdep_journal_fsync called");
248}
249
250void
251softdep_setup_freeblocks(ip, length, flags)
252	struct inode *ip;
253	off_t length;
254	int flags;
255{
256
257	panic("softdep_setup_freeblocks called");
258}
259
260void
261softdep_freefile(pvp, ino, mode)
262		struct vnode *pvp;
263		ino_t ino;
264		int mode;
265{
266
267	panic("softdep_freefile called");
268}
269
270int
271softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
272	struct buf *bp;
273	struct inode *dp;
274	off_t diroffset;
275	ino_t newinum;
276	struct buf *newdirbp;
277	int isnewblk;
278{
279
280	panic("softdep_setup_directory_add called");
281}
282
283void
284softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
285	struct buf *bp;
286	struct inode *dp;
287	caddr_t base;
288	caddr_t oldloc;
289	caddr_t newloc;
290	int entrysize;
291{
292
293	panic("softdep_change_directoryentry_offset called");
294}
295
296void
297softdep_setup_remove(bp, dp, ip, isrmdir)
298	struct buf *bp;
299	struct inode *dp;
300	struct inode *ip;
301	int isrmdir;
302{
303
304	panic("softdep_setup_remove called");
305}
306
307void
308softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
309	struct buf *bp;
310	struct inode *dp;
311	struct inode *ip;
312	ino_t newinum;
313	int isrmdir;
314{
315
316	panic("softdep_setup_directory_change called");
317}
318
319void
320softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
321	struct mount *mp;
322	struct buf *bp;
323	ufs2_daddr_t blkno;
324	int frags;
325	struct workhead *wkhd;
326{
327
328	panic("%s called", __FUNCTION__);
329}
330
331void
332softdep_setup_inofree(mp, bp, ino, wkhd)
333	struct mount *mp;
334	struct buf *bp;
335	ino_t ino;
336	struct workhead *wkhd;
337{
338
339	panic("%s called", __FUNCTION__);
340}
341
342void
343softdep_setup_unlink(dp, ip)
344	struct inode *dp;
345	struct inode *ip;
346{
347
348	panic("%s called", __FUNCTION__);
349}
350
351void
352softdep_setup_link(dp, ip)
353	struct inode *dp;
354	struct inode *ip;
355{
356
357	panic("%s called", __FUNCTION__);
358}
359
360void
361softdep_revert_link(dp, ip)
362	struct inode *dp;
363	struct inode *ip;
364{
365
366	panic("%s called", __FUNCTION__);
367}
368
369void
370softdep_setup_rmdir(dp, ip)
371	struct inode *dp;
372	struct inode *ip;
373{
374
375	panic("%s called", __FUNCTION__);
376}
377
378void
379softdep_revert_rmdir(dp, ip)
380	struct inode *dp;
381	struct inode *ip;
382{
383
384	panic("%s called", __FUNCTION__);
385}
386
387void
388softdep_setup_create(dp, ip)
389	struct inode *dp;
390	struct inode *ip;
391{
392
393	panic("%s called", __FUNCTION__);
394}
395
396void
397softdep_revert_create(dp, ip)
398	struct inode *dp;
399	struct inode *ip;
400{
401
402	panic("%s called", __FUNCTION__);
403}
404
405void
406softdep_setup_mkdir(dp, ip)
407	struct inode *dp;
408	struct inode *ip;
409{
410
411	panic("%s called", __FUNCTION__);
412}
413
414void
415softdep_revert_mkdir(dp, ip)
416	struct inode *dp;
417	struct inode *ip;
418{
419
420	panic("%s called", __FUNCTION__);
421}
422
423void
424softdep_setup_dotdot_link(dp, ip)
425	struct inode *dp;
426	struct inode *ip;
427{
428
429	panic("%s called", __FUNCTION__);
430}
431
432int
433softdep_prealloc(vp, waitok)
434	struct vnode *vp;
435	int waitok;
436{
437
438	panic("%s called", __FUNCTION__);
439}
440
441int
442softdep_journal_lookup(mp, vpp)
443	struct mount *mp;
444	struct vnode **vpp;
445{
446
447	return (ENOENT);
448}
449
450void
451softdep_change_linkcnt(ip)
452	struct inode *ip;
453{
454
455	panic("softdep_change_linkcnt called");
456}
457
458void
459softdep_load_inodeblock(ip)
460	struct inode *ip;
461{
462
463	panic("softdep_load_inodeblock called");
464}
465
466void
467softdep_update_inodeblock(ip, bp, waitfor)
468	struct inode *ip;
469	struct buf *bp;
470	int waitfor;
471{
472
473	panic("softdep_update_inodeblock called");
474}
475
476int
477softdep_fsync(vp)
478	struct vnode *vp;	/* the "in_core" copy of the inode */
479{
480
481	return (0);
482}
483
484void
485softdep_fsync_mountdev(vp)
486	struct vnode *vp;
487{
488
489	return;
490}
491
492int
493softdep_flushworklist(oldmnt, countp, td)
494	struct mount *oldmnt;
495	int *countp;
496	struct thread *td;
497{
498
499	*countp = 0;
500	return (0);
501}
502
503int
504softdep_sync_metadata(struct vnode *vp)
505{
506
507	panic("softdep_sync_metadata called");
508}
509
510int
511softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
512{
513
514	panic("softdep_sync_buf called");
515}
516
517int
518softdep_slowdown(vp)
519	struct vnode *vp;
520{
521
522	panic("softdep_slowdown called");
523}
524
525int
526softdep_request_cleanup(fs, vp, cred, resource)
527	struct fs *fs;
528	struct vnode *vp;
529	struct ucred *cred;
530	int resource;
531{
532
533	return (0);
534}
535
536int
537softdep_check_suspend(struct mount *mp,
538		      struct vnode *devvp,
539		      int softdep_depcnt,
540		      int softdep_accdepcnt,
541		      int secondary_writes,
542		      int secondary_accwrites)
543{
544	struct bufobj *bo;
545	int error;
546
547	(void) softdep_depcnt,
548	(void) softdep_accdepcnt;
549
550	bo = &devvp->v_bufobj;
551	ASSERT_BO_WLOCKED(bo);
552
553	MNT_ILOCK(mp);
554	while (mp->mnt_secondary_writes != 0) {
555		BO_UNLOCK(bo);
556		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
557		    (PUSER - 1) | PDROP, "secwr", 0);
558		BO_LOCK(bo);
559		MNT_ILOCK(mp);
560	}
561
562	/*
563	 * Reasons for needing more work before suspend:
564	 * - Dirty buffers on devvp.
565	 * - Secondary writes occurred after start of vnode sync loop
566	 */
567	error = 0;
568	if (bo->bo_numoutput > 0 ||
569	    bo->bo_dirty.bv_cnt > 0 ||
570	    secondary_writes != 0 ||
571	    mp->mnt_secondary_writes != 0 ||
572	    secondary_accwrites != mp->mnt_secondary_accwrites)
573		error = EAGAIN;
574	BO_UNLOCK(bo);
575	return (error);
576}
577
578void
579softdep_get_depcounts(struct mount *mp,
580		      int *softdepactivep,
581		      int *softdepactiveaccp)
582{
583	(void) mp;
584	*softdepactivep = 0;
585	*softdepactiveaccp = 0;
586}
587
588void
589softdep_buf_append(bp, wkhd)
590	struct buf *bp;
591	struct workhead *wkhd;
592{
593
594	panic("softdep_buf_appendwork called");
595}
596
597void
598softdep_inode_append(ip, cred, wkhd)
599	struct inode *ip;
600	struct ucred *cred;
601	struct workhead *wkhd;
602{
603
604	panic("softdep_inode_appendwork called");
605}
606
607void
608softdep_freework(wkhd)
609	struct workhead *wkhd;
610{
611
612	panic("softdep_freework called");
613}
614
615#else
616
617FEATURE(softupdates, "FFS soft-updates support");
618
619static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
620    "soft updates stats");
621static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
622    "total dependencies allocated");
623static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
624    "high use dependencies allocated");
625static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
626    "current dependencies allocated");
627static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
628    "current dependencies written");
629
630unsigned long dep_current[D_LAST + 1];
631unsigned long dep_highuse[D_LAST + 1];
632unsigned long dep_total[D_LAST + 1];
633unsigned long dep_write[D_LAST + 1];
634
635#define	SOFTDEP_TYPE(type, str, long)					\
636    static MALLOC_DEFINE(M_ ## type, #str, long);			\
637    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
638	&dep_total[D_ ## type], 0, "");					\
639    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
640	&dep_current[D_ ## type], 0, "");				\
641    SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, 	\
642	&dep_highuse[D_ ## type], 0, "");				\
643    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
644	&dep_write[D_ ## type], 0, "");
645
646SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
647SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
648SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
649    "Block or frag allocated from cyl group map");
650SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
651SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
652SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
653SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
654SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
655SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
656SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
657SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
658SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
659SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
660SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
661SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
662SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
663SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
664SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
665SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
666SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
667SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
668SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
669SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
670SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
671SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
672SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
673SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
674
675static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
676
677static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
678static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
679static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
680
681#define M_SOFTDEP_FLAGS	(M_WAITOK)
682
683/*
684 * translate from workitem type to memory type
685 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
686 */
687static struct malloc_type *memtype[] = {
688	M_PAGEDEP,
689	M_INODEDEP,
690	M_BMSAFEMAP,
691	M_NEWBLK,
692	M_ALLOCDIRECT,
693	M_INDIRDEP,
694	M_ALLOCINDIR,
695	M_FREEFRAG,
696	M_FREEBLKS,
697	M_FREEFILE,
698	M_DIRADD,
699	M_MKDIR,
700	M_DIRREM,
701	M_NEWDIRBLK,
702	M_FREEWORK,
703	M_FREEDEP,
704	M_JADDREF,
705	M_JREMREF,
706	M_JMVREF,
707	M_JNEWBLK,
708	M_JFREEBLK,
709	M_JFREEFRAG,
710	M_JSEG,
711	M_JSEGDEP,
712	M_SBDEP,
713	M_JTRUNC,
714	M_JFSYNC,
715	M_SENTINEL
716};
717
718#define DtoM(type) (memtype[type])
719
720/*
721 * Names of malloc types.
722 */
723#define TYPENAME(type)  \
724	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
725/*
726 * End system adaptation definitions.
727 */
728
729#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
730#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
731
732/*
733 * Internal function prototypes.
734 */
735static	void check_clear_deps(struct mount *);
736static	void softdep_error(char *, int);
737static	int softdep_process_worklist(struct mount *, int);
738static	int softdep_waitidle(struct mount *, int);
739static	void drain_output(struct vnode *);
740static	struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
741static	int check_inodedep_free(struct inodedep *);
742static	void clear_remove(struct mount *);
743static	void clear_inodedeps(struct mount *);
744static	void unlinked_inodedep(struct mount *, struct inodedep *);
745static	void clear_unlinked_inodedep(struct inodedep *);
746static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
747static	int flush_pagedep_deps(struct vnode *, struct mount *,
748	    struct diraddhd *);
749static	int free_pagedep(struct pagedep *);
750static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
751static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
752static	int flush_deplist(struct allocdirectlst *, int, int *);
753static	int sync_cgs(struct mount *, int);
754static	int handle_written_filepage(struct pagedep *, struct buf *);
755static	int handle_written_sbdep(struct sbdep *, struct buf *);
756static	void initiate_write_sbdep(struct sbdep *);
757static	void diradd_inode_written(struct diradd *, struct inodedep *);
758static	int handle_written_indirdep(struct indirdep *, struct buf *,
759	    struct buf**);
760static	int handle_written_inodeblock(struct inodedep *, struct buf *);
761static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
762	    uint8_t *);
763static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
764static	void handle_written_jaddref(struct jaddref *);
765static	void handle_written_jremref(struct jremref *);
766static	void handle_written_jseg(struct jseg *, struct buf *);
767static	void handle_written_jnewblk(struct jnewblk *);
768static	void handle_written_jblkdep(struct jblkdep *);
769static	void handle_written_jfreefrag(struct jfreefrag *);
770static	void complete_jseg(struct jseg *);
771static	void complete_jsegs(struct jseg *);
772static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
773static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
774static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
775static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
776static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
777static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
778static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
779static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
780static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
781static	inline void inoref_write(struct inoref *, struct jseg *,
782	    struct jrefrec *);
783static	void handle_allocdirect_partdone(struct allocdirect *,
784	    struct workhead *);
785static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
786	    struct workhead *);
787static	void indirdep_complete(struct indirdep *);
788static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
789static	void indirblk_insert(struct freework *);
790static	void indirblk_remove(struct freework *);
791static	void handle_allocindir_partdone(struct allocindir *);
792static	void initiate_write_filepage(struct pagedep *, struct buf *);
793static	void initiate_write_indirdep(struct indirdep*, struct buf *);
794static	void handle_written_mkdir(struct mkdir *, int);
795static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
796	    uint8_t *);
797static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
798static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
799static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
800static	void handle_workitem_freefile(struct freefile *);
801static	int handle_workitem_remove(struct dirrem *, int);
802static	struct dirrem *newdirrem(struct buf *, struct inode *,
803	    struct inode *, int, struct dirrem **);
804static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
805	    struct buf *);
806static	void cancel_indirdep(struct indirdep *, struct buf *,
807	    struct freeblks *);
808static	void free_indirdep(struct indirdep *);
809static	void free_diradd(struct diradd *, struct workhead *);
810static	void merge_diradd(struct inodedep *, struct diradd *);
811static	void complete_diradd(struct diradd *);
812static	struct diradd *diradd_lookup(struct pagedep *, int);
813static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
814	    struct jremref *);
815static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
816	    struct jremref *);
817static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
818	    struct jremref *, struct jremref *);
819static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
820	    struct jremref *);
821static	void cancel_allocindir(struct allocindir *, struct buf *bp,
822	    struct freeblks *, int);
823static	int setup_trunc_indir(struct freeblks *, struct inode *,
824	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
825static	void complete_trunc_indir(struct freework *);
826static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
827	    int);
828static	void complete_mkdir(struct mkdir *);
829static	void free_newdirblk(struct newdirblk *);
830static	void free_jremref(struct jremref *);
831static	void free_jaddref(struct jaddref *);
832static	void free_jsegdep(struct jsegdep *);
833static	void free_jsegs(struct jblocks *);
834static	void rele_jseg(struct jseg *);
835static	void free_jseg(struct jseg *, struct jblocks *);
836static	void free_jnewblk(struct jnewblk *);
837static	void free_jblkdep(struct jblkdep *);
838static	void free_jfreefrag(struct jfreefrag *);
839static	void free_freedep(struct freedep *);
840static	void journal_jremref(struct dirrem *, struct jremref *,
841	    struct inodedep *);
842static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
843static	int cancel_jaddref(struct jaddref *, struct inodedep *,
844	    struct workhead *);
845static	void cancel_jfreefrag(struct jfreefrag *);
846static	inline void setup_freedirect(struct freeblks *, struct inode *,
847	    int, int);
848static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
849static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
850	    ufs_lbn_t, int);
851static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
852static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
853static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
854static	ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
855static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
856static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
857	    int, int);
858static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
859static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
860static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
861static	void newblk_freefrag(struct newblk*);
862static	void free_newblk(struct newblk *);
863static	void cancel_allocdirect(struct allocdirectlst *,
864	    struct allocdirect *, struct freeblks *);
865static	int check_inode_unwritten(struct inodedep *);
866static	int free_inodedep(struct inodedep *);
867static	void freework_freeblock(struct freework *);
868static	void freework_enqueue(struct freework *);
869static	int handle_workitem_freeblocks(struct freeblks *, int);
870static	int handle_complete_freeblocks(struct freeblks *, int);
871static	void handle_workitem_indirblk(struct freework *);
872static	void handle_written_freework(struct freework *);
873static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
874static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
875	    struct workhead *);
876static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
877	    struct inodedep *, struct allocindir *, ufs_lbn_t);
878static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
879	    ufs2_daddr_t, ufs_lbn_t);
880static	void handle_workitem_freefrag(struct freefrag *);
881static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
882	    ufs_lbn_t);
883static	void allocdirect_merge(struct allocdirectlst *,
884	    struct allocdirect *, struct allocdirect *);
885static	struct freefrag *allocindir_merge(struct allocindir *,
886	    struct allocindir *);
887static	int bmsafemap_find(struct bmsafemap_hashhead *, int,
888	    struct bmsafemap **);
889static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
890	    int cg, struct bmsafemap *);
891static	int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
892	    struct newblk **);
893static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
894static	int inodedep_find(struct inodedep_hashhead *, ino_t,
895	    struct inodedep **);
896static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
897static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
898	    int, struct pagedep **);
899static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
900	    struct pagedep **);
901static	void pause_timer(void *);
902static	int request_cleanup(struct mount *, int);
903static	void schedule_cleanup(struct mount *);
904static void softdep_ast_cleanup_proc(void);
905static	int process_worklist_item(struct mount *, int, int);
906static	void process_removes(struct vnode *);
907static	void process_truncates(struct vnode *);
908static	void jwork_move(struct workhead *, struct workhead *);
909static	void jwork_insert(struct workhead *, struct jsegdep *);
910static	void add_to_worklist(struct worklist *, int);
911static	void wake_worklist(struct worklist *);
912static	void wait_worklist(struct worklist *, char *);
913static	void remove_from_worklist(struct worklist *);
914static	void softdep_flush(void *);
915static	void softdep_flushjournal(struct mount *);
916static	int softdep_speedup(struct ufsmount *);
917static	void worklist_speedup(struct mount *);
918static	int journal_mount(struct mount *, struct fs *, struct ucred *);
919static	void journal_unmount(struct ufsmount *);
920static	int journal_space(struct ufsmount *, int);
921static	void journal_suspend(struct ufsmount *);
922static	int journal_unsuspend(struct ufsmount *ump);
923static	void softdep_prelink(struct vnode *, struct vnode *);
924static	void add_to_journal(struct worklist *);
925static	void remove_from_journal(struct worklist *);
926static	bool softdep_excess_inodes(struct ufsmount *);
927static	bool softdep_excess_dirrem(struct ufsmount *);
928static	void softdep_process_journal(struct mount *, struct worklist *, int);
929static	struct jremref *newjremref(struct dirrem *, struct inode *,
930	    struct inode *ip, off_t, nlink_t);
931static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
932	    uint16_t);
933static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
934	    uint16_t);
935static	inline struct jsegdep *inoref_jseg(struct inoref *);
936static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
937static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
938	    ufs2_daddr_t, int);
939static	void adjust_newfreework(struct freeblks *, int);
940static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
941static	void move_newblock_dep(struct jaddref *, struct inodedep *);
942static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
943static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
944	    ufs2_daddr_t, long, ufs_lbn_t);
945static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
946	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
947static	int jwait(struct worklist *, int);
948static	struct inodedep *inodedep_lookup_ip(struct inode *);
949static	int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
950static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
951static	void handle_jwork(struct workhead *);
952static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
953	    struct mkdir **);
954static	struct jblocks *jblocks_create(void);
955static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
956static	void jblocks_free(struct jblocks *, struct mount *, int);
957static	void jblocks_destroy(struct jblocks *);
958static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
959
960/*
961 * Exported softdep operations.
962 */
963static	void softdep_disk_io_initiation(struct buf *);
964static	void softdep_disk_write_complete(struct buf *);
965static	void softdep_deallocate_dependencies(struct buf *);
966static	int softdep_count_dependencies(struct buf *bp, int);
967
968/*
969 * Global lock over all of soft updates.
970 */
971static struct mtx lk;
972MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF);
973
974#define ACQUIRE_GBLLOCK(lk)	mtx_lock(lk)
975#define FREE_GBLLOCK(lk)	mtx_unlock(lk)
976#define GBLLOCK_OWNED(lk)	mtx_assert((lk), MA_OWNED)
977
978/*
979 * Per-filesystem soft-updates locking.
980 */
981#define LOCK_PTR(ump)		(&(ump)->um_softdep->sd_fslock)
982#define TRY_ACQUIRE_LOCK(ump)	rw_try_wlock(&(ump)->um_softdep->sd_fslock)
983#define ACQUIRE_LOCK(ump)	rw_wlock(&(ump)->um_softdep->sd_fslock)
984#define FREE_LOCK(ump)		rw_wunlock(&(ump)->um_softdep->sd_fslock)
985#define LOCK_OWNED(ump)		rw_assert(&(ump)->um_softdep->sd_fslock, \
986				    RA_WLOCKED)
987
988#define	BUF_AREC(bp)		lockallowrecurse(&(bp)->b_lock)
989#define	BUF_NOREC(bp)		lockdisablerecurse(&(bp)->b_lock)
990
991/*
992 * Worklist queue management.
993 * These routines require that the lock be held.
994 */
995#ifndef /* NOT */ DEBUG
996#define WORKLIST_INSERT(head, item) do {	\
997	(item)->wk_state |= ONWORKLIST;		\
998	LIST_INSERT_HEAD(head, item, wk_list);	\
999} while (0)
1000#define WORKLIST_REMOVE(item) do {		\
1001	(item)->wk_state &= ~ONWORKLIST;	\
1002	LIST_REMOVE(item, wk_list);		\
1003} while (0)
1004#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
1005#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
1006
1007#else /* DEBUG */
1008static	void worklist_insert(struct workhead *, struct worklist *, int);
1009static	void worklist_remove(struct worklist *, int);
1010
1011#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
1012#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
1013#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
1014#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
1015
1016static void
1017worklist_insert(head, item, locked)
1018	struct workhead *head;
1019	struct worklist *item;
1020	int locked;
1021{
1022
1023	if (locked)
1024		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1025	if (item->wk_state & ONWORKLIST)
1026		panic("worklist_insert: %p %s(0x%X) already on list",
1027		    item, TYPENAME(item->wk_type), item->wk_state);
1028	item->wk_state |= ONWORKLIST;
1029	LIST_INSERT_HEAD(head, item, wk_list);
1030}
1031
1032static void
1033worklist_remove(item, locked)
1034	struct worklist *item;
1035	int locked;
1036{
1037
1038	if (locked)
1039		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1040	if ((item->wk_state & ONWORKLIST) == 0)
1041		panic("worklist_remove: %p %s(0x%X) not on list",
1042		    item, TYPENAME(item->wk_type), item->wk_state);
1043	item->wk_state &= ~ONWORKLIST;
1044	LIST_REMOVE(item, wk_list);
1045}
1046#endif /* DEBUG */
1047
1048/*
1049 * Merge two jsegdeps keeping only the oldest one as newer references
1050 * can't be discarded until after older references.
1051 */
1052static inline struct jsegdep *
1053jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1054{
1055	struct jsegdep *swp;
1056
1057	if (two == NULL)
1058		return (one);
1059
1060	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1061		swp = one;
1062		one = two;
1063		two = swp;
1064	}
1065	WORKLIST_REMOVE(&two->jd_list);
1066	free_jsegdep(two);
1067
1068	return (one);
1069}
1070
1071/*
1072 * If two freedeps are compatible free one to reduce list size.
1073 */
1074static inline struct freedep *
1075freedep_merge(struct freedep *one, struct freedep *two)
1076{
1077	if (two == NULL)
1078		return (one);
1079
1080	if (one->fd_freework == two->fd_freework) {
1081		WORKLIST_REMOVE(&two->fd_list);
1082		free_freedep(two);
1083	}
1084	return (one);
1085}
1086
1087/*
1088 * Move journal work from one list to another.  Duplicate freedeps and
1089 * jsegdeps are coalesced to keep the lists as small as possible.
1090 */
1091static void
1092jwork_move(dst, src)
1093	struct workhead *dst;
1094	struct workhead *src;
1095{
1096	struct freedep *freedep;
1097	struct jsegdep *jsegdep;
1098	struct worklist *wkn;
1099	struct worklist *wk;
1100
1101	KASSERT(dst != src,
1102	    ("jwork_move: dst == src"));
1103	freedep = NULL;
1104	jsegdep = NULL;
1105	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1106		if (wk->wk_type == D_JSEGDEP)
1107			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1108		if (wk->wk_type == D_FREEDEP)
1109			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1110	}
1111
1112	while ((wk = LIST_FIRST(src)) != NULL) {
1113		WORKLIST_REMOVE(wk);
1114		WORKLIST_INSERT(dst, wk);
1115		if (wk->wk_type == D_JSEGDEP) {
1116			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1117			continue;
1118		}
1119		if (wk->wk_type == D_FREEDEP)
1120			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1121	}
1122}
1123
1124static void
1125jwork_insert(dst, jsegdep)
1126	struct workhead *dst;
1127	struct jsegdep *jsegdep;
1128{
1129	struct jsegdep *jsegdepn;
1130	struct worklist *wk;
1131
1132	LIST_FOREACH(wk, dst, wk_list)
1133		if (wk->wk_type == D_JSEGDEP)
1134			break;
1135	if (wk == NULL) {
1136		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1137		return;
1138	}
1139	jsegdepn = WK_JSEGDEP(wk);
1140	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1141		WORKLIST_REMOVE(wk);
1142		free_jsegdep(jsegdepn);
1143		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1144	} else
1145		free_jsegdep(jsegdep);
1146}
1147
1148/*
1149 * Routines for tracking and managing workitems.
1150 */
1151static	void workitem_free(struct worklist *, int);
1152static	void workitem_alloc(struct worklist *, int, struct mount *);
1153static	void workitem_reassign(struct worklist *, int);
1154
1155#define	WORKITEM_FREE(item, type) \
1156	workitem_free((struct worklist *)(item), (type))
1157#define	WORKITEM_REASSIGN(item, type) \
1158	workitem_reassign((struct worklist *)(item), (type))
1159
1160static void
1161workitem_free(item, type)
1162	struct worklist *item;
1163	int type;
1164{
1165	struct ufsmount *ump;
1166
1167#ifdef DEBUG
1168	if (item->wk_state & ONWORKLIST)
1169		panic("workitem_free: %s(0x%X) still on list",
1170		    TYPENAME(item->wk_type), item->wk_state);
1171	if (item->wk_type != type && type != D_NEWBLK)
1172		panic("workitem_free: type mismatch %s != %s",
1173		    TYPENAME(item->wk_type), TYPENAME(type));
1174#endif
1175	if (item->wk_state & IOWAITING)
1176		wakeup(item);
1177	ump = VFSTOUFS(item->wk_mp);
1178	LOCK_OWNED(ump);
1179	KASSERT(ump->softdep_deps > 0,
1180	    ("workitem_free: %s: softdep_deps going negative",
1181	    ump->um_fs->fs_fsmnt));
1182	if (--ump->softdep_deps == 0 && ump->softdep_req)
1183		wakeup(&ump->softdep_deps);
1184	KASSERT(dep_current[item->wk_type] > 0,
1185	    ("workitem_free: %s: dep_current[%s] going negative",
1186	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1187	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1188	    ("workitem_free: %s: softdep_curdeps[%s] going negative",
1189	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1190	atomic_subtract_long(&dep_current[item->wk_type], 1);
1191	ump->softdep_curdeps[item->wk_type] -= 1;
1192	free(item, DtoM(type));
1193}
1194
1195static void
1196workitem_alloc(item, type, mp)
1197	struct worklist *item;
1198	int type;
1199	struct mount *mp;
1200{
1201	struct ufsmount *ump;
1202
1203	item->wk_type = type;
1204	item->wk_mp = mp;
1205	item->wk_state = 0;
1206
1207	ump = VFSTOUFS(mp);
1208	ACQUIRE_GBLLOCK(&lk);
1209	dep_current[type]++;
1210	if (dep_current[type] > dep_highuse[type])
1211		dep_highuse[type] = dep_current[type];
1212	dep_total[type]++;
1213	FREE_GBLLOCK(&lk);
1214	ACQUIRE_LOCK(ump);
1215	ump->softdep_curdeps[type] += 1;
1216	ump->softdep_deps++;
1217	ump->softdep_accdeps++;
1218	FREE_LOCK(ump);
1219}
1220
1221static void
1222workitem_reassign(item, newtype)
1223	struct worklist *item;
1224	int newtype;
1225{
1226	struct ufsmount *ump;
1227
1228	ump = VFSTOUFS(item->wk_mp);
1229	LOCK_OWNED(ump);
1230	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1231	    ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1232	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1233	ump->softdep_curdeps[item->wk_type] -= 1;
1234	ump->softdep_curdeps[newtype] += 1;
1235	KASSERT(dep_current[item->wk_type] > 0,
1236	    ("workitem_reassign: %s: dep_current[%s] going negative",
1237	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1238	ACQUIRE_GBLLOCK(&lk);
1239	dep_current[newtype]++;
1240	dep_current[item->wk_type]--;
1241	if (dep_current[newtype] > dep_highuse[newtype])
1242		dep_highuse[newtype] = dep_current[newtype];
1243	dep_total[newtype]++;
1244	FREE_GBLLOCK(&lk);
1245	item->wk_type = newtype;
1246}
1247
1248/*
1249 * Workitem queue management
1250 */
1251static int max_softdeps;	/* maximum number of structs before slowdown */
1252static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1253static int proc_waiting;	/* tracks whether we have a timeout posted */
1254static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1255static struct callout softdep_callout;
1256static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1257static int req_clear_remove;	/* syncer process flush some freeblks */
1258static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1259
1260/*
1261 * runtime statistics
1262 */
1263static int stat_flush_threads;	/* number of softdep flushing threads */
1264static int stat_worklist_push;	/* number of worklist cleanups */
1265static int stat_blk_limit_push;	/* number of times block limit neared */
1266static int stat_ino_limit_push;	/* number of times inode limit neared */
1267static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1268static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1269static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1270static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1271static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1272static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1273static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1274static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1275static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1276static int stat_journal_min;	/* Times hit journal min threshold */
1277static int stat_journal_low;	/* Times hit journal low threshold */
1278static int stat_journal_wait;	/* Times blocked in jwait(). */
1279static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1280static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1281static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1282static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1283static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1284static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1285static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1286static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1287static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1288static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
1289
1290SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1291    &max_softdeps, 0, "");
1292SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1293    &tickdelay, 0, "");
1294SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
1295    &stat_flush_threads, 0, "");
1296SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1297    &stat_worklist_push, 0,"");
1298SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1299    &stat_blk_limit_push, 0,"");
1300SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1301    &stat_ino_limit_push, 0,"");
1302SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1303    &stat_blk_limit_hit, 0, "");
1304SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1305    &stat_ino_limit_hit, 0, "");
1306SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1307    &stat_sync_limit_hit, 0, "");
1308SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1309    &stat_indir_blk_ptrs, 0, "");
1310SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1311    &stat_inode_bitmap, 0, "");
1312SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1313    &stat_direct_blk_ptrs, 0, "");
1314SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1315    &stat_dir_entry, 0, "");
1316SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1317    &stat_jaddref, 0, "");
1318SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1319    &stat_jnewblk, 0, "");
1320SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1321    &stat_journal_low, 0, "");
1322SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1323    &stat_journal_min, 0, "");
1324SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1325    &stat_journal_wait, 0, "");
1326SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1327    &stat_jwait_filepage, 0, "");
1328SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1329    &stat_jwait_freeblks, 0, "");
1330SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1331    &stat_jwait_inode, 0, "");
1332SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1333    &stat_jwait_newblk, 0, "");
1334SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1335    &stat_cleanup_blkrequests, 0, "");
1336SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1337    &stat_cleanup_inorequests, 0, "");
1338SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1339    &stat_cleanup_high_delay, 0, "");
1340SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1341    &stat_cleanup_retries, 0, "");
1342SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1343    &stat_cleanup_failures, 0, "");
1344SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1345    &softdep_flushcache, 0, "");
1346SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
1347    &stat_emptyjblocks, 0, "");
1348
1349SYSCTL_DECL(_vfs_ffs);
1350
1351/* Whether to recompute the summary at mount time */
1352static int compute_summary_at_mount = 0;
1353SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1354	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1355static int print_threads = 0;
1356SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
1357    &print_threads, 0, "Notify flusher thread start/stop");
1358
1359/* List of all filesystems mounted with soft updates */
1360static TAILQ_HEAD(, mount_softdeps) softdepmounts;
1361
1362/*
1363 * This function cleans the worklist for a filesystem.
1364 * Each filesystem running with soft dependencies gets its own
1365 * thread to run in this function. The thread is started up in
1366 * softdep_mount and shutdown in softdep_unmount. They show up
1367 * as part of the kernel "bufdaemon" process whose process
1368 * entry is available in bufdaemonproc.
1369 */
1370static int searchfailed;
1371extern struct proc *bufdaemonproc;
1372static void
1373softdep_flush(addr)
1374	void *addr;
1375{
1376	struct mount *mp;
1377	struct thread *td;
1378	struct ufsmount *ump;
1379
1380	td = curthread;
1381	td->td_pflags |= TDP_NORUNNINGBUF;
1382	mp = (struct mount *)addr;
1383	ump = VFSTOUFS(mp);
1384	atomic_add_int(&stat_flush_threads, 1);
1385	ACQUIRE_LOCK(ump);
1386	ump->softdep_flags &= ~FLUSH_STARTING;
1387	wakeup(&ump->softdep_flushtd);
1388	FREE_LOCK(ump);
1389	if (print_threads) {
1390		if (stat_flush_threads == 1)
1391			printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
1392			    bufdaemonproc->p_pid);
1393		printf("Start thread %s\n", td->td_name);
1394	}
1395	for (;;) {
1396		while (softdep_process_worklist(mp, 0) > 0 ||
1397		    (MOUNTEDSUJ(mp) &&
1398		    VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
1399			kthread_suspend_check();
1400		ACQUIRE_LOCK(ump);
1401		if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1402			msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
1403			    "sdflush", hz / 2);
1404		ump->softdep_flags &= ~FLUSH_CLEANUP;
1405		/*
1406		 * Check to see if we are done and need to exit.
1407		 */
1408		if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
1409			FREE_LOCK(ump);
1410			continue;
1411		}
1412		ump->softdep_flags &= ~FLUSH_EXIT;
1413		FREE_LOCK(ump);
1414		wakeup(&ump->softdep_flags);
1415		if (print_threads)
1416			printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
1417		atomic_subtract_int(&stat_flush_threads, 1);
1418		kthread_exit();
1419		panic("kthread_exit failed\n");
1420	}
1421}
1422
1423static void
1424worklist_speedup(mp)
1425	struct mount *mp;
1426{
1427	struct ufsmount *ump;
1428
1429	ump = VFSTOUFS(mp);
1430	LOCK_OWNED(ump);
1431	if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1432		ump->softdep_flags |= FLUSH_CLEANUP;
1433	wakeup(&ump->softdep_flushtd);
1434}
1435
1436static int
1437softdep_speedup(ump)
1438	struct ufsmount *ump;
1439{
1440	struct ufsmount *altump;
1441	struct mount_softdeps *sdp;
1442
1443	LOCK_OWNED(ump);
1444	worklist_speedup(ump->um_mountp);
1445	bd_speedup();
1446	/*
1447	 * If we have global shortages, then we need other
1448	 * filesystems to help with the cleanup. Here we wakeup a
1449	 * flusher thread for a filesystem that is over its fair
1450	 * share of resources.
1451	 */
1452	if (req_clear_inodedeps || req_clear_remove) {
1453		ACQUIRE_GBLLOCK(&lk);
1454		TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
1455			if ((altump = sdp->sd_ump) == ump)
1456				continue;
1457			if (((req_clear_inodedeps &&
1458			    altump->softdep_curdeps[D_INODEDEP] >
1459			    max_softdeps / stat_flush_threads) ||
1460			    (req_clear_remove &&
1461			    altump->softdep_curdeps[D_DIRREM] >
1462			    (max_softdeps / 2) / stat_flush_threads)) &&
1463			    TRY_ACQUIRE_LOCK(altump))
1464				break;
1465		}
1466		if (sdp == NULL) {
1467			searchfailed++;
1468			FREE_GBLLOCK(&lk);
1469		} else {
1470			/*
1471			 * Move to the end of the list so we pick a
1472			 * different one on out next try.
1473			 */
1474			TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
1475			TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
1476			FREE_GBLLOCK(&lk);
1477			if ((altump->softdep_flags &
1478			    (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1479				altump->softdep_flags |= FLUSH_CLEANUP;
1480			altump->um_softdep->sd_cleanups++;
1481			wakeup(&altump->softdep_flushtd);
1482			FREE_LOCK(altump);
1483		}
1484	}
1485	return (speedup_syncer());
1486}
1487
1488/*
1489 * Add an item to the end of the work queue.
1490 * This routine requires that the lock be held.
1491 * This is the only routine that adds items to the list.
1492 * The following routine is the only one that removes items
1493 * and does so in order from first to last.
1494 */
1495
1496#define	WK_HEAD		0x0001	/* Add to HEAD. */
1497#define	WK_NODELAY	0x0002	/* Process immediately. */
1498
1499static void
1500add_to_worklist(wk, flags)
1501	struct worklist *wk;
1502	int flags;
1503{
1504	struct ufsmount *ump;
1505
1506	ump = VFSTOUFS(wk->wk_mp);
1507	LOCK_OWNED(ump);
1508	if (wk->wk_state & ONWORKLIST)
1509		panic("add_to_worklist: %s(0x%X) already on list",
1510		    TYPENAME(wk->wk_type), wk->wk_state);
1511	wk->wk_state |= ONWORKLIST;
1512	if (ump->softdep_on_worklist == 0) {
1513		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1514		ump->softdep_worklist_tail = wk;
1515	} else if (flags & WK_HEAD) {
1516		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1517	} else {
1518		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1519		ump->softdep_worklist_tail = wk;
1520	}
1521	ump->softdep_on_worklist += 1;
1522	if (flags & WK_NODELAY)
1523		worklist_speedup(wk->wk_mp);
1524}
1525
1526/*
1527 * Remove the item to be processed. If we are removing the last
1528 * item on the list, we need to recalculate the tail pointer.
1529 */
1530static void
1531remove_from_worklist(wk)
1532	struct worklist *wk;
1533{
1534	struct ufsmount *ump;
1535
1536	ump = VFSTOUFS(wk->wk_mp);
1537	WORKLIST_REMOVE(wk);
1538	if (ump->softdep_worklist_tail == wk)
1539		ump->softdep_worklist_tail =
1540		    (struct worklist *)wk->wk_list.le_prev;
1541	ump->softdep_on_worklist -= 1;
1542}
1543
1544static void
1545wake_worklist(wk)
1546	struct worklist *wk;
1547{
1548	if (wk->wk_state & IOWAITING) {
1549		wk->wk_state &= ~IOWAITING;
1550		wakeup(wk);
1551	}
1552}
1553
1554static void
1555wait_worklist(wk, wmesg)
1556	struct worklist *wk;
1557	char *wmesg;
1558{
1559	struct ufsmount *ump;
1560
1561	ump = VFSTOUFS(wk->wk_mp);
1562	wk->wk_state |= IOWAITING;
1563	msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1564}
1565
1566/*
1567 * Process that runs once per second to handle items in the background queue.
1568 *
1569 * Note that we ensure that everything is done in the order in which they
1570 * appear in the queue. The code below depends on this property to ensure
1571 * that blocks of a file are freed before the inode itself is freed. This
1572 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1573 * until all the old ones have been purged from the dependency lists.
1574 */
1575static int
1576softdep_process_worklist(mp, full)
1577	struct mount *mp;
1578	int full;
1579{
1580	int cnt, matchcnt;
1581	struct ufsmount *ump;
1582	long starttime;
1583
1584	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1585	if (MOUNTEDSOFTDEP(mp) == 0)
1586		return (0);
1587	matchcnt = 0;
1588	ump = VFSTOUFS(mp);
1589	ACQUIRE_LOCK(ump);
1590	starttime = time_second;
1591	softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1592	check_clear_deps(mp);
1593	while (ump->softdep_on_worklist > 0) {
1594		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1595			break;
1596		else
1597			matchcnt += cnt;
1598		check_clear_deps(mp);
1599		/*
1600		 * We do not generally want to stop for buffer space, but if
1601		 * we are really being a buffer hog, we will stop and wait.
1602		 */
1603		if (should_yield()) {
1604			FREE_LOCK(ump);
1605			kern_yield(PRI_USER);
1606			bwillwrite();
1607			ACQUIRE_LOCK(ump);
1608		}
1609		/*
1610		 * Never allow processing to run for more than one
1611		 * second. This gives the syncer thread the opportunity
1612		 * to pause if appropriate.
1613		 */
1614		if (!full && starttime != time_second)
1615			break;
1616	}
1617	if (full == 0)
1618		journal_unsuspend(ump);
1619	FREE_LOCK(ump);
1620	return (matchcnt);
1621}
1622
1623/*
1624 * Process all removes associated with a vnode if we are running out of
1625 * journal space.  Any other process which attempts to flush these will
1626 * be unable as we have the vnodes locked.
1627 */
1628static void
1629process_removes(vp)
1630	struct vnode *vp;
1631{
1632	struct inodedep *inodedep;
1633	struct dirrem *dirrem;
1634	struct ufsmount *ump;
1635	struct mount *mp;
1636	ino_t inum;
1637
1638	mp = vp->v_mount;
1639	ump = VFSTOUFS(mp);
1640	LOCK_OWNED(ump);
1641	inum = VTOI(vp)->i_number;
1642	for (;;) {
1643top:
1644		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1645			return;
1646		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1647			/*
1648			 * If another thread is trying to lock this vnode
1649			 * it will fail but we must wait for it to do so
1650			 * before we can proceed.
1651			 */
1652			if (dirrem->dm_state & INPROGRESS) {
1653				wait_worklist(&dirrem->dm_list, "pwrwait");
1654				goto top;
1655			}
1656			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1657			    (COMPLETE | ONWORKLIST))
1658				break;
1659		}
1660		if (dirrem == NULL)
1661			return;
1662		remove_from_worklist(&dirrem->dm_list);
1663		FREE_LOCK(ump);
1664		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1665			panic("process_removes: suspended filesystem");
1666		handle_workitem_remove(dirrem, 0);
1667		vn_finished_secondary_write(mp);
1668		ACQUIRE_LOCK(ump);
1669	}
1670}
1671
1672/*
1673 * Process all truncations associated with a vnode if we are running out
1674 * of journal space.  This is called when the vnode lock is already held
1675 * and no other process can clear the truncation.  This function returns
1676 * a value greater than zero if it did any work.
1677 */
1678static void
1679process_truncates(vp)
1680	struct vnode *vp;
1681{
1682	struct inodedep *inodedep;
1683	struct freeblks *freeblks;
1684	struct ufsmount *ump;
1685	struct mount *mp;
1686	ino_t inum;
1687	int cgwait;
1688
1689	mp = vp->v_mount;
1690	ump = VFSTOUFS(mp);
1691	LOCK_OWNED(ump);
1692	inum = VTOI(vp)->i_number;
1693	for (;;) {
1694		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1695			return;
1696		cgwait = 0;
1697		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1698			/* Journal entries not yet written.  */
1699			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1700				jwait(&LIST_FIRST(
1701				    &freeblks->fb_jblkdephd)->jb_list,
1702				    MNT_WAIT);
1703				break;
1704			}
1705			/* Another thread is executing this item. */
1706			if (freeblks->fb_state & INPROGRESS) {
1707				wait_worklist(&freeblks->fb_list, "ptrwait");
1708				break;
1709			}
1710			/* Freeblks is waiting on a inode write. */
1711			if ((freeblks->fb_state & COMPLETE) == 0) {
1712				FREE_LOCK(ump);
1713				ffs_update(vp, 1);
1714				ACQUIRE_LOCK(ump);
1715				break;
1716			}
1717			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1718			    (ALLCOMPLETE | ONWORKLIST)) {
1719				remove_from_worklist(&freeblks->fb_list);
1720				freeblks->fb_state |= INPROGRESS;
1721				FREE_LOCK(ump);
1722				if (vn_start_secondary_write(NULL, &mp,
1723				    V_NOWAIT))
1724					panic("process_truncates: "
1725					    "suspended filesystem");
1726				handle_workitem_freeblocks(freeblks, 0);
1727				vn_finished_secondary_write(mp);
1728				ACQUIRE_LOCK(ump);
1729				break;
1730			}
1731			if (freeblks->fb_cgwait)
1732				cgwait++;
1733		}
1734		if (cgwait) {
1735			FREE_LOCK(ump);
1736			sync_cgs(mp, MNT_WAIT);
1737			ffs_sync_snap(mp, MNT_WAIT);
1738			ACQUIRE_LOCK(ump);
1739			continue;
1740		}
1741		if (freeblks == NULL)
1742			break;
1743	}
1744	return;
1745}
1746
1747/*
1748 * Process one item on the worklist.
1749 */
1750static int
1751process_worklist_item(mp, target, flags)
1752	struct mount *mp;
1753	int target;
1754	int flags;
1755{
1756	struct worklist sentinel;
1757	struct worklist *wk;
1758	struct ufsmount *ump;
1759	int matchcnt;
1760	int error;
1761
1762	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1763	/*
1764	 * If we are being called because of a process doing a
1765	 * copy-on-write, then it is not safe to write as we may
1766	 * recurse into the copy-on-write routine.
1767	 */
1768	if (curthread->td_pflags & TDP_COWINPROGRESS)
1769		return (-1);
1770	PHOLD(curproc);	/* Don't let the stack go away. */
1771	ump = VFSTOUFS(mp);
1772	LOCK_OWNED(ump);
1773	matchcnt = 0;
1774	sentinel.wk_mp = NULL;
1775	sentinel.wk_type = D_SENTINEL;
1776	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1777	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1778	    wk = LIST_NEXT(&sentinel, wk_list)) {
1779		if (wk->wk_type == D_SENTINEL) {
1780			LIST_REMOVE(&sentinel, wk_list);
1781			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1782			continue;
1783		}
1784		if (wk->wk_state & INPROGRESS)
1785			panic("process_worklist_item: %p already in progress.",
1786			    wk);
1787		wk->wk_state |= INPROGRESS;
1788		remove_from_worklist(wk);
1789		FREE_LOCK(ump);
1790		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1791			panic("process_worklist_item: suspended filesystem");
1792		switch (wk->wk_type) {
1793		case D_DIRREM:
1794			/* removal of a directory entry */
1795			error = handle_workitem_remove(WK_DIRREM(wk), flags);
1796			break;
1797
1798		case D_FREEBLKS:
1799			/* releasing blocks and/or fragments from a file */
1800			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1801			    flags);
1802			break;
1803
1804		case D_FREEFRAG:
1805			/* releasing a fragment when replaced as a file grows */
1806			handle_workitem_freefrag(WK_FREEFRAG(wk));
1807			error = 0;
1808			break;
1809
1810		case D_FREEFILE:
1811			/* releasing an inode when its link count drops to 0 */
1812			handle_workitem_freefile(WK_FREEFILE(wk));
1813			error = 0;
1814			break;
1815
1816		default:
1817			panic("%s_process_worklist: Unknown type %s",
1818			    "softdep", TYPENAME(wk->wk_type));
1819			/* NOTREACHED */
1820		}
1821		vn_finished_secondary_write(mp);
1822		ACQUIRE_LOCK(ump);
1823		if (error == 0) {
1824			if (++matchcnt == target)
1825				break;
1826			continue;
1827		}
1828		/*
1829		 * We have to retry the worklist item later.  Wake up any
1830		 * waiters who may be able to complete it immediately and
1831		 * add the item back to the head so we don't try to execute
1832		 * it again.
1833		 */
1834		wk->wk_state &= ~INPROGRESS;
1835		wake_worklist(wk);
1836		add_to_worklist(wk, WK_HEAD);
1837	}
1838	LIST_REMOVE(&sentinel, wk_list);
1839	/* Sentinel could've become the tail from remove_from_worklist. */
1840	if (ump->softdep_worklist_tail == &sentinel)
1841		ump->softdep_worklist_tail =
1842		    (struct worklist *)sentinel.wk_list.le_prev;
1843	PRELE(curproc);
1844	return (matchcnt);
1845}
1846
1847/*
1848 * Move dependencies from one buffer to another.
1849 */
1850int
1851softdep_move_dependencies(oldbp, newbp)
1852	struct buf *oldbp;
1853	struct buf *newbp;
1854{
1855	struct worklist *wk, *wktail;
1856	struct ufsmount *ump;
1857	int dirty;
1858
1859	if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
1860		return (0);
1861	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
1862	    ("softdep_move_dependencies called on non-softdep filesystem"));
1863	dirty = 0;
1864	wktail = NULL;
1865	ump = VFSTOUFS(wk->wk_mp);
1866	ACQUIRE_LOCK(ump);
1867	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1868		LIST_REMOVE(wk, wk_list);
1869		if (wk->wk_type == D_BMSAFEMAP &&
1870		    bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1871			dirty = 1;
1872		if (wktail == NULL)
1873			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1874		else
1875			LIST_INSERT_AFTER(wktail, wk, wk_list);
1876		wktail = wk;
1877	}
1878	FREE_LOCK(ump);
1879
1880	return (dirty);
1881}
1882
1883/*
1884 * Purge the work list of all items associated with a particular mount point.
1885 */
1886int
1887softdep_flushworklist(oldmnt, countp, td)
1888	struct mount *oldmnt;
1889	int *countp;
1890	struct thread *td;
1891{
1892	struct vnode *devvp;
1893	struct ufsmount *ump;
1894	int count, error;
1895
1896	/*
1897	 * Alternately flush the block device associated with the mount
1898	 * point and process any dependencies that the flushing
1899	 * creates. We continue until no more worklist dependencies
1900	 * are found.
1901	 */
1902	*countp = 0;
1903	error = 0;
1904	ump = VFSTOUFS(oldmnt);
1905	devvp = ump->um_devvp;
1906	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1907		*countp += count;
1908		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1909		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1910		VOP_UNLOCK(devvp, 0);
1911		if (error != 0)
1912			break;
1913	}
1914	return (error);
1915}
1916
1917#define	SU_WAITIDLE_RETRIES	20
1918static int
1919softdep_waitidle(struct mount *mp, int flags __unused)
1920{
1921	struct ufsmount *ump;
1922	struct vnode *devvp;
1923	struct thread *td;
1924	int error, i;
1925
1926	ump = VFSTOUFS(mp);
1927	devvp = ump->um_devvp;
1928	td = curthread;
1929	error = 0;
1930	ACQUIRE_LOCK(ump);
1931	for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
1932		ump->softdep_req = 1;
1933		KASSERT((flags & FORCECLOSE) == 0 ||
1934		    ump->softdep_on_worklist == 0,
1935		    ("softdep_waitidle: work added after flush"));
1936		msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
1937		    "softdeps", 10 * hz);
1938		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1939		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1940		VOP_UNLOCK(devvp, 0);
1941		if (error != 0)
1942			break;
1943		ACQUIRE_LOCK(ump);
1944	}
1945	ump->softdep_req = 0;
1946	if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
1947		error = EBUSY;
1948		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1949		    mp);
1950	}
1951	FREE_LOCK(ump);
1952	return (error);
1953}
1954
1955/*
1956 * Flush all vnodes and worklist items associated with a specified mount point.
1957 */
1958int
1959softdep_flushfiles(oldmnt, flags, td)
1960	struct mount *oldmnt;
1961	int flags;
1962	struct thread *td;
1963{
1964#ifdef QUOTA
1965	struct ufsmount *ump;
1966	int i;
1967#endif
1968	int error, early, depcount, loopcnt, retry_flush_count, retry;
1969	int morework;
1970
1971	KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
1972	    ("softdep_flushfiles called on non-softdep filesystem"));
1973	loopcnt = 10;
1974	retry_flush_count = 3;
1975retry_flush:
1976	error = 0;
1977
1978	/*
1979	 * Alternately flush the vnodes associated with the mount
1980	 * point and process any dependencies that the flushing
1981	 * creates. In theory, this loop can happen at most twice,
1982	 * but we give it a few extra just to be sure.
1983	 */
1984	for (; loopcnt > 0; loopcnt--) {
1985		/*
1986		 * Do another flush in case any vnodes were brought in
1987		 * as part of the cleanup operations.
1988		 */
1989		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
1990		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
1991		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
1992			break;
1993		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1994		    depcount == 0)
1995			break;
1996	}
1997	/*
1998	 * If we are unmounting then it is an error to fail. If we
1999	 * are simply trying to downgrade to read-only, then filesystem
2000	 * activity can keep us busy forever, so we just fail with EBUSY.
2001	 */
2002	if (loopcnt == 0) {
2003		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
2004			panic("softdep_flushfiles: looping");
2005		error = EBUSY;
2006	}
2007	if (!error)
2008		error = softdep_waitidle(oldmnt, flags);
2009	if (!error) {
2010		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
2011			retry = 0;
2012			MNT_ILOCK(oldmnt);
2013			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
2014			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
2015			morework = oldmnt->mnt_nvnodelistsize > 0;
2016#ifdef QUOTA
2017			ump = VFSTOUFS(oldmnt);
2018			UFS_LOCK(ump);
2019			for (i = 0; i < MAXQUOTAS; i++) {
2020				if (ump->um_quotas[i] != NULLVP)
2021					morework = 1;
2022			}
2023			UFS_UNLOCK(ump);
2024#endif
2025			if (morework) {
2026				if (--retry_flush_count > 0) {
2027					retry = 1;
2028					loopcnt = 3;
2029				} else
2030					error = EBUSY;
2031			}
2032			MNT_IUNLOCK(oldmnt);
2033			if (retry)
2034				goto retry_flush;
2035		}
2036	}
2037	return (error);
2038}
2039
2040/*
2041 * Structure hashing.
2042 *
2043 * There are four types of structures that can be looked up:
2044 *	1) pagedep structures identified by mount point, inode number,
2045 *	   and logical block.
2046 *	2) inodedep structures identified by mount point and inode number.
2047 *	3) newblk structures identified by mount point and
2048 *	   physical block number.
2049 *	4) bmsafemap structures identified by mount point and
2050 *	   cylinder group number.
2051 *
2052 * The "pagedep" and "inodedep" dependency structures are hashed
2053 * separately from the file blocks and inodes to which they correspond.
2054 * This separation helps when the in-memory copy of an inode or
2055 * file block must be replaced. It also obviates the need to access
2056 * an inode or file page when simply updating (or de-allocating)
2057 * dependency structures. Lookup of newblk structures is needed to
2058 * find newly allocated blocks when trying to associate them with
2059 * their allocdirect or allocindir structure.
2060 *
2061 * The lookup routines optionally create and hash a new instance when
2062 * an existing entry is not found. The bmsafemap lookup routine always
2063 * allocates a new structure if an existing one is not found.
2064 */
2065#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
2066#define NODELAY		0x0002	/* cannot do background work */
2067
2068/*
2069 * Structures and routines associated with pagedep caching.
2070 */
2071#define	PAGEDEP_HASH(ump, inum, lbn) \
2072	(&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
2073
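/*
 * Search a pagedep hash chain for an entry matching the given inode
 * number and logical block number.
 */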
2074static int
2075pagedep_find(pagedephd, ino, lbn, pagedeppp)
2076	struct pagedep_hashhead *pagedephd;
2077	ino_t ino;
2078	ufs_lbn_t lbn;
2079	struct pagedep **pagedeppp;
2080{
2081	struct pagedep *pagedep;
2082
2083	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2084		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
2085			*pagedeppp = pagedep;
2086			return (1);
2087		}
2088	}
2089	*pagedeppp = NULL;
2090	return (0);
2091}
2092/*
2093 * Look up a pagedep. Return 1 if found, 0 otherwise.
2094 * If not found, allocate if DEPALLOC flag is passed.
2095 * Found or allocated entry is returned in pagedeppp.
2096 * This routine must be called with the per-filesystem softdep lock held.
2097 */
2098static int
2099pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2100	struct mount *mp;
2101	struct buf *bp;
2102	ino_t ino;
2103	ufs_lbn_t lbn;
2104	int flags;
2105	struct pagedep **pagedeppp;
2106{
2107	struct pagedep *pagedep;
2108	struct pagedep_hashhead *pagedephd;
2109	struct worklist *wk;
2110	struct ufsmount *ump;
2111	int ret;
2112	int i;
2113
2114	ump = VFSTOUFS(mp);
2115	LOCK_OWNED(ump);
2116	if (bp) {
2117		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2118			if (wk->wk_type == D_PAGEDEP) {
2119				*pagedeppp = WK_PAGEDEP(wk);
2120				return (1);
2121			}
2122		}
2123	}
2124	pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2125	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2126	if (ret) {
2127		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2128			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2129		return (1);
2130	}
2131	if ((flags & DEPALLOC) == 0)
2132		return (0);
2133	FREE_LOCK(ump);
2134	pagedep = malloc(sizeof(struct pagedep),
2135	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2136	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2137	ACQUIRE_LOCK(ump);
2138	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2139	if (*pagedeppp) {
2140		/*
2141		 * This should never happen since we only create pagedeps
2142		 * with the vnode lock held.  Could be an assert.
2143		 */
2144		WORKITEM_FREE(pagedep, D_PAGEDEP);
2145		return (ret);
2146	}
2147	pagedep->pd_ino = ino;
2148	pagedep->pd_lbn = lbn;
2149	LIST_INIT(&pagedep->pd_dirremhd);
2150	LIST_INIT(&pagedep->pd_pendinghd);
2151	for (i = 0; i < DAHASHSZ; i++)
2152		LIST_INIT(&pagedep->pd_diraddhd[i]);
2153	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2154	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2155	*pagedeppp = pagedep;
2156	return (0);
2157}
2158
2159/*
2160 * Structures and routines associated with inodedep caching.
2161 */
2162#define	INODEDEP_HASH(ump, inum) \
2163      (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
2164
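/*
 * Search an inodedep hash chain for an entry matching the given inode
 * number.
 */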
2165static int
2166inodedep_find(inodedephd, inum, inodedeppp)
2167	struct inodedep_hashhead *inodedephd;
2168	ino_t inum;
2169	struct inodedep **inodedeppp;
2170{
2171	struct inodedep *inodedep;
2172
2173	LIST_FOREACH(inodedep, inodedephd, id_hash)
2174		if (inum == inodedep->id_ino)
2175			break;
2176	if (inodedep) {
2177		*inodedeppp = inodedep;
2178		return (1);
2179	}
2180	*inodedeppp = NULL;
2181
2182	return (0);
2183}
2184/*
2185 * Look up an inodedep. Return 1 if found, 0 if not found.
2186 * If not found, allocate if DEPALLOC flag is passed.
2187 * Found or allocated entry is returned in inodedeppp.
2188 * This routine must be called with the per-filesystem softdep lock held.
2189 */
2190static int
2191inodedep_lookup(mp, inum, flags, inodedeppp)
2192	struct mount *mp;
2193	ino_t inum;
2194	int flags;
2195	struct inodedep **inodedeppp;
2196{
2197	struct inodedep *inodedep;
2198	struct inodedep_hashhead *inodedephd;
2199	struct ufsmount *ump;
2200	struct fs *fs;
2201
2202	ump = VFSTOUFS(mp);
2203	LOCK_OWNED(ump);
2204	fs = ump->um_fs;
2205	inodedephd = INODEDEP_HASH(ump, inum);
2206
2207	if (inodedep_find(inodedephd, inum, inodedeppp))
2208		return (1);
2209	if ((flags & DEPALLOC) == 0)
2210		return (0);
2211	/*
2212	 * If the system is over its limit and our filesystem is
2213	 * responsible for more than our share of that usage and
2214	 * we are not in a rush, request some inodedep cleanup.
2215	 */
2216	if (softdep_excess_inodes(ump))
2217		schedule_cleanup(mp);
2218	else
2219		FREE_LOCK(ump);
2220	inodedep = malloc(sizeof(struct inodedep),
2221		M_INODEDEP, M_SOFTDEP_FLAGS);
2222	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2223	ACQUIRE_LOCK(ump);
2224	if (inodedep_find(inodedephd, inum, inodedeppp)) {
2225		WORKITEM_FREE(inodedep, D_INODEDEP);
2226		return (1);
2227	}
2228	inodedep->id_fs = fs;
2229	inodedep->id_ino = inum;
2230	inodedep->id_state = ALLCOMPLETE;
2231	inodedep->id_nlinkdelta = 0;
2232	inodedep->id_savedino1 = NULL;
2233	inodedep->id_savedsize = -1;
2234	inodedep->id_savedextsize = -1;
2235	inodedep->id_savednlink = -1;
2236	inodedep->id_bmsafemap = NULL;
2237	inodedep->id_mkdiradd = NULL;
2238	LIST_INIT(&inodedep->id_dirremhd);
2239	LIST_INIT(&inodedep->id_pendinghd);
2240	LIST_INIT(&inodedep->id_inowait);
2241	LIST_INIT(&inodedep->id_bufwait);
2242	TAILQ_INIT(&inodedep->id_inoreflst);
2243	TAILQ_INIT(&inodedep->id_inoupdt);
2244	TAILQ_INIT(&inodedep->id_newinoupdt);
2245	TAILQ_INIT(&inodedep->id_extupdt);
2246	TAILQ_INIT(&inodedep->id_newextupdt);
2247	TAILQ_INIT(&inodedep->id_freeblklst);
2248	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2249	*inodedeppp = inodedep;
2250	return (0);
2251}
2252
2253/*
2254 * Structures and routines associated with newblk caching.
2255 */
2256#define	NEWBLK_HASH(ump, inum) \
2257	(&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
2258
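/*
 * Search a newblk hash chain for an entry matching the given block
 * number, skipping entries already converted to allocdirects when a
 * new dependency is being created.
 */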
2259static int
2260newblk_find(newblkhd, newblkno, flags, newblkpp)
2261	struct newblk_hashhead *newblkhd;
2262	ufs2_daddr_t newblkno;
2263	int flags;
2264	struct newblk **newblkpp;
2265{
2266	struct newblk *newblk;
2267
2268	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2269		if (newblkno != newblk->nb_newblkno)
2270			continue;
2271		/*
2272		 * If we're creating a new dependency don't match those that
2273		 * have already been converted to allocdirects.  This is for
2274		 * a frag extend.
2275		 */
2276		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2277			continue;
2278		break;
2279	}
2280	if (newblk) {
2281		*newblkpp = newblk;
2282		return (1);
2283	}
2284	*newblkpp = NULL;
2285	return (0);
2286}
2287
2288/*
2289 * Look up a newblk. Return 1 if found, 0 if not found.
2290 * If not found, allocate if DEPALLOC flag is passed.
2291 * Found or allocated entry is returned in newblkpp.
2292 */
2293static int
2294newblk_lookup(mp, newblkno, flags, newblkpp)
2295	struct mount *mp;
2296	ufs2_daddr_t newblkno;
2297	int flags;
2298	struct newblk **newblkpp;
2299{
2300	struct newblk *newblk;
2301	struct newblk_hashhead *newblkhd;
2302	struct ufsmount *ump;
2303
2304	ump = VFSTOUFS(mp);
2305	LOCK_OWNED(ump);
2306	newblkhd = NEWBLK_HASH(ump, newblkno);
2307	if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2308		return (1);
2309	if ((flags & DEPALLOC) == 0)
2310		return (0);
2311	FREE_LOCK(ump);
2312	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2313	    M_SOFTDEP_FLAGS | M_ZERO);
2314	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2315	ACQUIRE_LOCK(ump);
2316	if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2317		WORKITEM_FREE(newblk, D_NEWBLK);
2318		return (1);
2319	}
2320	newblk->nb_freefrag = NULL;
2321	LIST_INIT(&newblk->nb_indirdeps);
2322	LIST_INIT(&newblk->nb_newdirblk);
2323	LIST_INIT(&newblk->nb_jwork);
2324	newblk->nb_state = ATTACHED;
2325	newblk->nb_newblkno = newblkno;
2326	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2327	*newblkpp = newblk;
2328	return (0);
2329}
2330
2331/*
2332 * Structures and routines associated with freed indirect block caching.
2333 */
2334#define	INDIR_HASH(ump, blkno) \
2335	(&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
2336
2337/*
2338 * Lookup an indirect block in the indir hash table.  The freework is
2339 * removed and potentially freed.  The caller must do a blocking journal
2340 * write before writing to the blkno.
2341 */
2342static int
2343indirblk_lookup(mp, blkno)
2344	struct mount *mp;
2345	ufs2_daddr_t blkno;
2346{
2347	struct freework *freework;
2348	struct indir_hashhead *wkhd;
2349	struct ufsmount *ump;
2350
2351	ump = VFSTOUFS(mp);
2352	wkhd = INDIR_HASH(ump, blkno);
2353	TAILQ_FOREACH(freework, wkhd, fw_next) {
2354		if (freework->fw_blkno != blkno)
2355			continue;
2356		indirblk_remove(freework);
2357		return (1);
2358	}
2359	return (0);
2360}
2361
2362/*
2363 * Insert an indirect block represented by freework into the indirblk
2364 * hash table so that it may prevent the block from being re-used prior
2365 * to the journal being written.
2366 */
2367static void
2368indirblk_insert(freework)
2369	struct freework *freework;
2370{
2371	struct jblocks *jblocks;
2372	struct jseg *jseg;
2373	struct ufsmount *ump;
2374
2375	ump = VFSTOUFS(freework->fw_list.wk_mp);
2376	jblocks = ump->softdep_jblocks;
2377	jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2378	if (jseg == NULL)
2379		return;
2380
2381	LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2382	TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2383	    fw_next);
2384	freework->fw_state &= ~DEPCOMPLETE;
2385}
2386
2387static void
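/*
 * Remove an indirect block from the indirblk hash table and free the
 * freework if all of its dependencies have completed.
 */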
2388indirblk_remove(freework)
2389	struct freework *freework;
2390{
2391	struct ufsmount *ump;
2392
2393	ump = VFSTOUFS(freework->fw_list.wk_mp);
2394	LIST_REMOVE(freework, fw_segs);
2395	TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2396	freework->fw_state |= DEPCOMPLETE;
2397	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2398		WORKITEM_FREE(freework, D_FREEWORK);
2399}
2400
2401/*
2402 * Executed during filesystem initialization before
2403 * mounting any filesystems.
2404 */
2405void
2406softdep_initialize()
2407{
2408
2409	TAILQ_INIT(&softdepmounts);
2410	max_softdeps = desiredvnodes * 4;
2411
2412	/* initialise bioops hack */
2413	bioops.io_start = softdep_disk_io_initiation;
2414	bioops.io_complete = softdep_disk_write_complete;
2415	bioops.io_deallocate = softdep_deallocate_dependencies;
2416	bioops.io_countdeps = softdep_count_dependencies;
2417	softdep_ast_cleanup = softdep_ast_cleanup_proc;
2418
2419	/* Initialize the callout with an mtx. */
2420	callout_init_mtx(&softdep_callout, &lk, 0);
2421}
2422
2423/*
2424 * Executed after all filesystems have been unmounted during
2425 * filesystem module unload.
2426 */
2427void
2428softdep_uninitialize()
2429{
2430
2431	/* clear bioops hack */
2432	bioops.io_start = NULL;
2433	bioops.io_complete = NULL;
2434	bioops.io_deallocate = NULL;
2435	bioops.io_countdeps = NULL;
2436	softdep_ast_cleanup = NULL;
2437
2438	callout_drain(&softdep_callout);
2439}
2440
2441/*
2442 * Called at mount time to notify the dependency code that a
2443 * filesystem wishes to use it.
2444 */
2445int
2446softdep_mount(devvp, mp, fs, cred)
2447	struct vnode *devvp;
2448	struct mount *mp;
2449	struct fs *fs;
2450	struct ucred *cred;
2451{
2452	struct csum_total cstotal;
2453	struct mount_softdeps *sdp;
2454	struct ufsmount *ump;
2455	struct cg *cgp;
2456	struct buf *bp;
2457	int i, error, cyl;
2458
2459	sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
2460	    M_WAITOK | M_ZERO);
2461	MNT_ILOCK(mp);
2462	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2463	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2464		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2465			MNTK_SOFTDEP | MNTK_NOASYNC;
2466	}
2467	ump = VFSTOUFS(mp);
2468	ump->um_softdep = sdp;
2469	MNT_IUNLOCK(mp);
2470	rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock");
2471	sdp->sd_ump = ump;
2472	LIST_INIT(&ump->softdep_workitem_pending);
2473	LIST_INIT(&ump->softdep_journal_pending);
2474	TAILQ_INIT(&ump->softdep_unlinked);
2475	LIST_INIT(&ump->softdep_dirtycg);
2476	ump->softdep_worklist_tail = NULL;
2477	ump->softdep_on_worklist = 0;
2478	ump->softdep_deps = 0;
2479	LIST_INIT(&ump->softdep_mkdirlisthd);
2480	ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
2481	    &ump->pagedep_hash_size);
2482	ump->pagedep_nextclean = 0;
2483	ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
2484	    &ump->inodedep_hash_size);
2485	ump->inodedep_nextclean = 0;
2486	ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
2487	    &ump->newblk_hash_size);
2488	ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
2489	    &ump->bmsafemap_hash_size);
2490	i = 1 << (ffs(desiredvnodes / 10) - 1);
2491	ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
2492	    M_FREEWORK, M_WAITOK);
2493	ump->indir_hash_size = i - 1;
2494	for (i = 0; i <= ump->indir_hash_size; i++)
2495		TAILQ_INIT(&ump->indir_hashtbl[i]);
2496	ACQUIRE_GBLLOCK(&lk);
2497	TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
2498	FREE_GBLLOCK(&lk);
2499	if ((fs->fs_flags & FS_SUJ) &&
2500	    (error = journal_mount(mp, fs, cred)) != 0) {
2501		printf("Failed to start journal: %d\n", error);
2502		softdep_unmount(mp);
2503		return (error);
2504	}
2505	/*
2506	 * Start our flushing thread in the bufdaemon process.
2507	 */
2508	ACQUIRE_LOCK(ump);
2509	ump->softdep_flags |= FLUSH_STARTING;
2510	FREE_LOCK(ump);
2511	kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
2512	    &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
2513	    mp->mnt_stat.f_mntonname);
2514	ACQUIRE_LOCK(ump);
2515	while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
2516		msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
2517		    hz / 2);
2518	}
2519	FREE_LOCK(ump);
2520	/*
2521	 * When doing soft updates, the counters in the
2522	 * superblock may have gotten out of sync. Recomputation
2523	 * can take a long time and can be deferred for background
2524	 * fsck.  However, the old behavior of scanning the cylinder
2525	 * groups and recalculating them at mount time is available
2526	 * by setting vfs.ffs.compute_summary_at_mount to one.
2527	 */
2528	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2529		return (0);
2530	bzero(&cstotal, sizeof cstotal);
2531	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2532		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2533		    fs->fs_cgsize, cred, &bp)) != 0) {
2534			brelse(bp);
2535			softdep_unmount(mp);
2536			return (error);
2537		}
2538		cgp = (struct cg *)bp->b_data;
2539		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2540		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2541		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2542		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2543		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2544		brelse(bp);
2545	}
2546#ifdef DEBUG
2547	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2548		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2549#endif
2550	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2551	return (0);
2552}
2553
2554void
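/*
 * Called at unmount time to release the soft dependency state and
 * resources associated with a filesystem.
 */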
2555softdep_unmount(mp)
2556	struct mount *mp;
2557{
2558	struct ufsmount *ump;
2559#ifdef INVARIANTS
2560	int i;
2561#endif
2562
2563	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
2564	    ("softdep_unmount called on non-softdep filesystem"));
2565	ump = VFSTOUFS(mp);
2566	MNT_ILOCK(mp);
2567	mp->mnt_flag &= ~MNT_SOFTDEP;
2568	if (MOUNTEDSUJ(mp) == 0) {
2569		MNT_IUNLOCK(mp);
2570	} else {
2571		mp->mnt_flag &= ~MNT_SUJ;
2572		MNT_IUNLOCK(mp);
2573		journal_unmount(ump);
2574	}
2575	/*
2576	 * Shut down our flushing thread.  The check for NULL covers the case
2577	 * where softdep_mount errored out before the thread was created.
2578	 */
2579	if (ump->softdep_flushtd != NULL) {
2580		ACQUIRE_LOCK(ump);
2581		ump->softdep_flags |= FLUSH_EXIT;
2582		wakeup(&ump->softdep_flushtd);
2583		msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
2584		    "sdwait", 0);
2585		KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
2586		    ("Thread shutdown failed"));
2587	}
2588	/*
2589	 * Free up our resources.
2590	 */
2591	ACQUIRE_GBLLOCK(&lk);
2592	TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
2593	FREE_GBLLOCK(&lk);
2594	rw_destroy(LOCK_PTR(ump));
2595	hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
2596	hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
2597	hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
2598	hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
2599	    ump->bmsafemap_hash_size);
2600	free(ump->indir_hashtbl, M_FREEWORK);
2601#ifdef INVARIANTS
2602	for (i = 0; i <= D_LAST; i++)
2603		KASSERT(ump->softdep_curdeps[i] == 0,
2604		    ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
2605		    TYPENAME(i), ump->softdep_curdeps[i]));
2606#endif
2607	free(ump->um_softdep, M_MOUNTDATA);
2608}
2609
2610static struct jblocks *
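/*
 * Allocate and initialize the in-memory accounting structure used to
 * track the journal's block extents.
 */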
2611jblocks_create(void)
2612{
2613	struct jblocks *jblocks;
2614
2615	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2616	TAILQ_INIT(&jblocks->jb_segs);
2617	jblocks->jb_avail = 10;
2618	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2619	    M_JBLOCKS, M_WAITOK | M_ZERO);
2620
2621	return (jblocks);
2622}
2623
2624static ufs2_daddr_t
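/*
 * Allocate up to 'bytes' of journal space from the current extent,
 * advancing to the next extent when the current one is exhausted.
 * The number of bytes actually granted is returned via 'actual'.
 */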
2625jblocks_alloc(jblocks, bytes, actual)
2626	struct jblocks *jblocks;
2627	int bytes;
2628	int *actual;
2629{
2630	ufs2_daddr_t daddr;
2631	struct jextent *jext;
2632	int freecnt;
2633	int blocks;
2634
2635	blocks = bytes / DEV_BSIZE;
2636	jext = &jblocks->jb_extent[jblocks->jb_head];
2637	freecnt = jext->je_blocks - jblocks->jb_off;
2638	if (freecnt == 0) {
2639		jblocks->jb_off = 0;
2640		if (++jblocks->jb_head > jblocks->jb_used)
2641			jblocks->jb_head = 0;
2642		jext = &jblocks->jb_extent[jblocks->jb_head];
2643		freecnt = jext->je_blocks;
2644	}
2645	if (freecnt > blocks)
2646		freecnt = blocks;
2647	*actual = freecnt * DEV_BSIZE;
2648	daddr = jext->je_daddr + jblocks->jb_off;
2649	jblocks->jb_off += freecnt;
2650	jblocks->jb_free -= freecnt;
2651
2652	return (daddr);
2653}
2654
2655static void
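/*
 * Return journal space to the free count, speed up the worklist if the
 * journal is suspended, and wake any threads sleeping on journal space.
 */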
2656jblocks_free(jblocks, mp, bytes)
2657	struct jblocks *jblocks;
2658	struct mount *mp;
2659	int bytes;
2660{
2661
2662	LOCK_OWNED(VFSTOUFS(mp));
2663	jblocks->jb_free += bytes / DEV_BSIZE;
2664	if (jblocks->jb_suspended)
2665		worklist_speedup(mp);
2666	wakeup(jblocks);
2667}
2668
2669static void
2670jblocks_destroy(jblocks)
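/*
 * Release the extent array and the jblocks structure itself.
 */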
2671	struct jblocks *jblocks;
2672{
2673
2674	if (jblocks->jb_extent)
2675		free(jblocks->jb_extent, M_JBLOCKS);
2676	free(jblocks, M_JBLOCKS);
2677}
2678
2679static void
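/*
 * Add a run of journal blocks, extending the last extent when the new
 * run is contiguous with it and growing the extent array as needed.
 */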
2680jblocks_add(jblocks, daddr, blocks)
2681	struct jblocks *jblocks;
2682	ufs2_daddr_t daddr;
2683	int blocks;
2684{
2685	struct jextent *jext;
2686
2687	jblocks->jb_blocks += blocks;
2688	jblocks->jb_free += blocks;
2689	jext = &jblocks->jb_extent[jblocks->jb_used];
2690	/* Adding the first block. */
2691	if (jext->je_daddr == 0) {
2692		jext->je_daddr = daddr;
2693		jext->je_blocks = blocks;
2694		return;
2695	}
2696	/* Extending the last extent. */
2697	if (jext->je_daddr + jext->je_blocks == daddr) {
2698		jext->je_blocks += blocks;
2699		return;
2700	}
2701	/* Adding a new extent. */
2702	if (++jblocks->jb_used == jblocks->jb_avail) {
2703		jblocks->jb_avail *= 2;
2704		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2705		    M_JBLOCKS, M_WAITOK | M_ZERO);
2706		memcpy(jext, jblocks->jb_extent,
2707		    sizeof(struct jextent) * jblocks->jb_used);
2708		free(jblocks->jb_extent, M_JBLOCKS);
2709		jblocks->jb_extent = jext;
2710	}
2711	jext = &jblocks->jb_extent[jblocks->jb_used];
2712	jext->je_daddr = daddr;
2713	jext->je_blocks = blocks;
2714	return;
2715}
2716
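/*
 * Look up the journal file (SUJ_FILE) in the root directory of the
 * filesystem and return a locked vnode for it in *vpp.
 */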
2717int
2718softdep_journal_lookup(mp, vpp)
2719	struct mount *mp;
2720	struct vnode **vpp;
2721{
2722	struct componentname cnp;
2723	struct vnode *dvp;
2724	ino_t sujournal;
2725	int error;
2726
2727	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2728	if (error)
2729		return (error);
2730	bzero(&cnp, sizeof(cnp));
2731	cnp.cn_nameiop = LOOKUP;
2732	cnp.cn_flags = ISLASTCN;
2733	cnp.cn_thread = curthread;
2734	cnp.cn_cred = curthread->td_ucred;
2735	cnp.cn_pnbuf = SUJ_FILE;
2736	cnp.cn_nameptr = SUJ_FILE;
2737	cnp.cn_namelen = strlen(SUJ_FILE);
2738	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2739	vput(dvp);
2740	if (error != 0)
2741		return (error);
2742	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2743	return (error);
2744}
2745
2746/*
2747 * Open and verify the journal file.
2748 */
2749static int
2750journal_mount(mp, fs, cred)
2751	struct mount *mp;
2752	struct fs *fs;
2753	struct ucred *cred;
2754{
2755	struct jblocks *jblocks;
2756	struct ufsmount *ump;
2757	struct vnode *vp;
2758	struct inode *ip;
2759	ufs2_daddr_t blkno;
2760	int bcount;
2761	int error;
2762	int i;
2763
2764	ump = VFSTOUFS(mp);
2765	ump->softdep_journal_tail = NULL;
2766	ump->softdep_on_journal = 0;
2767	ump->softdep_accdeps = 0;
2768	ump->softdep_req = 0;
2769	ump->softdep_jblocks = NULL;
2770	error = softdep_journal_lookup(mp, &vp);
2771	if (error != 0) {
2772		printf("Failed to find journal.  Use tunefs to create one\n");
2773		return (error);
2774	}
2775	ip = VTOI(vp);
2776	if (ip->i_size < SUJ_MIN) {
2777		error = ENOSPC;
2778		goto out;
2779	}
2780	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2781	jblocks = jblocks_create();
2782	for (i = 0; i < bcount; i++) {
2783		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2784		if (error)
2785			break;
2786		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2787	}
2788	if (error) {
2789		jblocks_destroy(jblocks);
2790		goto out;
2791	}
2792	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2793	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2794	ump->softdep_jblocks = jblocks;
2795out:
2796	if (error == 0) {
2797		MNT_ILOCK(mp);
2798		mp->mnt_flag |= MNT_SUJ;
2799		mp->mnt_flag &= ~MNT_SOFTDEP;
2800		MNT_IUNLOCK(mp);
2801		/*
2802		 * Only validate the journal contents if the
2803		 * filesystem is clean, otherwise we write the logs
2804		 * but they'll never be used.  If the filesystem was
2805		 * still dirty when we mounted it the journal is
2806		 * invalid and a new journal can only be valid if it
2807		 * starts from a clean mount.
2808		 */
2809		if (fs->fs_clean) {
2810			DIP_SET(ip, i_modrev, fs->fs_mtime);
2811			ip->i_flags |= IN_MODIFIED;
2812			ffs_update(vp, 1);
2813		}
2814	}
2815	vput(vp);
2816	return (error);
2817}
2818
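/*
 * Release the journal block accounting when the filesystem is unmounted.
 */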
2819static void
2820journal_unmount(ump)
2821	struct ufsmount *ump;
2822{
2823
2824	if (ump->softdep_jblocks)
2825		jblocks_destroy(ump->softdep_jblocks);
2826	ump->softdep_jblocks = NULL;
2827}
2828
2829/*
2830 * Called when a journal record is ready to be written.  Space is allocated
2831 * and the journal entry is created when the journal is flushed to stable
2832 * store.
2833 */
2834static void
2835add_to_journal(wk)
2836	struct worklist *wk;
2837{
2838	struct ufsmount *ump;
2839
2840	ump = VFSTOUFS(wk->wk_mp);
2841	LOCK_OWNED(ump);
2842	if (wk->wk_state & ONWORKLIST)
2843		panic("add_to_journal: %s(0x%X) already on list",
2844		    TYPENAME(wk->wk_type), wk->wk_state);
2845	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2846	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2847		ump->softdep_jblocks->jb_age = ticks;
2848		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2849	} else
2850		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2851	ump->softdep_journal_tail = wk;
2852	ump->softdep_on_journal += 1;
2853}
2854
2855/*
2856 * Remove an arbitrary item from the journal worklist, maintaining the
2857 * tail pointer.  This happens when a new operation obviates the need to
2858 * journal an old operation.
2859 */
2860static void
2861remove_from_journal(wk)
2862	struct worklist *wk;
2863{
2864	struct ufsmount *ump;
2865
2866	ump = VFSTOUFS(wk->wk_mp);
2867	LOCK_OWNED(ump);
2868#ifdef SUJ_DEBUG
2869	{
2870		struct worklist *wkn;
2871
2872		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2873			if (wkn == wk)
2874				break;
2875		if (wkn == NULL)
2876			panic("remove_from_journal: %p is not in journal", wk);
2877	}
2878#endif
2879	/*
2880	 * We emulate a TAILQ to save space in most structures which do not
2881	 * require TAILQ semantics.  Here we must update the tail position
2882	 * when removing the tail which is not the final entry. This works
2883	 * only if the worklist linkage is at the beginning of the structure.
2884	 */
2885	if (ump->softdep_journal_tail == wk)
2886		ump->softdep_journal_tail =
2887		    (struct worklist *)wk->wk_list.le_prev;
2888
2889	WORKLIST_REMOVE(wk);
2890	ump->softdep_on_journal -= 1;
2891}
2892
2893/*
2894 * Check for journal space as well as dependency limits so the prelink
2895 * code can throttle both journaled and non-journaled filesystems.
2896 * Threshold is 0 for low and 1 for min.
2897 */
2898static int
2899journal_space(ump, thresh)
2900	struct ufsmount *ump;
2901	int thresh;
2902{
2903	struct jblocks *jblocks;
2904	int limit, avail;
2905
2906	jblocks = ump->softdep_jblocks;
2907	if (jblocks == NULL)
2908		return (1);
2909	/*
2910	 * We use a tighter restriction here to prevent request_cleanup()
2911	 * running in other threads from blocking on locks we currently hold.
2912	 * We have to be over the limit and our filesystem has to be
2913	 * responsible for more than our share of that usage.
2914	 */
2915	limit = (max_softdeps / 10) * 9;
2916	if (dep_current[D_INODEDEP] > limit &&
2917	    ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
2918		return (0);
2919	if (thresh)
2920		thresh = jblocks->jb_min;
2921	else
2922		thresh = jblocks->jb_low;
2923	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2924	avail = jblocks->jb_free - avail;
2925
2926	return (avail > thresh);
2927}
2928
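/*
 * Suspend writes to the filesystem when the journal runs low on space.
 */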
2929static void
2930journal_suspend(ump)
2931	struct ufsmount *ump;
2932{
2933	struct jblocks *jblocks;
2934	struct mount *mp;
2935
2936	mp = UFSTOVFS(ump);
2937	jblocks = ump->softdep_jblocks;
2938	MNT_ILOCK(mp);
2939	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2940		stat_journal_min++;
2941		mp->mnt_kern_flag |= MNTK_SUSPEND;
2942		mp->mnt_susp_owner = ump->softdep_flushtd;
2943	}
2944	jblocks->jb_suspended = 1;
2945	MNT_IUNLOCK(mp);
2946}
2947
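/*
 * Resume writes to the filesystem once the journal again has sufficient
 * free space.  Returns 1 if the filesystem was resumed.
 */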
2948static int
2949journal_unsuspend(struct ufsmount *ump)
2950{
2951	struct jblocks *jblocks;
2952	struct mount *mp;
2953
2954	mp = UFSTOVFS(ump);
2955	jblocks = ump->softdep_jblocks;
2956
2957	if (jblocks != NULL && jblocks->jb_suspended &&
2958	    journal_space(ump, jblocks->jb_min)) {
2959		jblocks->jb_suspended = 0;
2960		FREE_LOCK(ump);
2961		mp->mnt_susp_owner = curthread;
2962		vfs_write_resume(mp, 0);
2963		ACQUIRE_LOCK(ump);
2964		return (1);
2965	}
2966	return (0);
2967}
2968
2969/*
2970 * Called before any allocation function to be certain that there is
2971 * sufficient space in the journal prior to creating any new records.
2972 * Since in the case of block allocation we may have multiple locked
2973 * buffers at the time of the actual allocation, we can not block
2974 * when the journal records are created.  Doing so would create a deadlock
2975 * if any of these buffers needed to be flushed to reclaim space.  Instead
2976 * we require a sufficiently large amount of available space such that
2977 * each thread in the system could have passed this allocation check and
2978 * still have sufficient free space.  With 20% of a minimum journal size
2979 * of 1MB we have 6553 records available.
2980 */
2981int
2982softdep_prealloc(vp, waitok)
2983	struct vnode *vp;
2984	int waitok;
2985{
2986	struct ufsmount *ump;
2987
2988	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
2989	    ("softdep_prealloc called on non-softdep filesystem"));
2990	/*
2991	 * Nothing to do if we are not running journaled soft updates.
2992	 * If we currently hold the snapshot lock, we must avoid handling
2993	 * other resources that could cause deadlock.
2994	 */
2995	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)))
2996		return (0);
2997	ump = VFSTOUFS(vp->v_mount);
2998	ACQUIRE_LOCK(ump);
2999	if (journal_space(ump, 0)) {
3000		FREE_LOCK(ump);
3001		return (0);
3002	}
3003	stat_journal_low++;
3004	FREE_LOCK(ump);
3005	if (waitok == MNT_NOWAIT)
3006		return (ENOSPC);
3007	/*
3008	 * Attempt to sync this vnode once to flush any journal
3009	 * work attached to it.
3010	 */
3011	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
3012		ffs_syncvnode(vp, waitok, 0);
3013	ACQUIRE_LOCK(ump);
3014	process_removes(vp);
3015	process_truncates(vp);
3016	if (journal_space(ump, 0) == 0) {
3017		softdep_speedup(ump);
3018		if (journal_space(ump, 1) == 0)
3019			journal_suspend(ump);
3020	}
3021	FREE_LOCK(ump);
3022
3023	return (0);
3024}
3025
3026/*
3027 * Before adjusting a link count on a vnode verify that we have sufficient
3028 * journal space.  If not, process operations that depend on the currently
3029 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
3030 * and softdep flush threads can not acquire these locks to reclaim space.
3031 */
3032static void
3033softdep_prelink(dvp, vp)
3034	struct vnode *dvp;
3035	struct vnode *vp;
3036{
3037	struct ufsmount *ump;
3038
3039	ump = VFSTOUFS(dvp->v_mount);
3040	LOCK_OWNED(ump);
3041	/*
3042	 * Nothing to do if we have sufficient journal space.
3043	 * If we currently hold the snapshot lock, we must avoid
3044	 * handling other resources that could cause deadlock.
3045	 */
3046	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
3047		return;
3048	stat_journal_low++;
3049	FREE_LOCK(ump);
3050	if (vp)
3051		ffs_syncvnode(vp, MNT_NOWAIT, 0);
3052	ffs_syncvnode(dvp, MNT_WAIT, 0);
3053	ACQUIRE_LOCK(ump);
3054	/* Process vp before dvp as it may create .. removes. */
3055	if (vp) {
3056		process_removes(vp);
3057		process_truncates(vp);
3058	}
3059	process_removes(dvp);
3060	process_truncates(dvp);
3061	softdep_speedup(ump);
3062	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3063	if (journal_space(ump, 0) == 0) {
3064		softdep_speedup(ump);
3065		if (journal_space(ump, 1) == 0)
3066			journal_suspend(ump);
3067	}
3068}
3069
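/*
 * The following routines copy each type of in-memory journal dependency
 * into its on-disk record format.  jseg_write() fills in the segment
 * header record that begins each device block of the journal.
 */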
3070static void
3071jseg_write(ump, jseg, data)
3072	struct ufsmount *ump;
3073	struct jseg *jseg;
3074	uint8_t *data;
3075{
3076	struct jsegrec *rec;
3077
3078	rec = (struct jsegrec *)data;
3079	rec->jsr_seq = jseg->js_seq;
3080	rec->jsr_oldest = jseg->js_oldseq;
3081	rec->jsr_cnt = jseg->js_cnt;
3082	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
3083	rec->jsr_crc = 0;
3084	rec->jsr_time = ump->um_fs->fs_mtime;
3085}
3086
3087static inline void
3088inoref_write(inoref, jseg, rec)
3089	struct inoref *inoref;
3090	struct jseg *jseg;
3091	struct jrefrec *rec;
3092{
3093
3094	inoref->if_jsegdep->jd_seg = jseg;
3095	rec->jr_ino = inoref->if_ino;
3096	rec->jr_parent = inoref->if_parent;
3097	rec->jr_nlink = inoref->if_nlink;
3098	rec->jr_mode = inoref->if_mode;
3099	rec->jr_diroff = inoref->if_diroff;
3100}
3101
3102static void
3103jaddref_write(jaddref, jseg, data)
3104	struct jaddref *jaddref;
3105	struct jseg *jseg;
3106	uint8_t *data;
3107{
3108	struct jrefrec *rec;
3109
3110	rec = (struct jrefrec *)data;
3111	rec->jr_op = JOP_ADDREF;
3112	inoref_write(&jaddref->ja_ref, jseg, rec);
3113}
3114
3115static void
3116jremref_write(jremref, jseg, data)
3117	struct jremref *jremref;
3118	struct jseg *jseg;
3119	uint8_t *data;
3120{
3121	struct jrefrec *rec;
3122
3123	rec = (struct jrefrec *)data;
3124	rec->jr_op = JOP_REMREF;
3125	inoref_write(&jremref->jr_ref, jseg, rec);
3126}
3127
3128static void
3129jmvref_write(jmvref, jseg, data)
3130	struct jmvref *jmvref;
3131	struct jseg *jseg;
3132	uint8_t *data;
3133{
3134	struct jmvrec *rec;
3135
3136	rec = (struct jmvrec *)data;
3137	rec->jm_op = JOP_MVREF;
3138	rec->jm_ino = jmvref->jm_ino;
3139	rec->jm_parent = jmvref->jm_parent;
3140	rec->jm_oldoff = jmvref->jm_oldoff;
3141	rec->jm_newoff = jmvref->jm_newoff;
3142}
3143
3144static void
3145jnewblk_write(jnewblk, jseg, data)
3146	struct jnewblk *jnewblk;
3147	struct jseg *jseg;
3148	uint8_t *data;
3149{
3150	struct jblkrec *rec;
3151
3152	jnewblk->jn_jsegdep->jd_seg = jseg;
3153	rec = (struct jblkrec *)data;
3154	rec->jb_op = JOP_NEWBLK;
3155	rec->jb_ino = jnewblk->jn_ino;
3156	rec->jb_blkno = jnewblk->jn_blkno;
3157	rec->jb_lbn = jnewblk->jn_lbn;
3158	rec->jb_frags = jnewblk->jn_frags;
3159	rec->jb_oldfrags = jnewblk->jn_oldfrags;
3160}
3161
3162static void
3163jfreeblk_write(jfreeblk, jseg, data)
3164	struct jfreeblk *jfreeblk;
3165	struct jseg *jseg;
3166	uint8_t *data;
3167{
3168	struct jblkrec *rec;
3169
3170	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3171	rec = (struct jblkrec *)data;
3172	rec->jb_op = JOP_FREEBLK;
3173	rec->jb_ino = jfreeblk->jf_ino;
3174	rec->jb_blkno = jfreeblk->jf_blkno;
3175	rec->jb_lbn = jfreeblk->jf_lbn;
3176	rec->jb_frags = jfreeblk->jf_frags;
3177	rec->jb_oldfrags = 0;
3178}
3179
3180static void
3181jfreefrag_write(jfreefrag, jseg, data)
3182	struct jfreefrag *jfreefrag;
3183	struct jseg *jseg;
3184	uint8_t *data;
3185{
3186	struct jblkrec *rec;
3187
3188	jfreefrag->fr_jsegdep->jd_seg = jseg;
3189	rec = (struct jblkrec *)data;
3190	rec->jb_op = JOP_FREEBLK;
3191	rec->jb_ino = jfreefrag->fr_ino;
3192	rec->jb_blkno = jfreefrag->fr_blkno;
3193	rec->jb_lbn = jfreefrag->fr_lbn;
3194	rec->jb_frags = jfreefrag->fr_frags;
3195	rec->jb_oldfrags = 0;
3196}
3197
3198static void
3199jtrunc_write(jtrunc, jseg, data)
3200	struct jtrunc *jtrunc;
3201	struct jseg *jseg;
3202	uint8_t *data;
3203{
3204	struct jtrncrec *rec;
3205
3206	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3207	rec = (struct jtrncrec *)data;
3208	rec->jt_op = JOP_TRUNC;
3209	rec->jt_ino = jtrunc->jt_ino;
3210	rec->jt_size = jtrunc->jt_size;
3211	rec->jt_extsize = jtrunc->jt_extsize;
3212}
3213
3214static void
3215jfsync_write(jfsync, jseg, data)
3216	struct jfsync *jfsync;
3217	struct jseg *jseg;
3218	uint8_t *data;
3219{
3220	struct jtrncrec *rec;
3221
3222	rec = (struct jtrncrec *)data;
3223	rec->jt_op = JOP_SYNC;
3224	rec->jt_ino = jfsync->jfs_ino;
3225	rec->jt_size = jfsync->jfs_size;
3226	rec->jt_extsize = jfsync->jfs_extsize;
3227}
3228
3229static void
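/*
 * Force all pending journal records for a mount point to be written.
 */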
3230softdep_flushjournal(mp)
3231	struct mount *mp;
3232{
3233	struct jblocks *jblocks;
3234	struct ufsmount *ump;
3235
3236	if (MOUNTEDSUJ(mp) == 0)
3237		return;
3238	ump = VFSTOUFS(mp);
3239	jblocks = ump->softdep_jblocks;
3240	ACQUIRE_LOCK(ump);
3241	while (ump->softdep_on_journal) {
3242		jblocks->jb_needseg = 1;
3243		softdep_process_journal(mp, NULL, MNT_WAIT);
3244	}
3245	FREE_LOCK(ump);
3246}
3247
3248static void softdep_synchronize_completed(struct bio *);
3249static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3250
3251static void
3252softdep_synchronize_completed(bp)
3253        struct bio *bp;
3254{
3255	struct jseg *oldest;
3256	struct jseg *jseg;
3257	struct ufsmount *ump;
3258
3259	/*
3260	 * caller1 marks the last segment written before we issued the
3261	 * synchronize cache.
3262	 */
3263	jseg = bp->bio_caller1;
3264	if (jseg == NULL) {
3265		g_destroy_bio(bp);
3266		return;
3267	}
3268	ump = VFSTOUFS(jseg->js_list.wk_mp);
3269	ACQUIRE_LOCK(ump);
3270	oldest = NULL;
3271	/*
3272	 * Mark all the journal entries waiting on the synchronize cache
3273	 * as completed so they may continue on.
3274	 */
3275	while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3276		jseg->js_state |= COMPLETE;
3277		oldest = jseg;
3278		jseg = TAILQ_PREV(jseg, jseglst, js_next);
3279	}
3280	/*
3281	 * Restart deferred journal entry processing from the oldest
3282	 * completed jseg.
3283	 */
3284	if (oldest)
3285		complete_jsegs(oldest);
3286
3287	FREE_LOCK(ump);
3288	g_destroy_bio(bp);
3289}
3290
3291/*
3292 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3293 * barriers.  The journal must be written prior to any blocks that depend
3294 * on it and the journal can not be released until the blocks have been
3295 * written.  This code handles both barriers simultaneously.
3296 */
3297static void
3298softdep_synchronize(bp, ump, caller1)
3299	struct bio *bp;
3300	struct ufsmount *ump;
3301	void *caller1;
3302{
3303
3304	bp->bio_cmd = BIO_FLUSH;
3305	bp->bio_flags |= BIO_ORDERED;
3306	bp->bio_data = NULL;
3307	bp->bio_offset = ump->um_cp->provider->mediasize;
3308	bp->bio_length = 0;
3309	bp->bio_done = softdep_synchronize_completed;
3310	bp->bio_caller1 = caller1;
3311	g_io_request(bp,
3312	    (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
3313}
3314
3315/*
3316 * Flush some journal records to disk.
3317 */
3318static void
3319softdep_process_journal(mp, needwk, flags)
3320	struct mount *mp;
3321	struct worklist *needwk;
3322	int flags;
3323{
3324	struct jblocks *jblocks;
3325	struct ufsmount *ump;
3326	struct worklist *wk;
3327	struct jseg *jseg;
3328	struct buf *bp;
3329	struct bio *bio;
3330	uint8_t *data;
3331	struct fs *fs;
3332	int shouldflush;
3333	int segwritten;
3334	int jrecmin;	/* Minimum records per block. */
3335	int jrecmax;	/* Maximum records per block. */
3336	int size;
3337	int cnt;
3338	int off;
3339	int devbsize;
3340
3341	if (MOUNTEDSUJ(mp) == 0)
3342		return;
3343	shouldflush = softdep_flushcache;
3344	bio = NULL;
3345	jseg = NULL;
3346	ump = VFSTOUFS(mp);
3347	LOCK_OWNED(ump);
3348	fs = ump->um_fs;
3349	jblocks = ump->softdep_jblocks;
3350	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3351	/*
3352	 * We write anywhere between a disk block and fs block.  The upper
3353	 * bound is picked to prevent buffer cache fragmentation and limit
3354	 * processing time per I/O.
3355	 */
3356	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3357	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3358	segwritten = 0;
3359	for (;;) {
3360		cnt = ump->softdep_on_journal;
3361		/*
3362		 * Criteria for writing a segment:
3363		 * 1) We have a full block.
3364		 * 2) We're called from jwait() and haven't found the
3365		 *    journal item yet.
3366		 * 3) Always write if needseg is set.
3367		 * 4) If we are called from process_worklist and have
3368		 *    not yet written anything we write a partial block
3369		 *    to enforce a 1 second maximum latency on journal
3370		 *    entries.
3371		 */
3372		if (cnt < (jrecmax - 1) && needwk == NULL &&
3373		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3374			break;
3375		cnt++;
3376		/*
3377		 * Verify some free journal space.  softdep_prealloc() should
3378		 * guarantee that we don't run out so this is indicative of
3379		 * a problem with the flow control.  Try to recover
3380		 * gracefully in any event.
3381		 */
3382		while (jblocks->jb_free == 0) {
3383			if (flags != MNT_WAIT)
3384				break;
3385			printf("softdep: Out of journal space!\n");
3386			softdep_speedup(ump);
3387			msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3388		}
3389		FREE_LOCK(ump);
3390		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3391		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3392		LIST_INIT(&jseg->js_entries);
3393		LIST_INIT(&jseg->js_indirs);
3394		jseg->js_state = ATTACHED;
3395		if (shouldflush == 0)
3396			jseg->js_state |= COMPLETE;
3397		else if (bio == NULL)
3398			bio = g_alloc_bio();
3399		jseg->js_jblocks = jblocks;
3400		bp = geteblk(fs->fs_bsize, 0);
3401		ACQUIRE_LOCK(ump);
3402		/*
3403		 * If there was a race while we were allocating the block
3404		 * and jseg, the entry we care about was likely written.
3405		 * We bail out in both the WAIT and NOWAIT case and assume
3406		 * the caller will loop if the entry it cares about is
3407		 * not written.
3408		 */
3409		cnt = ump->softdep_on_journal;
3410		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3411			bp->b_flags |= B_INVAL | B_NOCACHE;
3412			WORKITEM_FREE(jseg, D_JSEG);
3413			FREE_LOCK(ump);
3414			brelse(bp);
3415			ACQUIRE_LOCK(ump);
3416			break;
3417		}
3418		/*
3419		 * Calculate the disk block size required for the available
3420		 * records rounded to the min size.
3421		 */
3422		if (cnt == 0)
3423			size = devbsize;
3424		else if (cnt < jrecmax)
3425			size = howmany(cnt, jrecmin) * devbsize;
3426		else
3427			size = fs->fs_bsize;
3428		/*
3429		 * Allocate a disk block for this journal data and account
3430		 * for truncation of the requested size if enough contiguous
3431		 * space was not available.
3432		 */
3433		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3434		bp->b_lblkno = bp->b_blkno;
3435		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3436		bp->b_bcount = size;
3437		bp->b_flags &= ~B_INVAL;
3438		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3439		/*
3440		 * Initialize our jseg with cnt records.  Assign the next
3441		 * sequence number to it and link it in-order.
3442		 */
3443		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3444		jseg->js_buf = bp;
3445		jseg->js_cnt = cnt;
3446		jseg->js_refs = cnt + 1;	/* Self ref. */
3447		jseg->js_size = size;
3448		jseg->js_seq = jblocks->jb_nextseq++;
3449		if (jblocks->jb_oldestseg == NULL)
3450			jblocks->jb_oldestseg = jseg;
3451		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3452		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3453		if (jblocks->jb_writeseg == NULL)
3454			jblocks->jb_writeseg = jseg;
3455		/*
3456		 * Start filling in records from the pending list.
3457		 */
3458		data = bp->b_data;
3459		off = 0;
3460
3461		/*
3462		 * Always put a header on the first block.
3463		 * XXX As with below, there might not be a chance to get
3464		 * into the loop.  Ensure that something valid is written.
3465		 */
3466		jseg_write(ump, jseg, data);
3467		off += JREC_SIZE;
3468		data = bp->b_data + off;
3469
3470		/*
3471		 * XXX Something is wrong here.  There's no work to do,
3472		 * but we need to perform an I/O and allow it to complete
3473		 * anyway.
3474		 */
3475		if (LIST_EMPTY(&ump->softdep_journal_pending))
3476			stat_emptyjblocks++;
3477
3478		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3479		    != NULL) {
3480			if (cnt == 0)
3481				break;
3482			/* Place a segment header on every device block. */
3483			if ((off % devbsize) == 0) {
3484				jseg_write(ump, jseg, data);
3485				off += JREC_SIZE;
3486				data = bp->b_data + off;
3487			}
3488			if (wk == needwk)
3489				needwk = NULL;
3490			remove_from_journal(wk);
3491			wk->wk_state |= INPROGRESS;
3492			WORKLIST_INSERT(&jseg->js_entries, wk);
3493			switch (wk->wk_type) {
3494			case D_JADDREF:
3495				jaddref_write(WK_JADDREF(wk), jseg, data);
3496				break;
3497			case D_JREMREF:
3498				jremref_write(WK_JREMREF(wk), jseg, data);
3499				break;
3500			case D_JMVREF:
3501				jmvref_write(WK_JMVREF(wk), jseg, data);
3502				break;
3503			case D_JNEWBLK:
3504				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3505				break;
3506			case D_JFREEBLK:
3507				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3508				break;
3509			case D_JFREEFRAG:
3510				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3511				break;
3512			case D_JTRUNC:
3513				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3514				break;
3515			case D_JFSYNC:
3516				jfsync_write(WK_JFSYNC(wk), jseg, data);
3517				break;
3518			default:
3519				panic("process_journal: Unknown type %s",
3520				    TYPENAME(wk->wk_type));
3521				/* NOTREACHED */
3522			}
3523			off += JREC_SIZE;
3524			data = bp->b_data + off;
3525			cnt--;
3526		}
3527
3528		/* Clear any remaining space so we don't leak kernel data */
3529		if (size > off)
3530			bzero(data, size - off);
3531
3532		/*
3533		 * Write this one buffer and continue.
3534		 */
3535		segwritten = 1;
3536		jblocks->jb_needseg = 0;
3537		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3538		FREE_LOCK(ump);
3539		pbgetvp(ump->um_devvp, bp);
3540		/*
3541		 * We only do the blocking wait once we find the journal
3542		 * entry we're looking for.
3543		 */
3544		if (needwk == NULL && flags == MNT_WAIT)
3545			bwrite(bp);
3546		else
3547			bawrite(bp);
3548		ACQUIRE_LOCK(ump);
3549	}
3550	/*
3551	 * If we wrote a segment, issue a synchronize cache so the journal
3552	 * is reflected on disk before the data is written.  Since reclaiming
3553	 * journal space also requires writing a journal record this
3554	 * process also enforces a barrier before reclamation.
3555	 */
3556	if (segwritten && shouldflush) {
3557		softdep_synchronize(bio, ump,
3558		    TAILQ_LAST(&jblocks->jb_segs, jseglst));
3559	} else if (bio)
3560		g_destroy_bio(bio);
3561	/*
3562	 * If we've suspended the filesystem because we ran out of journal
3563	 * space, either try to sync it here to make some progress or
3564	 * unsuspend it if we already have.
3565	 */
3566	if (flags == 0 && jblocks->jb_suspended) {
3567		if (journal_unsuspend(ump))
3568			return;
3569		FREE_LOCK(ump);
3570		VFS_SYNC(mp, MNT_NOWAIT);
3571		ffs_sbupdate(ump, MNT_WAIT, 0);
3572		ACQUIRE_LOCK(ump);
3573	}
3574}
3575
3576/*
3577 * Complete a jseg, allowing all dependencies awaiting journal writes
3578 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3579 * structures so that the journal segment can be freed to reclaim space.
3580 */
3581static void
3582complete_jseg(jseg)
3583	struct jseg *jseg;
3584{
3585	struct worklist *wk;
3586	struct jmvref *jmvref;
3587	int waiting;
3588#ifdef INVARIANTS
3589	int i = 0;
3590#endif
3591
3592	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3593		WORKLIST_REMOVE(wk);
3594		waiting = wk->wk_state & IOWAITING;
3595		wk->wk_state &= ~(INPROGRESS | IOWAITING);
3596		wk->wk_state |= COMPLETE;
3597		KASSERT(i++ < jseg->js_cnt,
3598		    ("handle_written_jseg: overflow %d >= %d",
3599		    i - 1, jseg->js_cnt));
3600		switch (wk->wk_type) {
3601		case D_JADDREF:
3602			handle_written_jaddref(WK_JADDREF(wk));
3603			break;
3604		case D_JREMREF:
3605			handle_written_jremref(WK_JREMREF(wk));
3606			break;
3607		case D_JMVREF:
3608			rele_jseg(jseg);	/* No jsegdep. */
3609			jmvref = WK_JMVREF(wk);
3610			LIST_REMOVE(jmvref, jm_deps);
3611			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3612				free_pagedep(jmvref->jm_pagedep);
3613			WORKITEM_FREE(jmvref, D_JMVREF);
3614			break;
3615		case D_JNEWBLK:
3616			handle_written_jnewblk(WK_JNEWBLK(wk));
3617			break;
3618		case D_JFREEBLK:
3619			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3620			break;
3621		case D_JTRUNC:
3622			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3623			break;
3624		case D_JFSYNC:
3625			rele_jseg(jseg);	/* No jsegdep. */
3626			WORKITEM_FREE(wk, D_JFSYNC);
3627			break;
3628		case D_JFREEFRAG:
3629			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3630			break;
3631		default:
3632			panic("handle_written_jseg: Unknown type %s",
3633			    TYPENAME(wk->wk_type));
3634			/* NOTREACHED */
3635		}
3636		if (waiting)
3637			wakeup(wk);
3638	}
3639	/* Release the self reference so the structure may be freed. */
3640	rele_jseg(jseg);
3641}
3642
3643/*
3644 * Determine which jsegs are ready for completion processing.  Waits for
3645 * synchronize cache to complete as well as forcing in-order completion
3646 * of journal entries.
3647 */
3648static void
3649complete_jsegs(jseg)
3650	struct jseg *jseg;
3651{
3652	struct jblocks *jblocks;
3653	struct jseg *jsegn;
3654
3655	jblocks = jseg->js_jblocks;
3656	/*
3657	 * Don't allow out of order completions.  If this isn't the first
3658	 * block wait for it to write before we're done.
3659	 */
3660	if (jseg != jblocks->jb_writeseg)
3661		return;
3662	/* Iterate through available jsegs processing their entries. */
3663	while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3664		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3665		jsegn = TAILQ_NEXT(jseg, js_next);
3666		complete_jseg(jseg);
3667		jseg = jsegn;
3668	}
3669	jblocks->jb_writeseg = jseg;
3670	/*
3671	 * Attempt to free jsegs now that oldestwrseq may have advanced.
3672	 */
3673	free_jsegs(jblocks);
3674}
3675
3676/*
3677 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
3678 * the final completions.
3679 */
3680static void
3681handle_written_jseg(jseg, bp)
3682	struct jseg *jseg;
3683	struct buf *bp;
3684{
3685
3686	if (jseg->js_refs == 0)
3687		panic("handle_written_jseg: No self-reference on %p", jseg);
3688	jseg->js_state |= DEPCOMPLETE;
3689	/*
3690	 * We'll never need this buffer again, set flags so it will be
3691	 * discarded.
3692	 */
3693	bp->b_flags |= B_INVAL | B_NOCACHE;
3694	pbrelvp(bp);
3695	complete_jsegs(jseg);
3696}
3697
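/*
 * Detach and return the jsegdep attached to an inoref (jaddref or jremref)
 * so the caller can transfer it to the structure awaiting the journal write.
 */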
3698static inline struct jsegdep *
3699inoref_jseg(inoref)
3700	struct inoref *inoref;
3701{
3702	struct jsegdep *jsegdep;
3703
3704	jsegdep = inoref->if_jsegdep;
3705	inoref->if_jsegdep = NULL;
3706
3707	return (jsegdep);
3708}
3709
3710/*
3711 * Called once a jremref has made it to stable store.  The jremref is marked
3712 * complete and we attempt to free it.  Any pagedep writes sleeping while
3713 * waiting for the jremref to complete will be awoken by free_jremref.
3714 */
3715static void
3716handle_written_jremref(jremref)
3717	struct jremref *jremref;
3718{
3719	struct inodedep *inodedep;
3720	struct jsegdep *jsegdep;
3721	struct dirrem *dirrem;
3722
3723	/* Grab the jsegdep. */
3724	jsegdep = inoref_jseg(&jremref->jr_ref);
3725	/*
3726	 * Remove us from the inoref list.
3727	 */
3728	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3729	    0, &inodedep) == 0)
3730		panic("handle_written_jremref: Lost inodedep");
3731	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3732	/*
3733	 * Complete the dirrem.
3734	 */
3735	dirrem = jremref->jr_dirrem;
3736	jremref->jr_dirrem = NULL;
3737	LIST_REMOVE(jremref, jr_deps);
3738	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3739	jwork_insert(&dirrem->dm_jwork, jsegdep);
3740	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3741	    (dirrem->dm_state & COMPLETE) != 0)
3742		add_to_worklist(&dirrem->dm_list, 0);
3743	free_jremref(jremref);
3744}
3745
3746/*
3747 * Called once a jaddref has made it to stable store.  The dependency is
3748 * marked complete and any dependent structures are added to the inode
3749 * bufwait list to be completed as soon as it is written.  If a bitmap write
3750 * depends on this entry we move the inode into the inodedephd of the
3751 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3752 */
3753static void
3754handle_written_jaddref(jaddref)
3755	struct jaddref *jaddref;
3756{
3757	struct jsegdep *jsegdep;
3758	struct inodedep *inodedep;
3759	struct diradd *diradd;
3760	struct mkdir *mkdir;
3761
3762	/* Grab the jsegdep. */
3763	jsegdep = inoref_jseg(&jaddref->ja_ref);
3764	mkdir = NULL;
3765	diradd = NULL;
3766	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3767	    0, &inodedep) == 0)
3768		panic("handle_written_jaddref: Lost inodedep.");
3769	if (jaddref->ja_diradd == NULL)
3770		panic("handle_written_jaddref: No dependency");
3771	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3772		diradd = jaddref->ja_diradd;
3773		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3774	} else if (jaddref->ja_state & MKDIR_PARENT) {
3775		mkdir = jaddref->ja_mkdir;
3776		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3777	} else if (jaddref->ja_state & MKDIR_BODY)
3778		mkdir = jaddref->ja_mkdir;
3779	else
3780		panic("handle_written_jaddref: Unknown dependency %p",
3781		    jaddref->ja_diradd);
3782	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3783	/*
3784	 * Remove us from the inode list.
3785	 */
3786	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3787	/*
3788	 * The mkdir may be waiting on the jaddref to clear before freeing.
3789	 */
3790	if (mkdir) {
3791		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3792		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3793		    TYPENAME(mkdir->md_list.wk_type)));
3794		mkdir->md_jaddref = NULL;
3795		diradd = mkdir->md_diradd;
3796		mkdir->md_state |= DEPCOMPLETE;
3797		complete_mkdir(mkdir);
3798	}
3799	jwork_insert(&diradd->da_jwork, jsegdep);
3800	if (jaddref->ja_state & NEWBLOCK) {
3801		inodedep->id_state |= ONDEPLIST;
3802		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3803		    inodedep, id_deps);
3804	}
3805	free_jaddref(jaddref);
3806}
3807
3808/*
3809 * Called once a jnewblk journal is written.  The allocdirect or allocindir
3810 * is placed in the bmsafemap to await notification of a written bitmap.  If
3811 * the operation was canceled we add the segdep to the appropriate
3812 * dependency to free the journal space once the canceling operation
3813 * completes.
3814 */
3815static void
3816handle_written_jnewblk(jnewblk)
3817	struct jnewblk *jnewblk;
3818{
3819	struct bmsafemap *bmsafemap;
3820	struct freefrag *freefrag;
3821	struct freework *freework;
3822	struct jsegdep *jsegdep;
3823	struct newblk *newblk;
3824
3825	/* Grab the jsegdep. */
3826	jsegdep = jnewblk->jn_jsegdep;
3827	jnewblk->jn_jsegdep = NULL;
3828	if (jnewblk->jn_dep == NULL)
3829		panic("handle_written_jnewblk: No dependency for the segdep.");
3830	switch (jnewblk->jn_dep->wk_type) {
3831	case D_NEWBLK:
3832	case D_ALLOCDIRECT:
3833	case D_ALLOCINDIR:
3834		/*
3835		 * Add the written block to the bmsafemap so it can
3836		 * be notified when the bitmap is on disk.
3837		 */
3838		newblk = WK_NEWBLK(jnewblk->jn_dep);
3839		newblk->nb_jnewblk = NULL;
3840		if ((newblk->nb_state & GOINGAWAY) == 0) {
3841			bmsafemap = newblk->nb_bmsafemap;
3842			newblk->nb_state |= ONDEPLIST;
3843			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3844			    nb_deps);
3845		}
3846		jwork_insert(&newblk->nb_jwork, jsegdep);
3847		break;
3848	case D_FREEFRAG:
3849		/*
3850		 * A newblock being removed by a freefrag when replaced by
3851		 * frag extension.
3852		 */
3853		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3854		freefrag->ff_jdep = NULL;
3855		jwork_insert(&freefrag->ff_jwork, jsegdep);
3856		break;
3857	case D_FREEWORK:
3858		/*
3859		 * A direct block was removed by truncate.
3860		 */
3861		freework = WK_FREEWORK(jnewblk->jn_dep);
3862		freework->fw_jnewblk = NULL;
3863		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3864		break;
3865	default:
3866		panic("handle_written_jnewblk: Unknown type %d.",
3867		    jnewblk->jn_dep->wk_type);
3868	}
3869	jnewblk->jn_dep = NULL;
3870	free_jnewblk(jnewblk);
3871}
3872
3873/*
3874 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3875 * an in-flight allocation that has not yet been committed.  Divorce us
3876 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3877 * to the worklist.
3878 */
3879static void
3880cancel_jfreefrag(jfreefrag)
3881	struct jfreefrag *jfreefrag;
3882{
3883	struct freefrag *freefrag;
3884
3885	if (jfreefrag->fr_jsegdep) {
3886		free_jsegdep(jfreefrag->fr_jsegdep);
3887		jfreefrag->fr_jsegdep = NULL;
3888	}
3889	freefrag = jfreefrag->fr_freefrag;
3890	jfreefrag->fr_freefrag = NULL;
3891	free_jfreefrag(jfreefrag);
3892	freefrag->ff_state |= DEPCOMPLETE;
3893	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
3894}
3895
3896/*
3897 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3898 */
3899static void
3900free_jfreefrag(jfreefrag)
3901	struct jfreefrag *jfreefrag;
3902{
3903
3904	if (jfreefrag->fr_state & INPROGRESS)
3905		WORKLIST_REMOVE(&jfreefrag->fr_list);
3906	else if (jfreefrag->fr_state & ONWORKLIST)
3907		remove_from_journal(&jfreefrag->fr_list);
3908	if (jfreefrag->fr_freefrag != NULL)
3909		panic("free_jfreefrag:  Still attached to a freefrag.");
3910	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3911}
3912
3913/*
3914 * Called when the journal write for a jfreefrag completes.  The parent
3915 * freefrag is added to the worklist if this completes its dependencies.
3916 */
3917static void
3918handle_written_jfreefrag(jfreefrag)
3919	struct jfreefrag *jfreefrag;
3920{
3921	struct jsegdep *jsegdep;
3922	struct freefrag *freefrag;
3923
3924	/* Grab the jsegdep. */
3925	jsegdep = jfreefrag->fr_jsegdep;
3926	jfreefrag->fr_jsegdep = NULL;
3927	freefrag = jfreefrag->fr_freefrag;
3928	if (freefrag == NULL)
3929		panic("handle_written_jfreefrag: No freefrag.");
3930	freefrag->ff_state |= DEPCOMPLETE;
3931	freefrag->ff_jdep = NULL;
3932	jwork_insert(&freefrag->ff_jwork, jsegdep);
3933	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3934		add_to_worklist(&freefrag->ff_list, 0);
3935	jfreefrag->fr_freefrag = NULL;
3936	free_jfreefrag(jfreefrag);
3937}
3938
3939/*
3940 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3941 * is removed from the freeblks list of pending journal writes and the
3942 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3943 * have been reclaimed.
3944 */
3945static void
3946handle_written_jblkdep(jblkdep)
3947	struct jblkdep *jblkdep;
3948{
3949	struct freeblks *freeblks;
3950	struct jsegdep *jsegdep;
3951
3952	/* Grab the jsegdep. */
3953	jsegdep = jblkdep->jb_jsegdep;
3954	jblkdep->jb_jsegdep = NULL;
3955	freeblks = jblkdep->jb_freeblks;
3956	LIST_REMOVE(jblkdep, jb_deps);
3957	jwork_insert(&freeblks->fb_jwork, jsegdep);
3958	/*
3959	 * If the freeblks is all journaled, we can add it to the worklist.
3960	 */
3961	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3962	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3963		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3964
3965	free_jblkdep(jblkdep);
3966}
3967
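/*
 * Allocate a jsegdep for the journal record associated with 'wk'.  The
 * jsegdep pins the journal segment that will eventually hold the record
 * until the dependent operation completes.
 */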
3968static struct jsegdep *
3969newjsegdep(struct worklist *wk)
3970{
3971	struct jsegdep *jsegdep;
3972
3973	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3974	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3975	jsegdep->jd_seg = NULL;
3976
3977	return (jsegdep);
3978}
3979
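/*
 * Allocate a jmvref to journal the movement of the directory entry for
 * inode 'ino' within directory 'dp' from offset 'oldoff' to 'newoff'.
 */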
3980static struct jmvref *
3981newjmvref(dp, ino, oldoff, newoff)
3982	struct inode *dp;
3983	ino_t ino;
3984	off_t oldoff;
3985	off_t newoff;
3986{
3987	struct jmvref *jmvref;
3988
3989	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
3990	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
3991	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
3992	jmvref->jm_parent = dp->i_number;
3993	jmvref->jm_ino = ino;
3994	jmvref->jm_oldoff = oldoff;
3995	jmvref->jm_newoff = newoff;
3996
3997	return (jmvref);
3998}
3999
4000/*
4001 * Allocate a new jremref that tracks the removal of ip from dp with the
4002 * directory entry offset of diroff.  Mark the entry as ATTACHED and
4003 * DEPCOMPLETE as we have all the information required for the journal write
4004 * and the directory has already been removed from the buffer.  The caller
4005 * is responsible for linking the jremref into the pagedep and adding it
4006 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
4007 * a DOTDOT addition so handle_workitem_remove() can properly assign
4008 * the jsegdep when we're done.
4009 */
4010static struct jremref *
4011newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
4012    off_t diroff, nlink_t nlink)
4013{
4014	struct jremref *jremref;
4015
4016	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
4017	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
4018	jremref->jr_state = ATTACHED;
4019	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
4020	   nlink, ip->i_mode);
4021	jremref->jr_dirrem = dirrem;
4022
4023	return (jremref);
4024}
4025
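/*
 * Initialize the inoref fields shared by jaddref and jremref, including
 * the jsegdep that will track the journal write.
 */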
4026static inline void
4027newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
4028    nlink_t nlink, uint16_t mode)
4029{
4030
4031	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
4032	inoref->if_diroff = diroff;
4033	inoref->if_ino = ino;
4034	inoref->if_parent = parent;
4035	inoref->if_nlink = nlink;
4036	inoref->if_mode = mode;
4037}
4038
4039/*
4040 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
4041 * directory offset may not be known until later.  The caller is responsible
4042 * adding the entry to the journal when this information is available.  nlink
4043 * for adding the entry to the journal when this information is available.  nlink
4044 * to have the correct FMT.
4045 */
4046static struct jaddref *
4047newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
4048    uint16_t mode)
4049{
4050	struct jaddref *jaddref;
4051
4052	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
4053	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
4054	jaddref->ja_state = ATTACHED;
4055	jaddref->ja_mkdir = NULL;
4056	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
4057
4058	return (jaddref);
4059}
4060
4061/*
4062 * Create a new free dependency for a freework.  The caller is responsible
4063 * for adjusting the reference count when it has the lock held.  The freedep
4064 * will track an outstanding bitmap write that, once complete, allows the
4065 * freework to continue.
4066 */
4067static struct freedep *
4068newfreedep(struct freework *freework)
4069{
4070	struct freedep *freedep;
4071
4072	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
4073	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
4074	freedep->fd_freework = freework;
4075
4076	return (freedep);
4077}
4078
4079/*
4080 * Free a freedep structure once the buffer it is linked to is written.  If
4081 * this is the last reference to the freework schedule it for completion.
4082 */
4083static void
4084free_freedep(freedep)
4085	struct freedep *freedep;
4086{
4087	struct freework *freework;
4088
4089	freework = freedep->fd_freework;
4090	freework->fw_freeblks->fb_cgwait--;
4091	if (--freework->fw_ref == 0)
4092		freework_enqueue(freework);
4093	WORKITEM_FREE(freedep, D_FREEDEP);
4094}
4095
4096/*
4097 * Allocate a new freework structure that may be a level in an indirect
4098 * when parent is not NULL or a top level block when it is.  The top level
4099 * freework structures are allocated without the per-filesystem lock held
4100 * and before the freeblks is visible outside of softdep_setup_freeblocks().
4101 */
4102static struct freework *
4103newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
4104	struct ufsmount *ump;
4105	struct freeblks *freeblks;
4106	struct freework *parent;
4107	ufs_lbn_t lbn;
4108	ufs2_daddr_t nb;
4109	int frags;
4110	int off;
4111	int journal;
4112{
4113	struct freework *freework;
4114
4115	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
4116	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
4117	freework->fw_state = ATTACHED;
4118	freework->fw_jnewblk = NULL;
4119	freework->fw_freeblks = freeblks;
4120	freework->fw_parent = parent;
4121	freework->fw_lbn = lbn;
4122	freework->fw_blkno = nb;
4123	freework->fw_frags = frags;
4124	freework->fw_indir = NULL;
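	/*
	 * With SUJ, a freework for an indirect block (lbn < -NXADDR) starts
	 * with one reference per child pointer plus one for itself; all
	 * other blocks start with no outstanding references.
	 */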
4125	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
4126		? 0 : NINDIR(ump->um_fs) + 1;
4127	freework->fw_start = freework->fw_off = off;
4128	if (journal)
4129		newjfreeblk(freeblks, lbn, nb, frags);
4130	if (parent == NULL) {
4131		ACQUIRE_LOCK(ump);
4132		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4133		freeblks->fb_ref++;
4134		FREE_LOCK(ump);
4135	}
4136
4137	return (freework);
4138}
4139
4140/*
4141 * Eliminate a jfreeblk for a block that does not need journaling.
4142 */
4143static void
4144cancel_jfreeblk(freeblks, blkno)
4145	struct freeblks *freeblks;
4146	ufs2_daddr_t blkno;
4147{
4148	struct jfreeblk *jfreeblk;
4149	struct jblkdep *jblkdep;
4150
4151	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4152		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4153			continue;
4154		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4155		if (jfreeblk->jf_blkno == blkno)
4156			break;
4157	}
4158	if (jblkdep == NULL)
4159		return;
4160	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4161	free_jsegdep(jblkdep->jb_jsegdep);
4162	LIST_REMOVE(jblkdep, jb_deps);
4163	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4164}
4165
4166/*
4167 * Allocate a new jfreeblk to journal a top level block pointer when truncating
4168 * a file.  The caller must add this to the worklist when the per-filesystem
4169 * lock is held.
4170 */
4171static struct jfreeblk *
4172newjfreeblk(freeblks, lbn, blkno, frags)
4173	struct freeblks *freeblks;
4174	ufs_lbn_t lbn;
4175	ufs2_daddr_t blkno;
4176	int frags;
4177{
4178	struct jfreeblk *jfreeblk;
4179
4180	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4181	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4182	    freeblks->fb_list.wk_mp);
4183	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4184	jfreeblk->jf_dep.jb_freeblks = freeblks;
4185	jfreeblk->jf_ino = freeblks->fb_inum;
4186	jfreeblk->jf_lbn = lbn;
4187	jfreeblk->jf_blkno = blkno;
4188	jfreeblk->jf_frags = frags;
4189	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4190
4191	return (jfreeblk);
4192}
4193
4194/*
4195 * The journal is only prepared to handle full-size block numbers, so we
4196 * have to adjust the record to reflect the change to a full-size block.
4197 * For example, suppose we have a block made up of fragments 8-15 and
4198 * want to free its last two fragments. We are given a request that says:
4199 *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
4200 * where frags are the number of fragments to free and oldfrags are the
4201 * number of fragments to keep. To block align it, we have to change it to
4202 * have a valid full-size blkno, so it becomes:
4203 *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
4204 */
4205static void
4206adjust_newfreework(freeblks, frag_offset)
4207	struct freeblks *freeblks;
4208	int frag_offset;
4209{
4210	struct jfreeblk *jfreeblk;
4211
4212	KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
4213	    LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
4214	    ("adjust_newfreework: Missing freeblks dependency"));
4215
4216	jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
4217	jfreeblk->jf_blkno -= frag_offset;
4218	jfreeblk->jf_frags += frag_offset;
4219}
4220
4221/*
4222 * Allocate a new jtrunc to track a partial truncation.
4223 */
4224static struct jtrunc *
4225newjtrunc(freeblks, size, extsize)
4226	struct freeblks *freeblks;
4227	off_t size;
4228	int extsize;
4229{
4230	struct jtrunc *jtrunc;
4231
4232	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4233	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4234	    freeblks->fb_list.wk_mp);
4235	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4236	jtrunc->jt_dep.jb_freeblks = freeblks;
4237	jtrunc->jt_ino = freeblks->fb_inum;
4238	jtrunc->jt_size = size;
4239	jtrunc->jt_extsize = extsize;
4240	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4241
4242	return (jtrunc);
4243}
4244
4245/*
4246 * If we're canceling a new bitmap we have to search for another ref
4247 * to move into the bmsafemap dep.  This might be better expressed
4248 * with another structure.
4249 */
4250static void
4251move_newblock_dep(jaddref, inodedep)
4252	struct jaddref *jaddref;
4253	struct inodedep *inodedep;
4254{
4255	struct inoref *inoref;
4256	struct jaddref *jaddrefn;
4257
4258	jaddrefn = NULL;
4259	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4260	    inoref = TAILQ_NEXT(inoref, if_deps)) {
4261		if ((jaddref->ja_state & NEWBLOCK) &&
4262		    inoref->if_list.wk_type == D_JADDREF) {
4263			jaddrefn = (struct jaddref *)inoref;
4264			break;
4265		}
4266	}
4267	if (jaddrefn == NULL)
4268		return;
4269	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4270	jaddrefn->ja_state |= jaddref->ja_state &
4271	    (ATTACHED | UNDONE | NEWBLOCK);
4272	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4273	jaddref->ja_state |= ATTACHED;
4274	LIST_REMOVE(jaddref, ja_bmdeps);
4275	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4276	    ja_bmdeps);
4277}
4278
4279/*
4280 * Cancel a jaddref either before it has been written or while it is being
4281 * written.  This happens when a link is removed before the add reaches
4282 * the disk.  The jaddref dependency is kept linked into the bmsafemap
4283 * and inode to prevent the link count or bitmap from reaching the disk
4284 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4285 * required.
4286 *
4287 * Returns 1 if the canceled addref requires journaling of the remove and
4288 * 0 otherwise.
4289 */
4290static int
4291cancel_jaddref(jaddref, inodedep, wkhd)
4292	struct jaddref *jaddref;
4293	struct inodedep *inodedep;
4294	struct workhead *wkhd;
4295{
4296	struct inoref *inoref;
4297	struct jsegdep *jsegdep;
4298	int needsj;
4299
4300	KASSERT((jaddref->ja_state & COMPLETE) == 0,
4301	    ("cancel_jaddref: Canceling complete jaddref"));
4302	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4303		needsj = 1;
4304	else
4305		needsj = 0;
4306	if (inodedep == NULL)
4307		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4308		    0, &inodedep) == 0)
4309			panic("cancel_jaddref: Lost inodedep");
4310	/*
4311	 * We must adjust the nlink of any reference operation that follows
4312	 * us so that it is consistent with the in-memory reference.  This
4313	 * ensures that inode nlink rollbacks always have the correct link.
4314	 */
4315	if (needsj == 0) {
4316		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4317		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4318			if (inoref->if_state & GOINGAWAY)
4319				break;
4320			inoref->if_nlink--;
4321		}
4322	}
4323	jsegdep = inoref_jseg(&jaddref->ja_ref);
4324	if (jaddref->ja_state & NEWBLOCK)
4325		move_newblock_dep(jaddref, inodedep);
4326	wake_worklist(&jaddref->ja_list);
4327	jaddref->ja_mkdir = NULL;
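	/*
	 * If the journal write is already in flight, hand the jsegdep to
	 * the caller's work list; otherwise the record has not been written,
	 * so the jsegdep is freed and the entry is removed from the journal
	 * if it was queued there.
	 */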
4328	if (jaddref->ja_state & INPROGRESS) {
4329		jaddref->ja_state &= ~INPROGRESS;
4330		WORKLIST_REMOVE(&jaddref->ja_list);
4331		jwork_insert(wkhd, jsegdep);
4332	} else {
4333		free_jsegdep(jsegdep);
4334		if (jaddref->ja_state & DEPCOMPLETE)
4335			remove_from_journal(&jaddref->ja_list);
4336	}
4337	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4338	/*
4339	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4340	 * can arrange for them to be freed with the bitmap.  Otherwise we
4341	 * no longer need this addref attached to the inoreflst and it
4342	 * will incorrectly adjust nlink if we leave it.
4343	 */
4344	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4345		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4346		    if_deps);
4347		jaddref->ja_state |= COMPLETE;
4348		free_jaddref(jaddref);
4349		return (needsj);
4350	}
4351	/*
4352	 * Leave the head of the list for jsegdeps for fast merging.
4353	 */
4354	if (LIST_FIRST(wkhd) != NULL) {
4355		jaddref->ja_state |= ONWORKLIST;
4356		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4357	} else
4358		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4359
4360	return (needsj);
4361}
4362
4363/*
4364 * Attempt to free a jaddref structure when some work completes.  This
4365 * should only succeed once the entry is written and all dependencies have
4366 * been notified.
4367 */
4368static void
4369free_jaddref(jaddref)
4370	struct jaddref *jaddref;
4371{
4372
4373	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4374		return;
4375	if (jaddref->ja_ref.if_jsegdep)
4376		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4377		    jaddref, jaddref->ja_state);
4378	if (jaddref->ja_state & NEWBLOCK)
4379		LIST_REMOVE(jaddref, ja_bmdeps);
4380	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4381		panic("free_jaddref: Bad state %p(0x%X)",
4382		    jaddref, jaddref->ja_state);
4383	if (jaddref->ja_mkdir != NULL)
4384		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4385	WORKITEM_FREE(jaddref, D_JADDREF);
4386}
4387
4388/*
4389 * Free a jremref structure once it has been written or discarded.
4390 */
4391static void
4392free_jremref(jremref)
4393	struct jremref *jremref;
4394{
4395
4396	if (jremref->jr_ref.if_jsegdep)
4397		free_jsegdep(jremref->jr_ref.if_jsegdep);
4398	if (jremref->jr_state & INPROGRESS)
4399		panic("free_jremref: IO still pending");
4400	WORKITEM_FREE(jremref, D_JREMREF);
4401}
4402
4403/*
4404 * Free a jnewblk structure.
4405 */
4406static void
4407free_jnewblk(jnewblk)
4408	struct jnewblk *jnewblk;
4409{
4410
4411	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4412		return;
4413	LIST_REMOVE(jnewblk, jn_deps);
4414	if (jnewblk->jn_dep != NULL)
4415		panic("free_jnewblk: Dependency still attached.");
4416	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4417}
4418
4419/*
4420 * Cancel a jnewblk which has been made redundant by frag extension.
4421 */
4422static void
4423cancel_jnewblk(jnewblk, wkhd)
4424	struct jnewblk *jnewblk;
4425	struct workhead *wkhd;
4426{
4427	struct jsegdep *jsegdep;
4428
4429	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4430	jsegdep = jnewblk->jn_jsegdep;
4431	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4432		panic("cancel_jnewblk: Invalid state");
4433	jnewblk->jn_jsegdep = NULL;
4434	jnewblk->jn_dep = NULL;
4435	jnewblk->jn_state |= GOINGAWAY;
4436	if (jnewblk->jn_state & INPROGRESS) {
4437		jnewblk->jn_state &= ~INPROGRESS;
4438		WORKLIST_REMOVE(&jnewblk->jn_list);
4439		jwork_insert(wkhd, jsegdep);
4440	} else {
4441		free_jsegdep(jsegdep);
4442		remove_from_journal(&jnewblk->jn_list);
4443	}
4444	wake_worklist(&jnewblk->jn_list);
4445	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4446}
4447
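/*
 * Free a jblkdep according to its underlying type, either jfreeblk or
 * jtrunc.
 */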
4448static void
4449free_jblkdep(jblkdep)
4450	struct jblkdep *jblkdep;
4451{
4452
4453	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4454		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4455	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4456		WORKITEM_FREE(jblkdep, D_JTRUNC);
4457	else
4458		panic("free_jblkdep: Unexpected type %s",
4459		    TYPENAME(jblkdep->jb_list.wk_type));
4460}
4461
4462/*
4463 * Free a single jseg once it is no longer referenced in memory or on
4464 * disk.  Reclaim journal blocks and dependencies waiting for the segment
4465 * to disappear.
4466 */
4467static void
4468free_jseg(jseg, jblocks)
4469	struct jseg *jseg;
4470	struct jblocks *jblocks;
4471{
4472	struct freework *freework;
4473
4474	/*
4475	 * Free freework structures that were lingering to indicate freed
4476	 * indirect blocks that forced journal write ordering on reallocate.
4477	 */
4478	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4479		indirblk_remove(freework);
4480	if (jblocks->jb_oldestseg == jseg)
4481		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4482	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4483	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4484	KASSERT(LIST_EMPTY(&jseg->js_entries),
4485	    ("free_jseg: Freed jseg has valid entries."));
4486	WORKITEM_FREE(jseg, D_JSEG);
4487}
4488
4489/*
4490 * Free all jsegs that meet the criteria for being reclaimed and update
4491 * oldestseg.
4492 */
4493static void
4494free_jsegs(jblocks)
4495	struct jblocks *jblocks;
4496{
4497	struct jseg *jseg;
4498
4499	/*
4500	 * Free only those jsegs which have none allocated before them to
4501	 * preserve the journal space ordering.
4502	 */
4503	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4504		/*
4505		 * Only reclaim space when nothing depends on this journal
4506		 * set and another set has written that it is no longer
4507		 * valid.
4508		 */
4509		if (jseg->js_refs != 0) {
4510			jblocks->jb_oldestseg = jseg;
4511			return;
4512		}
4513		if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4514			break;
4515		if (jseg->js_seq > jblocks->jb_oldestwrseq)
4516			break;
4517		/*
4518		 * We can free jsegs that didn't write entries when
4519		 * oldestwrseq == js_seq.
4520		 */
4521		if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4522		    jseg->js_cnt != 0)
4523			break;
4524		free_jseg(jseg, jblocks);
4525	}
4526	/*
4527	 * If we exited the loop above we still must discover the
4528	 * oldest valid segment.
4529	 */
4530	if (jseg)
4531		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4532		     jseg = TAILQ_NEXT(jseg, js_next))
4533			if (jseg->js_refs != 0)
4534				break;
4535	jblocks->jb_oldestseg = jseg;
4536	/*
4537	 * The journal has no valid records but some jsegs may still be
4538	 * waiting on oldestwrseq to advance.  We force a small record
4539	 * out to permit these lingering records to be reclaimed.
4540	 */
4541	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4542		jblocks->jb_needseg = 1;
4543}
4544
4545/*
4546 * Release one reference to a jseg and free it if the count reaches 0.  This
4547 * should eventually reclaim journal space as well.
4548 */
4549static void
4550rele_jseg(jseg)
4551	struct jseg *jseg;
4552{
4553
4554	KASSERT(jseg->js_refs > 0,
4555	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4556	if (--jseg->js_refs != 0)
4557		return;
4558	free_jsegs(jseg->js_jblocks);
4559}
4560
4561/*
4562 * Release a jsegdep and decrement the jseg count.
4563 */
4564static void
4565free_jsegdep(jsegdep)
4566	struct jsegdep *jsegdep;
4567{
4568
4569	if (jsegdep->jd_seg)
4570		rele_jseg(jsegdep->jd_seg);
4571	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4572}
4573
4574/*
4575 * Wait for a journal item to make it to disk.  Initiate journal processing
4576 * if required.
4577 */
4578static int
4579jwait(wk, waitfor)
4580	struct worklist *wk;
4581	int waitfor;
4582{
4583
4584	LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4585	/*
4586	 * Blocking journal waits cause slow synchronous behavior.  Record
4587	 * stats on the frequency of these blocking operations.
4588	 */
4589	if (waitfor == MNT_WAIT) {
4590		stat_journal_wait++;
4591		switch (wk->wk_type) {
4592		case D_JREMREF:
4593		case D_JMVREF:
4594			stat_jwait_filepage++;
4595			break;
4596		case D_JTRUNC:
4597		case D_JFREEBLK:
4598			stat_jwait_freeblks++;
4599			break;
4600		case D_JNEWBLK:
4601			stat_jwait_newblk++;
4602			break;
4603		case D_JADDREF:
4604			stat_jwait_inode++;
4605			break;
4606		default:
4607			break;
4608		}
4609	}
4610	/*
4611	 * If IO has not started we process the journal.  We can't mark the
4612	 * worklist item as IOWAITING because we drop the lock while
4613	 * processing the journal and the worklist entry may be freed after
4614	 * this point.  The caller may call back in and re-issue the request.
4615	 */
4616	if ((wk->wk_state & INPROGRESS) == 0) {
4617		softdep_process_journal(wk->wk_mp, wk, waitfor);
4618		if (waitfor != MNT_WAIT)
4619			return (EBUSY);
4620		return (0);
4621	}
4622	if (waitfor != MNT_WAIT)
4623		return (EBUSY);
4624	wait_worklist(wk, "jwait");
4625	return (0);
4626}
4627
4628/*
4629 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4630 * appropriate.  This is a convenience function to reduce duplicate code
4631 * for the setup and revert functions below.
4632 */
4633static struct inodedep *
4634inodedep_lookup_ip(ip)
4635	struct inode *ip;
4636{
4637	struct inodedep *inodedep;
4638	int dflags;
4639
4640	KASSERT(ip->i_nlink >= ip->i_effnlink,
4641	    ("inodedep_lookup_ip: bad delta"));
4642	dflags = DEPALLOC;
4643	if (IS_SNAPSHOT(ip))
4644		dflags |= NODELAY;
4645	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags,
4646	    &inodedep);
4647	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4648	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4649
4650	return (inodedep);
4651}
4652
4653/*
4654 * Called prior to creating a new inode and linking it to a directory.  The
4655 * jaddref structure must already be allocated by softdep_setup_inomapdep
4656 * and it is discovered here so we can initialize the mode and update
4657 * nlinkdelta.
4658 */
4659void
4660softdep_setup_create(dp, ip)
4661	struct inode *dp;
4662	struct inode *ip;
4663{
4664	struct inodedep *inodedep;
4665	struct jaddref *jaddref;
4666	struct vnode *dvp;
4667
4668	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4669	    ("softdep_setup_create called on non-softdep filesystem"));
4670	KASSERT(ip->i_nlink == 1,
4671	    ("softdep_setup_create: Invalid link count."));
4672	dvp = ITOV(dp);
4673	ACQUIRE_LOCK(dp->i_ump);
4674	inodedep = inodedep_lookup_ip(ip);
4675	if (DOINGSUJ(dvp)) {
4676		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4677		    inoreflst);
4678		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4679		    ("softdep_setup_create: No addref structure present."));
4680	}
4681	softdep_prelink(dvp, NULL);
4682	FREE_LOCK(dp->i_ump);
4683}
4684
4685/*
4686 * Create a jaddref structure to track the addition of a DOTDOT link when
4687 * we are reparenting an inode as part of a rename.  This jaddref will be
4688 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4689 * non-journaling softdep.
4690 */
4691void
4692softdep_setup_dotdot_link(dp, ip)
4693	struct inode *dp;
4694	struct inode *ip;
4695{
4696	struct inodedep *inodedep;
4697	struct jaddref *jaddref;
4698	struct vnode *dvp;
4699
4700	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4701	    ("softdep_setup_dotdot_link called on non-softdep filesystem"));
4702	dvp = ITOV(dp);
4703	jaddref = NULL;
4704	/*
4705	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4706	 * is used as a normal link would be.
4707	 */
4708	if (DOINGSUJ(dvp))
4709		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4710		    dp->i_effnlink - 1, dp->i_mode);
4711	ACQUIRE_LOCK(dp->i_ump);
4712	inodedep = inodedep_lookup_ip(dp);
4713	if (jaddref)
4714		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4715		    if_deps);
4716	softdep_prelink(dvp, ITOV(ip));
4717	FREE_LOCK(dp->i_ump);
4718}
4719
4720/*
4721 * Create a jaddref structure to track a new link to an inode.  The directory
4722 * offset is not known until softdep_setup_directory_add or
4723 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4724 * softdep.
4725 */
4726void
4727softdep_setup_link(dp, ip)
4728	struct inode *dp;
4729	struct inode *ip;
4730{
4731	struct inodedep *inodedep;
4732	struct jaddref *jaddref;
4733	struct vnode *dvp;
4734
4735	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4736	    ("softdep_setup_link called on non-softdep filesystem"));
4737	dvp = ITOV(dp);
4738	jaddref = NULL;
4739	if (DOINGSUJ(dvp))
4740		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4741		    ip->i_mode);
4742	ACQUIRE_LOCK(dp->i_ump);
4743	inodedep = inodedep_lookup_ip(ip);
4744	if (jaddref)
4745		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4746		    if_deps);
4747	softdep_prelink(dvp, ITOV(ip));
4748	FREE_LOCK(dp->i_ump);
4749}
4750
4751/*
4752 * Called to create the jaddref structures to track . and .. references as
4753 * well as lookup and further initialize the incomplete jaddref created
4754 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4755 * nlinkdelta for non-journaling softdep.
4756 */
4757void
4758softdep_setup_mkdir(dp, ip)
4759	struct inode *dp;
4760	struct inode *ip;
4761{
4762	struct inodedep *inodedep;
4763	struct jaddref *dotdotaddref;
4764	struct jaddref *dotaddref;
4765	struct jaddref *jaddref;
4766	struct vnode *dvp;
4767
4768	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4769	    ("softdep_setup_mkdir called on non-softdep filesystem"));
4770	dvp = ITOV(dp);
4771	dotaddref = dotdotaddref = NULL;
4772	if (DOINGSUJ(dvp)) {
4773		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4774		    ip->i_mode);
4775		dotaddref->ja_state |= MKDIR_BODY;
4776		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4777		    dp->i_effnlink - 1, dp->i_mode);
4778		dotdotaddref->ja_state |= MKDIR_PARENT;
4779	}
4780	ACQUIRE_LOCK(dp->i_ump);
4781	inodedep = inodedep_lookup_ip(ip);
4782	if (DOINGSUJ(dvp)) {
4783		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4784		    inoreflst);
4785		KASSERT(jaddref != NULL,
4786		    ("softdep_setup_mkdir: No addref structure present."));
4787		KASSERT(jaddref->ja_parent == dp->i_number,
4788		    ("softdep_setup_mkdir: bad parent %ju",
4789		    (uintmax_t)jaddref->ja_parent));
4790		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4791		    if_deps);
4792	}
4793	inodedep = inodedep_lookup_ip(dp);
4794	if (DOINGSUJ(dvp))
4795		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4796		    &dotdotaddref->ja_ref, if_deps);
4797	softdep_prelink(ITOV(dp), NULL);
4798	FREE_LOCK(dp->i_ump);
4799}
4800
4801/*
4802 * Called to track nlinkdelta of the inode and parent directories prior to
4803 * unlinking a directory.
4804 */
4805void
4806softdep_setup_rmdir(dp, ip)
4807	struct inode *dp;
4808	struct inode *ip;
4809{
4810	struct vnode *dvp;
4811
4812	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4813	    ("softdep_setup_rmdir called on non-softdep filesystem"));
4814	dvp = ITOV(dp);
4815	ACQUIRE_LOCK(dp->i_ump);
4816	(void) inodedep_lookup_ip(ip);
4817	(void) inodedep_lookup_ip(dp);
4818	softdep_prelink(dvp, ITOV(ip));
4819	FREE_LOCK(dp->i_ump);
4820}
4821
4822/*
4823 * Called to track nlinkdelta of the inode and parent directories prior to
4824 * unlink.
4825 */
4826void
4827softdep_setup_unlink(dp, ip)
4828	struct inode *dp;
4829	struct inode *ip;
4830{
4831	struct vnode *dvp;
4832
4833	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4834	    ("softdep_setup_unlink called on non-softdep filesystem"));
4835	dvp = ITOV(dp);
4836	ACQUIRE_LOCK(dp->i_ump);
4837	(void) inodedep_lookup_ip(ip);
4838	(void) inodedep_lookup_ip(dp);
4839	softdep_prelink(dvp, ITOV(ip));
4840	FREE_LOCK(dp->i_ump);
4841}
4842
4843/*
4844 * Called to release the journal structures created by a failed non-directory
4845 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4846 */
4847void
4848softdep_revert_create(dp, ip)
4849	struct inode *dp;
4850	struct inode *ip;
4851{
4852	struct inodedep *inodedep;
4853	struct jaddref *jaddref;
4854	struct vnode *dvp;
4855
4856	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4857	    ("softdep_revert_create called on non-softdep filesystem"));
4858	dvp = ITOV(dp);
4859	ACQUIRE_LOCK(dp->i_ump);
4860	inodedep = inodedep_lookup_ip(ip);
4861	if (DOINGSUJ(dvp)) {
4862		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4863		    inoreflst);
4864		KASSERT(jaddref->ja_parent == dp->i_number,
4865		    ("softdep_revert_create: addref parent mismatch"));
4866		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4867	}
4868	FREE_LOCK(dp->i_ump);
4869}
4870
4871/*
4872 * Called to release the journal structures created by a failed link
4873 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4874 */
4875void
4876softdep_revert_link(dp, ip)
4877	struct inode *dp;
4878	struct inode *ip;
4879{
4880	struct inodedep *inodedep;
4881	struct jaddref *jaddref;
4882	struct vnode *dvp;
4883
4884	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4885	    ("softdep_revert_link called on non-softdep filesystem"));
4886	dvp = ITOV(dp);
4887	ACQUIRE_LOCK(dp->i_ump);
4888	inodedep = inodedep_lookup_ip(ip);
4889	if (DOINGSUJ(dvp)) {
4890		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4891		    inoreflst);
4892		KASSERT(jaddref->ja_parent == dp->i_number,
4893		    ("softdep_revert_link: addref parent mismatch"));
4894		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4895	}
4896	FREE_LOCK(dp->i_ump);
4897}
4898
4899/*
4900 * Called to release the journal structures created by a failed mkdir
4901 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4902 */
4903void
4904softdep_revert_mkdir(dp, ip)
4905	struct inode *dp;
4906	struct inode *ip;
4907{
4908	struct inodedep *inodedep;
4909	struct jaddref *jaddref;
4910	struct jaddref *dotaddref;
4911	struct vnode *dvp;
4912
4913	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4914	    ("softdep_revert_mkdir called on non-softdep filesystem"));
4915	dvp = ITOV(dp);
4916
4917	ACQUIRE_LOCK(dp->i_ump);
4918	inodedep = inodedep_lookup_ip(dp);
4919	if (DOINGSUJ(dvp)) {
4920		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4921		    inoreflst);
4922		KASSERT(jaddref->ja_parent == ip->i_number,
4923		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4924		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4925	}
4926	inodedep = inodedep_lookup_ip(ip);
4927	if (DOINGSUJ(dvp)) {
4928		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4929		    inoreflst);
4930		KASSERT(jaddref->ja_parent == dp->i_number,
4931		    ("softdep_revert_mkdir: addref parent mismatch"));
4932		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4933		    inoreflst, if_deps);
4934		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4935		KASSERT(dotaddref->ja_parent == ip->i_number,
4936		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4937		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4938	}
4939	FREE_LOCK(dp->i_ump);
4940}
4941
4942/*
4943 * Called to correct nlinkdelta after a failed rmdir.
4944 */
4945void
4946softdep_revert_rmdir(dp, ip)
4947	struct inode *dp;
4948	struct inode *ip;
4949{
4950
4951	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4952	    ("softdep_revert_rmdir called on non-softdep filesystem"));
4953	ACQUIRE_LOCK(dp->i_ump);
4954	(void) inodedep_lookup_ip(ip);
4955	(void) inodedep_lookup_ip(dp);
4956	FREE_LOCK(dp->i_ump);
4957}
4958
4959/*
4960 * Protecting the freemaps (or bitmaps).
4961 *
4962 * To eliminate the need to execute fsck before mounting a filesystem
4963 * after a power failure, one must (conservatively) guarantee that the
4964 * on-disk copy of the bitmaps never indicate that a live inode or block is
4965 * free.  So, when a block or inode is allocated, the bitmap should be
4966 * updated (on disk) before any new pointers.  When a block or inode is
4967 * freed, the bitmap should not be updated until all pointers have been
4968 * reset.  The latter dependency is handled by the delayed de-allocation
4969 * approach described below for block and inode de-allocation.  The former
4970 * dependency is handled by calling the following procedure when a block or
4971 * inode is allocated. When an inode is allocated an "inodedep" is created
4972 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4973 * Each "inodedep" is also inserted into the hash indexing structure so
4974 * that any additional link additions can be made dependent on the inode
4975 * allocation.
4976 *
4977 * The ufs filesystem maintains a number of free block counts (e.g., per
4978 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4979 * in addition to the bitmaps.  These counts are used to improve efficiency
4980 * during allocation and therefore must be consistent with the bitmaps.
4981 * There is no convenient way to guarantee post-crash consistency of these
4982 * counts with simple update ordering, for two main reasons: (1) The counts
4983 * and bitmaps for a single cylinder group block are not in the same disk
4984 * sector.  If a disk write is interrupted (e.g., by power failure), one may
4985 * be written and the other not.  (2) Some of the counts are located in the
4986 * superblock rather than the cylinder group block. So, we focus our soft
4987 * updates implementation on protecting the bitmaps. When mounting a
4988 * filesystem, we recompute the auxiliary counts from the bitmaps.
4989 */
4990
4991/*
4992 * Called just after updating the cylinder group block to allocate an inode.
4993 */
4994void
4995softdep_setup_inomapdep(bp, ip, newinum, mode)
4996	struct buf *bp;		/* buffer for cylgroup block with inode map */
4997	struct inode *ip;	/* inode related to allocation */
4998	ino_t newinum;		/* new inode number being allocated */
4999	int mode;
5000{
5001	struct inodedep *inodedep;
5002	struct bmsafemap *bmsafemap;
5003	struct jaddref *jaddref;
5004	struct mount *mp;
5005	struct fs *fs;
5006
5007	mp = UFSTOVFS(ip->i_ump);
5008	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5009	    ("softdep_setup_inomapdep called on non-softdep filesystem"));
5010	fs = ip->i_ump->um_fs;
5011	jaddref = NULL;
5012
5013	/*
5014	 * Allocate the journal reference add structure so that the bitmap
5015	 * can be dependent on it.
5016	 */
5017	if (MOUNTEDSUJ(mp)) {
5018		jaddref = newjaddref(ip, newinum, 0, 0, mode);
5019		jaddref->ja_state |= NEWBLOCK;
5020	}
5021
5022	/*
5023	 * Create a dependency for the newly allocated inode.
5024	 * Panic if it already exists as something is seriously wrong.
5025	 * Otherwise add it to the dependency list for the buffer holding
5026	 * the cylinder group map from which it was allocated.
5027	 *
5028	 * We have to preallocate a bmsafemap entry in case it is needed
5029	 * in bmsafemap_lookup since once we allocate the inodedep, we
5030	 * have to finish initializing it before we can FREE_LOCK().
5031	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
5032	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
5033	 * creating the inodedep as it can be freed during the time
5034	 * that we FREE_LOCK() while allocating the inodedep. We must
5035	 * call workitem_alloc() before entering the locked section as
5036	 * it also acquires the lock and we must avoid trying to do so
5037	 * recursively.
5038	 */
5039	bmsafemap = malloc(sizeof(struct bmsafemap),
5040	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5041	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5042	ACQUIRE_LOCK(ip->i_ump);
5043	if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep)))
5044		panic("softdep_setup_inomapdep: dependency %p for new "
5045		    "inode already exists", inodedep);
5046	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
5047	if (jaddref) {
5048		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
5049		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5050		    if_deps);
5051	} else {
5052		inodedep->id_state |= ONDEPLIST;
5053		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
5054	}
5055	inodedep->id_bmsafemap = bmsafemap;
5056	inodedep->id_state &= ~DEPCOMPLETE;
5057	FREE_LOCK(ip->i_ump);
5058}
5059
5060/*
5061 * Called just after updating the cylinder group block to
5062 * allocate block or fragment.
5063 */
5064void
5065softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
5066	struct buf *bp;		/* buffer for cylgroup block with block map */
5067	struct mount *mp;	/* filesystem doing allocation */
5068	ufs2_daddr_t newblkno;	/* number of newly allocated block */
5069	int frags;		/* Number of fragments. */
5070	int oldfrags;		/* Previous number of fragments for extend. */
5071{
5072	struct newblk *newblk;
5073	struct bmsafemap *bmsafemap;
5074	struct jnewblk *jnewblk;
5075	struct ufsmount *ump;
5076	struct fs *fs;
5077
5078	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5079	    ("softdep_setup_blkmapdep called on non-softdep filesystem"));
5080	ump = VFSTOUFS(mp);
5081	fs = ump->um_fs;
5082	jnewblk = NULL;
5083	/*
5084	 * Create a dependency for the newly allocated block.
5085	 * Add it to the dependency list for the buffer holding
5086	 * the cylinder group map from which it was allocated.
5087	 */
5088	if (MOUNTEDSUJ(mp)) {
5089		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
5090		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
5091		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
5092		jnewblk->jn_state = ATTACHED;
5093		jnewblk->jn_blkno = newblkno;
5094		jnewblk->jn_frags = frags;
5095		jnewblk->jn_oldfrags = oldfrags;
5096#ifdef SUJ_DEBUG
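		/*
		 * Sanity check: none of the fragments covered by this
		 * jnewblk may still be marked free in the cylinder group
		 * bitmap.
		 */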
5097		{
5098			struct cg *cgp;
5099			uint8_t *blksfree;
5100			long bno;
5101			int i;
5102
5103			cgp = (struct cg *)bp->b_data;
5104			blksfree = cg_blksfree(cgp);
5105			bno = dtogd(fs, jnewblk->jn_blkno);
5106			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
5107			    i++) {
5108				if (isset(blksfree, bno + i))
5109					panic("softdep_setup_blkmapdep: "
5110					    "free fragment %d from %d-%d "
5111					    "state 0x%X dep %p", i,
5112					    jnewblk->jn_oldfrags,
5113					    jnewblk->jn_frags,
5114					    jnewblk->jn_state,
5115					    jnewblk->jn_dep);
5116			}
5117		}
5118#endif
5119	}
5120
5121	CTR3(KTR_SUJ,
5122	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
5123	    newblkno, frags, oldfrags);
5124	ACQUIRE_LOCK(ump);
5125	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
5126		panic("softdep_setup_blkmapdep: found block");
5127	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
5128	    dtog(fs, newblkno), NULL);
5129	if (jnewblk) {
5130		jnewblk->jn_dep = (struct worklist *)newblk;
5131		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
5132	} else {
5133		newblk->nb_state |= ONDEPLIST;
5134		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
5135	}
5136	newblk->nb_bmsafemap = bmsafemap;
5137	newblk->nb_jnewblk = jnewblk;
5138	FREE_LOCK(ump);
5139}
5140
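/*
 * Hash a cylinder group number into the per-mount bmsafemap hash table.
 */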
5141#define	BMSAFEMAP_HASH(ump, cg) \
5142      (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
5143
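/*
 * Search a bmsafemap hash chain for the entry matching cylinder group 'cg'.
 * Returns 1 and sets *bmsafemapp if found, otherwise returns 0.
 */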
5144static int
5145bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
5146	struct bmsafemap_hashhead *bmsafemaphd;
5147	int cg;
5148	struct bmsafemap **bmsafemapp;
5149{
5150	struct bmsafemap *bmsafemap;
5151
5152	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
5153		if (bmsafemap->sm_cg == cg)
5154			break;
5155	if (bmsafemap) {
5156		*bmsafemapp = bmsafemap;
5157		return (1);
5158	}
5159	*bmsafemapp = NULL;
5160
5161	return (0);
5162}
5163
5164/*
5165 * Find the bmsafemap associated with a cylinder group buffer.
5166 * If none exists, create one. The buffer must be locked when
5167 * this routine is called and this routine must be called with
5168 * the softdep lock held. To avoid giving up the lock while
5169 * allocating a new bmsafemap, a preallocated bmsafemap may be
5170 * provided. If it is provided but not needed, it is freed.
5171 */
5172static struct bmsafemap *
5173bmsafemap_lookup(mp, bp, cg, newbmsafemap)
5174	struct mount *mp;
5175	struct buf *bp;
5176	int cg;
5177	struct bmsafemap *newbmsafemap;
5178{
5179	struct bmsafemap_hashhead *bmsafemaphd;
5180	struct bmsafemap *bmsafemap, *collision;
5181	struct worklist *wk;
5182	struct ufsmount *ump;
5183
5184	ump = VFSTOUFS(mp);
5185	LOCK_OWNED(ump);
5186	KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5187	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5188		if (wk->wk_type == D_BMSAFEMAP) {
5189			if (newbmsafemap)
5190				WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5191			return (WK_BMSAFEMAP(wk));
5192		}
5193	}
5194	bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5195	if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5196		if (newbmsafemap)
5197			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5198		return (bmsafemap);
5199	}
5200	if (newbmsafemap) {
5201		bmsafemap = newbmsafemap;
5202	} else {
5203		FREE_LOCK(ump);
5204		bmsafemap = malloc(sizeof(struct bmsafemap),
5205			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5206		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5207		ACQUIRE_LOCK(ump);
5208	}
5209	bmsafemap->sm_buf = bp;
5210	LIST_INIT(&bmsafemap->sm_inodedephd);
5211	LIST_INIT(&bmsafemap->sm_inodedepwr);
5212	LIST_INIT(&bmsafemap->sm_newblkhd);
5213	LIST_INIT(&bmsafemap->sm_newblkwr);
5214	LIST_INIT(&bmsafemap->sm_jaddrefhd);
5215	LIST_INIT(&bmsafemap->sm_jnewblkhd);
5216	LIST_INIT(&bmsafemap->sm_freehd);
5217	LIST_INIT(&bmsafemap->sm_freewr);
5218	if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
5219		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5220		return (collision);
5221	}
5222	bmsafemap->sm_cg = cg;
5223	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5224	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
5225	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5226	return (bmsafemap);
5227}
5228
5229/*
5230 * Direct block allocation dependencies.
5231 *
5232 * When a new block is allocated, the corresponding disk locations must be
5233 * initialized (with zeros or new data) before the on-disk inode points to
5234 * them.  Also, the freemap from which the block was allocated must be
5235 * updated (on disk) before the inode's pointer. These two dependencies are
5236 * independent of each other and are needed for all file blocks and indirect
5237 * blocks that are pointed to directly by the inode.  Just before the
5238 * "in-core" version of the inode is updated with a newly allocated block
5239 * number, a procedure (below) is called to setup allocation dependency
5240 * structures.  These structures are removed when the corresponding
5241 * dependencies are satisfied or when the block allocation becomes obsolete
5242 * (i.e., the file is deleted, the block is de-allocated, or the block is a
5243 * fragment that gets upgraded).  All of these cases are handled in
5244 * procedures described later.
5245 *
5246 * When a file extension causes a fragment to be upgraded, either to a larger
5247 * fragment or to a full block, the on-disk location may change (if the
5248 * previous fragment could not simply be extended). In this case, the old
5249 * fragment must be de-allocated, but not until after the inode's pointer has
5250 * been updated. In most cases, this is handled by later procedures, which
5251 * will construct a "freefrag" structure to be added to the workitem queue
5252 * when the inode update is complete (or obsolete).  The main exception to
5253 * this is when an allocation occurs while a pending allocation dependency
5254 * (for the same block pointer) remains.  This case is handled in the main
5255 * allocation dependency setup procedure by immediately freeing the
5256 * unreferenced fragments.
5257 */
5258void
5259softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5260	struct inode *ip;	/* inode to which block is being added */
5261	ufs_lbn_t off;		/* block pointer within inode */
5262	ufs2_daddr_t newblkno;	/* disk block number being added */
5263	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
5264	long newsize;		/* size of new block */
5265	long oldsize;		/* size of old block */
5266	struct buf *bp;		/* bp for allocated block */
5267{
5268	struct allocdirect *adp, *oldadp;
5269	struct allocdirectlst *adphead;
5270	struct freefrag *freefrag;
5271	struct inodedep *inodedep;
5272	struct pagedep *pagedep;
5273	struct jnewblk *jnewblk;
5274	struct newblk *newblk;
5275	struct mount *mp;
5276	ufs_lbn_t lbn;
5277
5278	lbn = bp->b_lblkno;
5279	mp = UFSTOVFS(ip->i_ump);
5280	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5281	    ("softdep_setup_allocdirect called on non-softdep filesystem"));
5282	if (oldblkno && oldblkno != newblkno)
5283		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5284	else
5285		freefrag = NULL;
5286
5287	CTR6(KTR_SUJ,
5288	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5289	    "off %jd newsize %ld oldsize %d",
5290	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5291	ACQUIRE_LOCK(ip->i_ump);
5292	if (off >= NDADDR) {
5293		if (lbn > 0)
5294			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5295			    lbn, off);
5296		/* allocating an indirect block */
5297		if (oldblkno != 0)
5298			panic("softdep_setup_allocdirect: non-zero indir");
5299	} else {
5300		if (off != lbn)
5301			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5302			    lbn, off);
5303		/*
5304		 * Allocating a direct block.
5305		 *
5306		 * If we are allocating a directory block, then we must
5307		 * allocate an associated pagedep to track additions and
5308		 * deletions.
5309		 */
5310		if ((ip->i_mode & IFMT) == IFDIR)
5311			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5312			    &pagedep);
5313	}
5314	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5315		panic("softdep_setup_allocdirect: lost block");
5316	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5317	    ("softdep_setup_allocdirect: newblk already initialized"));
5318	/*
5319	 * Convert the newblk to an allocdirect.
5320	 */
5321	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5322	adp = (struct allocdirect *)newblk;
5323	newblk->nb_freefrag = freefrag;
5324	adp->ad_offset = off;
5325	adp->ad_oldblkno = oldblkno;
5326	adp->ad_newsize = newsize;
5327	adp->ad_oldsize = oldsize;
5328
5329	/*
5330	 * Finish initializing the journal.
5331	 */
5332	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5333		jnewblk->jn_ino = ip->i_number;
5334		jnewblk->jn_lbn = lbn;
5335		add_to_journal(&jnewblk->jn_list);
5336	}
5337	if (freefrag && freefrag->ff_jdep != NULL &&
5338	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5339		add_to_journal(freefrag->ff_jdep);
5340	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5341	adp->ad_inodedep = inodedep;
5342
5343	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5344	/*
5345	 * The list of allocdirects must be kept in sorted and ascending
5346	 * order so that the rollback routines can quickly determine the
5347	 * first uncommitted block (the size of the file stored on disk
5348	 * ends at the end of the lowest committed fragment, or if there
5349	 * are no fragments, at the end of the highest committed block).
5350	 * Since files generally grow, the typical case is that the new
5351	 * block is to be added at the end of the list. We speed this
5352	 * special case by checking against the last allocdirect in the
5353	 * list before laboriously traversing the list looking for the
5354	 * insertion point.
5355	 */
5356	adphead = &inodedep->id_newinoupdt;
5357	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5358	if (oldadp == NULL || oldadp->ad_offset <= off) {
5359		/* insert at end of list */
5360		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5361		if (oldadp != NULL && oldadp->ad_offset == off)
5362			allocdirect_merge(adphead, adp, oldadp);
5363		FREE_LOCK(ip->i_ump);
5364		return;
5365	}
5366	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5367		if (oldadp->ad_offset >= off)
5368			break;
5369	}
5370	if (oldadp == NULL)
5371		panic("softdep_setup_allocdirect: lost entry");
5372	/* insert in middle of list */
5373	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5374	if (oldadp->ad_offset == off)
5375		allocdirect_merge(adphead, adp, oldadp);
5376
5377	FREE_LOCK(ip->i_ump);
5378}
5379
5380/*
5381 * Merge a newer and older journal record to be stored either in a
5382 * newblock or freefrag.  This handles aggregating journal records for
5383 * fragment allocation into a second record as well as replacing a
5384 * journal free with an aborted journal allocation.  A segment for the
5385 * oldest record will be placed on wkhd if it has been written.  If not
5386 * the segment for the newer record will suffice.
5387 */
5388static struct worklist *
5389jnewblk_merge(new, old, wkhd)
5390	struct worklist *new;
5391	struct worklist *old;
5392	struct workhead *wkhd;
5393{
5394	struct jnewblk *njnewblk;
5395	struct jnewblk *jnewblk;
5396
5397	/* Handle NULLs to simplify callers. */
5398	if (new == NULL)
5399		return (old);
5400	if (old == NULL)
5401		return (new);
5402	/* Replace a jfreefrag with a jnewblk. */
5403	if (new->wk_type == D_JFREEFRAG) {
5404		if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5405			panic("jnewblk_merge: blkno mismatch: %p, %p",
5406			    old, new);
5407		cancel_jfreefrag(WK_JFREEFRAG(new));
5408		return (old);
5409	}
5410	if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5411		panic("jnewblk_merge: Bad type: old %d new %d\n",
5412		    old->wk_type, new->wk_type);
5413	/*
5414	 * Handle merging of two jnewblk records that describe
5415	 * different sets of fragments in the same block.
5416	 */
5417	jnewblk = WK_JNEWBLK(old);
5418	njnewblk = WK_JNEWBLK(new);
5419	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5420		panic("jnewblk_merge: Merging disparate blocks.");
5421	/*
5422	 * The record may be rolled back in the cg.
5423	 */
5424	if (jnewblk->jn_state & UNDONE) {
5425		jnewblk->jn_state &= ~UNDONE;
5426		njnewblk->jn_state |= UNDONE;
5427		njnewblk->jn_state &= ~ATTACHED;
5428	}
5429	/*
5430	 * We modify the newer addref and free the older so that if neither
5431	 * has been written the most up-to-date copy will be on disk.  If
5432	 * both have been written but rolled back we only temporarily need
5433	 * one of them to fix the bits when the cg write completes.
5434	 */
5435	jnewblk->jn_state |= ATTACHED | COMPLETE;
5436	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5437	cancel_jnewblk(jnewblk, wkhd);
5438	WORKLIST_REMOVE(&jnewblk->jn_list);
5439	free_jnewblk(jnewblk);
5440	return (new);
5441}
5442
5443/*
5444 * Replace an old allocdirect dependency with a newer one.
5445 * This routine must be called with the soft updates lock held.
5446 */
5447static void
5448allocdirect_merge(adphead, newadp, oldadp)
5449	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
5450	struct allocdirect *newadp;	/* allocdirect being added */
5451	struct allocdirect *oldadp;	/* existing allocdirect being checked */
5452{
5453	struct worklist *wk;
5454	struct freefrag *freefrag;
5455
5456	freefrag = NULL;
5457	LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
5458	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5459	    newadp->ad_oldsize != oldadp->ad_newsize ||
5460	    newadp->ad_offset >= NDADDR)
5461		panic("%s %jd != new %jd || old size %ld != new %ld",
5462		    "allocdirect_merge: old blkno",
5463		    (intmax_t)newadp->ad_oldblkno,
5464		    (intmax_t)oldadp->ad_newblkno,
5465		    newadp->ad_oldsize, oldadp->ad_newsize);
5466	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5467	newadp->ad_oldsize = oldadp->ad_oldsize;
5468	/*
5469	 * If the old dependency had a fragment to free or had never
5470	 * previously had a block allocated, then the new dependency
5471	 * can immediately post its freefrag and adopt the old freefrag.
5472	 * This action is done by swapping the freefrag dependencies.
5473	 * The new dependency gains the old one's freefrag, and the
5474	 * old one gets the new one and then immediately puts it on
5475	 * the worklist when it is freed by free_newblk. It is
5476	 * not possible to do this swap when the old dependency had a
5477	 * non-zero size but no previous fragment to free. This condition
5478	 * arises when the new block is an extension of the old block.
5479	 * Here, the first part of the fragment allocated to the new
5480	 * dependency is part of the block currently claimed on disk by
5481	 * the old dependency, so cannot legitimately be freed until the
5482	 * conditions for the new dependency are fulfilled.
5483	 */
5484	freefrag = newadp->ad_freefrag;
5485	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5486		newadp->ad_freefrag = oldadp->ad_freefrag;
5487		oldadp->ad_freefrag = freefrag;
5488	}
5489	/*
5490	 * If we are tracking a new directory-block allocation,
5491	 * move it from the old allocdirect to the new allocdirect.
5492	 */
5493	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5494		WORKLIST_REMOVE(wk);
5495		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5496			panic("allocdirect_merge: extra newdirblk");
5497		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5498	}
5499	TAILQ_REMOVE(adphead, oldadp, ad_next);
5500	/*
5501	 * We need to move any journal dependencies over to the freefrag
5502	 * that releases this block if it exists.  Otherwise we are
5503	 * extending an existing block and we'll wait until that is
5504	 * complete to release the journal space and extend the
5505	 * new journal to cover this old space as well.
5506	 */
5507	if (freefrag == NULL) {
5508		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5509			panic("allocdirect_merge: %jd != %jd",
5510			    oldadp->ad_newblkno, newadp->ad_newblkno);
5511		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5512		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5513		    &oldadp->ad_block.nb_jnewblk->jn_list,
5514		    &newadp->ad_block.nb_jwork);
5515		oldadp->ad_block.nb_jnewblk = NULL;
5516		cancel_newblk(&oldadp->ad_block, NULL,
5517		    &newadp->ad_block.nb_jwork);
5518	} else {
5519		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5520		    &freefrag->ff_list, &freefrag->ff_jwork);
5521		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5522		    &freefrag->ff_jwork);
5523	}
5524	free_newblk(&oldadp->ad_block);
5525}
5526
5527/*
5528 * Allocate a jfreefrag structure to journal a single block free.
5529 */
5530static struct jfreefrag *
5531newjfreefrag(freefrag, ip, blkno, size, lbn)
5532	struct freefrag *freefrag;
5533	struct inode *ip;
5534	ufs2_daddr_t blkno;
5535	long size;
5536	ufs_lbn_t lbn;
5537{
5538	struct jfreefrag *jfreefrag;
5539	struct fs *fs;
5540
5541	fs = ip->i_fs;
5542	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5543	    M_SOFTDEP_FLAGS);
5544	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
5545	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5546	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5547	jfreefrag->fr_ino = ip->i_number;
5548	jfreefrag->fr_lbn = lbn;
5549	jfreefrag->fr_blkno = blkno;
5550	jfreefrag->fr_frags = numfrags(fs, size);
5551	jfreefrag->fr_freefrag = freefrag;
5552
5553	return (jfreefrag);
5554}
5555
5556/*
5557 * Allocate a new freefrag structure.
5558 */
5559static struct freefrag *
5560newfreefrag(ip, blkno, size, lbn)
5561	struct inode *ip;
5562	ufs2_daddr_t blkno;
5563	long size;
5564	ufs_lbn_t lbn;
5565{
5566	struct freefrag *freefrag;
5567	struct fs *fs;
5568
5569	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5570	    ip->i_number, blkno, size, lbn);
5571	fs = ip->i_fs;
5572	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5573		panic("newfreefrag: frag size");
5574	freefrag = malloc(sizeof(struct freefrag),
5575	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5576	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
5577	freefrag->ff_state = ATTACHED;
5578	LIST_INIT(&freefrag->ff_jwork);
5579	freefrag->ff_inum = ip->i_number;
5580	freefrag->ff_vtype = ITOV(ip)->v_type;
5581	freefrag->ff_blkno = blkno;
5582	freefrag->ff_fragsize = size;
5583
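	/*
	 * With SUJ the fragment free must also be journaled; the freefrag
	 * is not DEPCOMPLETE until its jfreefrag record is handled.  With
	 * plain soft updates there is no journal dependency to wait for.
	 */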
5584	if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
5585		freefrag->ff_jdep = (struct worklist *)
5586		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5587	} else {
5588		freefrag->ff_state |= DEPCOMPLETE;
5589		freefrag->ff_jdep = NULL;
5590	}
5591
5592	return (freefrag);
5593}
5594
5595/*
5596 * This workitem de-allocates fragments that were replaced during
5597 * file block allocation.
5598 */
5599static void
5600handle_workitem_freefrag(freefrag)
5601	struct freefrag *freefrag;
5602{
5603	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5604	struct workhead wkhd;
5605
5606	CTR3(KTR_SUJ,
5607	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5608	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5609	/*
5610	 * It would be illegal to add new completion items to the
5611	 * freefrag after it was scheduled to be done, so it must be
5612	 * safe to modify the list head here.
5613	 */
5614	LIST_INIT(&wkhd);
5615	ACQUIRE_LOCK(ump);
5616	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5617	/*
5618	 * If the journal has not been written we must cancel it here.
5619	 */
5620	if (freefrag->ff_jdep) {
5621		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5622			panic("handle_workitem_freefrag: Unexpected type %d\n",
5623			    freefrag->ff_jdep->wk_type);
5624		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5625	}
5626	FREE_LOCK(ump);
5627	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5628	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
5629	ACQUIRE_LOCK(ump);
5630	WORKITEM_FREE(freefrag, D_FREEFRAG);
5631	FREE_LOCK(ump);
5632}
5633
5634/*
5635 * Set up a dependency structure for an external attributes data block.
5636 * This routine follows much of the structure of softdep_setup_allocdirect.
5637 * See the description of softdep_setup_allocdirect above for details.
5638 */
5639void
5640softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5641	struct inode *ip;
5642	ufs_lbn_t off;
5643	ufs2_daddr_t newblkno;
5644	ufs2_daddr_t oldblkno;
5645	long newsize;
5646	long oldsize;
5647	struct buf *bp;
5648{
5649	struct allocdirect *adp, *oldadp;
5650	struct allocdirectlst *adphead;
5651	struct freefrag *freefrag;
5652	struct inodedep *inodedep;
5653	struct jnewblk *jnewblk;
5654	struct newblk *newblk;
5655	struct mount *mp;
5656	ufs_lbn_t lbn;
5657
5658	mp = UFSTOVFS(ip->i_ump);
5659	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5660	    ("softdep_setup_allocext called on non-softdep filesystem"));
5661	KASSERT(off < NXADDR, ("softdep_setup_allocext: off %lld >= NXADDR",
5662		    (long long)off));
5663
5664	lbn = bp->b_lblkno;
5665	if (oldblkno && oldblkno != newblkno)
5666		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5667	else
5668		freefrag = NULL;
5669
5670	ACQUIRE_LOCK(ip->i_ump);
5671	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5672		panic("softdep_setup_allocext: lost block");
5673	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5674	    ("softdep_setup_allocext: newblk already initialized"));
5675	/*
5676	 * Convert the newblk to an allocdirect.
5677	 */
5678	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5679	adp = (struct allocdirect *)newblk;
5680	newblk->nb_freefrag = freefrag;
5681	adp->ad_offset = off;
5682	adp->ad_oldblkno = oldblkno;
5683	adp->ad_newsize = newsize;
5684	adp->ad_oldsize = oldsize;
5685	adp->ad_state |= EXTDATA;
5686
5687	/*
5688	 * Finish initializing the journal.
5689	 */
5690	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5691		jnewblk->jn_ino = ip->i_number;
5692		jnewblk->jn_lbn = lbn;
5693		add_to_journal(&jnewblk->jn_list);
5694	}
5695	if (freefrag && freefrag->ff_jdep != NULL &&
5696	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5697		add_to_journal(freefrag->ff_jdep);
5698	inodedep_lookup(mp, ip->i_number, DEPALLOC | NODELAY, &inodedep);
5699	adp->ad_inodedep = inodedep;
5700
5701	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5702	/*
5703	 * The list of allocdirects must be kept in sorted and ascending
5704	 * order so that the rollback routines can quickly determine the
5705	 * first uncommitted block (the size of the file stored on disk
5706	 * ends at the end of the lowest committed fragment, or if there
5707	 * are no fragments, at the end of the highest committed block).
5708	 * Since files generally grow, the typical case is that the new
5709	 * block is to be added at the end of the list. We speed this
5710	 * special case by checking against the last allocdirect in the
5711	 * list before laboriously traversing the list looking for the
5712	 * insertion point.
5713	 */
5714	adphead = &inodedep->id_newextupdt;
5715	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5716	if (oldadp == NULL || oldadp->ad_offset <= off) {
5717		/* insert at end of list */
5718		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5719		if (oldadp != NULL && oldadp->ad_offset == off)
5720			allocdirect_merge(adphead, adp, oldadp);
5721		FREE_LOCK(ip->i_ump);
5722		return;
5723	}
5724	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5725		if (oldadp->ad_offset >= off)
5726			break;
5727	}
5728	if (oldadp == NULL)
5729		panic("softdep_setup_allocext: lost entry");
5730	/* insert in middle of list */
5731	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5732	if (oldadp->ad_offset == off)
5733		allocdirect_merge(adphead, adp, oldadp);
5734	FREE_LOCK(ip->i_ump);
5735}
5736
5737/*
5738 * Indirect block allocation dependencies.
5739 *
5740 * The same dependencies that exist for a direct block also exist when
5741 * a new block is allocated and pointed to by an entry in a block of
5742 * indirect pointers. The undo/redo states described above are also
5743 * used here. Because an indirect block contains many pointers that
5744 * may have dependencies, a second copy of the entire in-memory indirect
5745 * block is kept. The buffer cache copy is always completely up-to-date.
5746 * The second copy, which is used only as a source for disk writes,
5747 * contains only the safe pointers (i.e., those that have no remaining
5748 * update dependencies). The second copy is freed when all pointers
5749 * are safe. The cache is not allowed to replace indirect blocks with
5750 * pending update dependencies. If a buffer containing an indirect
5751 * block with dependencies is written, these routines will mark it
5752 * dirty again. It can only be successfully written once all the
5753 * dependencies are removed. The ffs_fsync routine in conjunction with
5754 * softdep_sync_metadata work together to get all the dependencies
5755 * removed so that a file can be successfully written to disk. Three
5756 * procedures are used when setting up indirect block pointer
5757 * dependencies. The division is necessary because of the organization
5758 * of the "balloc" routine and because of the distinction between file
5759 * pages and file metadata blocks.
5760 */
5761
5762/*
5763 * Allocate a new allocindir structure.
5764 */
5765static struct allocindir *
5766newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5767	struct inode *ip;	/* inode for file being extended */
5768	int ptrno;		/* offset of pointer in indirect block */
5769	ufs2_daddr_t newblkno;	/* disk block number being added */
5770	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5771	ufs_lbn_t lbn;
5772{
5773	struct newblk *newblk;
5774	struct allocindir *aip;
5775	struct freefrag *freefrag;
5776	struct jnewblk *jnewblk;
5777
5778	if (oldblkno)
5779		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
5780	else
5781		freefrag = NULL;
5782	ACQUIRE_LOCK(ip->i_ump);
5783	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
5784		panic("new_allocindir: lost block");
5785	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5786	    ("newallocindir: newblk already initialized"));
5787	WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
5788	newblk->nb_freefrag = freefrag;
5789	aip = (struct allocindir *)newblk;
5790	aip->ai_offset = ptrno;
5791	aip->ai_oldblkno = oldblkno;
5792	aip->ai_lbn = lbn;
5793	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5794		jnewblk->jn_ino = ip->i_number;
5795		jnewblk->jn_lbn = lbn;
5796		add_to_journal(&jnewblk->jn_list);
5797	}
5798	if (freefrag && freefrag->ff_jdep != NULL &&
5799	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5800		add_to_journal(freefrag->ff_jdep);
5801	return (aip);
5802}
5803
5804/*
5805 * Called just before setting an indirect block pointer
5806 * to a newly allocated file page.
5807 */
5808void
5809softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5810	struct inode *ip;	/* inode for file being extended */
5811	ufs_lbn_t lbn;		/* allocated block number within file */
5812	struct buf *bp;		/* buffer with indirect blk referencing page */
5813	int ptrno;		/* offset of pointer in indirect block */
5814	ufs2_daddr_t newblkno;	/* disk block number being added */
5815	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5816	struct buf *nbp;	/* buffer holding allocated page */
5817{
5818	struct inodedep *inodedep;
5819	struct freefrag *freefrag;
5820	struct allocindir *aip;
5821	struct pagedep *pagedep;
5822	struct mount *mp;
5823	int dflags;
5824
5825	mp = UFSTOVFS(ip->i_ump);
5826	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5827	    ("softdep_setup_allocindir_page called on non-softdep filesystem"));
5828	KASSERT(lbn == nbp->b_lblkno,
5829	    ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5830	    lbn, nbp->b_lblkno));
5831	CTR4(KTR_SUJ,
5832	    "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
5833	    "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
5834	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5835	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5836	dflags = DEPALLOC;
5837	if (IS_SNAPSHOT(ip))
5838		dflags |= NODELAY;
5839	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
5840	/*
5841	 * If we are allocating a directory page, then we must
5842	 * allocate an associated pagedep to track additions and
5843	 * deletions.
5844	 */
5845	if ((ip->i_mode & IFMT) == IFDIR)
5846		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5847	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5848	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5849	FREE_LOCK(ip->i_ump);
5850	if (freefrag)
5851		handle_workitem_freefrag(freefrag);
5852}
5853
5854/*
5855 * Called just before setting an indirect block pointer to a
5856 * newly allocated indirect block.
5857 */
5858void
5859softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5860	struct buf *nbp;	/* newly allocated indirect block */
5861	struct inode *ip;	/* inode for file being extended */
5862	struct buf *bp;		/* indirect block referencing allocated block */
5863	int ptrno;		/* offset of pointer in indirect block */
5864	ufs2_daddr_t newblkno;	/* disk block number being added */
5865{
5866	struct inodedep *inodedep;
5867	struct allocindir *aip;
5868	ufs_lbn_t lbn;
5869	int dflags;
5870
5871	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
5872	    ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
5873	CTR3(KTR_SUJ,
5874	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
5875	    ip->i_number, newblkno, ptrno);
5876	lbn = nbp->b_lblkno;
5877	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5878	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5879	dflags = DEPALLOC;
5880	if (IS_SNAPSHOT(ip))
5881		dflags |= NODELAY;
5882	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
5883	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5884	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5885		panic("softdep_setup_allocindir_meta: Block already existed");
5886	FREE_LOCK(ip->i_ump);
5887}
5888
5889static void
5890indirdep_complete(indirdep)
5891	struct indirdep *indirdep;
5892{
5893	struct allocindir *aip;
5894
5895	LIST_REMOVE(indirdep, ir_next);
5896	indirdep->ir_state |= DEPCOMPLETE;
5897
5898	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5899		LIST_REMOVE(aip, ai_next);
5900		free_newblk(&aip->ai_block);
5901	}
5902	/*
5903	 * If this indirdep is not attached to a buf it was simply waiting
5904	 * on completion to clear completehd.  free_indirdep() asserts
5905	 * that nothing is dangling.
5906	 */
5907	if ((indirdep->ir_state & ONWORKLIST) == 0)
5908		free_indirdep(indirdep);
5909}
5910
5911static struct indirdep *
5912indirdep_lookup(mp, ip, bp)
5913	struct mount *mp;
5914	struct inode *ip;
5915	struct buf *bp;
5916{
5917	struct indirdep *indirdep, *newindirdep;
5918	struct newblk *newblk;
5919	struct ufsmount *ump;
5920	struct worklist *wk;
5921	struct fs *fs;
5922	ufs2_daddr_t blkno;
5923
5924	ump = VFSTOUFS(mp);
5925	LOCK_OWNED(ump);
5926	indirdep = NULL;
5927	newindirdep = NULL;
5928	fs = ip->i_fs;
5929	for (;;) {
5930		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5931			if (wk->wk_type != D_INDIRDEP)
5932				continue;
5933			indirdep = WK_INDIRDEP(wk);
5934			break;
5935		}
5936		/* Found on the buffer worklist, no new structure to free. */
5937		if (indirdep != NULL && newindirdep == NULL)
5938			return (indirdep);
5939		if (indirdep != NULL && newindirdep != NULL)
5940			panic("indirdep_lookup: simultaneous create");
5941		/* None found on the buffer and a new structure is ready. */
5942		if (indirdep == NULL && newindirdep != NULL)
5943			break;
5944		/* None found and no new structure available. */
5945		FREE_LOCK(ump);
5946		newindirdep = malloc(sizeof(struct indirdep),
5947		    M_INDIRDEP, M_SOFTDEP_FLAGS);
5948		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5949		newindirdep->ir_state = ATTACHED;
5950		if (ip->i_ump->um_fstype == UFS1)
5951			newindirdep->ir_state |= UFS1FMT;
5952		TAILQ_INIT(&newindirdep->ir_trunc);
5953		newindirdep->ir_saveddata = NULL;
5954		LIST_INIT(&newindirdep->ir_deplisthd);
5955		LIST_INIT(&newindirdep->ir_donehd);
5956		LIST_INIT(&newindirdep->ir_writehd);
5957		LIST_INIT(&newindirdep->ir_completehd);
5958		if (bp->b_blkno == bp->b_lblkno) {
5959			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5960			    NULL, NULL);
5961			bp->b_blkno = blkno;
5962		}
5963		newindirdep->ir_freeblks = NULL;
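		/*
		 * Set up the shadow copy described above under "Indirect
		 * block allocation dependencies": a buffer on the device
		 * vnode aliasing the same physical block, seeded from the
		 * current in-core contents and used only as a source for
		 * disk writes while update dependencies remain.
		 */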
5964		newindirdep->ir_savebp =
5965		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5966		newindirdep->ir_bp = bp;
5967		BUF_KERNPROC(newindirdep->ir_savebp);
5968		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5969		ACQUIRE_LOCK(ump);
5970	}
5971	indirdep = newindirdep;
5972	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
5973	/*
5974	 * If the block is not yet allocated we don't set DEPCOMPLETE so
5975	 * that we don't free dependencies until the pointers are valid.
5976	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
5977	 * than using the hash.
5978	 */
5979	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
5980		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
5981	else
5982		indirdep->ir_state |= DEPCOMPLETE;
5983	return (indirdep);
5984}
5985
5986/*
5987 * Called to finish the allocation of the "aip" allocated
5988 * by one of the two routines above.
5989 */
5990static struct freefrag *
5991setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
5992	struct buf *bp;		/* in-memory copy of the indirect block */
5993	struct inode *ip;	/* inode for file being extended */
5994	struct inodedep *inodedep; /* Inodedep for ip */
5995	struct allocindir *aip;	/* allocindir allocated by the above routines */
5996	ufs_lbn_t lbn;		/* Logical block number for this block. */
5997{
5998	struct fs *fs;
5999	struct indirdep *indirdep;
6000	struct allocindir *oldaip;
6001	struct freefrag *freefrag;
6002	struct mount *mp;
6003
6004	LOCK_OWNED(ip->i_ump);
6005	mp = UFSTOVFS(ip->i_ump);
6006	fs = ip->i_fs;
6007	if (bp->b_lblkno >= 0)
6008		panic("setup_allocindir_phase2: not indir blk");
6009	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
6010	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
6011	indirdep = indirdep_lookup(mp, ip, bp);
6012	KASSERT(indirdep->ir_savebp != NULL,
6013	    ("setup_allocindir_phase2 NULL ir_savebp"));
6014	aip->ai_indirdep = indirdep;
6015	/*
6016	 * Check for an unwritten dependency for this indirect offset.  If
6017	 * there is, merge the old dependency into the new one.  This happens
6018	 * as a result of reallocblk only.
6019	 */
6020	freefrag = NULL;
6021	if (aip->ai_oldblkno != 0) {
6022		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
6023			if (oldaip->ai_offset == aip->ai_offset) {
6024				freefrag = allocindir_merge(aip, oldaip);
6025				goto done;
6026			}
6027		}
6028		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
6029			if (oldaip->ai_offset == aip->ai_offset) {
6030				freefrag = allocindir_merge(aip, oldaip);
6031				goto done;
6032			}
6033		}
6034	}
6035done:
6036	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
6037	return (freefrag);
6038}
6039
6040/*
6041 * Merge two allocindirs which refer to the same block.  Move newblock
6042 * dependencies and setup the freefrags appropriately.
6043 */
6044static struct freefrag *
6045allocindir_merge(aip, oldaip)
6046	struct allocindir *aip;
6047	struct allocindir *oldaip;
6048{
6049	struct freefrag *freefrag;
6050	struct worklist *wk;
6051
6052	if (oldaip->ai_newblkno != aip->ai_oldblkno)
6053		panic("allocindir_merge: blkno");
6054	aip->ai_oldblkno = oldaip->ai_oldblkno;
6055	freefrag = aip->ai_freefrag;
6056	aip->ai_freefrag = oldaip->ai_freefrag;
6057	oldaip->ai_freefrag = NULL;
6058	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
6059	/*
6060	 * If we are tracking a new directory-block allocation,
6061	 * move it from the old allocindir to the new allocindir.
6062	 */
6063	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
6064		WORKLIST_REMOVE(wk);
6065		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
6066			panic("allocindir_merge: extra newdirblk");
6067		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
6068	}
6069	/*
6070	 * We can skip journaling for this freefrag and just complete
6071	 * any pending journal work for the allocindir that is being
6072	 * removed after the freefrag completes.
6073	 */
6074	if (freefrag->ff_jdep)
6075		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
6076	LIST_REMOVE(oldaip, ai_next);
6077	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
6078	    &freefrag->ff_list, &freefrag->ff_jwork);
6079	free_newblk(&oldaip->ai_block);
6080
6081	return (freefrag);
6082}
6083
6084static inline void
6085setup_freedirect(freeblks, ip, i, needj)
6086	struct freeblks *freeblks;
6087	struct inode *ip;
6088	int i;
6089	int needj;
6090{
6091	ufs2_daddr_t blkno;
6092	int frags;
6093
6094	blkno = DIP(ip, i_db[i]);
6095	if (blkno == 0)
6096		return;
6097	DIP_SET(ip, i_db[i], 0);
6098	frags = sblksize(ip->i_fs, ip->i_size, i);
6099	frags = numfrags(ip->i_fs, frags);
6100	newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
6101}
6102
6103static inline void
6104setup_freeext(freeblks, ip, i, needj)
6105	struct freeblks *freeblks;
6106	struct inode *ip;
6107	int i;
6108	int needj;
6109{
6110	ufs2_daddr_t blkno;
6111	int frags;
6112
6113	blkno = ip->i_din2->di_extb[i];
6114	if (blkno == 0)
6115		return;
6116	ip->i_din2->di_extb[i] = 0;
6117	frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
6118	frags = numfrags(ip->i_fs, frags);
6119	newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
6120}
6121
6122static inline void
6123setup_freeindir(freeblks, ip, i, lbn, needj)
6124	struct freeblks *freeblks;
6125	struct inode *ip;
6126	int i;
6127	ufs_lbn_t lbn;
6128	int needj;
6129{
6130	ufs2_daddr_t blkno;
6131
6132	blkno = DIP(ip, i_ib[i]);
6133	if (blkno == 0)
6134		return;
6135	DIP_SET(ip, i_ib[i], 0);
6136	newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
6137	    0, needj);
6138}
6139
6140static inline struct freeblks *
6141newfreeblks(mp, ip)
6142	struct mount *mp;
6143	struct inode *ip;
6144{
6145	struct freeblks *freeblks;
6146
6147	freeblks = malloc(sizeof(struct freeblks),
6148		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
6149	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
6150	LIST_INIT(&freeblks->fb_jblkdephd);
6151	LIST_INIT(&freeblks->fb_jwork);
6152	freeblks->fb_ref = 0;
6153	freeblks->fb_cgwait = 0;
6154	freeblks->fb_state = ATTACHED;
6155	freeblks->fb_uid = ip->i_uid;
6156	freeblks->fb_inum = ip->i_number;
6157	freeblks->fb_vtype = ITOV(ip)->v_type;
6158	freeblks->fb_modrev = DIP(ip, i_modrev);
6159	freeblks->fb_devvp = ip->i_devvp;
6160	freeblks->fb_chkcnt = 0;
6161	freeblks->fb_len = 0;
6162
6163	return (freeblks);
6164}
6165
6166static void
6167trunc_indirdep(indirdep, freeblks, bp, off)
6168	struct indirdep *indirdep;
6169	struct freeblks *freeblks;
6170	struct buf *bp;
6171	int off;
6172{
6173	struct allocindir *aip, *aipn;
6174
6175	/*
6176	 * The first set of allocindirs won't be in savedbp.
6177	 */
6178	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6179		if (aip->ai_offset > off)
6180			cancel_allocindir(aip, bp, freeblks, 1);
6181	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6182		if (aip->ai_offset > off)
6183			cancel_allocindir(aip, bp, freeblks, 1);
6184	/*
6185	 * These will exist in savedbp.
6186	 */
6187	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6188		if (aip->ai_offset > off)
6189			cancel_allocindir(aip, NULL, freeblks, 0);
6190	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6191		if (aip->ai_offset > off)
6192			cancel_allocindir(aip, NULL, freeblks, 0);
6193}
6194
6195/*
6196 * Follow the chain of indirects down to lastlbn creating a freework
6197 * structure for each.  This will be used to start indir_trunc() at
6198 * the right offset and create the journal records for the partial
6199 * truncation.  A second step will handle the truncated dependencies.
6200 */
6201static int
6202setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
6203	struct freeblks *freeblks;
6204	struct inode *ip;
6205	ufs_lbn_t lbn;
6206	ufs_lbn_t lastlbn;
6207	ufs2_daddr_t blkno;
6208{
6209	struct indirdep *indirdep;
6210	struct indirdep *indirn;
6211	struct freework *freework;
6212	struct newblk *newblk;
6213	struct mount *mp;
6214	struct buf *bp;
6215	uint8_t *start;
6216	uint8_t *end;
6217	ufs_lbn_t lbnadd;
6218	int level;
6219	int error;
6220	int off;
6221
6222
6223	freework = NULL;
6224	if (blkno == 0)
6225		return (0);
6226	mp = freeblks->fb_list.wk_mp;
6227	bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
6228	if ((bp->b_flags & B_CACHE) == 0) {
6229		bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
6230		bp->b_iocmd = BIO_READ;
6231		bp->b_flags &= ~B_INVAL;
6232		bp->b_ioflags &= ~BIO_ERROR;
6233		vfs_busy_pages(bp, 0);
6234		bp->b_iooffset = dbtob(bp->b_blkno);
6235		bstrategy(bp);
6236		curthread->td_ru.ru_inblock++;
6237		error = bufwait(bp);
6238		if (error) {
6239			brelse(bp);
6240			return (error);
6241		}
6242	}
6243	level = lbn_level(lbn);
6244	lbnadd = lbn_offset(ip->i_fs, level);
6245	/*
6246	 * Compute the offset of the last block we want to keep.  Store
6247	 * in the freework the first block we want to completely free.
6248	 */
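	/*
	 * Illustrative arithmetic (hypothetical geometry, NDADDR == 12 and
	 * NINDIR(fs) == 1024): the double indirect root arrives here as
	 * lbn == -(12 + 1024) - 1 == -1037, so level == 1, lbnadd == 1024
	 * and -(lbn + level) == 1036, the first data lbn it maps.  With
	 * lastlbn == 5000 this gives off == (5000 - 1036) / 1024 == 3;
	 * pointers off + 1 and beyond are zeroed below and the walk
	 * recurses on the child at index 3, encoded as -1036 - 3 * 1024.
	 */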
6249	off = (lastlbn - -(lbn + level)) / lbnadd;
6250	if (off + 1 == NINDIR(ip->i_fs))
6251		goto nowork;
6252	freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
6253	    0);
6254	/*
6255	 * Link the freework into the indirdep.  This will prevent any new
6256	 * allocations from proceeding until we are finished with the
6257	 * truncate and the block is written.
6258	 */
6259	ACQUIRE_LOCK(ip->i_ump);
6260	indirdep = indirdep_lookup(mp, ip, bp);
6261	if (indirdep->ir_freeblks)
6262		panic("setup_trunc_indir: indirdep already truncated.");
6263	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6264	freework->fw_indir = indirdep;
6265	/*
6266	 * Cancel any allocindirs that will not make it to disk.
6267	 * We have to do this for all copies of the indirdep that
6268	 * live on this newblk.
6269	 */
6270	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
6271		newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
6272		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
6273			trunc_indirdep(indirn, freeblks, bp, off);
6274	} else
6275		trunc_indirdep(indirdep, freeblks, bp, off);
6276	FREE_LOCK(ip->i_ump);
6277	/*
6278	 * Creation is protected by the buf lock. The saveddata is only
6279	 * needed if a full truncation follows a partial truncation, but it
6280	 * is difficult to allocate in that case, so we fetch it anyway.
6281	 */
6282	if (indirdep->ir_saveddata == NULL)
6283		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
6284		    M_SOFTDEP_FLAGS);
6285nowork:
6286	/* Fetch the blkno of the child and the zero start offset. */
6287	if (ip->i_ump->um_fstype == UFS1) {
6288		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
6289		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
6290	} else {
6291		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
6292		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
6293	}
6294	if (freework) {
6295		/* Zero the truncated pointers. */
6296		end = bp->b_data + bp->b_bcount;
6297		bzero(start, end - start);
6298		bdwrite(bp);
6299	} else
6300		bqrelse(bp);
6301	if (level == 0)
6302		return (0);
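	/*
	 * Descend to the child indirect that covers lastlbn: lbn++ drops
	 * the encoding one level and subtracting off * lbnadd moves the
	 * first mapped data lbn to the start of the child's range.
	 */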
6303	lbn++; /* adjust level */
6304	lbn -= (off * lbnadd);
6305	return (setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno));
6306}
6307
6308/*
6309 * Complete the partial truncation of an indirect block setup by
6310 * setup_trunc_indir().  This zeros the truncated pointers in the saved
6311 * copy and writes them to disk before the freeblks is allowed to complete.
6312 */
6313static void
6314complete_trunc_indir(freework)
6315	struct freework *freework;
6316{
6317	struct freework *fwn;
6318	struct indirdep *indirdep;
6319	struct ufsmount *ump;
6320	struct buf *bp;
6321	uintptr_t start;
6322	int count;
6323
6324	ump = VFSTOUFS(freework->fw_list.wk_mp);
6325	LOCK_OWNED(ump);
6326	indirdep = freework->fw_indir;
6327	for (;;) {
6328		bp = indirdep->ir_bp;
6329		/* See if the block was discarded. */
6330		if (bp == NULL)
6331			break;
6332		/* Inline part of getdirtybuf().  We don't want bremfree. */
6333		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
6334			break;
6335		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6336		    LOCK_PTR(ump)) == 0)
6337			BUF_UNLOCK(bp);
6338		ACQUIRE_LOCK(ump);
6339	}
6340	freework->fw_state |= DEPCOMPLETE;
6341	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
6342	/*
6343	 * Zero the pointers in the saved copy.
6344	 */
6345	if (indirdep->ir_state & UFS1FMT)
6346		start = sizeof(ufs1_daddr_t);
6347	else
6348		start = sizeof(ufs2_daddr_t);
6349	start *= freework->fw_start;
6350	count = indirdep->ir_savebp->b_bcount - start;
6351	start += (uintptr_t)indirdep->ir_savebp->b_data;
6352	bzero((char *)start, count);
6353	/*
6354	 * We need to start the next truncation in the list if it has not
6355	 * been started yet.
6356	 */
6357	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6358	if (fwn != NULL) {
6359		if (fwn->fw_freeblks == indirdep->ir_freeblks)
6360			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6361		if ((fwn->fw_state & ONWORKLIST) == 0)
6362			freework_enqueue(fwn);
6363	}
6364	/*
6365	 * Once no more truncations are pending here the saved data is no
6366	 * longer needed; if bp is NULL the block was fully truncated, so
6367	 * restore the saved copy to the save buffer before freeing it.
6368	 */
6369	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6370		if (bp == NULL)
6371			bcopy(indirdep->ir_saveddata,
6372			    indirdep->ir_savebp->b_data,
6373			    indirdep->ir_savebp->b_bcount);
6374		free(indirdep->ir_saveddata, M_INDIRDEP);
6375		indirdep->ir_saveddata = NULL;
6376	}
6377	/*
6378	 * When bp is NULL there is a full truncation pending.  We
6379	 * must wait for this full truncation to be journaled before
6380	 * we can release this freework because the disk pointers will
6381	 * never be written as zero.
6382	 */
6383	if (bp == NULL)  {
6384		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6385			handle_written_freework(freework);
6386		else
6387			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6388			   &freework->fw_list);
6389	} else {
6390		/* Complete when the real copy is written. */
6391		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6392		BUF_UNLOCK(bp);
6393	}
6394}
6395
6396/*
6397 * Calculate the number of blocks we are going to release where datablocks
6398 * is the current total and length is the new file size.
6399 */
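/*
 * The estimate charges the retained data blocks plus roughly one indirect
 * block per NINDIR(fs) retained data blocks at each level.  For example
 * (hypothetical geometry, NDADDR == 12, NINDIR(fs) == 4096), a new length
 * of 100 blocks counts 100 data blocks plus one single indirect: 101
 * blocks' worth of frags, converted to DEV_BSIZE units by fsbtodb()
 * before being subtracted from datablocks.
 */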
6400static ufs2_daddr_t
6401blkcount(fs, datablocks, length)
6402	struct fs *fs;
6403	ufs2_daddr_t datablocks;
6404	off_t length;
6405{
6406	off_t totblks, numblks;
6407
6408	totblks = 0;
6409	numblks = howmany(length, fs->fs_bsize);
6410	if (numblks <= NDADDR) {
6411		totblks = howmany(length, fs->fs_fsize);
6412		goto out;
6413	}
6414	totblks = blkstofrags(fs, numblks);
6415	numblks -= NDADDR;
6416	/*
6417	 * Count all single, then double, then triple indirects required.
6418	 * Subtracting one indirect's worth of blocks for each pass
6419	 * acknowledges one of each pointed to by the inode.
6420	 */
6421	for (;;) {
6422		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6423		numblks -= NINDIR(fs);
6424		if (numblks <= 0)
6425			break;
6426		numblks = howmany(numblks, NINDIR(fs));
6427	}
6428out:
6429	totblks = fsbtodb(fs, totblks);
6430	/*
6431	 * Handle sparse files.  We can't reclaim more blocks than the inode
6432	 * references.  We will correct it later in handle_complete_freeblks()
6433	 * when we know the real count.
6434	 */
6435	if (totblks > datablocks)
6436		return (0);
6437	return (datablocks - totblks);
6438}
6439
6440/*
6441 * Handle freeblocks for journaled softupdate filesystems.
6442 *
6443 * Contrary to normal softupdates, we must preserve the block pointers in
6444 * indirects until their subordinates are free.  This is to avoid journaling
6445 * every block that is freed which may consume more space than the journal
6446 * itself.  The recovery program will see the free block journals at the
6447 * base of the truncated area and traverse them to reclaim space.  The
6448 * pointers in the inode may be cleared immediately after the journal
6449 * records are written because each direct and indirect pointer in the
6450 * inode is recorded in a journal.  This permits full truncation to proceed
6451 * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6452 *
6453 * The algorithm is as follows:
6454 * 1) Traverse the in-memory state and create journal entries to release
6455 *    the relevant blocks and full indirect trees.
6456 * 2) Traverse the indirect block chain adding partial truncation freework
6457 *    records to indirects in the path to lastlbn.  The freework will
6458 *    prevent new allocation dependencies from being satisfied in this
6459 *    indirect until the truncation completes.
6460 * 3) Read and lock the inode block, performing an update with the new size
6461 *    and pointers.  This prevents truncated data from becoming valid on
6462 *    disk through step 4.
6463 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6464 *    eliminate journal work for those records that do not require it.
6465 * 5) Schedule the journal records to be written followed by the inode block.
6466 * 6) Allocate any necessary frags for the end of file.
6467 * 7) Zero any partially truncated blocks.
6468 *
6469 * From this truncation proceeds asynchronously using the freework and
6470 * indir_trunc machinery.  The file will not be extended again into a
6471 * partially truncated indirect block until all work is completed but
6472 * the normal dependency mechanism ensures that it is rolled back/forward
6473 * as appropriate.  Further truncation may occur without delay and is
6474 * serialized in indir_trunc().
6475 */
6476void
6477softdep_journal_freeblocks(ip, cred, length, flags)
6478	struct inode *ip;	/* The inode whose length is to be reduced */
6479	struct ucred *cred;
6480	off_t length;		/* The new length for the file */
6481	int flags;		/* IO_EXT and/or IO_NORMAL */
6482{
6483	struct freeblks *freeblks, *fbn;
6484	struct worklist *wk, *wkn;
6485	struct inodedep *inodedep;
6486	struct jblkdep *jblkdep;
6487	struct allocdirect *adp, *adpn;
6488	struct ufsmount *ump;
6489	struct fs *fs;
6490	struct buf *bp;
6491	struct vnode *vp;
6492	struct mount *mp;
6493	ufs2_daddr_t extblocks, datablocks;
6494	ufs_lbn_t tmpval, lbn, lastlbn;
6495	int frags, lastoff, iboff, allocblock, needj, dflags, error, i;
6496
6497	fs = ip->i_fs;
6498	ump = ip->i_ump;
6499	mp = UFSTOVFS(ump);
6500	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6501	    ("softdep_journal_freeblocks called on non-softdep filesystem"));
6502	vp = ITOV(ip);
6503	needj = 1;
6504	iboff = -1;
6505	allocblock = 0;
6506	extblocks = 0;
6507	datablocks = 0;
6508	frags = 0;
6509	freeblks = newfreeblks(mp, ip);
6510	ACQUIRE_LOCK(ump);
6511	/*
6512	 * If we're truncating a removed file that will never be written
6513	 * we don't need to journal the block frees.  The canceled journals
6514	 * for the allocations will suffice.
6515	 */
6516	dflags = DEPALLOC;
6517	if (IS_SNAPSHOT(ip))
6518		dflags |= NODELAY;
6519	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6520	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6521	    length == 0)
6522		needj = 0;
6523	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6524	    ip->i_number, length, needj);
6525	FREE_LOCK(ump);
6526	/*
6527	 * Calculate the lbn that we are truncating to.  This results in -1
6528	 * if we're truncating to 0 bytes.  So it is the last lbn we want
6529	 * to keep, not the first lbn we want to truncate.
6530	 */
6531	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6532	lastoff = blkoff(fs, length);
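	/*
	 * For example (hypothetical geometry, 32K blocks with 4K frags),
	 * length == 100000 gives lastlbn == 3 and lastoff == 1696, so lbns
	 * 0-3 are kept and lbn 3 is later trimmed to a single 4096-byte
	 * frag; length == 0 gives lastlbn == -1, keeping nothing.
	 */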
6533	/*
6534	 * Compute frags we are keeping in lastlbn.  0 means all.
6535	 */
6536	if (lastlbn >= 0 && lastlbn < NDADDR) {
6537		frags = fragroundup(fs, lastoff);
6538		/* adp offset of last valid allocdirect. */
6539		iboff = lastlbn;
6540	} else if (lastlbn > 0)
6541		iboff = NDADDR;
6542	if (fs->fs_magic == FS_UFS2_MAGIC)
6543		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6544	/*
6545	 * Handle normal data blocks and indirects.  This section saves
6546	 * values used after the inode update to complete frag and indirect
6547	 * truncation.
6548	 */
6549	if ((flags & IO_NORMAL) != 0) {
6550		/*
6551		 * Handle truncation of whole direct and indirect blocks.
6552		 */
6553		for (i = iboff + 1; i < NDADDR; i++)
6554			setup_freedirect(freeblks, ip, i, needj);
6555		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6556		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6557			/* Release a whole indirect tree. */
6558			if (lbn > lastlbn) {
6559				setup_freeindir(freeblks, ip, i, -lbn -i,
6560				    needj);
6561				continue;
6562			}
6563			iboff = i + NDADDR;
6564			/*
6565			 * Traverse partially truncated indirect tree.
6566			 */
6567			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6568				setup_trunc_indir(freeblks, ip, -lbn - i,
6569				    lastlbn, DIP(ip, i_ib[i]));
6570		}
6571		/*
6572		 * Handle partial truncation to a frag boundary.
6573		 */
6574		if (frags) {
6575			ufs2_daddr_t blkno;
6576			long oldfrags;
6577
6578			oldfrags = blksize(fs, ip, lastlbn);
6579			blkno = DIP(ip, i_db[lastlbn]);
6580			if (blkno && oldfrags != frags) {
6581				oldfrags -= frags;
6582				oldfrags = numfrags(ip->i_fs, oldfrags);
6583				blkno += numfrags(ip->i_fs, frags);
6584				newfreework(ump, freeblks, NULL, lastlbn,
6585				    blkno, oldfrags, 0, needj);
6586				if (needj)
6587					adjust_newfreework(freeblks,
6588					    numfrags(ip->i_fs, frags));
6589			} else if (blkno == 0)
6590				allocblock = 1;
6591		}
6592		/*
6593		 * Add a journal record for partial truncate if we are
6594		 * handling indirect blocks.  Non-indirects need no extra
6595		 * journaling.
6596		 */
6597		if (length != 0 && lastlbn >= NDADDR) {
6598			ip->i_flag |= IN_TRUNCATED;
6599			newjtrunc(freeblks, length, 0);
6600		}
6601		ip->i_size = length;
6602		DIP_SET(ip, i_size, ip->i_size);
6603		datablocks = DIP(ip, i_blocks) - extblocks;
6604		if (length != 0)
6605			datablocks = blkcount(ip->i_fs, datablocks, length);
6606		freeblks->fb_len = length;
6607	}
6608	if ((flags & IO_EXT) != 0) {
6609		for (i = 0; i < NXADDR; i++)
6610			setup_freeext(freeblks, ip, i, needj);
6611		ip->i_din2->di_extsize = 0;
6612		datablocks += extblocks;
6613	}
6614#ifdef QUOTA
6615	/* Reference the quotas in case the block count is wrong in the end. */
6616	quotaref(vp, freeblks->fb_quota);
6617	(void) chkdq(ip, -datablocks, NOCRED, 0);
6618#endif
6619	freeblks->fb_chkcnt = -datablocks;
6620	UFS_LOCK(ump);
6621	fs->fs_pendingblocks += datablocks;
6622	UFS_UNLOCK(ump);
6623	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6624	/*
6625	 * Handle truncation of incomplete alloc direct dependencies.  We
6626	 * hold the inode block locked to prevent incomplete dependencies
6627	 * from reaching the disk while we are eliminating those that
6628	 * have been truncated.  This is a partially inlined ffs_update().
6629	 */
6630	ufs_itimes(vp);
6631	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6632	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6633	    (int)fs->fs_bsize, cred, &bp);
6634	if (error) {
6635		brelse(bp);
6636		softdep_error("softdep_journal_freeblocks", error);
6637		return;
6638	}
6639	if (bp->b_bufsize == fs->fs_bsize)
6640		bp->b_flags |= B_CLUSTEROK;
6641	softdep_update_inodeblock(ip, bp, 0);
6642	if (ump->um_fstype == UFS1)
6643		*((struct ufs1_dinode *)bp->b_data +
6644		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6645	else
6646		*((struct ufs2_dinode *)bp->b_data +
6647		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6648	ACQUIRE_LOCK(ump);
6649	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6650	if ((inodedep->id_state & IOSTARTED) != 0)
6651		panic("softdep_journal_freeblocks: inode busy");
6652	/*
6653	 * Add the freeblks structure to the list of operations that
6654	 * must await the zero'ed inode being written to disk. If we
6655	 * still have a bitmap dependency (needj), then the inode
6656	 * has never been written to disk, so we can process the
6657	 * freeblks below once we have deleted the dependencies.
6658	 */
6659	if (needj)
6660		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6661	else
6662		freeblks->fb_state |= COMPLETE;
6663	if ((flags & IO_NORMAL) != 0) {
6664		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6665			if (adp->ad_offset > iboff)
6666				cancel_allocdirect(&inodedep->id_inoupdt, adp,
6667				    freeblks);
6668			/*
6669			 * Truncate the allocdirect.  We could eliminate
6670			 * or modify journal records as well.
6671			 */
6672			else if (adp->ad_offset == iboff && frags)
6673				adp->ad_newsize = frags;
6674		}
6675	}
6676	if ((flags & IO_EXT) != 0)
6677		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6678			cancel_allocdirect(&inodedep->id_extupdt, adp,
6679			    freeblks);
6680	/*
6681	 * Scan the bufwait list for newblock dependencies that will never
6682	 * make it to disk.
6683	 */
6684	LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6685		if (wk->wk_type != D_ALLOCDIRECT)
6686			continue;
6687		adp = WK_ALLOCDIRECT(wk);
6688		if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6689		    ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6690			cancel_jfreeblk(freeblks, adp->ad_newblkno);
6691			cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6692			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6693		}
6694	}
6695	/*
6696	 * Add journal work.
6697	 */
6698	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6699		add_to_journal(&jblkdep->jb_list);
6700	FREE_LOCK(ump);
6701	bdwrite(bp);
6702	/*
6703	 * Truncate dependency structures beyond length.
6704	 */
6705	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6706	/*
6707	 * This is only set when we need to allocate a fragment because
6708	 * none existed at the end of a frag-sized file.  It handles only
6709	 * allocating a new, zero filled block.
6710	 */
6711	if (allocblock) {
6712		ip->i_size = length - lastoff;
6713		DIP_SET(ip, i_size, ip->i_size);
6714		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6715		if (error != 0) {
6716			softdep_error("softdep_journal_freeblocks", error);
6717			return;
6718		}
6719		ip->i_size = length;
6720		DIP_SET(ip, i_size, length);
6721		ip->i_flag |= IN_CHANGE | IN_UPDATE;
6722		allocbuf(bp, frags);
6723		ffs_update(vp, 0);
6724		bawrite(bp);
6725	} else if (lastoff != 0 && vp->v_type != VDIR) {
6726		int size;
6727
6728		/*
6729		 * Zero the end of a truncated frag or block.
6730		 */
6731		size = sblksize(fs, length, lastlbn);
6732		error = bread(vp, lastlbn, size, cred, &bp);
6733		if (error) {
6734			softdep_error("softdep_journal_freeblocks", error);
6735			return;
6736		}
6737		bzero((char *)bp->b_data + lastoff, size - lastoff);
6738		bawrite(bp);
6739
6740	}
6741	ACQUIRE_LOCK(ump);
6742	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6743	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6744	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6745	/*
6746	 * We zero earlier truncations so they don't erroneously
6747	 * update i_blocks.
6748	 */
6749	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6750		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6751			fbn->fb_len = 0;
6752	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6753	    LIST_EMPTY(&freeblks->fb_jblkdephd))
6754		freeblks->fb_state |= INPROGRESS;
6755	else
6756		freeblks = NULL;
6757	FREE_LOCK(ump);
6758	if (freeblks)
6759		handle_workitem_freeblocks(freeblks, 0);
6760	trunc_pages(ip, length, extblocks, flags);
6761
6762}
6763
6764/*
6765 * Flush a JOP_SYNC to the journal.
6766 */
6767void
6768softdep_journal_fsync(ip)
6769	struct inode *ip;
6770{
6771	struct jfsync *jfsync;
6772
6773	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
6774	    ("softdep_journal_fsync called on non-softdep filesystem"));
6775	if ((ip->i_flag & IN_TRUNCATED) == 0)
6776		return;
6777	ip->i_flag &= ~IN_TRUNCATED;
6778	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
6779	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump));
6780	jfsync->jfs_size = ip->i_size;
6781	jfsync->jfs_ino = ip->i_number;
6782	ACQUIRE_LOCK(ip->i_ump);
6783	add_to_journal(&jfsync->jfs_list);
6784	jwait(&jfsync->jfs_list, MNT_WAIT);
6785	FREE_LOCK(ip->i_ump);
6786}
6787
6788/*
6789 * Block de-allocation dependencies.
6790 *
6791 * When blocks are de-allocated, the on-disk pointers must be nullified before
6792 * the blocks are made available for use by other files.  (The true
6793 * requirement is that old pointers must be nullified before new on-disk
6794 * pointers are set.  We chose this slightly more stringent requirement to
6795 * reduce complexity.) Our implementation handles this dependency by updating
6796 * the inode (or indirect block) appropriately but delaying the actual block
6797 * de-allocation (i.e., freemap and free space count manipulation) until
6798 * after the updated versions reach stable storage.  After the disk is
6799 * updated, the blocks can be safely de-allocated whenever it is convenient.
6800 * This implementation handles only the common case of reducing a file's
6801 * length to zero. Other cases are handled by the conventional synchronous
6802 * write approach.
6803 *
6804 * The ffs implementation with which we worked double-checks
6805 * the state of the block pointers and file size as it reduces
6806 * a file's length.  Some of this code is replicated here in our
6807 * soft updates implementation.  The freeblks->fb_chkcnt field is
6808 * used to transfer a part of this information to the procedure
6809 * that eventually de-allocates the blocks.
6810 *
6811 * This routine should be called from the routine that shortens
6812 * a file's length, before the inode's size or block pointers
6813 * are modified. It will save the block pointer information for
6814 * later release and zero the inode so that the calling routine
6815 * can release it.
6816 */
6817void
6818softdep_setup_freeblocks(ip, length, flags)
6819	struct inode *ip;	/* The inode whose length is to be reduced */
6820	off_t length;		/* The new length for the file */
6821	int flags;		/* IO_EXT and/or IO_NORMAL */
6822{
6823	struct ufs1_dinode *dp1;
6824	struct ufs2_dinode *dp2;
6825	struct freeblks *freeblks;
6826	struct inodedep *inodedep;
6827	struct allocdirect *adp;
6828	struct ufsmount *ump;
6829	struct buf *bp;
6830	struct fs *fs;
6831	ufs2_daddr_t extblocks, datablocks;
6832	struct mount *mp;
6833	int i, delay, error, dflags;
6834	ufs_lbn_t tmpval;
6835	ufs_lbn_t lbn;
6836
6837	ump = ip->i_ump;
6838	mp = UFSTOVFS(ump);
6839	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6840	    ("softdep_setup_freeblocks called on non-softdep filesystem"));
6841	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
6842	    ip->i_number, length);
6843	KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
6844	fs = ip->i_fs;
6845	freeblks = newfreeblks(mp, ip);
6846	extblocks = 0;
6847	datablocks = 0;
6848	if (fs->fs_magic == FS_UFS2_MAGIC)
6849		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6850	if ((flags & IO_NORMAL) != 0) {
6851		for (i = 0; i < NDADDR; i++)
6852			setup_freedirect(freeblks, ip, i, 0);
6853		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6854		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
6855			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6856		ip->i_size = 0;
6857		DIP_SET(ip, i_size, 0);
6858		datablocks = DIP(ip, i_blocks) - extblocks;
6859	}
6860	if ((flags & IO_EXT) != 0) {
6861		for (i = 0; i < NXADDR; i++)
6862			setup_freeext(freeblks, ip, i, 0);
6863		ip->i_din2->di_extsize = 0;
6864		datablocks += extblocks;
6865	}
6866#ifdef QUOTA
6867	/* Reference the quotas in case the block count is wrong in the end. */
6868	quotaref(ITOV(ip), freeblks->fb_quota);
6869	(void) chkdq(ip, -datablocks, NOCRED, 0);
6870#endif
6871	freeblks->fb_chkcnt = -datablocks;
6872	UFS_LOCK(ump);
6873	fs->fs_pendingblocks += datablocks;
6874	UFS_UNLOCK(ump);
6875	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6876	/*
6877	 * Push the zero'ed inode to its disk buffer so that we are free
6878	 * to delete its dependencies below. Once the dependencies are gone
6879	 * the buffer can be safely released.
6880	 */
6881	if ((error = bread(ip->i_devvp,
6882	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6883	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6884		brelse(bp);
6885		softdep_error("softdep_setup_freeblocks", error);
6886	}
6887	if (ump->um_fstype == UFS1) {
6888		dp1 = ((struct ufs1_dinode *)bp->b_data +
6889		    ino_to_fsbo(fs, ip->i_number));
6890		ip->i_din1->di_freelink = dp1->di_freelink;
6891		*dp1 = *ip->i_din1;
6892	} else {
6893		dp2 = ((struct ufs2_dinode *)bp->b_data +
6894		    ino_to_fsbo(fs, ip->i_number));
6895		ip->i_din2->di_freelink = dp2->di_freelink;
6896		*dp2 = *ip->i_din2;
6897	}
6898	/*
6899	 * Find and eliminate any inode dependencies.
6900	 */
6901	ACQUIRE_LOCK(ump);
6902	dflags = DEPALLOC;
6903	if (IS_SNAPSHOT(ip))
6904		dflags |= NODELAY;
6905	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
6906	if ((inodedep->id_state & IOSTARTED) != 0)
6907		panic("softdep_setup_freeblocks: inode busy");
6908	/*
6909	 * Add the freeblks structure to the list of operations that
6910	 * must await the zero'ed inode being written to disk. If we
6911	 * still have a bitmap dependency (delay == 0), then the inode
6912	 * has never been written to disk, so we can process the
6913	 * freeblks below once we have deleted the dependencies.
6914	 */
6915	delay = (inodedep->id_state & DEPCOMPLETE);
6916	if (delay)
6917		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6918	else
6919		freeblks->fb_state |= COMPLETE;
6920	/*
6921	 * Because the file length has been truncated to zero, any
6922	 * pending block allocation dependency structures associated
6923	 * with this inode are obsolete and can simply be de-allocated.
6924	 * We must first merge the two dependency lists to get rid of
6925	 * any duplicate freefrag structures, then purge the merged list.
6926	 * If we still have a bitmap dependency, then the inode has never
6927	 * been written to disk, so we can free any fragments without delay.
6928	 */
6929	if (flags & IO_NORMAL) {
6930		merge_inode_lists(&inodedep->id_newinoupdt,
6931		    &inodedep->id_inoupdt);
6932		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != 0)
6933			cancel_allocdirect(&inodedep->id_inoupdt, adp,
6934			    freeblks);
6935	}
6936	if (flags & IO_EXT) {
6937		merge_inode_lists(&inodedep->id_newextupdt,
6938		    &inodedep->id_extupdt);
6939		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != 0)
6940			cancel_allocdirect(&inodedep->id_extupdt, adp,
6941			    freeblks);
6942	}
6943	FREE_LOCK(ump);
6944	bdwrite(bp);
6945	trunc_dependencies(ip, freeblks, -1, 0, flags);
6946	ACQUIRE_LOCK(ump);
6947	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
6948		(void) free_inodedep(inodedep);
6949	freeblks->fb_state |= DEPCOMPLETE;
6950	/*
6951	 * If the inode with zeroed block pointers is now on disk
6952	 * we can start freeing blocks.
6953	 */
6954	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
6955		freeblks->fb_state |= INPROGRESS;
6956	else
6957		freeblks = NULL;
6958	FREE_LOCK(ump);
6959	if (freeblks)
6960		handle_workitem_freeblocks(freeblks, 0);
6961	trunc_pages(ip, length, extblocks, flags);
6962}
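/*
 * For example, a caller passing IO_NORMAL | IO_EXT above schedules both
 * the file data and the UFS2 extended attribute area for release:
 * datablocks ends up equal to the inode's full i_blocks count, so
 * fb_chkcnt is set to minus that count and fs_pendingblocks grows by
 * the same amount until the freework items complete.
 */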
6963
6964/*
6965 * Eliminate pages from the page cache that back parts of this inode and
6966 * adjust the vnode pager's idea of our size.  This prevents stale data
6967 * from hanging around in the page cache.
6968 */
6969static void
6970trunc_pages(ip, length, extblocks, flags)
6971	struct inode *ip;
6972	off_t length;
6973	ufs2_daddr_t extblocks;
6974	int flags;
6975{
6976	struct vnode *vp;
6977	struct fs *fs;
6978	ufs_lbn_t lbn;
6979	off_t end, extend;
6980
6981	vp = ITOV(ip);
6982	fs = ip->i_fs;
6983	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
6984	if ((flags & IO_EXT) != 0)
6985		vn_pages_remove(vp, extend, 0);
6986	if ((flags & IO_NORMAL) == 0)
6987		return;
6988	BO_LOCK(&vp->v_bufobj);
6989	drain_output(vp);
6990	BO_UNLOCK(&vp->v_bufobj);
6991	/*
6992	 * The vnode pager eliminates file pages; we eliminate indirects
6993	 * below.
6994	 */
6995	vnode_pager_setsize(vp, length);
6996	/*
6997	 * Calculate the end based on the last indirect we want to keep.  If
6998	 * the block extends into indirects we can just use the negative of
6999	 * its lbn.  Doubles and triples exist at lower numbers so we must
7000	 * be careful not to remove those, if they exist.  Double and triple
7001	 * indirect lbns do not overlap with others, so it is not important
7002	 * to verify how many levels are required.
7003	 */
7004	lbn = lblkno(fs, length);
7005	if (lbn >= NDADDR) {
7006		/* Calculate the virtual lbn of the triple indirect. */
7007		lbn = -lbn - (NIADDR - 1);
7008		end = OFF_TO_IDX(lblktosize(fs, lbn));
7009	} else
7010		end = extend;
7011	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
7012}
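/*
 * As a concrete illustration of the calculation above (sample values
 * only): truncating to a length whose last data block is lbn 100 gives
 * lbn >= NDADDR, so the virtual lbn of the triple indirect is
 * -lbn - (NIADDR - 1) = -102 (NIADDR being 3), and the end handed to
 * vn_pages_remove() is derived from that lbn so the double and triple
 * indirects still covering retained data, which live at lower lbns,
 * are kept.
 */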
7013
7014/*
7015 * See if the buf bp is in the range eliminated by truncation.
7016 */
7017static int
7018trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
7019	struct buf *bp;
7020	int *blkoffp;
7021	ufs_lbn_t lastlbn;
7022	int lastoff;
7023	int flags;
7024{
7025	ufs_lbn_t lbn;
7026
7027	*blkoffp = 0;
7028	/* Only match ext/normal blocks as appropriate. */
7029	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
7030	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
7031		return (0);
7032	/* ALTDATA is always a full truncation. */
7033	if ((bp->b_xflags & BX_ALTDATA) != 0)
7034		return (1);
7035	/* -1 is full truncation. */
7036	if (lastlbn == -1)
7037		return (1);
7038	/*
7039	 * If this is a partial truncate we only want those
7040	 * blocks and indirect blocks that cover the range
7041	 * we're after.
7042	 */
7043	lbn = bp->b_lblkno;
7044	if (lbn < 0)
7045		lbn = -(lbn + lbn_level(lbn));
7046	if (lbn < lastlbn)
7047		return (0);
7048	/* Here we only truncate lblkno if it's partial. */
7049	if (lbn == lastlbn) {
7050		if (lastoff == 0)
7051			return (0);
7052		*blkoffp = lastoff;
7053	}
7054	return (1);
7055}
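/*
 * For example (illustrative values): a full truncation (lastlbn == -1)
 * matches every buffer of the requested type.  A partial truncate with
 * lastlbn == 20 and lastoff == 512 skips buffers whose (translated)
 * lbn is below 20, matches the buffer at lbn 20 with *blkoffp set to
 * 512 so only its tail is thrown away, and matches everything beyond
 * it outright.
 */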
7056
7057/*
7058 * Eliminate any dependencies that exist in memory beyond lblkno:off
7059 */
7060static void
7061trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
7062	struct inode *ip;
7063	struct freeblks *freeblks;
7064	ufs_lbn_t lastlbn;
7065	int lastoff;
7066	int flags;
7067{
7068	struct bufobj *bo;
7069	struct vnode *vp;
7070	struct buf *bp;
7071	int blkoff;
7072
7073	/*
7074	 * We must wait for any I/O in progress to finish so that
7075	 * all potential buffers on the dirty list will be visible.
7076	 * Once they are all there, walk the list and get rid of
7077	 * any dependencies.
7078	 */
7079	vp = ITOV(ip);
7080	bo = &vp->v_bufobj;
7081	BO_LOCK(bo);
7082	drain_output(vp);
7083	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
7084		bp->b_vflags &= ~BV_SCANNED;
7085restart:
7086	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
7087		if (bp->b_vflags & BV_SCANNED)
7088			continue;
7089		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7090			bp->b_vflags |= BV_SCANNED;
7091			continue;
7092		}
7093		KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
7094		if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
7095			goto restart;
7096		BO_UNLOCK(bo);
7097		if (deallocate_dependencies(bp, freeblks, blkoff))
7098			bqrelse(bp);
7099		else
7100			brelse(bp);
7101		BO_LOCK(bo);
7102		goto restart;
7103	}
7104	/*
7105	 * Now do the work of vtruncbuf while also matching indirect blocks.
7106	 */
7107	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
7108		bp->b_vflags &= ~BV_SCANNED;
7109cleanrestart:
7110	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
7111		if (bp->b_vflags & BV_SCANNED)
7112			continue;
7113		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7114			bp->b_vflags |= BV_SCANNED;
7115			continue;
7116		}
7117		if (BUF_LOCK(bp,
7118		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
7119		    BO_LOCKPTR(bo)) == ENOLCK) {
7120			BO_LOCK(bo);
7121			goto cleanrestart;
7122		}
7123		bp->b_vflags |= BV_SCANNED;
7124		bremfree(bp);
7125		if (blkoff != 0) {
7126			allocbuf(bp, blkoff);
7127			bqrelse(bp);
7128		} else {
7129			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
7130			brelse(bp);
7131		}
7132		BO_LOCK(bo);
7133		goto cleanrestart;
7134	}
7135	drain_output(vp);
7136	BO_UNLOCK(bo);
7137}
7138
7139static int
7140cancel_pagedep(pagedep, freeblks, blkoff)
7141	struct pagedep *pagedep;
7142	struct freeblks *freeblks;
7143	int blkoff;
7144{
7145	struct jremref *jremref;
7146	struct jmvref *jmvref;
7147	struct dirrem *dirrem, *tmp;
7148	int i;
7149
7150	/*
7151	 * Copy any directory remove dependencies to the list
7152	 * to be processed after the freeblks proceeds.  If the
7153	 * directory entries never made it to disk they
7154	 * can be dumped directly onto the work list.
7155	 */
7156	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
7157		/* Skip this directory removal if it is intended to remain. */
7158		if (dirrem->dm_offset < blkoff)
7159			continue;
7160		/*
7161		 * If there are any dirrems we wait for the journal write
7162		 * to complete and then restart the buf scan as the lock
7163		 * has been dropped.
7164		 */
7165		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
7166			jwait(&jremref->jr_list, MNT_WAIT);
7167			return (ERESTART);
7168		}
7169		LIST_REMOVE(dirrem, dm_next);
7170		dirrem->dm_dirinum = pagedep->pd_ino;
7171		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
7172	}
7173	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
7174		jwait(&jmvref->jm_list, MNT_WAIT);
7175		return (ERESTART);
7176	}
7177	/*
7178	 * When we're partially truncating a pagedep we just want to flush
7179	 * journal entries and return.  There can not be any adds in the
7180	 * truncated portion of the directory, and the newblk must remain if
7181	 * part of the block remains.
7182	 */
7183	if (blkoff != 0) {
7184		struct diradd *dap;
7185
7186		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7187			if (dap->da_offset > blkoff)
7188				panic("cancel_pagedep: diradd %p off %d > %d",
7189				    dap, dap->da_offset, blkoff);
7190		for (i = 0; i < DAHASHSZ; i++)
7191			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
7192				if (dap->da_offset > blkoff)
7193					panic("cancel_pagedep: diradd %p off %d > %d",
7194					    dap, dap->da_offset, blkoff);
7195		return (0);
7196	}
7197	/*
7198	 * There should be no directory add dependencies present
7199	 * as the directory could not be truncated until all
7200	 * children were removed.
7201	 */
7202	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
7203	    ("deallocate_dependencies: pendinghd != NULL"));
7204	for (i = 0; i < DAHASHSZ; i++)
7205		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
7206		    ("deallocate_dependencies: diraddhd != NULL"));
7207	if ((pagedep->pd_state & NEWBLOCK) != 0)
7208		free_newdirblk(pagedep->pd_newdirblk);
7209	if (free_pagedep(pagedep) == 0)
7210		panic("Failed to free pagedep %p", pagedep);
7211	return (0);
7212}
7213
7214/*
7215 * Reclaim any dependency structures from a buffer that is about to
7216 * be reallocated to a new vnode. The buffer must be locked; thus,
7217 * no I/O completion operations can occur while we are manipulating
7218 * its associated dependencies. The mutex is held so that other I/Os
7219 * associated with related dependencies do not occur.
7220 */
7221static int
7222deallocate_dependencies(bp, freeblks, off)
7223	struct buf *bp;
7224	struct freeblks *freeblks;
7225	int off;
7226{
7227	struct indirdep *indirdep;
7228	struct pagedep *pagedep;
7229	struct allocdirect *adp;
7230	struct worklist *wk, *wkn;
7231	struct ufsmount *ump;
7232
7233	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
7234		goto done;
7235	ump = VFSTOUFS(wk->wk_mp);
7236	ACQUIRE_LOCK(ump);
7237	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7238		switch (wk->wk_type) {
7239		case D_INDIRDEP:
7240			indirdep = WK_INDIRDEP(wk);
7241			if (bp->b_lblkno >= 0 ||
7242			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7243				panic("deallocate_dependencies: not indir");
7244			cancel_indirdep(indirdep, bp, freeblks);
7245			continue;
7246
7247		case D_PAGEDEP:
7248			pagedep = WK_PAGEDEP(wk);
7249			if (cancel_pagedep(pagedep, freeblks, off)) {
7250				FREE_LOCK(ump);
7251				return (ERESTART);
7252			}
7253			continue;
7254
7255		case D_ALLOCINDIR:
7256			/*
7257			 * Simply remove the allocindir; we'll find it via
7258			 * the indirdep, where we can clear pointers if
7259			 * needed.
7260			 */
7261			WORKLIST_REMOVE(wk);
7262			continue;
7263
7264		case D_FREEWORK:
7265			/*
7266			 * A truncation is waiting for the zero'd pointers
7267			 * to be written.  It can be freed when the freeblks
7268			 * is journaled.
7269			 */
7270			WORKLIST_REMOVE(wk);
7271			wk->wk_state |= ONDEPLIST;
7272			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7273			break;
7274
7275		case D_ALLOCDIRECT:
7276			adp = WK_ALLOCDIRECT(wk);
7277			if (off != 0)
7278				continue;
7279			/* FALLTHROUGH */
7280		default:
7281			panic("deallocate_dependencies: Unexpected type %s",
7282			    TYPENAME(wk->wk_type));
7283			/* NOTREACHED */
7284		}
7285	}
7286	FREE_LOCK(ump);
7287done:
7288	/*
7289	 * Don't throw away this buf; we were partially truncating and
7290	 * some deps may still remain.
7291	 */
7292	if (off) {
7293		allocbuf(bp, off);
7294		bp->b_vflags |= BV_SCANNED;
7295		return (EBUSY);
7296	}
7297	bp->b_flags |= B_INVAL | B_NOCACHE;
7298
7299	return (0);
7300}
7301
7302/*
7303 * An allocdirect is being canceled due to a truncate.  We must make sure
7304 * the journal entry is released in concert with the blkfree that releases
7305 * the storage.  Completed journal entries must not be released until the
7306 * space is no longer pointed to by the inode or in the bitmap.
7307 */
7308static void
7309cancel_allocdirect(adphead, adp, freeblks)
7310	struct allocdirectlst *adphead;
7311	struct allocdirect *adp;
7312	struct freeblks *freeblks;
7313{
7314	struct freework *freework;
7315	struct newblk *newblk;
7316	struct worklist *wk;
7317
7318	TAILQ_REMOVE(adphead, adp, ad_next);
7319	newblk = (struct newblk *)adp;
7320	freework = NULL;
7321	/*
7322	 * Find the correct freework structure.
7323	 */
7324	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7325		if (wk->wk_type != D_FREEWORK)
7326			continue;
7327		freework = WK_FREEWORK(wk);
7328		if (freework->fw_blkno == newblk->nb_newblkno)
7329			break;
7330	}
7331	if (freework == NULL)
7332		panic("cancel_allocdirect: Freework not found");
7333	/*
7334	 * If a newblk exists at all we still have the journal entry that
7335	 * initiated the allocation so we do not need to journal the free.
7336	 */
7337	cancel_jfreeblk(freeblks, freework->fw_blkno);
7338	/*
7339	 * If the journal hasn't been written the jnewblk must be passed
7340	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
7341	 * this by linking the journal dependency into the freework to be
7342	 * freed when freework_freeblock() is called.  If the journal has
7343	 * been written we can simply reclaim the journal space when the
7344	 * freeblks work is complete.
7345	 */
7346	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7347	    &freeblks->fb_jwork);
7348	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7349}
7350
7351
7352/*
7353 * Cancel a new block allocation.  May be an indirect or direct block.  We
7354 * remove it from various lists and return any journal record that needs to
7355 * be resolved by the caller.
7356 *
7357 * A special consideration is made for indirects which were never pointed
7358 * at on disk and will never be found once this block is released.
7359 */
7360static struct jnewblk *
7361cancel_newblk(newblk, wk, wkhd)
7362	struct newblk *newblk;
7363	struct worklist *wk;
7364	struct workhead *wkhd;
7365{
7366	struct jnewblk *jnewblk;
7367
7368	CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7369
7370	newblk->nb_state |= GOINGAWAY;
7371	/*
7372	 * Previously we traversed the completedhd on each indirdep
7373	 * attached to this newblk to cancel them and gather journal
7374	 * work.  Since we need only the oldest journal segment, and
7375	 * the lowest point on the tree will always have the oldest
7376	 * journal segment, we are free to release the segments
7377	 * of any subordinates and may leave the indirdep list to
7378	 * indirdep_complete() when this newblk is freed.
7379	 */
7380	if (newblk->nb_state & ONDEPLIST) {
7381		newblk->nb_state &= ~ONDEPLIST;
7382		LIST_REMOVE(newblk, nb_deps);
7383	}
7384	if (newblk->nb_state & ONWORKLIST)
7385		WORKLIST_REMOVE(&newblk->nb_list);
7386	/*
7387	 * If the journal entry hasn't been written we save a pointer to
7388	 * the dependency that frees it until it is written or the
7389	 * superseding operation completes.
7390	 */
7391	jnewblk = newblk->nb_jnewblk;
7392	if (jnewblk != NULL && wk != NULL) {
7393		newblk->nb_jnewblk = NULL;
7394		jnewblk->jn_dep = wk;
7395	}
7396	if (!LIST_EMPTY(&newblk->nb_jwork))
7397		jwork_move(wkhd, &newblk->nb_jwork);
7398	/*
7399	 * When truncating we must free the newdirblk early to remove
7400	 * the pagedep from the hash before returning.
7401	 */
7402	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7403		free_newdirblk(WK_NEWDIRBLK(wk));
7404	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7405		panic("cancel_newblk: extra newdirblk");
7406
7407	return (jnewblk);
7408}
7409
7410/*
7411 * Schedule the freefrag associated with a newblk to be released once
7412 * the pointers are written and the previous block is no longer needed.
7413 */
7414static void
7415newblk_freefrag(newblk)
7416	struct newblk *newblk;
7417{
7418	struct freefrag *freefrag;
7419
7420	if (newblk->nb_freefrag == NULL)
7421		return;
7422	freefrag = newblk->nb_freefrag;
7423	newblk->nb_freefrag = NULL;
7424	freefrag->ff_state |= COMPLETE;
7425	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7426		add_to_worklist(&freefrag->ff_list, 0);
7427}
7428
7429/*
7430 * Free a newblk. Generate a new freefrag work request if appropriate.
7431 * This must be called after the inode pointer and any direct block pointers
7432 * are valid or fully removed via truncate or frag extension.
7433 */
7434static void
7435free_newblk(newblk)
7436	struct newblk *newblk;
7437{
7438	struct indirdep *indirdep;
7439	struct worklist *wk;
7440
7441	KASSERT(newblk->nb_jnewblk == NULL,
7442	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7443	KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7444	    ("free_newblk: unclaimed newblk"));
7445	LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
7446	newblk_freefrag(newblk);
7447	if (newblk->nb_state & ONDEPLIST)
7448		LIST_REMOVE(newblk, nb_deps);
7449	if (newblk->nb_state & ONWORKLIST)
7450		WORKLIST_REMOVE(&newblk->nb_list);
7451	LIST_REMOVE(newblk, nb_hash);
7452	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7453		free_newdirblk(WK_NEWDIRBLK(wk));
7454	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7455		panic("free_newblk: extra newdirblk");
7456	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7457		indirdep_complete(indirdep);
7458	handle_jwork(&newblk->nb_jwork);
7459	WORKITEM_FREE(newblk, D_NEWBLK);
7460}
7461
7462/*
7463 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7464 * This routine must be called with splbio interrupts blocked.
7465 */
7466static void
7467free_newdirblk(newdirblk)
7468	struct newdirblk *newdirblk;
7469{
7470	struct pagedep *pagedep;
7471	struct diradd *dap;
7472	struct worklist *wk;
7473
7474	LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
7475	WORKLIST_REMOVE(&newdirblk->db_list);
7476	/*
7477	 * If the pagedep is still linked onto the directory buffer
7478	 * dependency chain, then some of the entries on the
7479	 * pd_pendinghd list may not be committed to disk yet. In
7480	 * this case, we will simply clear the NEWBLOCK flag and
7481	 * let the pd_pendinghd list be processed when the pagedep
7482	 * is next written. If the pagedep is no longer on the buffer
7483	 * dependency chain, then all the entries on the pd_pending
7484	 * list are committed to disk and we can free them here.
7485	 */
7486	pagedep = newdirblk->db_pagedep;
7487	pagedep->pd_state &= ~NEWBLOCK;
7488	if ((pagedep->pd_state & ONWORKLIST) == 0) {
7489		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7490			free_diradd(dap, NULL);
7491		/*
7492		 * If no dependencies remain, the pagedep will be freed.
7493		 */
7494		free_pagedep(pagedep);
7495	}
7496	/* Should only ever be one item in the list. */
7497	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7498		WORKLIST_REMOVE(wk);
7499		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7500	}
7501	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7502}
7503
7504/*
7505 * Prepare an inode to be freed. The actual free operation is not
7506 * done until the zero'ed inode has been written to disk.
7507 */
7508void
7509softdep_freefile(pvp, ino, mode)
7510	struct vnode *pvp;
7511	ino_t ino;
7512	int mode;
7513{
7514	struct inode *ip = VTOI(pvp);
7515	struct inodedep *inodedep;
7516	struct freefile *freefile;
7517	struct freeblks *freeblks;
7518	struct ufsmount *ump;
7519
7520	ump = ip->i_ump;
7521	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7522	    ("softdep_freefile called on non-softdep filesystem"));
7523	/*
7524	 * This sets up the inode de-allocation dependency.
7525	 */
7526	freefile = malloc(sizeof(struct freefile),
7527		M_FREEFILE, M_SOFTDEP_FLAGS);
7528	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7529	freefile->fx_mode = mode;
7530	freefile->fx_oldinum = ino;
7531	freefile->fx_devvp = ip->i_devvp;
7532	LIST_INIT(&freefile->fx_jwork);
7533	UFS_LOCK(ump);
7534	ip->i_fs->fs_pendinginodes += 1;
7535	UFS_UNLOCK(ump);
7536
7537	/*
7538	 * If the inodedep does not exist, then the zero'ed inode has
7539	 * been written to disk. If the allocated inode has never been
7540	 * written to disk, then the on-disk inode is zero'ed. In either
7541	 * case we can free the file immediately.  If the journal was
7542	 * canceled before being written the inode will never make it to
7543	 * disk and we must send the canceled journal entries to
7544	 * ffs_freefile() to be cleared in conjunction with the bitmap.
7545	 * Any blocks waiting on the inode to write can be safely freed
7546	 * here as it will never be written.
7547	 */
7548	ACQUIRE_LOCK(ump);
7549	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7550	if (inodedep) {
7551		/*
7552		 * Clear out freeblks that no longer need to reference
7553		 * this inode.
7554		 */
7555		while ((freeblks =
7556		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7557			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7558			    fb_next);
7559			freeblks->fb_state &= ~ONDEPLIST;
7560		}
7561		/*
7562		 * Remove this inode from the unlinked list.
7563		 */
7564		if (inodedep->id_state & UNLINKED) {
7565			/*
7566			 * Save the journal work to be freed with the bitmap
7567			 * before we clear UNLINKED.  Otherwise it can be lost
7568			 * if the inode block is written.
7569			 */
7570			handle_bufwait(inodedep, &freefile->fx_jwork);
7571			clear_unlinked_inodedep(inodedep);
7572			/*
7573			 * Re-acquire inodedep as we've dropped the
7574			 * per-filesystem lock in clear_unlinked_inodedep().
7575			 */
7576			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7577		}
7578	}
7579	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7580		FREE_LOCK(ump);
7581		handle_workitem_freefile(freefile);
7582		return;
7583	}
7584	if ((inodedep->id_state & DEPCOMPLETE) == 0)
7585		inodedep->id_state |= GOINGAWAY;
7586	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7587	FREE_LOCK(ump);
7588	if (ip->i_number == ino)
7589		ip->i_flag |= IN_MODIFIED;
7590}
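/*
 * For example: if the inode never reached the disk
 * (check_inode_unwritten() succeeds) or no inodedep exists at all,
 * handle_workitem_freefile() runs immediately and the bitmap can be
 * cleared right away; otherwise the freefile work item sits on
 * id_inowait until the zero'ed inode block has been written.
 */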
7591
7592/*
7593 * Check to see if an inode has never been written to disk. If
7594 * so, free the inodedep and return success; otherwise return failure.
7595 * This routine must be called with splbio interrupts blocked.
7596 *
7597 * If we still have a bitmap dependency, then the inode has never
7598 * been written to disk. Drop the dependency as it is no longer
7599 * necessary since the inode is being deallocated. We set the
7600 * ALLCOMPLETE flags since the bitmap now properly shows that the
7601 * inode is not allocated. Even if the inode is actively being
7602 * written, it has been rolled back to its zero'ed state, so we
7603 * are assured that a zero inode is what is on the disk. For short
7604 * lived files, this change will usually result in removing all the
7605 * dependencies from the inode so that it can be freed immediately.
7606 */
7607static int
7608check_inode_unwritten(inodedep)
7609	struct inodedep *inodedep;
7610{
7611
7612	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7613
7614	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7615	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7616	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7617	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7618	    !LIST_EMPTY(&inodedep->id_inowait) ||
7619	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7620	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7621	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7622	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7623	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7624	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7625	    inodedep->id_mkdiradd != NULL ||
7626	    inodedep->id_nlinkdelta != 0)
7627		return (0);
7628	/*
7629	 * Another process might be in initiate_write_inodeblock_ufs[12]
7630	 * trying to allocate memory without holding "Softdep Lock".
7631	 */
7632	if ((inodedep->id_state & IOSTARTED) != 0 &&
7633	    inodedep->id_savedino1 == NULL)
7634		return (0);
7635
7636	if (inodedep->id_state & ONDEPLIST)
7637		LIST_REMOVE(inodedep, id_deps);
7638	inodedep->id_state &= ~ONDEPLIST;
7639	inodedep->id_state |= ALLCOMPLETE;
7640	inodedep->id_bmsafemap = NULL;
7641	if (inodedep->id_state & ONWORKLIST)
7642		WORKLIST_REMOVE(&inodedep->id_list);
7643	if (inodedep->id_savedino1 != NULL) {
7644		free(inodedep->id_savedino1, M_SAVEDINO);
7645		inodedep->id_savedino1 = NULL;
7646	}
7647	if (free_inodedep(inodedep) == 0)
7648		panic("check_inode_unwritten: busy inode");
7649	return (1);
7650}
7651
7652static int
7653check_inodedep_free(inodedep)
7654	struct inodedep *inodedep;
7655{
7656
7657	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7658	if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7659	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7660	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7661	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7662	    !LIST_EMPTY(&inodedep->id_inowait) ||
7663	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7664	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7665	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7666	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7667	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7668	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7669	    inodedep->id_mkdiradd != NULL ||
7670	    inodedep->id_nlinkdelta != 0 ||
7671	    inodedep->id_savedino1 != NULL)
7672		return (0);
7673	return (1);
7674}
7675
7676/*
7677 * Try to free an inodedep structure. Return 1 if it could be freed.
7678 */
7679static int
7680free_inodedep(inodedep)
7681	struct inodedep *inodedep;
7682{
7683
7684	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7685	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7686	    !check_inodedep_free(inodedep))
7687		return (0);
7688	if (inodedep->id_state & ONDEPLIST)
7689		LIST_REMOVE(inodedep, id_deps);
7690	LIST_REMOVE(inodedep, id_hash);
7691	WORKITEM_FREE(inodedep, D_INODEDEP);
7692	return (1);
7693}
7694
7695/*
7696 * Free the block referenced by a freework structure.  The parent freeblks
7697 * structure is released and completed when the final cg bitmap reaches
7698 * the disk.  This routine may be freeing a jnewblk which never made it to
7699 * disk in which case we do not have to wait as the operation is undone
7700 * in memory immediately.
7701 */
7702static void
7703freework_freeblock(freework)
7704	struct freework *freework;
7705{
7706	struct freeblks *freeblks;
7707	struct jnewblk *jnewblk;
7708	struct ufsmount *ump;
7709	struct workhead wkhd;
7710	struct fs *fs;
7711	int bsize;
7712	int needj;
7713
7714	ump = VFSTOUFS(freework->fw_list.wk_mp);
7715	LOCK_OWNED(ump);
7716	/*
7717	 * Handle partial truncate separately.
7718	 */
7719	if (freework->fw_indir) {
7720		complete_trunc_indir(freework);
7721		return;
7722	}
7723	freeblks = freework->fw_freeblks;
7724	fs = ump->um_fs;
7725	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7726	bsize = lfragtosize(fs, freework->fw_frags);
7727	LIST_INIT(&wkhd);
7728	/*
7729	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7730	 * on the indirblk hashtable and prevents premature freeing.
7731	 */
7732	freework->fw_state |= DEPCOMPLETE;
7733	/*
7734	 * SUJ needs to wait for the segment referencing freed indirect
7735	 * blocks to expire so that we know the checker will not confuse
7736	 * a re-allocated indirect block with its old contents.
7737	 */
7738	if (needj && freework->fw_lbn <= -NDADDR)
7739		indirblk_insert(freework);
7740	/*
7741	 * If we are canceling an existing jnewblk pass it to the free
7742	 * routine, otherwise pass the freeblk which will ultimately
7743	 * release the freeblks.  If we're not journaling, we can just
7744	 * free the freeblks immediately.
7745	 */
7746	jnewblk = freework->fw_jnewblk;
7747	if (jnewblk != NULL) {
7748		cancel_jnewblk(jnewblk, &wkhd);
7749		needj = 0;
7750	} else if (needj) {
7751		freework->fw_state |= DELAYEDFREE;
7752		freeblks->fb_cgwait++;
7753		WORKLIST_INSERT(&wkhd, &freework->fw_list);
7754	}
7755	FREE_LOCK(ump);
7756	freeblks_free(ump, freeblks, btodb(bsize));
7757	CTR4(KTR_SUJ,
7758	    "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
7759	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
7760	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
7761	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
7762	ACQUIRE_LOCK(ump);
7763	/*
7764	 * The jnewblk will be discarded and the bits in the map never
7765	 * made it to disk.  We can immediately free the freeblk.
7766	 */
7767	if (needj == 0)
7768		handle_written_freework(freework);
7769}
7770
7771/*
7772 * We enqueue freework items that need processing back on the freeblks and
7773 * add the freeblks to the worklist.  This makes it easier to find all work
7774 * required to flush a truncation in process_truncates().
7775 */
7776static void
7777freework_enqueue(freework)
7778	struct freework *freework;
7779{
7780	struct freeblks *freeblks;
7781
7782	freeblks = freework->fw_freeblks;
7783	if ((freework->fw_state & INPROGRESS) == 0)
7784		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
7785	if ((freeblks->fb_state &
7786	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7787	    LIST_EMPTY(&freeblks->fb_jblkdephd))
7788		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7789}
7790
7791/*
7792 * Start, continue, or finish the process of freeing an indirect block tree.
7793 * The free operation may be paused at any point with fw_off containing the
7794 * offset to restart from.  This enables us to implement some flow control
7795 * for large truncates which may fan out and generate a huge number of
7796 * dependencies.
7797 */
7798static void
7799handle_workitem_indirblk(freework)
7800	struct freework *freework;
7801{
7802	struct freeblks *freeblks;
7803	struct ufsmount *ump;
7804	struct fs *fs;
7805
7806	freeblks = freework->fw_freeblks;
7807	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7808	fs = ump->um_fs;
7809	if (freework->fw_state & DEPCOMPLETE) {
7810		handle_written_freework(freework);
7811		return;
7812	}
7813	if (freework->fw_off == NINDIR(fs)) {
7814		freework_freeblock(freework);
7815		return;
7816	}
7817	freework->fw_state |= INPROGRESS;
7818	FREE_LOCK(ump);
7819	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7820	    freework->fw_lbn);
7821	ACQUIRE_LOCK(ump);
7822}
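/*
 * To give a sense of the fan-out that fw_off guards against
 * (illustrative figures, assuming a UFS2 filesystem with 16KB blocks
 * and 8-byte block pointers): NINDIR(fs) is 2048, so one double
 * indirect holds 2048 pointers to first-level indirects covering
 * roughly 4 million data blocks.  fw_off lets indir_trunc() stop
 * partway through such a level and resume later instead of creating
 * all of those dependencies at once.
 */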
7823
7824/*
7825 * Called when a freework structure attached to a cg buf is written.  The
7826 * ref on either the parent or the freeblks structure is released and
7827 * the freeblks is added back to the worklist if there is more work to do.
7828 */
7829static void
7830handle_written_freework(freework)
7831	struct freework *freework;
7832{
7833	struct freeblks *freeblks;
7834	struct freework *parent;
7835
7836	freeblks = freework->fw_freeblks;
7837	parent = freework->fw_parent;
7838	if (freework->fw_state & DELAYEDFREE)
7839		freeblks->fb_cgwait--;
7840	freework->fw_state |= COMPLETE;
7841	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7842		WORKITEM_FREE(freework, D_FREEWORK);
7843	if (parent) {
7844		if (--parent->fw_ref == 0)
7845			freework_enqueue(parent);
7846		return;
7847	}
7848	if (--freeblks->fb_ref != 0)
7849		return;
7850	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7851	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7852		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7853}
7854
7855/*
7856 * This workitem routine performs the block de-allocation.
7857 * The workitem is added to the pending list after the updated
7858 * inode block has been written to disk.  As mentioned above,
7859 * checks regarding the number of blocks de-allocated (compared
7860 * to the number of blocks allocated for the file) are also
7861 * performed in this function.
7862 */
7863static int
7864handle_workitem_freeblocks(freeblks, flags)
7865	struct freeblks *freeblks;
7866	int flags;
7867{
7868	struct freework *freework;
7869	struct newblk *newblk;
7870	struct allocindir *aip;
7871	struct ufsmount *ump;
7872	struct worklist *wk;
7873
7874	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7875	    ("handle_workitem_freeblocks: Journal entries not written."));
7876	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7877	ACQUIRE_LOCK(ump);
7878	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7879		WORKLIST_REMOVE(wk);
7880		switch (wk->wk_type) {
7881		case D_DIRREM:
7882			wk->wk_state |= COMPLETE;
7883			add_to_worklist(wk, 0);
7884			continue;
7885
7886		case D_ALLOCDIRECT:
7887			free_newblk(WK_NEWBLK(wk));
7888			continue;
7889
7890		case D_ALLOCINDIR:
7891			aip = WK_ALLOCINDIR(wk);
7892			freework = NULL;
7893			if (aip->ai_state & DELAYEDFREE) {
7894				FREE_LOCK(ump);
7895				freework = newfreework(ump, freeblks, NULL,
7896				    aip->ai_lbn, aip->ai_newblkno,
7897				    ump->um_fs->fs_frag, 0, 0);
7898				ACQUIRE_LOCK(ump);
7899			}
7900			newblk = WK_NEWBLK(wk);
7901			if (newblk->nb_jnewblk) {
7902				freework->fw_jnewblk = newblk->nb_jnewblk;
7903				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7904				newblk->nb_jnewblk = NULL;
7905			}
7906			free_newblk(newblk);
7907			continue;
7908
7909		case D_FREEWORK:
7910			freework = WK_FREEWORK(wk);
7911			if (freework->fw_lbn <= -NDADDR)
7912				handle_workitem_indirblk(freework);
7913			else
7914				freework_freeblock(freework);
7915			continue;
7916		default:
7917			panic("handle_workitem_freeblocks: Unknown type %s",
7918			    TYPENAME(wk->wk_type));
7919		}
7920	}
7921	if (freeblks->fb_ref != 0) {
7922		freeblks->fb_state &= ~INPROGRESS;
7923		wake_worklist(&freeblks->fb_list);
7924		freeblks = NULL;
7925	}
7926	FREE_LOCK(ump);
7927	if (freeblks)
7928		return handle_complete_freeblocks(freeblks, flags);
7929	return (0);
7930}
7931
7932/*
7933 * Handle completion of block free via truncate.  This allows fs_pendingblocks
7934 * to track the actual free block count more closely than if we only updated
7935 * it at the end.  We must be careful to handle cases where the block count
7936 * on free was incorrect.
7937 */
7938static void
7939freeblks_free(ump, freeblks, blocks)
7940	struct ufsmount *ump;
7941	struct freeblks *freeblks;
7942	int blocks;
7943{
7944	struct fs *fs;
7945	ufs2_daddr_t remain;
7946
7947	UFS_LOCK(ump);
7948	remain = -freeblks->fb_chkcnt;
7949	freeblks->fb_chkcnt += blocks;
7950	if (remain > 0) {
7951		if (remain < blocks)
7952			blocks = remain;
7953		fs = ump->um_fs;
7954		fs->fs_pendingblocks -= blocks;
7955	}
7956	UFS_UNLOCK(ump);
7957}
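/*
 * A small worked example of the accounting (illustrative numbers):
 * softdep_setup_freeblocks() set fb_chkcnt to -100 for a 100-block
 * file, so remain starts at 100.  A call releasing 40 blocks moves
 * fb_chkcnt to -60 and takes 40 off fs_pendingblocks; once fb_chkcnt
 * reaches zero (or goes positive because the original count was
 * wrong), fs_pendingblocks is no longer adjusted here.
 */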
7958
7959/*
7960 * Once all of the freework workitems are complete we can retire the
7961 * freeblocks dependency and any journal work awaiting completion.  This
7962 * can not be called until all other dependencies are stable on disk.
7963 */
7964static int
7965handle_complete_freeblocks(freeblks, flags)
7966	struct freeblks *freeblks;
7967	int flags;
7968{
7969	struct inodedep *inodedep;
7970	struct inode *ip;
7971	struct vnode *vp;
7972	struct fs *fs;
7973	struct ufsmount *ump;
7974	ufs2_daddr_t spare;
7975
7976	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7977	fs = ump->um_fs;
7978	flags = LK_EXCLUSIVE | flags;
7979	spare = freeblks->fb_chkcnt;
7980
7981	/*
7982	 * If we did not release the expected number of blocks we may have
7983	 * to adjust the inode block count here.  Only do so if it wasn't
7984	 * a truncation to zero and the modrev still matches.
7985	 */
7986	if (spare && freeblks->fb_len != 0) {
7987		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7988		    flags, &vp, FFSV_FORCEINSMQ) != 0)
7989			return (EBUSY);
7990		ip = VTOI(vp);
7991		if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
7992			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
7993			ip->i_flag |= IN_CHANGE;
7994			/*
7995			 * We must wait so this happens before the
7996			 * journal is reclaimed.
7997			 */
7998			ffs_update(vp, 1);
7999		}
8000		vput(vp);
8001	}
8002	if (spare < 0) {
8003		UFS_LOCK(ump);
8004		fs->fs_pendingblocks += spare;
8005		UFS_UNLOCK(ump);
8006	}
8007#ifdef QUOTA
8008	/* Handle spare. */
8009	if (spare)
8010		quotaadj(freeblks->fb_quota, ump, -spare);
8011	quotarele(freeblks->fb_quota);
8012#endif
8013	ACQUIRE_LOCK(ump);
8014	if (freeblks->fb_state & ONDEPLIST) {
8015		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8016		    0, &inodedep);
8017		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
8018		freeblks->fb_state &= ~ONDEPLIST;
8019		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
8020			free_inodedep(inodedep);
8021	}
8022	/*
8023	 * All of the freeblock deps must be complete prior to this call
8024	 * so it's now safe to complete earlier outstanding journal entries.
8025	 */
8026	handle_jwork(&freeblks->fb_jwork);
8027	WORKITEM_FREE(freeblks, D_FREEBLKS);
8028	FREE_LOCK(ump);
8029	return (0);
8030}
8031
8032/*
8033 * Release blocks associated with the freeblks and stored in the indirect
8034 * block dbn. If level is greater than SINGLE, the block is an indirect block
8035 * and recursive calls to indir_trunc must be used to cleanse other indirect
8036 * blocks.
8037 *
8038 * This handles partial and complete truncation of blocks.  Partial is noted
8039 * with goingaway == 0.  In this case the freework is completed after the
8040 * zero'd indirects are written to disk.  For full truncation the freework
8041 * is completed after the block is freed.
8042 */
8043static void
8044indir_trunc(freework, dbn, lbn)
8045	struct freework *freework;
8046	ufs2_daddr_t dbn;
8047	ufs_lbn_t lbn;
8048{
8049	struct freework *nfreework;
8050	struct workhead wkhd;
8051	struct freeblks *freeblks;
8052	struct buf *bp;
8053	struct fs *fs;
8054	struct indirdep *indirdep;
8055	struct ufsmount *ump;
8056	ufs1_daddr_t *bap1 = 0;
8057	ufs2_daddr_t nb, nnb, *bap2 = 0;
8058	ufs_lbn_t lbnadd, nlbn;
8059	int i, nblocks, ufs1fmt;
8060	int freedblocks;
8061	int goingaway;
8062	int freedeps;
8063	int needj;
8064	int level;
8065	int cnt;
8066
8067	freeblks = freework->fw_freeblks;
8068	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8069	fs = ump->um_fs;
8070	/*
8071	 * Get buffer of block pointers to be freed.  There are three cases:
8072	 *
8073	 * 1) Partial truncate caches the indirdep pointer in the freework
8074	 *    which provides us a back pointer to the saved bp which holds the
8075	 *    pointers we want to clear.  When this completes the zero
8076	 *    pointers are written to the real copy.
8077	 * 2) The indirect is being completely truncated, cancel_indirdep()
8078	 *    eliminated the real copy and placed the indirdep on the saved
8079	 *    copy.  The indirdep and buf are discarded when this completes.
8080	 * 3) The indirect was not in memory, we read a copy off of the disk
8081	 *    using the devvp and drop and invalidate the buffer when we're
8082	 *    done.
8083	 */
8084	goingaway = 1;
8085	indirdep = NULL;
8086	if (freework->fw_indir != NULL) {
8087		goingaway = 0;
8088		indirdep = freework->fw_indir;
8089		bp = indirdep->ir_savebp;
8090		if (bp == NULL || bp->b_blkno != dbn)
8091			panic("indir_trunc: Bad saved buf %p blkno %jd",
8092			    bp, (intmax_t)dbn);
8093	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
8094		/*
8095		 * The lock prevents the buf dep list from changing and
8096		 * indirects on devvp should only ever have one dependency.
8097		 */
8098		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
8099		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
8100			panic("indir_trunc: Bad indirdep %p from buf %p",
8101			    indirdep, bp);
8102	} else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
8103	    NOCRED, &bp) != 0) {
8104		brelse(bp);
8105		return;
8106	}
8107	ACQUIRE_LOCK(ump);
8108	/* Protects against a race with complete_trunc_indir(). */
8109	freework->fw_state &= ~INPROGRESS;
8110	/*
8111	 * If we have an indirdep we need to enforce the truncation order
8112	 * and discard it when it is complete.
8113	 */
8114	if (indirdep) {
8115		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
8116		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
8117			/*
8118			 * Add the complete truncate to the list on the
8119			 * indirdep to enforce in-order processing.
8120			 */
8121			if (freework->fw_indir == NULL)
8122				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
8123				    freework, fw_next);
8124			FREE_LOCK(ump);
8125			return;
8126		}
8127		/*
8128		 * If we're goingaway, free the indirdep.  Otherwise it will
8129		 * linger until the write completes.
8130		 */
8131		if (goingaway)
8132			free_indirdep(indirdep);
8133	}
8134	FREE_LOCK(ump);
8135	/* Initialize pointers depending on block size. */
8136	if (ump->um_fstype == UFS1) {
8137		bap1 = (ufs1_daddr_t *)bp->b_data;
8138		nb = bap1[freework->fw_off];
8139		ufs1fmt = 1;
8140	} else {
8141		bap2 = (ufs2_daddr_t *)bp->b_data;
8142		nb = bap2[freework->fw_off];
8143		ufs1fmt = 0;
8144	}
8145	level = lbn_level(lbn);
8146	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
8147	lbnadd = lbn_offset(fs, level);
8148	nblocks = btodb(fs->fs_bsize);
8149	nfreework = freework;
8150	freedeps = 0;
8151	cnt = 0;
8152	/*
8153	 * Reclaim blocks.  Traverses into nested indirect levels and,
8154	 * when journaling, arranges for the current level to be freed
8155	 * only after its subordinates are free.
8156	 */
8157	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
8158		if (i != NINDIR(fs) - 1) {
8159			if (ufs1fmt)
8160				nnb = bap1[i+1];
8161			else
8162				nnb = bap2[i+1];
8163		} else
8164			nnb = 0;
8165		if (nb == 0)
8166			continue;
8167		cnt++;
8168		if (level != 0) {
8169			nlbn = (lbn + 1) - (i * lbnadd);
8170			if (needj != 0) {
8171				nfreework = newfreework(ump, freeblks, freework,
8172				    nlbn, nb, fs->fs_frag, 0, 0);
8173				freedeps++;
8174			}
8175			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
8176		} else {
8177			struct freedep *freedep;
8178
8179			/*
8180			 * Attempt to aggregate freedep dependencies for
8181			 * all blocks being released to the same CG.
8182			 */
8183			LIST_INIT(&wkhd);
8184			if (needj != 0 &&
8185			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
8186				freedep = newfreedep(freework);
8187				WORKLIST_INSERT_UNLOCKED(&wkhd,
8188				    &freedep->fd_list);
8189				freedeps++;
8190			}
8191			CTR3(KTR_SUJ,
8192			    "indir_trunc: ino %d blkno %jd size %ld",
8193			    freeblks->fb_inum, nb, fs->fs_bsize);
8194			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
8195			    fs->fs_bsize, freeblks->fb_inum,
8196			    freeblks->fb_vtype, &wkhd);
8197		}
8198	}
8199	if (goingaway) {
8200		bp->b_flags |= B_INVAL | B_NOCACHE;
8201		brelse(bp);
8202	}
8203	freedblocks = 0;
8204	if (level == 0)
8205		freedblocks = (nblocks * cnt);
8206	if (needj == 0)
8207		freedblocks += nblocks;
8208	freeblks_free(ump, freeblks, freedblocks);
8209	/*
8210	 * If we are journaling set up the ref counts and offset so this
8211	 * indirect can be completed when its children are free.
8212	 */
8213	if (needj) {
8214		ACQUIRE_LOCK(ump);
8215		freework->fw_off = i;
8216		freework->fw_ref += freedeps;
8217		freework->fw_ref -= NINDIR(fs) + 1;
8218		if (level == 0)
8219			freeblks->fb_cgwait += freedeps;
8220		if (freework->fw_ref == 0)
8221			freework_freeblock(freework);
8222		FREE_LOCK(ump);
8223		return;
8224	}
8225	/*
8226	 * If we're not journaling we can free the indirect now.
8227	 */
8228	dbn = dbtofsb(fs, dbn);
8229	CTR3(KTR_SUJ,
8230	    "indir_trunc 2: ino %d blkno %jd size %ld",
8231	    freeblks->fb_inum, dbn, fs->fs_bsize);
8232	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
8233	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
8234	/* Non-SUJ softdep does single-threaded truncations. */
8235	if (freework->fw_blkno == dbn) {
8236		freework->fw_state |= ALLCOMPLETE;
8237		ACQUIRE_LOCK(ump);
8238		handle_written_freework(freework);
8239		FREE_LOCK(ump);
8240	}
8241	return;
8242}
8243
8244/*
8245 * Cancel an allocindir when it is removed via truncation.  When bp is not
8246 * NULL the indirect never appeared on disk and is scheduled to be freed
8247 * independently of the indir so we can more easily track journal work.
8248 */
8249static void
8250cancel_allocindir(aip, bp, freeblks, trunc)
8251	struct allocindir *aip;
8252	struct buf *bp;
8253	struct freeblks *freeblks;
8254	int trunc;
8255{
8256	struct indirdep *indirdep;
8257	struct freefrag *freefrag;
8258	struct newblk *newblk;
8259
8260	newblk = (struct newblk *)aip;
8261	LIST_REMOVE(aip, ai_next);
8262	/*
8263	 * We must eliminate the pointer in bp if it must be freed on its
8264	 * own due to partial truncate or pending journal work.
8265	 */
8266	if (bp && (trunc || newblk->nb_jnewblk)) {
8267		/*
8268		 * Clear the pointer and mark the aip to be freed
8269		 * directly if it never existed on disk.
8270		 */
8271		aip->ai_state |= DELAYEDFREE;
8272		indirdep = aip->ai_indirdep;
8273		if (indirdep->ir_state & UFS1FMT)
8274			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8275		else
8276			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8277	}
8278	/*
8279	 * When truncating the previous pointer will be freed via
8280	 * savedbp.  Eliminate the freefrag, which would be a duplicate free.
8281	 */
8282	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8283		newblk->nb_freefrag = NULL;
8284		if (freefrag->ff_jdep)
8285			cancel_jfreefrag(
8286			    WK_JFREEFRAG(freefrag->ff_jdep));
8287		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8288		WORKITEM_FREE(freefrag, D_FREEFRAG);
8289	}
8290	/*
8291	 * If the journal hasn't been written the jnewblk must be passed
8292	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
8293	 * this by leaving the journal dependency on the newblk to be freed
8294	 * when a freework is created in handle_workitem_freeblocks().
8295	 */
8296	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8297	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8298}
8299
8300/*
8301 * Create the mkdir dependencies for . and .. in a new directory.  Link them
8302 * into a newdirblk so any subsequent additions are tracked properly.  The
8303 * caller is responsible for adding the mkdir1 dependency to the journal
8304 * and updating id_mkdiradd.  This function returns with the per-filesystem
8305 * lock held.
8306 */
8307static struct mkdir *
8308setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
8309	struct diradd *dap;
8310	ino_t newinum;
8311	ino_t dinum;
8312	struct buf *newdirbp;
8313	struct mkdir **mkdirp;
8314{
8315	struct newblk *newblk;
8316	struct pagedep *pagedep;
8317	struct inodedep *inodedep;
8318	struct newdirblk *newdirblk = 0;
8319	struct mkdir *mkdir1, *mkdir2;
8320	struct worklist *wk;
8321	struct jaddref *jaddref;
8322	struct ufsmount *ump;
8323	struct mount *mp;
8324
8325	mp = dap->da_list.wk_mp;
8326	ump = VFSTOUFS(mp);
8327	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8328	    M_SOFTDEP_FLAGS);
8329	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8330	LIST_INIT(&newdirblk->db_mkdir);
8331	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8332	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8333	mkdir1->md_state = ATTACHED | MKDIR_BODY;
8334	mkdir1->md_diradd = dap;
8335	mkdir1->md_jaddref = NULL;
8336	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8337	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8338	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8339	mkdir2->md_diradd = dap;
8340	mkdir2->md_jaddref = NULL;
8341	if (MOUNTEDSUJ(mp) == 0) {
8342		mkdir1->md_state |= DEPCOMPLETE;
8343		mkdir2->md_state |= DEPCOMPLETE;
8344	}
8345	/*
8346	 * Dependency on "." and ".." being written to disk.
8347	 */
8348	mkdir1->md_buf = newdirbp;
8349	ACQUIRE_LOCK(VFSTOUFS(mp));
8350	LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
8351	/*
8352	 * We must link the pagedep, allocdirect, and newdirblk for
8353	 * the initial file page so the pointer to the new directory
8354	 * is not written until the directory contents are live and
8355	 * any subsequent additions are not marked live until the
8356	 * block is reachable via the inode.
8357	 */
8358	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8359		panic("setup_newdir: lost pagedep");
8360	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8361		if (wk->wk_type == D_ALLOCDIRECT)
8362			break;
8363	if (wk == NULL)
8364		panic("setup_newdir: lost allocdirect");
8365	if (pagedep->pd_state & NEWBLOCK)
8366		panic("setup_newdir: NEWBLOCK already set");
8367	newblk = WK_NEWBLK(wk);
8368	pagedep->pd_state |= NEWBLOCK;
8369	pagedep->pd_newdirblk = newdirblk;
8370	newdirblk->db_pagedep = pagedep;
8371	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8372	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8373	/*
8374	 * Look up the inodedep for the parent directory so that we
8375	 * can link mkdir2 into the pending dotdot jaddref or
8376	 * the inode write if there is none.  If the inode is
8377	 * ALLCOMPLETE and no jaddref is present all dependencies have
8378	 * been satisfied and mkdir2 can be freed.
8379	 */
8380	inodedep_lookup(mp, dinum, 0, &inodedep);
8381	if (MOUNTEDSUJ(mp)) {
8382		if (inodedep == NULL)
8383			panic("setup_newdir: Lost parent.");
8384		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8385		    inoreflst);
8386		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8387		    (jaddref->ja_state & MKDIR_PARENT),
8388		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
8389		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8390		mkdir2->md_jaddref = jaddref;
8391		jaddref->ja_mkdir = mkdir2;
8392	} else if (inodedep == NULL ||
8393	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8394		dap->da_state &= ~MKDIR_PARENT;
8395		WORKITEM_FREE(mkdir2, D_MKDIR);
8396		mkdir2 = NULL;
8397	} else {
8398		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8399		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8400	}
8401	*mkdirp = mkdir2;
8402
8403	return (mkdir1);
8404}
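/*
 * Put another way, the two dependencies created above are: mkdir1
 * (MKDIR_BODY), tied to newdirbp, which clears when the block holding
 * the new "." and ".." entries is written; and mkdir2 (MKDIR_PARENT),
 * which waits on the parent directory's inode, via its jaddref when
 * journaling or the id_bufwait list otherwise, so that the parent's
 * updated link count reaches the disk first.
 */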
8405
8406/*
8407 * Directory entry addition dependencies.
8408 *
8409 * When adding a new directory entry, the inode (with its incremented link
8410 * count) must be written to disk before the directory entry's pointer to it.
8411 * Also, if the inode is newly allocated, the corresponding freemap must be
8412 * updated (on disk) before the directory entry's pointer. These requirements
8413 * are met via undo/redo on the directory entry's pointer, which consists
8414 * simply of the inode number.
8415 *
8416 * As directory entries are added and deleted, the free space within a
8417 * directory block can become fragmented.  The ufs filesystem will compact
8418 * a fragmented directory block to make space for a new entry. When this
8419 * occurs, the offsets of previously added entries change. Any "diradd"
8420 * dependency structures corresponding to these entries must be updated with
8421 * the new offsets.
8422 */
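/*
 * The undo/redo mentioned above works roughly as follows (a sketch,
 * not a complete description): if a directory block is written while
 * its diradd is still incomplete, the entry's inode number is rolled
 * back (to zero for a newly created entry) before the write and
 * restored afterwards, so the on-disk directory never points at an
 * inode whose link count and bitmap state have not yet been committed.
 */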
8423
8424/*
8425 * This routine is called after the in-memory inode's link
8426 * count has been incremented, but before the directory entry's
8427 * pointer to the inode has been set.
8428 */
8429int
8430softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8431	struct buf *bp;		/* buffer containing directory block */
8432	struct inode *dp;	/* inode for directory */
8433	off_t diroffset;	/* offset of new entry in directory */
8434	ino_t newinum;		/* inode referenced by new directory entry */
8435	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
8436	int isnewblk;		/* entry is in a newly allocated block */
8437{
8438	int offset;		/* offset of new entry within directory block */
8439	ufs_lbn_t lbn;		/* block in directory containing new entry */
8440	struct fs *fs;
8441	struct diradd *dap;
8442	struct newblk *newblk;
8443	struct pagedep *pagedep;
8444	struct inodedep *inodedep;
8445	struct newdirblk *newdirblk = 0;
8446	struct mkdir *mkdir1, *mkdir2;
8447	struct jaddref *jaddref;
8448	struct ufsmount *ump;
8449	struct mount *mp;
8450	int isindir;
8451
8452	ump = dp->i_ump;
8453	mp = UFSTOVFS(ump);
8454	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8455	    ("softdep_setup_directory_add called on non-softdep filesystem"));
8456	/*
8457	 * Whiteouts have no dependencies.
8458	 */
8459	if (newinum == WINO) {
8460		if (newdirbp != NULL)
8461			bdwrite(newdirbp);
8462		return (0);
8463	}
8464	jaddref = NULL;
8465	mkdir1 = mkdir2 = NULL;
8466	fs = dp->i_fs;
8467	lbn = lblkno(fs, diroffset);
8468	offset = blkoff(fs, diroffset);
8469	dap = malloc(sizeof(struct diradd), M_DIRADD,
8470		M_SOFTDEP_FLAGS|M_ZERO);
8471	workitem_alloc(&dap->da_list, D_DIRADD, mp);
8472	dap->da_offset = offset;
8473	dap->da_newinum = newinum;
8474	dap->da_state = ATTACHED;
8475	LIST_INIT(&dap->da_jwork);
8476	isindir = bp->b_lblkno >= NDADDR;
8477	if (isnewblk &&
8478	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8479		newdirblk = malloc(sizeof(struct newdirblk),
8480		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8481		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8482		LIST_INIT(&newdirblk->db_mkdir);
8483	}
8484	/*
8485	 * If we're creating a new directory, set up the dependencies and set
8486	 * the dap state to wait for them.  Otherwise it's COMPLETE and
8487	 * we can move on.
8488	 */
8489	if (newdirbp == NULL) {
8490		dap->da_state |= DEPCOMPLETE;
8491		ACQUIRE_LOCK(ump);
8492	} else {
8493		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8494		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8495		    &mkdir2);
8496	}
8497	/*
8498	 * Link into parent directory pagedep to await its being written.
8499	 */
8500	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8501#ifdef DEBUG
8502	if (diradd_lookup(pagedep, offset) != NULL)
8503		panic("softdep_setup_directory_add: %p already at off %d\n",
8504		    diradd_lookup(pagedep, offset), offset);
8505#endif
8506	dap->da_pagedep = pagedep;
8507	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8508	    da_pdlist);
8509	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
8510	/*
8511	 * If we're journaling, link the diradd into the jaddref so it
8512	 * may be completed after the journal entry is written.  Otherwise,
8513	 * link the diradd into its inodedep.  If the inode is not yet
8514	 * written place it on the bufwait list, otherwise do the post-inode
8515	 * write processing to put it on the id_pendinghd list.
8516	 */
8517	if (MOUNTEDSUJ(mp)) {
8518		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8519		    inoreflst);
8520		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8521		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8522		jaddref->ja_diroff = diroffset;
8523		jaddref->ja_diradd = dap;
8524		add_to_journal(&jaddref->ja_list);
8525	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8526		diradd_inode_written(dap, inodedep);
8527	else
8528		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8529	/*
8530	 * Add the journal entries for . and .. links now that the primary
8531	 * link is written.
8532	 */
8533	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8534		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8535		    inoreflst, if_deps);
8536		KASSERT(jaddref != NULL &&
8537		    jaddref->ja_ino == jaddref->ja_parent &&
8538		    (jaddref->ja_state & MKDIR_BODY),
8539		    ("softdep_setup_directory_add: bad dot jaddref %p",
8540		    jaddref));
8541		mkdir1->md_jaddref = jaddref;
8542		jaddref->ja_mkdir = mkdir1;
8543		/*
8544		 * It is important that the dotdot journal entry
8545		 * is added prior to the dot entry since dot writes
8546		 * both the dot and dotdot links.  These both must
8547		 * be added after the primary link for the journal
8548		 * to remain consistent.
8549		 */
8550		add_to_journal(&mkdir2->md_jaddref->ja_list);
8551		add_to_journal(&jaddref->ja_list);
8552	}
8553	/*
8554	 * If we are adding a new directory remember this diradd so that if
8555	 * we rename it we can keep the dot and dotdot dependencies.  If
8556	 * we are adding a new name for an inode that has a mkdiradd we
8557	 * must be in rename and we have to move the dot and dotdot
8558	 * dependencies to this new name.  The old name is being orphaned
8559	 * soon.
8560	 */
8561	if (mkdir1 != NULL) {
8562		if (inodedep->id_mkdiradd != NULL)
8563			panic("softdep_setup_directory_add: Existing mkdir");
8564		inodedep->id_mkdiradd = dap;
8565	} else if (inodedep->id_mkdiradd)
8566		merge_diradd(inodedep, dap);
8567	if (newdirblk) {
8568		/*
8569		 * There is nothing to do if we are already tracking
8570		 * this block.
8571		 */
8572		if ((pagedep->pd_state & NEWBLOCK) != 0) {
8573			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8574			FREE_LOCK(ump);
8575			return (0);
8576		}
8577		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8578		    == 0)
8579			panic("softdep_setup_directory_add: lost entry");
8580		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8581		pagedep->pd_state |= NEWBLOCK;
8582		pagedep->pd_newdirblk = newdirblk;
8583		newdirblk->db_pagedep = pagedep;
8584		FREE_LOCK(ump);
8585		/*
8586		 * If we extended into an indirect signal direnter to sync.
8587		 * If we extended into an indirect block, signal direnter to sync.
8588		if (isindir)
8589			return (1);
8590		return (0);
8591	}
8592	FREE_LOCK(ump);
8593	return (0);
8594}
8595
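/*
 * Editor's sketch, not part of ffs_softdep.c: softdep_setup_directory_add()
 * above relies on journal add records reaching the journal in the order
 * primary link, then "..", then ".", because writing "." commits both of the
 * new directory's internal links.  The standalone toy below only illustrates
 * that ordering invariant; jrec_kind, journal_fifo and jfifo_append() are
 * invented for this example and are not kernel interfaces.
 */
#if 0
#include <assert.h>
#include <stdio.h>

enum jrec_kind { JREC_PRIMARY, JREC_DOTDOT, JREC_DOT };

struct journal_fifo {
	enum jrec_kind	jf_rec[8];
	int		jf_nrec;
};

static void
jfifo_append(struct journal_fifo *jf, enum jrec_kind kind)
{

	/* Records hit the disk in append order. */
	jf->jf_rec[jf->jf_nrec++] = kind;
}

int
main(void)
{
	struct journal_fifo jf = { .jf_nrec = 0 };

	/* Same order used above when a new directory is created. */
	jfifo_append(&jf, JREC_PRIMARY);	/* name in the parent */
	jfifo_append(&jf, JREC_DOTDOT);		/* ".." back to the parent */
	jfifo_append(&jf, JREC_DOT);		/* "." pointing at itself */

	/* ".." must precede "." since "." commits both links at once. */
	assert(jf.jf_rec[1] == JREC_DOTDOT && jf.jf_rec[2] == JREC_DOT);
	printf("journal order: primary, dotdot, dot\n");
	return (0);
}
#endif
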
8596/*
8597 * This procedure is called to change the offset of a directory
8598 * entry when compacting a directory block which must be owned
8599 * exclusively by the caller. Note that the actual entry movement
8600 * must be done in this procedure to ensure that no I/O completions
8601 * occur while the move is in progress.
8602 */
8603void
8604softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8605	struct buf *bp;		/* Buffer holding directory block. */
8606	struct inode *dp;	/* inode for directory */
8607	caddr_t base;		/* address of dp->i_offset */
8608	caddr_t oldloc;		/* address of old directory location */
8609	caddr_t newloc;		/* address of new directory location */
8610	int entrysize;		/* size of directory entry */
8611{
8612	int offset, oldoffset, newoffset;
8613	struct pagedep *pagedep;
8614	struct jmvref *jmvref;
8615	struct diradd *dap;
8616	struct direct *de;
8617	struct mount *mp;
8618	ufs_lbn_t lbn;
8619	int flags;
8620
8621	mp = UFSTOVFS(dp->i_ump);
8622	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8623	    ("softdep_change_directoryentry_offset called on "
8624	     "non-softdep filesystem"));
8625	de = (struct direct *)oldloc;
8626	jmvref = NULL;
8627	flags = 0;
8628	/*
8629	 * Moves are always journaled as it would be too complex to
8630	 * determine if any affected adds or removes are present in the
8631	 * journal.
8632	 */
8633	if (MOUNTEDSUJ(mp)) {
8634		flags = DEPALLOC;
8635		jmvref = newjmvref(dp, de->d_ino,
8636		    dp->i_offset + (oldloc - base),
8637		    dp->i_offset + (newloc - base));
8638	}
8639	lbn = lblkno(dp->i_fs, dp->i_offset);
8640	offset = blkoff(dp->i_fs, dp->i_offset);
8641	oldoffset = offset + (oldloc - base);
8642	newoffset = offset + (newloc - base);
8643	ACQUIRE_LOCK(dp->i_ump);
8644	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8645		goto done;
8646	dap = diradd_lookup(pagedep, oldoffset);
8647	if (dap) {
8648		dap->da_offset = newoffset;
8649		newoffset = DIRADDHASH(newoffset);
8650		oldoffset = DIRADDHASH(oldoffset);
8651		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8652		    newoffset != oldoffset) {
8653			LIST_REMOVE(dap, da_pdlist);
8654			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8655			    dap, da_pdlist);
8656		}
8657	}
8658done:
8659	if (jmvref) {
8660		jmvref->jm_pagedep = pagedep;
8661		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8662		add_to_journal(&jmvref->jm_list);
8663	}
8664	bcopy(oldloc, newloc, entrysize);
8665	FREE_LOCK(dp->i_ump);
8666}
8667
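/*
 * Editor's sketch, not part of ffs_softdep.c: the routine above re-files a
 * pending diradd under the hash bucket for its new offset when compaction
 * moves the entry.  The standalone toy below shows that re-bucketing with
 * <sys/queue.h> lists; TOY_DAHASHSZ, toy_hash() and struct toy_diradd are
 * placeholders, not the DAHASHSZ/DIRADDHASH definitions from softdep.h.
 */
#if 0
#include <sys/queue.h>
#include <assert.h>
#include <stdio.h>

#define	TOY_DAHASHSZ	6
#define	toy_hash(off)	(((off) >> 2) % TOY_DAHASHSZ)

struct toy_diradd {
	LIST_ENTRY(toy_diradd)	td_next;
	int			td_offset;
};
LIST_HEAD(toy_bucket, toy_diradd);

int
main(void)
{
	struct toy_bucket buckets[TOY_DAHASHSZ];
	struct toy_diradd dap = { .td_offset = 12 };
	int i, oldoffset = 12, newoffset = 512;

	for (i = 0; i < TOY_DAHASHSZ; i++)
		LIST_INIT(&buckets[i]);
	LIST_INSERT_HEAD(&buckets[toy_hash(oldoffset)], &dap, td_next);

	/* Compaction moved the entry from oldoffset to newoffset. */
	dap.td_offset = newoffset;
	if (toy_hash(newoffset) != toy_hash(oldoffset)) {
		LIST_REMOVE(&dap, td_next);
		LIST_INSERT_HEAD(&buckets[toy_hash(newoffset)], &dap,
		    td_next);
	}
	assert(LIST_FIRST(&buckets[toy_hash(newoffset)]) == &dap);
	printf("diradd re-filed under bucket %d\n", toy_hash(newoffset));
	return (0);
}
#endif
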
8668/*
8669 * Move the mkdir dependencies and journal work from one diradd to another
8670 * when renaming a directory.  The new name must depend on the mkdir deps
8671 * completing as the old name did.  Directories can only have one valid link
8672 * at a time so one must be canonical.
8673 */
8674static void
8675merge_diradd(inodedep, newdap)
8676	struct inodedep *inodedep;
8677	struct diradd *newdap;
8678{
8679	struct diradd *olddap;
8680	struct mkdir *mkdir, *nextmd;
8681	struct ufsmount *ump;
8682	short state;
8683
8684	olddap = inodedep->id_mkdiradd;
8685	inodedep->id_mkdiradd = newdap;
8686	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8687		newdap->da_state &= ~DEPCOMPLETE;
8688		ump = VFSTOUFS(inodedep->id_list.wk_mp);
8689		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8690		     mkdir = nextmd) {
8691			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8692			if (mkdir->md_diradd != olddap)
8693				continue;
8694			mkdir->md_diradd = newdap;
8695			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8696			newdap->da_state |= state;
8697			olddap->da_state &= ~state;
8698			if ((olddap->da_state &
8699			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
8700				break;
8701		}
8702		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8703			panic("merge_diradd: unfound ref");
8704	}
8705	/*
8706	 * Any mkdir related journal items are not safe to be freed until
8707	 * the new name is stable.
8708	 */
8709	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8710	olddap->da_state |= DEPCOMPLETE;
8711	complete_diradd(olddap);
8712}
8713
8714/*
8715 * Move the diradd to the pending list when all diradd dependencies are
8716 * complete.
8717 */
8718static void
8719complete_diradd(dap)
8720	struct diradd *dap;
8721{
8722	struct pagedep *pagedep;
8723
8724	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8725		if (dap->da_state & DIRCHG)
8726			pagedep = dap->da_previous->dm_pagedep;
8727		else
8728			pagedep = dap->da_pagedep;
8729		LIST_REMOVE(dap, da_pdlist);
8730		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8731	}
8732}
8733
8734/*
8735 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8736 * add entries and conditonally journal the remove.
8737 */
8738static void
8739cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8740	struct diradd *dap;
8741	struct dirrem *dirrem;
8742	struct jremref *jremref;
8743	struct jremref *dotremref;
8744	struct jremref *dotdotremref;
8745{
8746	struct inodedep *inodedep;
8747	struct jaddref *jaddref;
8748	struct inoref *inoref;
8749	struct ufsmount *ump;
8750	struct mkdir *mkdir;
8751
8752	/*
8753	 * If no remove references were allocated we're on a non-journaled
8754	 * filesystem and can skip the cancel step.
8755	 */
8756	if (jremref == NULL) {
8757		free_diradd(dap, NULL);
8758		return;
8759	}
8760	/*
8761	 * Cancel the primary name and free it if it does not require
8762	 * journaling.
8763	 */
8764	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8765	    0, &inodedep) != 0) {
8766		/* Abort the addref that reference this diradd.  */
8767		/* Abort the addref that references this diradd.  */
8768			if (inoref->if_list.wk_type != D_JADDREF)
8769				continue;
8770			jaddref = (struct jaddref *)inoref;
8771			if (jaddref->ja_diradd != dap)
8772				continue;
8773			if (cancel_jaddref(jaddref, inodedep,
8774			    &dirrem->dm_jwork) == 0) {
8775				free_jremref(jremref);
8776				jremref = NULL;
8777			}
8778			break;
8779		}
8780	}
8781	/*
8782	 * Cancel subordinate names and free them if they do not require
8783	 * journaling.
8784	 */
8785	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8786		ump = VFSTOUFS(dap->da_list.wk_mp);
8787		LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
8788			if (mkdir->md_diradd != dap)
8789				continue;
8790			if ((jaddref = mkdir->md_jaddref) == NULL)
8791				continue;
8792			mkdir->md_jaddref = NULL;
8793			if (mkdir->md_state & MKDIR_PARENT) {
8794				if (cancel_jaddref(jaddref, NULL,
8795				    &dirrem->dm_jwork) == 0) {
8796					free_jremref(dotdotremref);
8797					dotdotremref = NULL;
8798				}
8799			} else {
8800				if (cancel_jaddref(jaddref, inodedep,
8801				    &dirrem->dm_jwork) == 0) {
8802					free_jremref(dotremref);
8803					dotremref = NULL;
8804				}
8805			}
8806		}
8807	}
8808
8809	if (jremref)
8810		journal_jremref(dirrem, jremref, inodedep);
8811	if (dotremref)
8812		journal_jremref(dirrem, dotremref, inodedep);
8813	if (dotdotremref)
8814		journal_jremref(dirrem, dotdotremref, NULL);
8815	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8816	free_diradd(dap, &dirrem->dm_jwork);
8817}
8818
8819/*
8820 * Free a diradd dependency structure. This routine must be called
8821 * with the per-filesystem lock held.
8822 */
8823static void
8824free_diradd(dap, wkhd)
8825	struct diradd *dap;
8826	struct workhead *wkhd;
8827{
8828	struct dirrem *dirrem;
8829	struct pagedep *pagedep;
8830	struct inodedep *inodedep;
8831	struct mkdir *mkdir, *nextmd;
8832	struct ufsmount *ump;
8833
8834	ump = VFSTOUFS(dap->da_list.wk_mp);
8835	LOCK_OWNED(ump);
8836	LIST_REMOVE(dap, da_pdlist);
8837	if (dap->da_state & ONWORKLIST)
8838		WORKLIST_REMOVE(&dap->da_list);
8839	if ((dap->da_state & DIRCHG) == 0) {
8840		pagedep = dap->da_pagedep;
8841	} else {
8842		dirrem = dap->da_previous;
8843		pagedep = dirrem->dm_pagedep;
8844		dirrem->dm_dirinum = pagedep->pd_ino;
8845		dirrem->dm_state |= COMPLETE;
8846		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8847			add_to_worklist(&dirrem->dm_list, 0);
8848	}
8849	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8850	    0, &inodedep) != 0)
8851		if (inodedep->id_mkdiradd == dap)
8852			inodedep->id_mkdiradd = NULL;
8853	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8854		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8855		     mkdir = nextmd) {
8856			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8857			if (mkdir->md_diradd != dap)
8858				continue;
8859			dap->da_state &=
8860			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8861			LIST_REMOVE(mkdir, md_mkdirs);
8862			if (mkdir->md_state & ONWORKLIST)
8863				WORKLIST_REMOVE(&mkdir->md_list);
8864			if (mkdir->md_jaddref != NULL)
8865				panic("free_diradd: Unexpected jaddref");
8866			WORKITEM_FREE(mkdir, D_MKDIR);
8867			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8868				break;
8869		}
8870		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8871			panic("free_diradd: unfound ref");
8872	}
8873	if (inodedep)
8874		free_inodedep(inodedep);
8875	/*
8876	 * Free any journal segments waiting for the directory write.
8877	 */
8878	handle_jwork(&dap->da_jwork);
8879	WORKITEM_FREE(dap, D_DIRADD);
8880}
8881
8882/*
8883 * Directory entry removal dependencies.
8884 *
8885 * When removing a directory entry, the entry's inode pointer must be
8886 * zero'ed on disk before the corresponding inode's link count is decremented
8887 * (possibly freeing the inode for re-use). This dependency is handled by
8888 * updating the directory entry but delaying the inode count reduction until
8889 * after the directory block has been written to disk. After this point, the
8890 * inode count can be decremented whenever it is convenient.
8891 */
8892
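/*
 * Editor's sketch, not part of ffs_softdep.c: the block comment above states
 * the removal ordering rule -- clear the on-disk directory entry first, only
 * then let the on-disk link count drop -- so a crash can at worst leak a
 * link, never leave a name pointing at a freed inode.  The standalone toy
 * below just steps through the two writes in that order; disk_dir_ino and
 * disk_nlink are stand-ins for the on-disk directory entry and inode.
 */
#if 0
#include <assert.h>
#include <stdio.h>

static int disk_dir_ino = 7;	/* the entry still names inode 7 on disk */
static int disk_nlink = 1;	/* inode 7's on-disk link count */

static void
crash_check(void)
{

	/* Invariant after any crash: no live name for a dead inode. */
	assert(!(disk_dir_ino != 0 && disk_nlink == 0));
}

int
main(void)
{

	crash_check();
	disk_dir_ino = 0;	/* 1: directory block written, entry zeroed */
	crash_check();
	disk_nlink = 0;		/* 2: only now may the inode be written */
	crash_check();
	printf("every intermediate disk state was consistent\n");
	return (0);
}
#endif
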
8893/*
8894 * This routine should be called immediately after removing
8895 * a directory entry.  The inode's link count should not be
8896 * decremented by the calling procedure -- the soft updates
8897 * code will do this task when it is safe.
8898 */
8899void
8900softdep_setup_remove(bp, dp, ip, isrmdir)
8901	struct buf *bp;		/* buffer containing directory block */
8902	struct inode *dp;	/* inode for the directory being modified */
8903	struct inode *ip;	/* inode for directory entry being removed */
8904	int isrmdir;		/* indicates if doing RMDIR */
8905{
8906	struct dirrem *dirrem, *prevdirrem;
8907	struct inodedep *inodedep;
8908	int direct;
8909
8910	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
8911	    ("softdep_setup_remove called on non-softdep filesystem"));
8912	/*
8913	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8914	 * newdirrem() to setup the full directory remove which requires
8915	 * isrmdir > 1.
8916	 */
8917	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8918	/*
8919	 * Add the dirrem to the inodedep's pending remove list for quick
8920	 * discovery later.
8921	 */
8922	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8923	    &inodedep) == 0)
8924		panic("softdep_setup_remove: Lost inodedep.");
8925	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
8926	dirrem->dm_state |= ONDEPLIST;
8927	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8928
8929	/*
8930	 * If the COMPLETE flag is clear, then there were no active
8931	 * entries and we want to roll back to a zeroed entry until
8932	 * the new inode is committed to disk. If the COMPLETE flag is
8933	 * set then we have deleted an entry that never made it to
8934	 * disk. If the entry we deleted resulted from a name change,
8935	 * then the old name still resides on disk. We cannot delete
8936	 * its inode (returned to us in prevdirrem) until the zeroed
8937	 * directory entry gets to disk. The new inode has never been
8938	 * referenced on the disk, so can be deleted immediately.
8939	 */
8940	if ((dirrem->dm_state & COMPLETE) == 0) {
8941		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
8942		    dm_next);
8943		FREE_LOCK(ip->i_ump);
8944	} else {
8945		if (prevdirrem != NULL)
8946			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
8947			    prevdirrem, dm_next);
8948		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
8949		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
8950		FREE_LOCK(ip->i_ump);
8951		if (direct)
8952			handle_workitem_remove(dirrem, 0);
8953	}
8954}
8955
8956/*
8957 * Check for an entry matching 'offset' on both the pd_diraddhd list and the
8958 * pd_pendinghd list of a pagedep.
8959 */
8960static struct diradd *
8961diradd_lookup(pagedep, offset)
8962	struct pagedep *pagedep;
8963	int offset;
8964{
8965	struct diradd *dap;
8966
8967	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
8968		if (dap->da_offset == offset)
8969			return (dap);
8970	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
8971		if (dap->da_offset == offset)
8972			return (dap);
8973	return (NULL);
8974}
8975
8976/*
8977 * Search for a .. diradd dependency in a directory that is being removed.
8978 * If the directory was renamed to a new parent we have a diradd rather
8979 * than a mkdir for the .. entry.  We need to cancel it now before
8980 * it is found in truncate().
8981 */
8982static struct jremref *
8983cancel_diradd_dotdot(ip, dirrem, jremref)
8984	struct inode *ip;
8985	struct dirrem *dirrem;
8986	struct jremref *jremref;
8987{
8988	struct pagedep *pagedep;
8989	struct diradd *dap;
8990	struct worklist *wk;
8991
8992	if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
8993	    &pagedep) == 0)
8994		return (jremref);
8995	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
8996	if (dap == NULL)
8997		return (jremref);
8998	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
8999	/*
9000	 * Mark any journal work as belonging to the parent so it is freed
9001	 * with the .. reference.
9002	 */
9003	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9004		wk->wk_state |= MKDIR_PARENT;
9005	return (NULL);
9006}
9007
9008/*
9009 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
9010 * replace it with a dirrem/diradd pair as a result of re-parenting a
9011 * directory.  This ensures that we don't simultaneously have a mkdir and
9012 * a diradd for the same .. entry.
9013 */
9014static struct jremref *
9015cancel_mkdir_dotdot(ip, dirrem, jremref)
9016	struct inode *ip;
9017	struct dirrem *dirrem;
9018	struct jremref *jremref;
9019{
9020	struct inodedep *inodedep;
9021	struct jaddref *jaddref;
9022	struct ufsmount *ump;
9023	struct mkdir *mkdir;
9024	struct diradd *dap;
9025
9026	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
9027	    &inodedep) == 0)
9028		return (jremref);
9029	dap = inodedep->id_mkdiradd;
9030	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
9031		return (jremref);
9032	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9033	for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9034	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
9035		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
9036			break;
9037	if (mkdir == NULL)
9038		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
9039	if ((jaddref = mkdir->md_jaddref) != NULL) {
9040		mkdir->md_jaddref = NULL;
9041		jaddref->ja_state &= ~MKDIR_PARENT;
9042		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
9043		    &inodedep) == 0)
9044			panic("cancel_mkdir_dotdot: Lost parent inodedep");
9045		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
9046			journal_jremref(dirrem, jremref, inodedep);
9047			jremref = NULL;
9048		}
9049	}
9050	if (mkdir->md_state & ONWORKLIST)
9051		WORKLIST_REMOVE(&mkdir->md_list);
9052	mkdir->md_state |= ALLCOMPLETE;
9053	complete_mkdir(mkdir);
9054	return (jremref);
9055}
9056
9057static void
9058journal_jremref(dirrem, jremref, inodedep)
9059	struct dirrem *dirrem;
9060	struct jremref *jremref;
9061	struct inodedep *inodedep;
9062{
9063
9064	if (inodedep == NULL)
9065		if (inodedep_lookup(jremref->jr_list.wk_mp,
9066		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
9067			panic("journal_jremref: Lost inodedep");
9068	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
9069	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
9070	add_to_journal(&jremref->jr_list);
9071}
9072
9073static void
9074dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
9075	struct dirrem *dirrem;
9076	struct jremref *jremref;
9077	struct jremref *dotremref;
9078	struct jremref *dotdotremref;
9079{
9080	struct inodedep *inodedep;
9081
9082
9083	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
9084	    &inodedep) == 0)
9085		panic("dirrem_journal: Lost inodedep");
9086	journal_jremref(dirrem, jremref, inodedep);
9087	if (dotremref)
9088		journal_jremref(dirrem, dotremref, inodedep);
9089	if (dotdotremref)
9090		journal_jremref(dirrem, dotdotremref, NULL);
9091}
9092
9093/*
9094 * Allocate a new dirrem if appropriate and return it along with
9095 * its associated pagedep. Called without a lock, returns with lock.
9096 */
9097static struct dirrem *
9098newdirrem(bp, dp, ip, isrmdir, prevdirremp)
9099	struct buf *bp;		/* buffer containing directory block */
9100	struct inode *dp;	/* inode for the directory being modified */
9101	struct inode *ip;	/* inode for directory entry being removed */
9102	int isrmdir;		/* indicates if doing RMDIR */
9103	struct dirrem **prevdirremp; /* previously referenced inode, if any */
9104{
9105	int offset;
9106	ufs_lbn_t lbn;
9107	struct diradd *dap;
9108	struct dirrem *dirrem;
9109	struct pagedep *pagedep;
9110	struct jremref *jremref;
9111	struct jremref *dotremref;
9112	struct jremref *dotdotremref;
9113	struct vnode *dvp;
9114
9115	/*
9116	 * Whiteouts have no deletion dependencies.
9117	 */
9118	if (ip == NULL)
9119		panic("newdirrem: whiteout");
9120	dvp = ITOV(dp);
9121	/*
9122	 * If the system is over its limit and our filesystem is
9123	 * responsible for more than our share of that usage and
9124	 * we are not a snapshot, request some inodedep cleanup.
9125	 * Limiting the number of dirrem structures will also limit
9126	 * the number of freefile and freeblks structures.
9127	 */
9128	ACQUIRE_LOCK(ip->i_ump);
9129	if (!IS_SNAPSHOT(ip) && softdep_excess_dirrem(ip->i_ump))
9130		schedule_cleanup(ITOV(dp)->v_mount);
9131	else
9132		FREE_LOCK(ip->i_ump);
9133	dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
9134	    M_ZERO);
9135	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
9136	LIST_INIT(&dirrem->dm_jremrefhd);
9137	LIST_INIT(&dirrem->dm_jwork);
9138	dirrem->dm_state = isrmdir ? RMDIR : 0;
9139	dirrem->dm_oldinum = ip->i_number;
9140	*prevdirremp = NULL;
9141	/*
9142	 * Allocate remove reference structures to track journal write
9143	 * dependencies.  We will always have one for the link and
9144	 * when doing directories we will always have one more for dot.
9145	 * When renaming a directory we skip the dotdot link change so
9146	 * this is not needed.
9147	 */
9148	jremref = dotremref = dotdotremref = NULL;
9149	if (DOINGSUJ(dvp)) {
9150		if (isrmdir) {
9151			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9152			    ip->i_effnlink + 2);
9153			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
9154			    ip->i_effnlink + 1);
9155			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
9156			    dp->i_effnlink + 1);
9157			dotdotremref->jr_state |= MKDIR_PARENT;
9158		} else
9159			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9160			    ip->i_effnlink + 1);
9161	}
9162	ACQUIRE_LOCK(ip->i_ump);
9163	lbn = lblkno(dp->i_fs, dp->i_offset);
9164	offset = blkoff(dp->i_fs, dp->i_offset);
9165	pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
9166	    &pagedep);
9167	dirrem->dm_pagedep = pagedep;
9168	dirrem->dm_offset = offset;
9169	/*
9170	 * If we're renaming a .. link to a new directory, cancel any
9171	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
9172	 * the jremref is preserved for any potential diradd in this
9173	 * location.  This can not coincide with a rmdir.
9174	 */
9175	if (dp->i_offset == DOTDOT_OFFSET) {
9176		if (isrmdir)
9177			panic("newdirrem: .. directory change during remove?");
9178		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
9179	}
9180	/*
9181	 * If we're removing a directory search for the .. dependency now and
9182	 * cancel it.  Any pending journal work will be added to the dirrem
9183	 * to be completed when the workitem remove completes.
9184	 */
9185	if (isrmdir)
9186		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
9187	/*
9188	 * Check for a diradd dependency for the same directory entry.
9189	 * If present, then both dependencies become obsolete and can
9190	 * be de-allocated.
9191	 */
9192	dap = diradd_lookup(pagedep, offset);
9193	if (dap == NULL) {
9194		/*
9195		 * Link the jremref structures into the dirrem so they are
9196		 * written prior to the pagedep.
9197		 */
9198		if (jremref)
9199			dirrem_journal(dirrem, jremref, dotremref,
9200			    dotdotremref);
9201		return (dirrem);
9202	}
9203	/*
9204	 * Must be ATTACHED at this point.
9205	 */
9206	if ((dap->da_state & ATTACHED) == 0)
9207		panic("newdirrem: not ATTACHED");
9208	if (dap->da_newinum != ip->i_number)
9209		panic("newdirrem: inum %ju should be %ju",
9210		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
9211	/*
9212	 * If we are deleting a changed name that never made it to disk,
9213	 * then return the dirrem describing the previous inode (which
9214	 * represents the inode currently referenced from this entry on disk).
9215	 */
9216	if ((dap->da_state & DIRCHG) != 0) {
9217		*prevdirremp = dap->da_previous;
9218		dap->da_state &= ~DIRCHG;
9219		dap->da_pagedep = pagedep;
9220	}
9221	/*
9222	 * We are deleting an entry that never made it to disk.
9223	 * Mark it COMPLETE so we can delete its inode immediately.
9224	 */
9225	dirrem->dm_state |= COMPLETE;
9226	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
9227#ifdef SUJ_DEBUG
9228	if (isrmdir == 0) {
9229		struct worklist *wk;
9230
9231		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9232			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
9233				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
9234	}
9235#endif
9236
9237	return (dirrem);
9238}
9239
9240/*
9241 * Directory entry change dependencies.
9242 *
9243 * Changing an existing directory entry requires that an add operation
9244 * be completed first followed by a deletion. The semantics for the addition
9245 * are identical to the description of adding a new entry above except
9246 * that the rollback is to the old inode number rather than zero. Once
9247 * the addition dependency is completed, the removal is done as described
9248 * in the removal routine above.
9249 */
9250
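/*
 * Editor's sketch, not part of ffs_softdep.c: per the block comment above, a
 * changed entry whose new inode is not yet committed is rolled back to the
 * old inode number rather than to zero, so the disk always names an
 * allocated inode.  The standalone fragment below only illustrates the
 * choice of rollback value; rollback_value() is invented for the example.
 */
#if 0
#include <assert.h>
#include <stdio.h>

static int
rollback_value(int dirchg, int old_inum)
{

	/* DIRCHG-style entries roll back to the old inum, new ones to 0. */
	return (dirchg ? old_inum : 0);
}

int
main(void)
{
	int old_inum = 17, new_inum = 42, d_ino;

	/* New inode not yet safe on disk: write the rolled-back value. */
	d_ino = rollback_value(1, old_inum);
	assert(d_ino == 17);

	/* Addition dependency complete: the entry may name the new inode. */
	d_ino = new_inum;
	assert(d_ino == 42);
	printf("entry rolled back to %d, later committed as %d\n",
	    old_inum, d_ino);
	return (0);
}
#endif
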
9251/*
9252 * This routine should be called immediately after changing
9253 * a directory entry.  The inode's link count should not be
9254 * decremented by the calling procedure -- the soft updates
9255 * code will perform this task when it is safe.
9256 */
9257void
9258softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
9259	struct buf *bp;		/* buffer containing directory block */
9260	struct inode *dp;	/* inode for the directory being modified */
9261	struct inode *ip;	/* inode for directory entry being removed */
9262	ino_t newinum;		/* new inode number for changed entry */
9263	int isrmdir;		/* indicates if doing RMDIR */
9264{
9265	int offset;
9266	struct diradd *dap = NULL;
9267	struct dirrem *dirrem, *prevdirrem;
9268	struct pagedep *pagedep;
9269	struct inodedep *inodedep;
9270	struct jaddref *jaddref;
9271	struct mount *mp;
9272
9273	offset = blkoff(dp->i_fs, dp->i_offset);
9274	mp = UFSTOVFS(dp->i_ump);
9275	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
9276	   ("softdep_setup_directory_change called on non-softdep filesystem"));
9277
9278	/*
9279	 * Whiteouts do not need diradd dependencies.
9280	 */
9281	if (newinum != WINO) {
9282		dap = malloc(sizeof(struct diradd),
9283		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9284		workitem_alloc(&dap->da_list, D_DIRADD, mp);
9285		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9286		dap->da_offset = offset;
9287		dap->da_newinum = newinum;
9288		LIST_INIT(&dap->da_jwork);
9289	}
9290
9291	/*
9292	 * Allocate a new dirrem and ACQUIRE_LOCK.
9293	 */
9294	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9295	pagedep = dirrem->dm_pagedep;
9296	/*
9297	 * The possible values for isrmdir:
9298	 *	0 - non-directory file rename
9299	 *	1 - directory rename within same directory
9300	 *   inum - directory rename to new directory of given inode number
9301	 * When renaming to a new directory, we are both deleting and
9302	 * creating a new directory entry, so the link count on the new
9303	 * directory should not change. Thus we do not need the followup
9304	 * dirrem which is usually done in handle_workitem_remove. We set
9305	 * the DIRCHG flag to tell handle_workitem_remove to skip the
9306	 * followup dirrem.
9307	 */
9308	if (isrmdir > 1)
9309		dirrem->dm_state |= DIRCHG;
9310
9311	/*
9312	 * Whiteouts have no additional dependencies,
9313	 * so just put the dirrem on the correct list.
9314	 */
9315	if (newinum == WINO) {
9316		if ((dirrem->dm_state & COMPLETE) == 0) {
9317			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9318			    dm_next);
9319		} else {
9320			dirrem->dm_dirinum = pagedep->pd_ino;
9321			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9322				add_to_worklist(&dirrem->dm_list, 0);
9323		}
9324		FREE_LOCK(dp->i_ump);
9325		return;
9326	}
9327	/*
9328	 * Add the dirrem to the inodedep's pending remove list for quick
9329	 * discovery later.  A valid nlinkdelta ensures that this lookup
9330	 * will not fail.
9331	 */
9332	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9333		panic("softdep_setup_directory_change: Lost inodedep.");
9334	dirrem->dm_state |= ONDEPLIST;
9335	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9336
9337	/*
9338	 * If the COMPLETE flag is clear, then there were no active
9339	 * entries and we want to roll back to the previous inode until
9340	 * the new inode is committed to disk. If the COMPLETE flag is
9341	 * set, then we have deleted an entry that never made it to disk.
9342	 * If the entry we deleted resulted from a name change, then the old
9343	 * inode reference still resides on disk. Any rollback that we do
9344	 * needs to be to that old inode (returned to us in prevdirrem). If
9345	 * the entry we deleted resulted from a create, then there is
9346	 * no entry on the disk, so we want to roll back to zero rather
9347	 * than the uncommitted inode. In either of the COMPLETE cases we
9348	 * want to immediately free the unwritten and unreferenced inode.
9349	 */
9350	if ((dirrem->dm_state & COMPLETE) == 0) {
9351		dap->da_previous = dirrem;
9352	} else {
9353		if (prevdirrem != NULL) {
9354			dap->da_previous = prevdirrem;
9355		} else {
9356			dap->da_state &= ~DIRCHG;
9357			dap->da_pagedep = pagedep;
9358		}
9359		dirrem->dm_dirinum = pagedep->pd_ino;
9360		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9361			add_to_worklist(&dirrem->dm_list, 0);
9362	}
9363	/*
9364	 * Lookup the jaddref for this journal entry.  We must finish
9365	 * initializing it and make the diradd write dependent on it.
9366	 * If we're not journaling, put it on the id_bufwait list if the
9367	 * inode is not yet written. If it is written, do the post-inode
9368	 * write processing to put it on the id_pendinghd list.
9369	 */
9370	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
9371	if (MOUNTEDSUJ(mp)) {
9372		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9373		    inoreflst);
9374		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9375		    ("softdep_setup_directory_change: bad jaddref %p",
9376		    jaddref));
9377		jaddref->ja_diroff = dp->i_offset;
9378		jaddref->ja_diradd = dap;
9379		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9380		    dap, da_pdlist);
9381		add_to_journal(&jaddref->ja_list);
9382	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9383		dap->da_state |= COMPLETE;
9384		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9385		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9386	} else {
9387		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9388		    dap, da_pdlist);
9389		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9390	}
9391	/*
9392	 * If we're making a new name for a directory that has not been
9393	 * committed, we need to move the dot and dotdot references to
9394	 * this new name.
9395	 */
9396	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
9397		merge_diradd(inodedep, dap);
9398	FREE_LOCK(dp->i_ump);
9399}
9400
9401/*
9402 * Called whenever the link count on an inode is changed.
9403 * It creates an inode dependency so that the new reference(s)
9404 * to the inode cannot be committed to disk until the updated
9405 * inode has been written.
9406 */
9407void
9408softdep_change_linkcnt(ip)
9409	struct inode *ip;	/* the inode with the increased link count */
9410{
9411	struct inodedep *inodedep;
9412	int dflags;
9413
9414	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
9415	    ("softdep_change_linkcnt called on non-softdep filesystem"));
9416	ACQUIRE_LOCK(ip->i_ump);
9417	dflags = DEPALLOC;
9418	if (IS_SNAPSHOT(ip))
9419		dflags |= NODELAY;
9420	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
9421	if (ip->i_nlink < ip->i_effnlink)
9422		panic("softdep_change_linkcnt: bad delta");
9423	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9424	FREE_LOCK(ip->i_ump);
9425}
9426
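/*
 * Editor's sketch, not part of ffs_softdep.c: softdep_change_linkcnt() above
 * records id_nlinkdelta = i_nlink - i_effnlink.  On the assumption (matching
 * handle_workitem_remove() below) that i_effnlink drops as soon as a name is
 * removed while i_nlink only drops once the cleared directory block is on
 * disk, the delta counts decrements that are still pending.  Standalone
 * arithmetic only; no kernel structures are used.
 */
#if 0
#include <assert.h>
#include <stdio.h>

int
main(void)
{
	int i_effnlink = 2;	/* effective count seen by the system */
	int i_nlink = 2;	/* count that may be written to disk */
	int id_nlinkdelta;

	/* A name is unlinked: the effective count drops immediately. */
	i_effnlink--;
	id_nlinkdelta = i_nlink - i_effnlink;
	assert(id_nlinkdelta == 1);	/* one decrement still pending */

	/* The cleared directory block reaches the disk ... */
	i_nlink--;			/* ... and the decrement lands. */
	id_nlinkdelta = i_nlink - i_effnlink;
	assert(id_nlinkdelta == 0);
	printf("no pending link-count work remains\n");
	return (0);
}
#endif
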
9427/*
9428 * Attach a sbdep dependency to the superblock buf so that we can keep
9429 * track of the head of the linked list of referenced but unlinked inodes.
9430 */
9431void
9432softdep_setup_sbupdate(ump, fs, bp)
9433	struct ufsmount *ump;
9434	struct fs *fs;
9435	struct buf *bp;
9436{
9437	struct sbdep *sbdep;
9438	struct worklist *wk;
9439
9440	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9441	    ("softdep_setup_sbupdate called on non-softdep filesystem"));
9442	LIST_FOREACH(wk, &bp->b_dep, wk_list)
9443		if (wk->wk_type == D_SBDEP)
9444			break;
9445	if (wk != NULL)
9446		return;
9447	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9448	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9449	sbdep->sb_fs = fs;
9450	sbdep->sb_ump = ump;
9451	ACQUIRE_LOCK(ump);
9452	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9453	FREE_LOCK(ump);
9454}
9455
9456/*
9457 * Return the first unlinked inodedep which is ready to be the head of the
9458 * list.  The inodedep and all those after it must have valid next pointers.
9459 */
9460static struct inodedep *
9461first_unlinked_inodedep(ump)
9462	struct ufsmount *ump;
9463{
9464	struct inodedep *inodedep;
9465	struct inodedep *idp;
9466
9467	LOCK_OWNED(ump);
9468	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9469	    inodedep; inodedep = idp) {
9470		if ((inodedep->id_state & UNLINKNEXT) == 0)
9471			return (NULL);
9472		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9473		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9474			break;
9475		if ((inodedep->id_state & UNLINKPREV) == 0)
9476			break;
9477	}
9478	return (inodedep);
9479}
9480
9481/*
9482 * Set the sujfree unlinked head pointer prior to writing a superblock.
9483 */
9484static void
9485initiate_write_sbdep(sbdep)
9486	struct sbdep *sbdep;
9487{
9488	struct inodedep *inodedep;
9489	struct fs *bpfs;
9490	struct fs *fs;
9491
9492	bpfs = sbdep->sb_fs;
9493	fs = sbdep->sb_ump->um_fs;
9494	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9495	if (inodedep) {
9496		fs->fs_sujfree = inodedep->id_ino;
9497		inodedep->id_state |= UNLINKPREV;
9498	} else
9499		fs->fs_sujfree = 0;
9500	bpfs->fs_sujfree = fs->fs_sujfree;
9501}
9502
9503/*
9504 * After a superblock is written determine whether it must be written again
9505 * due to a changing unlinked list head.
9506 */
9507static int
9508handle_written_sbdep(sbdep, bp)
9509	struct sbdep *sbdep;
9510	struct buf *bp;
9511{
9512	struct inodedep *inodedep;
9513	struct fs *fs;
9514
9515	LOCK_OWNED(sbdep->sb_ump);
9516	fs = sbdep->sb_fs;
9517	/*
9518	 * If the superblock doesn't match the in-memory list start over.
9519	 */
9520	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9521	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9522	    (inodedep == NULL && fs->fs_sujfree != 0)) {
9523		bdirty(bp);
9524		return (1);
9525	}
9526	WORKITEM_FREE(sbdep, D_SBDEP);
9527	if (fs->fs_sujfree == 0)
9528		return (0);
9529	/*
9530	 * Now that we have a record of this inode in stable store, allow it
9531	 * to be written to free up pending work.  Inodes may see a lot of
9532	 * write activity after they are unlinked which we must not hold up.
9533	 */
9534	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9535		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9536			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9537			    inodedep, inodedep->id_state);
9538		if (inodedep->id_state & UNLINKONLIST)
9539			break;
9540		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9541	}
9542
9543	return (0);
9544}
9545
9546/*
9547 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9548 */
9549static void
9550unlinked_inodedep(mp, inodedep)
9551	struct mount *mp;
9552	struct inodedep *inodedep;
9553{
9554	struct ufsmount *ump;
9555
9556	ump = VFSTOUFS(mp);
9557	LOCK_OWNED(ump);
9558	if (MOUNTEDSUJ(mp) == 0)
9559		return;
9560	ump->um_fs->fs_fmod = 1;
9561	if (inodedep->id_state & UNLINKED)
9562		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9563	inodedep->id_state |= UNLINKED;
9564	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9565}
9566
9567/*
9568 * Remove an inodedep from the unlinked inodedep list.  This may require
9569 * disk writes if the inode has made it that far.
9570 */
9571static void
9572clear_unlinked_inodedep(inodedep)
9573	struct inodedep *inodedep;
9574{
9575	struct ufsmount *ump;
9576	struct inodedep *idp;
9577	struct inodedep *idn;
9578	struct fs *fs;
9579	struct buf *bp;
9580	ino_t ino;
9581	ino_t nino;
9582	ino_t pino;
9583	int error;
9584
9585	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9586	fs = ump->um_fs;
9587	ino = inodedep->id_ino;
9588	error = 0;
9589	for (;;) {
9590		LOCK_OWNED(ump);
9591		KASSERT((inodedep->id_state & UNLINKED) != 0,
9592		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9593		    inodedep));
9594		/*
9595		 * If nothing has yet been written simply remove us from
9596		 * the in memory list and return.  This is the most common
9597		 * case where handle_workitem_remove() loses the final
9598		 * reference.
9599		 */
9600		if ((inodedep->id_state & UNLINKLINKS) == 0)
9601			break;
9602		/*
9603		 * If we have a NEXT pointer and no PREV pointer we can simply
9604		 * clear NEXT's PREV and remove ourselves from the list.  Be
9605		 * careful not to clear PREV if the superblock points at
9606		 * next as well.
9607		 */
9608		idn = TAILQ_NEXT(inodedep, id_unlinked);
9609		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9610			if (idn && fs->fs_sujfree != idn->id_ino)
9611				idn->id_state &= ~UNLINKPREV;
9612			break;
9613		}
9614		/*
9615		 * Here we have an inodedep which is actually linked into
9616		 * the list.  We must remove it by forcing a write to the
9617		 * link before us, whether it be the superblock or an inode.
9618		 * Unfortunately the list may change while we're waiting
9619		 * on the buf lock for either resource so we must loop until
9620		 * we lock the right one.  If both the superblock and an
9621		 * inode point to this inode we must clear the inode first
9622		 * followed by the superblock.
9623		 */
9624		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9625		pino = 0;
9626		if (idp && (idp->id_state & UNLINKNEXT))
9627			pino = idp->id_ino;
9628		FREE_LOCK(ump);
9629		if (pino == 0) {
9630			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9631			    (int)fs->fs_sbsize, 0, 0, 0);
9632		} else {
9633			error = bread(ump->um_devvp,
9634			    fsbtodb(fs, ino_to_fsba(fs, pino)),
9635			    (int)fs->fs_bsize, NOCRED, &bp);
9636			if (error)
9637				brelse(bp);
9638		}
9639		ACQUIRE_LOCK(ump);
9640		if (error)
9641			break;
9642		/* If the list has changed restart the loop. */
9643		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9644		nino = 0;
9645		if (idp && (idp->id_state & UNLINKNEXT))
9646			nino = idp->id_ino;
9647		if (nino != pino ||
9648		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9649			FREE_LOCK(ump);
9650			brelse(bp);
9651			ACQUIRE_LOCK(ump);
9652			continue;
9653		}
9654		nino = 0;
9655		idn = TAILQ_NEXT(inodedep, id_unlinked);
9656		if (idn)
9657			nino = idn->id_ino;
9658		/*
9659		 * Remove us from the in memory list.  After this we cannot
9660		 * access the inodedep.
9661		 */
9662		KASSERT((inodedep->id_state & UNLINKED) != 0,
9663		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9664		    inodedep));
9665		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9666		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9667		FREE_LOCK(ump);
9668		/*
9669		 * The predecessor's next pointer is manually updated here
9670		 * so that the NEXT flag is never cleared for an element
9671		 * that is in the list.
9672		 */
9673		if (pino == 0) {
9674			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9675			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9676			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9677			    bp);
9678		} else if (fs->fs_magic == FS_UFS1_MAGIC)
9679			((struct ufs1_dinode *)bp->b_data +
9680			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9681		else
9682			((struct ufs2_dinode *)bp->b_data +
9683			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9684		/*
9685		 * If the bwrite fails we have no recourse to recover.  The
9686		 * filesystem is corrupted already.
9687		 */
9688		bwrite(bp);
9689		ACQUIRE_LOCK(ump);
9690		/*
9691		 * If the superblock pointer still needs to be cleared force
9692		 * a write here.
9693		 */
9694		if (fs->fs_sujfree == ino) {
9695			FREE_LOCK(ump);
9696			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9697			    (int)fs->fs_sbsize, 0, 0, 0);
9698			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9699			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9700			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9701			    bp);
9702			bwrite(bp);
9703			ACQUIRE_LOCK(ump);
9704		}
9705
9706		if (fs->fs_sujfree != ino)
9707			return;
9708		panic("clear_unlinked_inodedep: Failed to clear free head");
9709	}
9710	if (inodedep->id_ino == fs->fs_sujfree)
9711		panic("clear_unlinked_inodedep: Freeing head of free list");
9712	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9713	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9714	return;
9715}
9716
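/*
 * Editor's sketch, not part of ffs_softdep.c: clear_unlinked_inodedep()
 * above removes an inode from the on-disk unlinked chain (head in
 * fs_sujfree, links in di_freelink) by rewriting either the superblock head
 * or the predecessor inode's pointer.  The standalone toy below does the
 * same pointer surgery on plain arrays; in the kernel each "rewrite" is a
 * buffered disk write performed under the locking protocol described above.
 */
#if 0
#include <assert.h>
#include <stdio.h>

#define	TOY_NINO	8

static int freelink[TOY_NINO];	/* per-inode "next unlinked" pointer */
static int sujfree;		/* superblock head; 0 ends the chain */

static void
toy_remove_ino(int ino)
{
	int prev, cur;

	if (sujfree == ino) {
		sujfree = freelink[ino];	/* rewrite superblock head */
	} else {
		for (prev = sujfree, cur = freelink[prev]; cur != ino;
		    prev = cur, cur = freelink[cur])
			;
		freelink[prev] = freelink[ino];	/* rewrite predecessor */
	}
	freelink[ino] = 0;
}

int
main(void)
{

	/* Chain: superblock -> 3 -> 5 -> 6 -> end. */
	sujfree = 3;
	freelink[3] = 5;
	freelink[5] = 6;
	freelink[6] = 0;

	toy_remove_ino(5);		/* middle: predecessor rewritten */
	assert(freelink[3] == 6);
	toy_remove_ino(3);		/* head: superblock rewritten */
	assert(sujfree == 6);
	printf("chain now: superblock -> %d -> end\n", sujfree);
	return (0);
}
#endif
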
9717/*
9718 * This workitem decrements the inode's link count.
9719 * If the link count reaches zero, the file is removed.
9720 */
9721static int
9722handle_workitem_remove(dirrem, flags)
9723	struct dirrem *dirrem;
9724	int flags;
9725{
9726	struct inodedep *inodedep;
9727	struct workhead dotdotwk;
9728	struct worklist *wk;
9729	struct ufsmount *ump;
9730	struct mount *mp;
9731	struct vnode *vp;
9732	struct inode *ip;
9733	ino_t oldinum;
9734
9735	if (dirrem->dm_state & ONWORKLIST)
9736		panic("handle_workitem_remove: dirrem %p still on worklist",
9737		    dirrem);
9738	oldinum = dirrem->dm_oldinum;
9739	mp = dirrem->dm_list.wk_mp;
9740	ump = VFSTOUFS(mp);
9741	flags |= LK_EXCLUSIVE;
9742	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9743		return (EBUSY);
9744	ip = VTOI(vp);
9745	ACQUIRE_LOCK(ump);
9746	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9747		panic("handle_workitem_remove: lost inodedep");
9748	if (dirrem->dm_state & ONDEPLIST)
9749		LIST_REMOVE(dirrem, dm_inonext);
9750	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9751	    ("handle_workitem_remove:  Journal entries not written."));
9752
9753	/*
9754	 * Move all dependencies waiting on the remove to complete
9755	 * from the dirrem to the inode inowait list to be completed
9756	 * after the inode has been updated and written to disk.  Any
9757	 * marked MKDIR_PARENT are saved to be completed when the .. ref
9758	 * is removed.
9759	 */
9760	LIST_INIT(&dotdotwk);
9761	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9762		WORKLIST_REMOVE(wk);
9763		if (wk->wk_state & MKDIR_PARENT) {
9764			wk->wk_state &= ~MKDIR_PARENT;
9765			WORKLIST_INSERT(&dotdotwk, wk);
9766			continue;
9767		}
9768		WORKLIST_INSERT(&inodedep->id_inowait, wk);
9769	}
9770	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
9771	/*
9772	 * Normal file deletion.
9773	 */
9774	if ((dirrem->dm_state & RMDIR) == 0) {
9775		ip->i_nlink--;
9776		DIP_SET(ip, i_nlink, ip->i_nlink);
9777		ip->i_flag |= IN_CHANGE;
9778		if (ip->i_nlink < ip->i_effnlink)
9779			panic("handle_workitem_remove: bad file delta");
9780		if (ip->i_nlink == 0)
9781			unlinked_inodedep(mp, inodedep);
9782		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9783		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9784		    ("handle_workitem_remove: worklist not empty. %s",
9785		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9786		WORKITEM_FREE(dirrem, D_DIRREM);
9787		FREE_LOCK(ump);
9788		goto out;
9789	}
9790	/*
9791	 * Directory deletion. Decrement reference count for both the
9792	 * just deleted parent directory entry and the reference for ".".
9793	 * Arrange to have the reference count on the parent decremented
9794	 * to account for the loss of "..".
9795	 */
9796	ip->i_nlink -= 2;
9797	DIP_SET(ip, i_nlink, ip->i_nlink);
9798	ip->i_flag |= IN_CHANGE;
9799	if (ip->i_nlink < ip->i_effnlink)
9800		panic("handle_workitem_remove: bad dir delta");
9801	if (ip->i_nlink == 0)
9802		unlinked_inodedep(mp, inodedep);
9803	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9804	/*
9805	 * Rename a directory to a new parent. Since we are both deleting
9806	 * and creating a new directory entry, the link count on the new
9807	 * directory should not change. Thus we skip the followup dirrem.
9808	 */
9809	if (dirrem->dm_state & DIRCHG) {
9810		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9811		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
9812		WORKITEM_FREE(dirrem, D_DIRREM);
9813		FREE_LOCK(ump);
9814		goto out;
9815	}
9816	dirrem->dm_state = ONDEPLIST;
9817	dirrem->dm_oldinum = dirrem->dm_dirinum;
9818	/*
9819	 * Place the dirrem on the parent's dirremhd list.
9820	 */
9821	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9822		panic("handle_workitem_remove: lost dir inodedep");
9823	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9824	/*
9825	 * If the allocated inode has never been written to disk, then
9826	 * the on-disk inode is zero'ed and we can remove the file
9827	 * immediately.  When journaling if the inode has been marked
9828	 * unlinked and not DEPCOMPLETE we know it can never be written.
9829	 */
9830	inodedep_lookup(mp, oldinum, 0, &inodedep);
9831	if (inodedep == NULL ||
9832	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9833	    check_inode_unwritten(inodedep)) {
9834		FREE_LOCK(ump);
9835		vput(vp);
9836		return handle_workitem_remove(dirrem, flags);
9837	}
9838	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9839	FREE_LOCK(ump);
9840	ip->i_flag |= IN_CHANGE;
9841out:
9842	ffs_update(vp, 0);
9843	vput(vp);
9844	return (0);
9845}
9846
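/*
 * Editor's sketch, not part of ffs_softdep.c: handle_workitem_remove() above
 * drops a removed directory's link count by two (its name plus its ".") and
 * queues a follow-up dirrem so the parent later loses the link contributed
 * by "..".  The standalone arithmetic below walks those counts; the starting
 * values are just an example of a parent with one subdirectory.
 */
#if 0
#include <assert.h>
#include <stdio.h>

int
main(void)
{
	int parent_nlink = 3;	/* ".", its own name, child's ".." */
	int child_nlink = 2;	/* its name in the parent plus its "." */

	/* rmdir: the child's name and its "." go away together ... */
	child_nlink -= 2;
	/* ... and the follow-up dirrem later drops the parent's "..". */
	parent_nlink -= 1;

	assert(child_nlink == 0 && parent_nlink == 2);
	printf("child unreferenced, parent back to %d links\n",
	    parent_nlink);
	return (0);
}
#endif
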
9847/*
9848 * Inode de-allocation dependencies.
9849 *
9850 * When an inode's link count is reduced to zero, it can be de-allocated. We
9851 * found it convenient to postpone de-allocation until after the inode is
9852 * written to disk with its new link count (zero).  At this point, all of the
9853 * on-disk inode's block pointers are nullified and, with careful dependency
9854 * list ordering, all dependencies related to the inode will be satisfied and
9855 * the corresponding dependency structures de-allocated.  So, if/when the
9856 * inode is reused, there will be no mixing of old dependencies with new
9857 * ones.  This artificial dependency is set up by the block de-allocation
9858 * procedure above (softdep_setup_freeblocks) and completed by the
9859 * following procedure.
9860 */
9861static void
9862handle_workitem_freefile(freefile)
9863	struct freefile *freefile;
9864{
9865	struct workhead wkhd;
9866	struct fs *fs;
9867	struct inodedep *idp;
9868	struct ufsmount *ump;
9869	int error;
9870
9871	ump = VFSTOUFS(freefile->fx_list.wk_mp);
9872	fs = ump->um_fs;
9873#ifdef DEBUG
9874	ACQUIRE_LOCK(ump);
9875	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9876	FREE_LOCK(ump);
9877	if (error)
9878		panic("handle_workitem_freefile: inodedep %p survived", idp);
9879#endif
9880	UFS_LOCK(ump);
9881	fs->fs_pendinginodes -= 1;
9882	UFS_UNLOCK(ump);
9883	LIST_INIT(&wkhd);
9884	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9885	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9886	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9887		softdep_error("handle_workitem_freefile", error);
9888	ACQUIRE_LOCK(ump);
9889	WORKITEM_FREE(freefile, D_FREEFILE);
9890	FREE_LOCK(ump);
9891}
9892
9893
9894/*
9895 * Helper function which unlinks the marker element from the work list and
9896 * returns the next element on the list.
9897 */
9898static __inline struct worklist *
9899markernext(struct worklist *marker)
9900{
9901	struct worklist *next;
9902
9903	next = LIST_NEXT(marker, wk_list);
9904	LIST_REMOVE(marker, wk_list);
9905	return next;
9906}
9907
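/*
 * Editor's sketch, not part of ffs_softdep.c: markernext() above supports
 * the traversal pattern used in softdep_disk_io_initiation() -- park a
 * marker after the current element so the scan can continue even if the
 * list is perturbed while locks are dropped.  The standalone toy below runs
 * the same pattern over a <sys/queue.h> list; struct item is invented for
 * the example.
 */
#if 0
#include <sys/queue.h>
#include <assert.h>
#include <stdio.h>

struct item {
	LIST_ENTRY(item)	i_next;
	int			i_type;		/* 0 marks the marker */
};
LIST_HEAD(itemhead, item);

/* Same idea as markernext(): unhook the marker, return its successor. */
static struct item *
toy_markernext(struct item *marker)
{
	struct item *next;

	next = LIST_NEXT(marker, i_next);
	LIST_REMOVE(marker, i_next);
	return (next);
}

int
main(void)
{
	struct itemhead head = LIST_HEAD_INITIALIZER(head);
	struct item a = { .i_type = 1 }, b = { .i_type = 2 };
	struct item marker = { .i_type = 0 };
	struct item *it;
	int visited = 0;

	LIST_INSERT_HEAD(&head, &b, i_next);
	LIST_INSERT_HEAD(&head, &a, i_next);

	for (it = LIST_FIRST(&head); it != NULL;
	    it = toy_markernext(&marker)) {
		/*
		 * Park the marker after the current element before working
		 * on it; the list may change while "locks" are dropped.
		 */
		LIST_INSERT_AFTER(it, &marker, i_next);
		visited++;
	}
	assert(visited == 2);
	printf("visited %d items via marker traversal\n", visited);
	return (0);
}
#endif
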
9908/*
9909 * Disk writes.
9910 *
9911 * The dependency structures constructed above are most actively used when file
9912 * system blocks are written to disk.  No constraints are placed on when a
9913 * block can be written, but unsatisfied update dependencies are made safe by
9914 * modifying (or replacing) the source memory for the duration of the disk
9915 * write.  When the disk write completes, the memory block is again brought
9916 * up-to-date.
9917 *
9918 * In-core inode structure reclamation.
9919 *
9920 * Because there are a finite number of "in-core" inode structures, they are
9921 * reused regularly.  By transferring all inode-related dependencies to the
9922 * in-memory inode block and indexing them separately (via "inodedep"s), we
9923 * can allow "in-core" inode structures to be reused at any time and avoid
9924 * any increase in contention.
9925 *
9926 * Called just before entering the device driver to initiate a new disk I/O.
9927 * The buffer must be locked, thus, no I/O completion operations can occur
9928 * while we are manipulating its associated dependencies.
9929 */
9930static void
9931softdep_disk_io_initiation(bp)
9932	struct buf *bp;		/* structure describing disk write to occur */
9933{
9934	struct worklist *wk;
9935	struct worklist marker;
9936	struct inodedep *inodedep;
9937	struct freeblks *freeblks;
9938	struct jblkdep *jblkdep;
9939	struct newblk *newblk;
9940	struct ufsmount *ump;
9941
9942	/*
9943	 * We only care about write operations. There should never
9944	 * be dependencies for reads.
9945	 */
9946	if (bp->b_iocmd != BIO_WRITE)
9947		panic("softdep_disk_io_initiation: not write");
9948
9949	if (bp->b_vflags & BV_BKGRDINPROG)
9950		panic("softdep_disk_io_initiation: Writing buffer with "
9951		    "background write in progress: %p", bp);
9952
9953	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
9954		return;
9955	ump = VFSTOUFS(wk->wk_mp);
9956
9957	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
9958	PHOLD(curproc);			/* Don't swap out kernel stack */
9959	ACQUIRE_LOCK(ump);
9960	/*
9961	 * Do any necessary pre-I/O processing.
9962	 */
9963	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
9964	     wk = markernext(&marker)) {
9965		LIST_INSERT_AFTER(wk, &marker, wk_list);
9966		switch (wk->wk_type) {
9967
9968		case D_PAGEDEP:
9969			initiate_write_filepage(WK_PAGEDEP(wk), bp);
9970			continue;
9971
9972		case D_INODEDEP:
9973			inodedep = WK_INODEDEP(wk);
9974			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
9975				initiate_write_inodeblock_ufs1(inodedep, bp);
9976			else
9977				initiate_write_inodeblock_ufs2(inodedep, bp);
9978			continue;
9979
9980		case D_INDIRDEP:
9981			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
9982			continue;
9983
9984		case D_BMSAFEMAP:
9985			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
9986			continue;
9987
9988		case D_JSEG:
9989			WK_JSEG(wk)->js_buf = NULL;
9990			continue;
9991
9992		case D_FREEBLKS:
9993			freeblks = WK_FREEBLKS(wk);
9994			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
9995			/*
9996			 * We have to wait for the freeblks to be journaled
9997			 * before we can write an inodeblock with updated
9998			 * pointers.  Be careful to arrange the marker so
9999			 * we revisit the freeblks if it's not removed by
10000			 * the first jwait().
10001			 */
10002			if (jblkdep != NULL) {
10003				LIST_REMOVE(&marker, wk_list);
10004				LIST_INSERT_BEFORE(wk, &marker, wk_list);
10005				jwait(&jblkdep->jb_list, MNT_WAIT);
10006			}
10007			continue;
10008		case D_ALLOCDIRECT:
10009		case D_ALLOCINDIR:
10010			/*
10011			 * We have to wait for the jnewblk to be journaled
10012			 * before we can write to a block if the contents
10013			 * may be confused with an earlier file's indirect
10014			 * at recovery time.  Handle the marker as described
10015			 * above.
10016			 */
10017			newblk = WK_NEWBLK(wk);
10018			if (newblk->nb_jnewblk != NULL &&
10019			    indirblk_lookup(newblk->nb_list.wk_mp,
10020			    newblk->nb_newblkno)) {
10021				LIST_REMOVE(&marker, wk_list);
10022				LIST_INSERT_BEFORE(wk, &marker, wk_list);
10023				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
10024			}
10025			continue;
10026
10027		case D_SBDEP:
10028			initiate_write_sbdep(WK_SBDEP(wk));
10029			continue;
10030
10031		case D_MKDIR:
10032		case D_FREEWORK:
10033		case D_FREEDEP:
10034		case D_JSEGDEP:
10035			continue;
10036
10037		default:
10038			panic("handle_disk_io_initiation: Unexpected type %s",
10039			    TYPENAME(wk->wk_type));
10040			/* NOTREACHED */
10041		}
10042	}
10043	FREE_LOCK(ump);
10044	PRELE(curproc);			/* Allow swapout of kernel stack */
10045}
10046
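/*
 * Editor's sketch, not part of ffs_softdep.c: the "Disk writes" comment
 * above describes the core trick -- fields with unsatisfied dependencies
 * are rolled back in the in-memory buffer for the duration of the write and
 * rolled forward again at completion, so the disk never sees an unsafe
 * value.  The standalone toy below performs one such save/undo/write/redo
 * cycle on a two-word pretend buffer.
 */
#if 0
#include <assert.h>
#include <stdio.h>
#include <string.h>

static int membuf[2] = { 11, 22 };	/* word 1 has an unsatisfied dep */
static int disk[2];

int
main(void)
{
	int saved;

	/* Initiation: roll the unsafe field back before the write starts. */
	saved = membuf[1];
	membuf[1] = 0;

	/* The "disk write" only ever sees the safe contents. */
	memcpy(disk, membuf, sizeof(membuf));
	assert(disk[1] == 0);

	/* Completion: bring the in-memory copy up to date again. */
	membuf[1] = saved;
	assert(membuf[1] == 22);
	printf("disk got the rolled-back value; memory restored\n");
	return (0);
}
#endif
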
10047/*
10048 * Called from within the procedure above to deal with unsatisfied
10049 * allocation dependencies in a directory. The buffer must be locked,
10050 * thus, no I/O completion operations can occur while we are
10051 * manipulating its associated dependencies.
10052 */
10053static void
10054initiate_write_filepage(pagedep, bp)
10055	struct pagedep *pagedep;
10056	struct buf *bp;
10057{
10058	struct jremref *jremref;
10059	struct jmvref *jmvref;
10060	struct dirrem *dirrem;
10061	struct diradd *dap;
10062	struct direct *ep;
10063	int i;
10064
10065	if (pagedep->pd_state & IOSTARTED) {
10066		/*
10067		 * This can only happen if there is a driver that does not
10068		 * understand chaining. Here biodone will reissue the call
10069		 * to strategy for the incomplete buffers.
10070		 */
10071		printf("initiate_write_filepage: already started\n");
10072		return;
10073	}
10074	pagedep->pd_state |= IOSTARTED;
10075	/*
10076	 * Wait for all journal remove dependencies to hit the disk.
10077	 * We cannot allow any potentially conflicting directory adds
10078	 * to be visible before removes, as rollback is too difficult.
10079	 * The per-filesystem lock may be dropped and re-acquired, however
10080	 * we hold the buf locked so the dependency can not go away.
10081	 */
10082	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
10083		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
10084			jwait(&jremref->jr_list, MNT_WAIT);
10085	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
10086		jwait(&jmvref->jm_list, MNT_WAIT);
10087	for (i = 0; i < DAHASHSZ; i++) {
10088		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
10089			ep = (struct direct *)
10090			    ((char *)bp->b_data + dap->da_offset);
10091			if (ep->d_ino != dap->da_newinum)
10092				panic("%s: dir inum %ju != new %ju",
10093				    "initiate_write_filepage",
10094				    (uintmax_t)ep->d_ino,
10095				    (uintmax_t)dap->da_newinum);
10096			if (dap->da_state & DIRCHG)
10097				ep->d_ino = dap->da_previous->dm_oldinum;
10098			else
10099				ep->d_ino = 0;
10100			dap->da_state &= ~ATTACHED;
10101			dap->da_state |= UNDONE;
10102		}
10103	}
10104}
10105
10106/*
10107 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
10108 * Note that any bug fixes made to this routine must be done in the
10109 * version found below.
10110 *
10111 * Called from within the procedure above to deal with unsatisfied
10112 * allocation dependencies in an inodeblock. The buffer must be
10113 * locked, thus, no I/O completion operations can occur while we
10114 * are manipulating its associated dependencies.
10115 */
10116static void
10117initiate_write_inodeblock_ufs1(inodedep, bp)
10118	struct inodedep *inodedep;
10119	struct buf *bp;			/* The inode block */
10120{
10121	struct allocdirect *adp, *lastadp;
10122	struct ufs1_dinode *dp;
10123	struct ufs1_dinode *sip;
10124	struct inoref *inoref;
10125	struct ufsmount *ump;
10126	struct fs *fs;
10127	ufs_lbn_t i;
10128#ifdef INVARIANTS
10129	ufs_lbn_t prevlbn = 0;
10130#endif
10131	int deplist;
10132
10133	if (inodedep->id_state & IOSTARTED)
10134		panic("initiate_write_inodeblock_ufs1: already started");
10135	inodedep->id_state |= IOSTARTED;
10136	fs = inodedep->id_fs;
10137	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10138	LOCK_OWNED(ump);
10139	dp = (struct ufs1_dinode *)bp->b_data +
10140	    ino_to_fsbo(fs, inodedep->id_ino);
10141
10142	/*
10143	 * If we're on the unlinked list but have not yet written our
10144	 * next pointer, initialize it here.
10145	 */
10146	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10147		struct inodedep *inon;
10148
10149		inon = TAILQ_NEXT(inodedep, id_unlinked);
10150		dp->di_freelink = inon ? inon->id_ino : 0;
10151	}
10152	/*
10153	 * If the bitmap is not yet written, then the allocated
10154	 * inode cannot be written to disk.
10155	 */
10156	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10157		if (inodedep->id_savedino1 != NULL)
10158			panic("initiate_write_inodeblock_ufs1: I/O underway");
10159		FREE_LOCK(ump);
10160		sip = malloc(sizeof(struct ufs1_dinode),
10161		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10162		ACQUIRE_LOCK(ump);
10163		inodedep->id_savedino1 = sip;
10164		*inodedep->id_savedino1 = *dp;
10165		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
10166		dp->di_gen = inodedep->id_savedino1->di_gen;
10167		dp->di_freelink = inodedep->id_savedino1->di_freelink;
10168		return;
10169	}
10170	/*
10171	 * If no dependencies, then there is nothing to roll back.
10172	 */
10173	inodedep->id_savedsize = dp->di_size;
10174	inodedep->id_savedextsize = 0;
10175	inodedep->id_savednlink = dp->di_nlink;
10176	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10177	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10178		return;
10179	/*
10180	 * Revert the link count to that of the first unwritten journal entry.
10181	 */
10182	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10183	if (inoref)
10184		dp->di_nlink = inoref->if_nlink;
10185	/*
10186	 * Set the dependencies to busy.
10187	 */
10188	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10189	     adp = TAILQ_NEXT(adp, ad_next)) {
10190#ifdef INVARIANTS
10191		if (deplist != 0 && prevlbn >= adp->ad_offset)
10192			panic("softdep_write_inodeblock: lbn order");
10193		prevlbn = adp->ad_offset;
10194		if (adp->ad_offset < NDADDR &&
10195		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10196			panic("%s: direct pointer #%jd mismatch %d != %jd",
10197			    "softdep_write_inodeblock",
10198			    (intmax_t)adp->ad_offset,
10199			    dp->di_db[adp->ad_offset],
10200			    (intmax_t)adp->ad_newblkno);
10201		if (adp->ad_offset >= NDADDR &&
10202		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10203			panic("%s: indirect pointer #%jd mismatch %d != %jd",
10204			    "softdep_write_inodeblock",
10205			    (intmax_t)adp->ad_offset - NDADDR,
10206			    dp->di_ib[adp->ad_offset - NDADDR],
10207			    (intmax_t)adp->ad_newblkno);
10208		deplist |= 1 << adp->ad_offset;
10209		if ((adp->ad_state & ATTACHED) == 0)
10210			panic("softdep_write_inodeblock: Unknown state 0x%x",
10211			    adp->ad_state);
10212#endif /* INVARIANTS */
10213		adp->ad_state &= ~ATTACHED;
10214		adp->ad_state |= UNDONE;
10215	}
10216	/*
10217	 * The on-disk inode cannot claim to be any larger than the last
10218	 * fragment that has been written. Otherwise, the on-disk inode
10219	 * might have fragments that were not the last block in the file
10220	 * which would corrupt the filesystem.
10221	 */
10222	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10223	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10224		if (adp->ad_offset >= NDADDR)
10225			break;
10226		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10227		/* keep going until hitting a rollback to a frag */
10228		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10229			continue;
10230		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
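		/*
		 * The size now ends at the rolled-back fragment, so clear
		 * every later direct pointer and all indirect pointers;
		 * the on-disk inode must not reference blocks beyond the
		 * size recorded above.
		 */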
10231		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10232#ifdef INVARIANTS
10233			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10234				panic("softdep_write_inodeblock: lost dep1");
10235#endif /* INVARIANTS */
10236			dp->di_db[i] = 0;
10237		}
10238		for (i = 0; i < NIADDR; i++) {
10239#ifdef INVARIANTS
10240			if (dp->di_ib[i] != 0 &&
10241			    (deplist & ((1 << NDADDR) << i)) == 0)
10242				panic("softdep_write_inodeblock: lost dep2");
10243#endif /* INVARIANTS */
10244			dp->di_ib[i] = 0;
10245		}
10246		return;
10247	}
10248	/*
10249	 * If we have zeroed out the last allocated block of the file,
10250	 * roll back the size to the last currently allocated block.
10251	 * We know that this last allocated block is full-sized as
10252	 * we already checked for fragments in the loop above.
10253	 */
10254	if (lastadp != NULL &&
10255	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10256		for (i = lastadp->ad_offset; i >= 0; i--)
10257			if (dp->di_db[i] != 0)
10258				break;
10259		dp->di_size = (i + 1) * fs->fs_bsize;
10260	}
10261	/*
10262	 * The only dependencies are for indirect blocks.
10263	 *
10264	 * The file size for indirect block additions is not guaranteed.
10265	 * Such a guarantee would be non-trivial to achieve. The conventional
10266	 * synchronous write implementation also does not make this guarantee.
10267	 * Fsck should catch and fix discrepancies. Arguably, the file size
10268	 * can be over-estimated without destroying integrity when the file
10269	 * moves into the indirect blocks (i.e., is large). If we want to
10270	 * postpone fsck, we are stuck with this argument.
10271	 */
10272	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10273		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10274}
10275
10276/*
10277 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10278 * Note that any bug fixes made to this routine must be done in the
10279 * version found above.
10280 *
10281 * Called from within the procedure above to deal with unsatisfied
10282 * allocation dependencies in an inodeblock. The buffer must be
10283 * locked, thus, no I/O completion operations can occur while we
10284 * are manipulating its associated dependencies.
10285 */
10286static void
10287initiate_write_inodeblock_ufs2(inodedep, bp)
10288	struct inodedep *inodedep;
10289	struct buf *bp;			/* The inode block */
10290{
10291	struct allocdirect *adp, *lastadp;
10292	struct ufs2_dinode *dp;
10293	struct ufs2_dinode *sip;
10294	struct inoref *inoref;
10295	struct ufsmount *ump;
10296	struct fs *fs;
10297	ufs_lbn_t i;
10298#ifdef INVARIANTS
10299	ufs_lbn_t prevlbn = 0;
10300#endif
10301	int deplist;
10302
10303	if (inodedep->id_state & IOSTARTED)
10304		panic("initiate_write_inodeblock_ufs2: already started");
10305	inodedep->id_state |= IOSTARTED;
10306	fs = inodedep->id_fs;
10307	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10308	LOCK_OWNED(ump);
10309	dp = (struct ufs2_dinode *)bp->b_data +
10310	    ino_to_fsbo(fs, inodedep->id_ino);
10311
10312	/*
10313	 * If we're on the unlinked list but have not yet written our
10314	 * next pointer, initialize it here.
10315	 */
10316	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10317		struct inodedep *inon;
10318
10319		inon = TAILQ_NEXT(inodedep, id_unlinked);
10320		dp->di_freelink = inon ? inon->id_ino : 0;
10321	}
10322	/*
10323	 * If the bitmap is not yet written, then the allocated
10324	 * inode cannot be written to disk.
10325	 */
10326	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10327		if (inodedep->id_savedino2 != NULL)
10328			panic("initiate_write_inodeblock_ufs2: I/O underway");
10329		FREE_LOCK(ump);
10330		sip = malloc(sizeof(struct ufs2_dinode),
10331		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10332		ACQUIRE_LOCK(ump);
10333		inodedep->id_savedino2 = sip;
10334		*inodedep->id_savedino2 = *dp;
10335		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10336		dp->di_gen = inodedep->id_savedino2->di_gen;
10337		dp->di_freelink = inodedep->id_savedino2->di_freelink;
10338		return;
10339	}
10340	/*
10341	 * If no dependencies, then there is nothing to roll back.
10342	 */
10343	inodedep->id_savedsize = dp->di_size;
10344	inodedep->id_savedextsize = dp->di_extsize;
10345	inodedep->id_savednlink = dp->di_nlink;
10346	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10347	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
10348	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10349		return;
10350	/*
10351	 * Revert the link count to that of the first unwritten journal entry.
10352	 */
10353	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10354	if (inoref)
10355		dp->di_nlink = inoref->if_nlink;
10356
10357	/*
10358	 * Set the ext data dependencies to busy.
10359	 */
10360	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10361	     adp = TAILQ_NEXT(adp, ad_next)) {
10362#ifdef INVARIANTS
10363		if (deplist != 0 && prevlbn >= adp->ad_offset)
10364			panic("softdep_write_inodeblock: lbn order");
10365		prevlbn = adp->ad_offset;
10366		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10367			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10368			    "softdep_write_inodeblock",
10369			    (intmax_t)adp->ad_offset,
10370			    (intmax_t)dp->di_extb[adp->ad_offset],
10371			    (intmax_t)adp->ad_newblkno);
10372		deplist |= 1 << adp->ad_offset;
10373		if ((adp->ad_state & ATTACHED) == 0)
10374			panic("softdep_write_inodeblock: Unknown state 0x%x",
10375			    adp->ad_state);
10376#endif /* INVARIANTS */
10377		adp->ad_state &= ~ATTACHED;
10378		adp->ad_state |= UNDONE;
10379	}
10380	/*
10381	 * The on-disk inode cannot claim to be any larger than the last
10382	 * fragment that has been written. Otherwise, the on-disk inode
10383	 * might have fragments that were not the last block in the ext
10384	 * data which would corrupt the filesystem.
10385	 */
10386	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10387	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10388		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10389		/* keep going until hitting a rollback to a frag */
10390		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10391			continue;
10392		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10393		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
10394#ifdef INVARIANTS
10395			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10396				panic("softdep_write_inodeblock: lost dep1");
10397#endif /* INVARIANTS */
10398			dp->di_extb[i] = 0;
10399		}
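		/*
		 * The ext size was already rolled back above, so clear
		 * lastadp to skip the size adjustment that follows this
		 * loop.
		 */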
10400		lastadp = NULL;
10401		break;
10402	}
10403	/*
10404	 * If we have zeroed out the last allocated block of the ext
10405	 * data, roll back the size to the last currently allocated block.
10406	 * We know that this last allocated block is full-sized as
10407	 * we already checked for fragments in the loop above.
10408	 */
10409	if (lastadp != NULL &&
10410	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10411		for (i = lastadp->ad_offset; i >= 0; i--)
10412			if (dp->di_extb[i] != 0)
10413				break;
10414		dp->di_extsize = (i + 1) * fs->fs_bsize;
10415	}
10416	/*
10417	 * Set the file data dependencies to busy.
10418	 */
10419	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10420	     adp = TAILQ_NEXT(adp, ad_next)) {
10421#ifdef INVARIANTS
10422		if (deplist != 0 && prevlbn >= adp->ad_offset)
10423			panic("softdep_write_inodeblock: lbn order");
10424		if ((adp->ad_state & ATTACHED) == 0)
10425			panic("inodedep %p and adp %p not attached", inodedep, adp);
10426		prevlbn = adp->ad_offset;
10427		if (adp->ad_offset < NDADDR &&
10428		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10429			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10430			    "softdep_write_inodeblock",
10431			    (intmax_t)adp->ad_offset,
10432			    (intmax_t)dp->di_db[adp->ad_offset],
10433			    (intmax_t)adp->ad_newblkno);
10434		if (adp->ad_offset >= NDADDR &&
10435		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10436			panic("%s indirect pointer #%jd mismatch %jd != %jd",
10437			    "softdep_write_inodeblock:",
10438			    (intmax_t)adp->ad_offset - NDADDR,
10439			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
10440			    (intmax_t)adp->ad_newblkno);
10441		deplist |= 1 << adp->ad_offset;
10442		if ((adp->ad_state & ATTACHED) == 0)
10443			panic("softdep_write_inodeblock: Unknown state 0x%x",
10444			    adp->ad_state);
10445#endif /* INVARIANTS */
10446		adp->ad_state &= ~ATTACHED;
10447		adp->ad_state |= UNDONE;
10448	}
10449	/*
10450	 * The on-disk inode cannot claim to be any larger than the last
10451	 * fragment that has been written. Otherwise, the on-disk inode
10452	 * might have fragments that were not the last block in the file
10453	 * which would corrupt the filesystem.
10454	 */
10455	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10456	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10457		if (adp->ad_offset >= NDADDR)
10458			break;
10459		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10460		/* keep going until hitting a rollback to a frag */
10461		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10462			continue;
10463		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10464		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10465#ifdef INVARIANTS
10466			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10467				panic("softdep_write_inodeblock: lost dep2");
10468#endif /* INVARIANTS */
10469			dp->di_db[i] = 0;
10470		}
10471		for (i = 0; i < NIADDR; i++) {
10472#ifdef INVARIANTS
10473			if (dp->di_ib[i] != 0 &&
10474			    (deplist & ((1 << NDADDR) << i)) == 0)
10475				panic("softdep_write_inodeblock: lost dep3");
10476#endif /* INVARIANTS */
10477			dp->di_ib[i] = 0;
10478		}
10479		return;
10480	}
10481	/*
10482	 * If we have zeroed out the last allocated block of the file,
10483	 * roll back the size to the last currently allocated block.
10484	 * We know that this last allocated block is full-sized as
10485	 * we already checked for fragments in the loop above.
10486	 */
10487	if (lastadp != NULL &&
10488	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10489		for (i = lastadp->ad_offset; i >= 0; i--)
10490			if (dp->di_db[i] != 0)
10491				break;
10492		dp->di_size = (i + 1) * fs->fs_bsize;
10493	}
10494	/*
10495	 * The only dependencies are for indirect blocks.
10496	 *
10497	 * The file size for indirect block additions is not guaranteed.
10498	 * Such a guarantee would be non-trivial to achieve. The conventional
10499	 * synchronous write implementation also does not make this guarantee.
10500	 * Fsck should catch and fix discrepancies. Arguably, the file size
10501	 * can be over-estimated without destroying integrity when the file
10502	 * moves into the indirect blocks (i.e., is large). If we want to
10503	 * postpone fsck, we are stuck with this argument.
10504	 */
10505	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10506		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10507}
10508
10509/*
10510 * Cancel an indirdep as a result of truncation.  Release all of the
10511 * children allocindirs and place their journal work on the appropriate
10512 * list.
10513 */
10514static void
10515cancel_indirdep(indirdep, bp, freeblks)
10516	struct indirdep *indirdep;
10517	struct buf *bp;
10518	struct freeblks *freeblks;
10519{
10520	struct allocindir *aip;
10521
10522	/*
10523	 * None of the indirect pointers will ever be visible,
10524	 * so they can simply be tossed. GOINGAWAY ensures
10525	 * that allocated pointers will be saved in the buffer
10526	 * cache until they are freed. Note that they will
10527	 * only be able to be found by their physical address
10528	 * since the inode mapping the logical address will
10529	 * be gone. The save buffer used for the safe copy
10530	 * was allocated in setup_allocindir_phase2 using
10531	 * the physical address so it could be used for this
10532	 * purpose. Hence we swap the safe copy with the real
10533	 * copy, allowing the safe copy to be freed and holding
10534	 * on to the real copy for later use in indir_trunc.
10535	 */
10536	if (indirdep->ir_state & GOINGAWAY)
10537		panic("cancel_indirdep: already gone");
10538	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10539		indirdep->ir_state |= DEPCOMPLETE;
10540		LIST_REMOVE(indirdep, ir_next);
10541	}
10542	indirdep->ir_state |= GOINGAWAY;
10543	/*
10544	 * Pass in bp for blocks that still have journal writes
10545	 * pending so we can cancel them on their own.
10546	 */
10547	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != 0)
10548		cancel_allocindir(aip, bp, freeblks, 0);
10549	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0)
10550		cancel_allocindir(aip, NULL, freeblks, 0);
10551	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0)
10552		cancel_allocindir(aip, NULL, freeblks, 0);
10553	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != 0)
10554		cancel_allocindir(aip, NULL, freeblks, 0);
10555	/*
10556	 * If there are pending partial truncations we need to keep the
10557	 * old block copy around until they complete.  This is because
10558	 * the current b_data is not a perfect superset of the available
10559	 * blocks.
10560	 */
10561	if (TAILQ_EMPTY(&indirdep->ir_trunc))
10562		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10563	else
10564		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10565	WORKLIST_REMOVE(&indirdep->ir_list);
10566	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10567	indirdep->ir_bp = NULL;
10568	indirdep->ir_freeblks = freeblks;
10569}
10570
10571/*
10572 * Free an indirdep once it no longer has new pointers to track.
10573 */
10574static void
10575free_indirdep(indirdep)
10576	struct indirdep *indirdep;
10577{
10578
10579	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10580	    ("free_indirdep: Indir trunc list not empty."));
10581	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10582	    ("free_indirdep: Complete head not empty."));
10583	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10584	    ("free_indirdep: write head not empty."));
10585	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10586	    ("free_indirdep: done head not empty."));
10587	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10588	    ("free_indirdep: deplist head not empty."));
10589	KASSERT((indirdep->ir_state & DEPCOMPLETE),
10590	    ("free_indirdep: %p still on newblk list.", indirdep));
10591	KASSERT(indirdep->ir_saveddata == NULL,
10592	    ("free_indirdep: %p still has saved data.", indirdep));
10593	if (indirdep->ir_state & ONWORKLIST)
10594		WORKLIST_REMOVE(&indirdep->ir_list);
10595	WORKITEM_FREE(indirdep, D_INDIRDEP);
10596}
10597
10598/*
10599 * Called before a write to an indirdep.  This routine is responsible for
10600 * rolling back pointers to a safe state which includes only those
10601 * allocindirs which have been completed.
10602 */
10603static void
10604initiate_write_indirdep(indirdep, bp)
10605	struct indirdep *indirdep;
10606	struct buf *bp;
10607{
10608	struct ufsmount *ump;
10609
10610	indirdep->ir_state |= IOSTARTED;
10611	if (indirdep->ir_state & GOINGAWAY)
10612		panic("disk_io_initiation: indirdep gone");
10613	/*
10614	 * If there are no remaining dependencies, this will be writing
10615	 * the real pointers.
10616	 */
10617	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10618	    TAILQ_EMPTY(&indirdep->ir_trunc))
10619		return;
10620	/*
10621	 * Replace up-to-date version with safe version.
10622	 */
10623	if (indirdep->ir_saveddata == NULL) {
10624		ump = VFSTOUFS(indirdep->ir_list.wk_mp);
10625		LOCK_OWNED(ump);
10626		FREE_LOCK(ump);
10627		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10628		    M_SOFTDEP_FLAGS);
10629		ACQUIRE_LOCK(ump);
10630	}
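	/*
	 * Stash the up-to-date pointers in ir_saveddata and write the
	 * safe copy from ir_savebp, which holds only pointers whose
	 * dependencies have completed.  The saved data is copied back
	 * by handle_written_indirdep().
	 */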
10631	indirdep->ir_state &= ~ATTACHED;
10632	indirdep->ir_state |= UNDONE;
10633	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10634	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10635	    bp->b_bcount);
10636}
10637
10638/*
10639 * Called when an inode has been cleared in a cg bitmap.  This finally
10640 * eliminates any canceled jaddrefs.
10641 */
10642void
10643softdep_setup_inofree(mp, bp, ino, wkhd)
10644	struct mount *mp;
10645	struct buf *bp;
10646	ino_t ino;
10647	struct workhead *wkhd;
10648{
10649	struct worklist *wk, *wkn;
10650	struct inodedep *inodedep;
10651	struct ufsmount *ump;
10652	uint8_t *inosused;
10653	struct cg *cgp;
10654	struct fs *fs;
10655
10656	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
10657	    ("softdep_setup_inofree called on non-softdep filesystem"));
10658	ump = VFSTOUFS(mp);
10659	ACQUIRE_LOCK(ump);
10660	fs = ump->um_fs;
10661	cgp = (struct cg *)bp->b_data;
10662	inosused = cg_inosused(cgp);
10663	if (isset(inosused, ino % fs->fs_ipg))
10664		panic("softdep_setup_inofree: inode %ju not freed.",
10665		    (uintmax_t)ino);
10666	if (inodedep_lookup(mp, ino, 0, &inodedep))
10667		panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10668		    (uintmax_t)ino, inodedep);
10669	if (wkhd) {
10670		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10671			if (wk->wk_type != D_JADDREF)
10672				continue;
10673			WORKLIST_REMOVE(wk);
10674			/*
10675			 * We can free immediately even if the jaddref
10676			 * isn't attached in a background write as now
10677			 * the bitmaps are reconciled.
10678			 */
10679			wk->wk_state |= COMPLETE | ATTACHED;
10680			free_jaddref(WK_JADDREF(wk));
10681		}
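		/*
		 * Attach any remaining journal work to the cg buffer so
		 * that it is released only after the updated bitmap has
		 * been written.
		 */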
10682		jwork_move(&bp->b_dep, wkhd);
10683	}
10684	FREE_LOCK(ump);
10685}
10686
10687
10688/*
10689 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10690 * map.  Any dependencies waiting for the write to clear are added to the
10691 * buf's list and any jnewblks that are being canceled are discarded
10692 * immediately.
10693 */
10694void
10695softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10696	struct mount *mp;
10697	struct buf *bp;
10698	ufs2_daddr_t blkno;
10699	int frags;
10700	struct workhead *wkhd;
10701{
10702	struct bmsafemap *bmsafemap;
10703	struct jnewblk *jnewblk;
10704	struct ufsmount *ump;
10705	struct worklist *wk;
10706	struct fs *fs;
10707#ifdef SUJ_DEBUG
10708	uint8_t *blksfree;
10709	struct cg *cgp;
10710	ufs2_daddr_t jstart;
10711	ufs2_daddr_t jend;
10712	ufs2_daddr_t end;
10713	long bno;
10714	int i;
10715#endif
10716
10717	CTR3(KTR_SUJ,
10718	    "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10719	    blkno, frags, wkhd);
10720
10721	ump = VFSTOUFS(mp);
10722	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
10723	    ("softdep_setup_blkfree called on non-softdep filesystem"));
10724	ACQUIRE_LOCK(ump);
10725	/* Lookup the bmsafemap so we track when it is dirty. */
10726	fs = ump->um_fs;
10727	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10728	/*
10729	 * Detach any jnewblks which have been canceled.  They must linger
10730	 * until the bitmap is cleared again by ffs_blkfree() to prevent
10731	 * an unjournaled allocation from hitting the disk.
10732	 */
10733	if (wkhd) {
10734		while ((wk = LIST_FIRST(wkhd)) != NULL) {
10735			CTR2(KTR_SUJ,
10736			    "softdep_setup_blkfree: blkno %jd wk type %d",
10737			    blkno, wk->wk_type);
10738			WORKLIST_REMOVE(wk);
10739			if (wk->wk_type != D_JNEWBLK) {
10740				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10741				continue;
10742			}
10743			jnewblk = WK_JNEWBLK(wk);
10744			KASSERT(jnewblk->jn_state & GOINGAWAY,
10745			    ("softdep_setup_blkfree: jnewblk not canceled."));
10746#ifdef SUJ_DEBUG
10747			/*
10748			 * Assert that this block is free in the bitmap
10749			 * before we discard the jnewblk.
10750			 */
10751			cgp = (struct cg *)bp->b_data;
10752			blksfree = cg_blksfree(cgp);
10753			bno = dtogd(fs, jnewblk->jn_blkno);
10754			for (i = jnewblk->jn_oldfrags;
10755			    i < jnewblk->jn_frags; i++) {
10756				if (isset(blksfree, bno + i))
10757					continue;
10758				panic("softdep_setup_blkfree: not free");
10759			}
10760#endif
10761			/*
10762			 * Even if it's not attached we can free immediately
10763			 * as the new bitmap is correct.
10764			 */
10765			wk->wk_state |= COMPLETE | ATTACHED;
10766			free_jnewblk(jnewblk);
10767		}
10768	}
10769
10770#ifdef SUJ_DEBUG
10771	/*
10772	 * Assert that we are not freeing a block which has an outstanding
10773	 * allocation dependency.
10774	 */
10775	fs = VFSTOUFS(mp)->um_fs;
10776	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10777	end = blkno + frags;
10778	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10779		/*
10780		 * Don't match against blocks that will be freed when the
10781		 * background write is done.
10782		 */
10783		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10784		    (COMPLETE | DEPCOMPLETE))
10785			continue;
10786		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10787		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10788		if ((blkno >= jstart && blkno < jend) ||
10789		    (end > jstart && end <= jend)) {
10790			printf("state 0x%X %jd - %d %d dep %p\n",
10791			    jnewblk->jn_state, jnewblk->jn_blkno,
10792			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
10793			    jnewblk->jn_dep);
10794			panic("softdep_setup_blkfree: "
10795			    "%jd-%jd(%d) overlaps with %jd-%jd",
10796			    blkno, end, frags, jstart, jend);
10797		}
10798	}
10799#endif
10800	FREE_LOCK(ump);
10801}
10802
10803/*
10804 * Revert a block allocation when the journal record that describes it
10805 * is not yet written.
10806 */
10807static int
10808jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10809	struct jnewblk *jnewblk;
10810	struct fs *fs;
10811	struct cg *cgp;
10812	uint8_t *blksfree;
10813{
10814	ufs1_daddr_t fragno;
10815	long cgbno, bbase;
10816	int frags, blk;
10817	int i;
10818
10819	frags = 0;
10820	cgbno = dtogd(fs, jnewblk->jn_blkno);
10821	/*
10822	 * We have to test which frags need to be rolled back.  We may
10823	 * be operating on a stale copy when doing background writes.
10824	 */
10825	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10826		if (isclr(blksfree, cgbno + i))
10827			frags++;
10828	if (frags == 0)
10829		return (0);
10830	/*
10831	 * This is mostly ffs_blkfree() sans some validation and
10832	 * superblock updates.
10833	 */
10834	if (frags == fs->fs_frag) {
10835		fragno = fragstoblks(fs, cgbno);
10836		ffs_setblock(fs, blksfree, fragno);
10837		ffs_clusteracct(fs, cgp, fragno, 1);
10838		cgp->cg_cs.cs_nbfree++;
10839	} else {
10840		cgbno += jnewblk->jn_oldfrags;
10841		bbase = cgbno - fragnum(fs, cgbno);
10842		/* Decrement the old frags.  */
10843		blk = blkmap(fs, blksfree, bbase);
10844		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10845		/* Deallocate the fragment */
10846		for (i = 0; i < frags; i++)
10847			setbit(blksfree, cgbno + i);
10848		cgp->cg_cs.cs_nffree += frags;
10849		/* Add back in counts associated with the new frags */
10850		blk = blkmap(fs, blksfree, bbase);
10851		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10852		/* If a complete block has been reassembled, account for it. */
10853		fragno = fragstoblks(fs, bbase);
10854		if (ffs_isblock(fs, blksfree, fragno)) {
10855			cgp->cg_cs.cs_nffree -= fs->fs_frag;
10856			ffs_clusteracct(fs, cgp, fragno, 1);
10857			cgp->cg_cs.cs_nbfree++;
10858		}
10859	}
10860	stat_jnewblk++;
10861	jnewblk->jn_state &= ~ATTACHED;
10862	jnewblk->jn_state |= UNDONE;
10863
10864	return (frags);
10865}
10866
10867static void
10868initiate_write_bmsafemap(bmsafemap, bp)
10869	struct bmsafemap *bmsafemap;
10870	struct buf *bp;			/* The cg block. */
10871{
10872	struct jaddref *jaddref;
10873	struct jnewblk *jnewblk;
10874	uint8_t *inosused;
10875	uint8_t *blksfree;
10876	struct cg *cgp;
10877	struct fs *fs;
10878	ino_t ino;
10879
10880	if (bmsafemap->sm_state & IOSTARTED)
10881		return;
10882	bmsafemap->sm_state |= IOSTARTED;
10883	/*
10884	 * Clear any inode allocations which are pending journal writes.
10885	 */
10886	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10887		cgp = (struct cg *)bp->b_data;
10888		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10889		inosused = cg_inosused(cgp);
10890		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10891			ino = jaddref->ja_ino % fs->fs_ipg;
10892			if (isset(inosused, ino)) {
10893				if ((jaddref->ja_mode & IFMT) == IFDIR)
10894					cgp->cg_cs.cs_ndir--;
10895				cgp->cg_cs.cs_nifree++;
10896				clrbit(inosused, ino);
10897				jaddref->ja_state &= ~ATTACHED;
10898				jaddref->ja_state |= UNDONE;
10899				stat_jaddref++;
10900			} else
10901				panic("initiate_write_bmsafemap: inode %ju "
10902				    "marked free", (uintmax_t)jaddref->ja_ino);
10903		}
10904	}
10905	/*
10906	 * Clear any block allocations which are pending journal writes.
10907	 */
10908	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10909		cgp = (struct cg *)bp->b_data;
10910		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10911		blksfree = cg_blksfree(cgp);
10912		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10913			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
10914				continue;
10915			panic("initiate_write_bmsafemap: block %jd "
10916			    "marked free", jnewblk->jn_blkno);
10917		}
10918	}
10919	/*
10920	 * Move allocation lists to the written lists so they can be
10921	 * cleared once the block write is complete.
10922	 */
10923	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
10924	    inodedep, id_deps);
10925	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
10926	    newblk, nb_deps);
10927	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
10928	    wk_list);
10929}
10930
10931/*
10932 * This routine is called during the completion interrupt
10933 * service routine for a disk write (from the procedure called
10934 * by the device driver to inform the filesystem caches of
10935 * a request completion).  It should be called early in this
10936 * procedure, before the block is made available to other
10937 * processes or other routines are called.
10938 *
10939 */
10940static void
10941softdep_disk_write_complete(bp)
10942	struct buf *bp;		/* describes the completed disk write */
10943{
10944	struct worklist *wk;
10945	struct worklist *owk;
10946	struct ufsmount *ump;
10947	struct workhead reattach;
10948	struct freeblks *freeblks;
10949	struct buf *sbp;
10950
10951	/*
10952	 * If an error occurred while doing the write, then the data
10953	 * has not hit the disk and the dependencies cannot be unrolled.
10954	 */
10955	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
10956		return;
10957	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
10958		return;
10959	ump = VFSTOUFS(wk->wk_mp);
10960	LIST_INIT(&reattach);
10961	/*
10962	 * This lock must not be released anywhere in this code segment.
10963	 */
10964	sbp = NULL;
10965	owk = NULL;
10966	ACQUIRE_LOCK(ump);
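	/*
	 * Walk the dependency list attached to the buffer, dispatching
	 * each completed item by type.  owk catches a work item seen
	 * twice in a row, which would indicate a corrupted list.
	 */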
10967	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
10968		WORKLIST_REMOVE(wk);
10969		atomic_add_long(&dep_write[wk->wk_type], 1);
10970		if (wk == owk)
10971			panic("duplicate worklist: %p\n", wk);
10972		owk = wk;
10973		switch (wk->wk_type) {
10974
10975		case D_PAGEDEP:
10976			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
10977				WORKLIST_INSERT(&reattach, wk);
10978			continue;
10979
10980		case D_INODEDEP:
10981			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
10982				WORKLIST_INSERT(&reattach, wk);
10983			continue;
10984
10985		case D_BMSAFEMAP:
10986			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
10987				WORKLIST_INSERT(&reattach, wk);
10988			continue;
10989
10990		case D_MKDIR:
10991			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
10992			continue;
10993
10994		case D_ALLOCDIRECT:
10995			wk->wk_state |= COMPLETE;
10996			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
10997			continue;
10998
10999		case D_ALLOCINDIR:
11000			wk->wk_state |= COMPLETE;
11001			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
11002			continue;
11003
11004		case D_INDIRDEP:
11005			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
11006				WORKLIST_INSERT(&reattach, wk);
11007			continue;
11008
11009		case D_FREEBLKS:
11010			wk->wk_state |= COMPLETE;
11011			freeblks = WK_FREEBLKS(wk);
11012			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
11013			    LIST_EMPTY(&freeblks->fb_jblkdephd))
11014				add_to_worklist(wk, WK_NODELAY);
11015			continue;
11016
11017		case D_FREEWORK:
11018			handle_written_freework(WK_FREEWORK(wk));
11019			break;
11020
11021		case D_JSEGDEP:
11022			free_jsegdep(WK_JSEGDEP(wk));
11023			continue;
11024
11025		case D_JSEG:
11026			handle_written_jseg(WK_JSEG(wk), bp);
11027			continue;
11028
11029		case D_SBDEP:
11030			if (handle_written_sbdep(WK_SBDEP(wk), bp))
11031				WORKLIST_INSERT(&reattach, wk);
11032			continue;
11033
11034		case D_FREEDEP:
11035			free_freedep(WK_FREEDEP(wk));
11036			continue;
11037
11038		default:
11039			panic("handle_disk_write_complete: Unknown type %s",
11040			    TYPENAME(wk->wk_type));
11041			/* NOTREACHED */
11042		}
11043	}
11044	/*
11045	 * Reattach any requests that must be redone.
11046	 */
11047	while ((wk = LIST_FIRST(&reattach)) != NULL) {
11048		WORKLIST_REMOVE(wk);
11049		WORKLIST_INSERT(&bp->b_dep, wk);
11050	}
11051	FREE_LOCK(ump);
11052	if (sbp)
11053		brelse(sbp);
11054}
11055
11056/*
11057 * Called from within softdep_disk_write_complete above. Note that
11058 * this routine is always called from interrupt level with further
11059 * splbio interrupts blocked.
11060 */
11061static void
11062handle_allocdirect_partdone(adp, wkhd)
11063	struct allocdirect *adp;	/* the completed allocdirect */
11064	struct workhead *wkhd;		/* Work to do when inode is written. */
11065{
11066	struct allocdirectlst *listhead;
11067	struct allocdirect *listadp;
11068	struct inodedep *inodedep;
11069	long bsize;
11070
11071	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11072		return;
11073	/*
11074	 * The on-disk inode cannot claim to be any larger than the last
11075	 * fragment that has been written. Otherwise, the on-disk inode
11076	 * might have fragments that were not the last block in the file
11077	 * which would corrupt the filesystem. Thus, we cannot free any
11078	 * allocdirects after one whose ad_oldblkno claims a fragment as
11079	 * these blocks must be rolled back to zero before writing the inode.
11080	 * We check the currently active set of allocdirects in id_inoupdt
11081	 * or id_extupdt as appropriate.
11082	 */
11083	inodedep = adp->ad_inodedep;
11084	bsize = inodedep->id_fs->fs_bsize;
11085	if (adp->ad_state & EXTDATA)
11086		listhead = &inodedep->id_extupdt;
11087	else
11088		listhead = &inodedep->id_inoupdt;
11089	TAILQ_FOREACH(listadp, listhead, ad_next) {
11090		/* found our block */
11091		if (listadp == adp)
11092			break;
11093		/* continue if ad_oldlbn is not a fragment */
11094		if (listadp->ad_oldsize == 0 ||
11095		    listadp->ad_oldsize == bsize)
11096			continue;
11097		/* hit a fragment */
11098		return;
11099	}
11100	/*
11101	 * If we have reached the end of the current list without
11102	 * finding the just finished dependency, then it must be
11103	 * on the future dependency list. Future dependencies cannot
11104	 * be freed until they are moved to the current list.
11105	 */
11106	if (listadp == NULL) {
11107#ifdef DEBUG
11108		if (adp->ad_state & EXTDATA)
11109			listhead = &inodedep->id_newextupdt;
11110		else
11111			listhead = &inodedep->id_newinoupdt;
11112		TAILQ_FOREACH(listadp, listhead, ad_next)
11113			/* found our block */
11114			if (listadp == adp)
11115				break;
11116		if (listadp == NULL)
11117			panic("handle_allocdirect_partdone: lost dep");
11118#endif /* DEBUG */
11119		return;
11120	}
11121	/*
11122	 * If we have found the just finished dependency, then queue
11123	 * it along with anything that follows it that is complete.
11124	 * Since the pointer has not yet been written in the inode
11125	 * as the dependency prevents it, place the allocdirect on the
11126	 * bufwait list where it will be freed once the pointer is
11127	 * valid.
11128	 */
11129	if (wkhd == NULL)
11130		wkhd = &inodedep->id_bufwait;
11131	for (; adp; adp = listadp) {
11132		listadp = TAILQ_NEXT(adp, ad_next);
11133		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11134			return;
11135		TAILQ_REMOVE(listhead, adp, ad_next);
11136		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
11137	}
11138}
11139
11140/*
11141 * Called from within softdep_disk_write_complete above.  This routine
11142 * completes successfully written allocindirs.
11143 */
11144static void
11145handle_allocindir_partdone(aip)
11146	struct allocindir *aip;		/* the completed allocindir */
11147{
11148	struct indirdep *indirdep;
11149
11150	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
11151		return;
11152	indirdep = aip->ai_indirdep;
11153	LIST_REMOVE(aip, ai_next);
11154	/*
11155	 * Don't set a pointer while the buffer is undergoing IO or while
11156	 * we have active truncations.
11157	 */
11158	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
11159		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
11160		return;
11161	}
11162	if (indirdep->ir_state & UFS1FMT)
11163		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11164		    aip->ai_newblkno;
11165	else
11166		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11167		    aip->ai_newblkno;
11168	/*
11169	 * Await the pointer write before freeing the allocindir.
11170	 */
11171	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
11172}
11173
11174/*
11175 * Release segments held on a jwork list.
11176 */
11177static void
11178handle_jwork(wkhd)
11179	struct workhead *wkhd;
11180{
11181	struct worklist *wk;
11182
11183	while ((wk = LIST_FIRST(wkhd)) != NULL) {
11184		WORKLIST_REMOVE(wk);
11185		switch (wk->wk_type) {
11186		case D_JSEGDEP:
11187			free_jsegdep(WK_JSEGDEP(wk));
11188			continue;
11189		case D_FREEDEP:
11190			free_freedep(WK_FREEDEP(wk));
11191			continue;
11192		case D_FREEFRAG:
11193			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
11194			WORKITEM_FREE(wk, D_FREEFRAG);
11195			continue;
11196		case D_FREEWORK:
11197			handle_written_freework(WK_FREEWORK(wk));
11198			continue;
11199		default:
11200			panic("handle_jwork: Unknown type %s\n",
11201			    TYPENAME(wk->wk_type));
11202		}
11203	}
11204}
11205
11206/*
11207 * Handle the bufwait list on an inode when it is safe to release items
11208 * held there.  This normally happens after an inode block is written but
11209 * may be delayed and handled later if there are pending journal items that
11210 * are not yet safe to be released.
11211 */
11212static struct freefile *
11213handle_bufwait(inodedep, refhd)
11214	struct inodedep *inodedep;
11215	struct workhead *refhd;
11216{
11217	struct jaddref *jaddref;
11218	struct freefile *freefile;
11219	struct worklist *wk;
11220
11221	freefile = NULL;
11222	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
11223		WORKLIST_REMOVE(wk);
11224		switch (wk->wk_type) {
11225		case D_FREEFILE:
11226			/*
11227			 * We defer adding freefile to the worklist
11228			 * until all other additions have been made to
11229			 * ensure that it will be done after all the
11230			 * old blocks have been freed.
11231			 */
11232			if (freefile != NULL)
11233				panic("handle_bufwait: freefile");
11234			freefile = WK_FREEFILE(wk);
11235			continue;
11236
11237		case D_MKDIR:
11238			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
11239			continue;
11240
11241		case D_DIRADD:
11242			diradd_inode_written(WK_DIRADD(wk), inodedep);
11243			continue;
11244
11245		case D_FREEFRAG:
11246			wk->wk_state |= COMPLETE;
11247			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11248				add_to_worklist(wk, 0);
11249			continue;
11250
11251		case D_DIRREM:
11252			wk->wk_state |= COMPLETE;
11253			add_to_worklist(wk, 0);
11254			continue;
11255
11256		case D_ALLOCDIRECT:
11257		case D_ALLOCINDIR:
11258			free_newblk(WK_NEWBLK(wk));
11259			continue;
11260
11261		case D_JNEWBLK:
11262			wk->wk_state |= COMPLETE;
11263			free_jnewblk(WK_JNEWBLK(wk));
11264			continue;
11265
11266		/*
11267		 * Save freed journal segments and add references on
11268		 * the supplied list which will delay their release
11269		 * until the cg bitmap is cleared on disk.
11270		 */
11271		case D_JSEGDEP:
11272			if (refhd == NULL)
11273				free_jsegdep(WK_JSEGDEP(wk));
11274			else
11275				WORKLIST_INSERT(refhd, wk);
11276			continue;
11277
11278		case D_JADDREF:
11279			jaddref = WK_JADDREF(wk);
11280			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11281			    if_deps);
11282			/*
11283			 * Transfer any jaddrefs to the list to be freed with
11284			 * the bitmap if we're handling a removed file.
11285			 */
11286			if (refhd == NULL) {
11287				wk->wk_state |= COMPLETE;
11288				free_jaddref(jaddref);
11289			} else
11290				WORKLIST_INSERT(refhd, wk);
11291			continue;
11292
11293		default:
11294			panic("handle_bufwait: Unknown type %p(%s)",
11295			    wk, TYPENAME(wk->wk_type));
11296			/* NOTREACHED */
11297		}
11298	}
11299	return (freefile);
11300}
11301/*
11302 * Called from within softdep_disk_write_complete above to restore
11303 * in-memory inode block contents to their most up-to-date state. Note
11304 * that this routine is always called from interrupt level with further
11305 * splbio interrupts blocked.
11306 */
11307static int
11308handle_written_inodeblock(inodedep, bp)
11309	struct inodedep *inodedep;
11310	struct buf *bp;		/* buffer containing the inode block */
11311{
11312	struct freefile *freefile;
11313	struct allocdirect *adp, *nextadp;
11314	struct ufs1_dinode *dp1 = NULL;
11315	struct ufs2_dinode *dp2 = NULL;
11316	struct workhead wkhd;
11317	int hadchanges, fstype;
11318	ino_t freelink;
11319
11320	LIST_INIT(&wkhd);
11321	hadchanges = 0;
11322	freefile = NULL;
11323	if ((inodedep->id_state & IOSTARTED) == 0)
11324		panic("handle_written_inodeblock: not started");
11325	inodedep->id_state &= ~IOSTARTED;
11326	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11327		fstype = UFS1;
11328		dp1 = (struct ufs1_dinode *)bp->b_data +
11329		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11330		freelink = dp1->di_freelink;
11331	} else {
11332		fstype = UFS2;
11333		dp2 = (struct ufs2_dinode *)bp->b_data +
11334		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11335		freelink = dp2->di_freelink;
11336	}
11337	/*
11338	 * Leave this inodeblock dirty until it's in the list.
11339	 */
11340	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
11341		struct inodedep *inon;
11342
11343		inon = TAILQ_NEXT(inodedep, id_unlinked);
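		/*
		 * If the freelink just written matches the next inodedep
		 * on the unlinked list (or both indicate end of list),
		 * the on-disk next pointer is now valid and the
		 * UNLINKNEXT/UNLINKPREV state can be recorded.
		 */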
11344		if ((inon == NULL && freelink == 0) ||
11345		    (inon && inon->id_ino == freelink)) {
11346			if (inon)
11347				inon->id_state |= UNLINKPREV;
11348			inodedep->id_state |= UNLINKNEXT;
11349		}
11350		hadchanges = 1;
11351	}
11352	/*
11353	 * If we had to rollback the inode allocation because of
11354	 * bitmaps being incomplete, then simply restore it.
11355	 * Keep the block dirty so that it will not be reclaimed until
11356	 * all associated dependencies have been cleared and the
11357	 * corresponding updates written to disk.
11358	 */
11359	if (inodedep->id_savedino1 != NULL) {
11360		hadchanges = 1;
11361		if (fstype == UFS1)
11362			*dp1 = *inodedep->id_savedino1;
11363		else
11364			*dp2 = *inodedep->id_savedino2;
11365		free(inodedep->id_savedino1, M_SAVEDINO);
11366		inodedep->id_savedino1 = NULL;
11367		if ((bp->b_flags & B_DELWRI) == 0)
11368			stat_inode_bitmap++;
11369		bdirty(bp);
11370		/*
11371		 * If the inode is clear here and GOINGAWAY it will never
11372		 * be written.  Process the bufwait and clear any pending
11373		 * work which may include the freefile.
11374		 */
11375		if (inodedep->id_state & GOINGAWAY)
11376			goto bufwait;
11377		return (1);
11378	}
11379	inodedep->id_state |= COMPLETE;
11380	/*
11381	 * Roll forward anything that had to be rolled back before
11382	 * the inode could be updated.
11383	 */
11384	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11385		nextadp = TAILQ_NEXT(adp, ad_next);
11386		if (adp->ad_state & ATTACHED)
11387			panic("handle_written_inodeblock: new entry");
11388		if (fstype == UFS1) {
11389			if (adp->ad_offset < NDADDR) {
11390				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11391					panic("%s %s #%jd mismatch %d != %jd",
11392					    "handle_written_inodeblock:",
11393					    "direct pointer",
11394					    (intmax_t)adp->ad_offset,
11395					    dp1->di_db[adp->ad_offset],
11396					    (intmax_t)adp->ad_oldblkno);
11397				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11398			} else {
11399				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
11400					panic("%s: %s #%jd allocated as %d",
11401					    "handle_written_inodeblock",
11402					    "indirect pointer",
11403					    (intmax_t)adp->ad_offset - NDADDR,
11404					    dp1->di_ib[adp->ad_offset - NDADDR]);
11405				dp1->di_ib[adp->ad_offset - NDADDR] =
11406				    adp->ad_newblkno;
11407			}
11408		} else {
11409			if (adp->ad_offset < NDADDR) {
11410				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11411					panic("%s: %s #%jd %s %jd != %jd",
11412					    "handle_written_inodeblock",
11413					    "direct pointer",
11414					    (intmax_t)adp->ad_offset, "mismatch",
11415					    (intmax_t)dp2->di_db[adp->ad_offset],
11416					    (intmax_t)adp->ad_oldblkno);
11417				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11418			} else {
11419				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
11420					panic("%s: %s #%jd allocated as %jd",
11421					    "handle_written_inodeblock",
11422					    "indirect pointer",
11423					    (intmax_t)adp->ad_offset - NDADDR,
11424					    (intmax_t)
11425					    dp2->di_ib[adp->ad_offset - NDADDR]);
11426				dp2->di_ib[adp->ad_offset - NDADDR] =
11427				    adp->ad_newblkno;
11428			}
11429		}
11430		adp->ad_state &= ~UNDONE;
11431		adp->ad_state |= ATTACHED;
11432		hadchanges = 1;
11433	}
11434	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11435		nextadp = TAILQ_NEXT(adp, ad_next);
11436		if (adp->ad_state & ATTACHED)
11437			panic("handle_written_inodeblock: new entry");
11438		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11439			panic("%s: direct pointers #%jd %s %jd != %jd",
11440			    "handle_written_inodeblock",
11441			    (intmax_t)adp->ad_offset, "mismatch",
11442			    (intmax_t)dp2->di_extb[adp->ad_offset],
11443			    (intmax_t)adp->ad_oldblkno);
11444		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11445		adp->ad_state &= ~UNDONE;
11446		adp->ad_state |= ATTACHED;
11447		hadchanges = 1;
11448	}
11449	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11450		stat_direct_blk_ptrs++;
11451	/*
11452	 * Reset the file size to its most up-to-date value.
11453	 */
11454	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11455		panic("handle_written_inodeblock: bad size");
11456	if (inodedep->id_savednlink > LINK_MAX)
11457		panic("handle_written_inodeblock: Invalid link count "
11458		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
11459	if (fstype == UFS1) {
11460		if (dp1->di_nlink != inodedep->id_savednlink) {
11461			dp1->di_nlink = inodedep->id_savednlink;
11462			hadchanges = 1;
11463		}
11464		if (dp1->di_size != inodedep->id_savedsize) {
11465			dp1->di_size = inodedep->id_savedsize;
11466			hadchanges = 1;
11467		}
11468	} else {
11469		if (dp2->di_nlink != inodedep->id_savednlink) {
11470			dp2->di_nlink = inodedep->id_savednlink;
11471			hadchanges = 1;
11472		}
11473		if (dp2->di_size != inodedep->id_savedsize) {
11474			dp2->di_size = inodedep->id_savedsize;
11475			hadchanges = 1;
11476		}
11477		if (dp2->di_extsize != inodedep->id_savedextsize) {
11478			dp2->di_extsize = inodedep->id_savedextsize;
11479			hadchanges = 1;
11480		}
11481	}
11482	inodedep->id_savedsize = -1;
11483	inodedep->id_savedextsize = -1;
11484	inodedep->id_savednlink = -1;
11485	/*
11486	 * If there were any rollbacks in the inode block, then it must be
11487	 * marked dirty so that it will eventually get written back in
11488	 * its correct form.
11489	 */
11490	if (hadchanges)
11491		bdirty(bp);
11492bufwait:
11493	/*
11494	 * Process any allocdirects that completed during the update.
11495	 */
11496	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11497		handle_allocdirect_partdone(adp, &wkhd);
11498	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11499		handle_allocdirect_partdone(adp, &wkhd);
11500	/*
11501	 * Process deallocations that were held pending until the
11502	 * inode had been written to disk. Freeing of the inode
11503	 * is delayed until after all blocks have been freed to
11504	 * avoid creation of new <vfsid, inum, lbn> triples
11505	 * before the old ones have been deleted.  Completely
11506	 * unlinked inodes are not processed until the unlinked
11507	 * inode list is written or the last reference is removed.
11508	 */
11509	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11510		freefile = handle_bufwait(inodedep, NULL);
11511		if (freefile && !LIST_EMPTY(&wkhd)) {
11512			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11513			freefile = NULL;
11514		}
11515	}
11516	/*
11517	 * Move rolled forward dependency completions to the bufwait list
11518	 * now that those that were already written have been processed.
11519	 */
11520	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11521		panic("handle_written_inodeblock: bufwait but no changes");
11522	jwork_move(&inodedep->id_bufwait, &wkhd);
11523
11524	if (freefile != NULL) {
11525		/*
11526		 * If the inode is goingaway it was never written.  Fake up
11527		 * the state here so free_inodedep() can succeed.
11528		 */
11529		if (inodedep->id_state & GOINGAWAY)
11530			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11531		if (free_inodedep(inodedep) == 0)
11532			panic("handle_written_inodeblock: live inodedep %p",
11533			    inodedep);
11534		add_to_worklist(&freefile->fx_list, 0);
11535		return (0);
11536	}
11537
11538	/*
11539	 * If no outstanding dependencies, free it.
11540	 */
11541	if (free_inodedep(inodedep) ||
11542	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11543	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11544	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11545	     LIST_FIRST(&inodedep->id_bufwait) == 0))
11546		return (0);
11547	return (hadchanges);
11548}
11549
11550static int
11551handle_written_indirdep(indirdep, bp, bpp)
11552	struct indirdep *indirdep;
11553	struct buf *bp;
11554	struct buf **bpp;
11555{
11556	struct allocindir *aip;
11557	struct buf *sbp;
11558	int chgs;
11559
11560	if (indirdep->ir_state & GOINGAWAY)
11561		panic("handle_written_indirdep: indirdep gone");
11562	if ((indirdep->ir_state & IOSTARTED) == 0)
11563		panic("handle_written_indirdep: IO not started");
11564	chgs = 0;
11565	/*
11566	 * If there were rollbacks, revert them here.
11567	 */
11568	if (indirdep->ir_saveddata) {
11569		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11570		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11571			free(indirdep->ir_saveddata, M_INDIRDEP);
11572			indirdep->ir_saveddata = NULL;
11573		}
11574		chgs = 1;
11575	}
11576	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11577	indirdep->ir_state |= ATTACHED;
11578	/*
11579	 * Move allocindirs with written pointers to the completehd if
11580	 * the indirdep's pointer is not yet written.  Otherwise
11581	 * free them here.
11582	 */
11583	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != 0) {
11584		LIST_REMOVE(aip, ai_next);
11585		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11586			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11587			    ai_next);
11588			newblk_freefrag(&aip->ai_block);
11589			continue;
11590		}
11591		free_newblk(&aip->ai_block);
11592	}
11593	/*
11594	 * Move allocindirs that have finished dependency processing from
11595	 * the done list to the write list after updating the pointers.
11596	 */
11597	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11598		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != 0) {
11599			handle_allocindir_partdone(aip);
11600			if (aip == LIST_FIRST(&indirdep->ir_donehd))
11601				panic("disk_write_complete: not gone");
11602			chgs = 1;
11603		}
11604	}
11605	/*
11606	 * Preserve the indirdep if there were any changes or if it is not
11607	 * yet valid on disk.
11608	 */
11609	if (chgs) {
11610		stat_indir_blk_ptrs++;
11611		bdirty(bp);
11612		return (1);
11613	}
11614	/*
11615	 * If there were no changes we can discard the savedbp and detach
11616	 * ourselves from the buf.  We are only carrying completed pointers
11617	 * in this case.
11618	 */
11619	sbp = indirdep->ir_savebp;
11620	sbp->b_flags |= B_INVAL | B_NOCACHE;
11621	indirdep->ir_savebp = NULL;
11622	indirdep->ir_bp = NULL;
11623	if (*bpp != NULL)
11624		panic("handle_written_indirdep: bp already exists.");
11625	*bpp = sbp;
11626	/*
11627	 * The indirdep may not be freed until its parent points at it.
11628	 */
11629	if (indirdep->ir_state & DEPCOMPLETE)
11630		free_indirdep(indirdep);
11631
11632	return (0);
11633}
11634
11635/*
11636 * Process a diradd entry after its dependent inode has been written.
11637 * This routine must be called with splbio interrupts blocked.
11638 */
11639static void
11640diradd_inode_written(dap, inodedep)
11641	struct diradd *dap;
11642	struct inodedep *inodedep;
11643{
11644
11645	dap->da_state |= COMPLETE;
11646	complete_diradd(dap);
11647	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11648}
11649
11650/*
11651 * Returns true if the bmsafemap will have rollbacks when written.  Must only
11652 * be called with the per-filesystem lock and the buf lock on the cg held.
11653 */
11654static int
11655bmsafemap_backgroundwrite(bmsafemap, bp)
11656	struct bmsafemap *bmsafemap;
11657	struct buf *bp;
11658{
11659	int dirty;
11660
11661	LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
11662	dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11663	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
11664	/*
11665	 * If we're initiating a background write we need to process the
11666	 * rollbacks as they exist now, not as they exist when IO starts.
11667	 * No other consumers will look at the contents of the shadowed
11668	 * buf so this is safe to do here.
11669	 */
11670	if (bp->b_xflags & BX_BKGRDMARKER)
11671		initiate_write_bmsafemap(bmsafemap, bp);
11672
11673	return (dirty);
11674}
11675
11676/*
11677 * Re-apply an allocation when a cg write is complete.
11678 */
11679static int
11680jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11681	struct jnewblk *jnewblk;
11682	struct fs *fs;
11683	struct cg *cgp;
11684	uint8_t *blksfree;
11685{
11686	ufs1_daddr_t fragno;
11687	ufs2_daddr_t blkno;
11688	long cgbno, bbase;
11689	int frags, blk;
11690	int i;
11691
11692	frags = 0;
11693	cgbno = dtogd(fs, jnewblk->jn_blkno);
11694	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11695		if (isclr(blksfree, cgbno + i))
11696			panic("jnewblk_rollforward: re-allocated fragment");
11697		frags++;
11698	}
11699	if (frags == fs->fs_frag) {
11700		blkno = fragstoblks(fs, cgbno);
11701		ffs_clrblock(fs, blksfree, (long)blkno);
11702		ffs_clusteracct(fs, cgp, blkno, -1);
11703		cgp->cg_cs.cs_nbfree--;
11704	} else {
11705		bbase = cgbno - fragnum(fs, cgbno);
11706		cgbno += jnewblk->jn_oldfrags;
11707		/* If a complete block had been reassembled, account for it. */
11708		fragno = fragstoblks(fs, bbase);
11709		if (ffs_isblock(fs, blksfree, fragno)) {
11710			cgp->cg_cs.cs_nffree += fs->fs_frag;
11711			ffs_clusteracct(fs, cgp, fragno, -1);
11712			cgp->cg_cs.cs_nbfree--;
11713		}
11714		/* Decrement the old frags.  */
11715		blk = blkmap(fs, blksfree, bbase);
11716		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11717		/* Allocate the fragment */
11718		for (i = 0; i < frags; i++)
11719			clrbit(blksfree, cgbno + i);
11720		cgp->cg_cs.cs_nffree -= frags;
11721		/* Add back in counts associated with the new frags */
11722		blk = blkmap(fs, blksfree, bbase);
11723		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11724	}
11725	return (frags);
11726}
11727
11728/*
11729 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11730 * changes if it's not a background write.  Set all written dependencies
11731 * to DEPCOMPLETE and free the structure if possible.
11732 */
11733static int
11734handle_written_bmsafemap(bmsafemap, bp)
11735	struct bmsafemap *bmsafemap;
11736	struct buf *bp;
11737{
11738	struct newblk *newblk;
11739	struct inodedep *inodedep;
11740	struct jaddref *jaddref, *jatmp;
11741	struct jnewblk *jnewblk, *jntmp;
11742	struct ufsmount *ump;
11743	uint8_t *inosused;
11744	uint8_t *blksfree;
11745	struct cg *cgp;
11746	struct fs *fs;
11747	ino_t ino;
11748	int foreground;
11749	int chgs;
11750
11751	if ((bmsafemap->sm_state & IOSTARTED) == 0)
11752		panic("initiate_write_bmsafemap: Not started\n");
11753	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11754	chgs = 0;
11755	bmsafemap->sm_state &= ~IOSTARTED;
11756	foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
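	/*
	 * A background write (BX_BKGRDMARKER) operates on a shadow copy
	 * of the cg buffer, so the bitmap roll-forwards below are applied
	 * only during a foreground write to the real copy.
	 */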
11757	/*
11758	 * Release journal work that was waiting on the write.
11759	 */
11760	handle_jwork(&bmsafemap->sm_freewr);
11761
11762	/*
11763	 * Restore unwritten inode allocation pending jaddref writes.
11764	 */
11765	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11766		cgp = (struct cg *)bp->b_data;
11767		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11768		inosused = cg_inosused(cgp);
11769		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11770		    ja_bmdeps, jatmp) {
11771			if ((jaddref->ja_state & UNDONE) == 0)
11772				continue;
11773			ino = jaddref->ja_ino % fs->fs_ipg;
11774			if (isset(inosused, ino))
11775				panic("handle_written_bmsafemap: "
11776				    "re-allocated inode");
11777			/* Do the roll-forward only if it's a real copy. */
11778			if (foreground) {
11779				if ((jaddref->ja_mode & IFMT) == IFDIR)
11780					cgp->cg_cs.cs_ndir++;
11781				cgp->cg_cs.cs_nifree--;
11782				setbit(inosused, ino);
11783				chgs = 1;
11784			}
11785			jaddref->ja_state &= ~UNDONE;
11786			jaddref->ja_state |= ATTACHED;
11787			free_jaddref(jaddref);
11788		}
11789	}
11790	/*
11791	 * Restore any block allocations which are pending journal writes.
11792	 */
11793	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11794		cgp = (struct cg *)bp->b_data;
11795		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11796		blksfree = cg_blksfree(cgp);
11797		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11798		    jntmp) {
11799			if ((jnewblk->jn_state & UNDONE) == 0)
11800				continue;
11801			/* Do the roll-forward only if it's a real copy. */
11802			if (foreground &&
11803			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11804				chgs = 1;
11805			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11806			jnewblk->jn_state |= ATTACHED;
11807			free_jnewblk(jnewblk);
11808		}
11809	}
11810	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
11811		newblk->nb_state |= DEPCOMPLETE;
11812		newblk->nb_state &= ~ONDEPLIST;
11813		newblk->nb_bmsafemap = NULL;
11814		LIST_REMOVE(newblk, nb_deps);
11815		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
11816			handle_allocdirect_partdone(
11817			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
11818		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
11819			handle_allocindir_partdone(
11820			    WK_ALLOCINDIR(&newblk->nb_list));
11821		else if (newblk->nb_list.wk_type != D_NEWBLK)
11822			panic("handle_written_bmsafemap: Unexpected type: %s",
11823			    TYPENAME(newblk->nb_list.wk_type));
11824	}
11825	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
11826		inodedep->id_state |= DEPCOMPLETE;
11827		inodedep->id_state &= ~ONDEPLIST;
11828		LIST_REMOVE(inodedep, id_deps);
11829		inodedep->id_bmsafemap = NULL;
11830	}
11831	LIST_REMOVE(bmsafemap, sm_next);
11832	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
11833	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
11834	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
11835	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
11836	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
11837		LIST_REMOVE(bmsafemap, sm_hash);
11838		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
11839		return (0);
11840	}
11841	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
11842	if (foreground)
11843		bdirty(bp);
11844	return (1);
11845}
11846
11847/*
11848 * Try to free a mkdir dependency.
11849 */
11850static void
11851complete_mkdir(mkdir)
11852	struct mkdir *mkdir;
11853{
11854	struct diradd *dap;
11855
11856	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
11857		return;
11858	LIST_REMOVE(mkdir, md_mkdirs);
11859	dap = mkdir->md_diradd;
11860	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
11861	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
11862		dap->da_state |= DEPCOMPLETE;
11863		complete_diradd(dap);
11864	}
11865	WORKITEM_FREE(mkdir, D_MKDIR);
11866}
11867
11868/*
11869 * Handle the completion of a mkdir dependency.
11870 */
11871static void
11872handle_written_mkdir(mkdir, type)
11873	struct mkdir *mkdir;
11874	int type;
11875{
11876
11877	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
11878		panic("handle_written_mkdir: bad type");
11879	mkdir->md_state |= COMPLETE;
11880	complete_mkdir(mkdir);
11881}
11882
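/*
 * Attempt to free a pagedep structure.  It may be released only when it
 * no longer tracks a new directory block, directory add or remove work,
 * pending entries, or journal move references.  Returns 1 if the pagedep
 * was freed, 0 if it must be retained.
 */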
11883static int
11884free_pagedep(pagedep)
11885	struct pagedep *pagedep;
11886{
11887	int i;
11888
11889	if (pagedep->pd_state & NEWBLOCK)
11890		return (0);
11891	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
11892		return (0);
11893	for (i = 0; i < DAHASHSZ; i++)
11894		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
11895			return (0);
11896	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
11897		return (0);
11898	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
11899		return (0);
11900	if (pagedep->pd_state & ONWORKLIST)
11901		WORKLIST_REMOVE(&pagedep->pd_list);
11902	LIST_REMOVE(pagedep, pd_hash);
11903	WORKITEM_FREE(pagedep, D_PAGEDEP);
11904
11905	return (1);
11906}
11907
11908/*
11909 * Called from within softdep_disk_write_complete above.
11910 * A write operation was just completed. Removed inodes can
11911 * now be freed and associated block pointers may be committed.
11912 * Note that this routine is always called from interrupt level
11913 * with further splbio interrupts blocked.
11914 */
11915static int
11916handle_written_filepage(pagedep, bp)
11917	struct pagedep *pagedep;
11918	struct buf *bp;		/* buffer containing the written page */
11919{
11920	struct dirrem *dirrem;
11921	struct diradd *dap, *nextdap;
11922	struct direct *ep;
11923	int i, chgs;
11924
11925	if ((pagedep->pd_state & IOSTARTED) == 0)
11926		panic("handle_written_filepage: not started");
11927	pagedep->pd_state &= ~IOSTARTED;
11928	/*
11929	 * Process any directory removals that have been committed.
11930	 */
11931	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
11932		LIST_REMOVE(dirrem, dm_next);
11933		dirrem->dm_state |= COMPLETE;
11934		dirrem->dm_dirinum = pagedep->pd_ino;
11935		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
11936		    ("handle_written_filepage: Journal entries not written."));
11937		add_to_worklist(&dirrem->dm_list, 0);
11938	}
11939	/*
11940	 * Free any directory additions that have been committed.
11941	 * If it is a newly allocated block, we have to wait until
11942	 * the on-disk directory inode claims the new block.
11943	 */
11944	if ((pagedep->pd_state & NEWBLOCK) == 0)
11945		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
11946			free_diradd(dap, NULL);
11947	/*
11948	 * Uncommitted directory entries must be restored.
11949	 */
11950	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
11951		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
11952		     dap = nextdap) {
11953			nextdap = LIST_NEXT(dap, da_pdlist);
11954			if (dap->da_state & ATTACHED)
11955				panic("handle_written_filepage: attached");
11956			ep = (struct direct *)
11957			    ((char *)bp->b_data + dap->da_offset);
11958			ep->d_ino = dap->da_newinum;
11959			dap->da_state &= ~UNDONE;
11960			dap->da_state |= ATTACHED;
11961			chgs = 1;
11962			/*
11963			 * If the inode referenced by the directory has
11964			 * been written out, then the dependency can be
11965			 * moved to the pending list.
11966			 */
11967			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
11968				LIST_REMOVE(dap, da_pdlist);
11969				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
11970				    da_pdlist);
11971			}
11972		}
11973	}
11974	/*
11975	 * If there were any rollbacks in the directory, then it must be
11976	 * marked dirty so that it will eventually get written back in
11977	 * its correct form.
11978	 */
11979	if (chgs) {
11980		if ((bp->b_flags & B_DELWRI) == 0)
11981			stat_dir_entry++;
11982		bdirty(bp);
11983		return (1);
11984	}
11985	/*
11986	 * If we are not waiting for a new directory block to be
11987	 * claimed by its inode, then the pagedep will be freed.
11988	 * Otherwise it will remain to track any new entries on
11989	 * the page in case they are fsync'ed.
11990	 */
11991	free_pagedep(pagedep);
11992	return (0);
11993}
11994
11995/*
11996 * Writing back in-core inode structures.
11997 *
11998 * The filesystem only accesses an inode's contents when it occupies an
11999 * "in-core" inode structure.  These "in-core" structures are separate from
12000 * the page frames used to cache inode blocks.  Only the latter are
12001 * transferred to/from the disk.  So, when the updated contents of the
12002 * "in-core" inode structure are copied to the corresponding in-memory inode
12003 * block, the dependencies are also transferred.  The following procedure is
12004 * called when copying a dirty "in-core" inode to a cached inode block.
12005 */
12006
12007/*
12008 * Called when an inode is loaded from disk. If the effective link count
12009 * differed from the actual link count when it was last flushed, then we
12010 * need to ensure that the correct effective link count is put back.
12011 */
12012void
12013softdep_load_inodeblock(ip)
12014	struct inode *ip;	/* the "in_core" copy of the inode */
12015{
12016	struct inodedep *inodedep;
12017
12018	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
12019	    ("softdep_load_inodeblock called on non-softdep filesystem"));
12020	/*
12021	 * Check for alternate nlink count.
12022	 */
12023	ip->i_effnlink = ip->i_nlink;
12024	ACQUIRE_LOCK(ip->i_ump);
12025	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
12026	    &inodedep) == 0) {
12027		FREE_LOCK(ip->i_ump);
12028		return;
12029	}
12030	ip->i_effnlink -= inodedep->id_nlinkdelta;
12031	FREE_LOCK(ip->i_ump);
12032}
12033
12034/*
12035 * This routine is called just before the "in-core" inode
12036 * information is to be copied to the in-memory inode block.
12037 * Recall that an inode block contains several inodes. If
12038 * the force flag is set, then the dependencies will be
12039 * cleared so that the update can always be made. Note that
12040 * the buffer is locked when this routine is called, so we
12041 * will never be in the middle of writing the inode block
12042 * to disk.
12043 */
12044void
12045softdep_update_inodeblock(ip, bp, waitfor)
12046	struct inode *ip;	/* the "in_core" copy of the inode */
12047	struct buf *bp;		/* the buffer containing the inode block */
12048	int waitfor;		/* nonzero => update must be allowed */
12049{
12050	struct inodedep *inodedep;
12051	struct inoref *inoref;
12052	struct ufsmount *ump;
12053	struct worklist *wk;
12054	struct mount *mp;
12055	struct buf *ibp;
12056	struct fs *fs;
12057	int error;
12058
12059	ump = ip->i_ump;
12060	mp = UFSTOVFS(ump);
12061	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
12062	    ("softdep_update_inodeblock called on non-softdep filesystem"));
12063	fs = ip->i_fs;
12064	/*
12065	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
12066	 * does not have access to the in-core ip so must write directly into
12067	 * the inode block buffer when setting freelink.
12068	 */
12069	if (fs->fs_magic == FS_UFS1_MAGIC)
12070		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
12071		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12072	else
12073		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
12074		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12075	/*
12076	 * If the effective link count is not equal to the actual link
12077	 * count, then we must track the difference in an inodedep while
12078	 * the inode is (potentially) tossed out of the cache. Otherwise,
12079	 * if there is no existing inodedep, then there are no dependencies
12080	 * to track.
12081	 */
12082	ACQUIRE_LOCK(ump);
12083again:
12084	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12085		FREE_LOCK(ump);
12086		if (ip->i_effnlink != ip->i_nlink)
12087			panic("softdep_update_inodeblock: bad link count");
12088		return;
12089	}
12090	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
12091		panic("softdep_update_inodeblock: bad delta");
12092	/*
12093	 * If we're flushing all dependencies we must also move any waiting
12094	 * for journal writes onto the bufwait list prior to I/O.
12095	 */
12096	if (waitfor) {
12097		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12098			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12099			    == DEPCOMPLETE) {
12100				jwait(&inoref->if_list, MNT_WAIT);
12101				goto again;
12102			}
12103		}
12104	}
12105	/*
12106	 * Changes have been initiated. Anything depending on these
12107	 * changes cannot occur until this inode has been written.
12108	 */
12109	inodedep->id_state &= ~COMPLETE;
12110	if ((inodedep->id_state & ONWORKLIST) == 0)
12111		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
12112	/*
12113	 * Any new dependencies associated with the incore inode must
12114	 * now be moved to the list associated with the buffer holding
12115	 * the in-memory copy of the inode. Once merged process any
12116	 * allocdirects that are completed by the merger.
12117	 */
12118	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
12119	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
12120		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
12121		    NULL);
12122	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
12123	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
12124		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
12125		    NULL);
12126	/*
12127	 * Now that the inode has been pushed into the buffer, the
12128	 * operations dependent on the inode being written to disk
12129	 * can be moved to the id_bufwait so that they will be
12130	 * processed when the buffer I/O completes.
12131	 */
12132	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
12133		WORKLIST_REMOVE(wk);
12134		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
12135	}
12136	/*
12137	 * Newly allocated inodes cannot be written until the bitmap
12138	 * that allocates them has been written (indicated by
12139	 * DEPCOMPLETE being set in id_state). If we are doing a
12140	 * forced sync (e.g., an fsync on a file), we force the bitmap
12141	 * to be written so that the update can be done.
12142	 */
12143	if (waitfor == 0) {
12144		FREE_LOCK(ump);
12145		return;
12146	}
12147retry:
12148	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
12149		FREE_LOCK(ump);
12150		return;
12151	}
12152	ibp = inodedep->id_bmsafemap->sm_buf;
12153	ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
12154	if (ibp == NULL) {
12155		/*
12156		 * If ibp came back as NULL, the dependency could have been
12157		 * freed while we slept.  Look it up again, and check to see
12158		 * that it has completed.
12159		 */
12160		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
12161			goto retry;
12162		FREE_LOCK(ump);
12163		return;
12164	}
12165	FREE_LOCK(ump);
12166	if ((error = bwrite(ibp)) != 0)
12167		softdep_error("softdep_update_inodeblock: bwrite", error);
12168}
12169
12170/*
12171 * Merge a new inode dependency list (such as id_newinoupdt) into an
12172 * old inode dependency list (such as id_inoupdt). This routine must be
12173 * called with splbio interrupts blocked.
12174 */
12175static void
12176merge_inode_lists(newlisthead, oldlisthead)
12177	struct allocdirectlst *newlisthead;
12178	struct allocdirectlst *oldlisthead;
12179{
12180	struct allocdirect *listadp, *newadp;
12181
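	/*
	 * Both lists are kept sorted by logical block offset.  Each new
	 * entry is inserted in front of the first old entry with an equal
	 * or larger offset; when the offsets match, the two dependencies
	 * are combined with allocdirect_merge().  Anything remaining on the
	 * new list afterwards is appended to the tail of the old list.
	 */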
12182	newadp = TAILQ_FIRST(newlisthead);
12183	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
12184		if (listadp->ad_offset < newadp->ad_offset) {
12185			listadp = TAILQ_NEXT(listadp, ad_next);
12186			continue;
12187		}
12188		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12189		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
12190		if (listadp->ad_offset == newadp->ad_offset) {
12191			allocdirect_merge(oldlisthead, newadp,
12192			    listadp);
12193			listadp = newadp;
12194		}
12195		newadp = TAILQ_FIRST(newlisthead);
12196	}
12197	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
12198		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12199		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
12200	}
12201}
12202
12203/*
12204 * If we are doing an fsync, then we must ensure that any directory
12205 * entries for the inode have been written after the inode gets to disk.
12206 */
12207int
12208softdep_fsync(vp)
12209	struct vnode *vp;	/* vnode of the file being fsync'ed */
12210{
12211	struct inodedep *inodedep;
12212	struct pagedep *pagedep;
12213	struct inoref *inoref;
12214	struct ufsmount *ump;
12215	struct worklist *wk;
12216	struct diradd *dap;
12217	struct mount *mp;
12218	struct vnode *pvp;
12219	struct inode *ip;
12220	struct buf *bp;
12221	struct fs *fs;
12222	struct thread *td = curthread;
12223	int error, flushparent, pagedep_new_block;
12224	ino_t parentino;
12225	ufs_lbn_t lbn;
12226
12227	ip = VTOI(vp);
12228	fs = ip->i_fs;
12229	ump = ip->i_ump;
12230	mp = vp->v_mount;
12231	if (MOUNTEDSOFTDEP(mp) == 0)
12232		return (0);
12233	ACQUIRE_LOCK(ump);
12234restart:
12235	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12236		FREE_LOCK(ump);
12237		return (0);
12238	}
12239	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12240		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12241		    == DEPCOMPLETE) {
12242			jwait(&inoref->if_list, MNT_WAIT);
12243			goto restart;
12244		}
12245	}
12246	if (!LIST_EMPTY(&inodedep->id_inowait) ||
12247	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
12248	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
12249	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
12250	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
12251		panic("softdep_fsync: pending ops %p", inodedep);
12252	for (error = 0, flushparent = 0; ; ) {
12253		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
12254			break;
12255		if (wk->wk_type != D_DIRADD)
12256			panic("softdep_fsync: Unexpected type %s",
12257			    TYPENAME(wk->wk_type));
12258		dap = WK_DIRADD(wk);
12259		/*
12260		 * Flush our parent if this directory entry has a MKDIR_PARENT
12261		 * dependency or is contained in a newly allocated block.
12262		 */
12263		if (dap->da_state & DIRCHG)
12264			pagedep = dap->da_previous->dm_pagedep;
12265		else
12266			pagedep = dap->da_pagedep;
12267		parentino = pagedep->pd_ino;
12268		lbn = pagedep->pd_lbn;
12269		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12270			panic("softdep_fsync: dirty");
12271		if ((dap->da_state & MKDIR_PARENT) ||
12272		    (pagedep->pd_state & NEWBLOCK))
12273			flushparent = 1;
12274		else
12275			flushparent = 0;
12276		/*
12277		 * If we are being fsync'ed as part of vgone'ing this vnode,
12278		 * then we will not be able to release and recover the
12279		 * vnode below, so we just have to give up on writing its
12280		 * directory entry out. It will eventually be written, just
12281		 * not now, but then the user was not asking to have it
12282		 * written, so we are not breaking any promises.
12283		 */
12284		if (vp->v_iflag & VI_DOOMED)
12285			break;
12286		/*
12287		 * We prevent deadlock by always fetching inodes from the
12288		 * root, moving down the directory tree. Thus, when fetching
12289		 * our parent directory, we first try to get the lock. If
12290		 * that fails, we must unlock ourselves before requesting
12291		 * the lock on our parent. See the comment in ufs_lookup
12292		 * for details on possible races.
12293		 */
12294		FREE_LOCK(ump);
12295		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
12296		    FFSV_FORCEINSMQ)) {
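			/*
			 * The non-blocking attempt to lock the parent failed.
			 * Busy the mount so that it cannot be unmounted while
			 * we drop our own lock, then fetch the parent with a
			 * blocking lock, re-lock ourselves, and check that
			 * neither vnode was reclaimed in the interim.
			 */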
12297			error = vfs_busy(mp, MBF_NOWAIT);
12298			if (error != 0) {
12299				vfs_ref(mp);
12300				VOP_UNLOCK(vp, 0);
12301				error = vfs_busy(mp, 0);
12302				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12303				vfs_rel(mp);
12304				if (error != 0)
12305					return (ENOENT);
12306				if (vp->v_iflag & VI_DOOMED) {
12307					vfs_unbusy(mp);
12308					return (ENOENT);
12309				}
12310			}
12311			VOP_UNLOCK(vp, 0);
12312			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
12313			    &pvp, FFSV_FORCEINSMQ);
12314			vfs_unbusy(mp);
12315			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12316			if (vp->v_iflag & VI_DOOMED) {
12317				if (error == 0)
12318					vput(pvp);
12319				error = ENOENT;
12320			}
12321			if (error != 0)
12322				return (error);
12323		}
12324		/*
12325		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12326		 * that are contained in direct blocks will be resolved by
12327		 * doing a ffs_update. Pagedeps contained in indirect blocks
12328		 * may require a complete sync'ing of the directory. So, we
12329		 * try the cheap and fast ffs_update first, and if that fails,
12330		 * then we do the slower ffs_syncvnode of the directory.
12331		 */
12332		if (flushparent) {
12333			int locked;
12334
12335			if ((error = ffs_update(pvp, 1)) != 0) {
12336				vput(pvp);
12337				return (error);
12338			}
12339			ACQUIRE_LOCK(ump);
12340			locked = 1;
12341			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12342				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12343					if (wk->wk_type != D_DIRADD)
12344						panic("softdep_fsync: Unexpected type %s",
12345						      TYPENAME(wk->wk_type));
12346					dap = WK_DIRADD(wk);
12347					if (dap->da_state & DIRCHG)
12348						pagedep = dap->da_previous->dm_pagedep;
12349					else
12350						pagedep = dap->da_pagedep;
12351					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12352					FREE_LOCK(ump);
12353					locked = 0;
12354					if (pagedep_new_block && (error =
12355					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12356						vput(pvp);
12357						return (error);
12358					}
12359				}
12360			}
12361			if (locked)
12362				FREE_LOCK(ump);
12363		}
12364		/*
12365		 * Flush directory page containing the inode's name.
12366		 */
12367		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12368		    &bp);
12369		if (error == 0)
12370			error = bwrite(bp);
12371		else
12372			brelse(bp);
12373		vput(pvp);
12374		if (error != 0)
12375			return (error);
12376		ACQUIRE_LOCK(ump);
12377		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12378			break;
12379	}
12380	FREE_LOCK(ump);
12381	return (0);
12382}
12383
12384/*
12385 * Flush all the dirty bitmaps associated with the block device
12386 * before flushing the rest of the dirty blocks so as to reduce
12387 * the number of dependencies that will have to be rolled back.
12388 *
12389 * XXX Unused?
12390 */
12391void
12392softdep_fsync_mountdev(vp)
12393	struct vnode *vp;
12394{
12395	struct buf *bp, *nbp;
12396	struct worklist *wk;
12397	struct bufobj *bo;
12398
12399	if (!vn_isdisk(vp, NULL))
12400		panic("softdep_fsync_mountdev: vnode not a disk");
12401	bo = &vp->v_bufobj;
12402restart:
12403	BO_LOCK(bo);
12404	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12405		/*
12406		 * If it is already scheduled, skip to the next buffer.
12407		 */
12408		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12409			continue;
12410
12411		if ((bp->b_flags & B_DELWRI) == 0)
12412			panic("softdep_fsync_mountdev: not dirty");
12413		/*
12414		 * We are only interested in bitmaps with outstanding
12415		 * dependencies.
12416		 */
12417		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12418		    wk->wk_type != D_BMSAFEMAP ||
12419		    (bp->b_vflags & BV_BKGRDINPROG)) {
12420			BUF_UNLOCK(bp);
12421			continue;
12422		}
12423		BO_UNLOCK(bo);
12424		bremfree(bp);
12425		(void) bawrite(bp);
12426		goto restart;
12427	}
12428	drain_output(vp);
12429	BO_UNLOCK(bo);
12430}
12431
12432/*
12433 * Sync all cylinder groups that were dirty at the time this function is
12434 * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12435 * is used to flush freedep activity that may be holding up writes to a
12436 * is used to flush freedep activity that may be holding up writes to an
12437 */
12438static int
12439sync_cgs(mp, waitfor)
12440	struct mount *mp;
12441	int waitfor;
12442{
12443	struct bmsafemap *bmsafemap;
12444	struct bmsafemap *sentinel;
12445	struct ufsmount *ump;
12446	struct buf *bp;
12447	int error;
12448
12449	sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12450	sentinel->sm_cg = -1;
12451	ump = VFSTOUFS(mp);
12452	error = 0;
12453	ACQUIRE_LOCK(ump);
12454	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
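	/*
	 * The sentinel records our position in the dirty cg list so that
	 * the softdep lock may be dropped while a cg buffer is written.
	 * Cylinder groups dirtied after this point are inserted ahead of
	 * the sentinel and therefore are not revisited by this pass.
	 */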
12455	for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12456	    bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12457		/* Skip sentinels and cgs with no work to release. */
12458		if (bmsafemap->sm_cg == -1 ||
12459		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12460		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
12461			LIST_REMOVE(sentinel, sm_next);
12462			LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12463			continue;
12464		}
12465		/*
12466		 * If we fail to get the lock and we are waiting, try again.
12467		 * Otherwise move on to the next buf and try to sync it.
12468		 */
12469		bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
12470		if (bp == NULL && waitfor == MNT_WAIT)
12471			continue;
12472		LIST_REMOVE(sentinel, sm_next);
12473		LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12474		if (bp == NULL)
12475			continue;
12476		FREE_LOCK(ump);
12477		if (waitfor == MNT_NOWAIT)
12478			bawrite(bp);
12479		else
12480			error = bwrite(bp);
12481		ACQUIRE_LOCK(ump);
12482		if (error)
12483			break;
12484	}
12485	LIST_REMOVE(sentinel, sm_next);
12486	FREE_LOCK(ump);
12487	free(sentinel, M_BMSAFEMAP);
12488	return (error);
12489}
12490
12491/*
12492 * This routine is called when we are trying to synchronously flush a
12493 * file. This routine must eliminate any filesystem metadata dependencies
12494 * so that the syncing routine can succeed.
12495 */
12496int
12497softdep_sync_metadata(struct vnode *vp)
12498{
12499	struct inode *ip;
12500	int error;
12501
12502	ip = VTOI(vp);
12503	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
12504	    ("softdep_sync_metadata called on non-softdep filesystem"));
12505	/*
12506	 * Ensure that any direct block dependencies have been cleared,
12507	 * truncations are started, and inode references are journaled.
12508	 */
12509	ACQUIRE_LOCK(ip->i_ump);
12510	/*
12511	 * Write all journal records to prevent rollbacks on devvp.
12512	 */
12513	if (vp->v_type == VCHR)
12514		softdep_flushjournal(vp->v_mount);
12515	error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
12516	/*
12517	 * Ensure that all truncates are written so we won't find deps on
12518	 * indirect blocks.
12519	 */
12520	process_truncates(vp);
12521	FREE_LOCK(ip->i_ump);
12522
12523	return (error);
12524}
12525
12526/*
12527 * This routine is called when we are attempting to sync a buf with
12528 * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12529 * other IO it can but returns EBUSY if the buffer is not yet able to
12530 * be written.  Dependencies which will not cause rollbacks will always
12531 * return 0.
12532 */
12533int
12534softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12535{
12536	struct indirdep *indirdep;
12537	struct pagedep *pagedep;
12538	struct allocindir *aip;
12539	struct newblk *newblk;
12540	struct ufsmount *ump;
12541	struct buf *nbp;
12542	struct worklist *wk;
12543	int i, error;
12544
12545	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12546	    ("softdep_sync_buf called on non-softdep filesystem"));
12547	/*
12548	 * For VCHR we just don't want to force flush any dependencies that
12549	 * will cause rollbacks.
12550	 */
12551	if (vp->v_type == VCHR) {
12552		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12553			return (EBUSY);
12554		return (0);
12555	}
12556	ump = VTOI(vp)->i_ump;
12557	ACQUIRE_LOCK(ump);
12558	/*
12559	 * As we hold the buffer locked, none of its dependencies
12560	 * will disappear.
12561	 */
12562	error = 0;
12563top:
12564	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12565		switch (wk->wk_type) {
12566
12567		case D_ALLOCDIRECT:
12568		case D_ALLOCINDIR:
12569			newblk = WK_NEWBLK(wk);
12570			if (newblk->nb_jnewblk != NULL) {
12571				if (waitfor == MNT_NOWAIT) {
12572					error = EBUSY;
12573					goto out_unlock;
12574				}
12575				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12576				goto top;
12577			}
12578			if (newblk->nb_state & DEPCOMPLETE ||
12579			    waitfor == MNT_NOWAIT)
12580				continue;
12581			nbp = newblk->nb_bmsafemap->sm_buf;
12582			nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12583			if (nbp == NULL)
12584				goto top;
12585			FREE_LOCK(ump);
12586			if ((error = bwrite(nbp)) != 0)
12587				goto out;
12588			ACQUIRE_LOCK(ump);
12589			continue;
12590
12591		case D_INDIRDEP:
12592			indirdep = WK_INDIRDEP(wk);
12593			if (waitfor == MNT_NOWAIT) {
12594				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12595				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12596					error = EBUSY;
12597					goto out_unlock;
12598				}
12599			}
12600			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12601				panic("softdep_sync_buf: truncation pending.");
12602		restart:
12603			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12604				newblk = (struct newblk *)aip;
12605				if (newblk->nb_jnewblk != NULL) {
12606					jwait(&newblk->nb_jnewblk->jn_list,
12607					    waitfor);
12608					goto restart;
12609				}
12610				if (newblk->nb_state & DEPCOMPLETE)
12611					continue;
12612				nbp = newblk->nb_bmsafemap->sm_buf;
12613				nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12614				if (nbp == NULL)
12615					goto restart;
12616				FREE_LOCK(ump);
12617				if ((error = bwrite(nbp)) != 0)
12618					goto out;
12619				ACQUIRE_LOCK(ump);
12620				goto restart;
12621			}
12622			continue;
12623
12624		case D_PAGEDEP:
12625			/*
12626			 * Only flush directory entries in synchronous passes.
12627			 */
12628			if (waitfor != MNT_WAIT) {
12629				error = EBUSY;
12630				goto out_unlock;
12631			}
12632			/*
12633			 * While syncing snapshots, we must allow recursive
12634			 * lookups.
12635			 */
12636			BUF_AREC(bp);
12637			/*
12638			 * We are trying to sync a directory that may
12639			 * have dependencies on both its own metadata
12640			 * and/or dependencies on the inodes of any
12641			 * recently allocated files. We walk its diradd
12642			 * lists pushing out the associated inode.
12643			 */
12644			pagedep = WK_PAGEDEP(wk);
12645			for (i = 0; i < DAHASHSZ; i++) {
12646				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL)
12647					continue;
12648				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12649				    &pagedep->pd_diraddhd[i]))) {
12650					BUF_NOREC(bp);
12651					goto out_unlock;
12652				}
12653			}
12654			BUF_NOREC(bp);
12655			continue;
12656
12657		case D_FREEWORK:
12658		case D_FREEDEP:
12659		case D_JSEGDEP:
12660		case D_JNEWBLK:
12661			continue;
12662
12663		default:
12664			panic("softdep_sync_buf: Unknown type %s",
12665			    TYPENAME(wk->wk_type));
12666			/* NOTREACHED */
12667		}
12668	}
12669out_unlock:
12670	FREE_LOCK(ump);
12671out:
12672	return (error);
12673}
12674
12675/*
12676 * Flush the dependencies associated with an inodedep.
12677 * Called with splbio blocked.
12678 */
12679static int
12680flush_inodedep_deps(vp, mp, ino)
12681	struct vnode *vp;
12682	struct mount *mp;
12683	ino_t ino;
12684{
12685	struct inodedep *inodedep;
12686	struct inoref *inoref;
12687	struct ufsmount *ump;
12688	int error, waitfor;
12689
12690	/*
12691	 * This work is done in two passes. The first pass grabs most
12692	 * of the buffers and begins asynchronously writing them. The
12693	 * only way to wait for these asynchronous writes is to sleep
12694	 * on the filesystem vnode which may stay busy for a long time
12695	 * if the filesystem is active. So, instead, we make a second
12696	 * pass over the dependencies blocking on each write. In the
12697	 * usual case we will be blocking against a write that we
12698	 * initiated, so when it is done the dependency will have been
12699	 * resolved. Thus the second pass is expected to end quickly.
12700	 * We give a brief window at the top of the loop to allow
12701	 * any pending I/O to complete.
12702	 */
12703	ump = VFSTOUFS(mp);
12704	LOCK_OWNED(ump);
12705	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12706		if (error)
12707			return (error);
12708		FREE_LOCK(ump);
12709		ACQUIRE_LOCK(ump);
12710restart:
12711		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12712			return (0);
12713		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12714			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12715			    == DEPCOMPLETE) {
12716				jwait(&inoref->if_list, MNT_WAIT);
12717				goto restart;
12718			}
12719		}
12720		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12721		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12722		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12723		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12724			continue;
12725		/*
12726		 * If this was the second pass, we are done; otherwise do pass 2.
12727		 */
12728		if (waitfor == MNT_WAIT)
12729			break;
12730		waitfor = MNT_WAIT;
12731	}
12732	/*
12733	 * Try freeing inodedep in case all dependencies have been removed.
12734	 */
12735	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12736		(void) free_inodedep(inodedep);
12737	return (0);
12738}
12739
12740/*
12741 * Flush an inode dependency list.
12742 * Called with splbio blocked.
12743 */
12744static int
12745flush_deplist(listhead, waitfor, errorp)
12746	struct allocdirectlst *listhead;
12747	int waitfor;
12748	int *errorp;
12749{
12750	struct allocdirect *adp;
12751	struct newblk *newblk;
12752	struct ufsmount *ump;
12753	struct buf *bp;
12754
12755	if ((adp = TAILQ_FIRST(listhead)) == NULL)
12756		return (0);
12757	ump = VFSTOUFS(adp->ad_list.wk_mp);
12758	LOCK_OWNED(ump);
12759	TAILQ_FOREACH(adp, listhead, ad_next) {
12760		newblk = (struct newblk *)adp;
12761		if (newblk->nb_jnewblk != NULL) {
12762			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12763			return (1);
12764		}
12765		if (newblk->nb_state & DEPCOMPLETE)
12766			continue;
12767		bp = newblk->nb_bmsafemap->sm_buf;
12768		bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
12769		if (bp == NULL) {
12770			if (waitfor == MNT_NOWAIT)
12771				continue;
12772			return (1);
12773		}
12774		FREE_LOCK(ump);
12775		if (waitfor == MNT_NOWAIT)
12776			bawrite(bp);
12777		else
12778			*errorp = bwrite(bp);
12779		ACQUIRE_LOCK(ump);
12780		return (1);
12781	}
12782	return (0);
12783}
12784
12785/*
12786 * Flush dependencies associated with an allocdirect block.
12787 */
12788static int
12789flush_newblk_dep(vp, mp, lbn)
12790	struct vnode *vp;
12791	struct mount *mp;
12792	ufs_lbn_t lbn;
12793{
12794	struct newblk *newblk;
12795	struct ufsmount *ump;
12796	struct bufobj *bo;
12797	struct inode *ip;
12798	struct buf *bp;
12799	ufs2_daddr_t blkno;
12800	int error;
12801
12802	error = 0;
12803	bo = &vp->v_bufobj;
12804	ip = VTOI(vp);
12805	blkno = DIP(ip, i_db[lbn]);
12806	if (blkno == 0)
12807		panic("flush_newblk_dep: Missing block");
12808	ump = VFSTOUFS(mp);
12809	ACQUIRE_LOCK(ump);
12810	/*
12811	 * Loop until all dependencies related to this block are satisfied.
12812	 * We must be careful to restart after each sleep in case a write
12813	 * completes some part of this process for us.
12814	 */
12815	for (;;) {
12816		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
12817			FREE_LOCK(ump);
12818			break;
12819		}
12820		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
12821			panic("flush_newblk_dep: Bad newblk %p", newblk);
12822		/*
12823		 * Flush the journal.
12824		 */
12825		if (newblk->nb_jnewblk != NULL) {
12826			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12827			continue;
12828		}
12829		/*
12830		 * Write the bitmap dependency.
12831		 */
12832		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
12833			bp = newblk->nb_bmsafemap->sm_buf;
12834			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
12835			if (bp == NULL)
12836				continue;
12837			FREE_LOCK(ump);
12838			error = bwrite(bp);
12839			if (error)
12840				break;
12841			ACQUIRE_LOCK(ump);
12842			continue;
12843		}
12844		/*
12845		 * Write the buffer.
12846		 */
12847		FREE_LOCK(ump);
12848		BO_LOCK(bo);
12849		bp = gbincore(bo, lbn);
12850		if (bp != NULL) {
12851			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
12852			    LK_INTERLOCK, BO_LOCKPTR(bo));
12853			if (error == ENOLCK) {
12854				ACQUIRE_LOCK(ump);
12855				continue; /* Slept, retry */
12856			}
12857			if (error != 0)
12858				break;	/* Failed */
12859			if (bp->b_flags & B_DELWRI) {
12860				bremfree(bp);
12861				error = bwrite(bp);
12862				if (error)
12863					break;
12864			} else
12865				BUF_UNLOCK(bp);
12866		} else
12867			BO_UNLOCK(bo);
12868		/*
12869		 * We have to wait for the direct pointers to
12870		 * point at the newdirblk before the dependency
12871		 * will go away.
12872		 */
12873		error = ffs_update(vp, 1);
12874		if (error)
12875			break;
12876		ACQUIRE_LOCK(ump);
12877	}
12878	return (error);
12879}
12880
12881/*
12882 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
12883 * Called with splbio blocked.
12884 */
12885static int
12886flush_pagedep_deps(pvp, mp, diraddhdp)
12887	struct vnode *pvp;
12888	struct mount *mp;
12889	struct diraddhd *diraddhdp;
12890{
12891	struct inodedep *inodedep;
12892	struct inoref *inoref;
12893	struct ufsmount *ump;
12894	struct diradd *dap;
12895	struct vnode *vp;
12896	int error = 0;
12897	struct buf *bp;
12898	ino_t inum;
12899	struct diraddhd unfinished;
12900
12901	LIST_INIT(&unfinished);
12902	ump = VFSTOUFS(mp);
12903	LOCK_OWNED(ump);
12904restart:
12905	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
12906		/*
12907		 * Flush ourselves if this directory entry
12908		 * has a MKDIR_PARENT dependency.
12909		 */
12910		if (dap->da_state & MKDIR_PARENT) {
12911			FREE_LOCK(ump);
12912			if ((error = ffs_update(pvp, 1)) != 0)
12913				break;
12914			ACQUIRE_LOCK(ump);
12915			/*
12916			 * If that cleared dependencies, go on to next.
12917			 */
12918			if (dap != LIST_FIRST(diraddhdp))
12919				continue;
12920			/*
12921			 * All MKDIR_PARENT dependencies and all the
12922			 * NEWBLOCK pagedeps that are contained in direct
12923			 * blocks were resolved by doing above ffs_update.
12924			 * Pagedeps contained in indirect blocks may
12925			 * require a complete sync'ing of the directory.
12926			 * We are in the midst of doing a complete sync,
12927			 * so if they are not resolved in this pass we
12928			 * defer them for now as they will be sync'ed by
12929			 * our caller shortly.
12930			 */
12931			LIST_REMOVE(dap, da_pdlist);
12932			LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
12933			continue;
12934		}
12935		/*
12936		 * A newly allocated directory must have its "." and
12937		 * ".." entries written out before its name can be
12938		 * committed in its parent.
12939		 */
12940		inum = dap->da_newinum;
12941		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12942			panic("flush_pagedep_deps: lost inode1");
12943		/*
12944		 * Wait for any pending journal adds to complete so we don't
12945		 * cause rollbacks while syncing.
12946		 */
12947		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12948			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12949			    == DEPCOMPLETE) {
12950				jwait(&inoref->if_list, MNT_WAIT);
12951				goto restart;
12952			}
12953		}
12954		if (dap->da_state & MKDIR_BODY) {
12955			FREE_LOCK(ump);
12956			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12957			    FFSV_FORCEINSMQ)))
12958				break;
12959			error = flush_newblk_dep(vp, mp, 0);
12960			/*
12961			 * If we still have the dependency we might need to
12962			 * update the vnode to sync the new link count to
12963			 * disk.
12964			 */
12965			if (error == 0 && dap == LIST_FIRST(diraddhdp))
12966				error = ffs_update(vp, 1);
12967			vput(vp);
12968			if (error != 0)
12969				break;
12970			ACQUIRE_LOCK(ump);
12971			/*
12972			 * If that cleared dependencies, go on to next.
12973			 */
12974			if (dap != LIST_FIRST(diraddhdp))
12975				continue;
12976			if (dap->da_state & MKDIR_BODY) {
12977				inodedep_lookup(UFSTOVFS(ump), inum, 0,
12978				    &inodedep);
12979				panic("flush_pagedep_deps: MKDIR_BODY "
12980				    "inodedep %p dap %p vp %p",
12981				    inodedep, dap, vp);
12982			}
12983		}
12984		/*
12985		 * Flush the inode on which the directory entry depends.
12986		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
12987		 * the only remaining dependency is that the updated inode
12988		 * count must get pushed to disk. The inode has already
12989		 * been pushed into its inode buffer (via VOP_UPDATE) at
12990		 * the time of the reference count change. So we need only
12991		 * locate that buffer, ensure that there will be no rollback
12992		 * caused by a bitmap dependency, then write the inode buffer.
12993		 */
12994retry:
12995		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12996			panic("flush_pagedep_deps: lost inode");
12997		/*
12998		 * If the inode still has bitmap dependencies,
12999		 * push them to disk.
13000		 */
13001		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
13002			bp = inodedep->id_bmsafemap->sm_buf;
13003			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13004			if (bp == NULL)
13005				goto retry;
13006			FREE_LOCK(ump);
13007			if ((error = bwrite(bp)) != 0)
13008				break;
13009			ACQUIRE_LOCK(ump);
13010			if (dap != LIST_FIRST(diraddhdp))
13011				continue;
13012		}
13013		/*
13014		 * If the inode is still sitting in a buffer waiting
13015		 * to be written or waiting for the link count to be
13016		 * adjusted, update it here to flush it to disk.
13017		 */
13018		if (dap == LIST_FIRST(diraddhdp)) {
13019			FREE_LOCK(ump);
13020			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
13021			    FFSV_FORCEINSMQ)))
13022				break;
13023			error = ffs_update(vp, 1);
13024			vput(vp);
13025			if (error)
13026				break;
13027			ACQUIRE_LOCK(ump);
13028		}
13029		/*
13030		 * If we have failed to get rid of all the dependencies
13031		 * then something is seriously wrong.
13032		 */
13033		if (dap == LIST_FIRST(diraddhdp)) {
13034			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
13035			panic("flush_pagedep_deps: failed to flush "
13036			    "inodedep %p ino %ju dap %p",
13037			    inodedep, (uintmax_t)inum, dap);
13038		}
13039	}
13040	if (error)
13041		ACQUIRE_LOCK(ump);
13042	while ((dap = LIST_FIRST(&unfinished)) != NULL) {
13043		LIST_REMOVE(dap, da_pdlist);
13044		LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
13045	}
13046	return (error);
13047}
13048
13049/*
13050 * A large burst of file addition or deletion activity can drive the
13051 * memory load excessively high. First attempt to slow things down
13052 * using the techniques below. If that fails, this routine requests
13053 * the offending operations to fall back to running synchronously
13054 * until the memory load returns to a reasonable level.
13055 */
13056int
13057softdep_slowdown(vp)
13058	struct vnode *vp;
13059{
13060	struct ufsmount *ump;
13061	int jlow;
13062	int max_softdeps_hard;
13063
13064	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13065	    ("softdep_slowdown called on non-softdep filesystem"));
13066	ump = VFSTOUFS(vp->v_mount);
13067	ACQUIRE_LOCK(ump);
13068	jlow = 0;
13069	/*
13070	 * Check for journal space if needed.
13071	 */
13072	if (DOINGSUJ(vp)) {
13073		if (journal_space(ump, 0) == 0)
13074			jlow = 1;
13075	}
13076	/*
13077	 * If the system is under its limits and our filesystem is
13078	 * not responsible for more than our share of the usage and
13079	 * we are not low on journal space, then no need to slow down.
13080	 */
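	/* The hard limit allows 10% slop over the configured max_softdeps. */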
13081	max_softdeps_hard = max_softdeps * 11 / 10;
13082	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
13083	    dep_current[D_INODEDEP] < max_softdeps_hard &&
13084	    dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
13085	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
13086	    ump->softdep_curdeps[D_DIRREM] <
13087	    (max_softdeps_hard / 2) / stat_flush_threads &&
13088	    ump->softdep_curdeps[D_INODEDEP] <
13089	    max_softdeps_hard / stat_flush_threads &&
13090	    ump->softdep_curdeps[D_INDIRDEP] <
13091	    (max_softdeps_hard / 1000) / stat_flush_threads &&
13092	    ump->softdep_curdeps[D_FREEBLKS] <
13093	    max_softdeps_hard / stat_flush_threads) {
13094		FREE_LOCK(ump);
13095		return (0);
13096	}
13097	/*
13098	 * If the journal is low or our filesystem is over its limit
13099	 * then speed up the cleanup.
13100	 */
13101	if (ump->softdep_curdeps[D_INDIRDEP] <
13102	    (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
13103		softdep_speedup(ump);
13104	stat_sync_limit_hit += 1;
13105	FREE_LOCK(ump);
13106	/*
13107	 * We only slow down the rate at which new dependencies are
13108	 * generated if we are not using journaling. With journaling,
13109	 * the cleanup should always be sufficient to keep things
13110	 * under control.
13111	 */
13112	if (DOINGSUJ(vp))
13113		return (0);
13114	return (1);
13115}
13116
13117/*
13118 * Called by the allocation routines when they are about to fail
13119 * in the hope that we can free up the requested resource (inodes
13120 * or disk space).
13121 *
13122 * First check to see if the work list has anything on it. If it has,
13123 * clean up entries until we successfully free the requested resource.
13124 * Because this process holds inodes locked, we cannot handle any remove
13125 * requests that might block on a locked inode as that could lead to
13126 * deadlock. If the worklist yields none of the requested resource,
13127 * start syncing out vnodes to free up the needed space.
13128 */
13129int
13130softdep_request_cleanup(fs, vp, cred, resource)
13131	struct fs *fs;
13132	struct vnode *vp;
13133	struct ucred *cred;
13134	int resource;
13135{
13136	struct ufsmount *ump;
13137	struct mount *mp;
13138	struct vnode *lvp, *mvp;
13139	long starttime;
13140	ufs2_daddr_t needed;
13141	int error;
13142
13143	/*
13144	 * If we are being called because of a process doing a
13145	 * copy-on-write, then it is not safe to process any
13146	 * worklist items as we will recurse into the copyonwrite
13147	 * routine.  This will result in an incoherent snapshot.
13148	 * If the vnode that we hold is a snapshot, we must avoid
13149	 * handling other resources that could cause deadlock.
13150	 */
13151	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
13152		return (0);
13153
13154	if (resource == FLUSH_BLOCKS_WAIT)
13155		stat_cleanup_blkrequests += 1;
13156	else
13157		stat_cleanup_inorequests += 1;
13158
13159	mp = vp->v_mount;
13160	ump = VFSTOUFS(mp);
13161	mtx_assert(UFS_MTX(ump), MA_OWNED);
13162	UFS_UNLOCK(ump);
13163	error = ffs_update(vp, 1);
13164	if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
13165		UFS_LOCK(ump);
13166		return (0);
13167	}
13168	/*
13169	 * If we are in need of resources, start by cleaning up
13170	 * any block removals associated with our inode.
13171	 */
13172	ACQUIRE_LOCK(ump);
13173	process_removes(vp);
13174	process_truncates(vp);
13175	FREE_LOCK(ump);
13176	/*
13177	 * Now clean up at least as many resources as we will need.
13178	 *
13179	 * When requested to clean up inodes, the number that are needed
13180	 * is set by the number of simultaneous writers (mnt_writeopcount)
13181	 * plus a bit of slop (2) in case some more writers show up while
13182	 * we are cleaning.
13183	 *
13184	 * When requested to free up space, the amount of space that
13185	 * we need is enough blocks to allocate a full-sized segment
13186	 * (fs_contigsumsize). The number of such segments that will
13187	 * be needed is set by the number of simultaneous writers
13188	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
13189	 * writers show up while we are cleaning.
13190	 *
13191	 * Additionally, if we are unprivileged and allocating space,
13192	 * we need to ensure that we clean up enough blocks to get the
13193	 * needed number of blocks over the threshold of the minimum
13194	 * number of blocks required to be kept free by the filesystem
13195	 * (fs_minfree).
13196	 */
13197	if (resource == FLUSH_INODES_WAIT) {
13198		needed = vp->v_mount->mnt_writeopcount + 2;
13199	} else if (resource == FLUSH_BLOCKS_WAIT) {
13200		needed = (vp->v_mount->mnt_writeopcount + 2) *
13201		    fs->fs_contigsumsize;
13202		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
13203			needed += fragstoblks(fs,
13204			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
13205			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
13206	} else {
13207		UFS_LOCK(ump);
13208		printf("softdep_request_cleanup: Unknown resource type %d\n",
13209		    resource);
13210		return (0);
13211	}
13212	starttime = time_second;
13213retry:
13214	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
13215	    fs->fs_cstotal.cs_nbfree <= needed) ||
13216	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13217	    fs->fs_cstotal.cs_nifree <= needed)) {
13218		ACQUIRE_LOCK(ump);
13219		if (ump->softdep_on_worklist > 0 &&
13220		    process_worklist_item(UFSTOVFS(ump),
13221		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
13222			stat_worklist_push += 1;
13223		FREE_LOCK(ump);
13224	}
13225	/*
13226	 * If we still need resources and there are no more worklist
13227	 * entries to process to obtain them, we have to start flushing
13228	 * the dirty vnodes to force the release of additional requests
13229	 * to the worklist that we can then process to reap additional
13230	 * resources. We walk the vnodes associated with the mount point
13231	 * until we get the needed worklist requests that we can reap.
13232	 */
13233	if ((resource == FLUSH_BLOCKS_WAIT &&
13234	     fs->fs_cstotal.cs_nbfree <= needed) ||
13235	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13236	     fs->fs_cstotal.cs_nifree <= needed)) {
13237		MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
13238			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == NULL) {
13239				VI_UNLOCK(lvp);
13240				continue;
13241			}
13242			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
13243			    curthread))
13244				continue;
13245			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
13246				vput(lvp);
13247				continue;
13248			}
13249			(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
13250			vput(lvp);
13251		}
13252		lvp = ump->um_devvp;
13253		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
13254			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
13255			VOP_UNLOCK(lvp, 0);
13256		}
13257		if (ump->softdep_on_worklist > 0) {
13258			stat_cleanup_retries += 1;
13259			goto retry;
13260		}
13261		stat_cleanup_failures += 1;
13262	}
13263	if (time_second - starttime > stat_cleanup_high_delay)
13264		stat_cleanup_high_delay = time_second - starttime;
13265	UFS_LOCK(ump);
13266	return (1);
13267}
13268
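/*
 * The two helpers below report whether the global count of inodedep
 * (respectively dirrem) structures has exceeded its limit and this
 * filesystem is holding more than its per-flush-thread share of them.
 */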
13269static bool
13270softdep_excess_inodes(struct ufsmount *ump)
13271{
13272
13273	return (dep_current[D_INODEDEP] > max_softdeps &&
13274	    ump->softdep_curdeps[D_INODEDEP] > max_softdeps /
13275	    stat_flush_threads);
13276}
13277
13278static bool
13279softdep_excess_dirrem(struct ufsmount *ump)
13280{
13281
13282	return (dep_current[D_DIRREM] > max_softdeps / 2 &&
13283	    ump->softdep_curdeps[D_DIRREM] > (max_softdeps / 2) /
13284	    stat_flush_threads);
13285}
13286
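/*
 * Schedule the current thread to perform a dependency cleanup from its
 * AST handler on return to user mode: record the mount in td_su and set
 * TDF_ASTPENDING so that softdep_ast_cleanup_proc() runs.  Kernel threads
 * that never deliver ASTs are skipped unless they opt in via P2_AST_SU.
 */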
13287static void
13288schedule_cleanup(struct mount *mp)
13289{
13290	struct ufsmount *ump;
13291	struct thread *td;
13292
13293	ump = VFSTOUFS(mp);
13294	LOCK_OWNED(ump);
13295	FREE_LOCK(ump);
13296	td = curthread;
13297	if ((td->td_pflags & TDP_KTHREAD) != 0 &&
13298	    (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
13299		/*
13300		 * No ast is delivered to kernel threads, so nobody
13301		 * would deref the mp.  Some kernel threads
13302		 * explicitly check for AST, e.g. the NFS daemon does
13303		 * this in the serving loop.
13304		 */
13305		return;
13306	}
13307	if (td->td_su != NULL)
13308		vfs_rel(td->td_su);
13309	vfs_ref(mp);
13310	td->td_su = mp;
13311	thread_lock(td);
13312	td->td_flags |= TDF_ASTPENDING;
13313	thread_unlock(td);
13314}
13315
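/*
 * Perform the cleanup requested by schedule_cleanup().  Called with the
 * mount recorded in td_su; repeatedly request inode and block cleanups
 * until the dependency counts fall back under their limits.
 */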
13316static void
13317softdep_ast_cleanup_proc(void)
13318{
13319	struct thread *td;
13320	struct mount *mp;
13321	struct ufsmount *ump;
13322	int error;
13323	bool req;
13324
13325	td = curthread;
13326	mp = td->td_su;
13327	if (mp == NULL)
13328		return;
13329	td->td_su = NULL;
13330	error = vfs_busy(mp, MBF_NOWAIT);
13331	vfs_rel(mp);
13332	if (error != 0)
13333		return;
13334	if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
13335		ump = VFSTOUFS(mp);
13336		for (;;) {
13337			req = false;
13338			ACQUIRE_LOCK(ump);
13339			if (softdep_excess_inodes(ump)) {
13340				req = true;
13341				request_cleanup(mp, FLUSH_INODES);
13342			}
13343			if (softdep_excess_dirrem(ump)) {
13344				req = true;
13345				request_cleanup(mp, FLUSH_BLOCKS);
13346			}
13347			FREE_LOCK(ump);
13348			if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
13349				break;
13350		}
13351	}
13352	vfs_unbusy(mp);
13353}
13354
13355/*
13356 * If memory utilization has gotten too high, deliberately slow things
13357 * down and speed up the I/O processing.
13358 */
13359static int
13360request_cleanup(mp, resource)
13361	struct mount *mp;
13362	int resource;
13363{
13364	struct thread *td = curthread;
13365	struct ufsmount *ump;
13366
13367	ump = VFSTOUFS(mp);
13368	LOCK_OWNED(ump);
13369	/*
13370	 * We never hold up the filesystem syncer or buf daemon.
13371	 */
13372	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
13373		return (0);
13374	/*
13375	 * First check to see if the work list has gotten backlogged.
13376	 * If it has, co-opt this process to help clean up two entries.
13377	 * Because this process may hold inodes locked, we cannot
13378	 * handle any remove requests that might block on a locked
13379	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
13380	 * to avoid recursively processing the worklist.
13381	 */
13382	if (ump->softdep_on_worklist > max_softdeps / 10) {
13383		td->td_pflags |= TDP_SOFTDEP;
13384		process_worklist_item(mp, 2, LK_NOWAIT);
13385		td->td_pflags &= ~TDP_SOFTDEP;
13386		stat_worklist_push += 2;
13387		return(1);
13388	}
13389	/*
13390	 * Next, we attempt to speed up the syncer process. If that
13391	 * is successful, then we allow the process to continue.
13392	 */
13393	if (softdep_speedup(ump) &&
13394	    resource != FLUSH_BLOCKS_WAIT &&
13395	    resource != FLUSH_INODES_WAIT)
13396		return(0);
13397	/*
13398	 * If we are resource constrained on inode dependencies, try
13399	 * flushing some dirty inodes. Otherwise, we are constrained
13400	 * by file deletions, so try accelerating flushes of directories
13401	 * with removal dependencies. We would like to do the cleanup
13402	 * here, but we probably hold an inode locked at this point and
13403	 * that might deadlock against one that we try to clean. So,
13404	 * the best that we can do is request the syncer daemon to do
13405	 * the cleanup for us.
13406	 */
13407	switch (resource) {
13408
13409	case FLUSH_INODES:
13410	case FLUSH_INODES_WAIT:
13411		ACQUIRE_GBLLOCK(&lk);
13412		stat_ino_limit_push += 1;
13413		req_clear_inodedeps += 1;
13414		FREE_GBLLOCK(&lk);
13415		stat_countp = &stat_ino_limit_hit;
13416		break;
13417
13418	case FLUSH_BLOCKS:
13419	case FLUSH_BLOCKS_WAIT:
13420		ACQUIRE_GBLLOCK(&lk);
13421		stat_blk_limit_push += 1;
13422		req_clear_remove += 1;
13423		FREE_GBLLOCK(&lk);
13424		stat_countp = &stat_blk_limit_hit;
13425		break;
13426
13427	default:
13428		panic("request_cleanup: unknown type");
13429	}
13430	/*
13431	 * Hopefully the syncer daemon will catch up and awaken us.
13432	 * We wait at most tickdelay before proceeding in any case.
13433	 */
13434	ACQUIRE_GBLLOCK(&lk);
13435	FREE_LOCK(ump);
13436	proc_waiting += 1;
13437	if (callout_pending(&softdep_callout) == FALSE)
13438		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13439		    pause_timer, 0);
13440
13441	if ((td->td_pflags & TDP_KTHREAD) == 0)
13442		msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
13443	proc_waiting -= 1;
13444	FREE_GBLLOCK(&lk);
13445	ACQUIRE_LOCK(ump);
13446	return (1);
13447}
13448
13449/*
13450 * Awaken processes pausing in request_cleanup and clear proc_waiting
13451 * to indicate that there is no longer a timer running. Pause_timer
13452 * will be called with the global softdep mutex (&lk) locked.
13453 */
13454static void
13455pause_timer(arg)
13456	void *arg;
13457{
13458
13459	GBLLOCK_OWNED(&lk);
13460	/*
13461	 * The callout_ API has acquired mtx and will hold it around this
13462	 * function call.
13463	 */
13464	*stat_countp += proc_waiting;
13465	wakeup(&proc_waiting);
13466}
13467
13468/*
13469 * If requested, try removing inode or removal dependencies.
13470 */
13471static void
13472check_clear_deps(mp)
13473	struct mount *mp;
13474{
13475
13476	/*
13477	 * If we are suspended, it may be because of our using
13478	 * too many inodedeps, so help clear them out.
13479	 */
13480	if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)
13481		clear_inodedeps(mp);
13482	/*
13483	 * General requests for cleanup of backed up dependencies
13484	 */
13485	ACQUIRE_GBLLOCK(&lk);
13486	if (req_clear_inodedeps) {
13487		req_clear_inodedeps -= 1;
13488		FREE_GBLLOCK(&lk);
13489		clear_inodedeps(mp);
13490		ACQUIRE_GBLLOCK(&lk);
13491		wakeup(&proc_waiting);
13492	}
13493	if (req_clear_remove) {
13494		req_clear_remove -= 1;
13495		FREE_GBLLOCK(&lk);
13496		clear_remove(mp);
13497		ACQUIRE_GBLLOCK(&lk);
13498		wakeup(&proc_waiting);
13499	}
13500	FREE_GBLLOCK(&lk);
13501}
13502
13503/*
13504 * Flush out a directory with at least one removal dependency in an effort to
13505 * reduce the number of dirrem, freefile, and freeblks dependency structures.
13506 */
13507static void
13508clear_remove(mp)
13509	struct mount *mp;
13510{
13511	struct pagedep_hashhead *pagedephd;
13512	struct pagedep *pagedep;
13513	struct ufsmount *ump;
13514	struct vnode *vp;
13515	struct bufobj *bo;
13516	int error, cnt;
13517	ino_t ino;
13518
13519	ump = VFSTOUFS(mp);
13520	LOCK_OWNED(ump);
13521
13522	for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
13523		pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
13524		if (ump->pagedep_nextclean > ump->pagedep_hash_size)
13525			ump->pagedep_nextclean = 0;
13526		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
13527			if (LIST_EMPTY(&pagedep->pd_dirremhd))
13528				continue;
13529			ino = pagedep->pd_ino;
13530			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13531				continue;
13532			FREE_LOCK(ump);
13533
13534			/*
13535			 * Let unmount clear deps
13536			 */
13537			error = vfs_busy(mp, MBF_NOWAIT);
13538			if (error != 0)
13539				goto finish_write;
13540			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13541			     FFSV_FORCEINSMQ);
13542			vfs_unbusy(mp);
13543			if (error != 0) {
13544				softdep_error("clear_remove: vget", error);
13545				goto finish_write;
13546			}
13547			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13548				softdep_error("clear_remove: fsync", error);
13549			bo = &vp->v_bufobj;
13550			BO_LOCK(bo);
13551			drain_output(vp);
13552			BO_UNLOCK(bo);
13553			vput(vp);
13554		finish_write:
13555			vn_finished_write(mp);
13556			ACQUIRE_LOCK(ump);
13557			return;
13558		}
13559	}
13560}
13561
13562/*
13563 * Clear out a block of dirty inodes in an effort to reduce
13564 * the number of inodedep dependency structures.
13565 */
13566static void
13567clear_inodedeps(mp)
13568	struct mount *mp;
13569{
13570	struct inodedep_hashhead *inodedephd;
13571	struct inodedep *inodedep;
13572	struct ufsmount *ump;
13573	struct vnode *vp;
13574	struct fs *fs;
13575	int error, cnt;
13576	ino_t firstino, lastino, ino;
13577
13578	ump = VFSTOUFS(mp);
13579	fs = ump->um_fs;
13580	LOCK_OWNED(ump);
13581	/*
13582	 * Pick a random inode dependency to be cleared.
13583	 * We will then gather up all the inodes in its block
13584	 * that have dependencies and flush them out.
13585	 */
13586	for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
13587		inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
13588		if (ump->inodedep_nextclean > ump->inodedep_hash_size)
13589			ump->inodedep_nextclean = 0;
13590		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
13591			break;
13592	}
13593	if (inodedep == NULL)
13594		return;
13595	/*
13596	 * Find the last inode in the block with dependencies.
13597	 */
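	/*
	 * INOPB(fs) is a power of two, so masking with ~(INOPB(fs) - 1)
	 * rounds id_ino down to the first inode of its inode block.
	 */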
13598	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
13599	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
13600		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
13601			break;
13602	/*
13603	 * Asynchronously push all but the last inode with dependencies.
13604	 * Synchronously push the last inode with dependencies to ensure
13605	 * that the inode block gets written to free up the inodedeps.
13606	 */
13607	for (ino = firstino; ino <= lastino; ino++) {
13608		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13609			continue;
13610		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13611			continue;
13612		FREE_LOCK(ump);
13613		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
13614		if (error != 0) {
13615			vn_finished_write(mp);
13616			ACQUIRE_LOCK(ump);
13617			return;
13618		}
13619		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13620		    FFSV_FORCEINSMQ)) != 0) {
13621			softdep_error("clear_inodedeps: vget", error);
13622			vfs_unbusy(mp);
13623			vn_finished_write(mp);
13624			ACQUIRE_LOCK(ump);
13625			return;
13626		}
13627		vfs_unbusy(mp);
13628		if (ino == lastino) {
13629			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
13630				softdep_error("clear_inodedeps: fsync1", error);
13631		} else {
13632			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13633				softdep_error("clear_inodedeps: fsync2", error);
13634			BO_LOCK(&vp->v_bufobj);
13635			drain_output(vp);
13636			BO_UNLOCK(&vp->v_bufobj);
13637		}
13638		vput(vp);
13639		vn_finished_write(mp);
13640		ACQUIRE_LOCK(ump);
13641	}
13642}
13643
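/*
 * Move the work items on wkhd onto the dependency list of buffer bp so
 * that they are processed when the buffer is written.
 */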
13644void
13645softdep_buf_append(bp, wkhd)
13646	struct buf *bp;
13647	struct workhead *wkhd;
13648{
13649	struct worklist *wk;
13650	struct ufsmount *ump;
13651
13652	if ((wk = LIST_FIRST(wkhd)) == NULL)
13653		return;
13654	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13655	    ("softdep_buf_append called on non-softdep filesystem"));
13656	ump = VFSTOUFS(wk->wk_mp);
13657	ACQUIRE_LOCK(ump);
13658	while ((wk = LIST_FIRST(wkhd)) != NULL) {
13659		WORKLIST_REMOVE(wk);
13660		WORKLIST_INSERT(&bp->b_dep, wk);
13661	}
13662	FREE_LOCK(ump);
13663
13664}
13665
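/*
 * Append the work in wkhd to the buffer holding the inode block for ip,
 * reading that block in if necessary.  If the read fails, the work is
 * discarded via softdep_freework().
 */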
13666void
13667softdep_inode_append(ip, cred, wkhd)
13668	struct inode *ip;
13669	struct ucred *cred;
13670	struct workhead *wkhd;
13671{
13672	struct buf *bp;
13673	struct fs *fs;
13674	int error;
13675
13676	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
13677	    ("softdep_inode_append called on non-softdep filesystem"));
13678	fs = ip->i_fs;
13679	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
13680	    (int)fs->fs_bsize, cred, &bp);
13681	if (error) {
13682		bqrelse(bp);
13683		softdep_freework(wkhd);
13684		return;
13685	}
13686	softdep_buf_append(bp, wkhd);
13687	bqrelse(bp);
13688}
13689
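/*
 * Dispose of a list of journal work items by handing them to
 * handle_jwork() under the per-mount softdep lock.
 */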
13690void
13691softdep_freework(wkhd)
13692	struct workhead *wkhd;
13693{
13694	struct worklist *wk;
13695	struct ufsmount *ump;
13696
13697	if ((wk = LIST_FIRST(wkhd)) == NULL)
13698		return;
13699	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13700	    ("softdep_freework called on non-softdep filesystem"));
13701	ump = VFSTOUFS(wk->wk_mp);
13702	ACQUIRE_LOCK(ump);
13703	handle_jwork(wkhd);
13704	FREE_LOCK(ump);
13705}
13706
13707/*
13708 * Function to determine if the buffer has outstanding dependencies
13709 * that will cause a roll-back if the buffer is written. If wantcount
13710 * is set, return number of dependencies, otherwise just yes or no.
13711 */
13712static int
13713softdep_count_dependencies(bp, wantcount)
13714	struct buf *bp;
13715	int wantcount;
13716{
13717	struct worklist *wk;
13718	struct ufsmount *ump;
13719	struct bmsafemap *bmsafemap;
13720	struct freework *freework;
13721	struct inodedep *inodedep;
13722	struct indirdep *indirdep;
13723	struct freeblks *freeblks;
13724	struct allocindir *aip;
13725	struct pagedep *pagedep;
13726	struct dirrem *dirrem;
13727	struct newblk *newblk;
13728	struct mkdir *mkdir;
13729	struct diradd *dap;
13730	int i, retval;
13731
13732	retval = 0;
13733	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
13734		return (0);
13735	ump = VFSTOUFS(wk->wk_mp);
13736	ACQUIRE_LOCK(ump);
13737	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13738		switch (wk->wk_type) {
13739
13740		case D_INODEDEP:
13741			inodedep = WK_INODEDEP(wk);
13742			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
13743				/* bitmap allocation dependency */
13744				retval += 1;
13745				if (!wantcount)
13746					goto out;
13747			}
13748			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
13749				/* direct block pointer dependency */
13750				retval += 1;
13751				if (!wantcount)
13752					goto out;
13753			}
13754			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
13755				/* ext attr block pointer dependency */
13756				retval += 1;
13757				if (!wantcount)
13758					goto out;
13759			}
13760			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
13761				/* Add reference dependency. */
13762				retval += 1;
13763				if (!wantcount)
13764					goto out;
13765			}
13766			continue;
13767
13768		case D_INDIRDEP:
13769			indirdep = WK_INDIRDEP(wk);
13770
13771			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
13772				/* indirect truncation dependency */
13773				retval += 1;
13774				if (!wantcount)
13775					goto out;
13776			}
13777
13778			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13779				/* indirect block pointer dependency */
13780				retval += 1;
13781				if (!wantcount)
13782					goto out;
13783			}
13784			continue;
13785
13786		case D_PAGEDEP:
13787			pagedep = WK_PAGEDEP(wk);
13788			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
13789				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
13790					/* Journal remove ref dependency. */
13791					retval += 1;
13792					if (!wantcount)
13793						goto out;
13794				}
13795			}
13796			for (i = 0; i < DAHASHSZ; i++) {
13797
13798				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
13799					/* directory entry dependency */
13800					retval += 1;
13801					if (!wantcount)
13802						goto out;
13803				}
13804			}
13805			continue;
13806
13807		case D_BMSAFEMAP:
13808			bmsafemap = WK_BMSAFEMAP(wk);
13809			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
13810				/* Add reference dependency. */
13811				retval += 1;
13812				if (!wantcount)
13813					goto out;
13814			}
13815			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
13816				/* Allocate block dependency. */
13817				retval += 1;
13818				if (!wantcount)
13819					goto out;
13820			}
13821			continue;
13822
13823		case D_FREEBLKS:
13824			freeblks = WK_FREEBLKS(wk);
13825			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
13826				/* Freeblk journal dependency. */
13827				retval += 1;
13828				if (!wantcount)
13829					goto out;
13830			}
13831			continue;
13832
13833		case D_ALLOCDIRECT:
13834		case D_ALLOCINDIR:
13835			newblk = WK_NEWBLK(wk);
13836			if (newblk->nb_jnewblk) {
13837				/* Journal allocate dependency. */
13838				retval += 1;
13839				if (!wantcount)
13840					goto out;
13841			}
13842			continue;
13843
13844		case D_MKDIR:
13845			mkdir = WK_MKDIR(wk);
13846			if (mkdir->md_jaddref) {
13847				/* Journal reference dependency. */
13848				retval += 1;
13849				if (!wantcount)
13850					goto out;
13851			}
13852			continue;
13853
13854		case D_FREEWORK:
13855		case D_FREEDEP:
13856		case D_JSEGDEP:
13857		case D_JSEG:
13858		case D_SBDEP:
13859			/* never a dependency on these blocks */
13860			continue;
13861
13862		default:
13863			panic("softdep_count_dependencies: Unexpected type %s",
13864			    TYPENAME(wk->wk_type));
13865			/* NOTREACHED */
13866		}
13867	}
13868out:
13869	FREE_LOCK(ump);
13870	return (retval);
13871}
13872
13873/*
13874 * Acquire exclusive access to a buffer.
13875 * Must be called with the lock parameter (an rwlock) held.
13876 * Return acquired buffer or NULL on failure.
13877 */
13878static struct buf *
13879getdirtybuf(bp, lock, waitfor)
13880	struct buf *bp;
13881	struct rwlock *lock;
13882	int waitfor;
13883{
13884	int error;
13885
13886	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
13887		if (waitfor != MNT_WAIT)
13888			return (NULL);
13889		error = BUF_LOCK(bp,
13890		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
13891		/*
13892		 * Even if we successfully acquire bp here, we have dropped
13893		 * the lock, which may violate our guarantee.
13894		 */
13895		if (error == 0)
13896			BUF_UNLOCK(bp);
13897		else if (error != ENOLCK)
13898			panic("getdirtybuf: inconsistent lock: %d", error);
13899		rw_wlock(lock);
13900		return (NULL);
13901	}
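	/*
	 * If a background write of this buffer is in progress, wait for
	 * it to complete when the caller can sleep (returning NULL so the
	 * caller retries); otherwise fail immediately.
	 */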
13902	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13903		if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
13904			rw_wunlock(lock);
13905			BO_LOCK(bp->b_bufobj);
13906			BUF_UNLOCK(bp);
13907			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13908				bp->b_vflags |= BV_BKGRDWAIT;
13909				msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
13910				       PRIBIO | PDROP, "getbuf", 0);
13911			} else
13912				BO_UNLOCK(bp->b_bufobj);
13913			rw_wlock(lock);
13914			return (NULL);
13915		}
13916		BUF_UNLOCK(bp);
13917		if (waitfor != MNT_WAIT)
13918			return (NULL);
13919		/*
13920		 * The lock argument must be the bufobj lock of bp's vnode
13921		 * in this case.
13922		 */
13923#ifdef	DEBUG_VFS_LOCKS
13924		if (bp->b_vp->v_type != VCHR)
13925			ASSERT_BO_WLOCKED(bp->b_bufobj);
13926#endif
13927		bp->b_vflags |= BV_BKGRDWAIT;
13928		rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
13929		return (NULL);
13930	}
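	/*
	 * Only delayed-write (dirty) buffers need flushing; skip clean
	 * ones.
	 */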
13931	if ((bp->b_flags & B_DELWRI) == 0) {
13932		BUF_UNLOCK(bp);
13933		return (NULL);
13934	}
13935	bremfree(bp);
13936	return (bp);
13937}
13938
13939
13940/*
13941 * Check if it is safe to suspend the file system now.  On entry,
13942 * the bufobj lock for devvp should be held.  Return 0 with
13943 * the mount interlock held if the file system can be suspended now,
13944 * otherwise return EAGAIN with the mount interlock held.
13945 */
13946int
13947softdep_check_suspend(struct mount *mp,
13948		      struct vnode *devvp,
13949		      int softdep_depcnt,
13950		      int softdep_accdepcnt,
13951		      int secondary_writes,
13952		      int secondary_accwrites)
13953{
13954	struct bufobj *bo;
13955	struct ufsmount *ump;
13956	struct inodedep *inodedep;
13957	int error, unlinked;
13958
13959	bo = &devvp->v_bufobj;
13960	ASSERT_BO_WLOCKED(bo);
13961
13962	/*
13963	 * If we are not running with soft updates, then we need only
13964	 * deal with secondary writes as we try to suspend.
13965	 */
13966	if (MOUNTEDSOFTDEP(mp) == 0) {
13967		MNT_ILOCK(mp);
13968		while (mp->mnt_secondary_writes != 0) {
13969			BO_UNLOCK(bo);
13970			msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
13971			    (PUSER - 1) | PDROP, "secwr", 0);
13972			BO_LOCK(bo);
13973			MNT_ILOCK(mp);
13974		}
13975
13976		/*
13977		 * Reasons for needing more work before suspend:
13978		 * - Dirty buffers on devvp.
13979		 * - Secondary writes occurred after start of vnode sync loop
13980		 */
13981		error = 0;
13982		if (bo->bo_numoutput > 0 ||
13983		    bo->bo_dirty.bv_cnt > 0 ||
13984		    secondary_writes != 0 ||
13985		    mp->mnt_secondary_writes != 0 ||
13986		    secondary_accwrites != mp->mnt_secondary_accwrites)
13987			error = EAGAIN;
13988		BO_UNLOCK(bo);
13989		return (error);
13990	}
13991
13992	/*
13993	 * If we are running with soft updates, then we need to coordinate
13994	 * with them as we try to suspend.
13995	 */
13996	ump = VFSTOUFS(mp);
13997	for (;;) {
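		/*
		 * Lock order does not allow sleeping on the softdep lock
		 * while the bufobj lock is held, so drop it, wait for the
		 * softdep lock by acquiring and releasing it, and retry.
		 */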
13998		if (!TRY_ACQUIRE_LOCK(ump)) {
13999			BO_UNLOCK(bo);
14000			ACQUIRE_LOCK(ump);
14001			FREE_LOCK(ump);
14002			BO_LOCK(bo);
14003			continue;
14004		}
14005		MNT_ILOCK(mp);
14006		if (mp->mnt_secondary_writes != 0) {
14007			FREE_LOCK(ump);
14008			BO_UNLOCK(bo);
14009			msleep(&mp->mnt_secondary_writes,
14010			       MNT_MTX(mp),
14011			       (PUSER - 1) | PDROP, "secwr", 0);
14012			BO_LOCK(bo);
14013			continue;
14014		}
14015		break;
14016	}
14017
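	/*
	 * With SU+J, fully unlinked inodes awaiting reclamation may remain
	 * on the unlinked list; count them so that the dependency checks
	 * below can allow for them.
	 */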
14018	unlinked = 0;
14019	if (MOUNTEDSUJ(mp)) {
14020		for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
14021		    inodedep != NULL;
14022		    inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
14023			if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
14024			    UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
14025			    UNLINKONLIST) ||
14026			    !check_inodedep_free(inodedep))
14027				continue;
14028			unlinked++;
14029		}
14030	}
14031
14032	/*
14033	 * Reasons for needing more work before suspend:
14034	 * - Dirty buffers on devvp.
14035	 * - Softdep activity occurred after start of vnode sync loop
14036	 * - Secondary writes occurred after start of vnode sync loop
14037	 */
14038	error = 0;
14039	if (bo->bo_numoutput > 0 ||
14040	    bo->bo_dirty.bv_cnt > 0 ||
14041	    softdep_depcnt != unlinked ||
14042	    ump->softdep_deps != unlinked ||
14043	    softdep_accdepcnt != ump->softdep_accdeps ||
14044	    secondary_writes != 0 ||
14045	    mp->mnt_secondary_writes != 0 ||
14046	    secondary_accwrites != mp->mnt_secondary_accwrites)
14047		error = EAGAIN;
14048	FREE_LOCK(ump);
14049	BO_UNLOCK(bo);
14050	return (error);
14051}
14052
14053
14054/*
14055 * Get the number of dependency structures for the file system, both
14056 * the current number and the total number allocated.  These will
14057 * later be used to detect that softdep processing has occurred.
14058 */
14059void
14060softdep_get_depcounts(struct mount *mp,
14061		      int *softdep_depsp,
14062		      int *softdep_accdepsp)
14063{
14064	struct ufsmount *ump;
14065
14066	if (MOUNTEDSOFTDEP(mp) == 0) {
14067		*softdep_depsp = 0;
14068		*softdep_accdepsp = 0;
14069		return;
14070	}
14071	ump = VFSTOUFS(mp);
14072	ACQUIRE_LOCK(ump);
14073	*softdep_depsp = ump->softdep_deps;
14074	*softdep_accdepsp = ump->softdep_accdeps;
14075	FREE_LOCK(ump);
14076}
14077
14078/*
14079 * Wait for pending output on a vnode to complete.
14080 * Must be called with the vnode lock and the bufobj lock held.
14081 *
14082 * XXX: Should just be a call to bufobj_wwait().
14083 */
14084static void
14085drain_output(vp)
14086	struct vnode *vp;
14087{
14088	struct bufobj *bo;
14089
14090	bo = &vp->v_bufobj;
14091	ASSERT_VOP_LOCKED(vp, "drain_output");
14092	ASSERT_BO_WLOCKED(bo);
14093
14094	while (bo->bo_numoutput) {
14095		bo->bo_flag |= BO_WWAIT;
14096		msleep((caddr_t)&bo->bo_numoutput,
14097		    BO_LOCKPTR(bo), PRIBIO + 1, "drainvp", 0);
14098	}
14099}
14100
14101/*
14102 * Called whenever a buffer that is being invalidated or reallocated
14103 * contains dependencies. This should only happen if an I/O error has
14104 * occurred. The routine is called with the buffer locked.
14105 */
14106static void
14107softdep_deallocate_dependencies(bp)
14108	struct buf *bp;
14109{
14110
14111	if ((bp->b_ioflags & BIO_ERROR) == 0)
14112		panic("softdep_deallocate_dependencies: dangling deps");
14113	if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
14114		softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
14115	else
14116		printf("softdep_deallocate_dependencies: "
14117		    "got error %d while accessing filesystem\n", bp->b_error);
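	/*
	 * Only a vanished device (ENXIO) excuses dangling dependencies;
	 * any other error is fatal.
	 */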
14118	if (bp->b_error != ENXIO)
14119		panic("softdep_deallocate_dependencies: unrecovered I/O error");
14120}
14121
14122/*
14123 * Function to handle asynchronous write errors in the filesystem.
14124 */
14125static void
14126softdep_error(func, error)
14127	char *func;
14128	int error;
14129{
14130
14131	/* XXX should do something better! */
14132	printf("%s: got error %d while accessing filesystem\n", func, error);
14133}
14134
14135#ifdef DDB
14136
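/*
 * DDB commands defined below: "show inodedep <addr>", "show inodedeps
 * <ump>", "show worklist <addr>", "show workhead <addr>" and
 * "show mkdirs <addr>".
 */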
14137static void
14138inodedep_print(struct inodedep *inodedep, int verbose)
14139{
14140	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
14141	    " saveino %p\n",
14142	    inodedep, inodedep->id_fs, inodedep->id_state,
14143	    (intmax_t)inodedep->id_ino,
14144	    (intmax_t)fsbtodb(inodedep->id_fs,
14145	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
14146	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
14147	    inodedep->id_savedino1);
14148
14149	if (verbose == 0)
14150		return;
14151
14152	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
14153	    "mkdiradd %p\n",
14154	    LIST_FIRST(&inodedep->id_pendinghd),
14155	    LIST_FIRST(&inodedep->id_bufwait),
14156	    LIST_FIRST(&inodedep->id_inowait),
14157	    TAILQ_FIRST(&inodedep->id_inoreflst),
14158	    inodedep->id_mkdiradd);
14159	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
14160	    TAILQ_FIRST(&inodedep->id_inoupdt),
14161	    TAILQ_FIRST(&inodedep->id_newinoupdt),
14162	    TAILQ_FIRST(&inodedep->id_extupdt),
14163	    TAILQ_FIRST(&inodedep->id_newextupdt));
14164}
14165
14166DB_SHOW_COMMAND(inodedep, db_show_inodedep)
14167{
14168
14169	if (have_addr == 0) {
14170		db_printf("Address required\n");
14171		return;
14172	}
14173	inodedep_print((struct inodedep*)addr, 1);
14174}
14175
14176DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
14177{
14178	struct inodedep_hashhead *inodedephd;
14179	struct inodedep *inodedep;
14180	struct ufsmount *ump;
14181	int cnt;
14182
14183	if (have_addr == 0) {
14184		db_printf("Address required\n");
14185		return;
14186	}
14187	ump = (struct ufsmount *)addr;
14188	for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
14189		inodedephd = &ump->inodedep_hashtbl[cnt];
14190		LIST_FOREACH(inodedep, inodedephd, id_hash) {
14191			inodedep_print(inodedep, 0);
14192		}
14193	}
14194}
14195
14196DB_SHOW_COMMAND(worklist, db_show_worklist)
14197{
14198	struct worklist *wk;
14199
14200	if (have_addr == 0) {
14201		db_printf("Address required\n");
14202		return;
14203	}
14204	wk = (struct worklist *)addr;
14205	db_printf("worklist: %p type %s state 0x%X\n",
14206	    wk, TYPENAME(wk->wk_type), wk->wk_state);
14207}
14208
14209DB_SHOW_COMMAND(workhead, db_show_workhead)
14210{
14211	struct workhead *wkhd;
14212	struct worklist *wk;
14213	int i;
14214
14215	if (have_addr == 0) {
14216		db_printf("Address required\n");
14217		return;
14218	}
14219	wkhd = (struct workhead *)addr;
14220	wk = LIST_FIRST(wkhd);
14221	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
14222		db_printf("worklist: %p type %s state 0x%X\n",
14223		    wk, TYPENAME(wk->wk_type), wk->wk_state);
14224	if (i == 100)
14225		db_printf("workhead overflow");
14226	db_printf("\n");
14227}
14228
14229
14230DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
14231{
14232	struct mkdirlist *mkdirlisthd;
14233	struct jaddref *jaddref;
14234	struct diradd *diradd;
14235	struct mkdir *mkdir;
14236
14237	if (have_addr == 0) {
14238		db_printf("Address required\n");
14239		return;
14240	}
14241	mkdirlisthd = (struct mkdirlist *)addr;
14242	LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
14243		diradd = mkdir->md_diradd;
14244		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
14245		    mkdir, mkdir->md_state, diradd, diradd->da_state);
14246		if ((jaddref = mkdir->md_jaddref) != NULL)
14247			db_printf(" jaddref %p jaddref state 0x%X",
14248			    jaddref, jaddref->ja_state);
14249		db_printf("\n");
14250	}
14251}
14252
14253/* exported to ffs_vfsops.c */
14254extern void db_print_ffs(struct ufsmount *ump);
14255void
14256db_print_ffs(struct ufsmount *ump)
14257{
14258	db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n",
14259	    ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname,
14260	    ump->um_devvp, ump->um_fs, ump->softdep_on_worklist,
14261	    ump->softdep_deps, ump->softdep_req);
14262}
14263
14264#endif /* DDB */
14265
14266#endif /* SOFTUPDATES */
14267