ffs_softdep.c revision 306177
1/*-
2 * Copyright 1998, 2000 Marshall Kirk McKusick.
3 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
4 * All rights reserved.
5 *
6 * The soft updates code is derived from the appendix of a University
7 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
8 * "Soft Updates: A Solution to the Metadata Update Problem in File
9 * Systems", CSE-TR-254-95, August 1995).
10 *
11 * Further information about soft updates can be obtained from:
12 *
13 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
14 *	1614 Oxford Street		mckusick@mckusick.com
15 *	Berkeley, CA 94709-1608		+1-510-843-9542
16 *	USA
17 *
18 * Redistribution and use in source and binary forms, with or without
19 * modification, are permitted provided that the following conditions
20 * are met:
21 *
22 * 1. Redistributions of source code must retain the above copyright
23 *    notice, this list of conditions and the following disclaimer.
24 * 2. Redistributions in binary form must reproduce the above copyright
25 *    notice, this list of conditions and the following disclaimer in the
26 *    documentation and/or other materials provided with the distribution.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
29 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
32 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
33 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
34 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
35 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
36 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
37 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 *
39 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
40 */
41
42#include <sys/cdefs.h>
43__FBSDID("$FreeBSD: stable/10/sys/ufs/ffs/ffs_softdep.c 306177 2016-09-22 10:46:08Z kib $");
44
45#include "opt_ffs.h"
46#include "opt_quota.h"
47#include "opt_ddb.h"
48
49/*
50 * For now we want the safety net that the DEBUG flag provides.
51 */
52#ifndef DEBUG
53#define DEBUG
54#endif
55
56#include <sys/param.h>
57#include <sys/kernel.h>
58#include <sys/systm.h>
59#include <sys/bio.h>
60#include <sys/buf.h>
61#include <sys/kdb.h>
62#include <sys/kthread.h>
63#include <sys/ktr.h>
64#include <sys/limits.h>
65#include <sys/lock.h>
66#include <sys/malloc.h>
67#include <sys/mount.h>
68#include <sys/mutex.h>
69#include <sys/namei.h>
70#include <sys/priv.h>
71#include <sys/proc.h>
72#include <sys/rwlock.h>
73#include <sys/stat.h>
74#include <sys/sysctl.h>
75#include <sys/syslog.h>
76#include <sys/vnode.h>
77#include <sys/conf.h>
78
79#include <ufs/ufs/dir.h>
80#include <ufs/ufs/extattr.h>
81#include <ufs/ufs/quota.h>
82#include <ufs/ufs/inode.h>
83#include <ufs/ufs/ufsmount.h>
84#include <ufs/ffs/fs.h>
85#include <ufs/ffs/softdep.h>
86#include <ufs/ffs/ffs_extern.h>
87#include <ufs/ufs/ufs_extern.h>
88
89#include <vm/vm.h>
90#include <vm/vm_extern.h>
91#include <vm/vm_object.h>
92
93#include <geom/geom.h>
94
95#include <ddb/ddb.h>
96
97#define	KTR_SUJ	0	/* Define to KTR_SPARE. */
98
99#ifndef SOFTUPDATES
100
101int
102softdep_flushfiles(oldmnt, flags, td)
103	struct mount *oldmnt;
104	int flags;
105	struct thread *td;
106{
107
108	panic("softdep_flushfiles called");
109}
110
111int
112softdep_mount(devvp, mp, fs, cred)
113	struct vnode *devvp;
114	struct mount *mp;
115	struct fs *fs;
116	struct ucred *cred;
117{
118
119	return (0);
120}
121
122void
123softdep_initialize()
124{
125
126	return;
127}
128
129void
130softdep_uninitialize()
131{
132
133	return;
134}
135
136void
137softdep_unmount(mp)
138	struct mount *mp;
139{
140
141	panic("softdep_unmount called");
142}
143
144void
145softdep_setup_sbupdate(ump, fs, bp)
146	struct ufsmount *ump;
147	struct fs *fs;
148	struct buf *bp;
149{
150
151	panic("softdep_setup_sbupdate called");
152}
153
154void
155softdep_setup_inomapdep(bp, ip, newinum, mode)
156	struct buf *bp;
157	struct inode *ip;
158	ino_t newinum;
159	int mode;
160{
161
162	panic("softdep_setup_inomapdep called");
163}
164
165void
166softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
167	struct buf *bp;
168	struct mount *mp;
169	ufs2_daddr_t newblkno;
170	int frags;
171	int oldfrags;
172{
173
174	panic("softdep_setup_blkmapdep called");
175}
176
177void
178softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
179	struct inode *ip;
180	ufs_lbn_t lbn;
181	ufs2_daddr_t newblkno;
182	ufs2_daddr_t oldblkno;
183	long newsize;
184	long oldsize;
185	struct buf *bp;
186{
187
188	panic("softdep_setup_allocdirect called");
189}
190
191void
192softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
193	struct inode *ip;
194	ufs_lbn_t lbn;
195	ufs2_daddr_t newblkno;
196	ufs2_daddr_t oldblkno;
197	long newsize;
198	long oldsize;
199	struct buf *bp;
200{
201
202	panic("softdep_setup_allocext called");
203}
204
205void
206softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
207	struct inode *ip;
208	ufs_lbn_t lbn;
209	struct buf *bp;
210	int ptrno;
211	ufs2_daddr_t newblkno;
212	ufs2_daddr_t oldblkno;
213	struct buf *nbp;
214{
215
216	panic("softdep_setup_allocindir_page called");
217}
218
219void
220softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
221	struct buf *nbp;
222	struct inode *ip;
223	struct buf *bp;
224	int ptrno;
225	ufs2_daddr_t newblkno;
226{
227
228	panic("softdep_setup_allocindir_meta called");
229}
230
231void
232softdep_journal_freeblocks(ip, cred, length, flags)
233	struct inode *ip;
234	struct ucred *cred;
235	off_t length;
236	int flags;
237{
238
239	panic("softdep_journal_freeblocks called");
240}
241
242void
243softdep_journal_fsync(ip)
244	struct inode *ip;
245{
246
247	panic("softdep_journal_fsync called");
248}
249
250void
251softdep_setup_freeblocks(ip, length, flags)
252	struct inode *ip;
253	off_t length;
254	int flags;
255{
256
257	panic("softdep_setup_freeblocks called");
258}
259
260void
261softdep_freefile(pvp, ino, mode)
262		struct vnode *pvp;
263		ino_t ino;
264		int mode;
265{
266
267	panic("softdep_freefile called");
268}
269
270int
271softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
272	struct buf *bp;
273	struct inode *dp;
274	off_t diroffset;
275	ino_t newinum;
276	struct buf *newdirbp;
277	int isnewblk;
278{
279
280	panic("softdep_setup_directory_add called");
281}
282
283void
284softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
285	struct buf *bp;
286	struct inode *dp;
287	caddr_t base;
288	caddr_t oldloc;
289	caddr_t newloc;
290	int entrysize;
291{
292
293	panic("softdep_change_directoryentry_offset called");
294}
295
296void
297softdep_setup_remove(bp, dp, ip, isrmdir)
298	struct buf *bp;
299	struct inode *dp;
300	struct inode *ip;
301	int isrmdir;
302{
303
304	panic("softdep_setup_remove called");
305}
306
307void
308softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
309	struct buf *bp;
310	struct inode *dp;
311	struct inode *ip;
312	ino_t newinum;
313	int isrmdir;
314{
315
316	panic("softdep_setup_directory_change called");
317}
318
319void
320softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
321	struct mount *mp;
322	struct buf *bp;
323	ufs2_daddr_t blkno;
324	int frags;
325	struct workhead *wkhd;
326{
327
328	panic("%s called", __FUNCTION__);
329}
330
331void
332softdep_setup_inofree(mp, bp, ino, wkhd)
333	struct mount *mp;
334	struct buf *bp;
335	ino_t ino;
336	struct workhead *wkhd;
337{
338
339	panic("%s called", __FUNCTION__);
340}
341
342void
343softdep_setup_unlink(dp, ip)
344	struct inode *dp;
345	struct inode *ip;
346{
347
348	panic("%s called", __FUNCTION__);
349}
350
351void
352softdep_setup_link(dp, ip)
353	struct inode *dp;
354	struct inode *ip;
355{
356
357	panic("%s called", __FUNCTION__);
358}
359
360void
361softdep_revert_link(dp, ip)
362	struct inode *dp;
363	struct inode *ip;
364{
365
366	panic("%s called", __FUNCTION__);
367}
368
369void
370softdep_setup_rmdir(dp, ip)
371	struct inode *dp;
372	struct inode *ip;
373{
374
375	panic("%s called", __FUNCTION__);
376}
377
378void
379softdep_revert_rmdir(dp, ip)
380	struct inode *dp;
381	struct inode *ip;
382{
383
384	panic("%s called", __FUNCTION__);
385}
386
387void
388softdep_setup_create(dp, ip)
389	struct inode *dp;
390	struct inode *ip;
391{
392
393	panic("%s called", __FUNCTION__);
394}
395
396void
397softdep_revert_create(dp, ip)
398	struct inode *dp;
399	struct inode *ip;
400{
401
402	panic("%s called", __FUNCTION__);
403}
404
405void
406softdep_setup_mkdir(dp, ip)
407	struct inode *dp;
408	struct inode *ip;
409{
410
411	panic("%s called", __FUNCTION__);
412}
413
414void
415softdep_revert_mkdir(dp, ip)
416	struct inode *dp;
417	struct inode *ip;
418{
419
420	panic("%s called", __FUNCTION__);
421}
422
423void
424softdep_setup_dotdot_link(dp, ip)
425	struct inode *dp;
426	struct inode *ip;
427{
428
429	panic("%s called", __FUNCTION__);
430}
431
432int
433softdep_prealloc(vp, waitok)
434	struct vnode *vp;
435	int waitok;
436{
437
438	panic("%s called", __FUNCTION__);
439}
440
441int
442softdep_journal_lookup(mp, vpp)
443	struct mount *mp;
444	struct vnode **vpp;
445{
446
447	return (ENOENT);
448}
449
450void
451softdep_change_linkcnt(ip)
452	struct inode *ip;
453{
454
455	panic("softdep_change_linkcnt called");
456}
457
458void
459softdep_load_inodeblock(ip)
460	struct inode *ip;
461{
462
463	panic("softdep_load_inodeblock called");
464}
465
466void
467softdep_update_inodeblock(ip, bp, waitfor)
468	struct inode *ip;
469	struct buf *bp;
470	int waitfor;
471{
472
473	panic("softdep_update_inodeblock called");
474}
475
476int
477softdep_fsync(vp)
478	struct vnode *vp;	/* the "in_core" copy of the inode */
479{
480
481	return (0);
482}
483
484void
485softdep_fsync_mountdev(vp)
486	struct vnode *vp;
487{
488
489	return;
490}
491
492int
493softdep_flushworklist(oldmnt, countp, td)
494	struct mount *oldmnt;
495	int *countp;
496	struct thread *td;
497{
498
499	*countp = 0;
500	return (0);
501}
502
503int
504softdep_sync_metadata(struct vnode *vp)
505{
506
507	panic("softdep_sync_metadata called");
508}
509
510int
511softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
512{
513
514	panic("softdep_sync_buf called");
515}
516
517int
518softdep_slowdown(vp)
519	struct vnode *vp;
520{
521
522	panic("softdep_slowdown called");
523}
524
525int
526softdep_request_cleanup(fs, vp, cred, resource)
527	struct fs *fs;
528	struct vnode *vp;
529	struct ucred *cred;
530	int resource;
531{
532
533	return (0);
534}
535
536int
537softdep_check_suspend(struct mount *mp,
538		      struct vnode *devvp,
539		      int softdep_depcnt,
540		      int softdep_accdepcnt,
541		      int secondary_writes,
542		      int secondary_accwrites)
543{
544	struct bufobj *bo;
545	int error;
546
547	(void) softdep_depcnt,
548	(void) softdep_accdepcnt;
549
550	bo = &devvp->v_bufobj;
551	ASSERT_BO_WLOCKED(bo);
552
553	MNT_ILOCK(mp);
554	while (mp->mnt_secondary_writes != 0) {
555		BO_UNLOCK(bo);
556		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
557		    (PUSER - 1) | PDROP, "secwr", 0);
558		BO_LOCK(bo);
559		MNT_ILOCK(mp);
560	}
561
562	/*
563	 * Reasons for needing more work before suspend:
564	 * - Dirty buffers on devvp.
565	 * - Secondary writes occurred after start of vnode sync loop
566	 */
567	error = 0;
568	if (bo->bo_numoutput > 0 ||
569	    bo->bo_dirty.bv_cnt > 0 ||
570	    secondary_writes != 0 ||
571	    mp->mnt_secondary_writes != 0 ||
572	    secondary_accwrites != mp->mnt_secondary_accwrites)
573		error = EAGAIN;
574	BO_UNLOCK(bo);
575	return (error);
576}
577
578void
579softdep_get_depcounts(struct mount *mp,
580		      int *softdepactivep,
581		      int *softdepactiveaccp)
582{
583	(void) mp;
584	*softdepactivep = 0;
585	*softdepactiveaccp = 0;
586}
587
588void
589softdep_buf_append(bp, wkhd)
590	struct buf *bp;
591	struct workhead *wkhd;
592{
593
594	panic("softdep_buf_appendwork called");
595}
596
597void
598softdep_inode_append(ip, cred, wkhd)
599	struct inode *ip;
600	struct ucred *cred;
601	struct workhead *wkhd;
602{
603
604	panic("softdep_inode_appendwork called");
605}
606
607void
608softdep_freework(wkhd)
609	struct workhead *wkhd;
610{
611
612	panic("softdep_freework called");
613}
614
615#else
616
617FEATURE(softupdates, "FFS soft-updates support");
618
619static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW, 0,
620    "soft updates stats");
621static SYSCTL_NODE(_debug_softdep, OID_AUTO, total, CTLFLAG_RW, 0,
622    "total dependencies allocated");
623static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse, CTLFLAG_RW, 0,
624    "high use dependencies allocated");
625static SYSCTL_NODE(_debug_softdep, OID_AUTO, current, CTLFLAG_RW, 0,
626    "current dependencies allocated");
627static SYSCTL_NODE(_debug_softdep, OID_AUTO, write, CTLFLAG_RW, 0,
628    "current dependencies written");
629
630unsigned long dep_current[D_LAST + 1];
631unsigned long dep_highuse[D_LAST + 1];
632unsigned long dep_total[D_LAST + 1];
633unsigned long dep_write[D_LAST + 1];
634
635#define	SOFTDEP_TYPE(type, str, long)					\
636    static MALLOC_DEFINE(M_ ## type, #str, long);			\
637    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
638	&dep_total[D_ ## type], 0, "");					\
639    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
640	&dep_current[D_ ## type], 0, "");				\
641    SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, 	\
642	&dep_highuse[D_ ## type], 0, "");				\
643    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
644	&dep_write[D_ ## type], 0, "");
645
646SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
647SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
648SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
649    "Block or frag allocated from cyl group map");
650SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
651SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
652SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
653SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
654SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
655SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
656SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
657SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
658SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
659SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
660SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
661SOFTDEP_TYPE(FREEWORK, freework, "free an inode block");
662SOFTDEP_TYPE(FREEDEP, freedep, "track a block free");
663SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
664SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
665SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
666SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
667SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
668SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
669SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
670SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
671SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
672SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
673SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
674
675static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
676
677static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
678static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
679static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
680
681#define M_SOFTDEP_FLAGS	(M_WAITOK)
682
683/*
684 * translate from workitem type to memory type
685 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
686 */
687static struct malloc_type *memtype[] = {
688	M_PAGEDEP,
689	M_INODEDEP,
690	M_BMSAFEMAP,
691	M_NEWBLK,
692	M_ALLOCDIRECT,
693	M_INDIRDEP,
694	M_ALLOCINDIR,
695	M_FREEFRAG,
696	M_FREEBLKS,
697	M_FREEFILE,
698	M_DIRADD,
699	M_MKDIR,
700	M_DIRREM,
701	M_NEWDIRBLK,
702	M_FREEWORK,
703	M_FREEDEP,
704	M_JADDREF,
705	M_JREMREF,
706	M_JMVREF,
707	M_JNEWBLK,
708	M_JFREEBLK,
709	M_JFREEFRAG,
710	M_JSEG,
711	M_JSEGDEP,
712	M_SBDEP,
713	M_JTRUNC,
714	M_JFSYNC,
715	M_SENTINEL
716};
717
718#define DtoM(type) (memtype[type])
719
720/*
721 * Names of malloc types.
722 */
723#define TYPENAME(type)  \
724	((unsigned)(type) <= D_LAST ? memtype[type]->ks_shortdesc : "???")
725/*
726 * End system adaptation definitions.
727 */
728
729#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
730#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
731
732/*
733 * Internal function prototypes.
734 */
735static	void check_clear_deps(struct mount *);
736static	void softdep_error(char *, int);
737static	int softdep_process_worklist(struct mount *, int);
738static	int softdep_waitidle(struct mount *, int);
739static	void drain_output(struct vnode *);
740static	struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
741static	int check_inodedep_free(struct inodedep *);
742static	void clear_remove(struct mount *);
743static	void clear_inodedeps(struct mount *);
744static	void unlinked_inodedep(struct mount *, struct inodedep *);
745static	void clear_unlinked_inodedep(struct inodedep *);
746static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
747static	int flush_pagedep_deps(struct vnode *, struct mount *,
748	    struct diraddhd *);
749static	int free_pagedep(struct pagedep *);
750static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
751static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
752static	int flush_deplist(struct allocdirectlst *, int, int *);
753static	int sync_cgs(struct mount *, int);
754static	int handle_written_filepage(struct pagedep *, struct buf *);
755static	int handle_written_sbdep(struct sbdep *, struct buf *);
756static	void initiate_write_sbdep(struct sbdep *);
757static	void diradd_inode_written(struct diradd *, struct inodedep *);
758static	int handle_written_indirdep(struct indirdep *, struct buf *,
759	    struct buf**);
760static	int handle_written_inodeblock(struct inodedep *, struct buf *);
761static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
762	    uint8_t *);
763static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *);
764static	void handle_written_jaddref(struct jaddref *);
765static	void handle_written_jremref(struct jremref *);
766static	void handle_written_jseg(struct jseg *, struct buf *);
767static	void handle_written_jnewblk(struct jnewblk *);
768static	void handle_written_jblkdep(struct jblkdep *);
769static	void handle_written_jfreefrag(struct jfreefrag *);
770static	void complete_jseg(struct jseg *);
771static	void complete_jsegs(struct jseg *);
772static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
773static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
774static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
775static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
776static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
777static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
778static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
779static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
780static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
781static	inline void inoref_write(struct inoref *, struct jseg *,
782	    struct jrefrec *);
783static	void handle_allocdirect_partdone(struct allocdirect *,
784	    struct workhead *);
785static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
786	    struct workhead *);
787static	void indirdep_complete(struct indirdep *);
788static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
789static	void indirblk_insert(struct freework *);
790static	void indirblk_remove(struct freework *);
791static	void handle_allocindir_partdone(struct allocindir *);
792static	void initiate_write_filepage(struct pagedep *, struct buf *);
793static	void initiate_write_indirdep(struct indirdep*, struct buf *);
794static	void handle_written_mkdir(struct mkdir *, int);
795static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
796	    uint8_t *);
797static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
798static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
799static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
800static	void handle_workitem_freefile(struct freefile *);
801static	int handle_workitem_remove(struct dirrem *, int);
802static	struct dirrem *newdirrem(struct buf *, struct inode *,
803	    struct inode *, int, struct dirrem **);
804static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
805	    struct buf *);
806static	void cancel_indirdep(struct indirdep *, struct buf *,
807	    struct freeblks *);
808static	void free_indirdep(struct indirdep *);
809static	void free_diradd(struct diradd *, struct workhead *);
810static	void merge_diradd(struct inodedep *, struct diradd *);
811static	void complete_diradd(struct diradd *);
812static	struct diradd *diradd_lookup(struct pagedep *, int);
813static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
814	    struct jremref *);
815static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
816	    struct jremref *);
817static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
818	    struct jremref *, struct jremref *);
819static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
820	    struct jremref *);
821static	void cancel_allocindir(struct allocindir *, struct buf *bp,
822	    struct freeblks *, int);
823static	int setup_trunc_indir(struct freeblks *, struct inode *,
824	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
825static	void complete_trunc_indir(struct freework *);
826static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
827	    int);
828static	void complete_mkdir(struct mkdir *);
829static	void free_newdirblk(struct newdirblk *);
830static	void free_jremref(struct jremref *);
831static	void free_jaddref(struct jaddref *);
832static	void free_jsegdep(struct jsegdep *);
833static	void free_jsegs(struct jblocks *);
834static	void rele_jseg(struct jseg *);
835static	void free_jseg(struct jseg *, struct jblocks *);
836static	void free_jnewblk(struct jnewblk *);
837static	void free_jblkdep(struct jblkdep *);
838static	void free_jfreefrag(struct jfreefrag *);
839static	void free_freedep(struct freedep *);
840static	void journal_jremref(struct dirrem *, struct jremref *,
841	    struct inodedep *);
842static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
843static	int cancel_jaddref(struct jaddref *, struct inodedep *,
844	    struct workhead *);
845static	void cancel_jfreefrag(struct jfreefrag *);
846static	inline void setup_freedirect(struct freeblks *, struct inode *,
847	    int, int);
848static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
849static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
850	    ufs_lbn_t, int);
851static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
852static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
853static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
854static	ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
855static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
856static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
857	    int, int);
858static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
859static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
860static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
861static	void newblk_freefrag(struct newblk*);
862static	void free_newblk(struct newblk *);
863static	void cancel_allocdirect(struct allocdirectlst *,
864	    struct allocdirect *, struct freeblks *);
865static	int check_inode_unwritten(struct inodedep *);
866static	int free_inodedep(struct inodedep *);
867static	void freework_freeblock(struct freework *);
868static	void freework_enqueue(struct freework *);
869static	int handle_workitem_freeblocks(struct freeblks *, int);
870static	int handle_complete_freeblocks(struct freeblks *, int);
871static	void handle_workitem_indirblk(struct freework *);
872static	void handle_written_freework(struct freework *);
873static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
874static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
875	    struct workhead *);
876static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
877	    struct inodedep *, struct allocindir *, ufs_lbn_t);
878static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
879	    ufs2_daddr_t, ufs_lbn_t);
880static	void handle_workitem_freefrag(struct freefrag *);
881static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
882	    ufs_lbn_t);
883static	void allocdirect_merge(struct allocdirectlst *,
884	    struct allocdirect *, struct allocdirect *);
885static	struct freefrag *allocindir_merge(struct allocindir *,
886	    struct allocindir *);
887static	int bmsafemap_find(struct bmsafemap_hashhead *, int,
888	    struct bmsafemap **);
889static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
890	    int cg, struct bmsafemap *);
891static	int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
892	    struct newblk **);
893static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
894static	int inodedep_find(struct inodedep_hashhead *, ino_t,
895	    struct inodedep **);
896static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
897static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
898	    int, struct pagedep **);
899static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
900	    struct pagedep **);
901static	void pause_timer(void *);
902static	int request_cleanup(struct mount *, int);
903static	void schedule_cleanup(struct mount *);
904static void softdep_ast_cleanup_proc(void);
905static	int process_worklist_item(struct mount *, int, int);
906static	void process_removes(struct vnode *);
907static	void process_truncates(struct vnode *);
908static	void jwork_move(struct workhead *, struct workhead *);
909static	void jwork_insert(struct workhead *, struct jsegdep *);
910static	void add_to_worklist(struct worklist *, int);
911static	void wake_worklist(struct worklist *);
912static	void wait_worklist(struct worklist *, char *);
913static	void remove_from_worklist(struct worklist *);
914static	void softdep_flush(void *);
915static	void softdep_flushjournal(struct mount *);
916static	int softdep_speedup(struct ufsmount *);
917static	void worklist_speedup(struct mount *);
918static	int journal_mount(struct mount *, struct fs *, struct ucred *);
919static	void journal_unmount(struct ufsmount *);
920static	int journal_space(struct ufsmount *, int);
921static	void journal_suspend(struct ufsmount *);
922static	int journal_unsuspend(struct ufsmount *ump);
923static	void softdep_prelink(struct vnode *, struct vnode *);
924static	void add_to_journal(struct worklist *);
925static	void remove_from_journal(struct worklist *);
926static	bool softdep_excess_items(struct ufsmount *, int);
927static	void softdep_process_journal(struct mount *, struct worklist *, int);
928static	struct jremref *newjremref(struct dirrem *, struct inode *,
929	    struct inode *ip, off_t, nlink_t);
930static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
931	    uint16_t);
932static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
933	    uint16_t);
934static	inline struct jsegdep *inoref_jseg(struct inoref *);
935static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
936static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
937	    ufs2_daddr_t, int);
938static	void adjust_newfreework(struct freeblks *, int);
939static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
940static	void move_newblock_dep(struct jaddref *, struct inodedep *);
941static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
942static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
943	    ufs2_daddr_t, long, ufs_lbn_t);
944static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
945	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
946static	int jwait(struct worklist *, int);
947static	struct inodedep *inodedep_lookup_ip(struct inode *);
948static	int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
949static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
950static	void handle_jwork(struct workhead *);
951static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
952	    struct mkdir **);
953static	struct jblocks *jblocks_create(void);
954static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
955static	void jblocks_free(struct jblocks *, struct mount *, int);
956static	void jblocks_destroy(struct jblocks *);
957static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
958
959/*
960 * Exported softdep operations.
961 */
962static	void softdep_disk_io_initiation(struct buf *);
963static	void softdep_disk_write_complete(struct buf *);
964static	void softdep_deallocate_dependencies(struct buf *);
965static	int softdep_count_dependencies(struct buf *bp, int);
966
967/*
968 * Global lock over all of soft updates.
969 */
970static struct mtx lk;
971MTX_SYSINIT(softdep_lock, &lk, "Global Softdep Lock", MTX_DEF);
972
973#define ACQUIRE_GBLLOCK(lk)	mtx_lock(lk)
974#define FREE_GBLLOCK(lk)	mtx_unlock(lk)
975#define GBLLOCK_OWNED(lk)	mtx_assert((lk), MA_OWNED)
976
977/*
978 * Per-filesystem soft-updates locking.
979 */
980#define LOCK_PTR(ump)		(&(ump)->um_softdep->sd_fslock)
981#define TRY_ACQUIRE_LOCK(ump)	rw_try_wlock(&(ump)->um_softdep->sd_fslock)
982#define ACQUIRE_LOCK(ump)	rw_wlock(&(ump)->um_softdep->sd_fslock)
983#define FREE_LOCK(ump)		rw_wunlock(&(ump)->um_softdep->sd_fslock)
984#define LOCK_OWNED(ump)		rw_assert(&(ump)->um_softdep->sd_fslock, \
985				    RA_WLOCKED)
986
987#define	BUF_AREC(bp)		lockallowrecurse(&(bp)->b_lock)
988#define	BUF_NOREC(bp)		lockdisablerecurse(&(bp)->b_lock)
989
990/*
991 * Worklist queue management.
992 * These routines require that the lock be held.
993 */
994#ifndef /* NOT */ DEBUG
995#define WORKLIST_INSERT(head, item) do {	\
996	(item)->wk_state |= ONWORKLIST;		\
997	LIST_INSERT_HEAD(head, item, wk_list);	\
998} while (0)
999#define WORKLIST_REMOVE(item) do {		\
1000	(item)->wk_state &= ~ONWORKLIST;	\
1001	LIST_REMOVE(item, wk_list);		\
1002} while (0)
1003#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
1004#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
1005
1006#else /* DEBUG */
1007static	void worklist_insert(struct workhead *, struct worklist *, int);
1008static	void worklist_remove(struct worklist *, int);
1009
1010#define WORKLIST_INSERT(head, item) worklist_insert(head, item, 1)
1011#define WORKLIST_INSERT_UNLOCKED(head, item) worklist_insert(head, item, 0)
1012#define WORKLIST_REMOVE(item) worklist_remove(item, 1)
1013#define WORKLIST_REMOVE_UNLOCKED(item) worklist_remove(item, 0)
1014
1015static void
1016worklist_insert(head, item, locked)
1017	struct workhead *head;
1018	struct worklist *item;
1019	int locked;
1020{
1021
1022	if (locked)
1023		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1024	if (item->wk_state & ONWORKLIST)
1025		panic("worklist_insert: %p %s(0x%X) already on list",
1026		    item, TYPENAME(item->wk_type), item->wk_state);
1027	item->wk_state |= ONWORKLIST;
1028	LIST_INSERT_HEAD(head, item, wk_list);
1029}
1030
1031static void
1032worklist_remove(item, locked)
1033	struct worklist *item;
1034	int locked;
1035{
1036
1037	if (locked)
1038		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1039	if ((item->wk_state & ONWORKLIST) == 0)
1040		panic("worklist_remove: %p %s(0x%X) not on list",
1041		    item, TYPENAME(item->wk_type), item->wk_state);
1042	item->wk_state &= ~ONWORKLIST;
1043	LIST_REMOVE(item, wk_list);
1044}
1045#endif /* DEBUG */
1046
1047/*
1048 * Merge two jsegdeps keeping only the oldest one as newer references
1049 * can't be discarded until after older references.
1050 */
1051static inline struct jsegdep *
1052jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1053{
1054	struct jsegdep *swp;
1055
1056	if (two == NULL)
1057		return (one);
1058
1059	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1060		swp = one;
1061		one = two;
1062		two = swp;
1063	}
1064	WORKLIST_REMOVE(&two->jd_list);
1065	free_jsegdep(two);
1066
1067	return (one);
1068}
1069
1070/*
1071 * If two freedeps are compatible free one to reduce list size.
1072 */
1073static inline struct freedep *
1074freedep_merge(struct freedep *one, struct freedep *two)
1075{
1076	if (two == NULL)
1077		return (one);
1078
1079	if (one->fd_freework == two->fd_freework) {
1080		WORKLIST_REMOVE(&two->fd_list);
1081		free_freedep(two);
1082	}
1083	return (one);
1084}
1085
1086/*
1087 * Move journal work from one list to another.  Duplicate freedeps and
1088 * jsegdeps are coalesced to keep the lists as small as possible.
1089 */
1090static void
1091jwork_move(dst, src)
1092	struct workhead *dst;
1093	struct workhead *src;
1094{
1095	struct freedep *freedep;
1096	struct jsegdep *jsegdep;
1097	struct worklist *wkn;
1098	struct worklist *wk;
1099
1100	KASSERT(dst != src,
1101	    ("jwork_move: dst == src"));
1102	freedep = NULL;
1103	jsegdep = NULL;
1104	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1105		if (wk->wk_type == D_JSEGDEP)
1106			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1107		if (wk->wk_type == D_FREEDEP)
1108			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1109	}
1110
1111	while ((wk = LIST_FIRST(src)) != NULL) {
1112		WORKLIST_REMOVE(wk);
1113		WORKLIST_INSERT(dst, wk);
1114		if (wk->wk_type == D_JSEGDEP) {
1115			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1116			continue;
1117		}
1118		if (wk->wk_type == D_FREEDEP)
1119			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1120	}
1121}
1122
1123static void
1124jwork_insert(dst, jsegdep)
1125	struct workhead *dst;
1126	struct jsegdep *jsegdep;
1127{
1128	struct jsegdep *jsegdepn;
1129	struct worklist *wk;
1130
1131	LIST_FOREACH(wk, dst, wk_list)
1132		if (wk->wk_type == D_JSEGDEP)
1133			break;
1134	if (wk == NULL) {
1135		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1136		return;
1137	}
1138	jsegdepn = WK_JSEGDEP(wk);
1139	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1140		WORKLIST_REMOVE(wk);
1141		free_jsegdep(jsegdepn);
1142		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1143	} else
1144		free_jsegdep(jsegdep);
1145}
1146
1147/*
1148 * Routines for tracking and managing workitems.
1149 */
1150static	void workitem_free(struct worklist *, int);
1151static	void workitem_alloc(struct worklist *, int, struct mount *);
1152static	void workitem_reassign(struct worklist *, int);
1153
1154#define	WORKITEM_FREE(item, type) \
1155	workitem_free((struct worklist *)(item), (type))
1156#define	WORKITEM_REASSIGN(item, type) \
1157	workitem_reassign((struct worklist *)(item), (type))
1158
1159static void
1160workitem_free(item, type)
1161	struct worklist *item;
1162	int type;
1163{
1164	struct ufsmount *ump;
1165
1166#ifdef DEBUG
1167	if (item->wk_state & ONWORKLIST)
1168		panic("workitem_free: %s(0x%X) still on list",
1169		    TYPENAME(item->wk_type), item->wk_state);
1170	if (item->wk_type != type && type != D_NEWBLK)
1171		panic("workitem_free: type mismatch %s != %s",
1172		    TYPENAME(item->wk_type), TYPENAME(type));
1173#endif
1174	if (item->wk_state & IOWAITING)
1175		wakeup(item);
1176	ump = VFSTOUFS(item->wk_mp);
1177	LOCK_OWNED(ump);
1178	KASSERT(ump->softdep_deps > 0,
1179	    ("workitem_free: %s: softdep_deps going negative",
1180	    ump->um_fs->fs_fsmnt));
1181	if (--ump->softdep_deps == 0 && ump->softdep_req)
1182		wakeup(&ump->softdep_deps);
1183	KASSERT(dep_current[item->wk_type] > 0,
1184	    ("workitem_free: %s: dep_current[%s] going negative",
1185	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1186	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1187	    ("workitem_free: %s: softdep_curdeps[%s] going negative",
1188	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1189	atomic_subtract_long(&dep_current[item->wk_type], 1);
1190	ump->softdep_curdeps[item->wk_type] -= 1;
1191	free(item, DtoM(type));
1192}
1193
1194static void
1195workitem_alloc(item, type, mp)
1196	struct worklist *item;
1197	int type;
1198	struct mount *mp;
1199{
1200	struct ufsmount *ump;
1201
1202	item->wk_type = type;
1203	item->wk_mp = mp;
1204	item->wk_state = 0;
1205
1206	ump = VFSTOUFS(mp);
1207	ACQUIRE_GBLLOCK(&lk);
1208	dep_current[type]++;
1209	if (dep_current[type] > dep_highuse[type])
1210		dep_highuse[type] = dep_current[type];
1211	dep_total[type]++;
1212	FREE_GBLLOCK(&lk);
1213	ACQUIRE_LOCK(ump);
1214	ump->softdep_curdeps[type] += 1;
1215	ump->softdep_deps++;
1216	ump->softdep_accdeps++;
1217	FREE_LOCK(ump);
1218}
1219
1220static void
1221workitem_reassign(item, newtype)
1222	struct worklist *item;
1223	int newtype;
1224{
1225	struct ufsmount *ump;
1226
1227	ump = VFSTOUFS(item->wk_mp);
1228	LOCK_OWNED(ump);
1229	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1230	    ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1231	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1232	ump->softdep_curdeps[item->wk_type] -= 1;
1233	ump->softdep_curdeps[newtype] += 1;
1234	KASSERT(dep_current[item->wk_type] > 0,
1235	    ("workitem_reassign: %s: dep_current[%s] going negative",
1236	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1237	ACQUIRE_GBLLOCK(&lk);
1238	dep_current[newtype]++;
1239	dep_current[item->wk_type]--;
1240	if (dep_current[newtype] > dep_highuse[newtype])
1241		dep_highuse[newtype] = dep_current[newtype];
1242	dep_total[newtype]++;
1243	FREE_GBLLOCK(&lk);
1244	item->wk_type = newtype;
1245}
1246
1247/*
1248 * Workitem queue management
1249 */
1250static int max_softdeps;	/* maximum number of structs before slowdown */
1251static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1252static int proc_waiting;	/* tracks whether we have a timeout posted */
1253static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1254static struct callout softdep_callout;
1255static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1256static int req_clear_remove;	/* syncer process flush some freeblks */
1257static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1258
1259/*
1260 * runtime statistics
1261 */
1262static int stat_flush_threads;	/* number of softdep flushing threads */
1263static int stat_worklist_push;	/* number of worklist cleanups */
1264static int stat_blk_limit_push;	/* number of times block limit neared */
1265static int stat_ino_limit_push;	/* number of times inode limit neared */
1266static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1267static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1268static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1269static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1270static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1271static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1272static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1273static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1274static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1275static int stat_journal_min;	/* Times hit journal min threshold */
1276static int stat_journal_low;	/* Times hit journal low threshold */
1277static int stat_journal_wait;	/* Times blocked in jwait(). */
1278static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1279static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1280static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1281static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1282static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1283static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1284static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1285static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1286static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1287static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
1288
1289SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1290    &max_softdeps, 0, "");
1291SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1292    &tickdelay, 0, "");
1293SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
1294    &stat_flush_threads, 0, "");
1295SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push, CTLFLAG_RW,
1296    &stat_worklist_push, 0,"");
1297SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push, CTLFLAG_RW,
1298    &stat_blk_limit_push, 0,"");
1299SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push, CTLFLAG_RW,
1300    &stat_ino_limit_push, 0,"");
1301SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit, CTLFLAG_RW,
1302    &stat_blk_limit_hit, 0, "");
1303SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit, CTLFLAG_RW,
1304    &stat_ino_limit_hit, 0, "");
1305SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit, CTLFLAG_RW,
1306    &stat_sync_limit_hit, 0, "");
1307SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs, CTLFLAG_RW,
1308    &stat_indir_blk_ptrs, 0, "");
1309SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap, CTLFLAG_RW,
1310    &stat_inode_bitmap, 0, "");
1311SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs, CTLFLAG_RW,
1312    &stat_direct_blk_ptrs, 0, "");
1313SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry, CTLFLAG_RW,
1314    &stat_dir_entry, 0, "");
1315SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback, CTLFLAG_RW,
1316    &stat_jaddref, 0, "");
1317SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback, CTLFLAG_RW,
1318    &stat_jnewblk, 0, "");
1319SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low, CTLFLAG_RW,
1320    &stat_journal_low, 0, "");
1321SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min, CTLFLAG_RW,
1322    &stat_journal_min, 0, "");
1323SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait, CTLFLAG_RW,
1324    &stat_journal_wait, 0, "");
1325SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage, CTLFLAG_RW,
1326    &stat_jwait_filepage, 0, "");
1327SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks, CTLFLAG_RW,
1328    &stat_jwait_freeblks, 0, "");
1329SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode, CTLFLAG_RW,
1330    &stat_jwait_inode, 0, "");
1331SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk, CTLFLAG_RW,
1332    &stat_jwait_newblk, 0, "");
1333SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests, CTLFLAG_RW,
1334    &stat_cleanup_blkrequests, 0, "");
1335SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests, CTLFLAG_RW,
1336    &stat_cleanup_inorequests, 0, "");
1337SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay, CTLFLAG_RW,
1338    &stat_cleanup_high_delay, 0, "");
1339SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries, CTLFLAG_RW,
1340    &stat_cleanup_retries, 0, "");
1341SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures, CTLFLAG_RW,
1342    &stat_cleanup_failures, 0, "");
1343SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1344    &softdep_flushcache, 0, "");
1345SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
1346    &stat_emptyjblocks, 0, "");
1347
1348SYSCTL_DECL(_vfs_ffs);
1349
1350/* Whether to recompute the summary at mount time */
1351static int compute_summary_at_mount = 0;
1352SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1353	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1354static int print_threads = 0;
1355SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
1356    &print_threads, 0, "Notify flusher thread start/stop");
1357
1358/* List of all filesystems mounted with soft updates */
1359static TAILQ_HEAD(, mount_softdeps) softdepmounts;
1360
1361/*
1362 * This function cleans the worklist for a filesystem.
1363 * Each filesystem running with soft dependencies gets its own
1364 * thread to run in this function. The thread is started up in
1365 * softdep_mount and shutdown in softdep_unmount. They show up
1366 * as part of the kernel "bufdaemon" process whose process
1367 * entry is available in bufdaemonproc.
1368 */
1369static int searchfailed;
1370extern struct proc *bufdaemonproc;
1371static void
1372softdep_flush(addr)
1373	void *addr;
1374{
1375	struct mount *mp;
1376	struct thread *td;
1377	struct ufsmount *ump;
1378
1379	td = curthread;
1380	td->td_pflags |= TDP_NORUNNINGBUF;
1381	mp = (struct mount *)addr;
1382	ump = VFSTOUFS(mp);
1383	atomic_add_int(&stat_flush_threads, 1);
1384	ACQUIRE_LOCK(ump);
1385	ump->softdep_flags &= ~FLUSH_STARTING;
1386	wakeup(&ump->softdep_flushtd);
1387	FREE_LOCK(ump);
1388	if (print_threads) {
1389		if (stat_flush_threads == 1)
1390			printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
1391			    bufdaemonproc->p_pid);
1392		printf("Start thread %s\n", td->td_name);
1393	}
1394	for (;;) {
1395		while (softdep_process_worklist(mp, 0) > 0 ||
1396		    (MOUNTEDSUJ(mp) &&
1397		    VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
1398			kthread_suspend_check();
1399		ACQUIRE_LOCK(ump);
1400		if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1401			msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
1402			    "sdflush", hz / 2);
1403		ump->softdep_flags &= ~FLUSH_CLEANUP;
1404		/*
1405		 * Check to see if we are done and need to exit.
1406		 */
1407		if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
1408			FREE_LOCK(ump);
1409			continue;
1410		}
1411		ump->softdep_flags &= ~FLUSH_EXIT;
1412		FREE_LOCK(ump);
1413		wakeup(&ump->softdep_flags);
1414		if (print_threads)
1415			printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
1416		atomic_subtract_int(&stat_flush_threads, 1);
1417		kthread_exit();
1418		panic("kthread_exit failed\n");
1419	}
1420}
1421
1422static void
1423worklist_speedup(mp)
1424	struct mount *mp;
1425{
1426	struct ufsmount *ump;
1427
1428	ump = VFSTOUFS(mp);
1429	LOCK_OWNED(ump);
1430	if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1431		ump->softdep_flags |= FLUSH_CLEANUP;
1432	wakeup(&ump->softdep_flushtd);
1433}
1434
1435static int
1436softdep_speedup(ump)
1437	struct ufsmount *ump;
1438{
1439	struct ufsmount *altump;
1440	struct mount_softdeps *sdp;
1441
1442	LOCK_OWNED(ump);
1443	worklist_speedup(ump->um_mountp);
1444	bd_speedup();
1445	/*
1446	 * If we have global shortages, then we need other
1447	 * filesystems to help with the cleanup. Here we wakeup a
1448	 * flusher thread for a filesystem that is over its fair
1449	 * share of resources.
1450	 */
1451	if (req_clear_inodedeps || req_clear_remove) {
1452		ACQUIRE_GBLLOCK(&lk);
1453		TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
1454			if ((altump = sdp->sd_ump) == ump)
1455				continue;
1456			if (((req_clear_inodedeps &&
1457			    altump->softdep_curdeps[D_INODEDEP] >
1458			    max_softdeps / stat_flush_threads) ||
1459			    (req_clear_remove &&
1460			    altump->softdep_curdeps[D_DIRREM] >
1461			    (max_softdeps / 2) / stat_flush_threads)) &&
1462			    TRY_ACQUIRE_LOCK(altump))
1463				break;
1464		}
1465		if (sdp == NULL) {
1466			searchfailed++;
1467			FREE_GBLLOCK(&lk);
1468		} else {
1469			/*
1470			 * Move to the end of the list so we pick a
1471			 * different one on out next try.
1472			 */
1473			TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
1474			TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
1475			FREE_GBLLOCK(&lk);
1476			if ((altump->softdep_flags &
1477			    (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1478				altump->softdep_flags |= FLUSH_CLEANUP;
1479			altump->um_softdep->sd_cleanups++;
1480			wakeup(&altump->softdep_flushtd);
1481			FREE_LOCK(altump);
1482		}
1483	}
1484	return (speedup_syncer());
1485}
1486
1487/*
1488 * Add an item to the end of the work queue.
1489 * This routine requires that the lock be held.
1490 * This is the only routine that adds items to the list.
1491 * The following routine is the only one that removes items
1492 * and does so in order from first to last.
1493 */
1494
1495#define	WK_HEAD		0x0001	/* Add to HEAD. */
1496#define	WK_NODELAY	0x0002	/* Process immediately. */
1497
1498static void
1499add_to_worklist(wk, flags)
1500	struct worklist *wk;
1501	int flags;
1502{
1503	struct ufsmount *ump;
1504
1505	ump = VFSTOUFS(wk->wk_mp);
1506	LOCK_OWNED(ump);
1507	if (wk->wk_state & ONWORKLIST)
1508		panic("add_to_worklist: %s(0x%X) already on list",
1509		    TYPENAME(wk->wk_type), wk->wk_state);
1510	wk->wk_state |= ONWORKLIST;
1511	if (ump->softdep_on_worklist == 0) {
1512		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1513		ump->softdep_worklist_tail = wk;
1514	} else if (flags & WK_HEAD) {
1515		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1516	} else {
1517		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1518		ump->softdep_worklist_tail = wk;
1519	}
1520	ump->softdep_on_worklist += 1;
1521	if (flags & WK_NODELAY)
1522		worklist_speedup(wk->wk_mp);
1523}
1524
1525/*
1526 * Remove the item to be processed. If we are removing the last
1527 * item on the list, we need to recalculate the tail pointer.
1528 */
1529static void
1530remove_from_worklist(wk)
1531	struct worklist *wk;
1532{
1533	struct ufsmount *ump;
1534
1535	ump = VFSTOUFS(wk->wk_mp);
1536	WORKLIST_REMOVE(wk);
1537	if (ump->softdep_worklist_tail == wk)
1538		ump->softdep_worklist_tail =
1539		    (struct worklist *)wk->wk_list.le_prev;
1540	ump->softdep_on_worklist -= 1;
1541}
1542
1543static void
1544wake_worklist(wk)
1545	struct worklist *wk;
1546{
1547	if (wk->wk_state & IOWAITING) {
1548		wk->wk_state &= ~IOWAITING;
1549		wakeup(wk);
1550	}
1551}
1552
1553static void
1554wait_worklist(wk, wmesg)
1555	struct worklist *wk;
1556	char *wmesg;
1557{
1558	struct ufsmount *ump;
1559
1560	ump = VFSTOUFS(wk->wk_mp);
1561	wk->wk_state |= IOWAITING;
1562	msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1563}
1564
1565/*
1566 * Process that runs once per second to handle items in the background queue.
1567 *
1568 * Note that we ensure that everything is done in the order in which they
1569 * appear in the queue. The code below depends on this property to ensure
1570 * that blocks of a file are freed before the inode itself is freed. This
1571 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1572 * until all the old ones have been purged from the dependency lists.
1573 */
1574static int
1575softdep_process_worklist(mp, full)
1576	struct mount *mp;
1577	int full;
1578{
1579	int cnt, matchcnt;
1580	struct ufsmount *ump;
1581	long starttime;
1582
1583	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1584	if (MOUNTEDSOFTDEP(mp) == 0)
1585		return (0);
1586	matchcnt = 0;
1587	ump = VFSTOUFS(mp);
1588	ACQUIRE_LOCK(ump);
1589	starttime = time_second;
1590	softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1591	check_clear_deps(mp);
1592	while (ump->softdep_on_worklist > 0) {
1593		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1594			break;
1595		else
1596			matchcnt += cnt;
1597		check_clear_deps(mp);
1598		/*
1599		 * We do not generally want to stop for buffer space, but if
1600		 * we are really being a buffer hog, we will stop and wait.
1601		 */
1602		if (should_yield()) {
1603			FREE_LOCK(ump);
1604			kern_yield(PRI_USER);
1605			bwillwrite();
1606			ACQUIRE_LOCK(ump);
1607		}
1608		/*
1609		 * Never allow processing to run for more than one
1610		 * second. This gives the syncer thread the opportunity
1611		 * to pause if appropriate.
1612		 */
1613		if (!full && starttime != time_second)
1614			break;
1615	}
1616	if (full == 0)
1617		journal_unsuspend(ump);
1618	FREE_LOCK(ump);
1619	return (matchcnt);
1620}
1621
1622/*
1623 * Process all removes associated with a vnode if we are running out of
1624 * journal space.  Any other process which attempts to flush these will
1625 * be unable as we have the vnodes locked.
1626 */
1627static void
1628process_removes(vp)
1629	struct vnode *vp;
1630{
1631	struct inodedep *inodedep;
1632	struct dirrem *dirrem;
1633	struct ufsmount *ump;
1634	struct mount *mp;
1635	ino_t inum;
1636
1637	mp = vp->v_mount;
1638	ump = VFSTOUFS(mp);
1639	LOCK_OWNED(ump);
1640	inum = VTOI(vp)->i_number;
1641	for (;;) {
1642top:
1643		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1644			return;
1645		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1646			/*
1647			 * If another thread is trying to lock this vnode
1648			 * it will fail but we must wait for it to do so
1649			 * before we can proceed.
1650			 */
1651			if (dirrem->dm_state & INPROGRESS) {
1652				wait_worklist(&dirrem->dm_list, "pwrwait");
1653				goto top;
1654			}
1655			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1656			    (COMPLETE | ONWORKLIST))
1657				break;
1658		}
1659		if (dirrem == NULL)
1660			return;
1661		remove_from_worklist(&dirrem->dm_list);
1662		FREE_LOCK(ump);
1663		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1664			panic("process_removes: suspended filesystem");
1665		handle_workitem_remove(dirrem, 0);
1666		vn_finished_secondary_write(mp);
1667		ACQUIRE_LOCK(ump);
1668	}
1669}
1670
1671/*
1672 * Process all truncations associated with a vnode if we are running out
1673 * of journal space.  This is called when the vnode lock is already held
1674 * and no other process can clear the truncation.  This function returns
1675 * a value greater than zero if it did any work.
1676 */
1677static void
1678process_truncates(vp)
1679	struct vnode *vp;
1680{
1681	struct inodedep *inodedep;
1682	struct freeblks *freeblks;
1683	struct ufsmount *ump;
1684	struct mount *mp;
1685	ino_t inum;
1686	int cgwait;
1687
1688	mp = vp->v_mount;
1689	ump = VFSTOUFS(mp);
1690	LOCK_OWNED(ump);
1691	inum = VTOI(vp)->i_number;
1692	for (;;) {
1693		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1694			return;
1695		cgwait = 0;
1696		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1697			/* Journal entries not yet written.  */
1698			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1699				jwait(&LIST_FIRST(
1700				    &freeblks->fb_jblkdephd)->jb_list,
1701				    MNT_WAIT);
1702				break;
1703			}
1704			/* Another thread is executing this item. */
1705			if (freeblks->fb_state & INPROGRESS) {
1706				wait_worklist(&freeblks->fb_list, "ptrwait");
1707				break;
1708			}
1709			/* Freeblks is waiting on a inode write. */
1710			if ((freeblks->fb_state & COMPLETE) == 0) {
1711				FREE_LOCK(ump);
1712				ffs_update(vp, 1);
1713				ACQUIRE_LOCK(ump);
1714				break;
1715			}
1716			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1717			    (ALLCOMPLETE | ONWORKLIST)) {
1718				remove_from_worklist(&freeblks->fb_list);
1719				freeblks->fb_state |= INPROGRESS;
1720				FREE_LOCK(ump);
1721				if (vn_start_secondary_write(NULL, &mp,
1722				    V_NOWAIT))
1723					panic("process_truncates: "
1724					    "suspended filesystem");
1725				handle_workitem_freeblocks(freeblks, 0);
1726				vn_finished_secondary_write(mp);
1727				ACQUIRE_LOCK(ump);
1728				break;
1729			}
1730			if (freeblks->fb_cgwait)
1731				cgwait++;
1732		}
1733		if (cgwait) {
1734			FREE_LOCK(ump);
1735			sync_cgs(mp, MNT_WAIT);
1736			ffs_sync_snap(mp, MNT_WAIT);
1737			ACQUIRE_LOCK(ump);
1738			continue;
1739		}
1740		if (freeblks == NULL)
1741			break;
1742	}
1743	return;
1744}
1745
1746/*
1747 * Process one item on the worklist.
1748 */
1749static int
1750process_worklist_item(mp, target, flags)
1751	struct mount *mp;
1752	int target;
1753	int flags;
1754{
1755	struct worklist sentinel;
1756	struct worklist *wk;
1757	struct ufsmount *ump;
1758	int matchcnt;
1759	int error;
1760
1761	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1762	/*
1763	 * If we are being called because of a process doing a
1764	 * copy-on-write, then it is not safe to write as we may
1765	 * recurse into the copy-on-write routine.
1766	 */
1767	if (curthread->td_pflags & TDP_COWINPROGRESS)
1768		return (-1);
1769	PHOLD(curproc);	/* Don't let the stack go away. */
1770	ump = VFSTOUFS(mp);
1771	LOCK_OWNED(ump);
1772	matchcnt = 0;
1773	sentinel.wk_mp = NULL;
1774	sentinel.wk_type = D_SENTINEL;
1775	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1776	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1777	    wk = LIST_NEXT(&sentinel, wk_list)) {
1778		if (wk->wk_type == D_SENTINEL) {
1779			LIST_REMOVE(&sentinel, wk_list);
1780			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1781			continue;
1782		}
1783		if (wk->wk_state & INPROGRESS)
1784			panic("process_worklist_item: %p already in progress.",
1785			    wk);
1786		wk->wk_state |= INPROGRESS;
1787		remove_from_worklist(wk);
1788		FREE_LOCK(ump);
1789		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1790			panic("process_worklist_item: suspended filesystem");
1791		switch (wk->wk_type) {
1792		case D_DIRREM:
1793			/* removal of a directory entry */
1794			error = handle_workitem_remove(WK_DIRREM(wk), flags);
1795			break;
1796
1797		case D_FREEBLKS:
1798			/* releasing blocks and/or fragments from a file */
1799			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
1800			    flags);
1801			break;
1802
1803		case D_FREEFRAG:
1804			/* releasing a fragment when replaced as a file grows */
1805			handle_workitem_freefrag(WK_FREEFRAG(wk));
1806			error = 0;
1807			break;
1808
1809		case D_FREEFILE:
1810			/* releasing an inode when its link count drops to 0 */
1811			handle_workitem_freefile(WK_FREEFILE(wk));
1812			error = 0;
1813			break;
1814
1815		default:
1816			panic("%s_process_worklist: Unknown type %s",
1817			    "softdep", TYPENAME(wk->wk_type));
1818			/* NOTREACHED */
1819		}
1820		vn_finished_secondary_write(mp);
1821		ACQUIRE_LOCK(ump);
1822		if (error == 0) {
1823			if (++matchcnt == target)
1824				break;
1825			continue;
1826		}
1827		/*
1828		 * We have to retry the worklist item later.  Wake up any
1829		 * waiters who may be able to complete it immediately and
1830		 * add the item back to the head so we don't try to execute
1831		 * it again.
1832		 */
1833		wk->wk_state &= ~INPROGRESS;
1834		wake_worklist(wk);
1835		add_to_worklist(wk, WK_HEAD);
1836	}
1837	LIST_REMOVE(&sentinel, wk_list);
1838	/* Sentinal could've become the tail from remove_from_worklist. */
1839	if (ump->softdep_worklist_tail == &sentinel)
1840		ump->softdep_worklist_tail =
1841		    (struct worklist *)sentinel.wk_list.le_prev;
1842	PRELE(curproc);
1843	return (matchcnt);
1844}
1845
1846/*
1847 * Move dependencies from one buffer to another.
1848 */
1849int
1850softdep_move_dependencies(oldbp, newbp)
1851	struct buf *oldbp;
1852	struct buf *newbp;
1853{
1854	struct worklist *wk, *wktail;
1855	struct ufsmount *ump;
1856	int dirty;
1857
1858	if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
1859		return (0);
1860	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
1861	    ("softdep_move_dependencies called on non-softdep filesystem"));
1862	dirty = 0;
1863	wktail = NULL;
1864	ump = VFSTOUFS(wk->wk_mp);
1865	ACQUIRE_LOCK(ump);
1866	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
1867		LIST_REMOVE(wk, wk_list);
1868		if (wk->wk_type == D_BMSAFEMAP &&
1869		    bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
1870			dirty = 1;
1871		if (wktail == NULL)
1872			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
1873		else
1874			LIST_INSERT_AFTER(wktail, wk, wk_list);
1875		wktail = wk;
1876	}
1877	FREE_LOCK(ump);
1878
1879	return (dirty);
1880}
1881
1882/*
1883 * Purge the work list of all items associated with a particular mount point.
1884 */
1885int
1886softdep_flushworklist(oldmnt, countp, td)
1887	struct mount *oldmnt;
1888	int *countp;
1889	struct thread *td;
1890{
1891	struct vnode *devvp;
1892	struct ufsmount *ump;
1893	int count, error;
1894
1895	/*
1896	 * Alternately flush the block device associated with the mount
1897	 * point and process any dependencies that the flushing
1898	 * creates. We continue until no more worklist dependencies
1899	 * are found.
1900	 */
1901	*countp = 0;
1902	error = 0;
1903	ump = VFSTOUFS(oldmnt);
1904	devvp = ump->um_devvp;
1905	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
1906		*countp += count;
1907		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1908		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1909		VOP_UNLOCK(devvp, 0);
1910		if (error != 0)
1911			break;
1912	}
1913	return (error);
1914}
1915
1916#define	SU_WAITIDLE_RETRIES	20
1917static int
1918softdep_waitidle(struct mount *mp, int flags __unused)
1919{
1920	struct ufsmount *ump;
1921	struct vnode *devvp;
1922	struct thread *td;
1923	int error, i;
1924
1925	ump = VFSTOUFS(mp);
1926	devvp = ump->um_devvp;
1927	td = curthread;
1928	error = 0;
1929	ACQUIRE_LOCK(ump);
1930	for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
1931		ump->softdep_req = 1;
1932		KASSERT((flags & FORCECLOSE) == 0 ||
1933		    ump->softdep_on_worklist == 0,
1934		    ("softdep_waitidle: work added after flush"));
1935		msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
1936		    "softdeps", 10 * hz);
1937		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1938		error = VOP_FSYNC(devvp, MNT_WAIT, td);
1939		VOP_UNLOCK(devvp, 0);
1940		ACQUIRE_LOCK(ump);
1941		if (error != 0)
1942			break;
1943	}
1944	ump->softdep_req = 0;
1945	if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
1946		error = EBUSY;
1947		printf("softdep_waitidle: Failed to flush worklist for %p\n",
1948		    mp);
1949	}
1950	FREE_LOCK(ump);
1951	return (error);
1952}
1953
1954/*
1955 * Flush all vnodes and worklist items associated with a specified mount point.
1956 */
1957int
1958softdep_flushfiles(oldmnt, flags, td)
1959	struct mount *oldmnt;
1960	int flags;
1961	struct thread *td;
1962{
1963#ifdef QUOTA
1964	struct ufsmount *ump;
1965	int i;
1966#endif
1967	int error, early, depcount, loopcnt, retry_flush_count, retry;
1968	int morework;
1969
1970	KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
1971	    ("softdep_flushfiles called on non-softdep filesystem"));
1972	loopcnt = 10;
1973	retry_flush_count = 3;
1974retry_flush:
1975	error = 0;
1976
1977	/*
1978	 * Alternately flush the vnodes associated with the mount
1979	 * point and process any dependencies that the flushing
1980	 * creates. In theory, this loop can happen at most twice,
1981	 * but we give it a few extra just to be sure.
1982	 */
1983	for (; loopcnt > 0; loopcnt--) {
1984		/*
1985		 * Do another flush in case any vnodes were brought in
1986		 * as part of the cleanup operations.
1987		 */
1988		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
1989		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
1990		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
1991			break;
1992		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
1993		    depcount == 0)
1994			break;
1995	}
1996	/*
1997	 * If we are unmounting then it is an error to fail. If we
1998	 * are simply trying to downgrade to read-only, then filesystem
1999	 * activity can keep us busy forever, so we just fail with EBUSY.
2000	 */
2001	if (loopcnt == 0) {
2002		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
2003			panic("softdep_flushfiles: looping");
2004		error = EBUSY;
2005	}
2006	if (!error)
2007		error = softdep_waitidle(oldmnt, flags);
2008	if (!error) {
2009		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
2010			retry = 0;
2011			MNT_ILOCK(oldmnt);
2012			KASSERT((oldmnt->mnt_kern_flag & MNTK_NOINSMNTQ) != 0,
2013			    ("softdep_flushfiles: !MNTK_NOINSMNTQ"));
2014			morework = oldmnt->mnt_nvnodelistsize > 0;
2015#ifdef QUOTA
2016			ump = VFSTOUFS(oldmnt);
2017			UFS_LOCK(ump);
2018			for (i = 0; i < MAXQUOTAS; i++) {
2019				if (ump->um_quotas[i] != NULLVP)
2020					morework = 1;
2021			}
2022			UFS_UNLOCK(ump);
2023#endif
2024			if (morework) {
2025				if (--retry_flush_count > 0) {
2026					retry = 1;
2027					loopcnt = 3;
2028				} else
2029					error = EBUSY;
2030			}
2031			MNT_IUNLOCK(oldmnt);
2032			if (retry)
2033				goto retry_flush;
2034		}
2035	}
2036	return (error);
2037}
2038
2039/*
2040 * Structure hashing.
2041 *
2042 * There are four types of structures that can be looked up:
2043 *	1) pagedep structures identified by mount point, inode number,
2044 *	   and logical block.
2045 *	2) inodedep structures identified by mount point and inode number.
2046 *	3) newblk structures identified by mount point and
2047 *	   physical block number.
2048 *	4) bmsafemap structures identified by mount point and
2049 *	   cylinder group number.
2050 *
2051 * The "pagedep" and "inodedep" dependency structures are hashed
2052 * separately from the file blocks and inodes to which they correspond.
2053 * This separation helps when the in-memory copy of an inode or
2054 * file block must be replaced. It also obviates the need to access
2055 * an inode or file page when simply updating (or de-allocating)
2056 * dependency structures. Lookup of newblk structures is needed to
2057 * find newly allocated blocks when trying to associate them with
2058 * their allocdirect or allocindir structure.
2059 *
2060 * The lookup routines optionally create and hash a new instance when
2061 * an existing entry is not found. The bmsafemap lookup routine always
2062 * allocates a new structure if an existing one is not found.
2063 */
2064#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
2065
2066/*
2067 * Structures and routines associated with pagedep caching.
2068 */
2069#define	PAGEDEP_HASH(ump, inum, lbn) \
2070	(&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
2071
2072static int
2073pagedep_find(pagedephd, ino, lbn, pagedeppp)
2074	struct pagedep_hashhead *pagedephd;
2075	ino_t ino;
2076	ufs_lbn_t lbn;
2077	struct pagedep **pagedeppp;
2078{
2079	struct pagedep *pagedep;
2080
2081	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2082		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
2083			*pagedeppp = pagedep;
2084			return (1);
2085		}
2086	}
2087	*pagedeppp = NULL;
2088	return (0);
2089}
2090/*
2091 * Look up a pagedep. Return 1 if found, 0 otherwise.
2092 * If not found, allocate if DEPALLOC flag is passed.
2093 * Found or allocated entry is returned in pagedeppp.
2094 * This routine must be called with splbio interrupts blocked.
2095 */
2096static int
2097pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2098	struct mount *mp;
2099	struct buf *bp;
2100	ino_t ino;
2101	ufs_lbn_t lbn;
2102	int flags;
2103	struct pagedep **pagedeppp;
2104{
2105	struct pagedep *pagedep;
2106	struct pagedep_hashhead *pagedephd;
2107	struct worklist *wk;
2108	struct ufsmount *ump;
2109	int ret;
2110	int i;
2111
2112	ump = VFSTOUFS(mp);
2113	LOCK_OWNED(ump);
2114	if (bp) {
2115		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2116			if (wk->wk_type == D_PAGEDEP) {
2117				*pagedeppp = WK_PAGEDEP(wk);
2118				return (1);
2119			}
2120		}
2121	}
2122	pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2123	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2124	if (ret) {
2125		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2126			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2127		return (1);
2128	}
2129	if ((flags & DEPALLOC) == 0)
2130		return (0);
2131	FREE_LOCK(ump);
2132	pagedep = malloc(sizeof(struct pagedep),
2133	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2134	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2135	ACQUIRE_LOCK(ump);
2136	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2137	if (*pagedeppp) {
2138		/*
2139		 * This should never happen since we only create pagedeps
2140		 * with the vnode lock held.  Could be an assert.
2141		 */
2142		WORKITEM_FREE(pagedep, D_PAGEDEP);
2143		return (ret);
2144	}
2145	pagedep->pd_ino = ino;
2146	pagedep->pd_lbn = lbn;
2147	LIST_INIT(&pagedep->pd_dirremhd);
2148	LIST_INIT(&pagedep->pd_pendinghd);
2149	for (i = 0; i < DAHASHSZ; i++)
2150		LIST_INIT(&pagedep->pd_diraddhd[i]);
2151	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2152	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2153	*pagedeppp = pagedep;
2154	return (0);
2155}
2156
2157/*
2158 * Structures and routines associated with inodedep caching.
2159 */
2160#define	INODEDEP_HASH(ump, inum) \
2161      (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
2162
2163static int
2164inodedep_find(inodedephd, inum, inodedeppp)
2165	struct inodedep_hashhead *inodedephd;
2166	ino_t inum;
2167	struct inodedep **inodedeppp;
2168{
2169	struct inodedep *inodedep;
2170
2171	LIST_FOREACH(inodedep, inodedephd, id_hash)
2172		if (inum == inodedep->id_ino)
2173			break;
2174	if (inodedep) {
2175		*inodedeppp = inodedep;
2176		return (1);
2177	}
2178	*inodedeppp = NULL;
2179
2180	return (0);
2181}
2182/*
2183 * Look up an inodedep. Return 1 if found, 0 if not found.
2184 * If not found, allocate if DEPALLOC flag is passed.
2185 * Found or allocated entry is returned in inodedeppp.
2186 * This routine must be called with splbio interrupts blocked.
2187 */
2188static int
2189inodedep_lookup(mp, inum, flags, inodedeppp)
2190	struct mount *mp;
2191	ino_t inum;
2192	int flags;
2193	struct inodedep **inodedeppp;
2194{
2195	struct inodedep *inodedep;
2196	struct inodedep_hashhead *inodedephd;
2197	struct ufsmount *ump;
2198	struct fs *fs;
2199
2200	ump = VFSTOUFS(mp);
2201	LOCK_OWNED(ump);
2202	fs = ump->um_fs;
2203	inodedephd = INODEDEP_HASH(ump, inum);
2204
2205	if (inodedep_find(inodedephd, inum, inodedeppp))
2206		return (1);
2207	if ((flags & DEPALLOC) == 0)
2208		return (0);
2209	/*
2210	 * If the system is over its limit and our filesystem is
2211	 * responsible for more than our share of that usage and
2212	 * we are not in a rush, request some inodedep cleanup.
2213	 */
2214	if (softdep_excess_items(ump, D_INODEDEP))
2215		schedule_cleanup(mp);
2216	else
2217		FREE_LOCK(ump);
2218	inodedep = malloc(sizeof(struct inodedep),
2219		M_INODEDEP, M_SOFTDEP_FLAGS);
2220	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2221	ACQUIRE_LOCK(ump);
2222	if (inodedep_find(inodedephd, inum, inodedeppp)) {
2223		WORKITEM_FREE(inodedep, D_INODEDEP);
2224		return (1);
2225	}
2226	inodedep->id_fs = fs;
2227	inodedep->id_ino = inum;
2228	inodedep->id_state = ALLCOMPLETE;
2229	inodedep->id_nlinkdelta = 0;
2230	inodedep->id_savedino1 = NULL;
2231	inodedep->id_savedsize = -1;
2232	inodedep->id_savedextsize = -1;
2233	inodedep->id_savednlink = -1;
2234	inodedep->id_bmsafemap = NULL;
2235	inodedep->id_mkdiradd = NULL;
2236	LIST_INIT(&inodedep->id_dirremhd);
2237	LIST_INIT(&inodedep->id_pendinghd);
2238	LIST_INIT(&inodedep->id_inowait);
2239	LIST_INIT(&inodedep->id_bufwait);
2240	TAILQ_INIT(&inodedep->id_inoreflst);
2241	TAILQ_INIT(&inodedep->id_inoupdt);
2242	TAILQ_INIT(&inodedep->id_newinoupdt);
2243	TAILQ_INIT(&inodedep->id_extupdt);
2244	TAILQ_INIT(&inodedep->id_newextupdt);
2245	TAILQ_INIT(&inodedep->id_freeblklst);
2246	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2247	*inodedeppp = inodedep;
2248	return (0);
2249}
2250
2251/*
2252 * Structures and routines associated with newblk caching.
2253 */
2254#define	NEWBLK_HASH(ump, inum) \
2255	(&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
2256
2257static int
2258newblk_find(newblkhd, newblkno, flags, newblkpp)
2259	struct newblk_hashhead *newblkhd;
2260	ufs2_daddr_t newblkno;
2261	int flags;
2262	struct newblk **newblkpp;
2263{
2264	struct newblk *newblk;
2265
2266	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2267		if (newblkno != newblk->nb_newblkno)
2268			continue;
2269		/*
2270		 * If we're creating a new dependency don't match those that
2271		 * have already been converted to allocdirects.  This is for
2272		 * a frag extend.
2273		 */
2274		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2275			continue;
2276		break;
2277	}
2278	if (newblk) {
2279		*newblkpp = newblk;
2280		return (1);
2281	}
2282	*newblkpp = NULL;
2283	return (0);
2284}
2285
2286/*
2287 * Look up a newblk. Return 1 if found, 0 if not found.
2288 * If not found, allocate if DEPALLOC flag is passed.
2289 * Found or allocated entry is returned in newblkpp.
2290 */
2291static int
2292newblk_lookup(mp, newblkno, flags, newblkpp)
2293	struct mount *mp;
2294	ufs2_daddr_t newblkno;
2295	int flags;
2296	struct newblk **newblkpp;
2297{
2298	struct newblk *newblk;
2299	struct newblk_hashhead *newblkhd;
2300	struct ufsmount *ump;
2301
2302	ump = VFSTOUFS(mp);
2303	LOCK_OWNED(ump);
2304	newblkhd = NEWBLK_HASH(ump, newblkno);
2305	if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2306		return (1);
2307	if ((flags & DEPALLOC) == 0)
2308		return (0);
2309	if (softdep_excess_items(ump, D_NEWBLK) ||
2310	    softdep_excess_items(ump, D_ALLOCDIRECT) ||
2311	    softdep_excess_items(ump, D_ALLOCINDIR))
2312		schedule_cleanup(mp);
2313	else
2314		FREE_LOCK(ump);
2315	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2316	    M_SOFTDEP_FLAGS | M_ZERO);
2317	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2318	ACQUIRE_LOCK(ump);
2319	if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2320		WORKITEM_FREE(newblk, D_NEWBLK);
2321		return (1);
2322	}
2323	newblk->nb_freefrag = NULL;
2324	LIST_INIT(&newblk->nb_indirdeps);
2325	LIST_INIT(&newblk->nb_newdirblk);
2326	LIST_INIT(&newblk->nb_jwork);
2327	newblk->nb_state = ATTACHED;
2328	newblk->nb_newblkno = newblkno;
2329	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2330	*newblkpp = newblk;
2331	return (0);
2332}
2333
2334/*
2335 * Structures and routines associated with freed indirect block caching.
2336 */
2337#define	INDIR_HASH(ump, blkno) \
2338	(&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
2339
2340/*
2341 * Lookup an indirect block in the indir hash table.  The freework is
2342 * removed and potentially freed.  The caller must do a blocking journal
2343 * write before writing to the blkno.
2344 */
2345static int
2346indirblk_lookup(mp, blkno)
2347	struct mount *mp;
2348	ufs2_daddr_t blkno;
2349{
2350	struct freework *freework;
2351	struct indir_hashhead *wkhd;
2352	struct ufsmount *ump;
2353
2354	ump = VFSTOUFS(mp);
2355	wkhd = INDIR_HASH(ump, blkno);
2356	TAILQ_FOREACH(freework, wkhd, fw_next) {
2357		if (freework->fw_blkno != blkno)
2358			continue;
2359		indirblk_remove(freework);
2360		return (1);
2361	}
2362	return (0);
2363}
2364
2365/*
2366 * Insert an indirect block represented by freework into the indirblk
2367 * hash table so that it may prevent the block from being re-used prior
2368 * to the journal being written.
2369 */
2370static void
2371indirblk_insert(freework)
2372	struct freework *freework;
2373{
2374	struct jblocks *jblocks;
2375	struct jseg *jseg;
2376	struct ufsmount *ump;
2377
2378	ump = VFSTOUFS(freework->fw_list.wk_mp);
2379	jblocks = ump->softdep_jblocks;
2380	jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2381	if (jseg == NULL)
2382		return;
2383
2384	LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2385	TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2386	    fw_next);
2387	freework->fw_state &= ~DEPCOMPLETE;
2388}
2389
2390static void
2391indirblk_remove(freework)
2392	struct freework *freework;
2393{
2394	struct ufsmount *ump;
2395
2396	ump = VFSTOUFS(freework->fw_list.wk_mp);
2397	LIST_REMOVE(freework, fw_segs);
2398	TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2399	freework->fw_state |= DEPCOMPLETE;
2400	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2401		WORKITEM_FREE(freework, D_FREEWORK);
2402}
2403
2404/*
2405 * Executed during filesystem system initialization before
2406 * mounting any filesystems.
2407 */
2408void
2409softdep_initialize()
2410{
2411
2412	TAILQ_INIT(&softdepmounts);
2413#ifdef __LP64__
2414	max_softdeps = desiredvnodes * 4;
2415#else
2416	max_softdeps = desiredvnodes * 2;
2417#endif
2418
2419	/* initialise bioops hack */
2420	bioops.io_start = softdep_disk_io_initiation;
2421	bioops.io_complete = softdep_disk_write_complete;
2422	bioops.io_deallocate = softdep_deallocate_dependencies;
2423	bioops.io_countdeps = softdep_count_dependencies;
2424	softdep_ast_cleanup = softdep_ast_cleanup_proc;
2425
2426	/* Initialize the callout with an mtx. */
2427	callout_init_mtx(&softdep_callout, &lk, 0);
2428}
2429
2430/*
2431 * Executed after all filesystems have been unmounted during
2432 * filesystem module unload.
2433 */
2434void
2435softdep_uninitialize()
2436{
2437
2438	/* clear bioops hack */
2439	bioops.io_start = NULL;
2440	bioops.io_complete = NULL;
2441	bioops.io_deallocate = NULL;
2442	bioops.io_countdeps = NULL;
2443	softdep_ast_cleanup = NULL;
2444
2445	callout_drain(&softdep_callout);
2446}
2447
2448/*
2449 * Called at mount time to notify the dependency code that a
2450 * filesystem wishes to use it.
2451 */
2452int
2453softdep_mount(devvp, mp, fs, cred)
2454	struct vnode *devvp;
2455	struct mount *mp;
2456	struct fs *fs;
2457	struct ucred *cred;
2458{
2459	struct csum_total cstotal;
2460	struct mount_softdeps *sdp;
2461	struct ufsmount *ump;
2462	struct cg *cgp;
2463	struct buf *bp;
2464	int i, error, cyl;
2465
2466	sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
2467	    M_WAITOK | M_ZERO);
2468	MNT_ILOCK(mp);
2469	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2470	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2471		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2472			MNTK_SOFTDEP | MNTK_NOASYNC;
2473	}
2474	ump = VFSTOUFS(mp);
2475	ump->um_softdep = sdp;
2476	MNT_IUNLOCK(mp);
2477	rw_init(LOCK_PTR(ump), "Per-Filesystem Softdep Lock");
2478	sdp->sd_ump = ump;
2479	LIST_INIT(&ump->softdep_workitem_pending);
2480	LIST_INIT(&ump->softdep_journal_pending);
2481	TAILQ_INIT(&ump->softdep_unlinked);
2482	LIST_INIT(&ump->softdep_dirtycg);
2483	ump->softdep_worklist_tail = NULL;
2484	ump->softdep_on_worklist = 0;
2485	ump->softdep_deps = 0;
2486	LIST_INIT(&ump->softdep_mkdirlisthd);
2487	ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
2488	    &ump->pagedep_hash_size);
2489	ump->pagedep_nextclean = 0;
2490	ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
2491	    &ump->inodedep_hash_size);
2492	ump->inodedep_nextclean = 0;
2493	ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
2494	    &ump->newblk_hash_size);
2495	ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
2496	    &ump->bmsafemap_hash_size);
2497	i = 1 << (ffs(desiredvnodes / 10) - 1);
2498	ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
2499	    M_FREEWORK, M_WAITOK);
2500	ump->indir_hash_size = i - 1;
2501	for (i = 0; i <= ump->indir_hash_size; i++)
2502		TAILQ_INIT(&ump->indir_hashtbl[i]);
2503	ACQUIRE_GBLLOCK(&lk);
2504	TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
2505	FREE_GBLLOCK(&lk);
2506	if ((fs->fs_flags & FS_SUJ) &&
2507	    (error = journal_mount(mp, fs, cred)) != 0) {
2508		printf("Failed to start journal: %d\n", error);
2509		softdep_unmount(mp);
2510		return (error);
2511	}
2512	/*
2513	 * Start our flushing thread in the bufdaemon process.
2514	 */
2515	ACQUIRE_LOCK(ump);
2516	ump->softdep_flags |= FLUSH_STARTING;
2517	FREE_LOCK(ump);
2518	kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
2519	    &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
2520	    mp->mnt_stat.f_mntonname);
2521	ACQUIRE_LOCK(ump);
2522	while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
2523		msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
2524		    hz / 2);
2525	}
2526	FREE_LOCK(ump);
2527	/*
2528	 * When doing soft updates, the counters in the
2529	 * superblock may have gotten out of sync. Recomputation
2530	 * can take a long time and can be deferred for background
2531	 * fsck.  However, the old behavior of scanning the cylinder
2532	 * groups and recalculating them at mount time is available
2533	 * by setting vfs.ffs.compute_summary_at_mount to one.
2534	 */
2535	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2536		return (0);
2537	bzero(&cstotal, sizeof cstotal);
2538	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2539		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2540		    fs->fs_cgsize, cred, &bp)) != 0) {
2541			brelse(bp);
2542			softdep_unmount(mp);
2543			return (error);
2544		}
2545		cgp = (struct cg *)bp->b_data;
2546		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2547		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2548		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2549		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2550		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2551		brelse(bp);
2552	}
2553#ifdef DEBUG
2554	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2555		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2556#endif
2557	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2558	return (0);
2559}
2560
2561void
2562softdep_unmount(mp)
2563	struct mount *mp;
2564{
2565	struct ufsmount *ump;
2566#ifdef INVARIANTS
2567	int i;
2568#endif
2569
2570	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
2571	    ("softdep_unmount called on non-softdep filesystem"));
2572	ump = VFSTOUFS(mp);
2573	MNT_ILOCK(mp);
2574	mp->mnt_flag &= ~MNT_SOFTDEP;
2575	if (MOUNTEDSUJ(mp) == 0) {
2576		MNT_IUNLOCK(mp);
2577	} else {
2578		mp->mnt_flag &= ~MNT_SUJ;
2579		MNT_IUNLOCK(mp);
2580		journal_unmount(ump);
2581	}
2582	/*
2583	 * Shut down our flushing thread. Check for NULL is if
2584	 * softdep_mount errors out before the thread has been created.
2585	 */
2586	if (ump->softdep_flushtd != NULL) {
2587		ACQUIRE_LOCK(ump);
2588		ump->softdep_flags |= FLUSH_EXIT;
2589		wakeup(&ump->softdep_flushtd);
2590		msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
2591		    "sdwait", 0);
2592		KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
2593		    ("Thread shutdown failed"));
2594	}
2595	/*
2596	 * Free up our resources.
2597	 */
2598	ACQUIRE_GBLLOCK(&lk);
2599	TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
2600	FREE_GBLLOCK(&lk);
2601	rw_destroy(LOCK_PTR(ump));
2602	hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
2603	hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
2604	hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
2605	hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
2606	    ump->bmsafemap_hash_size);
2607	free(ump->indir_hashtbl, M_FREEWORK);
2608#ifdef INVARIANTS
2609	for (i = 0; i <= D_LAST; i++)
2610		KASSERT(ump->softdep_curdeps[i] == 0,
2611		    ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
2612		    TYPENAME(i), ump->softdep_curdeps[i]));
2613#endif
2614	free(ump->um_softdep, M_MOUNTDATA);
2615}
2616
2617static struct jblocks *
2618jblocks_create(void)
2619{
2620	struct jblocks *jblocks;
2621
2622	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2623	TAILQ_INIT(&jblocks->jb_segs);
2624	jblocks->jb_avail = 10;
2625	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2626	    M_JBLOCKS, M_WAITOK | M_ZERO);
2627
2628	return (jblocks);
2629}
2630
2631static ufs2_daddr_t
2632jblocks_alloc(jblocks, bytes, actual)
2633	struct jblocks *jblocks;
2634	int bytes;
2635	int *actual;
2636{
2637	ufs2_daddr_t daddr;
2638	struct jextent *jext;
2639	int freecnt;
2640	int blocks;
2641
2642	blocks = bytes / DEV_BSIZE;
2643	jext = &jblocks->jb_extent[jblocks->jb_head];
2644	freecnt = jext->je_blocks - jblocks->jb_off;
2645	if (freecnt == 0) {
2646		jblocks->jb_off = 0;
2647		if (++jblocks->jb_head > jblocks->jb_used)
2648			jblocks->jb_head = 0;
2649		jext = &jblocks->jb_extent[jblocks->jb_head];
2650		freecnt = jext->je_blocks;
2651	}
2652	if (freecnt > blocks)
2653		freecnt = blocks;
2654	*actual = freecnt * DEV_BSIZE;
2655	daddr = jext->je_daddr + jblocks->jb_off;
2656	jblocks->jb_off += freecnt;
2657	jblocks->jb_free -= freecnt;
2658
2659	return (daddr);
2660}
2661
2662static void
2663jblocks_free(jblocks, mp, bytes)
2664	struct jblocks *jblocks;
2665	struct mount *mp;
2666	int bytes;
2667{
2668
2669	LOCK_OWNED(VFSTOUFS(mp));
2670	jblocks->jb_free += bytes / DEV_BSIZE;
2671	if (jblocks->jb_suspended)
2672		worklist_speedup(mp);
2673	wakeup(jblocks);
2674}
2675
2676static void
2677jblocks_destroy(jblocks)
2678	struct jblocks *jblocks;
2679{
2680
2681	if (jblocks->jb_extent)
2682		free(jblocks->jb_extent, M_JBLOCKS);
2683	free(jblocks, M_JBLOCKS);
2684}
2685
2686static void
2687jblocks_add(jblocks, daddr, blocks)
2688	struct jblocks *jblocks;
2689	ufs2_daddr_t daddr;
2690	int blocks;
2691{
2692	struct jextent *jext;
2693
2694	jblocks->jb_blocks += blocks;
2695	jblocks->jb_free += blocks;
2696	jext = &jblocks->jb_extent[jblocks->jb_used];
2697	/* Adding the first block. */
2698	if (jext->je_daddr == 0) {
2699		jext->je_daddr = daddr;
2700		jext->je_blocks = blocks;
2701		return;
2702	}
2703	/* Extending the last extent. */
2704	if (jext->je_daddr + jext->je_blocks == daddr) {
2705		jext->je_blocks += blocks;
2706		return;
2707	}
2708	/* Adding a new extent. */
2709	if (++jblocks->jb_used == jblocks->jb_avail) {
2710		jblocks->jb_avail *= 2;
2711		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2712		    M_JBLOCKS, M_WAITOK | M_ZERO);
2713		memcpy(jext, jblocks->jb_extent,
2714		    sizeof(struct jextent) * jblocks->jb_used);
2715		free(jblocks->jb_extent, M_JBLOCKS);
2716		jblocks->jb_extent = jext;
2717	}
2718	jext = &jblocks->jb_extent[jblocks->jb_used];
2719	jext->je_daddr = daddr;
2720	jext->je_blocks = blocks;
2721	return;
2722}
2723
2724int
2725softdep_journal_lookup(mp, vpp)
2726	struct mount *mp;
2727	struct vnode **vpp;
2728{
2729	struct componentname cnp;
2730	struct vnode *dvp;
2731	ino_t sujournal;
2732	int error;
2733
2734	error = VFS_VGET(mp, ROOTINO, LK_EXCLUSIVE, &dvp);
2735	if (error)
2736		return (error);
2737	bzero(&cnp, sizeof(cnp));
2738	cnp.cn_nameiop = LOOKUP;
2739	cnp.cn_flags = ISLASTCN;
2740	cnp.cn_thread = curthread;
2741	cnp.cn_cred = curthread->td_ucred;
2742	cnp.cn_pnbuf = SUJ_FILE;
2743	cnp.cn_nameptr = SUJ_FILE;
2744	cnp.cn_namelen = strlen(SUJ_FILE);
2745	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2746	vput(dvp);
2747	if (error != 0)
2748		return (error);
2749	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2750	return (error);
2751}
2752
2753/*
2754 * Open and verify the journal file.
2755 */
2756static int
2757journal_mount(mp, fs, cred)
2758	struct mount *mp;
2759	struct fs *fs;
2760	struct ucred *cred;
2761{
2762	struct jblocks *jblocks;
2763	struct ufsmount *ump;
2764	struct vnode *vp;
2765	struct inode *ip;
2766	ufs2_daddr_t blkno;
2767	int bcount;
2768	int error;
2769	int i;
2770
2771	ump = VFSTOUFS(mp);
2772	ump->softdep_journal_tail = NULL;
2773	ump->softdep_on_journal = 0;
2774	ump->softdep_accdeps = 0;
2775	ump->softdep_req = 0;
2776	ump->softdep_jblocks = NULL;
2777	error = softdep_journal_lookup(mp, &vp);
2778	if (error != 0) {
2779		printf("Failed to find journal.  Use tunefs to create one\n");
2780		return (error);
2781	}
2782	ip = VTOI(vp);
2783	if (ip->i_size < SUJ_MIN) {
2784		error = ENOSPC;
2785		goto out;
2786	}
2787	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
2788	jblocks = jblocks_create();
2789	for (i = 0; i < bcount; i++) {
2790		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
2791		if (error)
2792			break;
2793		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
2794	}
2795	if (error) {
2796		jblocks_destroy(jblocks);
2797		goto out;
2798	}
2799	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
2800	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
2801	ump->softdep_jblocks = jblocks;
2802out:
2803	if (error == 0) {
2804		MNT_ILOCK(mp);
2805		mp->mnt_flag |= MNT_SUJ;
2806		mp->mnt_flag &= ~MNT_SOFTDEP;
2807		MNT_IUNLOCK(mp);
2808		/*
2809		 * Only validate the journal contents if the
2810		 * filesystem is clean, otherwise we write the logs
2811		 * but they'll never be used.  If the filesystem was
2812		 * still dirty when we mounted it the journal is
2813		 * invalid and a new journal can only be valid if it
2814		 * starts from a clean mount.
2815		 */
2816		if (fs->fs_clean) {
2817			DIP_SET(ip, i_modrev, fs->fs_mtime);
2818			ip->i_flags |= IN_MODIFIED;
2819			ffs_update(vp, 1);
2820		}
2821	}
2822	vput(vp);
2823	return (error);
2824}
2825
2826static void
2827journal_unmount(ump)
2828	struct ufsmount *ump;
2829{
2830
2831	if (ump->softdep_jblocks)
2832		jblocks_destroy(ump->softdep_jblocks);
2833	ump->softdep_jblocks = NULL;
2834}
2835
2836/*
2837 * Called when a journal record is ready to be written.  Space is allocated
2838 * and the journal entry is created when the journal is flushed to stable
2839 * store.
2840 */
2841static void
2842add_to_journal(wk)
2843	struct worklist *wk;
2844{
2845	struct ufsmount *ump;
2846
2847	ump = VFSTOUFS(wk->wk_mp);
2848	LOCK_OWNED(ump);
2849	if (wk->wk_state & ONWORKLIST)
2850		panic("add_to_journal: %s(0x%X) already on list",
2851		    TYPENAME(wk->wk_type), wk->wk_state);
2852	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
2853	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
2854		ump->softdep_jblocks->jb_age = ticks;
2855		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
2856	} else
2857		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
2858	ump->softdep_journal_tail = wk;
2859	ump->softdep_on_journal += 1;
2860}
2861
2862/*
2863 * Remove an arbitrary item for the journal worklist maintain the tail
2864 * pointer.  This happens when a new operation obviates the need to
2865 * journal an old operation.
2866 */
2867static void
2868remove_from_journal(wk)
2869	struct worklist *wk;
2870{
2871	struct ufsmount *ump;
2872
2873	ump = VFSTOUFS(wk->wk_mp);
2874	LOCK_OWNED(ump);
2875#ifdef SUJ_DEBUG
2876	{
2877		struct worklist *wkn;
2878
2879		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
2880			if (wkn == wk)
2881				break;
2882		if (wkn == NULL)
2883			panic("remove_from_journal: %p is not in journal", wk);
2884	}
2885#endif
2886	/*
2887	 * We emulate a TAILQ to save space in most structures which do not
2888	 * require TAILQ semantics.  Here we must update the tail position
2889	 * when removing the tail which is not the final entry. This works
2890	 * only if the worklist linkage are at the beginning of the structure.
2891	 */
2892	if (ump->softdep_journal_tail == wk)
2893		ump->softdep_journal_tail =
2894		    (struct worklist *)wk->wk_list.le_prev;
2895
2896	WORKLIST_REMOVE(wk);
2897	ump->softdep_on_journal -= 1;
2898}
2899
2900/*
2901 * Check for journal space as well as dependency limits so the prelink
2902 * code can throttle both journaled and non-journaled filesystems.
2903 * Threshold is 0 for low and 1 for min.
2904 */
2905static int
2906journal_space(ump, thresh)
2907	struct ufsmount *ump;
2908	int thresh;
2909{
2910	struct jblocks *jblocks;
2911	int limit, avail;
2912
2913	jblocks = ump->softdep_jblocks;
2914	if (jblocks == NULL)
2915		return (1);
2916	/*
2917	 * We use a tighter restriction here to prevent request_cleanup()
2918	 * running in threads from running into locks we currently hold.
2919	 * We have to be over the limit and our filesystem has to be
2920	 * responsible for more than our share of that usage.
2921	 */
2922	limit = (max_softdeps / 10) * 9;
2923	if (dep_current[D_INODEDEP] > limit &&
2924	    ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
2925		return (0);
2926	if (thresh)
2927		thresh = jblocks->jb_min;
2928	else
2929		thresh = jblocks->jb_low;
2930	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
2931	avail = jblocks->jb_free - avail;
2932
2933	return (avail > thresh);
2934}
2935
2936static void
2937journal_suspend(ump)
2938	struct ufsmount *ump;
2939{
2940	struct jblocks *jblocks;
2941	struct mount *mp;
2942
2943	mp = UFSTOVFS(ump);
2944	jblocks = ump->softdep_jblocks;
2945	MNT_ILOCK(mp);
2946	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
2947		stat_journal_min++;
2948		mp->mnt_kern_flag |= MNTK_SUSPEND;
2949		mp->mnt_susp_owner = ump->softdep_flushtd;
2950	}
2951	jblocks->jb_suspended = 1;
2952	MNT_IUNLOCK(mp);
2953}
2954
2955static int
2956journal_unsuspend(struct ufsmount *ump)
2957{
2958	struct jblocks *jblocks;
2959	struct mount *mp;
2960
2961	mp = UFSTOVFS(ump);
2962	jblocks = ump->softdep_jblocks;
2963
2964	if (jblocks != NULL && jblocks->jb_suspended &&
2965	    journal_space(ump, jblocks->jb_min)) {
2966		jblocks->jb_suspended = 0;
2967		FREE_LOCK(ump);
2968		mp->mnt_susp_owner = curthread;
2969		vfs_write_resume(mp, 0);
2970		ACQUIRE_LOCK(ump);
2971		return (1);
2972	}
2973	return (0);
2974}
2975
2976/*
2977 * Called before any allocation function to be certain that there is
2978 * sufficient space in the journal prior to creating any new records.
2979 * Since in the case of block allocation we may have multiple locked
2980 * buffers at the time of the actual allocation we can not block
2981 * when the journal records are created.  Doing so would create a deadlock
2982 * if any of these buffers needed to be flushed to reclaim space.  Instead
2983 * we require a sufficiently large amount of available space such that
2984 * each thread in the system could have passed this allocation check and
2985 * still have sufficient free space.  With 20% of a minimum journal size
2986 * of 1MB we have 6553 records available.
2987 */
2988int
2989softdep_prealloc(vp, waitok)
2990	struct vnode *vp;
2991	int waitok;
2992{
2993	struct ufsmount *ump;
2994
2995	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
2996	    ("softdep_prealloc called on non-softdep filesystem"));
2997	/*
2998	 * Nothing to do if we are not running journaled soft updates.
2999	 * If we currently hold the snapshot lock, we must avoid
3000	 * handling other resources that could cause deadlock.  Do not
3001	 * touch quotas vnode since it is typically recursed with
3002	 * other vnode locks held.
3003	 */
3004	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
3005	    (vp->v_vflag & VV_SYSTEM) != 0)
3006		return (0);
3007	ump = VFSTOUFS(vp->v_mount);
3008	ACQUIRE_LOCK(ump);
3009	if (journal_space(ump, 0)) {
3010		FREE_LOCK(ump);
3011		return (0);
3012	}
3013	stat_journal_low++;
3014	FREE_LOCK(ump);
3015	if (waitok == MNT_NOWAIT)
3016		return (ENOSPC);
3017	/*
3018	 * Attempt to sync this vnode once to flush any journal
3019	 * work attached to it.
3020	 */
3021	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
3022		ffs_syncvnode(vp, waitok, 0);
3023	ACQUIRE_LOCK(ump);
3024	process_removes(vp);
3025	process_truncates(vp);
3026	if (journal_space(ump, 0) == 0) {
3027		softdep_speedup(ump);
3028		if (journal_space(ump, 1) == 0)
3029			journal_suspend(ump);
3030	}
3031	FREE_LOCK(ump);
3032
3033	return (0);
3034}
3035
3036/*
3037 * Before adjusting a link count on a vnode verify that we have sufficient
3038 * journal space.  If not, process operations that depend on the currently
3039 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
3040 * and softdep flush threads can not acquire these locks to reclaim space.
3041 */
3042static void
3043softdep_prelink(dvp, vp)
3044	struct vnode *dvp;
3045	struct vnode *vp;
3046{
3047	struct ufsmount *ump;
3048
3049	ump = VFSTOUFS(dvp->v_mount);
3050	LOCK_OWNED(ump);
3051	/*
3052	 * Nothing to do if we have sufficient journal space.
3053	 * If we currently hold the snapshot lock, we must avoid
3054	 * handling other resources that could cause deadlock.
3055	 */
3056	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
3057		return;
3058	stat_journal_low++;
3059	FREE_LOCK(ump);
3060	if (vp)
3061		ffs_syncvnode(vp, MNT_NOWAIT, 0);
3062	ffs_syncvnode(dvp, MNT_WAIT, 0);
3063	ACQUIRE_LOCK(ump);
3064	/* Process vp before dvp as it may create .. removes. */
3065	if (vp) {
3066		process_removes(vp);
3067		process_truncates(vp);
3068	}
3069	process_removes(dvp);
3070	process_truncates(dvp);
3071	softdep_speedup(ump);
3072	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3073	if (journal_space(ump, 0) == 0) {
3074		softdep_speedup(ump);
3075		if (journal_space(ump, 1) == 0)
3076			journal_suspend(ump);
3077	}
3078}
3079
3080static void
3081jseg_write(ump, jseg, data)
3082	struct ufsmount *ump;
3083	struct jseg *jseg;
3084	uint8_t *data;
3085{
3086	struct jsegrec *rec;
3087
3088	rec = (struct jsegrec *)data;
3089	rec->jsr_seq = jseg->js_seq;
3090	rec->jsr_oldest = jseg->js_oldseq;
3091	rec->jsr_cnt = jseg->js_cnt;
3092	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
3093	rec->jsr_crc = 0;
3094	rec->jsr_time = ump->um_fs->fs_mtime;
3095}
3096
3097static inline void
3098inoref_write(inoref, jseg, rec)
3099	struct inoref *inoref;
3100	struct jseg *jseg;
3101	struct jrefrec *rec;
3102{
3103
3104	inoref->if_jsegdep->jd_seg = jseg;
3105	rec->jr_ino = inoref->if_ino;
3106	rec->jr_parent = inoref->if_parent;
3107	rec->jr_nlink = inoref->if_nlink;
3108	rec->jr_mode = inoref->if_mode;
3109	rec->jr_diroff = inoref->if_diroff;
3110}
3111
3112static void
3113jaddref_write(jaddref, jseg, data)
3114	struct jaddref *jaddref;
3115	struct jseg *jseg;
3116	uint8_t *data;
3117{
3118	struct jrefrec *rec;
3119
3120	rec = (struct jrefrec *)data;
3121	rec->jr_op = JOP_ADDREF;
3122	inoref_write(&jaddref->ja_ref, jseg, rec);
3123}
3124
3125static void
3126jremref_write(jremref, jseg, data)
3127	struct jremref *jremref;
3128	struct jseg *jseg;
3129	uint8_t *data;
3130{
3131	struct jrefrec *rec;
3132
3133	rec = (struct jrefrec *)data;
3134	rec->jr_op = JOP_REMREF;
3135	inoref_write(&jremref->jr_ref, jseg, rec);
3136}
3137
3138static void
3139jmvref_write(jmvref, jseg, data)
3140	struct jmvref *jmvref;
3141	struct jseg *jseg;
3142	uint8_t *data;
3143{
3144	struct jmvrec *rec;
3145
3146	rec = (struct jmvrec *)data;
3147	rec->jm_op = JOP_MVREF;
3148	rec->jm_ino = jmvref->jm_ino;
3149	rec->jm_parent = jmvref->jm_parent;
3150	rec->jm_oldoff = jmvref->jm_oldoff;
3151	rec->jm_newoff = jmvref->jm_newoff;
3152}
3153
3154static void
3155jnewblk_write(jnewblk, jseg, data)
3156	struct jnewblk *jnewblk;
3157	struct jseg *jseg;
3158	uint8_t *data;
3159{
3160	struct jblkrec *rec;
3161
3162	jnewblk->jn_jsegdep->jd_seg = jseg;
3163	rec = (struct jblkrec *)data;
3164	rec->jb_op = JOP_NEWBLK;
3165	rec->jb_ino = jnewblk->jn_ino;
3166	rec->jb_blkno = jnewblk->jn_blkno;
3167	rec->jb_lbn = jnewblk->jn_lbn;
3168	rec->jb_frags = jnewblk->jn_frags;
3169	rec->jb_oldfrags = jnewblk->jn_oldfrags;
3170}
3171
3172static void
3173jfreeblk_write(jfreeblk, jseg, data)
3174	struct jfreeblk *jfreeblk;
3175	struct jseg *jseg;
3176	uint8_t *data;
3177{
3178	struct jblkrec *rec;
3179
3180	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3181	rec = (struct jblkrec *)data;
3182	rec->jb_op = JOP_FREEBLK;
3183	rec->jb_ino = jfreeblk->jf_ino;
3184	rec->jb_blkno = jfreeblk->jf_blkno;
3185	rec->jb_lbn = jfreeblk->jf_lbn;
3186	rec->jb_frags = jfreeblk->jf_frags;
3187	rec->jb_oldfrags = 0;
3188}
3189
3190static void
3191jfreefrag_write(jfreefrag, jseg, data)
3192	struct jfreefrag *jfreefrag;
3193	struct jseg *jseg;
3194	uint8_t *data;
3195{
3196	struct jblkrec *rec;
3197
3198	jfreefrag->fr_jsegdep->jd_seg = jseg;
3199	rec = (struct jblkrec *)data;
3200	rec->jb_op = JOP_FREEBLK;
3201	rec->jb_ino = jfreefrag->fr_ino;
3202	rec->jb_blkno = jfreefrag->fr_blkno;
3203	rec->jb_lbn = jfreefrag->fr_lbn;
3204	rec->jb_frags = jfreefrag->fr_frags;
3205	rec->jb_oldfrags = 0;
3206}
3207
3208static void
3209jtrunc_write(jtrunc, jseg, data)
3210	struct jtrunc *jtrunc;
3211	struct jseg *jseg;
3212	uint8_t *data;
3213{
3214	struct jtrncrec *rec;
3215
3216	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3217	rec = (struct jtrncrec *)data;
3218	rec->jt_op = JOP_TRUNC;
3219	rec->jt_ino = jtrunc->jt_ino;
3220	rec->jt_size = jtrunc->jt_size;
3221	rec->jt_extsize = jtrunc->jt_extsize;
3222}
3223
3224static void
3225jfsync_write(jfsync, jseg, data)
3226	struct jfsync *jfsync;
3227	struct jseg *jseg;
3228	uint8_t *data;
3229{
3230	struct jtrncrec *rec;
3231
3232	rec = (struct jtrncrec *)data;
3233	rec->jt_op = JOP_SYNC;
3234	rec->jt_ino = jfsync->jfs_ino;
3235	rec->jt_size = jfsync->jfs_size;
3236	rec->jt_extsize = jfsync->jfs_extsize;
3237}
3238
3239static void
3240softdep_flushjournal(mp)
3241	struct mount *mp;
3242{
3243	struct jblocks *jblocks;
3244	struct ufsmount *ump;
3245
3246	if (MOUNTEDSUJ(mp) == 0)
3247		return;
3248	ump = VFSTOUFS(mp);
3249	jblocks = ump->softdep_jblocks;
3250	ACQUIRE_LOCK(ump);
3251	while (ump->softdep_on_journal) {
3252		jblocks->jb_needseg = 1;
3253		softdep_process_journal(mp, NULL, MNT_WAIT);
3254	}
3255	FREE_LOCK(ump);
3256}
3257
3258static void softdep_synchronize_completed(struct bio *);
3259static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3260
3261static void
3262softdep_synchronize_completed(bp)
3263        struct bio *bp;
3264{
3265	struct jseg *oldest;
3266	struct jseg *jseg;
3267	struct ufsmount *ump;
3268
3269	/*
3270	 * caller1 marks the last segment written before we issued the
3271	 * synchronize cache.
3272	 */
3273	jseg = bp->bio_caller1;
3274	if (jseg == NULL) {
3275		g_destroy_bio(bp);
3276		return;
3277	}
3278	ump = VFSTOUFS(jseg->js_list.wk_mp);
3279	ACQUIRE_LOCK(ump);
3280	oldest = NULL;
3281	/*
3282	 * Mark all the journal entries waiting on the synchronize cache
3283	 * as completed so they may continue on.
3284	 */
3285	while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3286		jseg->js_state |= COMPLETE;
3287		oldest = jseg;
3288		jseg = TAILQ_PREV(jseg, jseglst, js_next);
3289	}
3290	/*
3291	 * Restart deferred journal entry processing from the oldest
3292	 * completed jseg.
3293	 */
3294	if (oldest)
3295		complete_jsegs(oldest);
3296
3297	FREE_LOCK(ump);
3298	g_destroy_bio(bp);
3299}
3300
3301/*
3302 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3303 * barriers.  The journal must be written prior to any blocks that depend
3304 * on it and the journal can not be released until the blocks have be
3305 * written.  This code handles both barriers simultaneously.
3306 */
3307static void
3308softdep_synchronize(bp, ump, caller1)
3309	struct bio *bp;
3310	struct ufsmount *ump;
3311	void *caller1;
3312{
3313
3314	bp->bio_cmd = BIO_FLUSH;
3315	bp->bio_flags |= BIO_ORDERED;
3316	bp->bio_data = NULL;
3317	bp->bio_offset = ump->um_cp->provider->mediasize;
3318	bp->bio_length = 0;
3319	bp->bio_done = softdep_synchronize_completed;
3320	bp->bio_caller1 = caller1;
3321	g_io_request(bp,
3322	    (struct g_consumer *)ump->um_devvp->v_bufobj.bo_private);
3323}
3324
3325/*
3326 * Flush some journal records to disk.
3327 */
3328static void
3329softdep_process_journal(mp, needwk, flags)
3330	struct mount *mp;
3331	struct worklist *needwk;
3332	int flags;
3333{
3334	struct jblocks *jblocks;
3335	struct ufsmount *ump;
3336	struct worklist *wk;
3337	struct jseg *jseg;
3338	struct buf *bp;
3339	struct bio *bio;
3340	uint8_t *data;
3341	struct fs *fs;
3342	int shouldflush;
3343	int segwritten;
3344	int jrecmin;	/* Minimum records per block. */
3345	int jrecmax;	/* Maximum records per block. */
3346	int size;
3347	int cnt;
3348	int off;
3349	int devbsize;
3350
3351	if (MOUNTEDSUJ(mp) == 0)
3352		return;
3353	shouldflush = softdep_flushcache;
3354	bio = NULL;
3355	jseg = NULL;
3356	ump = VFSTOUFS(mp);
3357	LOCK_OWNED(ump);
3358	fs = ump->um_fs;
3359	jblocks = ump->softdep_jblocks;
3360	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3361	/*
3362	 * We write anywhere between a disk block and fs block.  The upper
3363	 * bound is picked to prevent buffer cache fragmentation and limit
3364	 * processing time per I/O.
3365	 */
3366	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3367	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
3368	segwritten = 0;
3369	for (;;) {
3370		cnt = ump->softdep_on_journal;
3371		/*
3372		 * Criteria for writing a segment:
3373		 * 1) We have a full block.
3374		 * 2) We're called from jwait() and haven't found the
3375		 *    journal item yet.
3376		 * 3) Always write if needseg is set.
3377		 * 4) If we are called from process_worklist and have
3378		 *    not yet written anything we write a partial block
3379		 *    to enforce a 1 second maximum latency on journal
3380		 *    entries.
3381		 */
3382		if (cnt < (jrecmax - 1) && needwk == NULL &&
3383		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3384			break;
3385		cnt++;
3386		/*
3387		 * Verify some free journal space.  softdep_prealloc() should
3388		 * guarantee that we don't run out so this is indicative of
3389		 * a problem with the flow control.  Try to recover
3390		 * gracefully in any event.
3391		 */
3392		while (jblocks->jb_free == 0) {
3393			if (flags != MNT_WAIT)
3394				break;
3395			printf("softdep: Out of journal space!\n");
3396			softdep_speedup(ump);
3397			msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3398		}
3399		FREE_LOCK(ump);
3400		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3401		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3402		LIST_INIT(&jseg->js_entries);
3403		LIST_INIT(&jseg->js_indirs);
3404		jseg->js_state = ATTACHED;
3405		if (shouldflush == 0)
3406			jseg->js_state |= COMPLETE;
3407		else if (bio == NULL)
3408			bio = g_alloc_bio();
3409		jseg->js_jblocks = jblocks;
3410		bp = geteblk(fs->fs_bsize, 0);
3411		ACQUIRE_LOCK(ump);
3412		/*
3413		 * If there was a race while we were allocating the block
3414		 * and jseg the entry we care about was likely written.
3415		 * We bail out in both the WAIT and NOWAIT case and assume
3416		 * the caller will loop if the entry it cares about is
3417		 * not written.
3418		 */
3419		cnt = ump->softdep_on_journal;
3420		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3421			bp->b_flags |= B_INVAL | B_NOCACHE;
3422			WORKITEM_FREE(jseg, D_JSEG);
3423			FREE_LOCK(ump);
3424			brelse(bp);
3425			ACQUIRE_LOCK(ump);
3426			break;
3427		}
3428		/*
3429		 * Calculate the disk block size required for the available
3430		 * records rounded to the min size.
3431		 */
3432		if (cnt == 0)
3433			size = devbsize;
3434		else if (cnt < jrecmax)
3435			size = howmany(cnt, jrecmin) * devbsize;
3436		else
3437			size = fs->fs_bsize;
3438		/*
3439		 * Allocate a disk block for this journal data and account
3440		 * for truncation of the requested size if enough contiguous
3441		 * space was not available.
3442		 */
3443		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3444		bp->b_lblkno = bp->b_blkno;
3445		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3446		bp->b_bcount = size;
3447		bp->b_flags &= ~B_INVAL;
3448		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3449		/*
3450		 * Initialize our jseg with cnt records.  Assign the next
3451		 * sequence number to it and link it in-order.
3452		 */
3453		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3454		jseg->js_buf = bp;
3455		jseg->js_cnt = cnt;
3456		jseg->js_refs = cnt + 1;	/* Self ref. */
3457		jseg->js_size = size;
3458		jseg->js_seq = jblocks->jb_nextseq++;
3459		if (jblocks->jb_oldestseg == NULL)
3460			jblocks->jb_oldestseg = jseg;
3461		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3462		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3463		if (jblocks->jb_writeseg == NULL)
3464			jblocks->jb_writeseg = jseg;
3465		/*
3466		 * Start filling in records from the pending list.
3467		 */
3468		data = bp->b_data;
3469		off = 0;
3470
3471		/*
3472		 * Always put a header on the first block.
3473		 * XXX As with below, there might not be a chance to get
3474		 * into the loop.  Ensure that something valid is written.
3475		 */
3476		jseg_write(ump, jseg, data);
3477		off += JREC_SIZE;
3478		data = bp->b_data + off;
3479
3480		/*
3481		 * XXX Something is wrong here.  There's no work to do,
3482		 * but we need to perform and I/O and allow it to complete
3483		 * anyways.
3484		 */
3485		if (LIST_EMPTY(&ump->softdep_journal_pending))
3486			stat_emptyjblocks++;
3487
3488		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3489		    != NULL) {
3490			if (cnt == 0)
3491				break;
3492			/* Place a segment header on every device block. */
3493			if ((off % devbsize) == 0) {
3494				jseg_write(ump, jseg, data);
3495				off += JREC_SIZE;
3496				data = bp->b_data + off;
3497			}
3498			if (wk == needwk)
3499				needwk = NULL;
3500			remove_from_journal(wk);
3501			wk->wk_state |= INPROGRESS;
3502			WORKLIST_INSERT(&jseg->js_entries, wk);
3503			switch (wk->wk_type) {
3504			case D_JADDREF:
3505				jaddref_write(WK_JADDREF(wk), jseg, data);
3506				break;
3507			case D_JREMREF:
3508				jremref_write(WK_JREMREF(wk), jseg, data);
3509				break;
3510			case D_JMVREF:
3511				jmvref_write(WK_JMVREF(wk), jseg, data);
3512				break;
3513			case D_JNEWBLK:
3514				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3515				break;
3516			case D_JFREEBLK:
3517				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3518				break;
3519			case D_JFREEFRAG:
3520				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3521				break;
3522			case D_JTRUNC:
3523				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3524				break;
3525			case D_JFSYNC:
3526				jfsync_write(WK_JFSYNC(wk), jseg, data);
3527				break;
3528			default:
3529				panic("process_journal: Unknown type %s",
3530				    TYPENAME(wk->wk_type));
3531				/* NOTREACHED */
3532			}
3533			off += JREC_SIZE;
3534			data = bp->b_data + off;
3535			cnt--;
3536		}
3537
3538		/* Clear any remaining space so we don't leak kernel data */
3539		if (size > off)
3540			bzero(data, size - off);
3541
3542		/*
3543		 * Write this one buffer and continue.
3544		 */
3545		segwritten = 1;
3546		jblocks->jb_needseg = 0;
3547		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3548		FREE_LOCK(ump);
3549		pbgetvp(ump->um_devvp, bp);
3550		/*
3551		 * We only do the blocking wait once we find the journal
3552		 * entry we're looking for.
3553		 */
3554		if (needwk == NULL && flags == MNT_WAIT)
3555			bwrite(bp);
3556		else
3557			bawrite(bp);
3558		ACQUIRE_LOCK(ump);
3559	}
3560	/*
3561	 * If we wrote a segment issue a synchronize cache so the journal
3562	 * is reflected on disk before the data is written.  Since reclaiming
3563	 * journal space also requires writing a journal record this
3564	 * process also enforces a barrier before reclamation.
3565	 */
3566	if (segwritten && shouldflush) {
3567		softdep_synchronize(bio, ump,
3568		    TAILQ_LAST(&jblocks->jb_segs, jseglst));
3569	} else if (bio)
3570		g_destroy_bio(bio);
3571	/*
3572	 * If we've suspended the filesystem because we ran out of journal
3573	 * space either try to sync it here to make some progress or
3574	 * unsuspend it if we already have.
3575	 */
3576	if (flags == 0 && jblocks->jb_suspended) {
3577		if (journal_unsuspend(ump))
3578			return;
3579		FREE_LOCK(ump);
3580		VFS_SYNC(mp, MNT_NOWAIT);
3581		ffs_sbupdate(ump, MNT_WAIT, 0);
3582		ACQUIRE_LOCK(ump);
3583	}
3584}
3585
3586/*
3587 * Complete a jseg, allowing all dependencies awaiting journal writes
3588 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3589 * structures so that the journal segment can be freed to reclaim space.
3590 */
3591static void
3592complete_jseg(jseg)
3593	struct jseg *jseg;
3594{
3595	struct worklist *wk;
3596	struct jmvref *jmvref;
3597	int waiting;
3598#ifdef INVARIANTS
3599	int i = 0;
3600#endif
3601
3602	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3603		WORKLIST_REMOVE(wk);
3604		waiting = wk->wk_state & IOWAITING;
3605		wk->wk_state &= ~(INPROGRESS | IOWAITING);
3606		wk->wk_state |= COMPLETE;
3607		KASSERT(i++ < jseg->js_cnt,
3608		    ("handle_written_jseg: overflow %d >= %d",
3609		    i - 1, jseg->js_cnt));
3610		switch (wk->wk_type) {
3611		case D_JADDREF:
3612			handle_written_jaddref(WK_JADDREF(wk));
3613			break;
3614		case D_JREMREF:
3615			handle_written_jremref(WK_JREMREF(wk));
3616			break;
3617		case D_JMVREF:
3618			rele_jseg(jseg);	/* No jsegdep. */
3619			jmvref = WK_JMVREF(wk);
3620			LIST_REMOVE(jmvref, jm_deps);
3621			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3622				free_pagedep(jmvref->jm_pagedep);
3623			WORKITEM_FREE(jmvref, D_JMVREF);
3624			break;
3625		case D_JNEWBLK:
3626			handle_written_jnewblk(WK_JNEWBLK(wk));
3627			break;
3628		case D_JFREEBLK:
3629			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3630			break;
3631		case D_JTRUNC:
3632			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3633			break;
3634		case D_JFSYNC:
3635			rele_jseg(jseg);	/* No jsegdep. */
3636			WORKITEM_FREE(wk, D_JFSYNC);
3637			break;
3638		case D_JFREEFRAG:
3639			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3640			break;
3641		default:
3642			panic("handle_written_jseg: Unknown type %s",
3643			    TYPENAME(wk->wk_type));
3644			/* NOTREACHED */
3645		}
3646		if (waiting)
3647			wakeup(wk);
3648	}
3649	/* Release the self reference so the structure may be freed. */
3650	rele_jseg(jseg);
3651}
3652
3653/*
3654 * Determine which jsegs are ready for completion processing.  Waits for
3655 * synchronize cache to complete as well as forcing in-order completion
3656 * of journal entries.
3657 */
3658static void
3659complete_jsegs(jseg)
3660	struct jseg *jseg;
3661{
3662	struct jblocks *jblocks;
3663	struct jseg *jsegn;
3664
3665	jblocks = jseg->js_jblocks;
3666	/*
3667	 * Don't allow out of order completions.  If this isn't the first
3668	 * block wait for it to write before we're done.
3669	 */
3670	if (jseg != jblocks->jb_writeseg)
3671		return;
3672	/* Iterate through available jsegs processing their entries. */
3673	while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
3674		jblocks->jb_oldestwrseq = jseg->js_oldseq;
3675		jsegn = TAILQ_NEXT(jseg, js_next);
3676		complete_jseg(jseg);
3677		jseg = jsegn;
3678	}
3679	jblocks->jb_writeseg = jseg;
3680	/*
3681	 * Attempt to free jsegs now that oldestwrseq may have advanced.
3682	 */
3683	free_jsegs(jblocks);
3684}
3685
3686/*
3687 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
3688 * the final completions.
3689 */
3690static void
3691handle_written_jseg(jseg, bp)
3692	struct jseg *jseg;
3693	struct buf *bp;
3694{
3695
3696	if (jseg->js_refs == 0)
3697		panic("handle_written_jseg: No self-reference on %p", jseg);
3698	jseg->js_state |= DEPCOMPLETE;
3699	/*
3700	 * We'll never need this buffer again, set flags so it will be
3701	 * discarded.
3702	 */
3703	bp->b_flags |= B_INVAL | B_NOCACHE;
3704	pbrelvp(bp);
3705	complete_jsegs(jseg);
3706}
3707
3708static inline struct jsegdep *
3709inoref_jseg(inoref)
3710	struct inoref *inoref;
3711{
3712	struct jsegdep *jsegdep;
3713
3714	jsegdep = inoref->if_jsegdep;
3715	inoref->if_jsegdep = NULL;
3716
3717	return (jsegdep);
3718}
3719
3720/*
3721 * Called once a jremref has made it to stable store.  The jremref is marked
3722 * complete and we attempt to free it.  Any pagedeps writes sleeping waiting
3723 * for the jremref to complete will be awoken by free_jremref.
3724 */
3725static void
3726handle_written_jremref(jremref)
3727	struct jremref *jremref;
3728{
3729	struct inodedep *inodedep;
3730	struct jsegdep *jsegdep;
3731	struct dirrem *dirrem;
3732
3733	/* Grab the jsegdep. */
3734	jsegdep = inoref_jseg(&jremref->jr_ref);
3735	/*
3736	 * Remove us from the inoref list.
3737	 */
3738	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
3739	    0, &inodedep) == 0)
3740		panic("handle_written_jremref: Lost inodedep");
3741	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
3742	/*
3743	 * Complete the dirrem.
3744	 */
3745	dirrem = jremref->jr_dirrem;
3746	jremref->jr_dirrem = NULL;
3747	LIST_REMOVE(jremref, jr_deps);
3748	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
3749	jwork_insert(&dirrem->dm_jwork, jsegdep);
3750	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
3751	    (dirrem->dm_state & COMPLETE) != 0)
3752		add_to_worklist(&dirrem->dm_list, 0);
3753	free_jremref(jremref);
3754}
3755
3756/*
3757 * Called once a jaddref has made it to stable store.  The dependency is
3758 * marked complete and any dependent structures are added to the inode
3759 * bufwait list to be completed as soon as it is written.  If a bitmap write
3760 * depends on this entry we move the inode into the inodedephd of the
3761 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
3762 */
3763static void
3764handle_written_jaddref(jaddref)
3765	struct jaddref *jaddref;
3766{
3767	struct jsegdep *jsegdep;
3768	struct inodedep *inodedep;
3769	struct diradd *diradd;
3770	struct mkdir *mkdir;
3771
3772	/* Grab the jsegdep. */
3773	jsegdep = inoref_jseg(&jaddref->ja_ref);
3774	mkdir = NULL;
3775	diradd = NULL;
3776	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
3777	    0, &inodedep) == 0)
3778		panic("handle_written_jaddref: Lost inodedep.");
3779	if (jaddref->ja_diradd == NULL)
3780		panic("handle_written_jaddref: No dependency");
3781	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
3782		diradd = jaddref->ja_diradd;
3783		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
3784	} else if (jaddref->ja_state & MKDIR_PARENT) {
3785		mkdir = jaddref->ja_mkdir;
3786		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
3787	} else if (jaddref->ja_state & MKDIR_BODY)
3788		mkdir = jaddref->ja_mkdir;
3789	else
3790		panic("handle_written_jaddref: Unknown dependency %p",
3791		    jaddref->ja_diradd);
3792	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
3793	/*
3794	 * Remove us from the inode list.
3795	 */
3796	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
3797	/*
3798	 * The mkdir may be waiting on the jaddref to clear before freeing.
3799	 */
3800	if (mkdir) {
3801		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
3802		    ("handle_written_jaddref: Incorrect type for mkdir %s",
3803		    TYPENAME(mkdir->md_list.wk_type)));
3804		mkdir->md_jaddref = NULL;
3805		diradd = mkdir->md_diradd;
3806		mkdir->md_state |= DEPCOMPLETE;
3807		complete_mkdir(mkdir);
3808	}
3809	jwork_insert(&diradd->da_jwork, jsegdep);
3810	if (jaddref->ja_state & NEWBLOCK) {
3811		inodedep->id_state |= ONDEPLIST;
3812		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
3813		    inodedep, id_deps);
3814	}
3815	free_jaddref(jaddref);
3816}
3817
3818/*
3819 * Called once a jnewblk journal is written.  The allocdirect or allocindir
3820 * is placed in the bmsafemap to await notification of a written bitmap.  If
3821 * the operation was canceled we add the segdep to the appropriate
3822 * dependency to free the journal space once the canceling operation
3823 * completes.
3824 */
3825static void
3826handle_written_jnewblk(jnewblk)
3827	struct jnewblk *jnewblk;
3828{
3829	struct bmsafemap *bmsafemap;
3830	struct freefrag *freefrag;
3831	struct freework *freework;
3832	struct jsegdep *jsegdep;
3833	struct newblk *newblk;
3834
3835	/* Grab the jsegdep. */
3836	jsegdep = jnewblk->jn_jsegdep;
3837	jnewblk->jn_jsegdep = NULL;
3838	if (jnewblk->jn_dep == NULL)
3839		panic("handle_written_jnewblk: No dependency for the segdep.");
3840	switch (jnewblk->jn_dep->wk_type) {
3841	case D_NEWBLK:
3842	case D_ALLOCDIRECT:
3843	case D_ALLOCINDIR:
3844		/*
3845		 * Add the written block to the bmsafemap so it can
3846		 * be notified when the bitmap is on disk.
3847		 */
3848		newblk = WK_NEWBLK(jnewblk->jn_dep);
3849		newblk->nb_jnewblk = NULL;
3850		if ((newblk->nb_state & GOINGAWAY) == 0) {
3851			bmsafemap = newblk->nb_bmsafemap;
3852			newblk->nb_state |= ONDEPLIST;
3853			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
3854			    nb_deps);
3855		}
3856		jwork_insert(&newblk->nb_jwork, jsegdep);
3857		break;
3858	case D_FREEFRAG:
3859		/*
3860		 * A newblock being removed by a freefrag when replaced by
3861		 * frag extension.
3862		 */
3863		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
3864		freefrag->ff_jdep = NULL;
3865		jwork_insert(&freefrag->ff_jwork, jsegdep);
3866		break;
3867	case D_FREEWORK:
3868		/*
3869		 * A direct block was removed by truncate.
3870		 */
3871		freework = WK_FREEWORK(jnewblk->jn_dep);
3872		freework->fw_jnewblk = NULL;
3873		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
3874		break;
3875	default:
3876		panic("handle_written_jnewblk: Unknown type %d.",
3877		    jnewblk->jn_dep->wk_type);
3878	}
3879	jnewblk->jn_dep = NULL;
3880	free_jnewblk(jnewblk);
3881}
3882
3883/*
3884 * Cancel a jfreefrag that won't be needed, probably due to colliding with
3885 * an in-flight allocation that has not yet been committed.  Divorce us
3886 * from the freefrag and mark it DEPCOMPLETE so that it may be added
3887 * to the worklist.
3888 */
3889static void
3890cancel_jfreefrag(jfreefrag)
3891	struct jfreefrag *jfreefrag;
3892{
3893	struct freefrag *freefrag;
3894
3895	if (jfreefrag->fr_jsegdep) {
3896		free_jsegdep(jfreefrag->fr_jsegdep);
3897		jfreefrag->fr_jsegdep = NULL;
3898	}
3899	freefrag = jfreefrag->fr_freefrag;
3900	jfreefrag->fr_freefrag = NULL;
3901	free_jfreefrag(jfreefrag);
3902	freefrag->ff_state |= DEPCOMPLETE;
3903	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
3904}
3905
3906/*
3907 * Free a jfreefrag when the parent freefrag is rendered obsolete.
3908 */
3909static void
3910free_jfreefrag(jfreefrag)
3911	struct jfreefrag *jfreefrag;
3912{
3913
3914	if (jfreefrag->fr_state & INPROGRESS)
3915		WORKLIST_REMOVE(&jfreefrag->fr_list);
3916	else if (jfreefrag->fr_state & ONWORKLIST)
3917		remove_from_journal(&jfreefrag->fr_list);
3918	if (jfreefrag->fr_freefrag != NULL)
3919		panic("free_jfreefrag:  Still attached to a freefrag.");
3920	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
3921}
3922
3923/*
3924 * Called when the journal write for a jfreefrag completes.  The parent
3925 * freefrag is added to the worklist if this completes its dependencies.
3926 */
3927static void
3928handle_written_jfreefrag(jfreefrag)
3929	struct jfreefrag *jfreefrag;
3930{
3931	struct jsegdep *jsegdep;
3932	struct freefrag *freefrag;
3933
3934	/* Grab the jsegdep. */
3935	jsegdep = jfreefrag->fr_jsegdep;
3936	jfreefrag->fr_jsegdep = NULL;
3937	freefrag = jfreefrag->fr_freefrag;
3938	if (freefrag == NULL)
3939		panic("handle_written_jfreefrag: No freefrag.");
3940	freefrag->ff_state |= DEPCOMPLETE;
3941	freefrag->ff_jdep = NULL;
3942	jwork_insert(&freefrag->ff_jwork, jsegdep);
3943	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
3944		add_to_worklist(&freefrag->ff_list, 0);
3945	jfreefrag->fr_freefrag = NULL;
3946	free_jfreefrag(jfreefrag);
3947}
3948
3949/*
3950 * Called when the journal write for a jfreeblk completes.  The jfreeblk
3951 * is removed from the freeblks list of pending journal writes and the
3952 * jsegdep is moved to the freeblks jwork to be completed when all blocks
3953 * have been reclaimed.
3954 */
3955static void
3956handle_written_jblkdep(jblkdep)
3957	struct jblkdep *jblkdep;
3958{
3959	struct freeblks *freeblks;
3960	struct jsegdep *jsegdep;
3961
3962	/* Grab the jsegdep. */
3963	jsegdep = jblkdep->jb_jsegdep;
3964	jblkdep->jb_jsegdep = NULL;
3965	freeblks = jblkdep->jb_freeblks;
3966	LIST_REMOVE(jblkdep, jb_deps);
3967	jwork_insert(&freeblks->fb_jwork, jsegdep);
3968	/*
3969	 * If the freeblks is all journaled, we can add it to the worklist.
3970	 */
3971	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
3972	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
3973		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
3974
3975	free_jblkdep(jblkdep);
3976}
3977
3978static struct jsegdep *
3979newjsegdep(struct worklist *wk)
3980{
3981	struct jsegdep *jsegdep;
3982
3983	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
3984	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
3985	jsegdep->jd_seg = NULL;
3986
3987	return (jsegdep);
3988}
3989
3990static struct jmvref *
3991newjmvref(dp, ino, oldoff, newoff)
3992	struct inode *dp;
3993	ino_t ino;
3994	off_t oldoff;
3995	off_t newoff;
3996{
3997	struct jmvref *jmvref;
3998
3999	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
4000	workitem_alloc(&jmvref->jm_list, D_JMVREF, UFSTOVFS(dp->i_ump));
4001	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
4002	jmvref->jm_parent = dp->i_number;
4003	jmvref->jm_ino = ino;
4004	jmvref->jm_oldoff = oldoff;
4005	jmvref->jm_newoff = newoff;
4006
4007	return (jmvref);
4008}
4009
4010/*
4011 * Allocate a new jremref that tracks the removal of ip from dp with the
4012 * directory entry offset of diroff.  Mark the entry as ATTACHED and
4013 * DEPCOMPLETE as we have all the information required for the journal write
4014 * and the directory has already been removed from the buffer.  The caller
4015 * is responsible for linking the jremref into the pagedep and adding it
4016 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
4017 * a DOTDOT addition so handle_workitem_remove() can properly assign
4018 * the jsegdep when we're done.
4019 */
4020static struct jremref *
4021newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
4022    off_t diroff, nlink_t nlink)
4023{
4024	struct jremref *jremref;
4025
4026	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
4027	workitem_alloc(&jremref->jr_list, D_JREMREF, UFSTOVFS(dp->i_ump));
4028	jremref->jr_state = ATTACHED;
4029	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
4030	   nlink, ip->i_mode);
4031	jremref->jr_dirrem = dirrem;
4032
4033	return (jremref);
4034}
4035
4036static inline void
4037newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
4038    nlink_t nlink, uint16_t mode)
4039{
4040
4041	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
4042	inoref->if_diroff = diroff;
4043	inoref->if_ino = ino;
4044	inoref->if_parent = parent;
4045	inoref->if_nlink = nlink;
4046	inoref->if_mode = mode;
4047}
4048
4049/*
4050 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
4051 * directory offset may not be known until later.  The caller is responsible
4052 * adding the entry to the journal when this information is available.  nlink
4053 * should be the link count prior to the addition and mode is only required
4054 * to have the correct FMT.
4055 */
4056static struct jaddref *
4057newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
4058    uint16_t mode)
4059{
4060	struct jaddref *jaddref;
4061
4062	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
4063	workitem_alloc(&jaddref->ja_list, D_JADDREF, UFSTOVFS(dp->i_ump));
4064	jaddref->ja_state = ATTACHED;
4065	jaddref->ja_mkdir = NULL;
4066	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
4067
4068	return (jaddref);
4069}
4070
4071/*
4072 * Create a new free dependency for a freework.  The caller is responsible
4073 * for adjusting the reference count when it has the lock held.  The freedep
4074 * will track an outstanding bitmap write that will ultimately clear the
4075 * freework to continue.
4076 */
4077static struct freedep *
4078newfreedep(struct freework *freework)
4079{
4080	struct freedep *freedep;
4081
4082	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
4083	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
4084	freedep->fd_freework = freework;
4085
4086	return (freedep);
4087}
4088
4089/*
4090 * Free a freedep structure once the buffer it is linked to is written.  If
4091 * this is the last reference to the freework schedule it for completion.
4092 */
4093static void
4094free_freedep(freedep)
4095	struct freedep *freedep;
4096{
4097	struct freework *freework;
4098
4099	freework = freedep->fd_freework;
4100	freework->fw_freeblks->fb_cgwait--;
4101	if (--freework->fw_ref == 0)
4102		freework_enqueue(freework);
4103	WORKITEM_FREE(freedep, D_FREEDEP);
4104}
4105
4106/*
4107 * Allocate a new freework structure that may be a level in an indirect
4108 * when parent is not NULL or a top level block when it is.  The top level
4109 * freework structures are allocated without the per-filesystem lock held
4110 * and before the freeblks is visible outside of softdep_setup_freeblocks().
4111 */
4112static struct freework *
4113newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
4114	struct ufsmount *ump;
4115	struct freeblks *freeblks;
4116	struct freework *parent;
4117	ufs_lbn_t lbn;
4118	ufs2_daddr_t nb;
4119	int frags;
4120	int off;
4121	int journal;
4122{
4123	struct freework *freework;
4124
4125	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
4126	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
4127	freework->fw_state = ATTACHED;
4128	freework->fw_jnewblk = NULL;
4129	freework->fw_freeblks = freeblks;
4130	freework->fw_parent = parent;
4131	freework->fw_lbn = lbn;
4132	freework->fw_blkno = nb;
4133	freework->fw_frags = frags;
4134	freework->fw_indir = NULL;
4135	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 || lbn >= -NXADDR)
4136		? 0 : NINDIR(ump->um_fs) + 1;
4137	freework->fw_start = freework->fw_off = off;
4138	if (journal)
4139		newjfreeblk(freeblks, lbn, nb, frags);
4140	if (parent == NULL) {
4141		ACQUIRE_LOCK(ump);
4142		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4143		freeblks->fb_ref++;
4144		FREE_LOCK(ump);
4145	}
4146
4147	return (freework);
4148}
4149
4150/*
4151 * Eliminate a jfreeblk for a block that does not need journaling.
4152 */
4153static void
4154cancel_jfreeblk(freeblks, blkno)
4155	struct freeblks *freeblks;
4156	ufs2_daddr_t blkno;
4157{
4158	struct jfreeblk *jfreeblk;
4159	struct jblkdep *jblkdep;
4160
4161	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4162		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4163			continue;
4164		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4165		if (jfreeblk->jf_blkno == blkno)
4166			break;
4167	}
4168	if (jblkdep == NULL)
4169		return;
4170	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4171	free_jsegdep(jblkdep->jb_jsegdep);
4172	LIST_REMOVE(jblkdep, jb_deps);
4173	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4174}
4175
4176/*
4177 * Allocate a new jfreeblk to journal top level block pointer when truncating
4178 * a file.  The caller must add this to the worklist when the per-filesystem
4179 * lock is held.
4180 */
4181static struct jfreeblk *
4182newjfreeblk(freeblks, lbn, blkno, frags)
4183	struct freeblks *freeblks;
4184	ufs_lbn_t lbn;
4185	ufs2_daddr_t blkno;
4186	int frags;
4187{
4188	struct jfreeblk *jfreeblk;
4189
4190	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4191	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4192	    freeblks->fb_list.wk_mp);
4193	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4194	jfreeblk->jf_dep.jb_freeblks = freeblks;
4195	jfreeblk->jf_ino = freeblks->fb_inum;
4196	jfreeblk->jf_lbn = lbn;
4197	jfreeblk->jf_blkno = blkno;
4198	jfreeblk->jf_frags = frags;
4199	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4200
4201	return (jfreeblk);
4202}
4203
4204/*
4205 * The journal is only prepared to handle full-size block numbers, so we
4206 * have to adjust the record to reflect the change to a full-size block.
4207 * For example, suppose we have a block made up of fragments 8-15 and
4208 * want to free its last two fragments. We are given a request that says:
4209 *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
4210 * where frags are the number of fragments to free and oldfrags are the
4211 * number of fragments to keep. To block align it, we have to change it to
4212 * have a valid full-size blkno, so it becomes:
4213 *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
4214 */
4215static void
4216adjust_newfreework(freeblks, frag_offset)
4217	struct freeblks *freeblks;
4218	int frag_offset;
4219{
4220	struct jfreeblk *jfreeblk;
4221
4222	KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
4223	    LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
4224	    ("adjust_newfreework: Missing freeblks dependency"));
4225
4226	jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
4227	jfreeblk->jf_blkno -= frag_offset;
4228	jfreeblk->jf_frags += frag_offset;
4229}
4230
4231/*
4232 * Allocate a new jtrunc to track a partial truncation.
4233 */
4234static struct jtrunc *
4235newjtrunc(freeblks, size, extsize)
4236	struct freeblks *freeblks;
4237	off_t size;
4238	int extsize;
4239{
4240	struct jtrunc *jtrunc;
4241
4242	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4243	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4244	    freeblks->fb_list.wk_mp);
4245	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4246	jtrunc->jt_dep.jb_freeblks = freeblks;
4247	jtrunc->jt_ino = freeblks->fb_inum;
4248	jtrunc->jt_size = size;
4249	jtrunc->jt_extsize = extsize;
4250	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4251
4252	return (jtrunc);
4253}
4254
4255/*
4256 * If we're canceling a new bitmap we have to search for another ref
4257 * to move into the bmsafemap dep.  This might be better expressed
4258 * with another structure.
4259 */
4260static void
4261move_newblock_dep(jaddref, inodedep)
4262	struct jaddref *jaddref;
4263	struct inodedep *inodedep;
4264{
4265	struct inoref *inoref;
4266	struct jaddref *jaddrefn;
4267
4268	jaddrefn = NULL;
4269	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4270	    inoref = TAILQ_NEXT(inoref, if_deps)) {
4271		if ((jaddref->ja_state & NEWBLOCK) &&
4272		    inoref->if_list.wk_type == D_JADDREF) {
4273			jaddrefn = (struct jaddref *)inoref;
4274			break;
4275		}
4276	}
4277	if (jaddrefn == NULL)
4278		return;
4279	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4280	jaddrefn->ja_state |= jaddref->ja_state &
4281	    (ATTACHED | UNDONE | NEWBLOCK);
4282	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4283	jaddref->ja_state |= ATTACHED;
4284	LIST_REMOVE(jaddref, ja_bmdeps);
4285	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4286	    ja_bmdeps);
4287}
4288
4289/*
4290 * Cancel a jaddref either before it has been written or while it is being
4291 * written.  This happens when a link is removed before the add reaches
4292 * the disk.  The jaddref dependency is kept linked into the bmsafemap
4293 * and inode to prevent the link count or bitmap from reaching the disk
4294 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4295 * required.
4296 *
4297 * Returns 1 if the canceled addref requires journaling of the remove and
4298 * 0 otherwise.
4299 */
4300static int
4301cancel_jaddref(jaddref, inodedep, wkhd)
4302	struct jaddref *jaddref;
4303	struct inodedep *inodedep;
4304	struct workhead *wkhd;
4305{
4306	struct inoref *inoref;
4307	struct jsegdep *jsegdep;
4308	int needsj;
4309
4310	KASSERT((jaddref->ja_state & COMPLETE) == 0,
4311	    ("cancel_jaddref: Canceling complete jaddref"));
4312	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4313		needsj = 1;
4314	else
4315		needsj = 0;
4316	if (inodedep == NULL)
4317		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4318		    0, &inodedep) == 0)
4319			panic("cancel_jaddref: Lost inodedep");
4320	/*
4321	 * We must adjust the nlink of any reference operation that follows
4322	 * us so that it is consistent with the in-memory reference.  This
4323	 * ensures that inode nlink rollbacks always have the correct link.
4324	 */
4325	if (needsj == 0) {
4326		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4327		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4328			if (inoref->if_state & GOINGAWAY)
4329				break;
4330			inoref->if_nlink--;
4331		}
4332	}
4333	jsegdep = inoref_jseg(&jaddref->ja_ref);
4334	if (jaddref->ja_state & NEWBLOCK)
4335		move_newblock_dep(jaddref, inodedep);
4336	wake_worklist(&jaddref->ja_list);
4337	jaddref->ja_mkdir = NULL;
4338	if (jaddref->ja_state & INPROGRESS) {
4339		jaddref->ja_state &= ~INPROGRESS;
4340		WORKLIST_REMOVE(&jaddref->ja_list);
4341		jwork_insert(wkhd, jsegdep);
4342	} else {
4343		free_jsegdep(jsegdep);
4344		if (jaddref->ja_state & DEPCOMPLETE)
4345			remove_from_journal(&jaddref->ja_list);
4346	}
4347	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4348	/*
4349	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4350	 * can arrange for them to be freed with the bitmap.  Otherwise we
4351	 * no longer need this addref attached to the inoreflst and it
4352	 * will incorrectly adjust nlink if we leave it.
4353	 */
4354	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4355		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4356		    if_deps);
4357		jaddref->ja_state |= COMPLETE;
4358		free_jaddref(jaddref);
4359		return (needsj);
4360	}
4361	/*
4362	 * Leave the head of the list for jsegdeps for fast merging.
4363	 */
4364	if (LIST_FIRST(wkhd) != NULL) {
4365		jaddref->ja_state |= ONWORKLIST;
4366		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4367	} else
4368		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4369
4370	return (needsj);
4371}
4372
4373/*
4374 * Attempt to free a jaddref structure when some work completes.  This
4375 * should only succeed once the entry is written and all dependencies have
4376 * been notified.
4377 */
4378static void
4379free_jaddref(jaddref)
4380	struct jaddref *jaddref;
4381{
4382
4383	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4384		return;
4385	if (jaddref->ja_ref.if_jsegdep)
4386		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4387		    jaddref, jaddref->ja_state);
4388	if (jaddref->ja_state & NEWBLOCK)
4389		LIST_REMOVE(jaddref, ja_bmdeps);
4390	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4391		panic("free_jaddref: Bad state %p(0x%X)",
4392		    jaddref, jaddref->ja_state);
4393	if (jaddref->ja_mkdir != NULL)
4394		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4395	WORKITEM_FREE(jaddref, D_JADDREF);
4396}
4397
4398/*
4399 * Free a jremref structure once it has been written or discarded.
4400 */
4401static void
4402free_jremref(jremref)
4403	struct jremref *jremref;
4404{
4405
4406	if (jremref->jr_ref.if_jsegdep)
4407		free_jsegdep(jremref->jr_ref.if_jsegdep);
4408	if (jremref->jr_state & INPROGRESS)
4409		panic("free_jremref: IO still pending");
4410	WORKITEM_FREE(jremref, D_JREMREF);
4411}
4412
4413/*
4414 * Free a jnewblk structure.
4415 */
4416static void
4417free_jnewblk(jnewblk)
4418	struct jnewblk *jnewblk;
4419{
4420
4421	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4422		return;
4423	LIST_REMOVE(jnewblk, jn_deps);
4424	if (jnewblk->jn_dep != NULL)
4425		panic("free_jnewblk: Dependency still attached.");
4426	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4427}
4428
4429/*
4430 * Cancel a jnewblk which has been been made redundant by frag extension.
4431 */
4432static void
4433cancel_jnewblk(jnewblk, wkhd)
4434	struct jnewblk *jnewblk;
4435	struct workhead *wkhd;
4436{
4437	struct jsegdep *jsegdep;
4438
4439	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4440	jsegdep = jnewblk->jn_jsegdep;
4441	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4442		panic("cancel_jnewblk: Invalid state");
4443	jnewblk->jn_jsegdep  = NULL;
4444	jnewblk->jn_dep = NULL;
4445	jnewblk->jn_state |= GOINGAWAY;
4446	if (jnewblk->jn_state & INPROGRESS) {
4447		jnewblk->jn_state &= ~INPROGRESS;
4448		WORKLIST_REMOVE(&jnewblk->jn_list);
4449		jwork_insert(wkhd, jsegdep);
4450	} else {
4451		free_jsegdep(jsegdep);
4452		remove_from_journal(&jnewblk->jn_list);
4453	}
4454	wake_worklist(&jnewblk->jn_list);
4455	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4456}
4457
4458static void
4459free_jblkdep(jblkdep)
4460	struct jblkdep *jblkdep;
4461{
4462
4463	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4464		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4465	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4466		WORKITEM_FREE(jblkdep, D_JTRUNC);
4467	else
4468		panic("free_jblkdep: Unexpected type %s",
4469		    TYPENAME(jblkdep->jb_list.wk_type));
4470}
4471
4472/*
4473 * Free a single jseg once it is no longer referenced in memory or on
4474 * disk.  Reclaim journal blocks and dependencies waiting for the segment
4475 * to disappear.
4476 */
4477static void
4478free_jseg(jseg, jblocks)
4479	struct jseg *jseg;
4480	struct jblocks *jblocks;
4481{
4482	struct freework *freework;
4483
4484	/*
4485	 * Free freework structures that were lingering to indicate freed
4486	 * indirect blocks that forced journal write ordering on reallocate.
4487	 */
4488	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4489		indirblk_remove(freework);
4490	if (jblocks->jb_oldestseg == jseg)
4491		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4492	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4493	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4494	KASSERT(LIST_EMPTY(&jseg->js_entries),
4495	    ("free_jseg: Freed jseg has valid entries."));
4496	WORKITEM_FREE(jseg, D_JSEG);
4497}
4498
4499/*
4500 * Free all jsegs that meet the criteria for being reclaimed and update
4501 * oldestseg.
4502 */
4503static void
4504free_jsegs(jblocks)
4505	struct jblocks *jblocks;
4506{
4507	struct jseg *jseg;
4508
4509	/*
4510	 * Free only those jsegs which have none allocated before them to
4511	 * preserve the journal space ordering.
4512	 */
4513	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4514		/*
4515		 * Only reclaim space when nothing depends on this journal
4516		 * set and another set has written that it is no longer
4517		 * valid.
4518		 */
4519		if (jseg->js_refs != 0) {
4520			jblocks->jb_oldestseg = jseg;
4521			return;
4522		}
4523		if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4524			break;
4525		if (jseg->js_seq > jblocks->jb_oldestwrseq)
4526			break;
4527		/*
4528		 * We can free jsegs that didn't write entries when
4529		 * oldestwrseq == js_seq.
4530		 */
4531		if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4532		    jseg->js_cnt != 0)
4533			break;
4534		free_jseg(jseg, jblocks);
4535	}
4536	/*
4537	 * If we exited the loop above we still must discover the
4538	 * oldest valid segment.
4539	 */
4540	if (jseg)
4541		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4542		     jseg = TAILQ_NEXT(jseg, js_next))
4543			if (jseg->js_refs != 0)
4544				break;
4545	jblocks->jb_oldestseg = jseg;
4546	/*
4547	 * The journal has no valid records but some jsegs may still be
4548	 * waiting on oldestwrseq to advance.  We force a small record
4549	 * out to permit these lingering records to be reclaimed.
4550	 */
4551	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4552		jblocks->jb_needseg = 1;
4553}
4554
4555/*
4556 * Release one reference to a jseg and free it if the count reaches 0.  This
4557 * should eventually reclaim journal space as well.
4558 */
4559static void
4560rele_jseg(jseg)
4561	struct jseg *jseg;
4562{
4563
4564	KASSERT(jseg->js_refs > 0,
4565	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4566	if (--jseg->js_refs != 0)
4567		return;
4568	free_jsegs(jseg->js_jblocks);
4569}
4570
4571/*
4572 * Release a jsegdep and decrement the jseg count.
4573 */
4574static void
4575free_jsegdep(jsegdep)
4576	struct jsegdep *jsegdep;
4577{
4578
4579	if (jsegdep->jd_seg)
4580		rele_jseg(jsegdep->jd_seg);
4581	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4582}
4583
4584/*
4585 * Wait for a journal item to make it to disk.  Initiate journal processing
4586 * if required.
4587 */
4588static int
4589jwait(wk, waitfor)
4590	struct worklist *wk;
4591	int waitfor;
4592{
4593
4594	LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4595	/*
4596	 * Blocking journal waits cause slow synchronous behavior.  Record
4597	 * stats on the frequency of these blocking operations.
4598	 */
4599	if (waitfor == MNT_WAIT) {
4600		stat_journal_wait++;
4601		switch (wk->wk_type) {
4602		case D_JREMREF:
4603		case D_JMVREF:
4604			stat_jwait_filepage++;
4605			break;
4606		case D_JTRUNC:
4607		case D_JFREEBLK:
4608			stat_jwait_freeblks++;
4609			break;
4610		case D_JNEWBLK:
4611			stat_jwait_newblk++;
4612			break;
4613		case D_JADDREF:
4614			stat_jwait_inode++;
4615			break;
4616		default:
4617			break;
4618		}
4619	}
4620	/*
4621	 * If IO has not started we process the journal.  We can't mark the
4622	 * worklist item as IOWAITING because we drop the lock while
4623	 * processing the journal and the worklist entry may be freed after
4624	 * this point.  The caller may call back in and re-issue the request.
4625	 */
4626	if ((wk->wk_state & INPROGRESS) == 0) {
4627		softdep_process_journal(wk->wk_mp, wk, waitfor);
4628		if (waitfor != MNT_WAIT)
4629			return (EBUSY);
4630		return (0);
4631	}
4632	if (waitfor != MNT_WAIT)
4633		return (EBUSY);
4634	wait_worklist(wk, "jwait");
4635	return (0);
4636}
4637
4638/*
4639 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4640 * appropriate.  This is a convenience function to reduce duplicate code
4641 * for the setup and revert functions below.
4642 */
4643static struct inodedep *
4644inodedep_lookup_ip(ip)
4645	struct inode *ip;
4646{
4647	struct inodedep *inodedep;
4648
4649	KASSERT(ip->i_nlink >= ip->i_effnlink,
4650	    ("inodedep_lookup_ip: bad delta"));
4651	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC,
4652	    &inodedep);
4653	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
4654	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
4655
4656	return (inodedep);
4657}
4658
4659/*
4660 * Called prior to creating a new inode and linking it to a directory.  The
4661 * jaddref structure must already be allocated by softdep_setup_inomapdep
4662 * and it is discovered here so we can initialize the mode and update
4663 * nlinkdelta.
4664 */
4665void
4666softdep_setup_create(dp, ip)
4667	struct inode *dp;
4668	struct inode *ip;
4669{
4670	struct inodedep *inodedep;
4671	struct jaddref *jaddref;
4672	struct vnode *dvp;
4673
4674	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4675	    ("softdep_setup_create called on non-softdep filesystem"));
4676	KASSERT(ip->i_nlink == 1,
4677	    ("softdep_setup_create: Invalid link count."));
4678	dvp = ITOV(dp);
4679	ACQUIRE_LOCK(dp->i_ump);
4680	inodedep = inodedep_lookup_ip(ip);
4681	if (DOINGSUJ(dvp)) {
4682		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4683		    inoreflst);
4684		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
4685		    ("softdep_setup_create: No addref structure present."));
4686	}
4687	softdep_prelink(dvp, NULL);
4688	FREE_LOCK(dp->i_ump);
4689}
4690
4691/*
4692 * Create a jaddref structure to track the addition of a DOTDOT link when
4693 * we are reparenting an inode as part of a rename.  This jaddref will be
4694 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
4695 * non-journaling softdep.
4696 */
4697void
4698softdep_setup_dotdot_link(dp, ip)
4699	struct inode *dp;
4700	struct inode *ip;
4701{
4702	struct inodedep *inodedep;
4703	struct jaddref *jaddref;
4704	struct vnode *dvp;
4705
4706	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4707	    ("softdep_setup_dotdot_link called on non-softdep filesystem"));
4708	dvp = ITOV(dp);
4709	jaddref = NULL;
4710	/*
4711	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
4712	 * is used as a normal link would be.
4713	 */
4714	if (DOINGSUJ(dvp))
4715		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4716		    dp->i_effnlink - 1, dp->i_mode);
4717	ACQUIRE_LOCK(dp->i_ump);
4718	inodedep = inodedep_lookup_ip(dp);
4719	if (jaddref)
4720		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4721		    if_deps);
4722	softdep_prelink(dvp, ITOV(ip));
4723	FREE_LOCK(dp->i_ump);
4724}
4725
4726/*
4727 * Create a jaddref structure to track a new link to an inode.  The directory
4728 * offset is not known until softdep_setup_directory_add or
4729 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
4730 * softdep.
4731 */
4732void
4733softdep_setup_link(dp, ip)
4734	struct inode *dp;
4735	struct inode *ip;
4736{
4737	struct inodedep *inodedep;
4738	struct jaddref *jaddref;
4739	struct vnode *dvp;
4740
4741	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4742	    ("softdep_setup_link called on non-softdep filesystem"));
4743	dvp = ITOV(dp);
4744	jaddref = NULL;
4745	if (DOINGSUJ(dvp))
4746		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
4747		    ip->i_mode);
4748	ACQUIRE_LOCK(dp->i_ump);
4749	inodedep = inodedep_lookup_ip(ip);
4750	if (jaddref)
4751		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
4752		    if_deps);
4753	softdep_prelink(dvp, ITOV(ip));
4754	FREE_LOCK(dp->i_ump);
4755}
4756
4757/*
4758 * Called to create the jaddref structures to track . and .. references as
4759 * well as lookup and further initialize the incomplete jaddref created
4760 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
4761 * nlinkdelta for non-journaling softdep.
4762 */
4763void
4764softdep_setup_mkdir(dp, ip)
4765	struct inode *dp;
4766	struct inode *ip;
4767{
4768	struct inodedep *inodedep;
4769	struct jaddref *dotdotaddref;
4770	struct jaddref *dotaddref;
4771	struct jaddref *jaddref;
4772	struct vnode *dvp;
4773
4774	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4775	    ("softdep_setup_mkdir called on non-softdep filesystem"));
4776	dvp = ITOV(dp);
4777	dotaddref = dotdotaddref = NULL;
4778	if (DOINGSUJ(dvp)) {
4779		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
4780		    ip->i_mode);
4781		dotaddref->ja_state |= MKDIR_BODY;
4782		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
4783		    dp->i_effnlink - 1, dp->i_mode);
4784		dotdotaddref->ja_state |= MKDIR_PARENT;
4785	}
4786	ACQUIRE_LOCK(dp->i_ump);
4787	inodedep = inodedep_lookup_ip(ip);
4788	if (DOINGSUJ(dvp)) {
4789		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4790		    inoreflst);
4791		KASSERT(jaddref != NULL,
4792		    ("softdep_setup_mkdir: No addref structure present."));
4793		KASSERT(jaddref->ja_parent == dp->i_number,
4794		    ("softdep_setup_mkdir: bad parent %ju",
4795		    (uintmax_t)jaddref->ja_parent));
4796		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
4797		    if_deps);
4798	}
4799	inodedep = inodedep_lookup_ip(dp);
4800	if (DOINGSUJ(dvp))
4801		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
4802		    &dotdotaddref->ja_ref, if_deps);
4803	softdep_prelink(ITOV(dp), NULL);
4804	FREE_LOCK(dp->i_ump);
4805}
4806
4807/*
4808 * Called to track nlinkdelta of the inode and parent directories prior to
4809 * unlinking a directory.
4810 */
4811void
4812softdep_setup_rmdir(dp, ip)
4813	struct inode *dp;
4814	struct inode *ip;
4815{
4816	struct vnode *dvp;
4817
4818	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4819	    ("softdep_setup_rmdir called on non-softdep filesystem"));
4820	dvp = ITOV(dp);
4821	ACQUIRE_LOCK(dp->i_ump);
4822	(void) inodedep_lookup_ip(ip);
4823	(void) inodedep_lookup_ip(dp);
4824	softdep_prelink(dvp, ITOV(ip));
4825	FREE_LOCK(dp->i_ump);
4826}
4827
4828/*
4829 * Called to track nlinkdelta of the inode and parent directories prior to
4830 * unlink.
4831 */
4832void
4833softdep_setup_unlink(dp, ip)
4834	struct inode *dp;
4835	struct inode *ip;
4836{
4837	struct vnode *dvp;
4838
4839	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4840	    ("softdep_setup_unlink called on non-softdep filesystem"));
4841	dvp = ITOV(dp);
4842	ACQUIRE_LOCK(dp->i_ump);
4843	(void) inodedep_lookup_ip(ip);
4844	(void) inodedep_lookup_ip(dp);
4845	softdep_prelink(dvp, ITOV(ip));
4846	FREE_LOCK(dp->i_ump);
4847}
4848
4849/*
4850 * Called to release the journal structures created by a failed non-directory
4851 * creation.  Adjusts nlinkdelta for non-journaling softdep.
4852 */
4853void
4854softdep_revert_create(dp, ip)
4855	struct inode *dp;
4856	struct inode *ip;
4857{
4858	struct inodedep *inodedep;
4859	struct jaddref *jaddref;
4860	struct vnode *dvp;
4861
4862	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4863	    ("softdep_revert_create called on non-softdep filesystem"));
4864	dvp = ITOV(dp);
4865	ACQUIRE_LOCK(dp->i_ump);
4866	inodedep = inodedep_lookup_ip(ip);
4867	if (DOINGSUJ(dvp)) {
4868		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4869		    inoreflst);
4870		KASSERT(jaddref->ja_parent == dp->i_number,
4871		    ("softdep_revert_create: addref parent mismatch"));
4872		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4873	}
4874	FREE_LOCK(dp->i_ump);
4875}
4876
4877/*
4878 * Called to release the journal structures created by a failed link
4879 * addition.  Adjusts nlinkdelta for non-journaling softdep.
4880 */
4881void
4882softdep_revert_link(dp, ip)
4883	struct inode *dp;
4884	struct inode *ip;
4885{
4886	struct inodedep *inodedep;
4887	struct jaddref *jaddref;
4888	struct vnode *dvp;
4889
4890	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4891	    ("softdep_revert_link called on non-softdep filesystem"));
4892	dvp = ITOV(dp);
4893	ACQUIRE_LOCK(dp->i_ump);
4894	inodedep = inodedep_lookup_ip(ip);
4895	if (DOINGSUJ(dvp)) {
4896		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4897		    inoreflst);
4898		KASSERT(jaddref->ja_parent == dp->i_number,
4899		    ("softdep_revert_link: addref parent mismatch"));
4900		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4901	}
4902	FREE_LOCK(dp->i_ump);
4903}
4904
4905/*
4906 * Called to release the journal structures created by a failed mkdir
4907 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
4908 */
4909void
4910softdep_revert_mkdir(dp, ip)
4911	struct inode *dp;
4912	struct inode *ip;
4913{
4914	struct inodedep *inodedep;
4915	struct jaddref *jaddref;
4916	struct jaddref *dotaddref;
4917	struct vnode *dvp;
4918
4919	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4920	    ("softdep_revert_mkdir called on non-softdep filesystem"));
4921	dvp = ITOV(dp);
4922
4923	ACQUIRE_LOCK(dp->i_ump);
4924	inodedep = inodedep_lookup_ip(dp);
4925	if (DOINGSUJ(dvp)) {
4926		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4927		    inoreflst);
4928		KASSERT(jaddref->ja_parent == ip->i_number,
4929		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
4930		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4931	}
4932	inodedep = inodedep_lookup_ip(ip);
4933	if (DOINGSUJ(dvp)) {
4934		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
4935		    inoreflst);
4936		KASSERT(jaddref->ja_parent == dp->i_number,
4937		    ("softdep_revert_mkdir: addref parent mismatch"));
4938		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
4939		    inoreflst, if_deps);
4940		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
4941		KASSERT(dotaddref->ja_parent == ip->i_number,
4942		    ("softdep_revert_mkdir: dot addref parent mismatch"));
4943		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
4944	}
4945	FREE_LOCK(dp->i_ump);
4946}
4947
4948/*
4949 * Called to correct nlinkdelta after a failed rmdir.
4950 */
4951void
4952softdep_revert_rmdir(dp, ip)
4953	struct inode *dp;
4954	struct inode *ip;
4955{
4956
4957	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(dp->i_ump)) != 0,
4958	    ("softdep_revert_rmdir called on non-softdep filesystem"));
4959	ACQUIRE_LOCK(dp->i_ump);
4960	(void) inodedep_lookup_ip(ip);
4961	(void) inodedep_lookup_ip(dp);
4962	FREE_LOCK(dp->i_ump);
4963}
4964
4965/*
4966 * Protecting the freemaps (or bitmaps).
4967 *
4968 * To eliminate the need to execute fsck before mounting a filesystem
4969 * after a power failure, one must (conservatively) guarantee that the
4970 * on-disk copy of the bitmaps never indicate that a live inode or block is
4971 * free.  So, when a block or inode is allocated, the bitmap should be
4972 * updated (on disk) before any new pointers.  When a block or inode is
4973 * freed, the bitmap should not be updated until all pointers have been
4974 * reset.  The latter dependency is handled by the delayed de-allocation
4975 * approach described below for block and inode de-allocation.  The former
4976 * dependency is handled by calling the following procedure when a block or
4977 * inode is allocated. When an inode is allocated an "inodedep" is created
4978 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
4979 * Each "inodedep" is also inserted into the hash indexing structure so
4980 * that any additional link additions can be made dependent on the inode
4981 * allocation.
4982 *
4983 * The ufs filesystem maintains a number of free block counts (e.g., per
4984 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
4985 * in addition to the bitmaps.  These counts are used to improve efficiency
4986 * during allocation and therefore must be consistent with the bitmaps.
4987 * There is no convenient way to guarantee post-crash consistency of these
4988 * counts with simple update ordering, for two main reasons: (1) The counts
4989 * and bitmaps for a single cylinder group block are not in the same disk
4990 * sector.  If a disk write is interrupted (e.g., by power failure), one may
4991 * be written and the other not.  (2) Some of the counts are located in the
4992 * superblock rather than the cylinder group block. So, we focus our soft
4993 * updates implementation on protecting the bitmaps. When mounting a
4994 * filesystem, we recompute the auxiliary counts from the bitmaps.
4995 */
4996
4997/*
4998 * Called just after updating the cylinder group block to allocate an inode.
4999 */
5000void
5001softdep_setup_inomapdep(bp, ip, newinum, mode)
5002	struct buf *bp;		/* buffer for cylgroup block with inode map */
5003	struct inode *ip;	/* inode related to allocation */
5004	ino_t newinum;		/* new inode number being allocated */
5005	int mode;
5006{
5007	struct inodedep *inodedep;
5008	struct bmsafemap *bmsafemap;
5009	struct jaddref *jaddref;
5010	struct mount *mp;
5011	struct fs *fs;
5012
5013	mp = UFSTOVFS(ip->i_ump);
5014	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5015	    ("softdep_setup_inomapdep called on non-softdep filesystem"));
5016	fs = ip->i_ump->um_fs;
5017	jaddref = NULL;
5018
5019	/*
5020	 * Allocate the journal reference add structure so that the bitmap
5021	 * can be dependent on it.
5022	 */
5023	if (MOUNTEDSUJ(mp)) {
5024		jaddref = newjaddref(ip, newinum, 0, 0, mode);
5025		jaddref->ja_state |= NEWBLOCK;
5026	}
5027
5028	/*
5029	 * Create a dependency for the newly allocated inode.
5030	 * Panic if it already exists as something is seriously wrong.
5031	 * Otherwise add it to the dependency list for the buffer holding
5032	 * the cylinder group map from which it was allocated.
5033	 *
5034	 * We have to preallocate a bmsafemap entry in case it is needed
5035	 * in bmsafemap_lookup since once we allocate the inodedep, we
5036	 * have to finish initializing it before we can FREE_LOCK().
5037	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
5038	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
5039	 * creating the inodedep as it can be freed during the time
5040	 * that we FREE_LOCK() while allocating the inodedep. We must
5041	 * call workitem_alloc() before entering the locked section as
5042	 * it also acquires the lock and we must avoid trying doing so
5043	 * recursively.
5044	 */
5045	bmsafemap = malloc(sizeof(struct bmsafemap),
5046	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5047	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5048	ACQUIRE_LOCK(ip->i_ump);
5049	if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
5050		panic("softdep_setup_inomapdep: dependency %p for new"
5051		    "inode already exists", inodedep);
5052	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
5053	if (jaddref) {
5054		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
5055		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5056		    if_deps);
5057	} else {
5058		inodedep->id_state |= ONDEPLIST;
5059		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
5060	}
5061	inodedep->id_bmsafemap = bmsafemap;
5062	inodedep->id_state &= ~DEPCOMPLETE;
5063	FREE_LOCK(ip->i_ump);
5064}
5065
5066/*
5067 * Called just after updating the cylinder group block to
5068 * allocate block or fragment.
5069 */
5070void
5071softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
5072	struct buf *bp;		/* buffer for cylgroup block with block map */
5073	struct mount *mp;	/* filesystem doing allocation */
5074	ufs2_daddr_t newblkno;	/* number of newly allocated block */
5075	int frags;		/* Number of fragments. */
5076	int oldfrags;		/* Previous number of fragments for extend. */
5077{
5078	struct newblk *newblk;
5079	struct bmsafemap *bmsafemap;
5080	struct jnewblk *jnewblk;
5081	struct ufsmount *ump;
5082	struct fs *fs;
5083
5084	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5085	    ("softdep_setup_blkmapdep called on non-softdep filesystem"));
5086	ump = VFSTOUFS(mp);
5087	fs = ump->um_fs;
5088	jnewblk = NULL;
5089	/*
5090	 * Create a dependency for the newly allocated block.
5091	 * Add it to the dependency list for the buffer holding
5092	 * the cylinder group map from which it was allocated.
5093	 */
5094	if (MOUNTEDSUJ(mp)) {
5095		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
5096		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
5097		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
5098		jnewblk->jn_state = ATTACHED;
5099		jnewblk->jn_blkno = newblkno;
5100		jnewblk->jn_frags = frags;
5101		jnewblk->jn_oldfrags = oldfrags;
5102#ifdef SUJ_DEBUG
5103		{
5104			struct cg *cgp;
5105			uint8_t *blksfree;
5106			long bno;
5107			int i;
5108
5109			cgp = (struct cg *)bp->b_data;
5110			blksfree = cg_blksfree(cgp);
5111			bno = dtogd(fs, jnewblk->jn_blkno);
5112			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
5113			    i++) {
5114				if (isset(blksfree, bno + i))
5115					panic("softdep_setup_blkmapdep: "
5116					    "free fragment %d from %d-%d "
5117					    "state 0x%X dep %p", i,
5118					    jnewblk->jn_oldfrags,
5119					    jnewblk->jn_frags,
5120					    jnewblk->jn_state,
5121					    jnewblk->jn_dep);
5122			}
5123		}
5124#endif
5125	}
5126
5127	CTR3(KTR_SUJ,
5128	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
5129	    newblkno, frags, oldfrags);
5130	ACQUIRE_LOCK(ump);
5131	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
5132		panic("softdep_setup_blkmapdep: found block");
5133	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
5134	    dtog(fs, newblkno), NULL);
5135	if (jnewblk) {
5136		jnewblk->jn_dep = (struct worklist *)newblk;
5137		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
5138	} else {
5139		newblk->nb_state |= ONDEPLIST;
5140		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
5141	}
5142	newblk->nb_bmsafemap = bmsafemap;
5143	newblk->nb_jnewblk = jnewblk;
5144	FREE_LOCK(ump);
5145}
5146
5147#define	BMSAFEMAP_HASH(ump, cg) \
5148      (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
5149
5150static int
5151bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
5152	struct bmsafemap_hashhead *bmsafemaphd;
5153	int cg;
5154	struct bmsafemap **bmsafemapp;
5155{
5156	struct bmsafemap *bmsafemap;
5157
5158	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
5159		if (bmsafemap->sm_cg == cg)
5160			break;
5161	if (bmsafemap) {
5162		*bmsafemapp = bmsafemap;
5163		return (1);
5164	}
5165	*bmsafemapp = NULL;
5166
5167	return (0);
5168}
5169
5170/*
5171 * Find the bmsafemap associated with a cylinder group buffer.
5172 * If none exists, create one. The buffer must be locked when
5173 * this routine is called and this routine must be called with
5174 * the softdep lock held. To avoid giving up the lock while
5175 * allocating a new bmsafemap, a preallocated bmsafemap may be
5176 * provided. If it is provided but not needed, it is freed.
5177 */
5178static struct bmsafemap *
5179bmsafemap_lookup(mp, bp, cg, newbmsafemap)
5180	struct mount *mp;
5181	struct buf *bp;
5182	int cg;
5183	struct bmsafemap *newbmsafemap;
5184{
5185	struct bmsafemap_hashhead *bmsafemaphd;
5186	struct bmsafemap *bmsafemap, *collision;
5187	struct worklist *wk;
5188	struct ufsmount *ump;
5189
5190	ump = VFSTOUFS(mp);
5191	LOCK_OWNED(ump);
5192	KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5193	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5194		if (wk->wk_type == D_BMSAFEMAP) {
5195			if (newbmsafemap)
5196				WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5197			return (WK_BMSAFEMAP(wk));
5198		}
5199	}
5200	bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5201	if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5202		if (newbmsafemap)
5203			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5204		return (bmsafemap);
5205	}
5206	if (newbmsafemap) {
5207		bmsafemap = newbmsafemap;
5208	} else {
5209		FREE_LOCK(ump);
5210		bmsafemap = malloc(sizeof(struct bmsafemap),
5211			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5212		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5213		ACQUIRE_LOCK(ump);
5214	}
5215	bmsafemap->sm_buf = bp;
5216	LIST_INIT(&bmsafemap->sm_inodedephd);
5217	LIST_INIT(&bmsafemap->sm_inodedepwr);
5218	LIST_INIT(&bmsafemap->sm_newblkhd);
5219	LIST_INIT(&bmsafemap->sm_newblkwr);
5220	LIST_INIT(&bmsafemap->sm_jaddrefhd);
5221	LIST_INIT(&bmsafemap->sm_jnewblkhd);
5222	LIST_INIT(&bmsafemap->sm_freehd);
5223	LIST_INIT(&bmsafemap->sm_freewr);
5224	if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
5225		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5226		return (collision);
5227	}
5228	bmsafemap->sm_cg = cg;
5229	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5230	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
5231	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5232	return (bmsafemap);
5233}
5234
5235/*
5236 * Direct block allocation dependencies.
5237 *
5238 * When a new block is allocated, the corresponding disk locations must be
5239 * initialized (with zeros or new data) before the on-disk inode points to
5240 * them.  Also, the freemap from which the block was allocated must be
5241 * updated (on disk) before the inode's pointer. These two dependencies are
5242 * independent of each other and are needed for all file blocks and indirect
5243 * blocks that are pointed to directly by the inode.  Just before the
5244 * "in-core" version of the inode is updated with a newly allocated block
5245 * number, a procedure (below) is called to setup allocation dependency
5246 * structures.  These structures are removed when the corresponding
5247 * dependencies are satisfied or when the block allocation becomes obsolete
5248 * (i.e., the file is deleted, the block is de-allocated, or the block is a
5249 * fragment that gets upgraded).  All of these cases are handled in
5250 * procedures described later.
5251 *
5252 * When a file extension causes a fragment to be upgraded, either to a larger
5253 * fragment or to a full block, the on-disk location may change (if the
5254 * previous fragment could not simply be extended). In this case, the old
5255 * fragment must be de-allocated, but not until after the inode's pointer has
5256 * been updated. In most cases, this is handled by later procedures, which
5257 * will construct a "freefrag" structure to be added to the workitem queue
5258 * when the inode update is complete (or obsolete).  The main exception to
5259 * this is when an allocation occurs while a pending allocation dependency
5260 * (for the same block pointer) remains.  This case is handled in the main
5261 * allocation dependency setup procedure by immediately freeing the
5262 * unreferenced fragments.
5263 */
5264void
5265softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5266	struct inode *ip;	/* inode to which block is being added */
5267	ufs_lbn_t off;		/* block pointer within inode */
5268	ufs2_daddr_t newblkno;	/* disk block number being added */
5269	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
5270	long newsize;		/* size of new block */
5271	long oldsize;		/* size of new block */
5272	struct buf *bp;		/* bp for allocated block */
5273{
5274	struct allocdirect *adp, *oldadp;
5275	struct allocdirectlst *adphead;
5276	struct freefrag *freefrag;
5277	struct inodedep *inodedep;
5278	struct pagedep *pagedep;
5279	struct jnewblk *jnewblk;
5280	struct newblk *newblk;
5281	struct mount *mp;
5282	ufs_lbn_t lbn;
5283
5284	lbn = bp->b_lblkno;
5285	mp = UFSTOVFS(ip->i_ump);
5286	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5287	    ("softdep_setup_allocdirect called on non-softdep filesystem"));
5288	if (oldblkno && oldblkno != newblkno)
5289		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5290	else
5291		freefrag = NULL;
5292
5293	CTR6(KTR_SUJ,
5294	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5295	    "off %jd newsize %ld oldsize %d",
5296	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5297	ACQUIRE_LOCK(ip->i_ump);
5298	if (off >= NDADDR) {
5299		if (lbn > 0)
5300			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5301			    lbn, off);
5302		/* allocating an indirect block */
5303		if (oldblkno != 0)
5304			panic("softdep_setup_allocdirect: non-zero indir");
5305	} else {
5306		if (off != lbn)
5307			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5308			    lbn, off);
5309		/*
5310		 * Allocating a direct block.
5311		 *
5312		 * If we are allocating a directory block, then we must
5313		 * allocate an associated pagedep to track additions and
5314		 * deletions.
5315		 */
5316		if ((ip->i_mode & IFMT) == IFDIR)
5317			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5318			    &pagedep);
5319	}
5320	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5321		panic("softdep_setup_allocdirect: lost block");
5322	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5323	    ("softdep_setup_allocdirect: newblk already initialized"));
5324	/*
5325	 * Convert the newblk to an allocdirect.
5326	 */
5327	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5328	adp = (struct allocdirect *)newblk;
5329	newblk->nb_freefrag = freefrag;
5330	adp->ad_offset = off;
5331	adp->ad_oldblkno = oldblkno;
5332	adp->ad_newsize = newsize;
5333	adp->ad_oldsize = oldsize;
5334
5335	/*
5336	 * Finish initializing the journal.
5337	 */
5338	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5339		jnewblk->jn_ino = ip->i_number;
5340		jnewblk->jn_lbn = lbn;
5341		add_to_journal(&jnewblk->jn_list);
5342	}
5343	if (freefrag && freefrag->ff_jdep != NULL &&
5344	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5345		add_to_journal(freefrag->ff_jdep);
5346	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5347	adp->ad_inodedep = inodedep;
5348
5349	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5350	/*
5351	 * The list of allocdirects must be kept in sorted and ascending
5352	 * order so that the rollback routines can quickly determine the
5353	 * first uncommitted block (the size of the file stored on disk
5354	 * ends at the end of the lowest committed fragment, or if there
5355	 * are no fragments, at the end of the highest committed block).
5356	 * Since files generally grow, the typical case is that the new
5357	 * block is to be added at the end of the list. We speed this
5358	 * special case by checking against the last allocdirect in the
5359	 * list before laboriously traversing the list looking for the
5360	 * insertion point.
5361	 */
5362	adphead = &inodedep->id_newinoupdt;
5363	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5364	if (oldadp == NULL || oldadp->ad_offset <= off) {
5365		/* insert at end of list */
5366		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5367		if (oldadp != NULL && oldadp->ad_offset == off)
5368			allocdirect_merge(adphead, adp, oldadp);
5369		FREE_LOCK(ip->i_ump);
5370		return;
5371	}
5372	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5373		if (oldadp->ad_offset >= off)
5374			break;
5375	}
5376	if (oldadp == NULL)
5377		panic("softdep_setup_allocdirect: lost entry");
5378	/* insert in middle of list */
5379	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5380	if (oldadp->ad_offset == off)
5381		allocdirect_merge(adphead, adp, oldadp);
5382
5383	FREE_LOCK(ip->i_ump);
5384}
5385
5386/*
5387 * Merge a newer and older journal record to be stored either in a
5388 * newblock or freefrag.  This handles aggregating journal records for
5389 * fragment allocation into a second record as well as replacing a
5390 * journal free with an aborted journal allocation.  A segment for the
5391 * oldest record will be placed on wkhd if it has been written.  If not
5392 * the segment for the newer record will suffice.
5393 */
5394static struct worklist *
5395jnewblk_merge(new, old, wkhd)
5396	struct worklist *new;
5397	struct worklist *old;
5398	struct workhead *wkhd;
5399{
5400	struct jnewblk *njnewblk;
5401	struct jnewblk *jnewblk;
5402
5403	/* Handle NULLs to simplify callers. */
5404	if (new == NULL)
5405		return (old);
5406	if (old == NULL)
5407		return (new);
5408	/* Replace a jfreefrag with a jnewblk. */
5409	if (new->wk_type == D_JFREEFRAG) {
5410		if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5411			panic("jnewblk_merge: blkno mismatch: %p, %p",
5412			    old, new);
5413		cancel_jfreefrag(WK_JFREEFRAG(new));
5414		return (old);
5415	}
5416	if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5417		panic("jnewblk_merge: Bad type: old %d new %d\n",
5418		    old->wk_type, new->wk_type);
5419	/*
5420	 * Handle merging of two jnewblk records that describe
5421	 * different sets of fragments in the same block.
5422	 */
5423	jnewblk = WK_JNEWBLK(old);
5424	njnewblk = WK_JNEWBLK(new);
5425	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5426		panic("jnewblk_merge: Merging disparate blocks.");
5427	/*
5428	 * The record may be rolled back in the cg.
5429	 */
5430	if (jnewblk->jn_state & UNDONE) {
5431		jnewblk->jn_state &= ~UNDONE;
5432		njnewblk->jn_state |= UNDONE;
5433		njnewblk->jn_state &= ~ATTACHED;
5434	}
5435	/*
5436	 * We modify the newer addref and free the older so that if neither
5437	 * has been written the most up-to-date copy will be on disk.  If
5438	 * both have been written but rolled back we only temporarily need
5439	 * one of them to fix the bits when the cg write completes.
5440	 */
5441	jnewblk->jn_state |= ATTACHED | COMPLETE;
5442	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5443	cancel_jnewblk(jnewblk, wkhd);
5444	WORKLIST_REMOVE(&jnewblk->jn_list);
5445	free_jnewblk(jnewblk);
5446	return (new);
5447}
5448
5449/*
5450 * Replace an old allocdirect dependency with a newer one.
5451 * This routine must be called with splbio interrupts blocked.
5452 */
5453static void
5454allocdirect_merge(adphead, newadp, oldadp)
5455	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
5456	struct allocdirect *newadp;	/* allocdirect being added */
5457	struct allocdirect *oldadp;	/* existing allocdirect being checked */
5458{
5459	struct worklist *wk;
5460	struct freefrag *freefrag;
5461
5462	freefrag = NULL;
5463	LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
5464	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5465	    newadp->ad_oldsize != oldadp->ad_newsize ||
5466	    newadp->ad_offset >= NDADDR)
5467		panic("%s %jd != new %jd || old size %ld != new %ld",
5468		    "allocdirect_merge: old blkno",
5469		    (intmax_t)newadp->ad_oldblkno,
5470		    (intmax_t)oldadp->ad_newblkno,
5471		    newadp->ad_oldsize, oldadp->ad_newsize);
5472	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5473	newadp->ad_oldsize = oldadp->ad_oldsize;
5474	/*
5475	 * If the old dependency had a fragment to free or had never
5476	 * previously had a block allocated, then the new dependency
5477	 * can immediately post its freefrag and adopt the old freefrag.
5478	 * This action is done by swapping the freefrag dependencies.
5479	 * The new dependency gains the old one's freefrag, and the
5480	 * old one gets the new one and then immediately puts it on
5481	 * the worklist when it is freed by free_newblk. It is
5482	 * not possible to do this swap when the old dependency had a
5483	 * non-zero size but no previous fragment to free. This condition
5484	 * arises when the new block is an extension of the old block.
5485	 * Here, the first part of the fragment allocated to the new
5486	 * dependency is part of the block currently claimed on disk by
5487	 * the old dependency, so cannot legitimately be freed until the
5488	 * conditions for the new dependency are fulfilled.
5489	 */
5490	freefrag = newadp->ad_freefrag;
5491	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5492		newadp->ad_freefrag = oldadp->ad_freefrag;
5493		oldadp->ad_freefrag = freefrag;
5494	}
5495	/*
5496	 * If we are tracking a new directory-block allocation,
5497	 * move it from the old allocdirect to the new allocdirect.
5498	 */
5499	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5500		WORKLIST_REMOVE(wk);
5501		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5502			panic("allocdirect_merge: extra newdirblk");
5503		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5504	}
5505	TAILQ_REMOVE(adphead, oldadp, ad_next);
5506	/*
5507	 * We need to move any journal dependencies over to the freefrag
5508	 * that releases this block if it exists.  Otherwise we are
5509	 * extending an existing block and we'll wait until that is
5510	 * complete to release the journal space and extend the
5511	 * new journal to cover this old space as well.
5512	 */
5513	if (freefrag == NULL) {
5514		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5515			panic("allocdirect_merge: %jd != %jd",
5516			    oldadp->ad_newblkno, newadp->ad_newblkno);
5517		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5518		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5519		    &oldadp->ad_block.nb_jnewblk->jn_list,
5520		    &newadp->ad_block.nb_jwork);
5521		oldadp->ad_block.nb_jnewblk = NULL;
5522		cancel_newblk(&oldadp->ad_block, NULL,
5523		    &newadp->ad_block.nb_jwork);
5524	} else {
5525		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5526		    &freefrag->ff_list, &freefrag->ff_jwork);
5527		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5528		    &freefrag->ff_jwork);
5529	}
5530	free_newblk(&oldadp->ad_block);
5531}
5532
5533/*
5534 * Allocate a jfreefrag structure to journal a single block free.
5535 */
5536static struct jfreefrag *
5537newjfreefrag(freefrag, ip, blkno, size, lbn)
5538	struct freefrag *freefrag;
5539	struct inode *ip;
5540	ufs2_daddr_t blkno;
5541	long size;
5542	ufs_lbn_t lbn;
5543{
5544	struct jfreefrag *jfreefrag;
5545	struct fs *fs;
5546
5547	fs = ip->i_fs;
5548	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5549	    M_SOFTDEP_FLAGS);
5550	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, UFSTOVFS(ip->i_ump));
5551	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5552	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5553	jfreefrag->fr_ino = ip->i_number;
5554	jfreefrag->fr_lbn = lbn;
5555	jfreefrag->fr_blkno = blkno;
5556	jfreefrag->fr_frags = numfrags(fs, size);
5557	jfreefrag->fr_freefrag = freefrag;
5558
5559	return (jfreefrag);
5560}
5561
5562/*
5563 * Allocate a new freefrag structure.
5564 */
5565static struct freefrag *
5566newfreefrag(ip, blkno, size, lbn)
5567	struct inode *ip;
5568	ufs2_daddr_t blkno;
5569	long size;
5570	ufs_lbn_t lbn;
5571{
5572	struct freefrag *freefrag;
5573	struct fs *fs;
5574
5575	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5576	    ip->i_number, blkno, size, lbn);
5577	fs = ip->i_fs;
5578	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5579		panic("newfreefrag: frag size");
5580	freefrag = malloc(sizeof(struct freefrag),
5581	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5582	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ip->i_ump));
5583	freefrag->ff_state = ATTACHED;
5584	LIST_INIT(&freefrag->ff_jwork);
5585	freefrag->ff_inum = ip->i_number;
5586	freefrag->ff_vtype = ITOV(ip)->v_type;
5587	freefrag->ff_blkno = blkno;
5588	freefrag->ff_fragsize = size;
5589
5590	if (MOUNTEDSUJ(UFSTOVFS(ip->i_ump))) {
5591		freefrag->ff_jdep = (struct worklist *)
5592		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5593	} else {
5594		freefrag->ff_state |= DEPCOMPLETE;
5595		freefrag->ff_jdep = NULL;
5596	}
5597
5598	return (freefrag);
5599}
5600
5601/*
5602 * This workitem de-allocates fragments that were replaced during
5603 * file block allocation.
5604 */
5605static void
5606handle_workitem_freefrag(freefrag)
5607	struct freefrag *freefrag;
5608{
5609	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5610	struct workhead wkhd;
5611
5612	CTR3(KTR_SUJ,
5613	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5614	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5615	/*
5616	 * It would be illegal to add new completion items to the
5617	 * freefrag after it was schedule to be done so it must be
5618	 * safe to modify the list head here.
5619	 */
5620	LIST_INIT(&wkhd);
5621	ACQUIRE_LOCK(ump);
5622	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5623	/*
5624	 * If the journal has not been written we must cancel it here.
5625	 */
5626	if (freefrag->ff_jdep) {
5627		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5628			panic("handle_workitem_freefrag: Unexpected type %d\n",
5629			    freefrag->ff_jdep->wk_type);
5630		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5631	}
5632	FREE_LOCK(ump);
5633	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5634	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype, &wkhd);
5635	ACQUIRE_LOCK(ump);
5636	WORKITEM_FREE(freefrag, D_FREEFRAG);
5637	FREE_LOCK(ump);
5638}
5639
5640/*
5641 * Set up a dependency structure for an external attributes data block.
5642 * This routine follows much of the structure of softdep_setup_allocdirect.
5643 * See the description of softdep_setup_allocdirect above for details.
5644 */
5645void
5646softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5647	struct inode *ip;
5648	ufs_lbn_t off;
5649	ufs2_daddr_t newblkno;
5650	ufs2_daddr_t oldblkno;
5651	long newsize;
5652	long oldsize;
5653	struct buf *bp;
5654{
5655	struct allocdirect *adp, *oldadp;
5656	struct allocdirectlst *adphead;
5657	struct freefrag *freefrag;
5658	struct inodedep *inodedep;
5659	struct jnewblk *jnewblk;
5660	struct newblk *newblk;
5661	struct mount *mp;
5662	ufs_lbn_t lbn;
5663
5664	mp = UFSTOVFS(ip->i_ump);
5665	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5666	    ("softdep_setup_allocext called on non-softdep filesystem"));
5667	KASSERT(off < NXADDR, ("softdep_setup_allocext: lbn %lld > NXADDR",
5668		    (long long)off));
5669
5670	lbn = bp->b_lblkno;
5671	if (oldblkno && oldblkno != newblkno)
5672		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn);
5673	else
5674		freefrag = NULL;
5675
5676	ACQUIRE_LOCK(ip->i_ump);
5677	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5678		panic("softdep_setup_allocext: lost block");
5679	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5680	    ("softdep_setup_allocext: newblk already initialized"));
5681	/*
5682	 * Convert the newblk to an allocdirect.
5683	 */
5684	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5685	adp = (struct allocdirect *)newblk;
5686	newblk->nb_freefrag = freefrag;
5687	adp->ad_offset = off;
5688	adp->ad_oldblkno = oldblkno;
5689	adp->ad_newsize = newsize;
5690	adp->ad_oldsize = oldsize;
5691	adp->ad_state |=  EXTDATA;
5692
5693	/*
5694	 * Finish initializing the journal.
5695	 */
5696	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5697		jnewblk->jn_ino = ip->i_number;
5698		jnewblk->jn_lbn = lbn;
5699		add_to_journal(&jnewblk->jn_list);
5700	}
5701	if (freefrag && freefrag->ff_jdep != NULL &&
5702	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5703		add_to_journal(freefrag->ff_jdep);
5704	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5705	adp->ad_inodedep = inodedep;
5706
5707	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5708	/*
5709	 * The list of allocdirects must be kept in sorted and ascending
5710	 * order so that the rollback routines can quickly determine the
5711	 * first uncommitted block (the size of the file stored on disk
5712	 * ends at the end of the lowest committed fragment, or if there
5713	 * are no fragments, at the end of the highest committed block).
5714	 * Since files generally grow, the typical case is that the new
5715	 * block is to be added at the end of the list. We speed this
5716	 * special case by checking against the last allocdirect in the
5717	 * list before laboriously traversing the list looking for the
5718	 * insertion point.
5719	 */
5720	adphead = &inodedep->id_newextupdt;
5721	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5722	if (oldadp == NULL || oldadp->ad_offset <= off) {
5723		/* insert at end of list */
5724		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5725		if (oldadp != NULL && oldadp->ad_offset == off)
5726			allocdirect_merge(adphead, adp, oldadp);
5727		FREE_LOCK(ip->i_ump);
5728		return;
5729	}
5730	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5731		if (oldadp->ad_offset >= off)
5732			break;
5733	}
5734	if (oldadp == NULL)
5735		panic("softdep_setup_allocext: lost entry");
5736	/* insert in middle of list */
5737	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5738	if (oldadp->ad_offset == off)
5739		allocdirect_merge(adphead, adp, oldadp);
5740	FREE_LOCK(ip->i_ump);
5741}
5742
5743/*
5744 * Indirect block allocation dependencies.
5745 *
5746 * The same dependencies that exist for a direct block also exist when
5747 * a new block is allocated and pointed to by an entry in a block of
5748 * indirect pointers. The undo/redo states described above are also
5749 * used here. Because an indirect block contains many pointers that
5750 * may have dependencies, a second copy of the entire in-memory indirect
5751 * block is kept. The buffer cache copy is always completely up-to-date.
5752 * The second copy, which is used only as a source for disk writes,
5753 * contains only the safe pointers (i.e., those that have no remaining
5754 * update dependencies). The second copy is freed when all pointers
5755 * are safe. The cache is not allowed to replace indirect blocks with
5756 * pending update dependencies. If a buffer containing an indirect
5757 * block with dependencies is written, these routines will mark it
5758 * dirty again. It can only be successfully written once all the
5759 * dependencies are removed. The ffs_fsync routine in conjunction with
5760 * softdep_sync_metadata work together to get all the dependencies
5761 * removed so that a file can be successfully written to disk. Three
5762 * procedures are used when setting up indirect block pointer
5763 * dependencies. The division is necessary because of the organization
5764 * of the "balloc" routine and because of the distinction between file
5765 * pages and file metadata blocks.
5766 */
5767
5768/*
5769 * Allocate a new allocindir structure.
5770 */
5771static struct allocindir *
5772newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
5773	struct inode *ip;	/* inode for file being extended */
5774	int ptrno;		/* offset of pointer in indirect block */
5775	ufs2_daddr_t newblkno;	/* disk block number being added */
5776	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5777	ufs_lbn_t lbn;
5778{
5779	struct newblk *newblk;
5780	struct allocindir *aip;
5781	struct freefrag *freefrag;
5782	struct jnewblk *jnewblk;
5783
5784	if (oldblkno)
5785		freefrag = newfreefrag(ip, oldblkno, ip->i_fs->fs_bsize, lbn);
5786	else
5787		freefrag = NULL;
5788	ACQUIRE_LOCK(ip->i_ump);
5789	if (newblk_lookup(UFSTOVFS(ip->i_ump), newblkno, 0, &newblk) == 0)
5790		panic("new_allocindir: lost block");
5791	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5792	    ("newallocindir: newblk already initialized"));
5793	WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
5794	newblk->nb_freefrag = freefrag;
5795	aip = (struct allocindir *)newblk;
5796	aip->ai_offset = ptrno;
5797	aip->ai_oldblkno = oldblkno;
5798	aip->ai_lbn = lbn;
5799	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5800		jnewblk->jn_ino = ip->i_number;
5801		jnewblk->jn_lbn = lbn;
5802		add_to_journal(&jnewblk->jn_list);
5803	}
5804	if (freefrag && freefrag->ff_jdep != NULL &&
5805	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5806		add_to_journal(freefrag->ff_jdep);
5807	return (aip);
5808}
5809
5810/*
5811 * Called just before setting an indirect block pointer
5812 * to a newly allocated file page.
5813 */
5814void
5815softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
5816	struct inode *ip;	/* inode for file being extended */
5817	ufs_lbn_t lbn;		/* allocated block number within file */
5818	struct buf *bp;		/* buffer with indirect blk referencing page */
5819	int ptrno;		/* offset of pointer in indirect block */
5820	ufs2_daddr_t newblkno;	/* disk block number being added */
5821	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
5822	struct buf *nbp;	/* buffer holding allocated page */
5823{
5824	struct inodedep *inodedep;
5825	struct freefrag *freefrag;
5826	struct allocindir *aip;
5827	struct pagedep *pagedep;
5828	struct mount *mp;
5829
5830	mp = UFSTOVFS(ip->i_ump);
5831	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5832	    ("softdep_setup_allocindir_page called on non-softdep filesystem"));
5833	KASSERT(lbn == nbp->b_lblkno,
5834	    ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
5835	    lbn, bp->b_lblkno));
5836	CTR4(KTR_SUJ,
5837	    "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
5838	    "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
5839	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
5840	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
5841	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5842	/*
5843	 * If we are allocating a directory page, then we must
5844	 * allocate an associated pagedep to track additions and
5845	 * deletions.
5846	 */
5847	if ((ip->i_mode & IFMT) == IFDIR)
5848		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
5849	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5850	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
5851	FREE_LOCK(ip->i_ump);
5852	if (freefrag)
5853		handle_workitem_freefrag(freefrag);
5854}
5855
5856/*
5857 * Called just before setting an indirect block pointer to a
5858 * newly allocated indirect block.
5859 */
5860void
5861softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
5862	struct buf *nbp;	/* newly allocated indirect block */
5863	struct inode *ip;	/* inode for file being extended */
5864	struct buf *bp;		/* indirect block referencing allocated block */
5865	int ptrno;		/* offset of pointer in indirect block */
5866	ufs2_daddr_t newblkno;	/* disk block number being added */
5867{
5868	struct inodedep *inodedep;
5869	struct allocindir *aip;
5870	ufs_lbn_t lbn;
5871
5872	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
5873	    ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
5874	CTR3(KTR_SUJ,
5875	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
5876	    ip->i_number, newblkno, ptrno);
5877	lbn = nbp->b_lblkno;
5878	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
5879	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
5880	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC,
5881	    &inodedep);
5882	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
5883	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
5884		panic("softdep_setup_allocindir_meta: Block already existed");
5885	FREE_LOCK(ip->i_ump);
5886}
5887
5888static void
5889indirdep_complete(indirdep)
5890	struct indirdep *indirdep;
5891{
5892	struct allocindir *aip;
5893
5894	LIST_REMOVE(indirdep, ir_next);
5895	indirdep->ir_state |= DEPCOMPLETE;
5896
5897	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
5898		LIST_REMOVE(aip, ai_next);
5899		free_newblk(&aip->ai_block);
5900	}
5901	/*
5902	 * If this indirdep is not attached to a buf it was simply waiting
5903	 * on completion to clear completehd.  free_indirdep() asserts
5904	 * that nothing is dangling.
5905	 */
5906	if ((indirdep->ir_state & ONWORKLIST) == 0)
5907		free_indirdep(indirdep);
5908}
5909
5910static struct indirdep *
5911indirdep_lookup(mp, ip, bp)
5912	struct mount *mp;
5913	struct inode *ip;
5914	struct buf *bp;
5915{
5916	struct indirdep *indirdep, *newindirdep;
5917	struct newblk *newblk;
5918	struct ufsmount *ump;
5919	struct worklist *wk;
5920	struct fs *fs;
5921	ufs2_daddr_t blkno;
5922
5923	ump = VFSTOUFS(mp);
5924	LOCK_OWNED(ump);
5925	indirdep = NULL;
5926	newindirdep = NULL;
5927	fs = ip->i_fs;
5928	for (;;) {
5929		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5930			if (wk->wk_type != D_INDIRDEP)
5931				continue;
5932			indirdep = WK_INDIRDEP(wk);
5933			break;
5934		}
5935		/* Found on the buffer worklist, no new structure to free. */
5936		if (indirdep != NULL && newindirdep == NULL)
5937			return (indirdep);
5938		if (indirdep != NULL && newindirdep != NULL)
5939			panic("indirdep_lookup: simultaneous create");
5940		/* None found on the buffer and a new structure is ready. */
5941		if (indirdep == NULL && newindirdep != NULL)
5942			break;
5943		/* None found and no new structure available. */
5944		FREE_LOCK(ump);
5945		newindirdep = malloc(sizeof(struct indirdep),
5946		    M_INDIRDEP, M_SOFTDEP_FLAGS);
5947		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
5948		newindirdep->ir_state = ATTACHED;
5949		if (ip->i_ump->um_fstype == UFS1)
5950			newindirdep->ir_state |= UFS1FMT;
5951		TAILQ_INIT(&newindirdep->ir_trunc);
5952		newindirdep->ir_saveddata = NULL;
5953		LIST_INIT(&newindirdep->ir_deplisthd);
5954		LIST_INIT(&newindirdep->ir_donehd);
5955		LIST_INIT(&newindirdep->ir_writehd);
5956		LIST_INIT(&newindirdep->ir_completehd);
5957		if (bp->b_blkno == bp->b_lblkno) {
5958			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
5959			    NULL, NULL);
5960			bp->b_blkno = blkno;
5961		}
5962		newindirdep->ir_freeblks = NULL;
5963		newindirdep->ir_savebp =
5964		    getblk(ip->i_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
5965		newindirdep->ir_bp = bp;
5966		BUF_KERNPROC(newindirdep->ir_savebp);
5967		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
5968		ACQUIRE_LOCK(ump);
5969	}
5970	indirdep = newindirdep;
5971	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
5972	/*
5973	 * If the block is not yet allocated we don't set DEPCOMPLETE so
5974	 * that we don't free dependencies until the pointers are valid.
5975	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
5976	 * than using the hash.
5977	 */
5978	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
5979		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
5980	else
5981		indirdep->ir_state |= DEPCOMPLETE;
5982	return (indirdep);
5983}
5984
5985/*
5986 * Called to finish the allocation of the "aip" allocated
5987 * by one of the two routines above.
5988 */
5989static struct freefrag *
5990setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
5991	struct buf *bp;		/* in-memory copy of the indirect block */
5992	struct inode *ip;	/* inode for file being extended */
5993	struct inodedep *inodedep; /* Inodedep for ip */
5994	struct allocindir *aip;	/* allocindir allocated by the above routines */
5995	ufs_lbn_t lbn;		/* Logical block number for this block. */
5996{
5997	struct fs *fs;
5998	struct indirdep *indirdep;
5999	struct allocindir *oldaip;
6000	struct freefrag *freefrag;
6001	struct mount *mp;
6002
6003	LOCK_OWNED(ip->i_ump);
6004	mp = UFSTOVFS(ip->i_ump);
6005	fs = ip->i_fs;
6006	if (bp->b_lblkno >= 0)
6007		panic("setup_allocindir_phase2: not indir blk");
6008	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
6009	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
6010	indirdep = indirdep_lookup(mp, ip, bp);
6011	KASSERT(indirdep->ir_savebp != NULL,
6012	    ("setup_allocindir_phase2 NULL ir_savebp"));
6013	aip->ai_indirdep = indirdep;
6014	/*
6015	 * Check for an unwritten dependency for this indirect offset.  If
6016	 * there is, merge the old dependency into the new one.  This happens
6017	 * as a result of reallocblk only.
6018	 */
6019	freefrag = NULL;
6020	if (aip->ai_oldblkno != 0) {
6021		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
6022			if (oldaip->ai_offset == aip->ai_offset) {
6023				freefrag = allocindir_merge(aip, oldaip);
6024				goto done;
6025			}
6026		}
6027		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
6028			if (oldaip->ai_offset == aip->ai_offset) {
6029				freefrag = allocindir_merge(aip, oldaip);
6030				goto done;
6031			}
6032		}
6033	}
6034done:
6035	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
6036	return (freefrag);
6037}
6038
6039/*
6040 * Merge two allocindirs which refer to the same block.  Move newblock
6041 * dependencies and setup the freefrags appropriately.
6042 */
6043static struct freefrag *
6044allocindir_merge(aip, oldaip)
6045	struct allocindir *aip;
6046	struct allocindir *oldaip;
6047{
6048	struct freefrag *freefrag;
6049	struct worklist *wk;
6050
6051	if (oldaip->ai_newblkno != aip->ai_oldblkno)
6052		panic("allocindir_merge: blkno");
6053	aip->ai_oldblkno = oldaip->ai_oldblkno;
6054	freefrag = aip->ai_freefrag;
6055	aip->ai_freefrag = oldaip->ai_freefrag;
6056	oldaip->ai_freefrag = NULL;
6057	KASSERT(freefrag != NULL, ("setup_allocindir_phase2: No freefrag"));
6058	/*
6059	 * If we are tracking a new directory-block allocation,
6060	 * move it from the old allocindir to the new allocindir.
6061	 */
6062	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
6063		WORKLIST_REMOVE(wk);
6064		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
6065			panic("allocindir_merge: extra newdirblk");
6066		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
6067	}
6068	/*
6069	 * We can skip journaling for this freefrag and just complete
6070	 * any pending journal work for the allocindir that is being
6071	 * removed after the freefrag completes.
6072	 */
6073	if (freefrag->ff_jdep)
6074		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
6075	LIST_REMOVE(oldaip, ai_next);
6076	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
6077	    &freefrag->ff_list, &freefrag->ff_jwork);
6078	free_newblk(&oldaip->ai_block);
6079
6080	return (freefrag);
6081}
6082
6083static inline void
6084setup_freedirect(freeblks, ip, i, needj)
6085	struct freeblks *freeblks;
6086	struct inode *ip;
6087	int i;
6088	int needj;
6089{
6090	ufs2_daddr_t blkno;
6091	int frags;
6092
6093	blkno = DIP(ip, i_db[i]);
6094	if (blkno == 0)
6095		return;
6096	DIP_SET(ip, i_db[i], 0);
6097	frags = sblksize(ip->i_fs, ip->i_size, i);
6098	frags = numfrags(ip->i_fs, frags);
6099	newfreework(ip->i_ump, freeblks, NULL, i, blkno, frags, 0, needj);
6100}
6101
6102static inline void
6103setup_freeext(freeblks, ip, i, needj)
6104	struct freeblks *freeblks;
6105	struct inode *ip;
6106	int i;
6107	int needj;
6108{
6109	ufs2_daddr_t blkno;
6110	int frags;
6111
6112	blkno = ip->i_din2->di_extb[i];
6113	if (blkno == 0)
6114		return;
6115	ip->i_din2->di_extb[i] = 0;
6116	frags = sblksize(ip->i_fs, ip->i_din2->di_extsize, i);
6117	frags = numfrags(ip->i_fs, frags);
6118	newfreework(ip->i_ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
6119}
6120
6121static inline void
6122setup_freeindir(freeblks, ip, i, lbn, needj)
6123	struct freeblks *freeblks;
6124	struct inode *ip;
6125	int i;
6126	ufs_lbn_t lbn;
6127	int needj;
6128{
6129	ufs2_daddr_t blkno;
6130
6131	blkno = DIP(ip, i_ib[i]);
6132	if (blkno == 0)
6133		return;
6134	DIP_SET(ip, i_ib[i], 0);
6135	newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, ip->i_fs->fs_frag,
6136	    0, needj);
6137}
6138
6139static inline struct freeblks *
6140newfreeblks(mp, ip)
6141	struct mount *mp;
6142	struct inode *ip;
6143{
6144	struct freeblks *freeblks;
6145
6146	freeblks = malloc(sizeof(struct freeblks),
6147		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
6148	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
6149	LIST_INIT(&freeblks->fb_jblkdephd);
6150	LIST_INIT(&freeblks->fb_jwork);
6151	freeblks->fb_ref = 0;
6152	freeblks->fb_cgwait = 0;
6153	freeblks->fb_state = ATTACHED;
6154	freeblks->fb_uid = ip->i_uid;
6155	freeblks->fb_inum = ip->i_number;
6156	freeblks->fb_vtype = ITOV(ip)->v_type;
6157	freeblks->fb_modrev = DIP(ip, i_modrev);
6158	freeblks->fb_devvp = ip->i_devvp;
6159	freeblks->fb_chkcnt = 0;
6160	freeblks->fb_len = 0;
6161
6162	return (freeblks);
6163}
6164
6165static void
6166trunc_indirdep(indirdep, freeblks, bp, off)
6167	struct indirdep *indirdep;
6168	struct freeblks *freeblks;
6169	struct buf *bp;
6170	int off;
6171{
6172	struct allocindir *aip, *aipn;
6173
6174	/*
6175	 * The first set of allocindirs won't be in savedbp.
6176	 */
6177	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6178		if (aip->ai_offset > off)
6179			cancel_allocindir(aip, bp, freeblks, 1);
6180	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6181		if (aip->ai_offset > off)
6182			cancel_allocindir(aip, bp, freeblks, 1);
6183	/*
6184	 * These will exist in savedbp.
6185	 */
6186	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6187		if (aip->ai_offset > off)
6188			cancel_allocindir(aip, NULL, freeblks, 0);
6189	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6190		if (aip->ai_offset > off)
6191			cancel_allocindir(aip, NULL, freeblks, 0);
6192}
6193
6194/*
6195 * Follow the chain of indirects down to lastlbn creating a freework
6196 * structure for each.  This will be used to start indir_trunc() at
6197 * the right offset and create the journal records for the parrtial
6198 * truncation.  A second step will handle the truncated dependencies.
6199 */
6200static int
6201setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
6202	struct freeblks *freeblks;
6203	struct inode *ip;
6204	ufs_lbn_t lbn;
6205	ufs_lbn_t lastlbn;
6206	ufs2_daddr_t blkno;
6207{
6208	struct indirdep *indirdep;
6209	struct indirdep *indirn;
6210	struct freework *freework;
6211	struct newblk *newblk;
6212	struct mount *mp;
6213	struct buf *bp;
6214	uint8_t *start;
6215	uint8_t *end;
6216	ufs_lbn_t lbnadd;
6217	int level;
6218	int error;
6219	int off;
6220
6221
6222	freework = NULL;
6223	if (blkno == 0)
6224		return (0);
6225	mp = freeblks->fb_list.wk_mp;
6226	bp = getblk(ITOV(ip), lbn, mp->mnt_stat.f_iosize, 0, 0, 0);
6227	if ((bp->b_flags & B_CACHE) == 0) {
6228		bp->b_blkno = blkptrtodb(VFSTOUFS(mp), blkno);
6229		bp->b_iocmd = BIO_READ;
6230		bp->b_flags &= ~B_INVAL;
6231		bp->b_ioflags &= ~BIO_ERROR;
6232		vfs_busy_pages(bp, 0);
6233		bp->b_iooffset = dbtob(bp->b_blkno);
6234		bstrategy(bp);
6235		curthread->td_ru.ru_inblock++;
6236		error = bufwait(bp);
6237		if (error) {
6238			brelse(bp);
6239			return (error);
6240		}
6241	}
6242	level = lbn_level(lbn);
6243	lbnadd = lbn_offset(ip->i_fs, level);
6244	/*
6245	 * Compute the offset of the last block we want to keep.  Store
6246	 * in the freework the first block we want to completely free.
6247	 */
6248	off = (lastlbn - -(lbn + level)) / lbnadd;
6249	if (off + 1 == NINDIR(ip->i_fs))
6250		goto nowork;
6251	freework = newfreework(ip->i_ump, freeblks, NULL, lbn, blkno, 0, off+1,
6252	    0);
6253	/*
6254	 * Link the freework into the indirdep.  This will prevent any new
6255	 * allocations from proceeding until we are finished with the
6256	 * truncate and the block is written.
6257	 */
6258	ACQUIRE_LOCK(ip->i_ump);
6259	indirdep = indirdep_lookup(mp, ip, bp);
6260	if (indirdep->ir_freeblks)
6261		panic("setup_trunc_indir: indirdep already truncated.");
6262	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6263	freework->fw_indir = indirdep;
6264	/*
6265	 * Cancel any allocindirs that will not make it to disk.
6266	 * We have to do this for all copies of the indirdep that
6267	 * live on this newblk.
6268	 */
6269	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
6270		newblk_lookup(mp, dbtofsb(ip->i_fs, bp->b_blkno), 0, &newblk);
6271		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
6272			trunc_indirdep(indirn, freeblks, bp, off);
6273	} else
6274		trunc_indirdep(indirdep, freeblks, bp, off);
6275	FREE_LOCK(ip->i_ump);
6276	/*
6277	 * Creation is protected by the buf lock. The saveddata is only
6278	 * needed if a full truncation follows a partial truncation but it
6279	 * is difficult to allocate in that case so we fetch it anyway.
6280	 */
6281	if (indirdep->ir_saveddata == NULL)
6282		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
6283		    M_SOFTDEP_FLAGS);
6284nowork:
6285	/* Fetch the blkno of the child and the zero start offset. */
6286	if (ip->i_ump->um_fstype == UFS1) {
6287		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
6288		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
6289	} else {
6290		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
6291		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
6292	}
6293	if (freework) {
6294		/* Zero the truncated pointers. */
6295		end = bp->b_data + bp->b_bcount;
6296		bzero(start, end - start);
6297		bdwrite(bp);
6298	} else
6299		bqrelse(bp);
6300	if (level == 0)
6301		return (0);
6302	lbn++; /* adjust level */
6303	lbn -= (off * lbnadd);
6304	return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
6305}
6306
6307/*
6308 * Complete the partial truncation of an indirect block setup by
6309 * setup_trunc_indir().  This zeros the truncated pointers in the saved
6310 * copy and writes them to disk before the freeblks is allowed to complete.
6311 */
6312static void
6313complete_trunc_indir(freework)
6314	struct freework *freework;
6315{
6316	struct freework *fwn;
6317	struct indirdep *indirdep;
6318	struct ufsmount *ump;
6319	struct buf *bp;
6320	uintptr_t start;
6321	int count;
6322
6323	ump = VFSTOUFS(freework->fw_list.wk_mp);
6324	LOCK_OWNED(ump);
6325	indirdep = freework->fw_indir;
6326	for (;;) {
6327		bp = indirdep->ir_bp;
6328		/* See if the block was discarded. */
6329		if (bp == NULL)
6330			break;
6331		/* Inline part of getdirtybuf().  We dont want bremfree. */
6332		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
6333			break;
6334		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6335		    LOCK_PTR(ump)) == 0)
6336			BUF_UNLOCK(bp);
6337		ACQUIRE_LOCK(ump);
6338	}
6339	freework->fw_state |= DEPCOMPLETE;
6340	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
6341	/*
6342	 * Zero the pointers in the saved copy.
6343	 */
6344	if (indirdep->ir_state & UFS1FMT)
6345		start = sizeof(ufs1_daddr_t);
6346	else
6347		start = sizeof(ufs2_daddr_t);
6348	start *= freework->fw_start;
6349	count = indirdep->ir_savebp->b_bcount - start;
6350	start += (uintptr_t)indirdep->ir_savebp->b_data;
6351	bzero((char *)start, count);
6352	/*
6353	 * We need to start the next truncation in the list if it has not
6354	 * been started yet.
6355	 */
6356	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6357	if (fwn != NULL) {
6358		if (fwn->fw_freeblks == indirdep->ir_freeblks)
6359			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6360		if ((fwn->fw_state & ONWORKLIST) == 0)
6361			freework_enqueue(fwn);
6362	}
6363	/*
6364	 * If bp is NULL the block was fully truncated, restore
6365	 * the saved block list otherwise free it if it is no
6366	 * longer needed.
6367	 */
6368	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6369		if (bp == NULL)
6370			bcopy(indirdep->ir_saveddata,
6371			    indirdep->ir_savebp->b_data,
6372			    indirdep->ir_savebp->b_bcount);
6373		free(indirdep->ir_saveddata, M_INDIRDEP);
6374		indirdep->ir_saveddata = NULL;
6375	}
6376	/*
6377	 * When bp is NULL there is a full truncation pending.  We
6378	 * must wait for this full truncation to be journaled before
6379	 * we can release this freework because the disk pointers will
6380	 * never be written as zero.
6381	 */
6382	if (bp == NULL)  {
6383		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6384			handle_written_freework(freework);
6385		else
6386			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6387			   &freework->fw_list);
6388	} else {
6389		/* Complete when the real copy is written. */
6390		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6391		BUF_UNLOCK(bp);
6392	}
6393}
6394
6395/*
6396 * Calculate the number of blocks we are going to release where datablocks
6397 * is the current total and length is the new file size.
6398 */
6399static ufs2_daddr_t
6400blkcount(fs, datablocks, length)
6401	struct fs *fs;
6402	ufs2_daddr_t datablocks;
6403	off_t length;
6404{
6405	off_t totblks, numblks;
6406
6407	totblks = 0;
6408	numblks = howmany(length, fs->fs_bsize);
6409	if (numblks <= NDADDR) {
6410		totblks = howmany(length, fs->fs_fsize);
6411		goto out;
6412	}
6413        totblks = blkstofrags(fs, numblks);
6414	numblks -= NDADDR;
6415	/*
6416	 * Count all single, then double, then triple indirects required.
6417	 * Subtracting one indirects worth of blocks for each pass
6418	 * acknowledges one of each pointed to by the inode.
6419	 */
6420	for (;;) {
6421		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6422		numblks -= NINDIR(fs);
6423		if (numblks <= 0)
6424			break;
6425		numblks = howmany(numblks, NINDIR(fs));
6426	}
6427out:
6428	totblks = fsbtodb(fs, totblks);
6429	/*
6430	 * Handle sparse files.  We can't reclaim more blocks than the inode
6431	 * references.  We will correct it later in handle_complete_freeblks()
6432	 * when we know the real count.
6433	 */
6434	if (totblks > datablocks)
6435		return (0);
6436	return (datablocks - totblks);
6437}
6438
6439/*
6440 * Handle freeblocks for journaled softupdate filesystems.
6441 *
6442 * Contrary to normal softupdates, we must preserve the block pointers in
6443 * indirects until their subordinates are free.  This is to avoid journaling
6444 * every block that is freed which may consume more space than the journal
6445 * itself.  The recovery program will see the free block journals at the
6446 * base of the truncated area and traverse them to reclaim space.  The
6447 * pointers in the inode may be cleared immediately after the journal
6448 * records are written because each direct and indirect pointer in the
6449 * inode is recorded in a journal.  This permits full truncation to proceed
6450 * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6451 *
6452 * The algorithm is as follows:
6453 * 1) Traverse the in-memory state and create journal entries to release
6454 *    the relevant blocks and full indirect trees.
6455 * 2) Traverse the indirect block chain adding partial truncation freework
6456 *    records to indirects in the path to lastlbn.  The freework will
6457 *    prevent new allocation dependencies from being satisfied in this
6458 *    indirect until the truncation completes.
6459 * 3) Read and lock the inode block, performing an update with the new size
6460 *    and pointers.  This prevents truncated data from becoming valid on
6461 *    disk through step 4.
6462 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6463 *    eliminate journal work for those records that do not require it.
6464 * 5) Schedule the journal records to be written followed by the inode block.
6465 * 6) Allocate any necessary frags for the end of file.
6466 * 7) Zero any partially truncated blocks.
6467 *
6468 * From this truncation proceeds asynchronously using the freework and
6469 * indir_trunc machinery.  The file will not be extended again into a
6470 * partially truncated indirect block until all work is completed but
6471 * the normal dependency mechanism ensures that it is rolled back/forward
6472 * as appropriate.  Further truncation may occur without delay and is
6473 * serialized in indir_trunc().
6474 */
6475void
6476softdep_journal_freeblocks(ip, cred, length, flags)
6477	struct inode *ip;	/* The inode whose length is to be reduced */
6478	struct ucred *cred;
6479	off_t length;		/* The new length for the file */
6480	int flags;		/* IO_EXT and/or IO_NORMAL */
6481{
6482	struct freeblks *freeblks, *fbn;
6483	struct worklist *wk, *wkn;
6484	struct inodedep *inodedep;
6485	struct jblkdep *jblkdep;
6486	struct allocdirect *adp, *adpn;
6487	struct ufsmount *ump;
6488	struct fs *fs;
6489	struct buf *bp;
6490	struct vnode *vp;
6491	struct mount *mp;
6492	ufs2_daddr_t extblocks, datablocks;
6493	ufs_lbn_t tmpval, lbn, lastlbn;
6494	int frags, lastoff, iboff, allocblock, needj, error, i;
6495
6496	fs = ip->i_fs;
6497	ump = ip->i_ump;
6498	mp = UFSTOVFS(ump);
6499	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6500	    ("softdep_journal_freeblocks called on non-softdep filesystem"));
6501	vp = ITOV(ip);
6502	needj = 1;
6503	iboff = -1;
6504	allocblock = 0;
6505	extblocks = 0;
6506	datablocks = 0;
6507	frags = 0;
6508	freeblks = newfreeblks(mp, ip);
6509	ACQUIRE_LOCK(ump);
6510	/*
6511	 * If we're truncating a removed file that will never be written
6512	 * we don't need to journal the block frees.  The canceled journals
6513	 * for the allocations will suffice.
6514	 */
6515	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6516	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6517	    length == 0)
6518		needj = 0;
6519	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6520	    ip->i_number, length, needj);
6521	FREE_LOCK(ump);
6522	/*
6523	 * Calculate the lbn that we are truncating to.  This results in -1
6524	 * if we're truncating the 0 bytes.  So it is the last lbn we want
6525	 * to keep, not the first lbn we want to truncate.
6526	 */
6527	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6528	lastoff = blkoff(fs, length);
6529	/*
6530	 * Compute frags we are keeping in lastlbn.  0 means all.
6531	 */
6532	if (lastlbn >= 0 && lastlbn < NDADDR) {
6533		frags = fragroundup(fs, lastoff);
6534		/* adp offset of last valid allocdirect. */
6535		iboff = lastlbn;
6536	} else if (lastlbn > 0)
6537		iboff = NDADDR;
6538	if (fs->fs_magic == FS_UFS2_MAGIC)
6539		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6540	/*
6541	 * Handle normal data blocks and indirects.  This section saves
6542	 * values used after the inode update to complete frag and indirect
6543	 * truncation.
6544	 */
6545	if ((flags & IO_NORMAL) != 0) {
6546		/*
6547		 * Handle truncation of whole direct and indirect blocks.
6548		 */
6549		for (i = iboff + 1; i < NDADDR; i++)
6550			setup_freedirect(freeblks, ip, i, needj);
6551		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6552		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6553			/* Release a whole indirect tree. */
6554			if (lbn > lastlbn) {
6555				setup_freeindir(freeblks, ip, i, -lbn -i,
6556				    needj);
6557				continue;
6558			}
6559			iboff = i + NDADDR;
6560			/*
6561			 * Traverse partially truncated indirect tree.
6562			 */
6563			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6564				setup_trunc_indir(freeblks, ip, -lbn - i,
6565				    lastlbn, DIP(ip, i_ib[i]));
6566		}
6567		/*
6568		 * Handle partial truncation to a frag boundary.
6569		 */
6570		if (frags) {
6571			ufs2_daddr_t blkno;
6572			long oldfrags;
6573
6574			oldfrags = blksize(fs, ip, lastlbn);
6575			blkno = DIP(ip, i_db[lastlbn]);
6576			if (blkno && oldfrags != frags) {
6577				oldfrags -= frags;
6578				oldfrags = numfrags(ip->i_fs, oldfrags);
6579				blkno += numfrags(ip->i_fs, frags);
6580				newfreework(ump, freeblks, NULL, lastlbn,
6581				    blkno, oldfrags, 0, needj);
6582				if (needj)
6583					adjust_newfreework(freeblks,
6584					    numfrags(ip->i_fs, frags));
6585			} else if (blkno == 0)
6586				allocblock = 1;
6587		}
6588		/*
6589		 * Add a journal record for partial truncate if we are
6590		 * handling indirect blocks.  Non-indirects need no extra
6591		 * journaling.
6592		 */
6593		if (length != 0 && lastlbn >= NDADDR) {
6594			ip->i_flag |= IN_TRUNCATED;
6595			newjtrunc(freeblks, length, 0);
6596		}
6597		ip->i_size = length;
6598		DIP_SET(ip, i_size, ip->i_size);
6599		datablocks = DIP(ip, i_blocks) - extblocks;
6600		if (length != 0)
6601			datablocks = blkcount(ip->i_fs, datablocks, length);
6602		freeblks->fb_len = length;
6603	}
6604	if ((flags & IO_EXT) != 0) {
6605		for (i = 0; i < NXADDR; i++)
6606			setup_freeext(freeblks, ip, i, needj);
6607		ip->i_din2->di_extsize = 0;
6608		datablocks += extblocks;
6609	}
6610#ifdef QUOTA
6611	/* Reference the quotas in case the block count is wrong in the end. */
6612	quotaref(vp, freeblks->fb_quota);
6613	(void) chkdq(ip, -datablocks, NOCRED, 0);
6614#endif
6615	freeblks->fb_chkcnt = -datablocks;
6616	UFS_LOCK(ump);
6617	fs->fs_pendingblocks += datablocks;
6618	UFS_UNLOCK(ump);
6619	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6620	/*
6621	 * Handle truncation of incomplete alloc direct dependencies.  We
6622	 * hold the inode block locked to prevent incomplete dependencies
6623	 * from reaching the disk while we are eliminating those that
6624	 * have been truncated.  This is a partially inlined ffs_update().
6625	 */
6626	ufs_itimes(vp);
6627	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
6628	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6629	    (int)fs->fs_bsize, cred, &bp);
6630	if (error) {
6631		brelse(bp);
6632		softdep_error("softdep_journal_freeblocks", error);
6633		return;
6634	}
6635	if (bp->b_bufsize == fs->fs_bsize)
6636		bp->b_flags |= B_CLUSTEROK;
6637	softdep_update_inodeblock(ip, bp, 0);
6638	if (ump->um_fstype == UFS1)
6639		*((struct ufs1_dinode *)bp->b_data +
6640		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
6641	else
6642		*((struct ufs2_dinode *)bp->b_data +
6643		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
6644	ACQUIRE_LOCK(ump);
6645	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6646	if ((inodedep->id_state & IOSTARTED) != 0)
6647		panic("softdep_setup_freeblocks: inode busy");
6648	/*
6649	 * Add the freeblks structure to the list of operations that
6650	 * must await the zero'ed inode being written to disk. If we
6651	 * still have a bitmap dependency (needj), then the inode
6652	 * has never been written to disk, so we can process the
6653	 * freeblks below once we have deleted the dependencies.
6654	 */
6655	if (needj)
6656		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6657	else
6658		freeblks->fb_state |= COMPLETE;
6659	if ((flags & IO_NORMAL) != 0) {
6660		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
6661			if (adp->ad_offset > iboff)
6662				cancel_allocdirect(&inodedep->id_inoupdt, adp,
6663				    freeblks);
6664			/*
6665			 * Truncate the allocdirect.  We could eliminate
6666			 * or modify journal records as well.
6667			 */
6668			else if (adp->ad_offset == iboff && frags)
6669				adp->ad_newsize = frags;
6670		}
6671	}
6672	if ((flags & IO_EXT) != 0)
6673		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
6674			cancel_allocdirect(&inodedep->id_extupdt, adp,
6675			    freeblks);
6676	/*
6677	 * Scan the bufwait list for newblock dependencies that will never
6678	 * make it to disk.
6679	 */
6680	LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
6681		if (wk->wk_type != D_ALLOCDIRECT)
6682			continue;
6683		adp = WK_ALLOCDIRECT(wk);
6684		if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
6685		    ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
6686			cancel_jfreeblk(freeblks, adp->ad_newblkno);
6687			cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
6688			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
6689		}
6690	}
6691	/*
6692	 * Add journal work.
6693	 */
6694	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
6695		add_to_journal(&jblkdep->jb_list);
6696	FREE_LOCK(ump);
6697	bdwrite(bp);
6698	/*
6699	 * Truncate dependency structures beyond length.
6700	 */
6701	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
6702	/*
6703	 * This is only set when we need to allocate a fragment because
6704	 * none existed at the end of a frag-sized file.  It handles only
6705	 * allocating a new, zero filled block.
6706	 */
6707	if (allocblock) {
6708		ip->i_size = length - lastoff;
6709		DIP_SET(ip, i_size, ip->i_size);
6710		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
6711		if (error != 0) {
6712			softdep_error("softdep_journal_freeblks", error);
6713			return;
6714		}
6715		ip->i_size = length;
6716		DIP_SET(ip, i_size, length);
6717		ip->i_flag |= IN_CHANGE | IN_UPDATE;
6718		allocbuf(bp, frags);
6719		ffs_update(vp, 0);
6720		bawrite(bp);
6721	} else if (lastoff != 0 && vp->v_type != VDIR) {
6722		int size;
6723
6724		/*
6725		 * Zero the end of a truncated frag or block.
6726		 */
6727		size = sblksize(fs, length, lastlbn);
6728		error = bread(vp, lastlbn, size, cred, &bp);
6729		if (error) {
6730			softdep_error("softdep_journal_freeblks", error);
6731			return;
6732		}
6733		bzero((char *)bp->b_data + lastoff, size - lastoff);
6734		bawrite(bp);
6735
6736	}
6737	ACQUIRE_LOCK(ump);
6738	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6739	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
6740	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
6741	/*
6742	 * We zero earlier truncations so they don't erroneously
6743	 * update i_blocks.
6744	 */
6745	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
6746		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
6747			fbn->fb_len = 0;
6748	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
6749	    LIST_EMPTY(&freeblks->fb_jblkdephd))
6750		freeblks->fb_state |= INPROGRESS;
6751	else
6752		freeblks = NULL;
6753	FREE_LOCK(ump);
6754	if (freeblks)
6755		handle_workitem_freeblocks(freeblks, 0);
6756	trunc_pages(ip, length, extblocks, flags);
6757
6758}
6759
6760/*
6761 * Flush a JOP_SYNC to the journal.
6762 */
6763void
6764softdep_journal_fsync(ip)
6765	struct inode *ip;
6766{
6767	struct jfsync *jfsync;
6768
6769	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
6770	    ("softdep_journal_fsync called on non-softdep filesystem"));
6771	if ((ip->i_flag & IN_TRUNCATED) == 0)
6772		return;
6773	ip->i_flag &= ~IN_TRUNCATED;
6774	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
6775	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ip->i_ump));
6776	jfsync->jfs_size = ip->i_size;
6777	jfsync->jfs_ino = ip->i_number;
6778	ACQUIRE_LOCK(ip->i_ump);
6779	add_to_journal(&jfsync->jfs_list);
6780	jwait(&jfsync->jfs_list, MNT_WAIT);
6781	FREE_LOCK(ip->i_ump);
6782}
6783
6784/*
6785 * Block de-allocation dependencies.
6786 *
6787 * When blocks are de-allocated, the on-disk pointers must be nullified before
6788 * the blocks are made available for use by other files.  (The true
6789 * requirement is that old pointers must be nullified before new on-disk
6790 * pointers are set.  We chose this slightly more stringent requirement to
6791 * reduce complexity.) Our implementation handles this dependency by updating
6792 * the inode (or indirect block) appropriately but delaying the actual block
6793 * de-allocation (i.e., freemap and free space count manipulation) until
6794 * after the updated versions reach stable storage.  After the disk is
6795 * updated, the blocks can be safely de-allocated whenever it is convenient.
6796 * This implementation handles only the common case of reducing a file's
6797 * length to zero. Other cases are handled by the conventional synchronous
6798 * write approach.
6799 *
6800 * The ffs implementation with which we worked double-checks
6801 * the state of the block pointers and file size as it reduces
6802 * a file's length.  Some of this code is replicated here in our
6803 * soft updates implementation.  The freeblks->fb_chkcnt field is
6804 * used to transfer a part of this information to the procedure
6805 * that eventually de-allocates the blocks.
6806 *
6807 * This routine should be called from the routine that shortens
6808 * a file's length, before the inode's size or block pointers
6809 * are modified. It will save the block pointer information for
6810 * later release and zero the inode so that the calling routine
6811 * can release it.
6812 */
6813void
6814softdep_setup_freeblocks(ip, length, flags)
6815	struct inode *ip;	/* The inode whose length is to be reduced */
6816	off_t length;		/* The new length for the file */
6817	int flags;		/* IO_EXT and/or IO_NORMAL */
6818{
6819	struct ufs1_dinode *dp1;
6820	struct ufs2_dinode *dp2;
6821	struct freeblks *freeblks;
6822	struct inodedep *inodedep;
6823	struct allocdirect *adp;
6824	struct ufsmount *ump;
6825	struct buf *bp;
6826	struct fs *fs;
6827	ufs2_daddr_t extblocks, datablocks;
6828	struct mount *mp;
6829	int i, delay, error;
6830	ufs_lbn_t tmpval;
6831	ufs_lbn_t lbn;
6832
6833	ump = ip->i_ump;
6834	mp = UFSTOVFS(ump);
6835	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6836	    ("softdep_setup_freeblocks called on non-softdep filesystem"));
6837	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
6838	    ip->i_number, length);
6839	KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
6840	fs = ip->i_fs;
6841	freeblks = newfreeblks(mp, ip);
6842	extblocks = 0;
6843	datablocks = 0;
6844	if (fs->fs_magic == FS_UFS2_MAGIC)
6845		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6846	if ((flags & IO_NORMAL) != 0) {
6847		for (i = 0; i < NDADDR; i++)
6848			setup_freedirect(freeblks, ip, i, 0);
6849		for (i = 0, tmpval = NINDIR(fs), lbn = NDADDR; i < NIADDR;
6850		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
6851			setup_freeindir(freeblks, ip, i, -lbn -i, 0);
6852		ip->i_size = 0;
6853		DIP_SET(ip, i_size, 0);
6854		datablocks = DIP(ip, i_blocks) - extblocks;
6855	}
6856	if ((flags & IO_EXT) != 0) {
6857		for (i = 0; i < NXADDR; i++)
6858			setup_freeext(freeblks, ip, i, 0);
6859		ip->i_din2->di_extsize = 0;
6860		datablocks += extblocks;
6861	}
6862#ifdef QUOTA
6863	/* Reference the quotas in case the block count is wrong in the end. */
6864	quotaref(ITOV(ip), freeblks->fb_quota);
6865	(void) chkdq(ip, -datablocks, NOCRED, 0);
6866#endif
6867	freeblks->fb_chkcnt = -datablocks;
6868	UFS_LOCK(ump);
6869	fs->fs_pendingblocks += datablocks;
6870	UFS_UNLOCK(ump);
6871	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
6872	/*
6873	 * Push the zero'ed inode to to its disk buffer so that we are free
6874	 * to delete its dependencies below. Once the dependencies are gone
6875	 * the buffer can be safely released.
6876	 */
6877	if ((error = bread(ip->i_devvp,
6878	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
6879	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
6880		brelse(bp);
6881		softdep_error("softdep_setup_freeblocks", error);
6882	}
6883	if (ump->um_fstype == UFS1) {
6884		dp1 = ((struct ufs1_dinode *)bp->b_data +
6885		    ino_to_fsbo(fs, ip->i_number));
6886		ip->i_din1->di_freelink = dp1->di_freelink;
6887		*dp1 = *ip->i_din1;
6888	} else {
6889		dp2 = ((struct ufs2_dinode *)bp->b_data +
6890		    ino_to_fsbo(fs, ip->i_number));
6891		ip->i_din2->di_freelink = dp2->di_freelink;
6892		*dp2 = *ip->i_din2;
6893	}
6894	/*
6895	 * Find and eliminate any inode dependencies.
6896	 */
6897	ACQUIRE_LOCK(ump);
6898	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6899	if ((inodedep->id_state & IOSTARTED) != 0)
6900		panic("softdep_setup_freeblocks: inode busy");
6901	/*
6902	 * Add the freeblks structure to the list of operations that
6903	 * must await the zero'ed inode being written to disk. If we
6904	 * still have a bitmap dependency (delay == 0), then the inode
6905	 * has never been written to disk, so we can process the
6906	 * freeblks below once we have deleted the dependencies.
6907	 */
6908	delay = (inodedep->id_state & DEPCOMPLETE);
6909	if (delay)
6910		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
6911	else
6912		freeblks->fb_state |= COMPLETE;
6913	/*
6914	 * Because the file length has been truncated to zero, any
6915	 * pending block allocation dependency structures associated
6916	 * with this inode are obsolete and can simply be de-allocated.
6917	 * We must first merge the two dependency lists to get rid of
6918	 * any duplicate freefrag structures, then purge the merged list.
6919	 * If we still have a bitmap dependency, then the inode has never
6920	 * been written to disk, so we can free any fragments without delay.
6921	 */
6922	if (flags & IO_NORMAL) {
6923		merge_inode_lists(&inodedep->id_newinoupdt,
6924		    &inodedep->id_inoupdt);
6925		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
6926			cancel_allocdirect(&inodedep->id_inoupdt, adp,
6927			    freeblks);
6928	}
6929	if (flags & IO_EXT) {
6930		merge_inode_lists(&inodedep->id_newextupdt,
6931		    &inodedep->id_extupdt);
6932		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
6933			cancel_allocdirect(&inodedep->id_extupdt, adp,
6934			    freeblks);
6935	}
6936	FREE_LOCK(ump);
6937	bdwrite(bp);
6938	trunc_dependencies(ip, freeblks, -1, 0, flags);
6939	ACQUIRE_LOCK(ump);
6940	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
6941		(void) free_inodedep(inodedep);
6942	freeblks->fb_state |= DEPCOMPLETE;
6943	/*
6944	 * If the inode with zeroed block pointers is now on disk
6945	 * we can start freeing blocks.
6946	 */
6947	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
6948		freeblks->fb_state |= INPROGRESS;
6949	else
6950		freeblks = NULL;
6951	FREE_LOCK(ump);
6952	if (freeblks)
6953		handle_workitem_freeblocks(freeblks, 0);
6954	trunc_pages(ip, length, extblocks, flags);
6955}
6956
6957/*
6958 * Eliminate pages from the page cache that back parts of this inode and
6959 * adjust the vnode pager's idea of our size.  This prevents stale data
6960 * from hanging around in the page cache.
6961 */
6962static void
6963trunc_pages(ip, length, extblocks, flags)
6964	struct inode *ip;
6965	off_t length;
6966	ufs2_daddr_t extblocks;
6967	int flags;
6968{
6969	struct vnode *vp;
6970	struct fs *fs;
6971	ufs_lbn_t lbn;
6972	off_t end, extend;
6973
6974	vp = ITOV(ip);
6975	fs = ip->i_fs;
6976	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
6977	if ((flags & IO_EXT) != 0)
6978		vn_pages_remove(vp, extend, 0);
6979	if ((flags & IO_NORMAL) == 0)
6980		return;
6981	BO_LOCK(&vp->v_bufobj);
6982	drain_output(vp);
6983	BO_UNLOCK(&vp->v_bufobj);
6984	/*
6985	 * The vnode pager eliminates file pages we eliminate indirects
6986	 * below.
6987	 */
6988	vnode_pager_setsize(vp, length);
6989	/*
6990	 * Calculate the end based on the last indirect we want to keep.  If
6991	 * the block extends into indirects we can just use the negative of
6992	 * its lbn.  Doubles and triples exist at lower numbers so we must
6993	 * be careful not to remove those, if they exist.  double and triple
6994	 * indirect lbns do not overlap with others so it is not important
6995	 * to verify how many levels are required.
6996	 */
6997	lbn = lblkno(fs, length);
6998	if (lbn >= NDADDR) {
6999		/* Calculate the virtual lbn of the triple indirect. */
7000		lbn = -lbn - (NIADDR - 1);
7001		end = OFF_TO_IDX(lblktosize(fs, lbn));
7002	} else
7003		end = extend;
7004	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
7005}
7006
7007/*
7008 * See if the buf bp is in the range eliminated by truncation.
7009 */
7010static int
7011trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
7012	struct buf *bp;
7013	int *blkoffp;
7014	ufs_lbn_t lastlbn;
7015	int lastoff;
7016	int flags;
7017{
7018	ufs_lbn_t lbn;
7019
7020	*blkoffp = 0;
7021	/* Only match ext/normal blocks as appropriate. */
7022	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
7023	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
7024		return (0);
7025	/* ALTDATA is always a full truncation. */
7026	if ((bp->b_xflags & BX_ALTDATA) != 0)
7027		return (1);
7028	/* -1 is full truncation. */
7029	if (lastlbn == -1)
7030		return (1);
7031	/*
7032	 * If this is a partial truncate we only want those
7033	 * blocks and indirect blocks that cover the range
7034	 * we're after.
7035	 */
7036	lbn = bp->b_lblkno;
7037	if (lbn < 0)
7038		lbn = -(lbn + lbn_level(lbn));
7039	if (lbn < lastlbn)
7040		return (0);
7041	/* Here we only truncate lblkno if it's partial. */
7042	if (lbn == lastlbn) {
7043		if (lastoff == 0)
7044			return (0);
7045		*blkoffp = lastoff;
7046	}
7047	return (1);
7048}
7049
7050/*
7051 * Eliminate any dependencies that exist in memory beyond lblkno:off
7052 */
7053static void
7054trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
7055	struct inode *ip;
7056	struct freeblks *freeblks;
7057	ufs_lbn_t lastlbn;
7058	int lastoff;
7059	int flags;
7060{
7061	struct bufobj *bo;
7062	struct vnode *vp;
7063	struct buf *bp;
7064	int blkoff;
7065
7066	/*
7067	 * We must wait for any I/O in progress to finish so that
7068	 * all potential buffers on the dirty list will be visible.
7069	 * Once they are all there, walk the list and get rid of
7070	 * any dependencies.
7071	 */
7072	vp = ITOV(ip);
7073	bo = &vp->v_bufobj;
7074	BO_LOCK(bo);
7075	drain_output(vp);
7076	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
7077		bp->b_vflags &= ~BV_SCANNED;
7078restart:
7079	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
7080		if (bp->b_vflags & BV_SCANNED)
7081			continue;
7082		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7083			bp->b_vflags |= BV_SCANNED;
7084			continue;
7085		}
7086		KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
7087		if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
7088			goto restart;
7089		BO_UNLOCK(bo);
7090		if (deallocate_dependencies(bp, freeblks, blkoff))
7091			bqrelse(bp);
7092		else
7093			brelse(bp);
7094		BO_LOCK(bo);
7095		goto restart;
7096	}
7097	/*
7098	 * Now do the work of vtruncbuf while also matching indirect blocks.
7099	 */
7100	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
7101		bp->b_vflags &= ~BV_SCANNED;
7102cleanrestart:
7103	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
7104		if (bp->b_vflags & BV_SCANNED)
7105			continue;
7106		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7107			bp->b_vflags |= BV_SCANNED;
7108			continue;
7109		}
7110		if (BUF_LOCK(bp,
7111		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
7112		    BO_LOCKPTR(bo)) == ENOLCK) {
7113			BO_LOCK(bo);
7114			goto cleanrestart;
7115		}
7116		bp->b_vflags |= BV_SCANNED;
7117		bremfree(bp);
7118		if (blkoff != 0) {
7119			allocbuf(bp, blkoff);
7120			bqrelse(bp);
7121		} else {
7122			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
7123			brelse(bp);
7124		}
7125		BO_LOCK(bo);
7126		goto cleanrestart;
7127	}
7128	drain_output(vp);
7129	BO_UNLOCK(bo);
7130}
7131
7132static int
7133cancel_pagedep(pagedep, freeblks, blkoff)
7134	struct pagedep *pagedep;
7135	struct freeblks *freeblks;
7136	int blkoff;
7137{
7138	struct jremref *jremref;
7139	struct jmvref *jmvref;
7140	struct dirrem *dirrem, *tmp;
7141	int i;
7142
7143	/*
7144	 * Copy any directory remove dependencies to the list
7145	 * to be processed after the freeblks proceeds.  If
7146	 * directory entry never made it to disk they
7147	 * can be dumped directly onto the work list.
7148	 */
7149	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
7150		/* Skip this directory removal if it is intended to remain. */
7151		if (dirrem->dm_offset < blkoff)
7152			continue;
7153		/*
7154		 * If there are any dirrems we wait for the journal write
7155		 * to complete and then restart the buf scan as the lock
7156		 * has been dropped.
7157		 */
7158		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
7159			jwait(&jremref->jr_list, MNT_WAIT);
7160			return (ERESTART);
7161		}
7162		LIST_REMOVE(dirrem, dm_next);
7163		dirrem->dm_dirinum = pagedep->pd_ino;
7164		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
7165	}
7166	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
7167		jwait(&jmvref->jm_list, MNT_WAIT);
7168		return (ERESTART);
7169	}
7170	/*
7171	 * When we're partially truncating a pagedep we just want to flush
7172	 * journal entries and return.  There can not be any adds in the
7173	 * truncated portion of the directory and newblk must remain if
7174	 * part of the block remains.
7175	 */
7176	if (blkoff != 0) {
7177		struct diradd *dap;
7178
7179		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7180			if (dap->da_offset > blkoff)
7181				panic("cancel_pagedep: diradd %p off %d > %d",
7182				    dap, dap->da_offset, blkoff);
7183		for (i = 0; i < DAHASHSZ; i++)
7184			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
7185				if (dap->da_offset > blkoff)
7186					panic("cancel_pagedep: diradd %p off %d > %d",
7187					    dap, dap->da_offset, blkoff);
7188		return (0);
7189	}
7190	/*
7191	 * There should be no directory add dependencies present
7192	 * as the directory could not be truncated until all
7193	 * children were removed.
7194	 */
7195	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
7196	    ("deallocate_dependencies: pendinghd != NULL"));
7197	for (i = 0; i < DAHASHSZ; i++)
7198		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
7199		    ("deallocate_dependencies: diraddhd != NULL"));
7200	if ((pagedep->pd_state & NEWBLOCK) != 0)
7201		free_newdirblk(pagedep->pd_newdirblk);
7202	if (free_pagedep(pagedep) == 0)
7203		panic("Failed to free pagedep %p", pagedep);
7204	return (0);
7205}
7206
7207/*
7208 * Reclaim any dependency structures from a buffer that is about to
7209 * be reallocated to a new vnode. The buffer must be locked, thus,
7210 * no I/O completion operations can occur while we are manipulating
7211 * its associated dependencies. The mutex is held so that other I/O's
7212 * associated with related dependencies do not occur.
7213 */
7214static int
7215deallocate_dependencies(bp, freeblks, off)
7216	struct buf *bp;
7217	struct freeblks *freeblks;
7218	int off;
7219{
7220	struct indirdep *indirdep;
7221	struct pagedep *pagedep;
7222	struct worklist *wk, *wkn;
7223	struct ufsmount *ump;
7224
7225	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
7226		goto done;
7227	ump = VFSTOUFS(wk->wk_mp);
7228	ACQUIRE_LOCK(ump);
7229	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7230		switch (wk->wk_type) {
7231		case D_INDIRDEP:
7232			indirdep = WK_INDIRDEP(wk);
7233			if (bp->b_lblkno >= 0 ||
7234			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7235				panic("deallocate_dependencies: not indir");
7236			cancel_indirdep(indirdep, bp, freeblks);
7237			continue;
7238
7239		case D_PAGEDEP:
7240			pagedep = WK_PAGEDEP(wk);
7241			if (cancel_pagedep(pagedep, freeblks, off)) {
7242				FREE_LOCK(ump);
7243				return (ERESTART);
7244			}
7245			continue;
7246
7247		case D_ALLOCINDIR:
7248			/*
7249			 * Simply remove the allocindir, we'll find it via
7250			 * the indirdep where we can clear pointers if
7251			 * needed.
7252			 */
7253			WORKLIST_REMOVE(wk);
7254			continue;
7255
7256		case D_FREEWORK:
7257			/*
7258			 * A truncation is waiting for the zero'd pointers
7259			 * to be written.  It can be freed when the freeblks
7260			 * is journaled.
7261			 */
7262			WORKLIST_REMOVE(wk);
7263			wk->wk_state |= ONDEPLIST;
7264			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7265			break;
7266
7267		case D_ALLOCDIRECT:
7268			if (off != 0)
7269				continue;
7270			/* FALLTHROUGH */
7271		default:
7272			panic("deallocate_dependencies: Unexpected type %s",
7273			    TYPENAME(wk->wk_type));
7274			/* NOTREACHED */
7275		}
7276	}
7277	FREE_LOCK(ump);
7278done:
7279	/*
7280	 * Don't throw away this buf, we were partially truncating and
7281	 * some deps may always remain.
7282	 */
7283	if (off) {
7284		allocbuf(bp, off);
7285		bp->b_vflags |= BV_SCANNED;
7286		return (EBUSY);
7287	}
7288	bp->b_flags |= B_INVAL | B_NOCACHE;
7289
7290	return (0);
7291}
7292
7293/*
7294 * An allocdirect is being canceled due to a truncate.  We must make sure
7295 * the journal entry is released in concert with the blkfree that releases
7296 * the storage.  Completed journal entries must not be released until the
7297 * space is no longer pointed to by the inode or in the bitmap.
7298 */
7299static void
7300cancel_allocdirect(adphead, adp, freeblks)
7301	struct allocdirectlst *adphead;
7302	struct allocdirect *adp;
7303	struct freeblks *freeblks;
7304{
7305	struct freework *freework;
7306	struct newblk *newblk;
7307	struct worklist *wk;
7308
7309	TAILQ_REMOVE(adphead, adp, ad_next);
7310	newblk = (struct newblk *)adp;
7311	freework = NULL;
7312	/*
7313	 * Find the correct freework structure.
7314	 */
7315	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7316		if (wk->wk_type != D_FREEWORK)
7317			continue;
7318		freework = WK_FREEWORK(wk);
7319		if (freework->fw_blkno == newblk->nb_newblkno)
7320			break;
7321	}
7322	if (freework == NULL)
7323		panic("cancel_allocdirect: Freework not found");
7324	/*
7325	 * If a newblk exists at all we still have the journal entry that
7326	 * initiated the allocation so we do not need to journal the free.
7327	 */
7328	cancel_jfreeblk(freeblks, freework->fw_blkno);
7329	/*
7330	 * If the journal hasn't been written the jnewblk must be passed
7331	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
7332	 * this by linking the journal dependency into the freework to be
7333	 * freed when freework_freeblock() is called.  If the journal has
7334	 * been written we can simply reclaim the journal space when the
7335	 * freeblks work is complete.
7336	 */
7337	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7338	    &freeblks->fb_jwork);
7339	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7340}
7341
7342
7343/*
7344 * Cancel a new block allocation.  May be an indirect or direct block.  We
7345 * remove it from various lists and return any journal record that needs to
7346 * be resolved by the caller.
7347 *
7348 * A special consideration is made for indirects which were never pointed
7349 * at on disk and will never be found once this block is released.
7350 */
7351static struct jnewblk *
7352cancel_newblk(newblk, wk, wkhd)
7353	struct newblk *newblk;
7354	struct worklist *wk;
7355	struct workhead *wkhd;
7356{
7357	struct jnewblk *jnewblk;
7358
7359	CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7360
7361	newblk->nb_state |= GOINGAWAY;
7362	/*
7363	 * Previously we traversed the completedhd on each indirdep
7364	 * attached to this newblk to cancel them and gather journal
7365	 * work.  Since we need only the oldest journal segment and
7366	 * the lowest point on the tree will always have the oldest
7367	 * journal segment we are free to release the segments
7368	 * of any subordinates and may leave the indirdep list to
7369	 * indirdep_complete() when this newblk is freed.
7370	 */
7371	if (newblk->nb_state & ONDEPLIST) {
7372		newblk->nb_state &= ~ONDEPLIST;
7373		LIST_REMOVE(newblk, nb_deps);
7374	}
7375	if (newblk->nb_state & ONWORKLIST)
7376		WORKLIST_REMOVE(&newblk->nb_list);
7377	/*
7378	 * If the journal entry hasn't been written we save a pointer to
7379	 * the dependency that frees it until it is written or the
7380	 * superseding operation completes.
7381	 */
7382	jnewblk = newblk->nb_jnewblk;
7383	if (jnewblk != NULL && wk != NULL) {
7384		newblk->nb_jnewblk = NULL;
7385		jnewblk->jn_dep = wk;
7386	}
7387	if (!LIST_EMPTY(&newblk->nb_jwork))
7388		jwork_move(wkhd, &newblk->nb_jwork);
7389	/*
7390	 * When truncating we must free the newdirblk early to remove
7391	 * the pagedep from the hash before returning.
7392	 */
7393	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7394		free_newdirblk(WK_NEWDIRBLK(wk));
7395	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7396		panic("cancel_newblk: extra newdirblk");
7397
7398	return (jnewblk);
7399}
7400
7401/*
7402 * Schedule the freefrag associated with a newblk to be released once
7403 * the pointers are written and the previous block is no longer needed.
7404 */
7405static void
7406newblk_freefrag(newblk)
7407	struct newblk *newblk;
7408{
7409	struct freefrag *freefrag;
7410
7411	if (newblk->nb_freefrag == NULL)
7412		return;
7413	freefrag = newblk->nb_freefrag;
7414	newblk->nb_freefrag = NULL;
7415	freefrag->ff_state |= COMPLETE;
7416	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7417		add_to_worklist(&freefrag->ff_list, 0);
7418}
7419
7420/*
7421 * Free a newblk. Generate a new freefrag work request if appropriate.
7422 * This must be called after the inode pointer and any direct block pointers
7423 * are valid or fully removed via truncate or frag extension.
7424 */
7425static void
7426free_newblk(newblk)
7427	struct newblk *newblk;
7428{
7429	struct indirdep *indirdep;
7430	struct worklist *wk;
7431
7432	KASSERT(newblk->nb_jnewblk == NULL,
7433	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7434	KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7435	    ("free_newblk: unclaimed newblk"));
7436	LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
7437	newblk_freefrag(newblk);
7438	if (newblk->nb_state & ONDEPLIST)
7439		LIST_REMOVE(newblk, nb_deps);
7440	if (newblk->nb_state & ONWORKLIST)
7441		WORKLIST_REMOVE(&newblk->nb_list);
7442	LIST_REMOVE(newblk, nb_hash);
7443	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7444		free_newdirblk(WK_NEWDIRBLK(wk));
7445	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7446		panic("free_newblk: extra newdirblk");
7447	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7448		indirdep_complete(indirdep);
7449	handle_jwork(&newblk->nb_jwork);
7450	WORKITEM_FREE(newblk, D_NEWBLK);
7451}
7452
7453/*
7454 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7455 * This routine must be called with splbio interrupts blocked.
7456 */
7457static void
7458free_newdirblk(newdirblk)
7459	struct newdirblk *newdirblk;
7460{
7461	struct pagedep *pagedep;
7462	struct diradd *dap;
7463	struct worklist *wk;
7464
7465	LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
7466	WORKLIST_REMOVE(&newdirblk->db_list);
7467	/*
7468	 * If the pagedep is still linked onto the directory buffer
7469	 * dependency chain, then some of the entries on the
7470	 * pd_pendinghd list may not be committed to disk yet. In
7471	 * this case, we will simply clear the NEWBLOCK flag and
7472	 * let the pd_pendinghd list be processed when the pagedep
7473	 * is next written. If the pagedep is no longer on the buffer
7474	 * dependency chain, then all the entries on the pd_pending
7475	 * list are committed to disk and we can free them here.
7476	 */
7477	pagedep = newdirblk->db_pagedep;
7478	pagedep->pd_state &= ~NEWBLOCK;
7479	if ((pagedep->pd_state & ONWORKLIST) == 0) {
7480		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7481			free_diradd(dap, NULL);
7482		/*
7483		 * If no dependencies remain, the pagedep will be freed.
7484		 */
7485		free_pagedep(pagedep);
7486	}
7487	/* Should only ever be one item in the list. */
7488	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7489		WORKLIST_REMOVE(wk);
7490		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7491	}
7492	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7493}
7494
7495/*
7496 * Prepare an inode to be freed. The actual free operation is not
7497 * done until the zero'ed inode has been written to disk.
7498 */
7499void
7500softdep_freefile(pvp, ino, mode)
7501	struct vnode *pvp;
7502	ino_t ino;
7503	int mode;
7504{
7505	struct inode *ip = VTOI(pvp);
7506	struct inodedep *inodedep;
7507	struct freefile *freefile;
7508	struct freeblks *freeblks;
7509	struct ufsmount *ump;
7510
7511	ump = ip->i_ump;
7512	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7513	    ("softdep_freefile called on non-softdep filesystem"));
7514	/*
7515	 * This sets up the inode de-allocation dependency.
7516	 */
7517	freefile = malloc(sizeof(struct freefile),
7518		M_FREEFILE, M_SOFTDEP_FLAGS);
7519	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7520	freefile->fx_mode = mode;
7521	freefile->fx_oldinum = ino;
7522	freefile->fx_devvp = ip->i_devvp;
7523	LIST_INIT(&freefile->fx_jwork);
7524	UFS_LOCK(ump);
7525	ip->i_fs->fs_pendinginodes += 1;
7526	UFS_UNLOCK(ump);
7527
7528	/*
7529	 * If the inodedep does not exist, then the zero'ed inode has
7530	 * been written to disk. If the allocated inode has never been
7531	 * written to disk, then the on-disk inode is zero'ed. In either
7532	 * case we can free the file immediately.  If the journal was
7533	 * canceled before being written the inode will never make it to
7534	 * disk and we must send the canceled journal entrys to
7535	 * ffs_freefile() to be cleared in conjunction with the bitmap.
7536	 * Any blocks waiting on the inode to write can be safely freed
7537	 * here as it will never been written.
7538	 */
7539	ACQUIRE_LOCK(ump);
7540	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7541	if (inodedep) {
7542		/*
7543		 * Clear out freeblks that no longer need to reference
7544		 * this inode.
7545		 */
7546		while ((freeblks =
7547		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7548			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7549			    fb_next);
7550			freeblks->fb_state &= ~ONDEPLIST;
7551		}
7552		/*
7553		 * Remove this inode from the unlinked list.
7554		 */
7555		if (inodedep->id_state & UNLINKED) {
7556			/*
7557			 * Save the journal work to be freed with the bitmap
7558			 * before we clear UNLINKED.  Otherwise it can be lost
7559			 * if the inode block is written.
7560			 */
7561			handle_bufwait(inodedep, &freefile->fx_jwork);
7562			clear_unlinked_inodedep(inodedep);
7563			/*
7564			 * Re-acquire inodedep as we've dropped the
7565			 * per-filesystem lock in clear_unlinked_inodedep().
7566			 */
7567			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7568		}
7569	}
7570	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7571		FREE_LOCK(ump);
7572		handle_workitem_freefile(freefile);
7573		return;
7574	}
7575	if ((inodedep->id_state & DEPCOMPLETE) == 0)
7576		inodedep->id_state |= GOINGAWAY;
7577	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7578	FREE_LOCK(ump);
7579	if (ip->i_number == ino)
7580		ip->i_flag |= IN_MODIFIED;
7581}
7582
7583/*
7584 * Check to see if an inode has never been written to disk. If
7585 * so free the inodedep and return success, otherwise return failure.
7586 * This routine must be called with splbio interrupts blocked.
7587 *
7588 * If we still have a bitmap dependency, then the inode has never
7589 * been written to disk. Drop the dependency as it is no longer
7590 * necessary since the inode is being deallocated. We set the
7591 * ALLCOMPLETE flags since the bitmap now properly shows that the
7592 * inode is not allocated. Even if the inode is actively being
7593 * written, it has been rolled back to its zero'ed state, so we
7594 * are ensured that a zero inode is what is on the disk. For short
7595 * lived files, this change will usually result in removing all the
7596 * dependencies from the inode so that it can be freed immediately.
7597 */
7598static int
7599check_inode_unwritten(inodedep)
7600	struct inodedep *inodedep;
7601{
7602
7603	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7604
7605	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
7606	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7607	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7608	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7609	    !LIST_EMPTY(&inodedep->id_inowait) ||
7610	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7611	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7612	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7613	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7614	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7615	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7616	    inodedep->id_mkdiradd != NULL ||
7617	    inodedep->id_nlinkdelta != 0)
7618		return (0);
7619	/*
7620	 * Another process might be in initiate_write_inodeblock_ufs[12]
7621	 * trying to allocate memory without holding "Softdep Lock".
7622	 */
7623	if ((inodedep->id_state & IOSTARTED) != 0 &&
7624	    inodedep->id_savedino1 == NULL)
7625		return (0);
7626
7627	if (inodedep->id_state & ONDEPLIST)
7628		LIST_REMOVE(inodedep, id_deps);
7629	inodedep->id_state &= ~ONDEPLIST;
7630	inodedep->id_state |= ALLCOMPLETE;
7631	inodedep->id_bmsafemap = NULL;
7632	if (inodedep->id_state & ONWORKLIST)
7633		WORKLIST_REMOVE(&inodedep->id_list);
7634	if (inodedep->id_savedino1 != NULL) {
7635		free(inodedep->id_savedino1, M_SAVEDINO);
7636		inodedep->id_savedino1 = NULL;
7637	}
7638	if (free_inodedep(inodedep) == 0)
7639		panic("check_inode_unwritten: busy inode");
7640	return (1);
7641}
7642
7643static int
7644check_inodedep_free(inodedep)
7645	struct inodedep *inodedep;
7646{
7647
7648	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7649	if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
7650	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
7651	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
7652	    !LIST_EMPTY(&inodedep->id_bufwait) ||
7653	    !LIST_EMPTY(&inodedep->id_inowait) ||
7654	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
7655	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
7656	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
7657	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
7658	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
7659	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
7660	    inodedep->id_mkdiradd != NULL ||
7661	    inodedep->id_nlinkdelta != 0 ||
7662	    inodedep->id_savedino1 != NULL)
7663		return (0);
7664	return (1);
7665}
7666
7667/*
7668 * Try to free an inodedep structure. Return 1 if it could be freed.
7669 */
7670static int
7671free_inodedep(inodedep)
7672	struct inodedep *inodedep;
7673{
7674
7675	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
7676	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
7677	    !check_inodedep_free(inodedep))
7678		return (0);
7679	if (inodedep->id_state & ONDEPLIST)
7680		LIST_REMOVE(inodedep, id_deps);
7681	LIST_REMOVE(inodedep, id_hash);
7682	WORKITEM_FREE(inodedep, D_INODEDEP);
7683	return (1);
7684}
7685
7686/*
7687 * Free the block referenced by a freework structure.  The parent freeblks
7688 * structure is released and completed when the final cg bitmap reaches
7689 * the disk.  This routine may be freeing a jnewblk which never made it to
7690 * disk in which case we do not have to wait as the operation is undone
7691 * in memory immediately.
7692 */
7693static void
7694freework_freeblock(freework)
7695	struct freework *freework;
7696{
7697	struct freeblks *freeblks;
7698	struct jnewblk *jnewblk;
7699	struct ufsmount *ump;
7700	struct workhead wkhd;
7701	struct fs *fs;
7702	int bsize;
7703	int needj;
7704
7705	ump = VFSTOUFS(freework->fw_list.wk_mp);
7706	LOCK_OWNED(ump);
7707	/*
7708	 * Handle partial truncate separately.
7709	 */
7710	if (freework->fw_indir) {
7711		complete_trunc_indir(freework);
7712		return;
7713	}
7714	freeblks = freework->fw_freeblks;
7715	fs = ump->um_fs;
7716	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
7717	bsize = lfragtosize(fs, freework->fw_frags);
7718	LIST_INIT(&wkhd);
7719	/*
7720	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
7721	 * on the indirblk hashtable and prevents premature freeing.
7722	 */
7723	freework->fw_state |= DEPCOMPLETE;
7724	/*
7725	 * SUJ needs to wait for the segment referencing freed indirect
7726	 * blocks to expire so that we know the checker will not confuse
7727	 * a re-allocated indirect block with its old contents.
7728	 */
7729	if (needj && freework->fw_lbn <= -NDADDR)
7730		indirblk_insert(freework);
7731	/*
7732	 * If we are canceling an existing jnewblk pass it to the free
7733	 * routine, otherwise pass the freeblk which will ultimately
7734	 * release the freeblks.  If we're not journaling, we can just
7735	 * free the freeblks immediately.
7736	 */
7737	jnewblk = freework->fw_jnewblk;
7738	if (jnewblk != NULL) {
7739		cancel_jnewblk(jnewblk, &wkhd);
7740		needj = 0;
7741	} else if (needj) {
7742		freework->fw_state |= DELAYEDFREE;
7743		freeblks->fb_cgwait++;
7744		WORKLIST_INSERT(&wkhd, &freework->fw_list);
7745	}
7746	FREE_LOCK(ump);
7747	freeblks_free(ump, freeblks, btodb(bsize));
7748	CTR4(KTR_SUJ,
7749	    "freework_freeblock: ino %d blkno %jd lbn %jd size %ld",
7750	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
7751	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
7752	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd);
7753	ACQUIRE_LOCK(ump);
7754	/*
7755	 * The jnewblk will be discarded and the bits in the map never
7756	 * made it to disk.  We can immediately free the freeblk.
7757	 */
7758	if (needj == 0)
7759		handle_written_freework(freework);
7760}
7761
7762/*
7763 * We enqueue freework items that need processing back on the freeblks and
7764 * add the freeblks to the worklist.  This makes it easier to find all work
7765 * required to flush a truncation in process_truncates().
7766 */
7767static void
7768freework_enqueue(freework)
7769	struct freework *freework;
7770{
7771	struct freeblks *freeblks;
7772
7773	freeblks = freework->fw_freeblks;
7774	if ((freework->fw_state & INPROGRESS) == 0)
7775		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
7776	if ((freeblks->fb_state &
7777	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
7778	    LIST_EMPTY(&freeblks->fb_jblkdephd))
7779		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7780}
7781
7782/*
7783 * Start, continue, or finish the process of freeing an indirect block tree.
7784 * The free operation may be paused at any point with fw_off containing the
7785 * offset to restart from.  This enables us to implement some flow control
7786 * for large truncates which may fan out and generate a huge number of
7787 * dependencies.
7788 */
7789static void
7790handle_workitem_indirblk(freework)
7791	struct freework *freework;
7792{
7793	struct freeblks *freeblks;
7794	struct ufsmount *ump;
7795	struct fs *fs;
7796
7797	freeblks = freework->fw_freeblks;
7798	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7799	fs = ump->um_fs;
7800	if (freework->fw_state & DEPCOMPLETE) {
7801		handle_written_freework(freework);
7802		return;
7803	}
7804	if (freework->fw_off == NINDIR(fs)) {
7805		freework_freeblock(freework);
7806		return;
7807	}
7808	freework->fw_state |= INPROGRESS;
7809	FREE_LOCK(ump);
7810	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
7811	    freework->fw_lbn);
7812	ACQUIRE_LOCK(ump);
7813}
7814
7815/*
7816 * Called when a freework structure attached to a cg buf is written.  The
7817 * ref on either the parent or the freeblks structure is released and
7818 * the freeblks is added back to the worklist if there is more work to do.
7819 */
7820static void
7821handle_written_freework(freework)
7822	struct freework *freework;
7823{
7824	struct freeblks *freeblks;
7825	struct freework *parent;
7826
7827	freeblks = freework->fw_freeblks;
7828	parent = freework->fw_parent;
7829	if (freework->fw_state & DELAYEDFREE)
7830		freeblks->fb_cgwait--;
7831	freework->fw_state |= COMPLETE;
7832	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
7833		WORKITEM_FREE(freework, D_FREEWORK);
7834	if (parent) {
7835		if (--parent->fw_ref == 0)
7836			freework_enqueue(parent);
7837		return;
7838	}
7839	if (--freeblks->fb_ref != 0)
7840		return;
7841	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
7842	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
7843		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
7844}
7845
7846/*
7847 * This workitem routine performs the block de-allocation.
7848 * The workitem is added to the pending list after the updated
7849 * inode block has been written to disk.  As mentioned above,
7850 * checks regarding the number of blocks de-allocated (compared
7851 * to the number of blocks allocated for the file) are also
7852 * performed in this function.
7853 */
7854static int
7855handle_workitem_freeblocks(freeblks, flags)
7856	struct freeblks *freeblks;
7857	int flags;
7858{
7859	struct freework *freework;
7860	struct newblk *newblk;
7861	struct allocindir *aip;
7862	struct ufsmount *ump;
7863	struct worklist *wk;
7864
7865	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
7866	    ("handle_workitem_freeblocks: Journal entries not written."));
7867	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7868	ACQUIRE_LOCK(ump);
7869	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
7870		WORKLIST_REMOVE(wk);
7871		switch (wk->wk_type) {
7872		case D_DIRREM:
7873			wk->wk_state |= COMPLETE;
7874			add_to_worklist(wk, 0);
7875			continue;
7876
7877		case D_ALLOCDIRECT:
7878			free_newblk(WK_NEWBLK(wk));
7879			continue;
7880
7881		case D_ALLOCINDIR:
7882			aip = WK_ALLOCINDIR(wk);
7883			freework = NULL;
7884			if (aip->ai_state & DELAYEDFREE) {
7885				FREE_LOCK(ump);
7886				freework = newfreework(ump, freeblks, NULL,
7887				    aip->ai_lbn, aip->ai_newblkno,
7888				    ump->um_fs->fs_frag, 0, 0);
7889				ACQUIRE_LOCK(ump);
7890			}
7891			newblk = WK_NEWBLK(wk);
7892			if (newblk->nb_jnewblk) {
7893				freework->fw_jnewblk = newblk->nb_jnewblk;
7894				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
7895				newblk->nb_jnewblk = NULL;
7896			}
7897			free_newblk(newblk);
7898			continue;
7899
7900		case D_FREEWORK:
7901			freework = WK_FREEWORK(wk);
7902			if (freework->fw_lbn <= -NDADDR)
7903				handle_workitem_indirblk(freework);
7904			else
7905				freework_freeblock(freework);
7906			continue;
7907		default:
7908			panic("handle_workitem_freeblocks: Unknown type %s",
7909			    TYPENAME(wk->wk_type));
7910		}
7911	}
7912	if (freeblks->fb_ref != 0) {
7913		freeblks->fb_state &= ~INPROGRESS;
7914		wake_worklist(&freeblks->fb_list);
7915		freeblks = NULL;
7916	}
7917	FREE_LOCK(ump);
7918	if (freeblks)
7919		return handle_complete_freeblocks(freeblks, flags);
7920	return (0);
7921}
7922
7923/*
7924 * Handle completion of block free via truncate.  This allows fs_pending
7925 * to track the actual free block count more closely than if we only updated
7926 * it at the end.  We must be careful to handle cases where the block count
7927 * on free was incorrect.
7928 */
7929static void
7930freeblks_free(ump, freeblks, blocks)
7931	struct ufsmount *ump;
7932	struct freeblks *freeblks;
7933	int blocks;
7934{
7935	struct fs *fs;
7936	ufs2_daddr_t remain;
7937
7938	UFS_LOCK(ump);
7939	remain = -freeblks->fb_chkcnt;
7940	freeblks->fb_chkcnt += blocks;
7941	if (remain > 0) {
7942		if (remain < blocks)
7943			blocks = remain;
7944		fs = ump->um_fs;
7945		fs->fs_pendingblocks -= blocks;
7946	}
7947	UFS_UNLOCK(ump);
7948}
7949
7950/*
7951 * Once all of the freework workitems are complete we can retire the
7952 * freeblocks dependency and any journal work awaiting completion.  This
7953 * can not be called until all other dependencies are stable on disk.
7954 */
7955static int
7956handle_complete_freeblocks(freeblks, flags)
7957	struct freeblks *freeblks;
7958	int flags;
7959{
7960	struct inodedep *inodedep;
7961	struct inode *ip;
7962	struct vnode *vp;
7963	struct fs *fs;
7964	struct ufsmount *ump;
7965	ufs2_daddr_t spare;
7966
7967	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
7968	fs = ump->um_fs;
7969	flags = LK_EXCLUSIVE | flags;
7970	spare = freeblks->fb_chkcnt;
7971
7972	/*
7973	 * If we did not release the expected number of blocks we may have
7974	 * to adjust the inode block count here.  Only do so if it wasn't
7975	 * a truncation to zero and the modrev still matches.
7976	 */
7977	if (spare && freeblks->fb_len != 0) {
7978		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
7979		    flags, &vp, FFSV_FORCEINSMQ) != 0)
7980			return (EBUSY);
7981		ip = VTOI(vp);
7982		if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
7983			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
7984			ip->i_flag |= IN_CHANGE;
7985			/*
7986			 * We must wait so this happens before the
7987			 * journal is reclaimed.
7988			 */
7989			ffs_update(vp, 1);
7990		}
7991		vput(vp);
7992	}
7993	if (spare < 0) {
7994		UFS_LOCK(ump);
7995		fs->fs_pendingblocks += spare;
7996		UFS_UNLOCK(ump);
7997	}
7998#ifdef QUOTA
7999	/* Handle spare. */
8000	if (spare)
8001		quotaadj(freeblks->fb_quota, ump, -spare);
8002	quotarele(freeblks->fb_quota);
8003#endif
8004	ACQUIRE_LOCK(ump);
8005	if (freeblks->fb_state & ONDEPLIST) {
8006		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8007		    0, &inodedep);
8008		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
8009		freeblks->fb_state &= ~ONDEPLIST;
8010		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
8011			free_inodedep(inodedep);
8012	}
8013	/*
8014	 * All of the freeblock deps must be complete prior to this call
8015	 * so it's now safe to complete earlier outstanding journal entries.
8016	 */
8017	handle_jwork(&freeblks->fb_jwork);
8018	WORKITEM_FREE(freeblks, D_FREEBLKS);
8019	FREE_LOCK(ump);
8020	return (0);
8021}
8022
8023/*
8024 * Release blocks associated with the freeblks and stored in the indirect
8025 * block dbn. If level is greater than SINGLE, the block is an indirect block
8026 * and recursive calls to indirtrunc must be used to cleanse other indirect
8027 * blocks.
8028 *
8029 * This handles partial and complete truncation of blocks.  Partial is noted
8030 * with goingaway == 0.  In this case the freework is completed after the
8031 * zero'd indirects are written to disk.  For full truncation the freework
8032 * is completed after the block is freed.
8033 */
8034static void
8035indir_trunc(freework, dbn, lbn)
8036	struct freework *freework;
8037	ufs2_daddr_t dbn;
8038	ufs_lbn_t lbn;
8039{
8040	struct freework *nfreework;
8041	struct workhead wkhd;
8042	struct freeblks *freeblks;
8043	struct buf *bp;
8044	struct fs *fs;
8045	struct indirdep *indirdep;
8046	struct ufsmount *ump;
8047	ufs1_daddr_t *bap1;
8048	ufs2_daddr_t nb, nnb, *bap2;
8049	ufs_lbn_t lbnadd, nlbn;
8050	int i, nblocks, ufs1fmt;
8051	int freedblocks;
8052	int goingaway;
8053	int freedeps;
8054	int needj;
8055	int level;
8056	int cnt;
8057
8058	freeblks = freework->fw_freeblks;
8059	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8060	fs = ump->um_fs;
8061	/*
8062	 * Get buffer of block pointers to be freed.  There are three cases:
8063	 *
8064	 * 1) Partial truncate caches the indirdep pointer in the freework
8065	 *    which provides us a back copy to the save bp which holds the
8066	 *    pointers we want to clear.  When this completes the zero
8067	 *    pointers are written to the real copy.
8068	 * 2) The indirect is being completely truncated, cancel_indirdep()
8069	 *    eliminated the real copy and placed the indirdep on the saved
8070	 *    copy.  The indirdep and buf are discarded when this completes.
8071	 * 3) The indirect was not in memory, we read a copy off of the disk
8072	 *    using the devvp and drop and invalidate the buffer when we're
8073	 *    done.
8074	 */
8075	goingaway = 1;
8076	indirdep = NULL;
8077	if (freework->fw_indir != NULL) {
8078		goingaway = 0;
8079		indirdep = freework->fw_indir;
8080		bp = indirdep->ir_savebp;
8081		if (bp == NULL || bp->b_blkno != dbn)
8082			panic("indir_trunc: Bad saved buf %p blkno %jd",
8083			    bp, (intmax_t)dbn);
8084	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
8085		/*
8086		 * The lock prevents the buf dep list from changing and
8087	 	 * indirects on devvp should only ever have one dependency.
8088		 */
8089		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
8090		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
8091			panic("indir_trunc: Bad indirdep %p from buf %p",
8092			    indirdep, bp);
8093	} else if (bread(freeblks->fb_devvp, dbn, (int)fs->fs_bsize,
8094	    NOCRED, &bp) != 0) {
8095		brelse(bp);
8096		return;
8097	}
8098	ACQUIRE_LOCK(ump);
8099	/* Protects against a race with complete_trunc_indir(). */
8100	freework->fw_state &= ~INPROGRESS;
8101	/*
8102	 * If we have an indirdep we need to enforce the truncation order
8103	 * and discard it when it is complete.
8104	 */
8105	if (indirdep) {
8106		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
8107		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
8108			/*
8109			 * Add the complete truncate to the list on the
8110			 * indirdep to enforce in-order processing.
8111			 */
8112			if (freework->fw_indir == NULL)
8113				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
8114				    freework, fw_next);
8115			FREE_LOCK(ump);
8116			return;
8117		}
8118		/*
8119		 * If we're goingaway, free the indirdep.  Otherwise it will
8120		 * linger until the write completes.
8121		 */
8122		if (goingaway)
8123			free_indirdep(indirdep);
8124	}
8125	FREE_LOCK(ump);
8126	/* Initialize pointers depending on block size. */
8127	if (ump->um_fstype == UFS1) {
8128		bap1 = (ufs1_daddr_t *)bp->b_data;
8129		nb = bap1[freework->fw_off];
8130		ufs1fmt = 1;
8131		bap2 = NULL;
8132	} else {
8133		bap2 = (ufs2_daddr_t *)bp->b_data;
8134		nb = bap2[freework->fw_off];
8135		ufs1fmt = 0;
8136		bap1 = NULL;
8137	}
8138	level = lbn_level(lbn);
8139	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
8140	lbnadd = lbn_offset(fs, level);
8141	nblocks = btodb(fs->fs_bsize);
8142	nfreework = freework;
8143	freedeps = 0;
8144	cnt = 0;
8145	/*
8146	 * Reclaim blocks.  Traverses into nested indirect levels and
8147	 * arranges for the current level to be freed when subordinates
8148	 * are free when journaling.
8149	 */
8150	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
8151		if (i != NINDIR(fs) - 1) {
8152			if (ufs1fmt)
8153				nnb = bap1[i+1];
8154			else
8155				nnb = bap2[i+1];
8156		} else
8157			nnb = 0;
8158		if (nb == 0)
8159			continue;
8160		cnt++;
8161		if (level != 0) {
8162			nlbn = (lbn + 1) - (i * lbnadd);
8163			if (needj != 0) {
8164				nfreework = newfreework(ump, freeblks, freework,
8165				    nlbn, nb, fs->fs_frag, 0, 0);
8166				freedeps++;
8167			}
8168			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
8169		} else {
8170			struct freedep *freedep;
8171
8172			/*
8173			 * Attempt to aggregate freedep dependencies for
8174			 * all blocks being released to the same CG.
8175			 */
8176			LIST_INIT(&wkhd);
8177			if (needj != 0 &&
8178			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
8179				freedep = newfreedep(freework);
8180				WORKLIST_INSERT_UNLOCKED(&wkhd,
8181				    &freedep->fd_list);
8182				freedeps++;
8183			}
8184			CTR3(KTR_SUJ,
8185			    "indir_trunc: ino %d blkno %jd size %ld",
8186			    freeblks->fb_inum, nb, fs->fs_bsize);
8187			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
8188			    fs->fs_bsize, freeblks->fb_inum,
8189			    freeblks->fb_vtype, &wkhd);
8190		}
8191	}
8192	if (goingaway) {
8193		bp->b_flags |= B_INVAL | B_NOCACHE;
8194		brelse(bp);
8195	}
8196	freedblocks = 0;
8197	if (level == 0)
8198		freedblocks = (nblocks * cnt);
8199	if (needj == 0)
8200		freedblocks += nblocks;
8201	freeblks_free(ump, freeblks, freedblocks);
8202	/*
8203	 * If we are journaling set up the ref counts and offset so this
8204	 * indirect can be completed when its children are free.
8205	 */
8206	if (needj) {
8207		ACQUIRE_LOCK(ump);
8208		freework->fw_off = i;
8209		freework->fw_ref += freedeps;
8210		freework->fw_ref -= NINDIR(fs) + 1;
8211		if (level == 0)
8212			freeblks->fb_cgwait += freedeps;
8213		if (freework->fw_ref == 0)
8214			freework_freeblock(freework);
8215		FREE_LOCK(ump);
8216		return;
8217	}
8218	/*
8219	 * If we're not journaling we can free the indirect now.
8220	 */
8221	dbn = dbtofsb(fs, dbn);
8222	CTR3(KTR_SUJ,
8223	    "indir_trunc 2: ino %d blkno %jd size %ld",
8224	    freeblks->fb_inum, dbn, fs->fs_bsize);
8225	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
8226	    freeblks->fb_inum, freeblks->fb_vtype, NULL);
8227	/* Non SUJ softdep does single-threaded truncations. */
8228	if (freework->fw_blkno == dbn) {
8229		freework->fw_state |= ALLCOMPLETE;
8230		ACQUIRE_LOCK(ump);
8231		handle_written_freework(freework);
8232		FREE_LOCK(ump);
8233	}
8234	return;
8235}
8236
8237/*
8238 * Cancel an allocindir when it is removed via truncation.  When bp is not
8239 * NULL the indirect never appeared on disk and is scheduled to be freed
8240 * independently of the indir so we can more easily track journal work.
8241 */
8242static void
8243cancel_allocindir(aip, bp, freeblks, trunc)
8244	struct allocindir *aip;
8245	struct buf *bp;
8246	struct freeblks *freeblks;
8247	int trunc;
8248{
8249	struct indirdep *indirdep;
8250	struct freefrag *freefrag;
8251	struct newblk *newblk;
8252
8253	newblk = (struct newblk *)aip;
8254	LIST_REMOVE(aip, ai_next);
8255	/*
8256	 * We must eliminate the pointer in bp if it must be freed on its
8257	 * own due to partial truncate or pending journal work.
8258	 */
8259	if (bp && (trunc || newblk->nb_jnewblk)) {
8260		/*
8261		 * Clear the pointer and mark the aip to be freed
8262		 * directly if it never existed on disk.
8263		 */
8264		aip->ai_state |= DELAYEDFREE;
8265		indirdep = aip->ai_indirdep;
8266		if (indirdep->ir_state & UFS1FMT)
8267			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8268		else
8269			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8270	}
8271	/*
8272	 * When truncating the previous pointer will be freed via
8273	 * savedbp.  Eliminate the freefrag which would dup free.
8274	 */
8275	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8276		newblk->nb_freefrag = NULL;
8277		if (freefrag->ff_jdep)
8278			cancel_jfreefrag(
8279			    WK_JFREEFRAG(freefrag->ff_jdep));
8280		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8281		WORKITEM_FREE(freefrag, D_FREEFRAG);
8282	}
8283	/*
8284	 * If the journal hasn't been written the jnewblk must be passed
8285	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
8286	 * this by leaving the journal dependency on the newblk to be freed
8287	 * when a freework is created in handle_workitem_freeblocks().
8288	 */
8289	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8290	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8291}
8292
8293/*
8294 * Create the mkdir dependencies for . and .. in a new directory.  Link them
8295 * in to a newdirblk so any subsequent additions are tracked properly.  The
8296 * caller is responsible for adding the mkdir1 dependency to the journal
8297 * and updating id_mkdiradd.  This function returns with the per-filesystem
8298 * lock held.
8299 */
8300static struct mkdir *
8301setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
8302	struct diradd *dap;
8303	ino_t newinum;
8304	ino_t dinum;
8305	struct buf *newdirbp;
8306	struct mkdir **mkdirp;
8307{
8308	struct newblk *newblk;
8309	struct pagedep *pagedep;
8310	struct inodedep *inodedep;
8311	struct newdirblk *newdirblk;
8312	struct mkdir *mkdir1, *mkdir2;
8313	struct worklist *wk;
8314	struct jaddref *jaddref;
8315	struct ufsmount *ump;
8316	struct mount *mp;
8317
8318	mp = dap->da_list.wk_mp;
8319	ump = VFSTOUFS(mp);
8320	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8321	    M_SOFTDEP_FLAGS);
8322	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8323	LIST_INIT(&newdirblk->db_mkdir);
8324	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8325	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8326	mkdir1->md_state = ATTACHED | MKDIR_BODY;
8327	mkdir1->md_diradd = dap;
8328	mkdir1->md_jaddref = NULL;
8329	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8330	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8331	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8332	mkdir2->md_diradd = dap;
8333	mkdir2->md_jaddref = NULL;
8334	if (MOUNTEDSUJ(mp) == 0) {
8335		mkdir1->md_state |= DEPCOMPLETE;
8336		mkdir2->md_state |= DEPCOMPLETE;
8337	}
8338	/*
8339	 * Dependency on "." and ".." being written to disk.
8340	 */
8341	mkdir1->md_buf = newdirbp;
8342	ACQUIRE_LOCK(VFSTOUFS(mp));
8343	LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
8344	/*
8345	 * We must link the pagedep, allocdirect, and newdirblk for
8346	 * the initial file page so the pointer to the new directory
8347	 * is not written until the directory contents are live and
8348	 * any subsequent additions are not marked live until the
8349	 * block is reachable via the inode.
8350	 */
8351	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8352		panic("setup_newdir: lost pagedep");
8353	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8354		if (wk->wk_type == D_ALLOCDIRECT)
8355			break;
8356	if (wk == NULL)
8357		panic("setup_newdir: lost allocdirect");
8358	if (pagedep->pd_state & NEWBLOCK)
8359		panic("setup_newdir: NEWBLOCK already set");
8360	newblk = WK_NEWBLK(wk);
8361	pagedep->pd_state |= NEWBLOCK;
8362	pagedep->pd_newdirblk = newdirblk;
8363	newdirblk->db_pagedep = pagedep;
8364	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8365	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8366	/*
8367	 * Look up the inodedep for the parent directory so that we
8368	 * can link mkdir2 into the pending dotdot jaddref or
8369	 * the inode write if there is none.  If the inode is
8370	 * ALLCOMPLETE and no jaddref is present all dependencies have
8371	 * been satisfied and mkdir2 can be freed.
8372	 */
8373	inodedep_lookup(mp, dinum, 0, &inodedep);
8374	if (MOUNTEDSUJ(mp)) {
8375		if (inodedep == NULL)
8376			panic("setup_newdir: Lost parent.");
8377		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8378		    inoreflst);
8379		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8380		    (jaddref->ja_state & MKDIR_PARENT),
8381		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
8382		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8383		mkdir2->md_jaddref = jaddref;
8384		jaddref->ja_mkdir = mkdir2;
8385	} else if (inodedep == NULL ||
8386	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8387		dap->da_state &= ~MKDIR_PARENT;
8388		WORKITEM_FREE(mkdir2, D_MKDIR);
8389		mkdir2 = NULL;
8390	} else {
8391		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8392		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8393	}
8394	*mkdirp = mkdir2;
8395
8396	return (mkdir1);
8397}
8398
8399/*
8400 * Directory entry addition dependencies.
8401 *
8402 * When adding a new directory entry, the inode (with its incremented link
8403 * count) must be written to disk before the directory entry's pointer to it.
8404 * Also, if the inode is newly allocated, the corresponding freemap must be
8405 * updated (on disk) before the directory entry's pointer. These requirements
8406 * are met via undo/redo on the directory entry's pointer, which consists
8407 * simply of the inode number.
8408 *
8409 * As directory entries are added and deleted, the free space within a
8410 * directory block can become fragmented.  The ufs filesystem will compact
8411 * a fragmented directory block to make space for a new entry. When this
8412 * occurs, the offsets of previously added entries change. Any "diradd"
8413 * dependency structures corresponding to these entries must be updated with
8414 * the new offsets.
8415 */
8416
8417/*
8418 * This routine is called after the in-memory inode's link
8419 * count has been incremented, but before the directory entry's
8420 * pointer to the inode has been set.
8421 */
8422int
8423softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8424	struct buf *bp;		/* buffer containing directory block */
8425	struct inode *dp;	/* inode for directory */
8426	off_t diroffset;	/* offset of new entry in directory */
8427	ino_t newinum;		/* inode referenced by new directory entry */
8428	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
8429	int isnewblk;		/* entry is in a newly allocated block */
8430{
8431	int offset;		/* offset of new entry within directory block */
8432	ufs_lbn_t lbn;		/* block in directory containing new entry */
8433	struct fs *fs;
8434	struct diradd *dap;
8435	struct newblk *newblk;
8436	struct pagedep *pagedep;
8437	struct inodedep *inodedep;
8438	struct newdirblk *newdirblk;
8439	struct mkdir *mkdir1, *mkdir2;
8440	struct jaddref *jaddref;
8441	struct ufsmount *ump;
8442	struct mount *mp;
8443	int isindir;
8444
8445	ump = dp->i_ump;
8446	mp = UFSTOVFS(ump);
8447	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8448	    ("softdep_setup_directory_add called on non-softdep filesystem"));
8449	/*
8450	 * Whiteouts have no dependencies.
8451	 */
8452	if (newinum == WINO) {
8453		if (newdirbp != NULL)
8454			bdwrite(newdirbp);
8455		return (0);
8456	}
8457	jaddref = NULL;
8458	mkdir1 = mkdir2 = NULL;
8459	fs = dp->i_fs;
8460	lbn = lblkno(fs, diroffset);
8461	offset = blkoff(fs, diroffset);
8462	dap = malloc(sizeof(struct diradd), M_DIRADD,
8463		M_SOFTDEP_FLAGS|M_ZERO);
8464	workitem_alloc(&dap->da_list, D_DIRADD, mp);
8465	dap->da_offset = offset;
8466	dap->da_newinum = newinum;
8467	dap->da_state = ATTACHED;
8468	LIST_INIT(&dap->da_jwork);
8469	isindir = bp->b_lblkno >= NDADDR;
8470	newdirblk = NULL;
8471	if (isnewblk &&
8472	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8473		newdirblk = malloc(sizeof(struct newdirblk),
8474		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8475		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8476		LIST_INIT(&newdirblk->db_mkdir);
8477	}
8478	/*
8479	 * If we're creating a new directory setup the dependencies and set
8480	 * the dap state to wait for them.  Otherwise it's COMPLETE and
8481	 * we can move on.
8482	 */
8483	if (newdirbp == NULL) {
8484		dap->da_state |= DEPCOMPLETE;
8485		ACQUIRE_LOCK(ump);
8486	} else {
8487		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8488		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8489		    &mkdir2);
8490	}
8491	/*
8492	 * Link into parent directory pagedep to await its being written.
8493	 */
8494	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8495#ifdef DEBUG
8496	if (diradd_lookup(pagedep, offset) != NULL)
8497		panic("softdep_setup_directory_add: %p already at off %d\n",
8498		    diradd_lookup(pagedep, offset), offset);
8499#endif
8500	dap->da_pagedep = pagedep;
8501	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8502	    da_pdlist);
8503	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
8504	/*
8505	 * If we're journaling, link the diradd into the jaddref so it
8506	 * may be completed after the journal entry is written.  Otherwise,
8507	 * link the diradd into its inodedep.  If the inode is not yet
8508	 * written place it on the bufwait list, otherwise do the post-inode
8509	 * write processing to put it on the id_pendinghd list.
8510	 */
8511	if (MOUNTEDSUJ(mp)) {
8512		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8513		    inoreflst);
8514		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8515		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8516		jaddref->ja_diroff = diroffset;
8517		jaddref->ja_diradd = dap;
8518		add_to_journal(&jaddref->ja_list);
8519	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8520		diradd_inode_written(dap, inodedep);
8521	else
8522		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8523	/*
8524	 * Add the journal entries for . and .. links now that the primary
8525	 * link is written.
8526	 */
8527	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8528		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8529		    inoreflst, if_deps);
8530		KASSERT(jaddref != NULL &&
8531		    jaddref->ja_ino == jaddref->ja_parent &&
8532		    (jaddref->ja_state & MKDIR_BODY),
8533		    ("softdep_setup_directory_add: bad dot jaddref %p",
8534		    jaddref));
8535		mkdir1->md_jaddref = jaddref;
8536		jaddref->ja_mkdir = mkdir1;
8537		/*
8538		 * It is important that the dotdot journal entry
8539		 * is added prior to the dot entry since dot writes
8540		 * both the dot and dotdot links.  These both must
8541		 * be added after the primary link for the journal
8542		 * to remain consistent.
8543		 */
8544		add_to_journal(&mkdir2->md_jaddref->ja_list);
8545		add_to_journal(&jaddref->ja_list);
8546	}
8547	/*
8548	 * If we are adding a new directory remember this diradd so that if
8549	 * we rename it we can keep the dot and dotdot dependencies.  If
8550	 * we are adding a new name for an inode that has a mkdiradd we
8551	 * must be in rename and we have to move the dot and dotdot
8552	 * dependencies to this new name.  The old name is being orphaned
8553	 * soon.
8554	 */
8555	if (mkdir1 != NULL) {
8556		if (inodedep->id_mkdiradd != NULL)
8557			panic("softdep_setup_directory_add: Existing mkdir");
8558		inodedep->id_mkdiradd = dap;
8559	} else if (inodedep->id_mkdiradd)
8560		merge_diradd(inodedep, dap);
8561	if (newdirblk != NULL) {
8562		/*
8563		 * There is nothing to do if we are already tracking
8564		 * this block.
8565		 */
8566		if ((pagedep->pd_state & NEWBLOCK) != 0) {
8567			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8568			FREE_LOCK(ump);
8569			return (0);
8570		}
8571		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8572		    == 0)
8573			panic("softdep_setup_directory_add: lost entry");
8574		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8575		pagedep->pd_state |= NEWBLOCK;
8576		pagedep->pd_newdirblk = newdirblk;
8577		newdirblk->db_pagedep = pagedep;
8578		FREE_LOCK(ump);
8579		/*
8580		 * If we extended into an indirect signal direnter to sync.
8581		 */
8582		if (isindir)
8583			return (1);
8584		return (0);
8585	}
8586	FREE_LOCK(ump);
8587	return (0);
8588}
8589
8590/*
8591 * This procedure is called to change the offset of a directory
8592 * entry when compacting a directory block which must be owned
8593 * exclusively by the caller. Note that the actual entry movement
8594 * must be done in this procedure to ensure that no I/O completions
8595 * occur while the move is in progress.
8596 */
8597void
8598softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
8599	struct buf *bp;		/* Buffer holding directory block. */
8600	struct inode *dp;	/* inode for directory */
8601	caddr_t base;		/* address of dp->i_offset */
8602	caddr_t oldloc;		/* address of old directory location */
8603	caddr_t newloc;		/* address of new directory location */
8604	int entrysize;		/* size of directory entry */
8605{
8606	int offset, oldoffset, newoffset;
8607	struct pagedep *pagedep;
8608	struct jmvref *jmvref;
8609	struct diradd *dap;
8610	struct direct *de;
8611	struct mount *mp;
8612	ufs_lbn_t lbn;
8613	int flags;
8614
8615	mp = UFSTOVFS(dp->i_ump);
8616	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8617	    ("softdep_change_directoryentry_offset called on "
8618	     "non-softdep filesystem"));
8619	de = (struct direct *)oldloc;
8620	jmvref = NULL;
8621	flags = 0;
8622	/*
8623	 * Moves are always journaled as it would be too complex to
8624	 * determine if any affected adds or removes are present in the
8625	 * journal.
8626	 */
8627	if (MOUNTEDSUJ(mp)) {
8628		flags = DEPALLOC;
8629		jmvref = newjmvref(dp, de->d_ino,
8630		    dp->i_offset + (oldloc - base),
8631		    dp->i_offset + (newloc - base));
8632	}
8633	lbn = lblkno(dp->i_fs, dp->i_offset);
8634	offset = blkoff(dp->i_fs, dp->i_offset);
8635	oldoffset = offset + (oldloc - base);
8636	newoffset = offset + (newloc - base);
8637	ACQUIRE_LOCK(dp->i_ump);
8638	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
8639		goto done;
8640	dap = diradd_lookup(pagedep, oldoffset);
8641	if (dap) {
8642		dap->da_offset = newoffset;
8643		newoffset = DIRADDHASH(newoffset);
8644		oldoffset = DIRADDHASH(oldoffset);
8645		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
8646		    newoffset != oldoffset) {
8647			LIST_REMOVE(dap, da_pdlist);
8648			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
8649			    dap, da_pdlist);
8650		}
8651	}
8652done:
8653	if (jmvref) {
8654		jmvref->jm_pagedep = pagedep;
8655		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
8656		add_to_journal(&jmvref->jm_list);
8657	}
8658	bcopy(oldloc, newloc, entrysize);
8659	FREE_LOCK(dp->i_ump);
8660}
8661
8662/*
8663 * Move the mkdir dependencies and journal work from one diradd to another
8664 * when renaming a directory.  The new name must depend on the mkdir deps
8665 * completing as the old name did.  Directories can only have one valid link
8666 * at a time so one must be canonical.
8667 */
8668static void
8669merge_diradd(inodedep, newdap)
8670	struct inodedep *inodedep;
8671	struct diradd *newdap;
8672{
8673	struct diradd *olddap;
8674	struct mkdir *mkdir, *nextmd;
8675	struct ufsmount *ump;
8676	short state;
8677
8678	olddap = inodedep->id_mkdiradd;
8679	inodedep->id_mkdiradd = newdap;
8680	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8681		newdap->da_state &= ~DEPCOMPLETE;
8682		ump = VFSTOUFS(inodedep->id_list.wk_mp);
8683		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8684		     mkdir = nextmd) {
8685			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8686			if (mkdir->md_diradd != olddap)
8687				continue;
8688			mkdir->md_diradd = newdap;
8689			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
8690			newdap->da_state |= state;
8691			olddap->da_state &= ~state;
8692			if ((olddap->da_state &
8693			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
8694				break;
8695		}
8696		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8697			panic("merge_diradd: unfound ref");
8698	}
8699	/*
8700	 * Any mkdir related journal items are not safe to be freed until
8701	 * the new name is stable.
8702	 */
8703	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
8704	olddap->da_state |= DEPCOMPLETE;
8705	complete_diradd(olddap);
8706}
8707
8708/*
8709 * Move the diradd to the pending list when all diradd dependencies are
8710 * complete.
8711 */
8712static void
8713complete_diradd(dap)
8714	struct diradd *dap;
8715{
8716	struct pagedep *pagedep;
8717
8718	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
8719		if (dap->da_state & DIRCHG)
8720			pagedep = dap->da_previous->dm_pagedep;
8721		else
8722			pagedep = dap->da_pagedep;
8723		LIST_REMOVE(dap, da_pdlist);
8724		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
8725	}
8726}
8727
8728/*
8729 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
8730 * add entries and conditonally journal the remove.
8731 */
8732static void
8733cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
8734	struct diradd *dap;
8735	struct dirrem *dirrem;
8736	struct jremref *jremref;
8737	struct jremref *dotremref;
8738	struct jremref *dotdotremref;
8739{
8740	struct inodedep *inodedep;
8741	struct jaddref *jaddref;
8742	struct inoref *inoref;
8743	struct ufsmount *ump;
8744	struct mkdir *mkdir;
8745
8746	/*
8747	 * If no remove references were allocated we're on a non-journaled
8748	 * filesystem and can skip the cancel step.
8749	 */
8750	if (jremref == NULL) {
8751		free_diradd(dap, NULL);
8752		return;
8753	}
8754	/*
8755	 * Cancel the primary name an free it if it does not require
8756	 * journaling.
8757	 */
8758	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
8759	    0, &inodedep) != 0) {
8760		/* Abort the addref that reference this diradd.  */
8761		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
8762			if (inoref->if_list.wk_type != D_JADDREF)
8763				continue;
8764			jaddref = (struct jaddref *)inoref;
8765			if (jaddref->ja_diradd != dap)
8766				continue;
8767			if (cancel_jaddref(jaddref, inodedep,
8768			    &dirrem->dm_jwork) == 0) {
8769				free_jremref(jremref);
8770				jremref = NULL;
8771			}
8772			break;
8773		}
8774	}
8775	/*
8776	 * Cancel subordinate names and free them if they do not require
8777	 * journaling.
8778	 */
8779	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8780		ump = VFSTOUFS(dap->da_list.wk_mp);
8781		LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
8782			if (mkdir->md_diradd != dap)
8783				continue;
8784			if ((jaddref = mkdir->md_jaddref) == NULL)
8785				continue;
8786			mkdir->md_jaddref = NULL;
8787			if (mkdir->md_state & MKDIR_PARENT) {
8788				if (cancel_jaddref(jaddref, NULL,
8789				    &dirrem->dm_jwork) == 0) {
8790					free_jremref(dotdotremref);
8791					dotdotremref = NULL;
8792				}
8793			} else {
8794				if (cancel_jaddref(jaddref, inodedep,
8795				    &dirrem->dm_jwork) == 0) {
8796					free_jremref(dotremref);
8797					dotremref = NULL;
8798				}
8799			}
8800		}
8801	}
8802
8803	if (jremref)
8804		journal_jremref(dirrem, jremref, inodedep);
8805	if (dotremref)
8806		journal_jremref(dirrem, dotremref, inodedep);
8807	if (dotdotremref)
8808		journal_jremref(dirrem, dotdotremref, NULL);
8809	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
8810	free_diradd(dap, &dirrem->dm_jwork);
8811}
8812
8813/*
8814 * Free a diradd dependency structure. This routine must be called
8815 * with splbio interrupts blocked.
8816 */
8817static void
8818free_diradd(dap, wkhd)
8819	struct diradd *dap;
8820	struct workhead *wkhd;
8821{
8822	struct dirrem *dirrem;
8823	struct pagedep *pagedep;
8824	struct inodedep *inodedep;
8825	struct mkdir *mkdir, *nextmd;
8826	struct ufsmount *ump;
8827
8828	ump = VFSTOUFS(dap->da_list.wk_mp);
8829	LOCK_OWNED(ump);
8830	LIST_REMOVE(dap, da_pdlist);
8831	if (dap->da_state & ONWORKLIST)
8832		WORKLIST_REMOVE(&dap->da_list);
8833	if ((dap->da_state & DIRCHG) == 0) {
8834		pagedep = dap->da_pagedep;
8835	} else {
8836		dirrem = dap->da_previous;
8837		pagedep = dirrem->dm_pagedep;
8838		dirrem->dm_dirinum = pagedep->pd_ino;
8839		dirrem->dm_state |= COMPLETE;
8840		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
8841			add_to_worklist(&dirrem->dm_list, 0);
8842	}
8843	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
8844	    0, &inodedep) != 0)
8845		if (inodedep->id_mkdiradd == dap)
8846			inodedep->id_mkdiradd = NULL;
8847	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
8848		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
8849		     mkdir = nextmd) {
8850			nextmd = LIST_NEXT(mkdir, md_mkdirs);
8851			if (mkdir->md_diradd != dap)
8852				continue;
8853			dap->da_state &=
8854			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
8855			LIST_REMOVE(mkdir, md_mkdirs);
8856			if (mkdir->md_state & ONWORKLIST)
8857				WORKLIST_REMOVE(&mkdir->md_list);
8858			if (mkdir->md_jaddref != NULL)
8859				panic("free_diradd: Unexpected jaddref");
8860			WORKITEM_FREE(mkdir, D_MKDIR);
8861			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
8862				break;
8863		}
8864		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
8865			panic("free_diradd: unfound ref");
8866	}
8867	if (inodedep)
8868		free_inodedep(inodedep);
8869	/*
8870	 * Free any journal segments waiting for the directory write.
8871	 */
8872	handle_jwork(&dap->da_jwork);
8873	WORKITEM_FREE(dap, D_DIRADD);
8874}
8875
8876/*
8877 * Directory entry removal dependencies.
8878 *
8879 * When removing a directory entry, the entry's inode pointer must be
8880 * zero'ed on disk before the corresponding inode's link count is decremented
8881 * (possibly freeing the inode for re-use). This dependency is handled by
8882 * updating the directory entry but delaying the inode count reduction until
8883 * after the directory block has been written to disk. After this point, the
8884 * inode count can be decremented whenever it is convenient.
8885 */
8886
8887/*
8888 * This routine should be called immediately after removing
8889 * a directory entry.  The inode's link count should not be
8890 * decremented by the calling procedure -- the soft updates
8891 * code will do this task when it is safe.
8892 */
8893void
8894softdep_setup_remove(bp, dp, ip, isrmdir)
8895	struct buf *bp;		/* buffer containing directory block */
8896	struct inode *dp;	/* inode for the directory being modified */
8897	struct inode *ip;	/* inode for directory entry being removed */
8898	int isrmdir;		/* indicates if doing RMDIR */
8899{
8900	struct dirrem *dirrem, *prevdirrem;
8901	struct inodedep *inodedep;
8902	int direct;
8903
8904	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
8905	    ("softdep_setup_remove called on non-softdep filesystem"));
8906	/*
8907	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
8908	 * newdirrem() to setup the full directory remove which requires
8909	 * isrmdir > 1.
8910	 */
8911	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
8912	/*
8913	 * Add the dirrem to the inodedep's pending remove list for quick
8914	 * discovery later.
8915	 */
8916	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
8917	    &inodedep) == 0)
8918		panic("softdep_setup_remove: Lost inodedep.");
8919	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
8920	dirrem->dm_state |= ONDEPLIST;
8921	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
8922
8923	/*
8924	 * If the COMPLETE flag is clear, then there were no active
8925	 * entries and we want to roll back to a zeroed entry until
8926	 * the new inode is committed to disk. If the COMPLETE flag is
8927	 * set then we have deleted an entry that never made it to
8928	 * disk. If the entry we deleted resulted from a name change,
8929	 * then the old name still resides on disk. We cannot delete
8930	 * its inode (returned to us in prevdirrem) until the zeroed
8931	 * directory entry gets to disk. The new inode has never been
8932	 * referenced on the disk, so can be deleted immediately.
8933	 */
8934	if ((dirrem->dm_state & COMPLETE) == 0) {
8935		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
8936		    dm_next);
8937		FREE_LOCK(ip->i_ump);
8938	} else {
8939		if (prevdirrem != NULL)
8940			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
8941			    prevdirrem, dm_next);
8942		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
8943		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
8944		FREE_LOCK(ip->i_ump);
8945		if (direct)
8946			handle_workitem_remove(dirrem, 0);
8947	}
8948}
8949
8950/*
8951 * Check for an entry matching 'offset' on both the pd_dirraddhd list and the
8952 * pd_pendinghd list of a pagedep.
8953 */
8954static struct diradd *
8955diradd_lookup(pagedep, offset)
8956	struct pagedep *pagedep;
8957	int offset;
8958{
8959	struct diradd *dap;
8960
8961	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
8962		if (dap->da_offset == offset)
8963			return (dap);
8964	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
8965		if (dap->da_offset == offset)
8966			return (dap);
8967	return (NULL);
8968}
8969
8970/*
8971 * Search for a .. diradd dependency in a directory that is being removed.
8972 * If the directory was renamed to a new parent we have a diradd rather
8973 * than a mkdir for the .. entry.  We need to cancel it now before
8974 * it is found in truncate().
8975 */
8976static struct jremref *
8977cancel_diradd_dotdot(ip, dirrem, jremref)
8978	struct inode *ip;
8979	struct dirrem *dirrem;
8980	struct jremref *jremref;
8981{
8982	struct pagedep *pagedep;
8983	struct diradd *dap;
8984	struct worklist *wk;
8985
8986	if (pagedep_lookup(UFSTOVFS(ip->i_ump), NULL, ip->i_number, 0, 0,
8987	    &pagedep) == 0)
8988		return (jremref);
8989	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
8990	if (dap == NULL)
8991		return (jremref);
8992	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
8993	/*
8994	 * Mark any journal work as belonging to the parent so it is freed
8995	 * with the .. reference.
8996	 */
8997	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
8998		wk->wk_state |= MKDIR_PARENT;
8999	return (NULL);
9000}
9001
9002/*
9003 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
9004 * replace it with a dirrem/diradd pair as a result of re-parenting a
9005 * directory.  This ensures that we don't simultaneously have a mkdir and
9006 * a diradd for the same .. entry.
9007 */
9008static struct jremref *
9009cancel_mkdir_dotdot(ip, dirrem, jremref)
9010	struct inode *ip;
9011	struct dirrem *dirrem;
9012	struct jremref *jremref;
9013{
9014	struct inodedep *inodedep;
9015	struct jaddref *jaddref;
9016	struct ufsmount *ump;
9017	struct mkdir *mkdir;
9018	struct diradd *dap;
9019
9020	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
9021	    &inodedep) == 0)
9022		return (jremref);
9023	dap = inodedep->id_mkdiradd;
9024	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
9025		return (jremref);
9026	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9027	for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9028	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
9029		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
9030			break;
9031	if (mkdir == NULL)
9032		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
9033	if ((jaddref = mkdir->md_jaddref) != NULL) {
9034		mkdir->md_jaddref = NULL;
9035		jaddref->ja_state &= ~MKDIR_PARENT;
9036		if (inodedep_lookup(UFSTOVFS(ip->i_ump), jaddref->ja_ino, 0,
9037		    &inodedep) == 0)
9038			panic("cancel_mkdir_dotdot: Lost parent inodedep");
9039		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
9040			journal_jremref(dirrem, jremref, inodedep);
9041			jremref = NULL;
9042		}
9043	}
9044	if (mkdir->md_state & ONWORKLIST)
9045		WORKLIST_REMOVE(&mkdir->md_list);
9046	mkdir->md_state |= ALLCOMPLETE;
9047	complete_mkdir(mkdir);
9048	return (jremref);
9049}
9050
9051static void
9052journal_jremref(dirrem, jremref, inodedep)
9053	struct dirrem *dirrem;
9054	struct jremref *jremref;
9055	struct inodedep *inodedep;
9056{
9057
9058	if (inodedep == NULL)
9059		if (inodedep_lookup(jremref->jr_list.wk_mp,
9060		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
9061			panic("journal_jremref: Lost inodedep");
9062	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
9063	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
9064	add_to_journal(&jremref->jr_list);
9065}
9066
9067static void
9068dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
9069	struct dirrem *dirrem;
9070	struct jremref *jremref;
9071	struct jremref *dotremref;
9072	struct jremref *dotdotremref;
9073{
9074	struct inodedep *inodedep;
9075
9076
9077	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
9078	    &inodedep) == 0)
9079		panic("dirrem_journal: Lost inodedep");
9080	journal_jremref(dirrem, jremref, inodedep);
9081	if (dotremref)
9082		journal_jremref(dirrem, dotremref, inodedep);
9083	if (dotdotremref)
9084		journal_jremref(dirrem, dotdotremref, NULL);
9085}
9086
9087/*
9088 * Allocate a new dirrem if appropriate and return it along with
9089 * its associated pagedep. Called without a lock, returns with lock.
9090 */
9091static struct dirrem *
9092newdirrem(bp, dp, ip, isrmdir, prevdirremp)
9093	struct buf *bp;		/* buffer containing directory block */
9094	struct inode *dp;	/* inode for the directory being modified */
9095	struct inode *ip;	/* inode for directory entry being removed */
9096	int isrmdir;		/* indicates if doing RMDIR */
9097	struct dirrem **prevdirremp; /* previously referenced inode, if any */
9098{
9099	int offset;
9100	ufs_lbn_t lbn;
9101	struct diradd *dap;
9102	struct dirrem *dirrem;
9103	struct pagedep *pagedep;
9104	struct jremref *jremref;
9105	struct jremref *dotremref;
9106	struct jremref *dotdotremref;
9107	struct vnode *dvp;
9108
9109	/*
9110	 * Whiteouts have no deletion dependencies.
9111	 */
9112	if (ip == NULL)
9113		panic("newdirrem: whiteout");
9114	dvp = ITOV(dp);
9115	/*
9116	 * If the system is over its limit and our filesystem is
9117	 * responsible for more than our share of that usage and
9118	 * we are not a snapshot, request some inodedep cleanup.
9119	 * Limiting the number of dirrem structures will also limit
9120	 * the number of freefile and freeblks structures.
9121	 */
9122	ACQUIRE_LOCK(ip->i_ump);
9123	if (!IS_SNAPSHOT(ip) && softdep_excess_items(ip->i_ump, D_DIRREM))
9124		schedule_cleanup(ITOV(dp)->v_mount);
9125	else
9126		FREE_LOCK(ip->i_ump);
9127	dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
9128	    M_ZERO);
9129	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
9130	LIST_INIT(&dirrem->dm_jremrefhd);
9131	LIST_INIT(&dirrem->dm_jwork);
9132	dirrem->dm_state = isrmdir ? RMDIR : 0;
9133	dirrem->dm_oldinum = ip->i_number;
9134	*prevdirremp = NULL;
9135	/*
9136	 * Allocate remove reference structures to track journal write
9137	 * dependencies.  We will always have one for the link and
9138	 * when doing directories we will always have one more for dot.
9139	 * When renaming a directory we skip the dotdot link change so
9140	 * this is not needed.
9141	 */
9142	jremref = dotremref = dotdotremref = NULL;
9143	if (DOINGSUJ(dvp)) {
9144		if (isrmdir) {
9145			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9146			    ip->i_effnlink + 2);
9147			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
9148			    ip->i_effnlink + 1);
9149			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
9150			    dp->i_effnlink + 1);
9151			dotdotremref->jr_state |= MKDIR_PARENT;
9152		} else
9153			jremref = newjremref(dirrem, dp, ip, dp->i_offset,
9154			    ip->i_effnlink + 1);
9155	}
9156	ACQUIRE_LOCK(ip->i_ump);
9157	lbn = lblkno(dp->i_fs, dp->i_offset);
9158	offset = blkoff(dp->i_fs, dp->i_offset);
9159	pagedep_lookup(UFSTOVFS(dp->i_ump), bp, dp->i_number, lbn, DEPALLOC,
9160	    &pagedep);
9161	dirrem->dm_pagedep = pagedep;
9162	dirrem->dm_offset = offset;
9163	/*
9164	 * If we're renaming a .. link to a new directory, cancel any
9165	 * existing MKDIR_PARENT mkdir.  If it has already been canceled
9166	 * the jremref is preserved for any potential diradd in this
9167	 * location.  This can not coincide with a rmdir.
9168	 */
9169	if (dp->i_offset == DOTDOT_OFFSET) {
9170		if (isrmdir)
9171			panic("newdirrem: .. directory change during remove?");
9172		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
9173	}
9174	/*
9175	 * If we're removing a directory search for the .. dependency now and
9176	 * cancel it.  Any pending journal work will be added to the dirrem
9177	 * to be completed when the workitem remove completes.
9178	 */
9179	if (isrmdir)
9180		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
9181	/*
9182	 * Check for a diradd dependency for the same directory entry.
9183	 * If present, then both dependencies become obsolete and can
9184	 * be de-allocated.
9185	 */
9186	dap = diradd_lookup(pagedep, offset);
9187	if (dap == NULL) {
9188		/*
9189		 * Link the jremref structures into the dirrem so they are
9190		 * written prior to the pagedep.
9191		 */
9192		if (jremref)
9193			dirrem_journal(dirrem, jremref, dotremref,
9194			    dotdotremref);
9195		return (dirrem);
9196	}
9197	/*
9198	 * Must be ATTACHED at this point.
9199	 */
9200	if ((dap->da_state & ATTACHED) == 0)
9201		panic("newdirrem: not ATTACHED");
9202	if (dap->da_newinum != ip->i_number)
9203		panic("newdirrem: inum %ju should be %ju",
9204		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
9205	/*
9206	 * If we are deleting a changed name that never made it to disk,
9207	 * then return the dirrem describing the previous inode (which
9208	 * represents the inode currently referenced from this entry on disk).
9209	 */
9210	if ((dap->da_state & DIRCHG) != 0) {
9211		*prevdirremp = dap->da_previous;
9212		dap->da_state &= ~DIRCHG;
9213		dap->da_pagedep = pagedep;
9214	}
9215	/*
9216	 * We are deleting an entry that never made it to disk.
9217	 * Mark it COMPLETE so we can delete its inode immediately.
9218	 */
9219	dirrem->dm_state |= COMPLETE;
9220	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
9221#ifdef SUJ_DEBUG
9222	if (isrmdir == 0) {
9223		struct worklist *wk;
9224
9225		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9226			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
9227				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
9228	}
9229#endif
9230
9231	return (dirrem);
9232}
9233
9234/*
9235 * Directory entry change dependencies.
9236 *
9237 * Changing an existing directory entry requires that an add operation
9238 * be completed first followed by a deletion. The semantics for the addition
9239 * are identical to the description of adding a new entry above except
9240 * that the rollback is to the old inode number rather than zero. Once
9241 * the addition dependency is completed, the removal is done as described
9242 * in the removal routine above.
9243 */
9244
9245/*
9246 * This routine should be called immediately after changing
9247 * a directory entry.  The inode's link count should not be
9248 * decremented by the calling procedure -- the soft updates
9249 * code will perform this task when it is safe.
9250 */
9251void
9252softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
9253	struct buf *bp;		/* buffer containing directory block */
9254	struct inode *dp;	/* inode for the directory being modified */
9255	struct inode *ip;	/* inode for directory entry being removed */
9256	ino_t newinum;		/* new inode number for changed entry */
9257	int isrmdir;		/* indicates if doing RMDIR */
9258{
9259	int offset;
9260	struct diradd *dap = NULL;
9261	struct dirrem *dirrem, *prevdirrem;
9262	struct pagedep *pagedep;
9263	struct inodedep *inodedep;
9264	struct jaddref *jaddref;
9265	struct mount *mp;
9266
9267	offset = blkoff(dp->i_fs, dp->i_offset);
9268	mp = UFSTOVFS(dp->i_ump);
9269	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
9270	   ("softdep_setup_directory_change called on non-softdep filesystem"));
9271
9272	/*
9273	 * Whiteouts do not need diradd dependencies.
9274	 */
9275	if (newinum != WINO) {
9276		dap = malloc(sizeof(struct diradd),
9277		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9278		workitem_alloc(&dap->da_list, D_DIRADD, mp);
9279		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9280		dap->da_offset = offset;
9281		dap->da_newinum = newinum;
9282		LIST_INIT(&dap->da_jwork);
9283	}
9284
9285	/*
9286	 * Allocate a new dirrem and ACQUIRE_LOCK.
9287	 */
9288	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9289	pagedep = dirrem->dm_pagedep;
9290	/*
9291	 * The possible values for isrmdir:
9292	 *	0 - non-directory file rename
9293	 *	1 - directory rename within same directory
9294	 *   inum - directory rename to new directory of given inode number
9295	 * When renaming to a new directory, we are both deleting and
9296	 * creating a new directory entry, so the link count on the new
9297	 * directory should not change. Thus we do not need the followup
9298	 * dirrem which is usually done in handle_workitem_remove. We set
9299	 * the DIRCHG flag to tell handle_workitem_remove to skip the
9300	 * followup dirrem.
9301	 */
9302	if (isrmdir > 1)
9303		dirrem->dm_state |= DIRCHG;
9304
9305	/*
9306	 * Whiteouts have no additional dependencies,
9307	 * so just put the dirrem on the correct list.
9308	 */
9309	if (newinum == WINO) {
9310		if ((dirrem->dm_state & COMPLETE) == 0) {
9311			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9312			    dm_next);
9313		} else {
9314			dirrem->dm_dirinum = pagedep->pd_ino;
9315			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9316				add_to_worklist(&dirrem->dm_list, 0);
9317		}
9318		FREE_LOCK(dp->i_ump);
9319		return;
9320	}
9321	/*
9322	 * Add the dirrem to the inodedep's pending remove list for quick
9323	 * discovery later.  A valid nlinkdelta ensures that this lookup
9324	 * will not fail.
9325	 */
9326	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9327		panic("softdep_setup_directory_change: Lost inodedep.");
9328	dirrem->dm_state |= ONDEPLIST;
9329	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9330
9331	/*
9332	 * If the COMPLETE flag is clear, then there were no active
9333	 * entries and we want to roll back to the previous inode until
9334	 * the new inode is committed to disk. If the COMPLETE flag is
9335	 * set, then we have deleted an entry that never made it to disk.
9336	 * If the entry we deleted resulted from a name change, then the old
9337	 * inode reference still resides on disk. Any rollback that we do
9338	 * needs to be to that old inode (returned to us in prevdirrem). If
9339	 * the entry we deleted resulted from a create, then there is
9340	 * no entry on the disk, so we want to roll back to zero rather
9341	 * than the uncommitted inode. In either of the COMPLETE cases we
9342	 * want to immediately free the unwritten and unreferenced inode.
9343	 */
9344	if ((dirrem->dm_state & COMPLETE) == 0) {
9345		dap->da_previous = dirrem;
9346	} else {
9347		if (prevdirrem != NULL) {
9348			dap->da_previous = prevdirrem;
9349		} else {
9350			dap->da_state &= ~DIRCHG;
9351			dap->da_pagedep = pagedep;
9352		}
9353		dirrem->dm_dirinum = pagedep->pd_ino;
9354		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9355			add_to_worklist(&dirrem->dm_list, 0);
9356	}
9357	/*
9358	 * Lookup the jaddref for this journal entry.  We must finish
9359	 * initializing it and make the diradd write dependent on it.
9360	 * If we're not journaling, put it on the id_bufwait list if the
9361	 * inode is not yet written. If it is written, do the post-inode
9362	 * write processing to put it on the id_pendinghd list.
9363	 */
9364	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
9365	if (MOUNTEDSUJ(mp)) {
9366		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9367		    inoreflst);
9368		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9369		    ("softdep_setup_directory_change: bad jaddref %p",
9370		    jaddref));
9371		jaddref->ja_diroff = dp->i_offset;
9372		jaddref->ja_diradd = dap;
9373		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9374		    dap, da_pdlist);
9375		add_to_journal(&jaddref->ja_list);
9376	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9377		dap->da_state |= COMPLETE;
9378		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9379		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9380	} else {
9381		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9382		    dap, da_pdlist);
9383		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9384	}
9385	/*
9386	 * If we're making a new name for a directory that has not been
9387	 * committed when need to move the dot and dotdot references to
9388	 * this new name.
9389	 */
9390	if (inodedep->id_mkdiradd && dp->i_offset != DOTDOT_OFFSET)
9391		merge_diradd(inodedep, dap);
9392	FREE_LOCK(dp->i_ump);
9393}
9394
9395/*
9396 * Called whenever the link count on an inode is changed.
9397 * It creates an inode dependency so that the new reference(s)
9398 * to the inode cannot be committed to disk until the updated
9399 * inode has been written.
9400 */
9401void
9402softdep_change_linkcnt(ip)
9403	struct inode *ip;	/* the inode with the increased link count */
9404{
9405	struct inodedep *inodedep;
9406
9407	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
9408	    ("softdep_change_linkcnt called on non-softdep filesystem"));
9409	ACQUIRE_LOCK(ip->i_ump);
9410	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC,
9411	    &inodedep);
9412	if (ip->i_nlink < ip->i_effnlink)
9413		panic("softdep_change_linkcnt: bad delta");
9414	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9415	FREE_LOCK(ip->i_ump);
9416}
9417
9418/*
9419 * Attach a sbdep dependency to the superblock buf so that we can keep
9420 * track of the head of the linked list of referenced but unlinked inodes.
9421 */
9422void
9423softdep_setup_sbupdate(ump, fs, bp)
9424	struct ufsmount *ump;
9425	struct fs *fs;
9426	struct buf *bp;
9427{
9428	struct sbdep *sbdep;
9429	struct worklist *wk;
9430
9431	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9432	    ("softdep_setup_sbupdate called on non-softdep filesystem"));
9433	LIST_FOREACH(wk, &bp->b_dep, wk_list)
9434		if (wk->wk_type == D_SBDEP)
9435			break;
9436	if (wk != NULL)
9437		return;
9438	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9439	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9440	sbdep->sb_fs = fs;
9441	sbdep->sb_ump = ump;
9442	ACQUIRE_LOCK(ump);
9443	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9444	FREE_LOCK(ump);
9445}
9446
9447/*
9448 * Return the first unlinked inodedep which is ready to be the head of the
9449 * list.  The inodedep and all those after it must have valid next pointers.
9450 */
9451static struct inodedep *
9452first_unlinked_inodedep(ump)
9453	struct ufsmount *ump;
9454{
9455	struct inodedep *inodedep;
9456	struct inodedep *idp;
9457
9458	LOCK_OWNED(ump);
9459	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9460	    inodedep; inodedep = idp) {
9461		if ((inodedep->id_state & UNLINKNEXT) == 0)
9462			return (NULL);
9463		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9464		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9465			break;
9466		if ((inodedep->id_state & UNLINKPREV) == 0)
9467			break;
9468	}
9469	return (inodedep);
9470}
9471
9472/*
9473 * Set the sujfree unlinked head pointer prior to writing a superblock.
9474 */
9475static void
9476initiate_write_sbdep(sbdep)
9477	struct sbdep *sbdep;
9478{
9479	struct inodedep *inodedep;
9480	struct fs *bpfs;
9481	struct fs *fs;
9482
9483	bpfs = sbdep->sb_fs;
9484	fs = sbdep->sb_ump->um_fs;
9485	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9486	if (inodedep) {
9487		fs->fs_sujfree = inodedep->id_ino;
9488		inodedep->id_state |= UNLINKPREV;
9489	} else
9490		fs->fs_sujfree = 0;
9491	bpfs->fs_sujfree = fs->fs_sujfree;
9492}
9493
9494/*
9495 * After a superblock is written determine whether it must be written again
9496 * due to a changing unlinked list head.
9497 */
9498static int
9499handle_written_sbdep(sbdep, bp)
9500	struct sbdep *sbdep;
9501	struct buf *bp;
9502{
9503	struct inodedep *inodedep;
9504	struct fs *fs;
9505
9506	LOCK_OWNED(sbdep->sb_ump);
9507	fs = sbdep->sb_fs;
9508	/*
9509	 * If the superblock doesn't match the in-memory list start over.
9510	 */
9511	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9512	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9513	    (inodedep == NULL && fs->fs_sujfree != 0)) {
9514		bdirty(bp);
9515		return (1);
9516	}
9517	WORKITEM_FREE(sbdep, D_SBDEP);
9518	if (fs->fs_sujfree == 0)
9519		return (0);
9520	/*
9521	 * Now that we have a record of this inode in stable store allow it
9522	 * to be written to free up pending work.  Inodes may see a lot of
9523	 * write activity after they are unlinked which we must not hold up.
9524	 */
9525	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9526		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9527			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9528			    inodedep, inodedep->id_state);
9529		if (inodedep->id_state & UNLINKONLIST)
9530			break;
9531		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9532	}
9533
9534	return (0);
9535}
9536
9537/*
9538 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9539 */
9540static void
9541unlinked_inodedep(mp, inodedep)
9542	struct mount *mp;
9543	struct inodedep *inodedep;
9544{
9545	struct ufsmount *ump;
9546
9547	ump = VFSTOUFS(mp);
9548	LOCK_OWNED(ump);
9549	if (MOUNTEDSUJ(mp) == 0)
9550		return;
9551	ump->um_fs->fs_fmod = 1;
9552	if (inodedep->id_state & UNLINKED)
9553		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9554	inodedep->id_state |= UNLINKED;
9555	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9556}
9557
9558/*
9559 * Remove an inodedep from the unlinked inodedep list.  This may require
9560 * disk writes if the inode has made it that far.
9561 */
9562static void
9563clear_unlinked_inodedep(inodedep)
9564	struct inodedep *inodedep;
9565{
9566	struct ufsmount *ump;
9567	struct inodedep *idp;
9568	struct inodedep *idn;
9569	struct fs *fs;
9570	struct buf *bp;
9571	ino_t ino;
9572	ino_t nino;
9573	ino_t pino;
9574	int error;
9575
9576	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9577	fs = ump->um_fs;
9578	ino = inodedep->id_ino;
9579	error = 0;
9580	for (;;) {
9581		LOCK_OWNED(ump);
9582		KASSERT((inodedep->id_state & UNLINKED) != 0,
9583		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9584		    inodedep));
9585		/*
9586		 * If nothing has yet been written simply remove us from
9587		 * the in memory list and return.  This is the most common
9588		 * case where handle_workitem_remove() loses the final
9589		 * reference.
9590		 */
9591		if ((inodedep->id_state & UNLINKLINKS) == 0)
9592			break;
9593		/*
9594		 * If we have a NEXT pointer and no PREV pointer we can simply
9595		 * clear NEXT's PREV and remove ourselves from the list.  Be
9596		 * careful not to clear PREV if the superblock points at
9597		 * next as well.
9598		 */
9599		idn = TAILQ_NEXT(inodedep, id_unlinked);
9600		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
9601			if (idn && fs->fs_sujfree != idn->id_ino)
9602				idn->id_state &= ~UNLINKPREV;
9603			break;
9604		}
9605		/*
9606		 * Here we have an inodedep which is actually linked into
9607		 * the list.  We must remove it by forcing a write to the
9608		 * link before us, whether it be the superblock or an inode.
9609		 * Unfortunately the list may change while we're waiting
9610		 * on the buf lock for either resource so we must loop until
9611		 * we lock the right one.  If both the superblock and an
9612		 * inode point to this inode we must clear the inode first
9613		 * followed by the superblock.
9614		 */
9615		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9616		pino = 0;
9617		if (idp && (idp->id_state & UNLINKNEXT))
9618			pino = idp->id_ino;
9619		FREE_LOCK(ump);
9620		if (pino == 0) {
9621			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9622			    (int)fs->fs_sbsize, 0, 0, 0);
9623		} else {
9624			error = bread(ump->um_devvp,
9625			    fsbtodb(fs, ino_to_fsba(fs, pino)),
9626			    (int)fs->fs_bsize, NOCRED, &bp);
9627			if (error)
9628				brelse(bp);
9629		}
9630		ACQUIRE_LOCK(ump);
9631		if (error)
9632			break;
9633		/* If the list has changed restart the loop. */
9634		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9635		nino = 0;
9636		if (idp && (idp->id_state & UNLINKNEXT))
9637			nino = idp->id_ino;
9638		if (nino != pino ||
9639		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
9640			FREE_LOCK(ump);
9641			brelse(bp);
9642			ACQUIRE_LOCK(ump);
9643			continue;
9644		}
9645		nino = 0;
9646		idn = TAILQ_NEXT(inodedep, id_unlinked);
9647		if (idn)
9648			nino = idn->id_ino;
9649		/*
9650		 * Remove us from the in memory list.  After this we cannot
9651		 * access the inodedep.
9652		 */
9653		KASSERT((inodedep->id_state & UNLINKED) != 0,
9654		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
9655		    inodedep));
9656		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9657		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9658		FREE_LOCK(ump);
9659		/*
9660		 * The predecessor's next pointer is manually updated here
9661		 * so that the NEXT flag is never cleared for an element
9662		 * that is in the list.
9663		 */
9664		if (pino == 0) {
9665			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9666			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9667			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9668			    bp);
9669		} else if (fs->fs_magic == FS_UFS1_MAGIC)
9670			((struct ufs1_dinode *)bp->b_data +
9671			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9672		else
9673			((struct ufs2_dinode *)bp->b_data +
9674			    ino_to_fsbo(fs, pino))->di_freelink = nino;
9675		/*
9676		 * If the bwrite fails we have no recourse to recover.  The
9677		 * filesystem is corrupted already.
9678		 */
9679		bwrite(bp);
9680		ACQUIRE_LOCK(ump);
9681		/*
9682		 * If the superblock pointer still needs to be cleared force
9683		 * a write here.
9684		 */
9685		if (fs->fs_sujfree == ino) {
9686			FREE_LOCK(ump);
9687			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
9688			    (int)fs->fs_sbsize, 0, 0, 0);
9689			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
9690			ffs_oldfscompat_write((struct fs *)bp->b_data, ump);
9691			softdep_setup_sbupdate(ump, (struct fs *)bp->b_data,
9692			    bp);
9693			bwrite(bp);
9694			ACQUIRE_LOCK(ump);
9695		}
9696
9697		if (fs->fs_sujfree != ino)
9698			return;
9699		panic("clear_unlinked_inodedep: Failed to clear free head");
9700	}
9701	if (inodedep->id_ino == fs->fs_sujfree)
9702		panic("clear_unlinked_inodedep: Freeing head of free list");
9703	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
9704	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
9705	return;
9706}
9707
9708/*
9709 * This workitem decrements the inode's link count.
9710 * If the link count reaches zero, the file is removed.
9711 */
9712static int
9713handle_workitem_remove(dirrem, flags)
9714	struct dirrem *dirrem;
9715	int flags;
9716{
9717	struct inodedep *inodedep;
9718	struct workhead dotdotwk;
9719	struct worklist *wk;
9720	struct ufsmount *ump;
9721	struct mount *mp;
9722	struct vnode *vp;
9723	struct inode *ip;
9724	ino_t oldinum;
9725
9726	if (dirrem->dm_state & ONWORKLIST)
9727		panic("handle_workitem_remove: dirrem %p still on worklist",
9728		    dirrem);
9729	oldinum = dirrem->dm_oldinum;
9730	mp = dirrem->dm_list.wk_mp;
9731	ump = VFSTOUFS(mp);
9732	flags |= LK_EXCLUSIVE;
9733	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
9734		return (EBUSY);
9735	ip = VTOI(vp);
9736	ACQUIRE_LOCK(ump);
9737	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
9738		panic("handle_workitem_remove: lost inodedep");
9739	if (dirrem->dm_state & ONDEPLIST)
9740		LIST_REMOVE(dirrem, dm_inonext);
9741	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
9742	    ("handle_workitem_remove:  Journal entries not written."));
9743
9744	/*
9745	 * Move all dependencies waiting on the remove to complete
9746	 * from the dirrem to the inode inowait list to be completed
9747	 * after the inode has been updated and written to disk.  Any
9748	 * marked MKDIR_PARENT are saved to be completed when the .. ref
9749	 * is removed.
9750	 */
9751	LIST_INIT(&dotdotwk);
9752	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
9753		WORKLIST_REMOVE(wk);
9754		if (wk->wk_state & MKDIR_PARENT) {
9755			wk->wk_state &= ~MKDIR_PARENT;
9756			WORKLIST_INSERT(&dotdotwk, wk);
9757			continue;
9758		}
9759		WORKLIST_INSERT(&inodedep->id_inowait, wk);
9760	}
9761	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
9762	/*
9763	 * Normal file deletion.
9764	 */
9765	if ((dirrem->dm_state & RMDIR) == 0) {
9766		ip->i_nlink--;
9767		DIP_SET(ip, i_nlink, ip->i_nlink);
9768		ip->i_flag |= IN_CHANGE;
9769		if (ip->i_nlink < ip->i_effnlink)
9770			panic("handle_workitem_remove: bad file delta");
9771		if (ip->i_nlink == 0)
9772			unlinked_inodedep(mp, inodedep);
9773		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9774		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9775		    ("handle_workitem_remove: worklist not empty. %s",
9776		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
9777		WORKITEM_FREE(dirrem, D_DIRREM);
9778		FREE_LOCK(ump);
9779		goto out;
9780	}
9781	/*
9782	 * Directory deletion. Decrement reference count for both the
9783	 * just deleted parent directory entry and the reference for ".".
9784	 * Arrange to have the reference count on the parent decremented
9785	 * to account for the loss of "..".
9786	 */
9787	ip->i_nlink -= 2;
9788	DIP_SET(ip, i_nlink, ip->i_nlink);
9789	ip->i_flag |= IN_CHANGE;
9790	if (ip->i_nlink < ip->i_effnlink)
9791		panic("handle_workitem_remove: bad dir delta");
9792	if (ip->i_nlink == 0)
9793		unlinked_inodedep(mp, inodedep);
9794	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9795	/*
9796	 * Rename a directory to a new parent. Since, we are both deleting
9797	 * and creating a new directory entry, the link count on the new
9798	 * directory should not change. Thus we skip the followup dirrem.
9799	 */
9800	if (dirrem->dm_state & DIRCHG) {
9801		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
9802		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
9803		WORKITEM_FREE(dirrem, D_DIRREM);
9804		FREE_LOCK(ump);
9805		goto out;
9806	}
9807	dirrem->dm_state = ONDEPLIST;
9808	dirrem->dm_oldinum = dirrem->dm_dirinum;
9809	/*
9810	 * Place the dirrem on the parent's diremhd list.
9811	 */
9812	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
9813		panic("handle_workitem_remove: lost dir inodedep");
9814	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9815	/*
9816	 * If the allocated inode has never been written to disk, then
9817	 * the on-disk inode is zero'ed and we can remove the file
9818	 * immediately.  When journaling if the inode has been marked
9819	 * unlinked and not DEPCOMPLETE we know it can never be written.
9820	 */
9821	inodedep_lookup(mp, oldinum, 0, &inodedep);
9822	if (inodedep == NULL ||
9823	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
9824	    check_inode_unwritten(inodedep)) {
9825		FREE_LOCK(ump);
9826		vput(vp);
9827		return handle_workitem_remove(dirrem, flags);
9828	}
9829	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
9830	FREE_LOCK(ump);
9831	ip->i_flag |= IN_CHANGE;
9832out:
9833	ffs_update(vp, 0);
9834	vput(vp);
9835	return (0);
9836}
9837
9838/*
9839 * Inode de-allocation dependencies.
9840 *
9841 * When an inode's link count is reduced to zero, it can be de-allocated. We
9842 * found it convenient to postpone de-allocation until after the inode is
9843 * written to disk with its new link count (zero).  At this point, all of the
9844 * on-disk inode's block pointers are nullified and, with careful dependency
9845 * list ordering, all dependencies related to the inode will be satisfied and
9846 * the corresponding dependency structures de-allocated.  So, if/when the
9847 * inode is reused, there will be no mixing of old dependencies with new
9848 * ones.  This artificial dependency is set up by the block de-allocation
9849 * procedure above (softdep_setup_freeblocks) and completed by the
9850 * following procedure.
9851 */
9852static void
9853handle_workitem_freefile(freefile)
9854	struct freefile *freefile;
9855{
9856	struct workhead wkhd;
9857	struct fs *fs;
9858	struct inodedep *idp;
9859	struct ufsmount *ump;
9860	int error;
9861
9862	ump = VFSTOUFS(freefile->fx_list.wk_mp);
9863	fs = ump->um_fs;
9864#ifdef DEBUG
9865	ACQUIRE_LOCK(ump);
9866	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
9867	FREE_LOCK(ump);
9868	if (error)
9869		panic("handle_workitem_freefile: inodedep %p survived", idp);
9870#endif
9871	UFS_LOCK(ump);
9872	fs->fs_pendinginodes -= 1;
9873	UFS_UNLOCK(ump);
9874	LIST_INIT(&wkhd);
9875	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
9876	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
9877	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
9878		softdep_error("handle_workitem_freefile", error);
9879	ACQUIRE_LOCK(ump);
9880	WORKITEM_FREE(freefile, D_FREEFILE);
9881	FREE_LOCK(ump);
9882}
9883
9884
9885/*
9886 * Helper function which unlinks marker element from work list and returns
9887 * the next element on the list.
9888 */
9889static __inline struct worklist *
9890markernext(struct worklist *marker)
9891{
9892	struct worklist *next;
9893
9894	next = LIST_NEXT(marker, wk_list);
9895	LIST_REMOVE(marker, wk_list);
9896	return next;
9897}
9898
9899/*
9900 * Disk writes.
9901 *
9902 * The dependency structures constructed above are most actively used when file
9903 * system blocks are written to disk.  No constraints are placed on when a
9904 * block can be written, but unsatisfied update dependencies are made safe by
9905 * modifying (or replacing) the source memory for the duration of the disk
9906 * write.  When the disk write completes, the memory block is again brought
9907 * up-to-date.
9908 *
9909 * In-core inode structure reclamation.
9910 *
9911 * Because there are a finite number of "in-core" inode structures, they are
9912 * reused regularly.  By transferring all inode-related dependencies to the
9913 * in-memory inode block and indexing them separately (via "inodedep"s), we
9914 * can allow "in-core" inode structures to be reused at any time and avoid
9915 * any increase in contention.
9916 *
9917 * Called just before entering the device driver to initiate a new disk I/O.
9918 * The buffer must be locked, thus, no I/O completion operations can occur
9919 * while we are manipulating its associated dependencies.
9920 */
9921static void
9922softdep_disk_io_initiation(bp)
9923	struct buf *bp;		/* structure describing disk write to occur */
9924{
9925	struct worklist *wk;
9926	struct worklist marker;
9927	struct inodedep *inodedep;
9928	struct freeblks *freeblks;
9929	struct jblkdep *jblkdep;
9930	struct newblk *newblk;
9931	struct ufsmount *ump;
9932
9933	/*
9934	 * We only care about write operations. There should never
9935	 * be dependencies for reads.
9936	 */
9937	if (bp->b_iocmd != BIO_WRITE)
9938		panic("softdep_disk_io_initiation: not write");
9939
9940	if (bp->b_vflags & BV_BKGRDINPROG)
9941		panic("softdep_disk_io_initiation: Writing buffer with "
9942		    "background write in progress: %p", bp);
9943
9944	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
9945		return;
9946	ump = VFSTOUFS(wk->wk_mp);
9947
9948	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
9949	PHOLD(curproc);			/* Don't swap out kernel stack */
9950	ACQUIRE_LOCK(ump);
9951	/*
9952	 * Do any necessary pre-I/O processing.
9953	 */
9954	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
9955	     wk = markernext(&marker)) {
9956		LIST_INSERT_AFTER(wk, &marker, wk_list);
9957		switch (wk->wk_type) {
9958
9959		case D_PAGEDEP:
9960			initiate_write_filepage(WK_PAGEDEP(wk), bp);
9961			continue;
9962
9963		case D_INODEDEP:
9964			inodedep = WK_INODEDEP(wk);
9965			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
9966				initiate_write_inodeblock_ufs1(inodedep, bp);
9967			else
9968				initiate_write_inodeblock_ufs2(inodedep, bp);
9969			continue;
9970
9971		case D_INDIRDEP:
9972			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
9973			continue;
9974
9975		case D_BMSAFEMAP:
9976			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
9977			continue;
9978
9979		case D_JSEG:
9980			WK_JSEG(wk)->js_buf = NULL;
9981			continue;
9982
9983		case D_FREEBLKS:
9984			freeblks = WK_FREEBLKS(wk);
9985			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
9986			/*
9987			 * We have to wait for the freeblks to be journaled
9988			 * before we can write an inodeblock with updated
9989			 * pointers.  Be careful to arrange the marker so
9990			 * we revisit the freeblks if it's not removed by
9991			 * the first jwait().
9992			 */
9993			if (jblkdep != NULL) {
9994				LIST_REMOVE(&marker, wk_list);
9995				LIST_INSERT_BEFORE(wk, &marker, wk_list);
9996				jwait(&jblkdep->jb_list, MNT_WAIT);
9997			}
9998			continue;
9999		case D_ALLOCDIRECT:
10000		case D_ALLOCINDIR:
10001			/*
10002			 * We have to wait for the jnewblk to be journaled
10003			 * before we can write to a block if the contents
10004			 * may be confused with an earlier file's indirect
10005			 * at recovery time.  Handle the marker as described
10006			 * above.
10007			 */
10008			newblk = WK_NEWBLK(wk);
10009			if (newblk->nb_jnewblk != NULL &&
10010			    indirblk_lookup(newblk->nb_list.wk_mp,
10011			    newblk->nb_newblkno)) {
10012				LIST_REMOVE(&marker, wk_list);
10013				LIST_INSERT_BEFORE(wk, &marker, wk_list);
10014				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
10015			}
10016			continue;
10017
10018		case D_SBDEP:
10019			initiate_write_sbdep(WK_SBDEP(wk));
10020			continue;
10021
10022		case D_MKDIR:
10023		case D_FREEWORK:
10024		case D_FREEDEP:
10025		case D_JSEGDEP:
10026			continue;
10027
10028		default:
10029			panic("handle_disk_io_initiation: Unexpected type %s",
10030			    TYPENAME(wk->wk_type));
10031			/* NOTREACHED */
10032		}
10033	}
10034	FREE_LOCK(ump);
10035	PRELE(curproc);			/* Allow swapout of kernel stack */
10036}
10037
10038/*
10039 * Called from within the procedure above to deal with unsatisfied
10040 * allocation dependencies in a directory. The buffer must be locked,
10041 * thus, no I/O completion operations can occur while we are
10042 * manipulating its associated dependencies.
10043 */
10044static void
10045initiate_write_filepage(pagedep, bp)
10046	struct pagedep *pagedep;
10047	struct buf *bp;
10048{
10049	struct jremref *jremref;
10050	struct jmvref *jmvref;
10051	struct dirrem *dirrem;
10052	struct diradd *dap;
10053	struct direct *ep;
10054	int i;
10055
10056	if (pagedep->pd_state & IOSTARTED) {
10057		/*
10058		 * This can only happen if there is a driver that does not
10059		 * understand chaining. Here biodone will reissue the call
10060		 * to strategy for the incomplete buffers.
10061		 */
10062		printf("initiate_write_filepage: already started\n");
10063		return;
10064	}
10065	pagedep->pd_state |= IOSTARTED;
10066	/*
10067	 * Wait for all journal remove dependencies to hit the disk.
10068	 * We can not allow any potentially conflicting directory adds
10069	 * to be visible before removes and rollback is too difficult.
10070	 * The per-filesystem lock may be dropped and re-acquired, however
10071	 * we hold the buf locked so the dependency can not go away.
10072	 */
10073	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
10074		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
10075			jwait(&jremref->jr_list, MNT_WAIT);
10076	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
10077		jwait(&jmvref->jm_list, MNT_WAIT);
10078	for (i = 0; i < DAHASHSZ; i++) {
10079		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
10080			ep = (struct direct *)
10081			    ((char *)bp->b_data + dap->da_offset);
10082			if (ep->d_ino != dap->da_newinum)
10083				panic("%s: dir inum %ju != new %ju",
10084				    "initiate_write_filepage",
10085				    (uintmax_t)ep->d_ino,
10086				    (uintmax_t)dap->da_newinum);
10087			if (dap->da_state & DIRCHG)
10088				ep->d_ino = dap->da_previous->dm_oldinum;
10089			else
10090				ep->d_ino = 0;
10091			dap->da_state &= ~ATTACHED;
10092			dap->da_state |= UNDONE;
10093		}
10094	}
10095}
10096
10097/*
10098 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
10099 * Note that any bug fixes made to this routine must be done in the
10100 * version found below.
10101 *
10102 * Called from within the procedure above to deal with unsatisfied
10103 * allocation dependencies in an inodeblock. The buffer must be
10104 * locked, thus, no I/O completion operations can occur while we
10105 * are manipulating its associated dependencies.
10106 */
10107static void
10108initiate_write_inodeblock_ufs1(inodedep, bp)
10109	struct inodedep *inodedep;
10110	struct buf *bp;			/* The inode block */
10111{
10112	struct allocdirect *adp, *lastadp;
10113	struct ufs1_dinode *dp;
10114	struct ufs1_dinode *sip;
10115	struct inoref *inoref;
10116	struct ufsmount *ump;
10117	struct fs *fs;
10118	ufs_lbn_t i;
10119#ifdef INVARIANTS
10120	ufs_lbn_t prevlbn = 0;
10121#endif
10122	int deplist;
10123
10124	if (inodedep->id_state & IOSTARTED)
10125		panic("initiate_write_inodeblock_ufs1: already started");
10126	inodedep->id_state |= IOSTARTED;
10127	fs = inodedep->id_fs;
10128	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10129	LOCK_OWNED(ump);
10130	dp = (struct ufs1_dinode *)bp->b_data +
10131	    ino_to_fsbo(fs, inodedep->id_ino);
10132
10133	/*
10134	 * If we're on the unlinked list but have not yet written our
10135	 * next pointer initialize it here.
10136	 */
10137	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10138		struct inodedep *inon;
10139
10140		inon = TAILQ_NEXT(inodedep, id_unlinked);
10141		dp->di_freelink = inon ? inon->id_ino : 0;
10142	}
10143	/*
10144	 * If the bitmap is not yet written, then the allocated
10145	 * inode cannot be written to disk.
10146	 */
10147	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10148		if (inodedep->id_savedino1 != NULL)
10149			panic("initiate_write_inodeblock_ufs1: I/O underway");
10150		FREE_LOCK(ump);
10151		sip = malloc(sizeof(struct ufs1_dinode),
10152		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10153		ACQUIRE_LOCK(ump);
10154		inodedep->id_savedino1 = sip;
10155		*inodedep->id_savedino1 = *dp;
10156		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
10157		dp->di_gen = inodedep->id_savedino1->di_gen;
10158		dp->di_freelink = inodedep->id_savedino1->di_freelink;
10159		return;
10160	}
10161	/*
10162	 * If no dependencies, then there is nothing to roll back.
10163	 */
10164	inodedep->id_savedsize = dp->di_size;
10165	inodedep->id_savedextsize = 0;
10166	inodedep->id_savednlink = dp->di_nlink;
10167	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10168	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10169		return;
10170	/*
10171	 * Revert the link count to that of the first unwritten journal entry.
10172	 */
10173	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10174	if (inoref)
10175		dp->di_nlink = inoref->if_nlink;
10176	/*
10177	 * Set the dependencies to busy.
10178	 */
10179	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10180	     adp = TAILQ_NEXT(adp, ad_next)) {
10181#ifdef INVARIANTS
10182		if (deplist != 0 && prevlbn >= adp->ad_offset)
10183			panic("softdep_write_inodeblock: lbn order");
10184		prevlbn = adp->ad_offset;
10185		if (adp->ad_offset < NDADDR &&
10186		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10187			panic("%s: direct pointer #%jd mismatch %d != %jd",
10188			    "softdep_write_inodeblock",
10189			    (intmax_t)adp->ad_offset,
10190			    dp->di_db[adp->ad_offset],
10191			    (intmax_t)adp->ad_newblkno);
10192		if (adp->ad_offset >= NDADDR &&
10193		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10194			panic("%s: indirect pointer #%jd mismatch %d != %jd",
10195			    "softdep_write_inodeblock",
10196			    (intmax_t)adp->ad_offset - NDADDR,
10197			    dp->di_ib[adp->ad_offset - NDADDR],
10198			    (intmax_t)adp->ad_newblkno);
10199		deplist |= 1 << adp->ad_offset;
10200		if ((adp->ad_state & ATTACHED) == 0)
10201			panic("softdep_write_inodeblock: Unknown state 0x%x",
10202			    adp->ad_state);
10203#endif /* INVARIANTS */
10204		adp->ad_state &= ~ATTACHED;
10205		adp->ad_state |= UNDONE;
10206	}
10207	/*
10208	 * The on-disk inode cannot claim to be any larger than the last
10209	 * fragment that has been written. Otherwise, the on-disk inode
10210	 * might have fragments that were not the last block in the file
10211	 * which would corrupt the filesystem.
10212	 */
10213	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10214	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10215		if (adp->ad_offset >= NDADDR)
10216			break;
10217		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10218		/* keep going until hitting a rollback to a frag */
10219		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10220			continue;
10221		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10222		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10223#ifdef INVARIANTS
10224			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10225				panic("softdep_write_inodeblock: lost dep1");
10226#endif /* INVARIANTS */
10227			dp->di_db[i] = 0;
10228		}
10229		for (i = 0; i < NIADDR; i++) {
10230#ifdef INVARIANTS
10231			if (dp->di_ib[i] != 0 &&
10232			    (deplist & ((1 << NDADDR) << i)) == 0)
10233				panic("softdep_write_inodeblock: lost dep2");
10234#endif /* INVARIANTS */
10235			dp->di_ib[i] = 0;
10236		}
10237		return;
10238	}
10239	/*
10240	 * If we have zero'ed out the last allocated block of the file,
10241	 * roll back the size to the last currently allocated block.
10242	 * We know that this last allocated block is a full-sized as
10243	 * we already checked for fragments in the loop above.
10244	 */
10245	if (lastadp != NULL &&
10246	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10247		for (i = lastadp->ad_offset; i >= 0; i--)
10248			if (dp->di_db[i] != 0)
10249				break;
10250		dp->di_size = (i + 1) * fs->fs_bsize;
10251	}
10252	/*
10253	 * The only dependencies are for indirect blocks.
10254	 *
10255	 * The file size for indirect block additions is not guaranteed.
10256	 * Such a guarantee would be non-trivial to achieve. The conventional
10257	 * synchronous write implementation also does not make this guarantee.
10258	 * Fsck should catch and fix discrepancies. Arguably, the file size
10259	 * can be over-estimated without destroying integrity when the file
10260	 * moves into the indirect blocks (i.e., is large). If we want to
10261	 * postpone fsck, we are stuck with this argument.
10262	 */
10263	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10264		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10265}
10266
10267/*
10268 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10269 * Note that any bug fixes made to this routine must be done in the
10270 * version found above.
10271 *
10272 * Called from within the procedure above to deal with unsatisfied
10273 * allocation dependencies in an inodeblock. The buffer must be
10274 * locked, thus, no I/O completion operations can occur while we
10275 * are manipulating its associated dependencies.
10276 */
10277static void
10278initiate_write_inodeblock_ufs2(inodedep, bp)
10279	struct inodedep *inodedep;
10280	struct buf *bp;			/* The inode block */
10281{
10282	struct allocdirect *adp, *lastadp;
10283	struct ufs2_dinode *dp;
10284	struct ufs2_dinode *sip;
10285	struct inoref *inoref;
10286	struct ufsmount *ump;
10287	struct fs *fs;
10288	ufs_lbn_t i;
10289#ifdef INVARIANTS
10290	ufs_lbn_t prevlbn = 0;
10291#endif
10292	int deplist;
10293
10294	if (inodedep->id_state & IOSTARTED)
10295		panic("initiate_write_inodeblock_ufs2: already started");
10296	inodedep->id_state |= IOSTARTED;
10297	fs = inodedep->id_fs;
10298	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10299	LOCK_OWNED(ump);
10300	dp = (struct ufs2_dinode *)bp->b_data +
10301	    ino_to_fsbo(fs, inodedep->id_ino);
10302
10303	/*
10304	 * If we're on the unlinked list but have not yet written our
10305	 * next pointer initialize it here.
10306	 */
10307	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10308		struct inodedep *inon;
10309
10310		inon = TAILQ_NEXT(inodedep, id_unlinked);
10311		dp->di_freelink = inon ? inon->id_ino : 0;
10312	}
10313	/*
10314	 * If the bitmap is not yet written, then the allocated
10315	 * inode cannot be written to disk.
10316	 */
10317	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10318		if (inodedep->id_savedino2 != NULL)
10319			panic("initiate_write_inodeblock_ufs2: I/O underway");
10320		FREE_LOCK(ump);
10321		sip = malloc(sizeof(struct ufs2_dinode),
10322		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10323		ACQUIRE_LOCK(ump);
10324		inodedep->id_savedino2 = sip;
10325		*inodedep->id_savedino2 = *dp;
10326		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10327		dp->di_gen = inodedep->id_savedino2->di_gen;
10328		dp->di_freelink = inodedep->id_savedino2->di_freelink;
10329		return;
10330	}
10331	/*
10332	 * If no dependencies, then there is nothing to roll back.
10333	 */
10334	inodedep->id_savedsize = dp->di_size;
10335	inodedep->id_savedextsize = dp->di_extsize;
10336	inodedep->id_savednlink = dp->di_nlink;
10337	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10338	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
10339	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10340		return;
10341	/*
10342	 * Revert the link count to that of the first unwritten journal entry.
10343	 */
10344	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10345	if (inoref)
10346		dp->di_nlink = inoref->if_nlink;
10347
10348	/*
10349	 * Set the ext data dependencies to busy.
10350	 */
10351	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10352	     adp = TAILQ_NEXT(adp, ad_next)) {
10353#ifdef INVARIANTS
10354		if (deplist != 0 && prevlbn >= adp->ad_offset)
10355			panic("softdep_write_inodeblock: lbn order");
10356		prevlbn = adp->ad_offset;
10357		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10358			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10359			    "softdep_write_inodeblock",
10360			    (intmax_t)adp->ad_offset,
10361			    (intmax_t)dp->di_extb[adp->ad_offset],
10362			    (intmax_t)adp->ad_newblkno);
10363		deplist |= 1 << adp->ad_offset;
10364		if ((adp->ad_state & ATTACHED) == 0)
10365			panic("softdep_write_inodeblock: Unknown state 0x%x",
10366			    adp->ad_state);
10367#endif /* INVARIANTS */
10368		adp->ad_state &= ~ATTACHED;
10369		adp->ad_state |= UNDONE;
10370	}
10371	/*
10372	 * The on-disk inode cannot claim to be any larger than the last
10373	 * fragment that has been written. Otherwise, the on-disk inode
10374	 * might have fragments that were not the last block in the ext
10375	 * data which would corrupt the filesystem.
10376	 */
10377	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10378	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10379		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10380		/* keep going until hitting a rollback to a frag */
10381		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10382			continue;
10383		dp->di_extsize = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10384		for (i = adp->ad_offset + 1; i < NXADDR; i++) {
10385#ifdef INVARIANTS
10386			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10387				panic("softdep_write_inodeblock: lost dep1");
10388#endif /* INVARIANTS */
10389			dp->di_extb[i] = 0;
10390		}
10391		lastadp = NULL;
10392		break;
10393	}
10394	/*
10395	 * If we have zero'ed out the last allocated block of the ext
10396	 * data, roll back the size to the last currently allocated block.
10397	 * We know that this last allocated block is a full-sized as
10398	 * we already checked for fragments in the loop above.
10399	 */
10400	if (lastadp != NULL &&
10401	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10402		for (i = lastadp->ad_offset; i >= 0; i--)
10403			if (dp->di_extb[i] != 0)
10404				break;
10405		dp->di_extsize = (i + 1) * fs->fs_bsize;
10406	}
10407	/*
10408	 * Set the file data dependencies to busy.
10409	 */
10410	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10411	     adp = TAILQ_NEXT(adp, ad_next)) {
10412#ifdef INVARIANTS
10413		if (deplist != 0 && prevlbn >= adp->ad_offset)
10414			panic("softdep_write_inodeblock: lbn order");
10415		if ((adp->ad_state & ATTACHED) == 0)
10416			panic("inodedep %p and adp %p not attached", inodedep, adp);
10417		prevlbn = adp->ad_offset;
10418		if (adp->ad_offset < NDADDR &&
10419		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10420			panic("%s: direct pointer #%jd mismatch %jd != %jd",
10421			    "softdep_write_inodeblock",
10422			    (intmax_t)adp->ad_offset,
10423			    (intmax_t)dp->di_db[adp->ad_offset],
10424			    (intmax_t)adp->ad_newblkno);
10425		if (adp->ad_offset >= NDADDR &&
10426		    dp->di_ib[adp->ad_offset - NDADDR] != adp->ad_newblkno)
10427			panic("%s indirect pointer #%jd mismatch %jd != %jd",
10428			    "softdep_write_inodeblock:",
10429			    (intmax_t)adp->ad_offset - NDADDR,
10430			    (intmax_t)dp->di_ib[adp->ad_offset - NDADDR],
10431			    (intmax_t)adp->ad_newblkno);
10432		deplist |= 1 << adp->ad_offset;
10433		if ((adp->ad_state & ATTACHED) == 0)
10434			panic("softdep_write_inodeblock: Unknown state 0x%x",
10435			    adp->ad_state);
10436#endif /* INVARIANTS */
10437		adp->ad_state &= ~ATTACHED;
10438		adp->ad_state |= UNDONE;
10439	}
10440	/*
10441	 * The on-disk inode cannot claim to be any larger than the last
10442	 * fragment that has been written. Otherwise, the on-disk inode
10443	 * might have fragments that were not the last block in the file
10444	 * which would corrupt the filesystem.
10445	 */
10446	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10447	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10448		if (adp->ad_offset >= NDADDR)
10449			break;
10450		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10451		/* keep going until hitting a rollback to a frag */
10452		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10453			continue;
10454		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10455		for (i = adp->ad_offset + 1; i < NDADDR; i++) {
10456#ifdef INVARIANTS
10457			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10458				panic("softdep_write_inodeblock: lost dep2");
10459#endif /* INVARIANTS */
10460			dp->di_db[i] = 0;
10461		}
10462		for (i = 0; i < NIADDR; i++) {
10463#ifdef INVARIANTS
10464			if (dp->di_ib[i] != 0 &&
10465			    (deplist & ((1 << NDADDR) << i)) == 0)
10466				panic("softdep_write_inodeblock: lost dep3");
10467#endif /* INVARIANTS */
10468			dp->di_ib[i] = 0;
10469		}
10470		return;
10471	}
10472	/*
10473	 * If we have zero'ed out the last allocated block of the file,
10474	 * roll back the size to the last currently allocated block.
10475	 * We know that this last allocated block is a full-sized as
10476	 * we already checked for fragments in the loop above.
10477	 */
10478	if (lastadp != NULL &&
10479	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10480		for (i = lastadp->ad_offset; i >= 0; i--)
10481			if (dp->di_db[i] != 0)
10482				break;
10483		dp->di_size = (i + 1) * fs->fs_bsize;
10484	}
10485	/*
10486	 * The only dependencies are for indirect blocks.
10487	 *
10488	 * The file size for indirect block additions is not guaranteed.
10489	 * Such a guarantee would be non-trivial to achieve. The conventional
10490	 * synchronous write implementation also does not make this guarantee.
10491	 * Fsck should catch and fix discrepancies. Arguably, the file size
10492	 * can be over-estimated without destroying integrity when the file
10493	 * moves into the indirect blocks (i.e., is large). If we want to
10494	 * postpone fsck, we are stuck with this argument.
10495	 */
10496	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10497		dp->di_ib[adp->ad_offset - NDADDR] = 0;
10498}
10499
10500/*
10501 * Cancel an indirdep as a result of truncation.  Release all of the
10502 * children allocindirs and place their journal work on the appropriate
10503 * list.
10504 */
10505static void
10506cancel_indirdep(indirdep, bp, freeblks)
10507	struct indirdep *indirdep;
10508	struct buf *bp;
10509	struct freeblks *freeblks;
10510{
10511	struct allocindir *aip;
10512
10513	/*
10514	 * None of the indirect pointers will ever be visible,
10515	 * so they can simply be tossed. GOINGAWAY ensures
10516	 * that allocated pointers will be saved in the buffer
10517	 * cache until they are freed. Note that they will
10518	 * only be able to be found by their physical address
10519	 * since the inode mapping the logical address will
10520	 * be gone. The save buffer used for the safe copy
10521	 * was allocated in setup_allocindir_phase2 using
10522	 * the physical address so it could be used for this
10523	 * purpose. Hence we swap the safe copy with the real
10524	 * copy, allowing the safe copy to be freed and holding
10525	 * on to the real copy for later use in indir_trunc.
10526	 */
10527	if (indirdep->ir_state & GOINGAWAY)
10528		panic("cancel_indirdep: already gone");
10529	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
10530		indirdep->ir_state |= DEPCOMPLETE;
10531		LIST_REMOVE(indirdep, ir_next);
10532	}
10533	indirdep->ir_state |= GOINGAWAY;
10534	/*
10535	 * Pass in bp for blocks still have journal writes
10536	 * pending so we can cancel them on their own.
10537	 */
10538	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL)
10539		cancel_allocindir(aip, bp, freeblks, 0);
10540	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL)
10541		cancel_allocindir(aip, NULL, freeblks, 0);
10542	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL)
10543		cancel_allocindir(aip, NULL, freeblks, 0);
10544	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL)
10545		cancel_allocindir(aip, NULL, freeblks, 0);
10546	/*
10547	 * If there are pending partial truncations we need to keep the
10548	 * old block copy around until they complete.  This is because
10549	 * the current b_data is not a perfect superset of the available
10550	 * blocks.
10551	 */
10552	if (TAILQ_EMPTY(&indirdep->ir_trunc))
10553		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
10554	else
10555		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10556	WORKLIST_REMOVE(&indirdep->ir_list);
10557	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
10558	indirdep->ir_bp = NULL;
10559	indirdep->ir_freeblks = freeblks;
10560}
10561
10562/*
10563 * Free an indirdep once it no longer has new pointers to track.
10564 */
10565static void
10566free_indirdep(indirdep)
10567	struct indirdep *indirdep;
10568{
10569
10570	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
10571	    ("free_indirdep: Indir trunc list not empty."));
10572	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
10573	    ("free_indirdep: Complete head not empty."));
10574	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
10575	    ("free_indirdep: write head not empty."));
10576	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
10577	    ("free_indirdep: done head not empty."));
10578	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
10579	    ("free_indirdep: deplist head not empty."));
10580	KASSERT((indirdep->ir_state & DEPCOMPLETE),
10581	    ("free_indirdep: %p still on newblk list.", indirdep));
10582	KASSERT(indirdep->ir_saveddata == NULL,
10583	    ("free_indirdep: %p still has saved data.", indirdep));
10584	if (indirdep->ir_state & ONWORKLIST)
10585		WORKLIST_REMOVE(&indirdep->ir_list);
10586	WORKITEM_FREE(indirdep, D_INDIRDEP);
10587}
10588
10589/*
10590 * Called before a write to an indirdep.  This routine is responsible for
10591 * rolling back pointers to a safe state which includes only those
10592 * allocindirs which have been completed.
10593 */
10594static void
10595initiate_write_indirdep(indirdep, bp)
10596	struct indirdep *indirdep;
10597	struct buf *bp;
10598{
10599	struct ufsmount *ump;
10600
10601	indirdep->ir_state |= IOSTARTED;
10602	if (indirdep->ir_state & GOINGAWAY)
10603		panic("disk_io_initiation: indirdep gone");
10604	/*
10605	 * If there are no remaining dependencies, this will be writing
10606	 * the real pointers.
10607	 */
10608	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
10609	    TAILQ_EMPTY(&indirdep->ir_trunc))
10610		return;
10611	/*
10612	 * Replace up-to-date version with safe version.
10613	 */
10614	if (indirdep->ir_saveddata == NULL) {
10615		ump = VFSTOUFS(indirdep->ir_list.wk_mp);
10616		LOCK_OWNED(ump);
10617		FREE_LOCK(ump);
10618		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
10619		    M_SOFTDEP_FLAGS);
10620		ACQUIRE_LOCK(ump);
10621	}
10622	indirdep->ir_state &= ~ATTACHED;
10623	indirdep->ir_state |= UNDONE;
10624	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
10625	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
10626	    bp->b_bcount);
10627}
10628
10629/*
10630 * Called when an inode has been cleared in a cg bitmap.  This finally
10631 * eliminates any canceled jaddrefs
10632 */
10633void
10634softdep_setup_inofree(mp, bp, ino, wkhd)
10635	struct mount *mp;
10636	struct buf *bp;
10637	ino_t ino;
10638	struct workhead *wkhd;
10639{
10640	struct worklist *wk, *wkn;
10641	struct inodedep *inodedep;
10642	struct ufsmount *ump;
10643	uint8_t *inosused;
10644	struct cg *cgp;
10645	struct fs *fs;
10646
10647	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
10648	    ("softdep_setup_inofree called on non-softdep filesystem"));
10649	ump = VFSTOUFS(mp);
10650	ACQUIRE_LOCK(ump);
10651	fs = ump->um_fs;
10652	cgp = (struct cg *)bp->b_data;
10653	inosused = cg_inosused(cgp);
10654	if (isset(inosused, ino % fs->fs_ipg))
10655		panic("softdep_setup_inofree: inode %ju not freed.",
10656		    (uintmax_t)ino);
10657	if (inodedep_lookup(mp, ino, 0, &inodedep))
10658		panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
10659		    (uintmax_t)ino, inodedep);
10660	if (wkhd) {
10661		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
10662			if (wk->wk_type != D_JADDREF)
10663				continue;
10664			WORKLIST_REMOVE(wk);
10665			/*
10666			 * We can free immediately even if the jaddref
10667			 * isn't attached in a background write as now
10668			 * the bitmaps are reconciled.
10669			 */
10670			wk->wk_state |= COMPLETE | ATTACHED;
10671			free_jaddref(WK_JADDREF(wk));
10672		}
10673		jwork_move(&bp->b_dep, wkhd);
10674	}
10675	FREE_LOCK(ump);
10676}
10677
10678
10679/*
10680 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
10681 * map.  Any dependencies waiting for the write to clear are added to the
10682 * buf's list and any jnewblks that are being canceled are discarded
10683 * immediately.
10684 */
10685void
10686softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
10687	struct mount *mp;
10688	struct buf *bp;
10689	ufs2_daddr_t blkno;
10690	int frags;
10691	struct workhead *wkhd;
10692{
10693	struct bmsafemap *bmsafemap;
10694	struct jnewblk *jnewblk;
10695	struct ufsmount *ump;
10696	struct worklist *wk;
10697	struct fs *fs;
10698#ifdef SUJ_DEBUG
10699	uint8_t *blksfree;
10700	struct cg *cgp;
10701	ufs2_daddr_t jstart;
10702	ufs2_daddr_t jend;
10703	ufs2_daddr_t end;
10704	long bno;
10705	int i;
10706#endif
10707
10708	CTR3(KTR_SUJ,
10709	    "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
10710	    blkno, frags, wkhd);
10711
10712	ump = VFSTOUFS(mp);
10713	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
10714	    ("softdep_setup_blkfree called on non-softdep filesystem"));
10715	ACQUIRE_LOCK(ump);
10716	/* Lookup the bmsafemap so we track when it is dirty. */
10717	fs = ump->um_fs;
10718	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10719	/*
10720	 * Detach any jnewblks which have been canceled.  They must linger
10721	 * until the bitmap is cleared again by ffs_blkfree() to prevent
10722	 * an unjournaled allocation from hitting the disk.
10723	 */
10724	if (wkhd) {
10725		while ((wk = LIST_FIRST(wkhd)) != NULL) {
10726			CTR2(KTR_SUJ,
10727			    "softdep_setup_blkfree: blkno %jd wk type %d",
10728			    blkno, wk->wk_type);
10729			WORKLIST_REMOVE(wk);
10730			if (wk->wk_type != D_JNEWBLK) {
10731				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
10732				continue;
10733			}
10734			jnewblk = WK_JNEWBLK(wk);
10735			KASSERT(jnewblk->jn_state & GOINGAWAY,
10736			    ("softdep_setup_blkfree: jnewblk not canceled."));
10737#ifdef SUJ_DEBUG
10738			/*
10739			 * Assert that this block is free in the bitmap
10740			 * before we discard the jnewblk.
10741			 */
10742			cgp = (struct cg *)bp->b_data;
10743			blksfree = cg_blksfree(cgp);
10744			bno = dtogd(fs, jnewblk->jn_blkno);
10745			for (i = jnewblk->jn_oldfrags;
10746			    i < jnewblk->jn_frags; i++) {
10747				if (isset(blksfree, bno + i))
10748					continue;
10749				panic("softdep_setup_blkfree: not free");
10750			}
10751#endif
10752			/*
10753			 * Even if it's not attached we can free immediately
10754			 * as the new bitmap is correct.
10755			 */
10756			wk->wk_state |= COMPLETE | ATTACHED;
10757			free_jnewblk(jnewblk);
10758		}
10759	}
10760
10761#ifdef SUJ_DEBUG
10762	/*
10763	 * Assert that we are not freeing a block which has an outstanding
10764	 * allocation dependency.
10765	 */
10766	fs = VFSTOUFS(mp)->um_fs;
10767	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
10768	end = blkno + frags;
10769	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10770		/*
10771		 * Don't match against blocks that will be freed when the
10772		 * background write is done.
10773		 */
10774		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
10775		    (COMPLETE | DEPCOMPLETE))
10776			continue;
10777		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
10778		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
10779		if ((blkno >= jstart && blkno < jend) ||
10780		    (end > jstart && end <= jend)) {
10781			printf("state 0x%X %jd - %d %d dep %p\n",
10782			    jnewblk->jn_state, jnewblk->jn_blkno,
10783			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
10784			    jnewblk->jn_dep);
10785			panic("softdep_setup_blkfree: "
10786			    "%jd-%jd(%d) overlaps with %jd-%jd",
10787			    blkno, end, frags, jstart, jend);
10788		}
10789	}
10790#endif
10791	FREE_LOCK(ump);
10792}
10793
10794/*
10795 * Revert a block allocation when the journal record that describes it
10796 * is not yet written.
10797 */
10798static int
10799jnewblk_rollback(jnewblk, fs, cgp, blksfree)
10800	struct jnewblk *jnewblk;
10801	struct fs *fs;
10802	struct cg *cgp;
10803	uint8_t *blksfree;
10804{
10805	ufs1_daddr_t fragno;
10806	long cgbno, bbase;
10807	int frags, blk;
10808	int i;
10809
10810	frags = 0;
10811	cgbno = dtogd(fs, jnewblk->jn_blkno);
10812	/*
10813	 * We have to test which frags need to be rolled back.  We may
10814	 * be operating on a stale copy when doing background writes.
10815	 */
10816	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
10817		if (isclr(blksfree, cgbno + i))
10818			frags++;
10819	if (frags == 0)
10820		return (0);
10821	/*
10822	 * This is mostly ffs_blkfree() sans some validation and
10823	 * superblock updates.
10824	 */
10825	if (frags == fs->fs_frag) {
10826		fragno = fragstoblks(fs, cgbno);
10827		ffs_setblock(fs, blksfree, fragno);
10828		ffs_clusteracct(fs, cgp, fragno, 1);
10829		cgp->cg_cs.cs_nbfree++;
10830	} else {
10831		cgbno += jnewblk->jn_oldfrags;
10832		bbase = cgbno - fragnum(fs, cgbno);
10833		/* Decrement the old frags.  */
10834		blk = blkmap(fs, blksfree, bbase);
10835		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
10836		/* Deallocate the fragment */
10837		for (i = 0; i < frags; i++)
10838			setbit(blksfree, cgbno + i);
10839		cgp->cg_cs.cs_nffree += frags;
10840		/* Add back in counts associated with the new frags */
10841		blk = blkmap(fs, blksfree, bbase);
10842		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
10843		/* If a complete block has been reassembled, account for it. */
10844		fragno = fragstoblks(fs, bbase);
10845		if (ffs_isblock(fs, blksfree, fragno)) {
10846			cgp->cg_cs.cs_nffree -= fs->fs_frag;
10847			ffs_clusteracct(fs, cgp, fragno, 1);
10848			cgp->cg_cs.cs_nbfree++;
10849		}
10850	}
10851	stat_jnewblk++;
10852	jnewblk->jn_state &= ~ATTACHED;
10853	jnewblk->jn_state |= UNDONE;
10854
10855	return (frags);
10856}
10857
10858static void
10859initiate_write_bmsafemap(bmsafemap, bp)
10860	struct bmsafemap *bmsafemap;
10861	struct buf *bp;			/* The cg block. */
10862{
10863	struct jaddref *jaddref;
10864	struct jnewblk *jnewblk;
10865	uint8_t *inosused;
10866	uint8_t *blksfree;
10867	struct cg *cgp;
10868	struct fs *fs;
10869	ino_t ino;
10870
10871	if (bmsafemap->sm_state & IOSTARTED)
10872		return;
10873	bmsafemap->sm_state |= IOSTARTED;
10874	/*
10875	 * Clear any inode allocations which are pending journal writes.
10876	 */
10877	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
10878		cgp = (struct cg *)bp->b_data;
10879		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10880		inosused = cg_inosused(cgp);
10881		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
10882			ino = jaddref->ja_ino % fs->fs_ipg;
10883			if (isset(inosused, ino)) {
10884				if ((jaddref->ja_mode & IFMT) == IFDIR)
10885					cgp->cg_cs.cs_ndir--;
10886				cgp->cg_cs.cs_nifree++;
10887				clrbit(inosused, ino);
10888				jaddref->ja_state &= ~ATTACHED;
10889				jaddref->ja_state |= UNDONE;
10890				stat_jaddref++;
10891			} else
10892				panic("initiate_write_bmsafemap: inode %ju "
10893				    "marked free", (uintmax_t)jaddref->ja_ino);
10894		}
10895	}
10896	/*
10897	 * Clear any block allocations which are pending journal writes.
10898	 */
10899	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
10900		cgp = (struct cg *)bp->b_data;
10901		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
10902		blksfree = cg_blksfree(cgp);
10903		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
10904			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
10905				continue;
10906			panic("initiate_write_bmsafemap: block %jd "
10907			    "marked free", jnewblk->jn_blkno);
10908		}
10909	}
10910	/*
10911	 * Move allocation lists to the written lists so they can be
10912	 * cleared once the block write is complete.
10913	 */
10914	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
10915	    inodedep, id_deps);
10916	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
10917	    newblk, nb_deps);
10918	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
10919	    wk_list);
10920}
10921
10922/*
10923 * This routine is called during the completion interrupt
10924 * service routine for a disk write (from the procedure called
10925 * by the device driver to inform the filesystem caches of
10926 * a request completion).  It should be called early in this
10927 * procedure, before the block is made available to other
10928 * processes or other routines are called.
10929 *
10930 */
10931static void
10932softdep_disk_write_complete(bp)
10933	struct buf *bp;		/* describes the completed disk write */
10934{
10935	struct worklist *wk;
10936	struct worklist *owk;
10937	struct ufsmount *ump;
10938	struct workhead reattach;
10939	struct freeblks *freeblks;
10940	struct buf *sbp;
10941
10942	/*
10943	 * If an error occurred while doing the write, then the data
10944	 * has not hit the disk and the dependencies cannot be unrolled.
10945	 */
10946	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0)
10947		return;
10948	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
10949		return;
10950	ump = VFSTOUFS(wk->wk_mp);
10951	LIST_INIT(&reattach);
10952	/*
10953	 * This lock must not be released anywhere in this code segment.
10954	 */
10955	sbp = NULL;
10956	owk = NULL;
10957	ACQUIRE_LOCK(ump);
10958	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
10959		WORKLIST_REMOVE(wk);
10960		atomic_add_long(&dep_write[wk->wk_type], 1);
10961		if (wk == owk)
10962			panic("duplicate worklist: %p\n", wk);
10963		owk = wk;
10964		switch (wk->wk_type) {
10965
10966		case D_PAGEDEP:
10967			if (handle_written_filepage(WK_PAGEDEP(wk), bp))
10968				WORKLIST_INSERT(&reattach, wk);
10969			continue;
10970
10971		case D_INODEDEP:
10972			if (handle_written_inodeblock(WK_INODEDEP(wk), bp))
10973				WORKLIST_INSERT(&reattach, wk);
10974			continue;
10975
10976		case D_BMSAFEMAP:
10977			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp))
10978				WORKLIST_INSERT(&reattach, wk);
10979			continue;
10980
10981		case D_MKDIR:
10982			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
10983			continue;
10984
10985		case D_ALLOCDIRECT:
10986			wk->wk_state |= COMPLETE;
10987			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
10988			continue;
10989
10990		case D_ALLOCINDIR:
10991			wk->wk_state |= COMPLETE;
10992			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
10993			continue;
10994
10995		case D_INDIRDEP:
10996			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp))
10997				WORKLIST_INSERT(&reattach, wk);
10998			continue;
10999
11000		case D_FREEBLKS:
11001			wk->wk_state |= COMPLETE;
11002			freeblks = WK_FREEBLKS(wk);
11003			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
11004			    LIST_EMPTY(&freeblks->fb_jblkdephd))
11005				add_to_worklist(wk, WK_NODELAY);
11006			continue;
11007
11008		case D_FREEWORK:
11009			handle_written_freework(WK_FREEWORK(wk));
11010			break;
11011
11012		case D_JSEGDEP:
11013			free_jsegdep(WK_JSEGDEP(wk));
11014			continue;
11015
11016		case D_JSEG:
11017			handle_written_jseg(WK_JSEG(wk), bp);
11018			continue;
11019
11020		case D_SBDEP:
11021			if (handle_written_sbdep(WK_SBDEP(wk), bp))
11022				WORKLIST_INSERT(&reattach, wk);
11023			continue;
11024
11025		case D_FREEDEP:
11026			free_freedep(WK_FREEDEP(wk));
11027			continue;
11028
11029		default:
11030			panic("handle_disk_write_complete: Unknown type %s",
11031			    TYPENAME(wk->wk_type));
11032			/* NOTREACHED */
11033		}
11034	}
11035	/*
11036	 * Reattach any requests that must be redone.
11037	 */
11038	while ((wk = LIST_FIRST(&reattach)) != NULL) {
11039		WORKLIST_REMOVE(wk);
11040		WORKLIST_INSERT(&bp->b_dep, wk);
11041	}
11042	FREE_LOCK(ump);
11043	if (sbp)
11044		brelse(sbp);
11045}
11046
11047/*
11048 * Called from within softdep_disk_write_complete above. Note that
11049 * this routine is always called from interrupt level with further
11050 * splbio interrupts blocked.
11051 */
11052static void
11053handle_allocdirect_partdone(adp, wkhd)
11054	struct allocdirect *adp;	/* the completed allocdirect */
11055	struct workhead *wkhd;		/* Work to do when inode is writtne. */
11056{
11057	struct allocdirectlst *listhead;
11058	struct allocdirect *listadp;
11059	struct inodedep *inodedep;
11060	long bsize;
11061
11062	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11063		return;
11064	/*
11065	 * The on-disk inode cannot claim to be any larger than the last
11066	 * fragment that has been written. Otherwise, the on-disk inode
11067	 * might have fragments that were not the last block in the file
11068	 * which would corrupt the filesystem. Thus, we cannot free any
11069	 * allocdirects after one whose ad_oldblkno claims a fragment as
11070	 * these blocks must be rolled back to zero before writing the inode.
11071	 * We check the currently active set of allocdirects in id_inoupdt
11072	 * or id_extupdt as appropriate.
11073	 */
11074	inodedep = adp->ad_inodedep;
11075	bsize = inodedep->id_fs->fs_bsize;
11076	if (adp->ad_state & EXTDATA)
11077		listhead = &inodedep->id_extupdt;
11078	else
11079		listhead = &inodedep->id_inoupdt;
11080	TAILQ_FOREACH(listadp, listhead, ad_next) {
11081		/* found our block */
11082		if (listadp == adp)
11083			break;
11084		/* continue if ad_oldlbn is not a fragment */
11085		if (listadp->ad_oldsize == 0 ||
11086		    listadp->ad_oldsize == bsize)
11087			continue;
11088		/* hit a fragment */
11089		return;
11090	}
11091	/*
11092	 * If we have reached the end of the current list without
11093	 * finding the just finished dependency, then it must be
11094	 * on the future dependency list. Future dependencies cannot
11095	 * be freed until they are moved to the current list.
11096	 */
11097	if (listadp == NULL) {
11098#ifdef DEBUG
11099		if (adp->ad_state & EXTDATA)
11100			listhead = &inodedep->id_newextupdt;
11101		else
11102			listhead = &inodedep->id_newinoupdt;
11103		TAILQ_FOREACH(listadp, listhead, ad_next)
11104			/* found our block */
11105			if (listadp == adp)
11106				break;
11107		if (listadp == NULL)
11108			panic("handle_allocdirect_partdone: lost dep");
11109#endif /* DEBUG */
11110		return;
11111	}
11112	/*
11113	 * If we have found the just finished dependency, then queue
11114	 * it along with anything that follows it that is complete.
11115	 * Since the pointer has not yet been written in the inode
11116	 * as the dependency prevents it, place the allocdirect on the
11117	 * bufwait list where it will be freed once the pointer is
11118	 * valid.
11119	 */
11120	if (wkhd == NULL)
11121		wkhd = &inodedep->id_bufwait;
11122	for (; adp; adp = listadp) {
11123		listadp = TAILQ_NEXT(adp, ad_next);
11124		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11125			return;
11126		TAILQ_REMOVE(listhead, adp, ad_next);
11127		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
11128	}
11129}
11130
11131/*
11132 * Called from within softdep_disk_write_complete above.  This routine
11133 * completes successfully written allocindirs.
11134 */
11135static void
11136handle_allocindir_partdone(aip)
11137	struct allocindir *aip;		/* the completed allocindir */
11138{
11139	struct indirdep *indirdep;
11140
11141	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
11142		return;
11143	indirdep = aip->ai_indirdep;
11144	LIST_REMOVE(aip, ai_next);
11145	/*
11146	 * Don't set a pointer while the buffer is undergoing IO or while
11147	 * we have active truncations.
11148	 */
11149	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
11150		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
11151		return;
11152	}
11153	if (indirdep->ir_state & UFS1FMT)
11154		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11155		    aip->ai_newblkno;
11156	else
11157		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11158		    aip->ai_newblkno;
11159	/*
11160	 * Await the pointer write before freeing the allocindir.
11161	 */
11162	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
11163}
11164
11165/*
11166 * Release segments held on a jwork list.
11167 */
11168static void
11169handle_jwork(wkhd)
11170	struct workhead *wkhd;
11171{
11172	struct worklist *wk;
11173
11174	while ((wk = LIST_FIRST(wkhd)) != NULL) {
11175		WORKLIST_REMOVE(wk);
11176		switch (wk->wk_type) {
11177		case D_JSEGDEP:
11178			free_jsegdep(WK_JSEGDEP(wk));
11179			continue;
11180		case D_FREEDEP:
11181			free_freedep(WK_FREEDEP(wk));
11182			continue;
11183		case D_FREEFRAG:
11184			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
11185			WORKITEM_FREE(wk, D_FREEFRAG);
11186			continue;
11187		case D_FREEWORK:
11188			handle_written_freework(WK_FREEWORK(wk));
11189			continue;
11190		default:
11191			panic("handle_jwork: Unknown type %s\n",
11192			    TYPENAME(wk->wk_type));
11193		}
11194	}
11195}
11196
11197/*
11198 * Handle the bufwait list on an inode when it is safe to release items
11199 * held there.  This normally happens after an inode block is written but
11200 * may be delayed and handled later if there are pending journal items that
11201 * are not yet safe to be released.
11202 */
11203static struct freefile *
11204handle_bufwait(inodedep, refhd)
11205	struct inodedep *inodedep;
11206	struct workhead *refhd;
11207{
11208	struct jaddref *jaddref;
11209	struct freefile *freefile;
11210	struct worklist *wk;
11211
11212	freefile = NULL;
11213	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
11214		WORKLIST_REMOVE(wk);
11215		switch (wk->wk_type) {
11216		case D_FREEFILE:
11217			/*
11218			 * We defer adding freefile to the worklist
11219			 * until all other additions have been made to
11220			 * ensure that it will be done after all the
11221			 * old blocks have been freed.
11222			 */
11223			if (freefile != NULL)
11224				panic("handle_bufwait: freefile");
11225			freefile = WK_FREEFILE(wk);
11226			continue;
11227
11228		case D_MKDIR:
11229			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
11230			continue;
11231
11232		case D_DIRADD:
11233			diradd_inode_written(WK_DIRADD(wk), inodedep);
11234			continue;
11235
11236		case D_FREEFRAG:
11237			wk->wk_state |= COMPLETE;
11238			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11239				add_to_worklist(wk, 0);
11240			continue;
11241
11242		case D_DIRREM:
11243			wk->wk_state |= COMPLETE;
11244			add_to_worklist(wk, 0);
11245			continue;
11246
11247		case D_ALLOCDIRECT:
11248		case D_ALLOCINDIR:
11249			free_newblk(WK_NEWBLK(wk));
11250			continue;
11251
11252		case D_JNEWBLK:
11253			wk->wk_state |= COMPLETE;
11254			free_jnewblk(WK_JNEWBLK(wk));
11255			continue;
11256
11257		/*
11258		 * Save freed journal segments and add references on
11259		 * the supplied list which will delay their release
11260		 * until the cg bitmap is cleared on disk.
11261		 */
11262		case D_JSEGDEP:
11263			if (refhd == NULL)
11264				free_jsegdep(WK_JSEGDEP(wk));
11265			else
11266				WORKLIST_INSERT(refhd, wk);
11267			continue;
11268
11269		case D_JADDREF:
11270			jaddref = WK_JADDREF(wk);
11271			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11272			    if_deps);
11273			/*
11274			 * Transfer any jaddrefs to the list to be freed with
11275			 * the bitmap if we're handling a removed file.
11276			 */
11277			if (refhd == NULL) {
11278				wk->wk_state |= COMPLETE;
11279				free_jaddref(jaddref);
11280			} else
11281				WORKLIST_INSERT(refhd, wk);
11282			continue;
11283
11284		default:
11285			panic("handle_bufwait: Unknown type %p(%s)",
11286			    wk, TYPENAME(wk->wk_type));
11287			/* NOTREACHED */
11288		}
11289	}
11290	return (freefile);
11291}
11292/*
11293 * Called from within softdep_disk_write_complete above to restore
11294 * in-memory inode block contents to their most up-to-date state. Note
11295 * that this routine is always called from interrupt level with further
11296 * splbio interrupts blocked.
11297 */
11298static int
11299handle_written_inodeblock(inodedep, bp)
11300	struct inodedep *inodedep;
11301	struct buf *bp;		/* buffer containing the inode block */
11302{
11303	struct freefile *freefile;
11304	struct allocdirect *adp, *nextadp;
11305	struct ufs1_dinode *dp1 = NULL;
11306	struct ufs2_dinode *dp2 = NULL;
11307	struct workhead wkhd;
11308	int hadchanges, fstype;
11309	ino_t freelink;
11310
11311	LIST_INIT(&wkhd);
11312	hadchanges = 0;
11313	freefile = NULL;
11314	if ((inodedep->id_state & IOSTARTED) == 0)
11315		panic("handle_written_inodeblock: not started");
11316	inodedep->id_state &= ~IOSTARTED;
11317	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11318		fstype = UFS1;
11319		dp1 = (struct ufs1_dinode *)bp->b_data +
11320		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11321		freelink = dp1->di_freelink;
11322	} else {
11323		fstype = UFS2;
11324		dp2 = (struct ufs2_dinode *)bp->b_data +
11325		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11326		freelink = dp2->di_freelink;
11327	}
11328	/*
11329	 * Leave this inodeblock dirty until it's in the list.
11330	 */
11331	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED) {
11332		struct inodedep *inon;
11333
11334		inon = TAILQ_NEXT(inodedep, id_unlinked);
11335		if ((inon == NULL && freelink == 0) ||
11336		    (inon && inon->id_ino == freelink)) {
11337			if (inon)
11338				inon->id_state |= UNLINKPREV;
11339			inodedep->id_state |= UNLINKNEXT;
11340		}
11341		hadchanges = 1;
11342	}
11343	/*
11344	 * If we had to rollback the inode allocation because of
11345	 * bitmaps being incomplete, then simply restore it.
11346	 * Keep the block dirty so that it will not be reclaimed until
11347	 * all associated dependencies have been cleared and the
11348	 * corresponding updates written to disk.
11349	 */
11350	if (inodedep->id_savedino1 != NULL) {
11351		hadchanges = 1;
11352		if (fstype == UFS1)
11353			*dp1 = *inodedep->id_savedino1;
11354		else
11355			*dp2 = *inodedep->id_savedino2;
11356		free(inodedep->id_savedino1, M_SAVEDINO);
11357		inodedep->id_savedino1 = NULL;
11358		if ((bp->b_flags & B_DELWRI) == 0)
11359			stat_inode_bitmap++;
11360		bdirty(bp);
11361		/*
11362		 * If the inode is clear here and GOINGAWAY it will never
11363		 * be written.  Process the bufwait and clear any pending
11364		 * work which may include the freefile.
11365		 */
11366		if (inodedep->id_state & GOINGAWAY)
11367			goto bufwait;
11368		return (1);
11369	}
11370	inodedep->id_state |= COMPLETE;
11371	/*
11372	 * Roll forward anything that had to be rolled back before
11373	 * the inode could be updated.
11374	 */
11375	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11376		nextadp = TAILQ_NEXT(adp, ad_next);
11377		if (adp->ad_state & ATTACHED)
11378			panic("handle_written_inodeblock: new entry");
11379		if (fstype == UFS1) {
11380			if (adp->ad_offset < NDADDR) {
11381				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11382					panic("%s %s #%jd mismatch %d != %jd",
11383					    "handle_written_inodeblock:",
11384					    "direct pointer",
11385					    (intmax_t)adp->ad_offset,
11386					    dp1->di_db[adp->ad_offset],
11387					    (intmax_t)adp->ad_oldblkno);
11388				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11389			} else {
11390				if (dp1->di_ib[adp->ad_offset - NDADDR] != 0)
11391					panic("%s: %s #%jd allocated as %d",
11392					    "handle_written_inodeblock",
11393					    "indirect pointer",
11394					    (intmax_t)adp->ad_offset - NDADDR,
11395					    dp1->di_ib[adp->ad_offset - NDADDR]);
11396				dp1->di_ib[adp->ad_offset - NDADDR] =
11397				    adp->ad_newblkno;
11398			}
11399		} else {
11400			if (adp->ad_offset < NDADDR) {
11401				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11402					panic("%s: %s #%jd %s %jd != %jd",
11403					    "handle_written_inodeblock",
11404					    "direct pointer",
11405					    (intmax_t)adp->ad_offset, "mismatch",
11406					    (intmax_t)dp2->di_db[adp->ad_offset],
11407					    (intmax_t)adp->ad_oldblkno);
11408				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11409			} else {
11410				if (dp2->di_ib[adp->ad_offset - NDADDR] != 0)
11411					panic("%s: %s #%jd allocated as %jd",
11412					    "handle_written_inodeblock",
11413					    "indirect pointer",
11414					    (intmax_t)adp->ad_offset - NDADDR,
11415					    (intmax_t)
11416					    dp2->di_ib[adp->ad_offset - NDADDR]);
11417				dp2->di_ib[adp->ad_offset - NDADDR] =
11418				    adp->ad_newblkno;
11419			}
11420		}
11421		adp->ad_state &= ~UNDONE;
11422		adp->ad_state |= ATTACHED;
11423		hadchanges = 1;
11424	}
11425	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11426		nextadp = TAILQ_NEXT(adp, ad_next);
11427		if (adp->ad_state & ATTACHED)
11428			panic("handle_written_inodeblock: new entry");
11429		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11430			panic("%s: direct pointers #%jd %s %jd != %jd",
11431			    "handle_written_inodeblock",
11432			    (intmax_t)adp->ad_offset, "mismatch",
11433			    (intmax_t)dp2->di_extb[adp->ad_offset],
11434			    (intmax_t)adp->ad_oldblkno);
11435		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11436		adp->ad_state &= ~UNDONE;
11437		adp->ad_state |= ATTACHED;
11438		hadchanges = 1;
11439	}
11440	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11441		stat_direct_blk_ptrs++;
11442	/*
11443	 * Reset the file size to its most up-to-date value.
11444	 */
11445	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11446		panic("handle_written_inodeblock: bad size");
11447	if (inodedep->id_savednlink > LINK_MAX)
11448		panic("handle_written_inodeblock: Invalid link count "
11449		    "%d for inodedep %p", inodedep->id_savednlink, inodedep);
11450	if (fstype == UFS1) {
11451		if (dp1->di_nlink != inodedep->id_savednlink) {
11452			dp1->di_nlink = inodedep->id_savednlink;
11453			hadchanges = 1;
11454		}
11455		if (dp1->di_size != inodedep->id_savedsize) {
11456			dp1->di_size = inodedep->id_savedsize;
11457			hadchanges = 1;
11458		}
11459	} else {
11460		if (dp2->di_nlink != inodedep->id_savednlink) {
11461			dp2->di_nlink = inodedep->id_savednlink;
11462			hadchanges = 1;
11463		}
11464		if (dp2->di_size != inodedep->id_savedsize) {
11465			dp2->di_size = inodedep->id_savedsize;
11466			hadchanges = 1;
11467		}
11468		if (dp2->di_extsize != inodedep->id_savedextsize) {
11469			dp2->di_extsize = inodedep->id_savedextsize;
11470			hadchanges = 1;
11471		}
11472	}
11473	inodedep->id_savedsize = -1;
11474	inodedep->id_savedextsize = -1;
11475	inodedep->id_savednlink = -1;
11476	/*
11477	 * If there were any rollbacks in the inode block, then it must be
11478	 * marked dirty so that its will eventually get written back in
11479	 * its correct form.
11480	 */
11481	if (hadchanges)
11482		bdirty(bp);
11483bufwait:
11484	/*
11485	 * Process any allocdirects that completed during the update.
11486	 */
11487	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
11488		handle_allocdirect_partdone(adp, &wkhd);
11489	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
11490		handle_allocdirect_partdone(adp, &wkhd);
11491	/*
11492	 * Process deallocations that were held pending until the
11493	 * inode had been written to disk. Freeing of the inode
11494	 * is delayed until after all blocks have been freed to
11495	 * avoid creation of new <vfsid, inum, lbn> triples
11496	 * before the old ones have been deleted.  Completely
11497	 * unlinked inodes are not processed until the unlinked
11498	 * inode list is written or the last reference is removed.
11499	 */
11500	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
11501		freefile = handle_bufwait(inodedep, NULL);
11502		if (freefile && !LIST_EMPTY(&wkhd)) {
11503			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
11504			freefile = NULL;
11505		}
11506	}
11507	/*
11508	 * Move rolled forward dependency completions to the bufwait list
11509	 * now that those that were already written have been processed.
11510	 */
11511	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
11512		panic("handle_written_inodeblock: bufwait but no changes");
11513	jwork_move(&inodedep->id_bufwait, &wkhd);
11514
11515	if (freefile != NULL) {
11516		/*
11517		 * If the inode is goingaway it was never written.  Fake up
11518		 * the state here so free_inodedep() can succeed.
11519		 */
11520		if (inodedep->id_state & GOINGAWAY)
11521			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
11522		if (free_inodedep(inodedep) == 0)
11523			panic("handle_written_inodeblock: live inodedep %p",
11524			    inodedep);
11525		add_to_worklist(&freefile->fx_list, 0);
11526		return (0);
11527	}
11528
11529	/*
11530	 * If no outstanding dependencies, free it.
11531	 */
11532	if (free_inodedep(inodedep) ||
11533	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
11534	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
11535	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
11536	     LIST_FIRST(&inodedep->id_bufwait) == 0))
11537		return (0);
11538	return (hadchanges);
11539}
11540
11541static int
11542handle_written_indirdep(indirdep, bp, bpp)
11543	struct indirdep *indirdep;
11544	struct buf *bp;
11545	struct buf **bpp;
11546{
11547	struct allocindir *aip;
11548	struct buf *sbp;
11549	int chgs;
11550
11551	if (indirdep->ir_state & GOINGAWAY)
11552		panic("handle_written_indirdep: indirdep gone");
11553	if ((indirdep->ir_state & IOSTARTED) == 0)
11554		panic("handle_written_indirdep: IO not started");
11555	chgs = 0;
11556	/*
11557	 * If there were rollbacks revert them here.
11558	 */
11559	if (indirdep->ir_saveddata) {
11560		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
11561		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11562			free(indirdep->ir_saveddata, M_INDIRDEP);
11563			indirdep->ir_saveddata = NULL;
11564		}
11565		chgs = 1;
11566	}
11567	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
11568	indirdep->ir_state |= ATTACHED;
11569	/*
11570	 * Move allocindirs with written pointers to the completehd if
11571	 * the indirdep's pointer is not yet written.  Otherwise
11572	 * free them here.
11573	 */
11574	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) {
11575		LIST_REMOVE(aip, ai_next);
11576		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11577			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
11578			    ai_next);
11579			newblk_freefrag(&aip->ai_block);
11580			continue;
11581		}
11582		free_newblk(&aip->ai_block);
11583	}
11584	/*
11585	 * Move allocindirs that have finished dependency processing from
11586	 * the done list to the write list after updating the pointers.
11587	 */
11588	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
11589		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
11590			handle_allocindir_partdone(aip);
11591			if (aip == LIST_FIRST(&indirdep->ir_donehd))
11592				panic("disk_write_complete: not gone");
11593			chgs = 1;
11594		}
11595	}
11596	/*
11597	 * Preserve the indirdep if there were any changes or if it is not
11598	 * yet valid on disk.
11599	 */
11600	if (chgs) {
11601		stat_indir_blk_ptrs++;
11602		bdirty(bp);
11603		return (1);
11604	}
11605	/*
11606	 * If there were no changes we can discard the savedbp and detach
11607	 * ourselves from the buf.  We are only carrying completed pointers
11608	 * in this case.
11609	 */
11610	sbp = indirdep->ir_savebp;
11611	sbp->b_flags |= B_INVAL | B_NOCACHE;
11612	indirdep->ir_savebp = NULL;
11613	indirdep->ir_bp = NULL;
11614	if (*bpp != NULL)
11615		panic("handle_written_indirdep: bp already exists.");
11616	*bpp = sbp;
11617	/*
11618	 * The indirdep may not be freed until its parent points at it.
11619	 */
11620	if (indirdep->ir_state & DEPCOMPLETE)
11621		free_indirdep(indirdep);
11622
11623	return (0);
11624}
11625
11626/*
11627 * Process a diradd entry after its dependent inode has been written.
11628 * This routine must be called with splbio interrupts blocked.
11629 */
11630static void
11631diradd_inode_written(dap, inodedep)
11632	struct diradd *dap;
11633	struct inodedep *inodedep;
11634{
11635
11636	dap->da_state |= COMPLETE;
11637	complete_diradd(dap);
11638	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
11639}
11640
11641/*
11642 * Returns true if the bmsafemap will have rollbacks when written.  Must only
11643 * be called with the per-filesystem lock and the buf lock on the cg held.
11644 */
11645static int
11646bmsafemap_backgroundwrite(bmsafemap, bp)
11647	struct bmsafemap *bmsafemap;
11648	struct buf *bp;
11649{
11650	int dirty;
11651
11652	LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
11653	dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
11654	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
11655	/*
11656	 * If we're initiating a background write we need to process the
11657	 * rollbacks as they exist now, not as they exist when IO starts.
11658	 * No other consumers will look at the contents of the shadowed
11659	 * buf so this is safe to do here.
11660	 */
11661	if (bp->b_xflags & BX_BKGRDMARKER)
11662		initiate_write_bmsafemap(bmsafemap, bp);
11663
11664	return (dirty);
11665}
11666
11667/*
11668 * Re-apply an allocation when a cg write is complete.
11669 */
11670static int
11671jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
11672	struct jnewblk *jnewblk;
11673	struct fs *fs;
11674	struct cg *cgp;
11675	uint8_t *blksfree;
11676{
11677	ufs1_daddr_t fragno;
11678	ufs2_daddr_t blkno;
11679	long cgbno, bbase;
11680	int frags, blk;
11681	int i;
11682
11683	frags = 0;
11684	cgbno = dtogd(fs, jnewblk->jn_blkno);
11685	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
11686		if (isclr(blksfree, cgbno + i))
11687			panic("jnewblk_rollforward: re-allocated fragment");
11688		frags++;
11689	}
11690	if (frags == fs->fs_frag) {
11691		blkno = fragstoblks(fs, cgbno);
11692		ffs_clrblock(fs, blksfree, (long)blkno);
11693		ffs_clusteracct(fs, cgp, blkno, -1);
11694		cgp->cg_cs.cs_nbfree--;
11695	} else {
11696		bbase = cgbno - fragnum(fs, cgbno);
11697		cgbno += jnewblk->jn_oldfrags;
11698                /* If a complete block had been reassembled, account for it. */
11699		fragno = fragstoblks(fs, bbase);
11700		if (ffs_isblock(fs, blksfree, fragno)) {
11701			cgp->cg_cs.cs_nffree += fs->fs_frag;
11702			ffs_clusteracct(fs, cgp, fragno, -1);
11703			cgp->cg_cs.cs_nbfree--;
11704		}
11705		/* Decrement the old frags.  */
11706		blk = blkmap(fs, blksfree, bbase);
11707		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11708		/* Allocate the fragment */
11709		for (i = 0; i < frags; i++)
11710			clrbit(blksfree, cgbno + i);
11711		cgp->cg_cs.cs_nffree -= frags;
11712		/* Add back in counts associated with the new frags */
11713		blk = blkmap(fs, blksfree, bbase);
11714		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11715	}
11716	return (frags);
11717}
11718
11719/*
11720 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
11721 * changes if it's not a background write.  Set all written dependencies
11722 * to DEPCOMPLETE and free the structure if possible.
11723 */
11724static int
11725handle_written_bmsafemap(bmsafemap, bp)
11726	struct bmsafemap *bmsafemap;
11727	struct buf *bp;
11728{
11729	struct newblk *newblk;
11730	struct inodedep *inodedep;
11731	struct jaddref *jaddref, *jatmp;
11732	struct jnewblk *jnewblk, *jntmp;
11733	struct ufsmount *ump;
11734	uint8_t *inosused;
11735	uint8_t *blksfree;
11736	struct cg *cgp;
11737	struct fs *fs;
11738	ino_t ino;
11739	int foreground;
11740	int chgs;
11741
11742	if ((bmsafemap->sm_state & IOSTARTED) == 0)
11743		panic("initiate_write_bmsafemap: Not started\n");
11744	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
11745	chgs = 0;
11746	bmsafemap->sm_state &= ~IOSTARTED;
11747	foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
11748	/*
11749	 * Release journal work that was waiting on the write.
11750	 */
11751	handle_jwork(&bmsafemap->sm_freewr);
11752
11753	/*
11754	 * Restore unwritten inode allocation pending jaddref writes.
11755	 */
11756	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
11757		cgp = (struct cg *)bp->b_data;
11758		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11759		inosused = cg_inosused(cgp);
11760		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
11761		    ja_bmdeps, jatmp) {
11762			if ((jaddref->ja_state & UNDONE) == 0)
11763				continue;
11764			ino = jaddref->ja_ino % fs->fs_ipg;
11765			if (isset(inosused, ino))
11766				panic("handle_written_bmsafemap: "
11767				    "re-allocated inode");
11768			/* Do the roll-forward only if it's a real copy. */
11769			if (foreground) {
11770				if ((jaddref->ja_mode & IFMT) == IFDIR)
11771					cgp->cg_cs.cs_ndir++;
11772				cgp->cg_cs.cs_nifree--;
11773				setbit(inosused, ino);
11774				chgs = 1;
11775			}
11776			jaddref->ja_state &= ~UNDONE;
11777			jaddref->ja_state |= ATTACHED;
11778			free_jaddref(jaddref);
11779		}
11780	}
11781	/*
11782	 * Restore any block allocations which are pending journal writes.
11783	 */
11784	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11785		cgp = (struct cg *)bp->b_data;
11786		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11787		blksfree = cg_blksfree(cgp);
11788		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
11789		    jntmp) {
11790			if ((jnewblk->jn_state & UNDONE) == 0)
11791				continue;
11792			/* Do the roll-forward only if it's a real copy. */
11793			if (foreground &&
11794			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
11795				chgs = 1;
11796			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
11797			jnewblk->jn_state |= ATTACHED;
11798			free_jnewblk(jnewblk);
11799		}
11800	}
11801	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
11802		newblk->nb_state |= DEPCOMPLETE;
11803		newblk->nb_state &= ~ONDEPLIST;
11804		newblk->nb_bmsafemap = NULL;
11805		LIST_REMOVE(newblk, nb_deps);
11806		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
11807			handle_allocdirect_partdone(
11808			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
11809		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
11810			handle_allocindir_partdone(
11811			    WK_ALLOCINDIR(&newblk->nb_list));
11812		else if (newblk->nb_list.wk_type != D_NEWBLK)
11813			panic("handle_written_bmsafemap: Unexpected type: %s",
11814			    TYPENAME(newblk->nb_list.wk_type));
11815	}
11816	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
11817		inodedep->id_state |= DEPCOMPLETE;
11818		inodedep->id_state &= ~ONDEPLIST;
11819		LIST_REMOVE(inodedep, id_deps);
11820		inodedep->id_bmsafemap = NULL;
11821	}
11822	LIST_REMOVE(bmsafemap, sm_next);
11823	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
11824	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
11825	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
11826	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
11827	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
11828		LIST_REMOVE(bmsafemap, sm_hash);
11829		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
11830		return (0);
11831	}
11832	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
11833	if (foreground)
11834		bdirty(bp);
11835	return (1);
11836}
11837
11838/*
11839 * Try to free a mkdir dependency.
11840 */
11841static void
11842complete_mkdir(mkdir)
11843	struct mkdir *mkdir;
11844{
11845	struct diradd *dap;
11846
11847	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
11848		return;
11849	LIST_REMOVE(mkdir, md_mkdirs);
11850	dap = mkdir->md_diradd;
11851	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
11852	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
11853		dap->da_state |= DEPCOMPLETE;
11854		complete_diradd(dap);
11855	}
11856	WORKITEM_FREE(mkdir, D_MKDIR);
11857}
11858
11859/*
11860 * Handle the completion of a mkdir dependency.
11861 */
11862static void
11863handle_written_mkdir(mkdir, type)
11864	struct mkdir *mkdir;
11865	int type;
11866{
11867
11868	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
11869		panic("handle_written_mkdir: bad type");
11870	mkdir->md_state |= COMPLETE;
11871	complete_mkdir(mkdir);
11872}
11873
11874static int
11875free_pagedep(pagedep)
11876	struct pagedep *pagedep;
11877{
11878	int i;
11879
11880	if (pagedep->pd_state & NEWBLOCK)
11881		return (0);
11882	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
11883		return (0);
11884	for (i = 0; i < DAHASHSZ; i++)
11885		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
11886			return (0);
11887	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
11888		return (0);
11889	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
11890		return (0);
11891	if (pagedep->pd_state & ONWORKLIST)
11892		WORKLIST_REMOVE(&pagedep->pd_list);
11893	LIST_REMOVE(pagedep, pd_hash);
11894	WORKITEM_FREE(pagedep, D_PAGEDEP);
11895
11896	return (1);
11897}
11898
11899/*
11900 * Called from within softdep_disk_write_complete above.
11901 * A write operation was just completed. Removed inodes can
11902 * now be freed and associated block pointers may be committed.
11903 * Note that this routine is always called from interrupt level
11904 * with further splbio interrupts blocked.
11905 */
11906static int
11907handle_written_filepage(pagedep, bp)
11908	struct pagedep *pagedep;
11909	struct buf *bp;		/* buffer containing the written page */
11910{
11911	struct dirrem *dirrem;
11912	struct diradd *dap, *nextdap;
11913	struct direct *ep;
11914	int i, chgs;
11915
11916	if ((pagedep->pd_state & IOSTARTED) == 0)
11917		panic("handle_written_filepage: not started");
11918	pagedep->pd_state &= ~IOSTARTED;
11919	/*
11920	 * Process any directory removals that have been committed.
11921	 */
11922	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
11923		LIST_REMOVE(dirrem, dm_next);
11924		dirrem->dm_state |= COMPLETE;
11925		dirrem->dm_dirinum = pagedep->pd_ino;
11926		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
11927		    ("handle_written_filepage: Journal entries not written."));
11928		add_to_worklist(&dirrem->dm_list, 0);
11929	}
11930	/*
11931	 * Free any directory additions that have been committed.
11932	 * If it is a newly allocated block, we have to wait until
11933	 * the on-disk directory inode claims the new block.
11934	 */
11935	if ((pagedep->pd_state & NEWBLOCK) == 0)
11936		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
11937			free_diradd(dap, NULL);
11938	/*
11939	 * Uncommitted directory entries must be restored.
11940	 */
11941	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
11942		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
11943		     dap = nextdap) {
11944			nextdap = LIST_NEXT(dap, da_pdlist);
11945			if (dap->da_state & ATTACHED)
11946				panic("handle_written_filepage: attached");
11947			ep = (struct direct *)
11948			    ((char *)bp->b_data + dap->da_offset);
11949			ep->d_ino = dap->da_newinum;
11950			dap->da_state &= ~UNDONE;
11951			dap->da_state |= ATTACHED;
11952			chgs = 1;
11953			/*
11954			 * If the inode referenced by the directory has
11955			 * been written out, then the dependency can be
11956			 * moved to the pending list.
11957			 */
11958			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
11959				LIST_REMOVE(dap, da_pdlist);
11960				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
11961				    da_pdlist);
11962			}
11963		}
11964	}
11965	/*
11966	 * If there were any rollbacks in the directory, then it must be
11967	 * marked dirty so that its will eventually get written back in
11968	 * its correct form.
11969	 */
11970	if (chgs) {
11971		if ((bp->b_flags & B_DELWRI) == 0)
11972			stat_dir_entry++;
11973		bdirty(bp);
11974		return (1);
11975	}
11976	/*
11977	 * If we are not waiting for a new directory block to be
11978	 * claimed by its inode, then the pagedep will be freed.
11979	 * Otherwise it will remain to track any new entries on
11980	 * the page in case they are fsync'ed.
11981	 */
11982	free_pagedep(pagedep);
11983	return (0);
11984}
11985
11986/*
11987 * Writing back in-core inode structures.
11988 *
11989 * The filesystem only accesses an inode's contents when it occupies an
11990 * "in-core" inode structure.  These "in-core" structures are separate from
11991 * the page frames used to cache inode blocks.  Only the latter are
11992 * transferred to/from the disk.  So, when the updated contents of the
11993 * "in-core" inode structure are copied to the corresponding in-memory inode
11994 * block, the dependencies are also transferred.  The following procedure is
11995 * called when copying a dirty "in-core" inode to a cached inode block.
11996 */
11997
11998/*
11999 * Called when an inode is loaded from disk. If the effective link count
12000 * differed from the actual link count when it was last flushed, then we
12001 * need to ensure that the correct effective link count is put back.
12002 */
12003void
12004softdep_load_inodeblock(ip)
12005	struct inode *ip;	/* the "in_core" copy of the inode */
12006{
12007	struct inodedep *inodedep;
12008
12009	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
12010	    ("softdep_load_inodeblock called on non-softdep filesystem"));
12011	/*
12012	 * Check for alternate nlink count.
12013	 */
12014	ip->i_effnlink = ip->i_nlink;
12015	ACQUIRE_LOCK(ip->i_ump);
12016	if (inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, 0,
12017	    &inodedep) == 0) {
12018		FREE_LOCK(ip->i_ump);
12019		return;
12020	}
12021	ip->i_effnlink -= inodedep->id_nlinkdelta;
12022	FREE_LOCK(ip->i_ump);
12023}
12024
12025/*
12026 * This routine is called just before the "in-core" inode
12027 * information is to be copied to the in-memory inode block.
12028 * Recall that an inode block contains several inodes. If
12029 * the force flag is set, then the dependencies will be
12030 * cleared so that the update can always be made. Note that
12031 * the buffer is locked when this routine is called, so we
12032 * will never be in the middle of writing the inode block
12033 * to disk.
12034 */
12035void
12036softdep_update_inodeblock(ip, bp, waitfor)
12037	struct inode *ip;	/* the "in_core" copy of the inode */
12038	struct buf *bp;		/* the buffer containing the inode block */
12039	int waitfor;		/* nonzero => update must be allowed */
12040{
12041	struct inodedep *inodedep;
12042	struct inoref *inoref;
12043	struct ufsmount *ump;
12044	struct worklist *wk;
12045	struct mount *mp;
12046	struct buf *ibp;
12047	struct fs *fs;
12048	int error;
12049
12050	ump = ip->i_ump;
12051	mp = UFSTOVFS(ump);
12052	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
12053	    ("softdep_update_inodeblock called on non-softdep filesystem"));
12054	fs = ip->i_fs;
12055	/*
12056	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
12057	 * does not have access to the in-core ip so must write directly into
12058	 * the inode block buffer when setting freelink.
12059	 */
12060	if (fs->fs_magic == FS_UFS1_MAGIC)
12061		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
12062		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12063	else
12064		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
12065		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12066	/*
12067	 * If the effective link count is not equal to the actual link
12068	 * count, then we must track the difference in an inodedep while
12069	 * the inode is (potentially) tossed out of the cache. Otherwise,
12070	 * if there is no existing inodedep, then there are no dependencies
12071	 * to track.
12072	 */
12073	ACQUIRE_LOCK(ump);
12074again:
12075	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12076		FREE_LOCK(ump);
12077		if (ip->i_effnlink != ip->i_nlink)
12078			panic("softdep_update_inodeblock: bad link count");
12079		return;
12080	}
12081	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
12082		panic("softdep_update_inodeblock: bad delta");
12083	/*
12084	 * If we're flushing all dependencies we must also move any waiting
12085	 * for journal writes onto the bufwait list prior to I/O.
12086	 */
12087	if (waitfor) {
12088		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12089			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12090			    == DEPCOMPLETE) {
12091				jwait(&inoref->if_list, MNT_WAIT);
12092				goto again;
12093			}
12094		}
12095	}
12096	/*
12097	 * Changes have been initiated. Anything depending on these
12098	 * changes cannot occur until this inode has been written.
12099	 */
12100	inodedep->id_state &= ~COMPLETE;
12101	if ((inodedep->id_state & ONWORKLIST) == 0)
12102		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
12103	/*
12104	 * Any new dependencies associated with the incore inode must
12105	 * now be moved to the list associated with the buffer holding
12106	 * the in-memory copy of the inode. Once merged process any
12107	 * allocdirects that are completed by the merger.
12108	 */
12109	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
12110	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
12111		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
12112		    NULL);
12113	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
12114	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
12115		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
12116		    NULL);
12117	/*
12118	 * Now that the inode has been pushed into the buffer, the
12119	 * operations dependent on the inode being written to disk
12120	 * can be moved to the id_bufwait so that they will be
12121	 * processed when the buffer I/O completes.
12122	 */
12123	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
12124		WORKLIST_REMOVE(wk);
12125		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
12126	}
12127	/*
12128	 * Newly allocated inodes cannot be written until the bitmap
12129	 * that allocates them have been written (indicated by
12130	 * DEPCOMPLETE being set in id_state). If we are doing a
12131	 * forced sync (e.g., an fsync on a file), we force the bitmap
12132	 * to be written so that the update can be done.
12133	 */
12134	if (waitfor == 0) {
12135		FREE_LOCK(ump);
12136		return;
12137	}
12138retry:
12139	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
12140		FREE_LOCK(ump);
12141		return;
12142	}
12143	ibp = inodedep->id_bmsafemap->sm_buf;
12144	ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
12145	if (ibp == NULL) {
12146		/*
12147		 * If ibp came back as NULL, the dependency could have been
12148		 * freed while we slept.  Look it up again, and check to see
12149		 * that it has completed.
12150		 */
12151		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
12152			goto retry;
12153		FREE_LOCK(ump);
12154		return;
12155	}
12156	FREE_LOCK(ump);
12157	if ((error = bwrite(ibp)) != 0)
12158		softdep_error("softdep_update_inodeblock: bwrite", error);
12159}
12160
12161/*
12162 * Merge the a new inode dependency list (such as id_newinoupdt) into an
12163 * old inode dependency list (such as id_inoupdt). This routine must be
12164 * called with splbio interrupts blocked.
12165 */
12166static void
12167merge_inode_lists(newlisthead, oldlisthead)
12168	struct allocdirectlst *newlisthead;
12169	struct allocdirectlst *oldlisthead;
12170{
12171	struct allocdirect *listadp, *newadp;
12172
12173	newadp = TAILQ_FIRST(newlisthead);
12174	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
12175		if (listadp->ad_offset < newadp->ad_offset) {
12176			listadp = TAILQ_NEXT(listadp, ad_next);
12177			continue;
12178		}
12179		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12180		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
12181		if (listadp->ad_offset == newadp->ad_offset) {
12182			allocdirect_merge(oldlisthead, newadp,
12183			    listadp);
12184			listadp = newadp;
12185		}
12186		newadp = TAILQ_FIRST(newlisthead);
12187	}
12188	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
12189		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12190		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
12191	}
12192}
12193
12194/*
12195 * If we are doing an fsync, then we must ensure that any directory
12196 * entries for the inode have been written after the inode gets to disk.
12197 */
12198int
12199softdep_fsync(vp)
12200	struct vnode *vp;	/* the "in_core" copy of the inode */
12201{
12202	struct inodedep *inodedep;
12203	struct pagedep *pagedep;
12204	struct inoref *inoref;
12205	struct ufsmount *ump;
12206	struct worklist *wk;
12207	struct diradd *dap;
12208	struct mount *mp;
12209	struct vnode *pvp;
12210	struct inode *ip;
12211	struct buf *bp;
12212	struct fs *fs;
12213	struct thread *td = curthread;
12214	int error, flushparent, pagedep_new_block;
12215	ino_t parentino;
12216	ufs_lbn_t lbn;
12217
12218	ip = VTOI(vp);
12219	fs = ip->i_fs;
12220	ump = ip->i_ump;
12221	mp = vp->v_mount;
12222	if (MOUNTEDSOFTDEP(mp) == 0)
12223		return (0);
12224	ACQUIRE_LOCK(ump);
12225restart:
12226	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12227		FREE_LOCK(ump);
12228		return (0);
12229	}
12230	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12231		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12232		    == DEPCOMPLETE) {
12233			jwait(&inoref->if_list, MNT_WAIT);
12234			goto restart;
12235		}
12236	}
12237	if (!LIST_EMPTY(&inodedep->id_inowait) ||
12238	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
12239	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
12240	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
12241	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
12242		panic("softdep_fsync: pending ops %p", inodedep);
12243	for (error = 0, flushparent = 0; ; ) {
12244		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
12245			break;
12246		if (wk->wk_type != D_DIRADD)
12247			panic("softdep_fsync: Unexpected type %s",
12248			    TYPENAME(wk->wk_type));
12249		dap = WK_DIRADD(wk);
12250		/*
12251		 * Flush our parent if this directory entry has a MKDIR_PARENT
12252		 * dependency or is contained in a newly allocated block.
12253		 */
12254		if (dap->da_state & DIRCHG)
12255			pagedep = dap->da_previous->dm_pagedep;
12256		else
12257			pagedep = dap->da_pagedep;
12258		parentino = pagedep->pd_ino;
12259		lbn = pagedep->pd_lbn;
12260		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12261			panic("softdep_fsync: dirty");
12262		if ((dap->da_state & MKDIR_PARENT) ||
12263		    (pagedep->pd_state & NEWBLOCK))
12264			flushparent = 1;
12265		else
12266			flushparent = 0;
12267		/*
12268		 * If we are being fsync'ed as part of vgone'ing this vnode,
12269		 * then we will not be able to release and recover the
12270		 * vnode below, so we just have to give up on writing its
12271		 * directory entry out. It will eventually be written, just
12272		 * not now, but then the user was not asking to have it
12273		 * written, so we are not breaking any promises.
12274		 */
12275		if (vp->v_iflag & VI_DOOMED)
12276			break;
12277		/*
12278		 * We prevent deadlock by always fetching inodes from the
12279		 * root, moving down the directory tree. Thus, when fetching
12280		 * our parent directory, we first try to get the lock. If
12281		 * that fails, we must unlock ourselves before requesting
12282		 * the lock on our parent. See the comment in ufs_lookup
12283		 * for details on possible races.
12284		 */
12285		FREE_LOCK(ump);
12286		if (ffs_vgetf(mp, parentino, LK_NOWAIT | LK_EXCLUSIVE, &pvp,
12287		    FFSV_FORCEINSMQ)) {
12288			error = vfs_busy(mp, MBF_NOWAIT);
12289			if (error != 0) {
12290				vfs_ref(mp);
12291				VOP_UNLOCK(vp, 0);
12292				error = vfs_busy(mp, 0);
12293				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12294				vfs_rel(mp);
12295				if (error != 0)
12296					return (ENOENT);
12297				if (vp->v_iflag & VI_DOOMED) {
12298					vfs_unbusy(mp);
12299					return (ENOENT);
12300				}
12301			}
12302			VOP_UNLOCK(vp, 0);
12303			error = ffs_vgetf(mp, parentino, LK_EXCLUSIVE,
12304			    &pvp, FFSV_FORCEINSMQ);
12305			vfs_unbusy(mp);
12306			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
12307			if (vp->v_iflag & VI_DOOMED) {
12308				if (error == 0)
12309					vput(pvp);
12310				error = ENOENT;
12311			}
12312			if (error != 0)
12313				return (error);
12314		}
12315		/*
12316		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12317		 * that are contained in direct blocks will be resolved by
12318		 * doing a ffs_update. Pagedeps contained in indirect blocks
12319		 * may require a complete sync'ing of the directory. So, we
12320		 * try the cheap and fast ffs_update first, and if that fails,
12321		 * then we do the slower ffs_syncvnode of the directory.
12322		 */
12323		if (flushparent) {
12324			int locked;
12325
12326			if ((error = ffs_update(pvp, 1)) != 0) {
12327				vput(pvp);
12328				return (error);
12329			}
12330			ACQUIRE_LOCK(ump);
12331			locked = 1;
12332			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12333				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12334					if (wk->wk_type != D_DIRADD)
12335						panic("softdep_fsync: Unexpected type %s",
12336						      TYPENAME(wk->wk_type));
12337					dap = WK_DIRADD(wk);
12338					if (dap->da_state & DIRCHG)
12339						pagedep = dap->da_previous->dm_pagedep;
12340					else
12341						pagedep = dap->da_pagedep;
12342					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12343					FREE_LOCK(ump);
12344					locked = 0;
12345					if (pagedep_new_block && (error =
12346					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12347						vput(pvp);
12348						return (error);
12349					}
12350				}
12351			}
12352			if (locked)
12353				FREE_LOCK(ump);
12354		}
12355		/*
12356		 * Flush directory page containing the inode's name.
12357		 */
12358		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12359		    &bp);
12360		if (error == 0)
12361			error = bwrite(bp);
12362		else
12363			brelse(bp);
12364		vput(pvp);
12365		if (error != 0)
12366			return (error);
12367		ACQUIRE_LOCK(ump);
12368		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12369			break;
12370	}
12371	FREE_LOCK(ump);
12372	return (0);
12373}
12374
12375/*
12376 * Flush all the dirty bitmaps associated with the block device
12377 * before flushing the rest of the dirty blocks so as to reduce
12378 * the number of dependencies that will have to be rolled back.
12379 *
12380 * XXX Unused?
12381 */
12382void
12383softdep_fsync_mountdev(vp)
12384	struct vnode *vp;
12385{
12386	struct buf *bp, *nbp;
12387	struct worklist *wk;
12388	struct bufobj *bo;
12389
12390	if (!vn_isdisk(vp, NULL))
12391		panic("softdep_fsync_mountdev: vnode not a disk");
12392	bo = &vp->v_bufobj;
12393restart:
12394	BO_LOCK(bo);
12395	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
12396		/*
12397		 * If it is already scheduled, skip to the next buffer.
12398		 */
12399		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
12400			continue;
12401
12402		if ((bp->b_flags & B_DELWRI) == 0)
12403			panic("softdep_fsync_mountdev: not dirty");
12404		/*
12405		 * We are only interested in bitmaps with outstanding
12406		 * dependencies.
12407		 */
12408		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
12409		    wk->wk_type != D_BMSAFEMAP ||
12410		    (bp->b_vflags & BV_BKGRDINPROG)) {
12411			BUF_UNLOCK(bp);
12412			continue;
12413		}
12414		BO_UNLOCK(bo);
12415		bremfree(bp);
12416		(void) bawrite(bp);
12417		goto restart;
12418	}
12419	drain_output(vp);
12420	BO_UNLOCK(bo);
12421}
12422
12423/*
12424 * Sync all cylinder groups that were dirty at the time this function is
12425 * called.  Newly dirtied cgs will be inserted before the sentinel.  This
12426 * is used to flush freedep activity that may be holding up writes to a
12427 * indirect block.
12428 */
12429static int
12430sync_cgs(mp, waitfor)
12431	struct mount *mp;
12432	int waitfor;
12433{
12434	struct bmsafemap *bmsafemap;
12435	struct bmsafemap *sentinel;
12436	struct ufsmount *ump;
12437	struct buf *bp;
12438	int error;
12439
12440	sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
12441	sentinel->sm_cg = -1;
12442	ump = VFSTOUFS(mp);
12443	error = 0;
12444	ACQUIRE_LOCK(ump);
12445	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
12446	for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
12447	    bmsafemap = LIST_NEXT(sentinel, sm_next)) {
12448		/* Skip sentinels and cgs with no work to release. */
12449		if (bmsafemap->sm_cg == -1 ||
12450		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
12451		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
12452			LIST_REMOVE(sentinel, sm_next);
12453			LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12454			continue;
12455		}
12456		/*
12457		 * If we don't get the lock and we're waiting try again, if
12458		 * not move on to the next buf and try to sync it.
12459		 */
12460		bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
12461		if (bp == NULL && waitfor == MNT_WAIT)
12462			continue;
12463		LIST_REMOVE(sentinel, sm_next);
12464		LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
12465		if (bp == NULL)
12466			continue;
12467		FREE_LOCK(ump);
12468		if (waitfor == MNT_NOWAIT)
12469			bawrite(bp);
12470		else
12471			error = bwrite(bp);
12472		ACQUIRE_LOCK(ump);
12473		if (error)
12474			break;
12475	}
12476	LIST_REMOVE(sentinel, sm_next);
12477	FREE_LOCK(ump);
12478	free(sentinel, M_BMSAFEMAP);
12479	return (error);
12480}
12481
12482/*
12483 * This routine is called when we are trying to synchronously flush a
12484 * file. This routine must eliminate any filesystem metadata dependencies
12485 * so that the syncing routine can succeed.
12486 */
12487int
12488softdep_sync_metadata(struct vnode *vp)
12489{
12490	struct inode *ip;
12491	int error;
12492
12493	ip = VTOI(vp);
12494	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
12495	    ("softdep_sync_metadata called on non-softdep filesystem"));
12496	/*
12497	 * Ensure that any direct block dependencies have been cleared,
12498	 * truncations are started, and inode references are journaled.
12499	 */
12500	ACQUIRE_LOCK(ip->i_ump);
12501	/*
12502	 * Write all journal records to prevent rollbacks on devvp.
12503	 */
12504	if (vp->v_type == VCHR)
12505		softdep_flushjournal(vp->v_mount);
12506	error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
12507	/*
12508	 * Ensure that all truncates are written so we won't find deps on
12509	 * indirect blocks.
12510	 */
12511	process_truncates(vp);
12512	FREE_LOCK(ip->i_ump);
12513
12514	return (error);
12515}
12516
12517/*
12518 * This routine is called when we are attempting to sync a buf with
12519 * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
12520 * other IO it can but returns EBUSY if the buffer is not yet able to
12521 * be written.  Dependencies which will not cause rollbacks will always
12522 * return 0.
12523 */
12524int
12525softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
12526{
12527	struct indirdep *indirdep;
12528	struct pagedep *pagedep;
12529	struct allocindir *aip;
12530	struct newblk *newblk;
12531	struct ufsmount *ump;
12532	struct buf *nbp;
12533	struct worklist *wk;
12534	int i, error;
12535
12536	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
12537	    ("softdep_sync_buf called on non-softdep filesystem"));
12538	/*
12539	 * For VCHR we just don't want to force flush any dependencies that
12540	 * will cause rollbacks.
12541	 */
12542	if (vp->v_type == VCHR) {
12543		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
12544			return (EBUSY);
12545		return (0);
12546	}
12547	ump = VTOI(vp)->i_ump;
12548	ACQUIRE_LOCK(ump);
12549	/*
12550	 * As we hold the buffer locked, none of its dependencies
12551	 * will disappear.
12552	 */
12553	error = 0;
12554top:
12555	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
12556		switch (wk->wk_type) {
12557
12558		case D_ALLOCDIRECT:
12559		case D_ALLOCINDIR:
12560			newblk = WK_NEWBLK(wk);
12561			if (newblk->nb_jnewblk != NULL) {
12562				if (waitfor == MNT_NOWAIT) {
12563					error = EBUSY;
12564					goto out_unlock;
12565				}
12566				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
12567				goto top;
12568			}
12569			if (newblk->nb_state & DEPCOMPLETE ||
12570			    waitfor == MNT_NOWAIT)
12571				continue;
12572			nbp = newblk->nb_bmsafemap->sm_buf;
12573			nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12574			if (nbp == NULL)
12575				goto top;
12576			FREE_LOCK(ump);
12577			if ((error = bwrite(nbp)) != 0)
12578				goto out;
12579			ACQUIRE_LOCK(ump);
12580			continue;
12581
12582		case D_INDIRDEP:
12583			indirdep = WK_INDIRDEP(wk);
12584			if (waitfor == MNT_NOWAIT) {
12585				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
12586				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
12587					error = EBUSY;
12588					goto out_unlock;
12589				}
12590			}
12591			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
12592				panic("softdep_sync_buf: truncation pending.");
12593		restart:
12594			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
12595				newblk = (struct newblk *)aip;
12596				if (newblk->nb_jnewblk != NULL) {
12597					jwait(&newblk->nb_jnewblk->jn_list,
12598					    waitfor);
12599					goto restart;
12600				}
12601				if (newblk->nb_state & DEPCOMPLETE)
12602					continue;
12603				nbp = newblk->nb_bmsafemap->sm_buf;
12604				nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
12605				if (nbp == NULL)
12606					goto restart;
12607				FREE_LOCK(ump);
12608				if ((error = bwrite(nbp)) != 0)
12609					goto out;
12610				ACQUIRE_LOCK(ump);
12611				goto restart;
12612			}
12613			continue;
12614
12615		case D_PAGEDEP:
12616			/*
12617			 * Only flush directory entries in synchronous passes.
12618			 */
12619			if (waitfor != MNT_WAIT) {
12620				error = EBUSY;
12621				goto out_unlock;
12622			}
12623			/*
12624			 * While syncing snapshots, we must allow recursive
12625			 * lookups.
12626			 */
12627			BUF_AREC(bp);
12628			/*
12629			 * We are trying to sync a directory that may
12630			 * have dependencies on both its own metadata
12631			 * and/or dependencies on the inodes of any
12632			 * recently allocated files. We walk its diradd
12633			 * lists pushing out the associated inode.
12634			 */
12635			pagedep = WK_PAGEDEP(wk);
12636			for (i = 0; i < DAHASHSZ; i++) {
12637				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
12638					continue;
12639				if ((error = flush_pagedep_deps(vp, wk->wk_mp,
12640				    &pagedep->pd_diraddhd[i]))) {
12641					BUF_NOREC(bp);
12642					goto out_unlock;
12643				}
12644			}
12645			BUF_NOREC(bp);
12646			continue;
12647
12648		case D_FREEWORK:
12649		case D_FREEDEP:
12650		case D_JSEGDEP:
12651		case D_JNEWBLK:
12652			continue;
12653
12654		default:
12655			panic("softdep_sync_buf: Unknown type %s",
12656			    TYPENAME(wk->wk_type));
12657			/* NOTREACHED */
12658		}
12659	}
12660out_unlock:
12661	FREE_LOCK(ump);
12662out:
12663	return (error);
12664}
12665
12666/*
12667 * Flush the dependencies associated with an inodedep.
12668 * Called with splbio blocked.
12669 */
12670static int
12671flush_inodedep_deps(vp, mp, ino)
12672	struct vnode *vp;
12673	struct mount *mp;
12674	ino_t ino;
12675{
12676	struct inodedep *inodedep;
12677	struct inoref *inoref;
12678	struct ufsmount *ump;
12679	int error, waitfor;
12680
12681	/*
12682	 * This work is done in two passes. The first pass grabs most
12683	 * of the buffers and begins asynchronously writing them. The
12684	 * only way to wait for these asynchronous writes is to sleep
12685	 * on the filesystem vnode which may stay busy for a long time
12686	 * if the filesystem is active. So, instead, we make a second
12687	 * pass over the dependencies blocking on each write. In the
12688	 * usual case we will be blocking against a write that we
12689	 * initiated, so when it is done the dependency will have been
12690	 * resolved. Thus the second pass is expected to end quickly.
12691	 * We give a brief window at the top of the loop to allow
12692	 * any pending I/O to complete.
12693	 */
12694	ump = VFSTOUFS(mp);
12695	LOCK_OWNED(ump);
12696	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
12697		if (error)
12698			return (error);
12699		FREE_LOCK(ump);
12700		ACQUIRE_LOCK(ump);
12701restart:
12702		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
12703			return (0);
12704		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12705			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12706			    == DEPCOMPLETE) {
12707				jwait(&inoref->if_list, MNT_WAIT);
12708				goto restart;
12709			}
12710		}
12711		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
12712		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
12713		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
12714		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
12715			continue;
12716		/*
12717		 * If pass2, we are done, otherwise do pass 2.
12718		 */
12719		if (waitfor == MNT_WAIT)
12720			break;
12721		waitfor = MNT_WAIT;
12722	}
12723	/*
12724	 * Try freeing inodedep in case all dependencies have been removed.
12725	 */
12726	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
12727		(void) free_inodedep(inodedep);
12728	return (0);
12729}
12730
12731/*
12732 * Flush an inode dependency list.
12733 * Called with splbio blocked.
12734 */
12735static int
12736flush_deplist(listhead, waitfor, errorp)
12737	struct allocdirectlst *listhead;
12738	int waitfor;
12739	int *errorp;
12740{
12741	struct allocdirect *adp;
12742	struct newblk *newblk;
12743	struct ufsmount *ump;
12744	struct buf *bp;
12745
12746	if ((adp = TAILQ_FIRST(listhead)) == NULL)
12747		return (0);
12748	ump = VFSTOUFS(adp->ad_list.wk_mp);
12749	LOCK_OWNED(ump);
12750	TAILQ_FOREACH(adp, listhead, ad_next) {
12751		newblk = (struct newblk *)adp;
12752		if (newblk->nb_jnewblk != NULL) {
12753			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12754			return (1);
12755		}
12756		if (newblk->nb_state & DEPCOMPLETE)
12757			continue;
12758		bp = newblk->nb_bmsafemap->sm_buf;
12759		bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
12760		if (bp == NULL) {
12761			if (waitfor == MNT_NOWAIT)
12762				continue;
12763			return (1);
12764		}
12765		FREE_LOCK(ump);
12766		if (waitfor == MNT_NOWAIT)
12767			bawrite(bp);
12768		else
12769			*errorp = bwrite(bp);
12770		ACQUIRE_LOCK(ump);
12771		return (1);
12772	}
12773	return (0);
12774}
12775
12776/*
12777 * Flush dependencies associated with an allocdirect block.
12778 */
12779static int
12780flush_newblk_dep(vp, mp, lbn)
12781	struct vnode *vp;
12782	struct mount *mp;
12783	ufs_lbn_t lbn;
12784{
12785	struct newblk *newblk;
12786	struct ufsmount *ump;
12787	struct bufobj *bo;
12788	struct inode *ip;
12789	struct buf *bp;
12790	ufs2_daddr_t blkno;
12791	int error;
12792
12793	error = 0;
12794	bo = &vp->v_bufobj;
12795	ip = VTOI(vp);
12796	blkno = DIP(ip, i_db[lbn]);
12797	if (blkno == 0)
12798		panic("flush_newblk_dep: Missing block");
12799	ump = VFSTOUFS(mp);
12800	ACQUIRE_LOCK(ump);
12801	/*
12802	 * Loop until all dependencies related to this block are satisfied.
12803	 * We must be careful to restart after each sleep in case a write
12804	 * completes some part of this process for us.
12805	 */
12806	for (;;) {
12807		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
12808			FREE_LOCK(ump);
12809			break;
12810		}
12811		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
12812			panic("flush_newblk_deps: Bad newblk %p", newblk);
12813		/*
12814		 * Flush the journal.
12815		 */
12816		if (newblk->nb_jnewblk != NULL) {
12817			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
12818			continue;
12819		}
12820		/*
12821		 * Write the bitmap dependency.
12822		 */
12823		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
12824			bp = newblk->nb_bmsafemap->sm_buf;
12825			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
12826			if (bp == NULL)
12827				continue;
12828			FREE_LOCK(ump);
12829			error = bwrite(bp);
12830			if (error)
12831				break;
12832			ACQUIRE_LOCK(ump);
12833			continue;
12834		}
12835		/*
12836		 * Write the buffer.
12837		 */
12838		FREE_LOCK(ump);
12839		BO_LOCK(bo);
12840		bp = gbincore(bo, lbn);
12841		if (bp != NULL) {
12842			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
12843			    LK_INTERLOCK, BO_LOCKPTR(bo));
12844			if (error == ENOLCK) {
12845				ACQUIRE_LOCK(ump);
12846				continue; /* Slept, retry */
12847			}
12848			if (error != 0)
12849				break;	/* Failed */
12850			if (bp->b_flags & B_DELWRI) {
12851				bremfree(bp);
12852				error = bwrite(bp);
12853				if (error)
12854					break;
12855			} else
12856				BUF_UNLOCK(bp);
12857		} else
12858			BO_UNLOCK(bo);
12859		/*
12860		 * We have to wait for the direct pointers to
12861		 * point at the newdirblk before the dependency
12862		 * will go away.
12863		 */
12864		error = ffs_update(vp, 1);
12865		if (error)
12866			break;
12867		ACQUIRE_LOCK(ump);
12868	}
12869	return (error);
12870}
12871
12872/*
12873 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
12874 * Called with splbio blocked.
12875 */
12876static int
12877flush_pagedep_deps(pvp, mp, diraddhdp)
12878	struct vnode *pvp;
12879	struct mount *mp;
12880	struct diraddhd *diraddhdp;
12881{
12882	struct inodedep *inodedep;
12883	struct inoref *inoref;
12884	struct ufsmount *ump;
12885	struct diradd *dap;
12886	struct vnode *vp;
12887	int error = 0;
12888	struct buf *bp;
12889	ino_t inum;
12890	struct diraddhd unfinished;
12891
12892	LIST_INIT(&unfinished);
12893	ump = VFSTOUFS(mp);
12894	LOCK_OWNED(ump);
12895restart:
12896	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
12897		/*
12898		 * Flush ourselves if this directory entry
12899		 * has a MKDIR_PARENT dependency.
12900		 */
12901		if (dap->da_state & MKDIR_PARENT) {
12902			FREE_LOCK(ump);
12903			if ((error = ffs_update(pvp, 1)) != 0)
12904				break;
12905			ACQUIRE_LOCK(ump);
12906			/*
12907			 * If that cleared dependencies, go on to next.
12908			 */
12909			if (dap != LIST_FIRST(diraddhdp))
12910				continue;
12911			/*
12912			 * All MKDIR_PARENT dependencies and all the
12913			 * NEWBLOCK pagedeps that are contained in direct
12914			 * blocks were resolved by doing above ffs_update.
12915			 * Pagedeps contained in indirect blocks may
12916			 * require a complete sync'ing of the directory.
12917			 * We are in the midst of doing a complete sync,
12918			 * so if they are not resolved in this pass we
12919			 * defer them for now as they will be sync'ed by
12920			 * our caller shortly.
12921			 */
12922			LIST_REMOVE(dap, da_pdlist);
12923			LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
12924			continue;
12925		}
12926		/*
12927		 * A newly allocated directory must have its "." and
12928		 * ".." entries written out before its name can be
12929		 * committed in its parent.
12930		 */
12931		inum = dap->da_newinum;
12932		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12933			panic("flush_pagedep_deps: lost inode1");
12934		/*
12935		 * Wait for any pending journal adds to complete so we don't
12936		 * cause rollbacks while syncing.
12937		 */
12938		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12939			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12940			    == DEPCOMPLETE) {
12941				jwait(&inoref->if_list, MNT_WAIT);
12942				goto restart;
12943			}
12944		}
12945		if (dap->da_state & MKDIR_BODY) {
12946			FREE_LOCK(ump);
12947			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
12948			    FFSV_FORCEINSMQ)))
12949				break;
12950			error = flush_newblk_dep(vp, mp, 0);
12951			/*
12952			 * If we still have the dependency we might need to
12953			 * update the vnode to sync the new link count to
12954			 * disk.
12955			 */
12956			if (error == 0 && dap == LIST_FIRST(diraddhdp))
12957				error = ffs_update(vp, 1);
12958			vput(vp);
12959			if (error != 0)
12960				break;
12961			ACQUIRE_LOCK(ump);
12962			/*
12963			 * If that cleared dependencies, go on to next.
12964			 */
12965			if (dap != LIST_FIRST(diraddhdp))
12966				continue;
12967			if (dap->da_state & MKDIR_BODY) {
12968				inodedep_lookup(UFSTOVFS(ump), inum, 0,
12969				    &inodedep);
12970				panic("flush_pagedep_deps: MKDIR_BODY "
12971				    "inodedep %p dap %p vp %p",
12972				    inodedep, dap, vp);
12973			}
12974		}
12975		/*
12976		 * Flush the inode on which the directory entry depends.
12977		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
12978		 * the only remaining dependency is that the updated inode
12979		 * count must get pushed to disk. The inode has already
12980		 * been pushed into its inode buffer (via VOP_UPDATE) at
12981		 * the time of the reference count change. So we need only
12982		 * locate that buffer, ensure that there will be no rollback
12983		 * caused by a bitmap dependency, then write the inode buffer.
12984		 */
12985retry:
12986		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
12987			panic("flush_pagedep_deps: lost inode");
12988		/*
12989		 * If the inode still has bitmap dependencies,
12990		 * push them to disk.
12991		 */
12992		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
12993			bp = inodedep->id_bmsafemap->sm_buf;
12994			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
12995			if (bp == NULL)
12996				goto retry;
12997			FREE_LOCK(ump);
12998			if ((error = bwrite(bp)) != 0)
12999				break;
13000			ACQUIRE_LOCK(ump);
13001			if (dap != LIST_FIRST(diraddhdp))
13002				continue;
13003		}
13004		/*
13005		 * If the inode is still sitting in a buffer waiting
13006		 * to be written or waiting for the link count to be
13007		 * adjusted update it here to flush it to disk.
13008		 */
13009		if (dap == LIST_FIRST(diraddhdp)) {
13010			FREE_LOCK(ump);
13011			if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
13012			    FFSV_FORCEINSMQ)))
13013				break;
13014			error = ffs_update(vp, 1);
13015			vput(vp);
13016			if (error)
13017				break;
13018			ACQUIRE_LOCK(ump);
13019		}
13020		/*
13021		 * If we have failed to get rid of all the dependencies
13022		 * then something is seriously wrong.
13023		 */
13024		if (dap == LIST_FIRST(diraddhdp)) {
13025			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
13026			panic("flush_pagedep_deps: failed to flush "
13027			    "inodedep %p ino %ju dap %p",
13028			    inodedep, (uintmax_t)inum, dap);
13029		}
13030	}
13031	if (error)
13032		ACQUIRE_LOCK(ump);
13033	while ((dap = LIST_FIRST(&unfinished)) != NULL) {
13034		LIST_REMOVE(dap, da_pdlist);
13035		LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
13036	}
13037	return (error);
13038}
13039
13040/*
13041 * A large burst of file addition or deletion activity can drive the
13042 * memory load excessively high. First attempt to slow things down
13043 * using the techniques below. If that fails, this routine requests
13044 * the offending operations to fall back to running synchronously
13045 * until the memory load returns to a reasonable level.
13046 */
13047int
13048softdep_slowdown(vp)
13049	struct vnode *vp;
13050{
13051	struct ufsmount *ump;
13052	int jlow;
13053	int max_softdeps_hard;
13054
13055	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13056	    ("softdep_slowdown called on non-softdep filesystem"));
13057	ump = VFSTOUFS(vp->v_mount);
13058	ACQUIRE_LOCK(ump);
13059	jlow = 0;
13060	/*
13061	 * Check for journal space if needed.
13062	 */
13063	if (DOINGSUJ(vp)) {
13064		if (journal_space(ump, 0) == 0)
13065			jlow = 1;
13066	}
13067	/*
13068	 * If the system is under its limits and our filesystem is
13069	 * not responsible for more than our share of the usage and
13070	 * we are not low on journal space, then no need to slow down.
13071	 */
13072	max_softdeps_hard = max_softdeps * 11 / 10;
13073	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
13074	    dep_current[D_INODEDEP] < max_softdeps_hard &&
13075	    dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
13076	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
13077	    ump->softdep_curdeps[D_DIRREM] <
13078	    (max_softdeps_hard / 2) / stat_flush_threads &&
13079	    ump->softdep_curdeps[D_INODEDEP] <
13080	    max_softdeps_hard / stat_flush_threads &&
13081	    ump->softdep_curdeps[D_INDIRDEP] <
13082	    (max_softdeps_hard / 1000) / stat_flush_threads &&
13083	    ump->softdep_curdeps[D_FREEBLKS] <
13084	    max_softdeps_hard / stat_flush_threads) {
13085		FREE_LOCK(ump);
13086  		return (0);
13087	}
13088	/*
13089	 * If the journal is low or our filesystem is over its limit
13090	 * then speedup the cleanup.
13091	 */
13092	if (ump->softdep_curdeps[D_INDIRDEP] <
13093	    (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
13094		softdep_speedup(ump);
13095	stat_sync_limit_hit += 1;
13096	FREE_LOCK(ump);
13097	/*
13098	 * We only slow down the rate at which new dependencies are
13099	 * generated if we are not using journaling. With journaling,
13100	 * the cleanup should always be sufficient to keep things
13101	 * under control.
13102	 */
13103	if (DOINGSUJ(vp))
13104		return (0);
13105	return (1);
13106}
13107
13108/*
13109 * Called by the allocation routines when they are about to fail
13110 * in the hope that we can free up the requested resource (inodes
13111 * or disk space).
13112 *
13113 * First check to see if the work list has anything on it. If it has,
13114 * clean up entries until we successfully free the requested resource.
13115 * Because this process holds inodes locked, we cannot handle any remove
13116 * requests that might block on a locked inode as that could lead to
13117 * deadlock. If the worklist yields none of the requested resource,
13118 * start syncing out vnodes to free up the needed space.
13119 */
13120int
13121softdep_request_cleanup(fs, vp, cred, resource)
13122	struct fs *fs;
13123	struct vnode *vp;
13124	struct ucred *cred;
13125	int resource;
13126{
13127	struct ufsmount *ump;
13128	struct mount *mp;
13129	struct vnode *lvp, *mvp;
13130	long starttime;
13131	ufs2_daddr_t needed;
13132	int error;
13133
13134	/*
13135	 * If we are being called because of a process doing a
13136	 * copy-on-write, then it is not safe to process any
13137	 * worklist items as we will recurse into the copyonwrite
13138	 * routine.  This will result in an incoherent snapshot.
13139	 * If the vnode that we hold is a snapshot, we must avoid
13140	 * handling other resources that could cause deadlock.
13141	 */
13142	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
13143		return (0);
13144
13145	if (resource == FLUSH_BLOCKS_WAIT)
13146		stat_cleanup_blkrequests += 1;
13147	else
13148		stat_cleanup_inorequests += 1;
13149
13150	mp = vp->v_mount;
13151	ump = VFSTOUFS(mp);
13152	mtx_assert(UFS_MTX(ump), MA_OWNED);
13153	UFS_UNLOCK(ump);
13154	error = ffs_update(vp, 1);
13155	if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
13156		UFS_LOCK(ump);
13157		return (0);
13158	}
13159	/*
13160	 * If we are in need of resources, start by cleaning up
13161	 * any block removals associated with our inode.
13162	 */
13163	ACQUIRE_LOCK(ump);
13164	process_removes(vp);
13165	process_truncates(vp);
13166	FREE_LOCK(ump);
13167	/*
13168	 * Now clean up at least as many resources as we will need.
13169	 *
13170	 * When requested to clean up inodes, the number that are needed
13171	 * is set by the number of simultaneous writers (mnt_writeopcount)
13172	 * plus a bit of slop (2) in case some more writers show up while
13173	 * we are cleaning.
13174	 *
13175	 * When requested to free up space, the amount of space that
13176	 * we need is enough blocks to allocate a full-sized segment
13177	 * (fs_contigsumsize). The number of such segments that will
13178	 * be needed is set by the number of simultaneous writers
13179	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
13180	 * writers show up while we are cleaning.
13181	 *
13182	 * Additionally, if we are unpriviledged and allocating space,
13183	 * we need to ensure that we clean up enough blocks to get the
13184	 * needed number of blocks over the threshhold of the minimum
13185	 * number of blocks required to be kept free by the filesystem
13186	 * (fs_minfree).
13187	 */
13188	if (resource == FLUSH_INODES_WAIT) {
13189		needed = vp->v_mount->mnt_writeopcount + 2;
13190	} else if (resource == FLUSH_BLOCKS_WAIT) {
13191		needed = (vp->v_mount->mnt_writeopcount + 2) *
13192		    fs->fs_contigsumsize;
13193		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE, 0))
13194			needed += fragstoblks(fs,
13195			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
13196			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
13197	} else {
13198		UFS_LOCK(ump);
13199		printf("softdep_request_cleanup: Unknown resource type %d\n",
13200		    resource);
13201		return (0);
13202	}
13203	starttime = time_second;
13204retry:
13205	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
13206	    fs->fs_cstotal.cs_nbfree <= needed) ||
13207	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13208	    fs->fs_cstotal.cs_nifree <= needed)) {
13209		ACQUIRE_LOCK(ump);
13210		if (ump->softdep_on_worklist > 0 &&
13211		    process_worklist_item(UFSTOVFS(ump),
13212		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
13213			stat_worklist_push += 1;
13214		FREE_LOCK(ump);
13215	}
13216	/*
13217	 * If we still need resources and there are no more worklist
13218	 * entries to process to obtain them, we have to start flushing
13219	 * the dirty vnodes to force the release of additional requests
13220	 * to the worklist that we can then process to reap addition
13221	 * resources. We walk the vnodes associated with the mount point
13222	 * until we get the needed worklist requests that we can reap.
13223	 */
13224	if ((resource == FLUSH_BLOCKS_WAIT &&
13225	     fs->fs_cstotal.cs_nbfree <= needed) ||
13226	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13227	     fs->fs_cstotal.cs_nifree <= needed)) {
13228		MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
13229			if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
13230				VI_UNLOCK(lvp);
13231				continue;
13232			}
13233			if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
13234			    curthread))
13235				continue;
13236			if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
13237				vput(lvp);
13238				continue;
13239			}
13240			(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
13241			vput(lvp);
13242		}
13243		lvp = ump->um_devvp;
13244		if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
13245			VOP_FSYNC(lvp, MNT_NOWAIT, curthread);
13246			VOP_UNLOCK(lvp, 0);
13247		}
13248		if (ump->softdep_on_worklist > 0) {
13249			stat_cleanup_retries += 1;
13250			goto retry;
13251		}
13252		stat_cleanup_failures += 1;
13253	}
13254	if (time_second - starttime > stat_cleanup_high_delay)
13255		stat_cleanup_high_delay = time_second - starttime;
13256	UFS_LOCK(ump);
13257	return (1);
13258}
13259
13260static bool
13261softdep_excess_items(struct ufsmount *ump, int item)
13262{
13263
13264	KASSERT(item >= 0 && item < D_LAST, ("item %d", item));
13265	return (dep_current[item] > max_softdeps &&
13266	    ump->softdep_curdeps[item] > max_softdeps /
13267	    stat_flush_threads);
13268}
13269
13270static void
13271schedule_cleanup(struct mount *mp)
13272{
13273	struct ufsmount *ump;
13274	struct thread *td;
13275
13276	ump = VFSTOUFS(mp);
13277	LOCK_OWNED(ump);
13278	FREE_LOCK(ump);
13279	td = curthread;
13280	if ((td->td_pflags & TDP_KTHREAD) != 0 &&
13281	    (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
13282		/*
13283		 * No ast is delivered to kernel threads, so nobody
13284		 * would deref the mp.  Some kernel threads
13285		 * explicitely check for AST, e.g. NFS daemon does
13286		 * this in the serving loop.
13287		 */
13288		return;
13289	}
13290	if (td->td_su != NULL)
13291		vfs_rel(td->td_su);
13292	vfs_ref(mp);
13293	td->td_su = mp;
13294	thread_lock(td);
13295	td->td_flags |= TDF_ASTPENDING;
13296	thread_unlock(td);
13297}
13298
13299static void
13300softdep_ast_cleanup_proc(void)
13301{
13302	struct thread *td;
13303	struct mount *mp;
13304	struct ufsmount *ump;
13305	int error;
13306	bool req;
13307
13308	td = curthread;
13309	while ((mp = td->td_su) != NULL) {
13310		td->td_su = NULL;
13311		error = vfs_busy(mp, MBF_NOWAIT);
13312		vfs_rel(mp);
13313		if (error != 0)
13314			return;
13315		if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
13316			ump = VFSTOUFS(mp);
13317			for (;;) {
13318				req = false;
13319				ACQUIRE_LOCK(ump);
13320				if (softdep_excess_items(ump, D_INODEDEP)) {
13321					req = true;
13322					request_cleanup(mp, FLUSH_INODES);
13323				}
13324				if (softdep_excess_items(ump, D_DIRREM)) {
13325					req = true;
13326					request_cleanup(mp, FLUSH_BLOCKS);
13327				}
13328				FREE_LOCK(ump);
13329				if (softdep_excess_items(ump, D_NEWBLK) ||
13330				    softdep_excess_items(ump, D_ALLOCDIRECT) ||
13331				    softdep_excess_items(ump, D_ALLOCINDIR)) {
13332					error = vn_start_write(NULL, &mp,
13333					    V_WAIT);
13334					if (error == 0) {
13335						req = true;
13336						VFS_SYNC(mp, MNT_WAIT);
13337						vn_finished_write(mp);
13338					}
13339				}
13340				if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
13341					break;
13342			}
13343		}
13344		vfs_unbusy(mp);
13345	}
13346}
13347
13348/*
13349 * If memory utilization has gotten too high, deliberately slow things
13350 * down and speed up the I/O processing.
13351 */
13352static int
13353request_cleanup(mp, resource)
13354	struct mount *mp;
13355	int resource;
13356{
13357	struct thread *td = curthread;
13358	struct ufsmount *ump;
13359
13360	ump = VFSTOUFS(mp);
13361	LOCK_OWNED(ump);
13362	/*
13363	 * We never hold up the filesystem syncer or buf daemon.
13364	 */
13365	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
13366		return (0);
13367	/*
13368	 * First check to see if the work list has gotten backlogged.
13369	 * If it has, co-opt this process to help clean up two entries.
13370	 * Because this process may hold inodes locked, we cannot
13371	 * handle any remove requests that might block on a locked
13372	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
13373	 * to avoid recursively processing the worklist.
13374	 */
13375	if (ump->softdep_on_worklist > max_softdeps / 10) {
13376		td->td_pflags |= TDP_SOFTDEP;
13377		process_worklist_item(mp, 2, LK_NOWAIT);
13378		td->td_pflags &= ~TDP_SOFTDEP;
13379		stat_worklist_push += 2;
13380		return(1);
13381	}
13382	/*
13383	 * Next, we attempt to speed up the syncer process. If that
13384	 * is successful, then we allow the process to continue.
13385	 */
13386	if (softdep_speedup(ump) &&
13387	    resource != FLUSH_BLOCKS_WAIT &&
13388	    resource != FLUSH_INODES_WAIT)
13389		return(0);
13390	/*
13391	 * If we are resource constrained on inode dependencies, try
13392	 * flushing some dirty inodes. Otherwise, we are constrained
13393	 * by file deletions, so try accelerating flushes of directories
13394	 * with removal dependencies. We would like to do the cleanup
13395	 * here, but we probably hold an inode locked at this point and
13396	 * that might deadlock against one that we try to clean. So,
13397	 * the best that we can do is request the syncer daemon to do
13398	 * the cleanup for us.
13399	 */
13400	switch (resource) {
13401
13402	case FLUSH_INODES:
13403	case FLUSH_INODES_WAIT:
13404		ACQUIRE_GBLLOCK(&lk);
13405		stat_ino_limit_push += 1;
13406		req_clear_inodedeps += 1;
13407		FREE_GBLLOCK(&lk);
13408		stat_countp = &stat_ino_limit_hit;
13409		break;
13410
13411	case FLUSH_BLOCKS:
13412	case FLUSH_BLOCKS_WAIT:
13413		ACQUIRE_GBLLOCK(&lk);
13414		stat_blk_limit_push += 1;
13415		req_clear_remove += 1;
13416		FREE_GBLLOCK(&lk);
13417		stat_countp = &stat_blk_limit_hit;
13418		break;
13419
13420	default:
13421		panic("request_cleanup: unknown type");
13422	}
13423	/*
13424	 * Hopefully the syncer daemon will catch up and awaken us.
13425	 * We wait at most tickdelay before proceeding in any case.
13426	 */
13427	ACQUIRE_GBLLOCK(&lk);
13428	FREE_LOCK(ump);
13429	proc_waiting += 1;
13430	if (callout_pending(&softdep_callout) == FALSE)
13431		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
13432		    pause_timer, 0);
13433
13434	if ((td->td_pflags & TDP_KTHREAD) == 0)
13435		msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
13436	proc_waiting -= 1;
13437	FREE_GBLLOCK(&lk);
13438	ACQUIRE_LOCK(ump);
13439	return (1);
13440}
13441
13442/*
13443 * Awaken processes pausing in request_cleanup and clear proc_waiting
13444 * to indicate that there is no longer a timer running. Pause_timer
13445 * will be called with the global softdep mutex (&lk) locked.
13446 */
13447static void
13448pause_timer(arg)
13449	void *arg;
13450{
13451
13452	GBLLOCK_OWNED(&lk);
13453	/*
13454	 * The callout_ API has acquired mtx and will hold it around this
13455	 * function call.
13456	 */
13457	*stat_countp += proc_waiting;
13458	wakeup(&proc_waiting);
13459}
13460
13461/*
13462 * If requested, try removing inode or removal dependencies.
13463 */
13464static void
13465check_clear_deps(mp)
13466	struct mount *mp;
13467{
13468
13469	/*
13470	 * If we are suspended, it may be because of our using
13471	 * too many inodedeps, so help clear them out.
13472	 */
13473	if (MOUNTEDSUJ(mp) && VFSTOUFS(mp)->softdep_jblocks->jb_suspended)
13474		clear_inodedeps(mp);
13475	/*
13476	 * General requests for cleanup of backed up dependencies
13477	 */
13478	ACQUIRE_GBLLOCK(&lk);
13479	if (req_clear_inodedeps) {
13480		req_clear_inodedeps -= 1;
13481		FREE_GBLLOCK(&lk);
13482		clear_inodedeps(mp);
13483		ACQUIRE_GBLLOCK(&lk);
13484		wakeup(&proc_waiting);
13485	}
13486	if (req_clear_remove) {
13487		req_clear_remove -= 1;
13488		FREE_GBLLOCK(&lk);
13489		clear_remove(mp);
13490		ACQUIRE_GBLLOCK(&lk);
13491		wakeup(&proc_waiting);
13492	}
13493	FREE_GBLLOCK(&lk);
13494}
13495
13496/*
13497 * Flush out a directory with at least one removal dependency in an effort to
13498 * reduce the number of dirrem, freefile, and freeblks dependency structures.
13499 */
13500static void
13501clear_remove(mp)
13502	struct mount *mp;
13503{
13504	struct pagedep_hashhead *pagedephd;
13505	struct pagedep *pagedep;
13506	struct ufsmount *ump;
13507	struct vnode *vp;
13508	struct bufobj *bo;
13509	int error, cnt;
13510	ino_t ino;
13511
13512	ump = VFSTOUFS(mp);
13513	LOCK_OWNED(ump);
13514
13515	for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
13516		pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
13517		if (ump->pagedep_nextclean > ump->pagedep_hash_size)
13518			ump->pagedep_nextclean = 0;
13519		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
13520			if (LIST_EMPTY(&pagedep->pd_dirremhd))
13521				continue;
13522			ino = pagedep->pd_ino;
13523			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13524				continue;
13525			FREE_LOCK(ump);
13526
13527			/*
13528			 * Let unmount clear deps
13529			 */
13530			error = vfs_busy(mp, MBF_NOWAIT);
13531			if (error != 0)
13532				goto finish_write;
13533			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13534			     FFSV_FORCEINSMQ);
13535			vfs_unbusy(mp);
13536			if (error != 0) {
13537				softdep_error("clear_remove: vget", error);
13538				goto finish_write;
13539			}
13540			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13541				softdep_error("clear_remove: fsync", error);
13542			bo = &vp->v_bufobj;
13543			BO_LOCK(bo);
13544			drain_output(vp);
13545			BO_UNLOCK(bo);
13546			vput(vp);
13547		finish_write:
13548			vn_finished_write(mp);
13549			ACQUIRE_LOCK(ump);
13550			return;
13551		}
13552	}
13553}
13554
13555/*
13556 * Clear out a block of dirty inodes in an effort to reduce
13557 * the number of inodedep dependency structures.
13558 */
13559static void
13560clear_inodedeps(mp)
13561	struct mount *mp;
13562{
13563	struct inodedep_hashhead *inodedephd;
13564	struct inodedep *inodedep;
13565	struct ufsmount *ump;
13566	struct vnode *vp;
13567	struct fs *fs;
13568	int error, cnt;
13569	ino_t firstino, lastino, ino;
13570
13571	ump = VFSTOUFS(mp);
13572	fs = ump->um_fs;
13573	LOCK_OWNED(ump);
13574	/*
13575	 * Pick a random inode dependency to be cleared.
13576	 * We will then gather up all the inodes in its block
13577	 * that have dependencies and flush them out.
13578	 */
13579	for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
13580		inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
13581		if (ump->inodedep_nextclean > ump->inodedep_hash_size)
13582			ump->inodedep_nextclean = 0;
13583		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
13584			break;
13585	}
13586	if (inodedep == NULL)
13587		return;
13588	/*
13589	 * Find the last inode in the block with dependencies.
13590	 */
13591	firstino = inodedep->id_ino & ~(INOPB(fs) - 1);
13592	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
13593		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
13594			break;
13595	/*
13596	 * Asynchronously push all but the last inode with dependencies.
13597	 * Synchronously push the last inode with dependencies to ensure
13598	 * that the inode block gets written to free up the inodedeps.
13599	 */
13600	for (ino = firstino; ino <= lastino; ino++) {
13601		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13602			continue;
13603		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
13604			continue;
13605		FREE_LOCK(ump);
13606		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
13607		if (error != 0) {
13608			vn_finished_write(mp);
13609			ACQUIRE_LOCK(ump);
13610			return;
13611		}
13612		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
13613		    FFSV_FORCEINSMQ)) != 0) {
13614			softdep_error("clear_inodedeps: vget", error);
13615			vfs_unbusy(mp);
13616			vn_finished_write(mp);
13617			ACQUIRE_LOCK(ump);
13618			return;
13619		}
13620		vfs_unbusy(mp);
13621		if (ino == lastino) {
13622			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
13623				softdep_error("clear_inodedeps: fsync1", error);
13624		} else {
13625			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
13626				softdep_error("clear_inodedeps: fsync2", error);
13627			BO_LOCK(&vp->v_bufobj);
13628			drain_output(vp);
13629			BO_UNLOCK(&vp->v_bufobj);
13630		}
13631		vput(vp);
13632		vn_finished_write(mp);
13633		ACQUIRE_LOCK(ump);
13634	}
13635}
13636
13637void
13638softdep_buf_append(bp, wkhd)
13639	struct buf *bp;
13640	struct workhead *wkhd;
13641{
13642	struct worklist *wk;
13643	struct ufsmount *ump;
13644
13645	if ((wk = LIST_FIRST(wkhd)) == NULL)
13646		return;
13647	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13648	    ("softdep_buf_append called on non-softdep filesystem"));
13649	ump = VFSTOUFS(wk->wk_mp);
13650	ACQUIRE_LOCK(ump);
13651	while ((wk = LIST_FIRST(wkhd)) != NULL) {
13652		WORKLIST_REMOVE(wk);
13653		WORKLIST_INSERT(&bp->b_dep, wk);
13654	}
13655	FREE_LOCK(ump);
13656
13657}
13658
13659void
13660softdep_inode_append(ip, cred, wkhd)
13661	struct inode *ip;
13662	struct ucred *cred;
13663	struct workhead *wkhd;
13664{
13665	struct buf *bp;
13666	struct fs *fs;
13667	int error;
13668
13669	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ip->i_ump)) != 0,
13670	    ("softdep_inode_append called on non-softdep filesystem"));
13671	fs = ip->i_fs;
13672	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
13673	    (int)fs->fs_bsize, cred, &bp);
13674	if (error) {
13675		bqrelse(bp);
13676		softdep_freework(wkhd);
13677		return;
13678	}
13679	softdep_buf_append(bp, wkhd);
13680	bqrelse(bp);
13681}
13682
13683void
13684softdep_freework(wkhd)
13685	struct workhead *wkhd;
13686{
13687	struct worklist *wk;
13688	struct ufsmount *ump;
13689
13690	if ((wk = LIST_FIRST(wkhd)) == NULL)
13691		return;
13692	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
13693	    ("softdep_freework called on non-softdep filesystem"));
13694	ump = VFSTOUFS(wk->wk_mp);
13695	ACQUIRE_LOCK(ump);
13696	handle_jwork(wkhd);
13697	FREE_LOCK(ump);
13698}
13699
13700/*
13701 * Function to determine if the buffer has outstanding dependencies
13702 * that will cause a roll-back if the buffer is written. If wantcount
13703 * is set, return number of dependencies, otherwise just yes or no.
13704 */
13705static int
13706softdep_count_dependencies(bp, wantcount)
13707	struct buf *bp;
13708	int wantcount;
13709{
13710	struct worklist *wk;
13711	struct ufsmount *ump;
13712	struct bmsafemap *bmsafemap;
13713	struct freework *freework;
13714	struct inodedep *inodedep;
13715	struct indirdep *indirdep;
13716	struct freeblks *freeblks;
13717	struct allocindir *aip;
13718	struct pagedep *pagedep;
13719	struct dirrem *dirrem;
13720	struct newblk *newblk;
13721	struct mkdir *mkdir;
13722	struct diradd *dap;
13723	int i, retval;
13724
13725	retval = 0;
13726	if ((wk = LIST_FIRST(&bp->b_dep)) == NULL)
13727		return (0);
13728	ump = VFSTOUFS(wk->wk_mp);
13729	ACQUIRE_LOCK(ump);
13730	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13731		switch (wk->wk_type) {
13732
13733		case D_INODEDEP:
13734			inodedep = WK_INODEDEP(wk);
13735			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
13736				/* bitmap allocation dependency */
13737				retval += 1;
13738				if (!wantcount)
13739					goto out;
13740			}
13741			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
13742				/* direct block pointer dependency */
13743				retval += 1;
13744				if (!wantcount)
13745					goto out;
13746			}
13747			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
13748				/* direct block pointer dependency */
13749				retval += 1;
13750				if (!wantcount)
13751					goto out;
13752			}
13753			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
13754				/* Add reference dependency. */
13755				retval += 1;
13756				if (!wantcount)
13757					goto out;
13758			}
13759			continue;
13760
13761		case D_INDIRDEP:
13762			indirdep = WK_INDIRDEP(wk);
13763
13764			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
13765				/* indirect truncation dependency */
13766				retval += 1;
13767				if (!wantcount)
13768					goto out;
13769			}
13770
13771			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13772				/* indirect block pointer dependency */
13773				retval += 1;
13774				if (!wantcount)
13775					goto out;
13776			}
13777			continue;
13778
13779		case D_PAGEDEP:
13780			pagedep = WK_PAGEDEP(wk);
13781			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
13782				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
13783					/* Journal remove ref dependency. */
13784					retval += 1;
13785					if (!wantcount)
13786						goto out;
13787				}
13788			}
13789			for (i = 0; i < DAHASHSZ; i++) {
13790
13791				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
13792					/* directory entry dependency */
13793					retval += 1;
13794					if (!wantcount)
13795						goto out;
13796				}
13797			}
13798			continue;
13799
13800		case D_BMSAFEMAP:
13801			bmsafemap = WK_BMSAFEMAP(wk);
13802			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
13803				/* Add reference dependency. */
13804				retval += 1;
13805				if (!wantcount)
13806					goto out;
13807			}
13808			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
13809				/* Allocate block dependency. */
13810				retval += 1;
13811				if (!wantcount)
13812					goto out;
13813			}
13814			continue;
13815
13816		case D_FREEBLKS:
13817			freeblks = WK_FREEBLKS(wk);
13818			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
13819				/* Freeblk journal dependency. */
13820				retval += 1;
13821				if (!wantcount)
13822					goto out;
13823			}
13824			continue;
13825
13826		case D_ALLOCDIRECT:
13827		case D_ALLOCINDIR:
13828			newblk = WK_NEWBLK(wk);
13829			if (newblk->nb_jnewblk) {
13830				/* Journal allocate dependency. */
13831				retval += 1;
13832				if (!wantcount)
13833					goto out;
13834			}
13835			continue;
13836
13837		case D_MKDIR:
13838			mkdir = WK_MKDIR(wk);
13839			if (mkdir->md_jaddref) {
13840				/* Journal reference dependency. */
13841				retval += 1;
13842				if (!wantcount)
13843					goto out;
13844			}
13845			continue;
13846
13847		case D_FREEWORK:
13848		case D_FREEDEP:
13849		case D_JSEGDEP:
13850		case D_JSEG:
13851		case D_SBDEP:
13852			/* never a dependency on these blocks */
13853			continue;
13854
13855		default:
13856			panic("softdep_count_dependencies: Unexpected type %s",
13857			    TYPENAME(wk->wk_type));
13858			/* NOTREACHED */
13859		}
13860	}
13861out:
13862	FREE_LOCK(ump);
13863	return retval;
13864}
13865
13866/*
13867 * Acquire exclusive access to a buffer.
13868 * Must be called with a locked mtx parameter.
13869 * Return acquired buffer or NULL on failure.
13870 */
13871static struct buf *
13872getdirtybuf(bp, lock, waitfor)
13873	struct buf *bp;
13874	struct rwlock *lock;
13875	int waitfor;
13876{
13877	int error;
13878
13879	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
13880		if (waitfor != MNT_WAIT)
13881			return (NULL);
13882		error = BUF_LOCK(bp,
13883		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
13884		/*
13885		 * Even if we sucessfully acquire bp here, we have dropped
13886		 * lock, which may violates our guarantee.
13887		 */
13888		if (error == 0)
13889			BUF_UNLOCK(bp);
13890		else if (error != ENOLCK)
13891			panic("getdirtybuf: inconsistent lock: %d", error);
13892		rw_wlock(lock);
13893		return (NULL);
13894	}
13895	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13896		if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
13897			rw_wunlock(lock);
13898			BO_LOCK(bp->b_bufobj);
13899			BUF_UNLOCK(bp);
13900			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
13901				bp->b_vflags |= BV_BKGRDWAIT;
13902				msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
13903				       PRIBIO | PDROP, "getbuf", 0);
13904			} else
13905				BO_UNLOCK(bp->b_bufobj);
13906			rw_wlock(lock);
13907			return (NULL);
13908		}
13909		BUF_UNLOCK(bp);
13910		if (waitfor != MNT_WAIT)
13911			return (NULL);
13912		/*
13913		 * The lock argument must be bp->b_vp's mutex in
13914		 * this case.
13915		 */
13916#ifdef	DEBUG_VFS_LOCKS
13917		if (bp->b_vp->v_type != VCHR)
13918			ASSERT_BO_WLOCKED(bp->b_bufobj);
13919#endif
13920		bp->b_vflags |= BV_BKGRDWAIT;
13921		rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
13922		return (NULL);
13923	}
13924	if ((bp->b_flags & B_DELWRI) == 0) {
13925		BUF_UNLOCK(bp);
13926		return (NULL);
13927	}
13928	bremfree(bp);
13929	return (bp);
13930}
13931
13932
13933/*
13934 * Check if it is safe to suspend the file system now.  On entry,
13935 * the vnode interlock for devvp should be held.  Return 0 with
13936 * the mount interlock held if the file system can be suspended now,
13937 * otherwise return EAGAIN with the mount interlock held.
13938 */
13939int
13940softdep_check_suspend(struct mount *mp,
13941		      struct vnode *devvp,
13942		      int softdep_depcnt,
13943		      int softdep_accdepcnt,
13944		      int secondary_writes,
13945		      int secondary_accwrites)
13946{
13947	struct bufobj *bo;
13948	struct ufsmount *ump;
13949	struct inodedep *inodedep;
13950	int error, unlinked;
13951
13952	bo = &devvp->v_bufobj;
13953	ASSERT_BO_WLOCKED(bo);
13954
13955	/*
13956	 * If we are not running with soft updates, then we need only
13957	 * deal with secondary writes as we try to suspend.
13958	 */
13959	if (MOUNTEDSOFTDEP(mp) == 0) {
13960		MNT_ILOCK(mp);
13961		while (mp->mnt_secondary_writes != 0) {
13962			BO_UNLOCK(bo);
13963			msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
13964			    (PUSER - 1) | PDROP, "secwr", 0);
13965			BO_LOCK(bo);
13966			MNT_ILOCK(mp);
13967		}
13968
13969		/*
13970		 * Reasons for needing more work before suspend:
13971		 * - Dirty buffers on devvp.
13972		 * - Secondary writes occurred after start of vnode sync loop
13973		 */
13974		error = 0;
13975		if (bo->bo_numoutput > 0 ||
13976		    bo->bo_dirty.bv_cnt > 0 ||
13977		    secondary_writes != 0 ||
13978		    mp->mnt_secondary_writes != 0 ||
13979		    secondary_accwrites != mp->mnt_secondary_accwrites)
13980			error = EAGAIN;
13981		BO_UNLOCK(bo);
13982		return (error);
13983	}
13984
13985	/*
13986	 * If we are running with soft updates, then we need to coordinate
13987	 * with them as we try to suspend.
13988	 */
13989	ump = VFSTOUFS(mp);
13990	for (;;) {
13991		if (!TRY_ACQUIRE_LOCK(ump)) {
13992			BO_UNLOCK(bo);
13993			ACQUIRE_LOCK(ump);
13994			FREE_LOCK(ump);
13995			BO_LOCK(bo);
13996			continue;
13997		}
13998		MNT_ILOCK(mp);
13999		if (mp->mnt_secondary_writes != 0) {
14000			FREE_LOCK(ump);
14001			BO_UNLOCK(bo);
14002			msleep(&mp->mnt_secondary_writes,
14003			       MNT_MTX(mp),
14004			       (PUSER - 1) | PDROP, "secwr", 0);
14005			BO_LOCK(bo);
14006			continue;
14007		}
14008		break;
14009	}
14010
14011	unlinked = 0;
14012	if (MOUNTEDSUJ(mp)) {
14013		for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
14014		    inodedep != NULL;
14015		    inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
14016			if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
14017			    UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
14018			    UNLINKONLIST) ||
14019			    !check_inodedep_free(inodedep))
14020				continue;
14021			unlinked++;
14022		}
14023	}
14024
14025	/*
14026	 * Reasons for needing more work before suspend:
14027	 * - Dirty buffers on devvp.
14028	 * - Softdep activity occurred after start of vnode sync loop
14029	 * - Secondary writes occurred after start of vnode sync loop
14030	 */
14031	error = 0;
14032	if (bo->bo_numoutput > 0 ||
14033	    bo->bo_dirty.bv_cnt > 0 ||
14034	    softdep_depcnt != unlinked ||
14035	    ump->softdep_deps != unlinked ||
14036	    softdep_accdepcnt != ump->softdep_accdeps ||
14037	    secondary_writes != 0 ||
14038	    mp->mnt_secondary_writes != 0 ||
14039	    secondary_accwrites != mp->mnt_secondary_accwrites)
14040		error = EAGAIN;
14041	FREE_LOCK(ump);
14042	BO_UNLOCK(bo);
14043	return (error);
14044}
14045
14046
14047/*
14048 * Get the number of dependency structures for the file system, both
14049 * the current number and the total number allocated.  These will
14050 * later be used to detect that softdep processing has occurred.
14051 */
14052void
14053softdep_get_depcounts(struct mount *mp,
14054		      int *softdep_depsp,
14055		      int *softdep_accdepsp)
14056{
14057	struct ufsmount *ump;
14058
14059	if (MOUNTEDSOFTDEP(mp) == 0) {
14060		*softdep_depsp = 0;
14061		*softdep_accdepsp = 0;
14062		return;
14063	}
14064	ump = VFSTOUFS(mp);
14065	ACQUIRE_LOCK(ump);
14066	*softdep_depsp = ump->softdep_deps;
14067	*softdep_accdepsp = ump->softdep_accdeps;
14068	FREE_LOCK(ump);
14069}
14070
14071/*
14072 * Wait for pending output on a vnode to complete.
14073 * Must be called with vnode lock and interlock locked.
14074 *
14075 * XXX: Should just be a call to bufobj_wwait().
14076 */
14077static void
14078drain_output(vp)
14079	struct vnode *vp;
14080{
14081	struct bufobj *bo;
14082
14083	bo = &vp->v_bufobj;
14084	ASSERT_VOP_LOCKED(vp, "drain_output");
14085	ASSERT_BO_WLOCKED(bo);
14086
14087	while (bo->bo_numoutput) {
14088		bo->bo_flag |= BO_WWAIT;
14089		msleep((caddr_t)&bo->bo_numoutput,
14090		    BO_LOCKPTR(bo), PRIBIO + 1, "drainvp", 0);
14091	}
14092}
14093
14094/*
14095 * Called whenever a buffer that is being invalidated or reallocated
14096 * contains dependencies. This should only happen if an I/O error has
14097 * occurred. The routine is called with the buffer locked.
14098 */
14099static void
14100softdep_deallocate_dependencies(bp)
14101	struct buf *bp;
14102{
14103
14104	if ((bp->b_ioflags & BIO_ERROR) == 0)
14105		panic("softdep_deallocate_dependencies: dangling deps");
14106	if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
14107		softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
14108	else
14109		printf("softdep_deallocate_dependencies: "
14110		    "got error %d while accessing filesystem\n", bp->b_error);
14111	if (bp->b_error != ENXIO)
14112		panic("softdep_deallocate_dependencies: unrecovered I/O error");
14113}
14114
14115/*
14116 * Function to handle asynchronous write errors in the filesystem.
14117 */
14118static void
14119softdep_error(func, error)
14120	char *func;
14121	int error;
14122{
14123
14124	/* XXX should do something better! */
14125	printf("%s: got error %d while accessing filesystem\n", func, error);
14126}
14127
14128#ifdef DDB
14129
14130static void
14131inodedep_print(struct inodedep *inodedep, int verbose)
14132{
14133	db_printf("%p fs %p st %x ino %jd inoblk %jd delta %d nlink %d"
14134	    " saveino %p\n",
14135	    inodedep, inodedep->id_fs, inodedep->id_state,
14136	    (intmax_t)inodedep->id_ino,
14137	    (intmax_t)fsbtodb(inodedep->id_fs,
14138	    ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
14139	    inodedep->id_nlinkdelta, inodedep->id_savednlink,
14140	    inodedep->id_savedino1);
14141
14142	if (verbose == 0)
14143		return;
14144
14145	db_printf("\tpendinghd %p, bufwait %p, inowait %p, inoreflst %p, "
14146	    "mkdiradd %p\n",
14147	    LIST_FIRST(&inodedep->id_pendinghd),
14148	    LIST_FIRST(&inodedep->id_bufwait),
14149	    LIST_FIRST(&inodedep->id_inowait),
14150	    TAILQ_FIRST(&inodedep->id_inoreflst),
14151	    inodedep->id_mkdiradd);
14152	db_printf("\tinoupdt %p, newinoupdt %p, extupdt %p, newextupdt %p\n",
14153	    TAILQ_FIRST(&inodedep->id_inoupdt),
14154	    TAILQ_FIRST(&inodedep->id_newinoupdt),
14155	    TAILQ_FIRST(&inodedep->id_extupdt),
14156	    TAILQ_FIRST(&inodedep->id_newextupdt));
14157}
14158
14159DB_SHOW_COMMAND(inodedep, db_show_inodedep)
14160{
14161
14162	if (have_addr == 0) {
14163		db_printf("Address required\n");
14164		return;
14165	}
14166	inodedep_print((struct inodedep*)addr, 1);
14167}
14168
14169DB_SHOW_COMMAND(inodedeps, db_show_inodedeps)
14170{
14171	struct inodedep_hashhead *inodedephd;
14172	struct inodedep *inodedep;
14173	struct ufsmount *ump;
14174	int cnt;
14175
14176	if (have_addr == 0) {
14177		db_printf("Address required\n");
14178		return;
14179	}
14180	ump = (struct ufsmount *)addr;
14181	for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
14182		inodedephd = &ump->inodedep_hashtbl[cnt];
14183		LIST_FOREACH(inodedep, inodedephd, id_hash) {
14184			inodedep_print(inodedep, 0);
14185		}
14186	}
14187}
14188
14189DB_SHOW_COMMAND(worklist, db_show_worklist)
14190{
14191	struct worklist *wk;
14192
14193	if (have_addr == 0) {
14194		db_printf("Address required\n");
14195		return;
14196	}
14197	wk = (struct worklist *)addr;
14198	printf("worklist: %p type %s state 0x%X\n",
14199	    wk, TYPENAME(wk->wk_type), wk->wk_state);
14200}
14201
14202DB_SHOW_COMMAND(workhead, db_show_workhead)
14203{
14204	struct workhead *wkhd;
14205	struct worklist *wk;
14206	int i;
14207
14208	if (have_addr == 0) {
14209		db_printf("Address required\n");
14210		return;
14211	}
14212	wkhd = (struct workhead *)addr;
14213	wk = LIST_FIRST(wkhd);
14214	for (i = 0; i < 100 && wk != NULL; i++, wk = LIST_NEXT(wk, wk_list))
14215		db_printf("worklist: %p type %s state 0x%X",
14216		    wk, TYPENAME(wk->wk_type), wk->wk_state);
14217	if (i == 100)
14218		db_printf("workhead overflow");
14219	printf("\n");
14220}
14221
14222
14223DB_SHOW_COMMAND(mkdirs, db_show_mkdirs)
14224{
14225	struct mkdirlist *mkdirlisthd;
14226	struct jaddref *jaddref;
14227	struct diradd *diradd;
14228	struct mkdir *mkdir;
14229
14230	if (have_addr == 0) {
14231		db_printf("Address required\n");
14232		return;
14233	}
14234	mkdirlisthd = (struct mkdirlist *)addr;
14235	LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
14236		diradd = mkdir->md_diradd;
14237		db_printf("mkdir: %p state 0x%X dap %p state 0x%X",
14238		    mkdir, mkdir->md_state, diradd, diradd->da_state);
14239		if ((jaddref = mkdir->md_jaddref) != NULL)
14240			db_printf(" jaddref %p jaddref state 0x%X",
14241			    jaddref, jaddref->ja_state);
14242		db_printf("\n");
14243	}
14244}
14245
14246/* exported to ffs_vfsops.c */
14247extern void db_print_ffs(struct ufsmount *ump);
14248void
14249db_print_ffs(struct ufsmount *ump)
14250{
14251	db_printf("mp %p %s devvp %p fs %p su_wl %d su_deps %d su_req %d\n",
14252	    ump->um_mountp, ump->um_mountp->mnt_stat.f_mntonname,
14253	    ump->um_devvp, ump->um_fs, ump->softdep_on_worklist,
14254	    ump->softdep_deps, ump->softdep_req);
14255}
14256
14257#endif /* DDB */
14258
14259#endif /* SOFTUPDATES */
14260