1/*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright 1998, 2000 Marshall Kirk McKusick.
5 * Copyright 2009, 2010 Jeffrey W. Roberson <jeff@FreeBSD.org>
6 * All rights reserved.
7 *
8 * The soft updates code is derived from the appendix of a University
9 * of Michigan technical report (Gregory R. Ganger and Yale N. Patt,
10 * "Soft Updates: A Solution to the Metadata Update Problem in File
11 * Systems", CSE-TR-254-95, August 1995).
12 *
13 * Further information about soft updates can be obtained from:
14 *
15 *	Marshall Kirk McKusick		http://www.mckusick.com/softdep/
16 *	1614 Oxford Street		mckusick@mckusick.com
17 *	Berkeley, CA 94709-1608		+1-510-843-9542
18 *	USA
19 *
20 * Redistribution and use in source and binary forms, with or without
21 * modification, are permitted provided that the following conditions
22 * are met:
23 *
24 * 1. Redistributions of source code must retain the above copyright
25 *    notice, this list of conditions and the following disclaimer.
26 * 2. Redistributions in binary form must reproduce the above copyright
27 *    notice, this list of conditions and the following disclaimer in the
28 *    documentation and/or other materials provided with the distribution.
29 *
30 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
31 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
32 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
33 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
34 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
35 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
36 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
37 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
38 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
39 * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 *
41 *	from: @(#)ffs_softdep.c	9.59 (McKusick) 6/21/00
42 */
43
44#include <sys/cdefs.h>
45__FBSDID("$FreeBSD$");
46
47#include "opt_ffs.h"
48#include "opt_quota.h"
49#include "opt_ddb.h"
50
51#include <sys/param.h>
52#include <sys/kernel.h>
53#include <sys/systm.h>
54#include <sys/bio.h>
55#include <sys/buf.h>
56#include <sys/kdb.h>
57#include <sys/kthread.h>
58#include <sys/ktr.h>
59#include <sys/limits.h>
60#include <sys/lock.h>
61#include <sys/malloc.h>
62#include <sys/mount.h>
63#include <sys/mutex.h>
64#include <sys/namei.h>
65#include <sys/priv.h>
66#include <sys/proc.h>
67#include <sys/racct.h>
68#include <sys/rwlock.h>
69#include <sys/stat.h>
70#include <sys/sysctl.h>
71#include <sys/syslog.h>
72#include <sys/vnode.h>
73#include <sys/conf.h>
74
75#include <ufs/ufs/dir.h>
76#include <ufs/ufs/extattr.h>
77#include <ufs/ufs/quota.h>
78#include <ufs/ufs/inode.h>
79#include <ufs/ufs/ufsmount.h>
80#include <ufs/ffs/fs.h>
81#include <ufs/ffs/softdep.h>
82#include <ufs/ffs/ffs_extern.h>
83#include <ufs/ufs/ufs_extern.h>
84
85#include <vm/vm.h>
86#include <vm/vm_extern.h>
87#include <vm/vm_object.h>
88
89#include <geom/geom.h>
90#include <geom/geom_vfs.h>
91
92#include <ddb/ddb.h>
93
94#define	KTR_SUJ	0	/* Define to KTR_SPARE. */
95
96#ifndef SOFTUPDATES
97
98int
99softdep_flushfiles(oldmnt, flags, td)
100	struct mount *oldmnt;
101	int flags;
102	struct thread *td;
103{
104
105	panic("softdep_flushfiles called");
106}
107
108int
109softdep_mount(devvp, mp, fs, cred)
110	struct vnode *devvp;
111	struct mount *mp;
112	struct fs *fs;
113	struct ucred *cred;
114{
115
116	return (0);
117}
118
119void
120softdep_initialize()
121{
122
123	return;
124}
125
126void
127softdep_uninitialize()
128{
129
130	return;
131}
132
133void
134softdep_unmount(mp)
135	struct mount *mp;
136{
137
138	panic("softdep_unmount called");
139}
140
141void
142softdep_setup_sbupdate(ump, fs, bp)
143	struct ufsmount *ump;
144	struct fs *fs;
145	struct buf *bp;
146{
147
148	panic("softdep_setup_sbupdate called");
149}
150
151void
152softdep_setup_inomapdep(bp, ip, newinum, mode)
153	struct buf *bp;
154	struct inode *ip;
155	ino_t newinum;
156	int mode;
157{
158
159	panic("softdep_setup_inomapdep called");
160}
161
162void
163softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
164	struct buf *bp;
165	struct mount *mp;
166	ufs2_daddr_t newblkno;
167	int frags;
168	int oldfrags;
169{
170
171	panic("softdep_setup_blkmapdep called");
172}
173
174void
175softdep_setup_allocdirect(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
176	struct inode *ip;
177	ufs_lbn_t lbn;
178	ufs2_daddr_t newblkno;
179	ufs2_daddr_t oldblkno;
180	long newsize;
181	long oldsize;
182	struct buf *bp;
183{
184
185	panic("softdep_setup_allocdirect called");
186}
187
188void
189softdep_setup_allocext(ip, lbn, newblkno, oldblkno, newsize, oldsize, bp)
190	struct inode *ip;
191	ufs_lbn_t lbn;
192	ufs2_daddr_t newblkno;
193	ufs2_daddr_t oldblkno;
194	long newsize;
195	long oldsize;
196	struct buf *bp;
197{
198
199	panic("softdep_setup_allocext called");
200}
201
202void
203softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
204	struct inode *ip;
205	ufs_lbn_t lbn;
206	struct buf *bp;
207	int ptrno;
208	ufs2_daddr_t newblkno;
209	ufs2_daddr_t oldblkno;
210	struct buf *nbp;
211{
212
213	panic("softdep_setup_allocindir_page called");
214}
215
216void
217softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
218	struct buf *nbp;
219	struct inode *ip;
220	struct buf *bp;
221	int ptrno;
222	ufs2_daddr_t newblkno;
223{
224
225	panic("softdep_setup_allocindir_meta called");
226}
227
228void
229softdep_journal_freeblocks(ip, cred, length, flags)
230	struct inode *ip;
231	struct ucred *cred;
232	off_t length;
233	int flags;
234{
235
236	panic("softdep_journal_freeblocks called");
237}
238
239void
240softdep_journal_fsync(ip)
241	struct inode *ip;
242{
243
244	panic("softdep_journal_fsync called");
245}
246
247void
248softdep_setup_freeblocks(ip, length, flags)
249	struct inode *ip;
250	off_t length;
251	int flags;
252{
253
254	panic("softdep_setup_freeblocks called");
255}
256
257void
258softdep_freefile(pvp, ino, mode)
259	struct vnode *pvp;
260	ino_t ino;
261	int mode;
262{
263
264	panic("softdep_freefile called");
265}
266
267int
268softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
269	struct buf *bp;
270	struct inode *dp;
271	off_t diroffset;
272	ino_t newinum;
273	struct buf *newdirbp;
274	int isnewblk;
275{
276
277	panic("softdep_setup_directory_add called");
278}
279
280void
281softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
282	struct buf *bp;
283	struct inode *dp;
284	caddr_t base;
285	caddr_t oldloc;
286	caddr_t newloc;
287	int entrysize;
288{
289
290	panic("softdep_change_directoryentry_offset called");
291}
292
293void
294softdep_setup_remove(bp, dp, ip, isrmdir)
295	struct buf *bp;
296	struct inode *dp;
297	struct inode *ip;
298	int isrmdir;
299{
300
301	panic("softdep_setup_remove called");
302}
303
304void
305softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
306	struct buf *bp;
307	struct inode *dp;
308	struct inode *ip;
309	ino_t newinum;
310	int isrmdir;
311{
312
313	panic("softdep_setup_directory_change called");
314}
315
316void
317softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
318	struct mount *mp;
319	struct buf *bp;
320	ufs2_daddr_t blkno;
321	int frags;
322	struct workhead *wkhd;
323{
324
325	panic("%s called", __FUNCTION__);
326}
327
328void
329softdep_setup_inofree(mp, bp, ino, wkhd)
330	struct mount *mp;
331	struct buf *bp;
332	ino_t ino;
333	struct workhead *wkhd;
334{
335
336	panic("%s called", __FUNCTION__);
337}
338
339void
340softdep_setup_unlink(dp, ip)
341	struct inode *dp;
342	struct inode *ip;
343{
344
345	panic("%s called", __FUNCTION__);
346}
347
348void
349softdep_setup_link(dp, ip)
350	struct inode *dp;
351	struct inode *ip;
352{
353
354	panic("%s called", __FUNCTION__);
355}
356
357void
358softdep_revert_link(dp, ip)
359	struct inode *dp;
360	struct inode *ip;
361{
362
363	panic("%s called", __FUNCTION__);
364}
365
366void
367softdep_setup_rmdir(dp, ip)
368	struct inode *dp;
369	struct inode *ip;
370{
371
372	panic("%s called", __FUNCTION__);
373}
374
375void
376softdep_revert_rmdir(dp, ip)
377	struct inode *dp;
378	struct inode *ip;
379{
380
381	panic("%s called", __FUNCTION__);
382}
383
384void
385softdep_setup_create(dp, ip)
386	struct inode *dp;
387	struct inode *ip;
388{
389
390	panic("%s called", __FUNCTION__);
391}
392
393void
394softdep_revert_create(dp, ip)
395	struct inode *dp;
396	struct inode *ip;
397{
398
399	panic("%s called", __FUNCTION__);
400}
401
402void
403softdep_setup_mkdir(dp, ip)
404	struct inode *dp;
405	struct inode *ip;
406{
407
408	panic("%s called", __FUNCTION__);
409}
410
411void
412softdep_revert_mkdir(dp, ip)
413	struct inode *dp;
414	struct inode *ip;
415{
416
417	panic("%s called", __FUNCTION__);
418}
419
420void
421softdep_setup_dotdot_link(dp, ip)
422	struct inode *dp;
423	struct inode *ip;
424{
425
426	panic("%s called", __FUNCTION__);
427}
428
429int
430softdep_prealloc(vp, waitok)
431	struct vnode *vp;
432	int waitok;
433{
434
435	panic("%s called", __FUNCTION__);
436}
437
438int
439softdep_journal_lookup(mp, vpp)
440	struct mount *mp;
441	struct vnode **vpp;
442{
443
444	return (ENOENT);
445}
446
447void
448softdep_change_linkcnt(ip)
449	struct inode *ip;
450{
451
452	panic("softdep_change_linkcnt called");
453}
454
455void
456softdep_load_inodeblock(ip)
457	struct inode *ip;
458{
459
460	panic("softdep_load_inodeblock called");
461}
462
463void
464softdep_update_inodeblock(ip, bp, waitfor)
465	struct inode *ip;
466	struct buf *bp;
467	int waitfor;
468{
469
470	panic("softdep_update_inodeblock called");
471}
472
473int
474softdep_fsync(vp)
475	struct vnode *vp;	/* the "in_core" copy of the inode */
476{
477
478	return (0);
479}
480
481void
482softdep_fsync_mountdev(vp)
483	struct vnode *vp;
484{
485
486	return;
487}
488
489int
490softdep_flushworklist(oldmnt, countp, td)
491	struct mount *oldmnt;
492	int *countp;
493	struct thread *td;
494{
495
496	*countp = 0;
497	return (0);
498}
499
500int
501softdep_sync_metadata(struct vnode *vp)
502{
503
504	panic("softdep_sync_metadata called");
505}
506
507int
508softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
509{
510
511	panic("softdep_sync_buf called");
512}
513
514int
515softdep_slowdown(vp)
516	struct vnode *vp;
517{
518
519	panic("softdep_slowdown called");
520}
521
522int
523softdep_request_cleanup(fs, vp, cred, resource)
524	struct fs *fs;
525	struct vnode *vp;
526	struct ucred *cred;
527	int resource;
528{
529
530	return (0);
531}
532
533int
534softdep_check_suspend(struct mount *mp,
535		      struct vnode *devvp,
536		      int softdep_depcnt,
537		      int softdep_accdepcnt,
538		      int secondary_writes,
539		      int secondary_accwrites)
540{
541	struct bufobj *bo;
542	int error;
543
544	(void) softdep_depcnt;
545	(void) softdep_accdepcnt;
546
547	bo = &devvp->v_bufobj;
548	ASSERT_BO_WLOCKED(bo);
549
550	MNT_ILOCK(mp);
551	while (mp->mnt_secondary_writes != 0) {
552		BO_UNLOCK(bo);
553		msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
554		    (PUSER - 1) | PDROP, "secwr", 0);
555		BO_LOCK(bo);
556		MNT_ILOCK(mp);
557	}
558
559	/*
560	 * Reasons for needing more work before suspend:
561	 * - Dirty buffers on devvp.
562	 * - Secondary writes occurred after start of vnode sync loop
563	 */
564	error = 0;
565	if (bo->bo_numoutput > 0 ||
566	    bo->bo_dirty.bv_cnt > 0 ||
567	    secondary_writes != 0 ||
568	    mp->mnt_secondary_writes != 0 ||
569	    secondary_accwrites != mp->mnt_secondary_accwrites)
570		error = EAGAIN;
571	BO_UNLOCK(bo);
572	return (error);
573}
574
575void
576softdep_get_depcounts(struct mount *mp,
577		      int *softdepactivep,
578		      int *softdepactiveaccp)
579{
580	(void) mp;
581	*softdepactivep = 0;
582	*softdepactiveaccp = 0;
583}
584
585void
586softdep_buf_append(bp, wkhd)
587	struct buf *bp;
588	struct workhead *wkhd;
589{
590
591	panic("softdep_buf_append called");
592}
593
594void
595softdep_inode_append(ip, cred, wkhd)
596	struct inode *ip;
597	struct ucred *cred;
598	struct workhead *wkhd;
599{
600
601	panic("softdep_inode_append called");
602}
603
604void
605softdep_freework(wkhd)
606	struct workhead *wkhd;
607{
608
609	panic("softdep_freework called");
610}
611
612int
613softdep_prerename(fdvp, fvp, tdvp, tvp)
614	struct vnode *fdvp;
615	struct vnode *fvp;
616	struct vnode *tdvp;
617	struct vnode *tvp;
618{
619
620	panic("softdep_prerename called");
621}
622
623int
624softdep_prelink(dvp, vp)
625	struct vnode *dvp;
626	struct vnode *vp;
627{
628
629	panic("softdep_prelink called");
630}
631
632#else
633
634FEATURE(softupdates, "FFS soft-updates support");
635
636static SYSCTL_NODE(_debug, OID_AUTO, softdep, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
637    "soft updates stats");
638static SYSCTL_NODE(_debug_softdep, OID_AUTO, total,
639    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
640    "total dependencies allocated");
641static SYSCTL_NODE(_debug_softdep, OID_AUTO, highuse,
642    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
643    "high use dependencies allocated");
644static SYSCTL_NODE(_debug_softdep, OID_AUTO, current,
645    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
646    "current dependencies allocated");
647static SYSCTL_NODE(_debug_softdep, OID_AUTO, write,
648    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
649    "current dependencies written");
650
651unsigned long dep_current[D_LAST + 1];
652unsigned long dep_highuse[D_LAST + 1];
653unsigned long dep_total[D_LAST + 1];
654unsigned long dep_write[D_LAST + 1];
655
656#define	SOFTDEP_TYPE(type, str, long)					\
657    static MALLOC_DEFINE(M_ ## type, #str, long);			\
658    SYSCTL_ULONG(_debug_softdep_total, OID_AUTO, str, CTLFLAG_RD,	\
659	&dep_total[D_ ## type], 0, "");					\
660    SYSCTL_ULONG(_debug_softdep_current, OID_AUTO, str, CTLFLAG_RD, 	\
661	&dep_current[D_ ## type], 0, "");				\
662    SYSCTL_ULONG(_debug_softdep_highuse, OID_AUTO, str, CTLFLAG_RD, 	\
663	&dep_highuse[D_ ## type], 0, "");				\
664    SYSCTL_ULONG(_debug_softdep_write, OID_AUTO, str, CTLFLAG_RD, 	\
665	&dep_write[D_ ## type], 0, "");
666
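/*
 * Each SOFTDEP_TYPE() invocation below defines the malloc type for one
 * dependency structure and exports its counters.  For example,
 * SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies") expands to
 * MALLOC_DEFINE(M_PAGEDEP, "pagedep", "File page dependencies") plus
 * read-only sysctls named "pagedep" under debug.softdep.total,
 * .current, .highuse and .write, each exporting the corresponding
 * dep_*[D_PAGEDEP] counter.
 */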
667SOFTDEP_TYPE(PAGEDEP, pagedep, "File page dependencies");
668SOFTDEP_TYPE(INODEDEP, inodedep, "Inode dependencies");
669SOFTDEP_TYPE(BMSAFEMAP, bmsafemap,
670    "Block or frag allocated from cyl group map");
671SOFTDEP_TYPE(NEWBLK, newblk, "New block or frag allocation dependency");
672SOFTDEP_TYPE(ALLOCDIRECT, allocdirect, "Block or frag dependency for an inode");
673SOFTDEP_TYPE(INDIRDEP, indirdep, "Indirect block dependencies");
674SOFTDEP_TYPE(ALLOCINDIR, allocindir, "Block dependency for an indirect block");
675SOFTDEP_TYPE(FREEFRAG, freefrag, "Previously used frag for an inode");
676SOFTDEP_TYPE(FREEBLKS, freeblks, "Blocks freed from an inode");
677SOFTDEP_TYPE(FREEFILE, freefile, "Inode deallocated");
678SOFTDEP_TYPE(DIRADD, diradd, "New directory entry");
679SOFTDEP_TYPE(MKDIR, mkdir, "New directory");
680SOFTDEP_TYPE(DIRREM, dirrem, "Directory entry deleted");
681SOFTDEP_TYPE(NEWDIRBLK, newdirblk, "Unclaimed new directory block");
682SOFTDEP_TYPE(FREEWORK, freework, "Free an inode block");
683SOFTDEP_TYPE(FREEDEP, freedep, "Track a block free");
684SOFTDEP_TYPE(JADDREF, jaddref, "Journal inode ref add");
685SOFTDEP_TYPE(JREMREF, jremref, "Journal inode ref remove");
686SOFTDEP_TYPE(JMVREF, jmvref, "Journal inode ref move");
687SOFTDEP_TYPE(JNEWBLK, jnewblk, "Journal new block");
688SOFTDEP_TYPE(JFREEBLK, jfreeblk, "Journal free block");
689SOFTDEP_TYPE(JFREEFRAG, jfreefrag, "Journal free frag");
690SOFTDEP_TYPE(JSEG, jseg, "Journal segment");
691SOFTDEP_TYPE(JSEGDEP, jsegdep, "Journal segment complete");
692SOFTDEP_TYPE(SBDEP, sbdep, "Superblock write dependency");
693SOFTDEP_TYPE(JTRUNC, jtrunc, "Journal inode truncation");
694SOFTDEP_TYPE(JFSYNC, jfsync, "Journal fsync complete");
695
696static MALLOC_DEFINE(M_SENTINEL, "sentinel", "Worklist sentinel");
697
698static MALLOC_DEFINE(M_SAVEDINO, "savedino", "Saved inodes");
699static MALLOC_DEFINE(M_JBLOCKS, "jblocks", "Journal block locations");
700static MALLOC_DEFINE(M_MOUNTDATA, "softdep", "Softdep per-mount data");
701
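/*
 * All soft updates allocations use M_WAITOK, so they sleep rather than
 * fail when memory is scarce.
 */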
702#define M_SOFTDEP_FLAGS	(M_WAITOK)
703
704/*
705 * translate from workitem type to memory type
706 * MUST match the defines above, such that memtype[D_XXX] == M_XXX
707 */
708static struct malloc_type *memtype[] = {
709	NULL,
710	M_PAGEDEP,
711	M_INODEDEP,
712	M_BMSAFEMAP,
713	M_NEWBLK,
714	M_ALLOCDIRECT,
715	M_INDIRDEP,
716	M_ALLOCINDIR,
717	M_FREEFRAG,
718	M_FREEBLKS,
719	M_FREEFILE,
720	M_DIRADD,
721	M_MKDIR,
722	M_DIRREM,
723	M_NEWDIRBLK,
724	M_FREEWORK,
725	M_FREEDEP,
726	M_JADDREF,
727	M_JREMREF,
728	M_JMVREF,
729	M_JNEWBLK,
730	M_JFREEBLK,
731	M_JFREEFRAG,
732	M_JSEG,
733	M_JSEGDEP,
734	M_SBDEP,
735	M_JTRUNC,
736	M_JFSYNC,
737	M_SENTINEL
738};
739
740#define DtoM(type) (memtype[type])
741
742/*
743 * Names of malloc types.
744 */
745#define TYPENAME(type)  \
746	((unsigned)(type) <= D_LAST && (unsigned)(type) >= D_FIRST ? \
747	memtype[type]->ks_shortdesc : "???")
748/*
749 * End system adaptation definitions.
750 */
751
752#define	DOTDOT_OFFSET	offsetof(struct dirtemplate, dotdot_ino)
753#define	DOT_OFFSET	offsetof(struct dirtemplate, dot_ino)
754
755/*
756 * Internal function prototypes.
757 */
758static	void check_clear_deps(struct mount *);
759static	void softdep_error(char *, int);
760static	int softdep_prerename_vnode(struct ufsmount *, struct vnode *);
761static	int softdep_process_worklist(struct mount *, int);
762static	int softdep_waitidle(struct mount *, int);
763static	void drain_output(struct vnode *);
764static	struct buf *getdirtybuf(struct buf *, struct rwlock *, int);
765static	int check_inodedep_free(struct inodedep *);
766static	void clear_remove(struct mount *);
767static	void clear_inodedeps(struct mount *);
768static	void unlinked_inodedep(struct mount *, struct inodedep *);
769static	void clear_unlinked_inodedep(struct inodedep *);
770static	struct inodedep *first_unlinked_inodedep(struct ufsmount *);
771static	int flush_pagedep_deps(struct vnode *, struct mount *,
772	    struct diraddhd *, struct buf *);
773static	int free_pagedep(struct pagedep *);
774static	int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
775static	int flush_inodedep_deps(struct vnode *, struct mount *, ino_t);
776static	int flush_deplist(struct allocdirectlst *, int, int *);
777static	int sync_cgs(struct mount *, int);
778static	int handle_written_filepage(struct pagedep *, struct buf *, int);
779static	int handle_written_sbdep(struct sbdep *, struct buf *);
780static	void initiate_write_sbdep(struct sbdep *);
781static	void diradd_inode_written(struct diradd *, struct inodedep *);
782static	int handle_written_indirdep(struct indirdep *, struct buf *,
783	    struct buf**, int);
784static	int handle_written_inodeblock(struct inodedep *, struct buf *, int);
785static	int jnewblk_rollforward(struct jnewblk *, struct fs *, struct cg *,
786	    uint8_t *);
787static	int handle_written_bmsafemap(struct bmsafemap *, struct buf *, int);
788static	void handle_written_jaddref(struct jaddref *);
789static	void handle_written_jremref(struct jremref *);
790static	void handle_written_jseg(struct jseg *, struct buf *);
791static	void handle_written_jnewblk(struct jnewblk *);
792static	void handle_written_jblkdep(struct jblkdep *);
793static	void handle_written_jfreefrag(struct jfreefrag *);
794static	void complete_jseg(struct jseg *);
795static	void complete_jsegs(struct jseg *);
796static	void jseg_write(struct ufsmount *ump, struct jseg *, uint8_t *);
797static	void jaddref_write(struct jaddref *, struct jseg *, uint8_t *);
798static	void jremref_write(struct jremref *, struct jseg *, uint8_t *);
799static	void jmvref_write(struct jmvref *, struct jseg *, uint8_t *);
800static	void jtrunc_write(struct jtrunc *, struct jseg *, uint8_t *);
801static	void jfsync_write(struct jfsync *, struct jseg *, uint8_t *data);
802static	void jnewblk_write(struct jnewblk *, struct jseg *, uint8_t *);
803static	void jfreeblk_write(struct jfreeblk *, struct jseg *, uint8_t *);
804static	void jfreefrag_write(struct jfreefrag *, struct jseg *, uint8_t *);
805static	inline void inoref_write(struct inoref *, struct jseg *,
806	    struct jrefrec *);
807static	void handle_allocdirect_partdone(struct allocdirect *,
808	    struct workhead *);
809static	struct jnewblk *cancel_newblk(struct newblk *, struct worklist *,
810	    struct workhead *);
811static	void indirdep_complete(struct indirdep *);
812static	int indirblk_lookup(struct mount *, ufs2_daddr_t);
813static	void indirblk_insert(struct freework *);
814static	void indirblk_remove(struct freework *);
815static	void handle_allocindir_partdone(struct allocindir *);
816static	void initiate_write_filepage(struct pagedep *, struct buf *);
817static	void initiate_write_indirdep(struct indirdep*, struct buf *);
818static	void handle_written_mkdir(struct mkdir *, int);
819static	int jnewblk_rollback(struct jnewblk *, struct fs *, struct cg *,
820	    uint8_t *);
821static	void initiate_write_bmsafemap(struct bmsafemap *, struct buf *);
822static	void initiate_write_inodeblock_ufs1(struct inodedep *, struct buf *);
823static	void initiate_write_inodeblock_ufs2(struct inodedep *, struct buf *);
824static	void handle_workitem_freefile(struct freefile *);
825static	int handle_workitem_remove(struct dirrem *, int);
826static	struct dirrem *newdirrem(struct buf *, struct inode *,
827	    struct inode *, int, struct dirrem **);
828static	struct indirdep *indirdep_lookup(struct mount *, struct inode *,
829	    struct buf *);
830static	void cancel_indirdep(struct indirdep *, struct buf *,
831	    struct freeblks *);
832static	void free_indirdep(struct indirdep *);
833static	void free_diradd(struct diradd *, struct workhead *);
834static	void merge_diradd(struct inodedep *, struct diradd *);
835static	void complete_diradd(struct diradd *);
836static	struct diradd *diradd_lookup(struct pagedep *, int);
837static	struct jremref *cancel_diradd_dotdot(struct inode *, struct dirrem *,
838	    struct jremref *);
839static	struct jremref *cancel_mkdir_dotdot(struct inode *, struct dirrem *,
840	    struct jremref *);
841static	void cancel_diradd(struct diradd *, struct dirrem *, struct jremref *,
842	    struct jremref *, struct jremref *);
843static	void dirrem_journal(struct dirrem *, struct jremref *, struct jremref *,
844	    struct jremref *);
845static	void cancel_allocindir(struct allocindir *, struct buf *bp,
846	    struct freeblks *, int);
847static	int setup_trunc_indir(struct freeblks *, struct inode *,
848	    ufs_lbn_t, ufs_lbn_t, ufs2_daddr_t);
849static	void complete_trunc_indir(struct freework *);
850static	void trunc_indirdep(struct indirdep *, struct freeblks *, struct buf *,
851	    int);
852static	void complete_mkdir(struct mkdir *);
853static	void free_newdirblk(struct newdirblk *);
854static	void free_jremref(struct jremref *);
855static	void free_jaddref(struct jaddref *);
856static	void free_jsegdep(struct jsegdep *);
857static	void free_jsegs(struct jblocks *);
858static	void rele_jseg(struct jseg *);
859static	void free_jseg(struct jseg *, struct jblocks *);
860static	void free_jnewblk(struct jnewblk *);
861static	void free_jblkdep(struct jblkdep *);
862static	void free_jfreefrag(struct jfreefrag *);
863static	void free_freedep(struct freedep *);
864static	void journal_jremref(struct dirrem *, struct jremref *,
865	    struct inodedep *);
866static	void cancel_jnewblk(struct jnewblk *, struct workhead *);
867static	int cancel_jaddref(struct jaddref *, struct inodedep *,
868	    struct workhead *);
869static	void cancel_jfreefrag(struct jfreefrag *);
870static	inline void setup_freedirect(struct freeblks *, struct inode *,
871	    int, int);
872static	inline void setup_freeext(struct freeblks *, struct inode *, int, int);
873static	inline void setup_freeindir(struct freeblks *, struct inode *, int,
874	    ufs_lbn_t, int);
875static	inline struct freeblks *newfreeblks(struct mount *, struct inode *);
876static	void freeblks_free(struct ufsmount *, struct freeblks *, int);
877static	void indir_trunc(struct freework *, ufs2_daddr_t, ufs_lbn_t);
878static	ufs2_daddr_t blkcount(struct fs *, ufs2_daddr_t, off_t);
879static	int trunc_check_buf(struct buf *, int *, ufs_lbn_t, int, int);
880static	void trunc_dependencies(struct inode *, struct freeblks *, ufs_lbn_t,
881	    int, int);
882static	void trunc_pages(struct inode *, off_t, ufs2_daddr_t, int);
883static 	int cancel_pagedep(struct pagedep *, struct freeblks *, int);
884static	int deallocate_dependencies(struct buf *, struct freeblks *, int);
885static	void newblk_freefrag(struct newblk*);
886static	void free_newblk(struct newblk *);
887static	void cancel_allocdirect(struct allocdirectlst *,
888	    struct allocdirect *, struct freeblks *);
889static	int check_inode_unwritten(struct inodedep *);
890static	int free_inodedep(struct inodedep *);
891static	void freework_freeblock(struct freework *, u_long);
892static	void freework_enqueue(struct freework *);
893static	int handle_workitem_freeblocks(struct freeblks *, int);
894static	int handle_complete_freeblocks(struct freeblks *, int);
895static	void handle_workitem_indirblk(struct freework *);
896static	void handle_written_freework(struct freework *);
897static	void merge_inode_lists(struct allocdirectlst *,struct allocdirectlst *);
898static	struct worklist *jnewblk_merge(struct worklist *, struct worklist *,
899	    struct workhead *);
900static	struct freefrag *setup_allocindir_phase2(struct buf *, struct inode *,
901	    struct inodedep *, struct allocindir *, ufs_lbn_t);
902static	struct allocindir *newallocindir(struct inode *, int, ufs2_daddr_t,
903	    ufs2_daddr_t, ufs_lbn_t);
904static	void handle_workitem_freefrag(struct freefrag *);
905static	struct freefrag *newfreefrag(struct inode *, ufs2_daddr_t, long,
906	    ufs_lbn_t, u_long);
907static	void allocdirect_merge(struct allocdirectlst *,
908	    struct allocdirect *, struct allocdirect *);
909static	struct freefrag *allocindir_merge(struct allocindir *,
910	    struct allocindir *);
911static	int bmsafemap_find(struct bmsafemap_hashhead *, int,
912	    struct bmsafemap **);
913static	struct bmsafemap *bmsafemap_lookup(struct mount *, struct buf *,
914	    int cg, struct bmsafemap *);
915static	int newblk_find(struct newblk_hashhead *, ufs2_daddr_t, int,
916	    struct newblk **);
917static	int newblk_lookup(struct mount *, ufs2_daddr_t, int, struct newblk **);
918static	int inodedep_find(struct inodedep_hashhead *, ino_t,
919	    struct inodedep **);
920static	int inodedep_lookup(struct mount *, ino_t, int, struct inodedep **);
921static	int pagedep_lookup(struct mount *, struct buf *bp, ino_t, ufs_lbn_t,
922	    int, struct pagedep **);
923static	int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t,
924	    struct pagedep **);
925static	void pause_timer(void *);
926static	int request_cleanup(struct mount *, int);
927static	int softdep_request_cleanup_flush(struct mount *, struct ufsmount *);
928static	void schedule_cleanup(struct mount *);
929static void softdep_ast_cleanup_proc(struct thread *);
930static struct ufsmount *softdep_bp_to_mp(struct buf *bp);
931static	int process_worklist_item(struct mount *, int, int);
932static	void process_removes(struct vnode *);
933static	void process_truncates(struct vnode *);
934static	void jwork_move(struct workhead *, struct workhead *);
935static	void jwork_insert(struct workhead *, struct jsegdep *);
936static	void add_to_worklist(struct worklist *, int);
937static	void wake_worklist(struct worklist *);
938static	void wait_worklist(struct worklist *, char *);
939static	void remove_from_worklist(struct worklist *);
940static	void softdep_flush(void *);
941static	void softdep_flushjournal(struct mount *);
942static	int softdep_speedup(struct ufsmount *);
943static	void worklist_speedup(struct mount *);
944static	int journal_mount(struct mount *, struct fs *, struct ucred *);
945static	void journal_unmount(struct ufsmount *);
946static	int journal_space(struct ufsmount *, int);
947static	void journal_suspend(struct ufsmount *);
948static	int journal_unsuspend(struct ufsmount *ump);
949static	void add_to_journal(struct worklist *);
950static	void remove_from_journal(struct worklist *);
951static	bool softdep_excess_items(struct ufsmount *, int);
952static	void softdep_process_journal(struct mount *, struct worklist *, int);
953static	struct jremref *newjremref(struct dirrem *, struct inode *,
954	    struct inode *ip, off_t, nlink_t);
955static	struct jaddref *newjaddref(struct inode *, ino_t, off_t, int16_t,
956	    uint16_t);
957static	inline void newinoref(struct inoref *, ino_t, ino_t, off_t, nlink_t,
958	    uint16_t);
959static	inline struct jsegdep *inoref_jseg(struct inoref *);
960static	struct jmvref *newjmvref(struct inode *, ino_t, off_t, off_t);
961static	struct jfreeblk *newjfreeblk(struct freeblks *, ufs_lbn_t,
962	    ufs2_daddr_t, int);
963static	void adjust_newfreework(struct freeblks *, int);
964static	struct jtrunc *newjtrunc(struct freeblks *, off_t, int);
965static	void move_newblock_dep(struct jaddref *, struct inodedep *);
966static	void cancel_jfreeblk(struct freeblks *, ufs2_daddr_t);
967static	struct jfreefrag *newjfreefrag(struct freefrag *, struct inode *,
968	    ufs2_daddr_t, long, ufs_lbn_t);
969static	struct freework *newfreework(struct ufsmount *, struct freeblks *,
970	    struct freework *, ufs_lbn_t, ufs2_daddr_t, int, int, int);
971static	int jwait(struct worklist *, int);
972static	struct inodedep *inodedep_lookup_ip(struct inode *);
973static	int bmsafemap_backgroundwrite(struct bmsafemap *, struct buf *);
974static	struct freefile *handle_bufwait(struct inodedep *, struct workhead *);
975static	void handle_jwork(struct workhead *);
976static	struct mkdir *setup_newdir(struct diradd *, ino_t, ino_t, struct buf *,
977	    struct mkdir **);
978static	struct jblocks *jblocks_create(void);
979static	ufs2_daddr_t jblocks_alloc(struct jblocks *, int, int *);
980static	void jblocks_free(struct jblocks *, struct mount *, int);
981static	void jblocks_destroy(struct jblocks *);
982static	void jblocks_add(struct jblocks *, ufs2_daddr_t, int);
983
984/*
985 * Exported softdep operations.
986 */
987static	void softdep_disk_io_initiation(struct buf *);
988static	void softdep_disk_write_complete(struct buf *);
989static	void softdep_deallocate_dependencies(struct buf *);
990static	int softdep_count_dependencies(struct buf *bp, int);
991
992/*
993 * Global lock over all of soft updates.
994 */
995static struct mtx lk;
996MTX_SYSINIT(softdep_lock, &lk, "global softdep", MTX_DEF);
997
998#define ACQUIRE_GBLLOCK(lk)	mtx_lock(lk)
999#define FREE_GBLLOCK(lk)	mtx_unlock(lk)
1000#define GBLLOCK_OWNED(lk)	mtx_assert((lk), MA_OWNED)
1001
1002/*
1003 * Per-filesystem soft-updates locking.
1004 */
1005#define LOCK_PTR(ump)		(&(ump)->um_softdep->sd_fslock)
1006#define TRY_ACQUIRE_LOCK(ump)	rw_try_wlock(&(ump)->um_softdep->sd_fslock)
1007#define ACQUIRE_LOCK(ump)	rw_wlock(&(ump)->um_softdep->sd_fslock)
1008#define FREE_LOCK(ump)		rw_wunlock(&(ump)->um_softdep->sd_fslock)
1009#define LOCK_OWNED(ump)		rw_assert(&(ump)->um_softdep->sd_fslock, \
1010				    RA_WLOCKED)
1011
1012#define	BUF_AREC(bp)		lockallowrecurse(&(bp)->b_lock)
1013#define	BUF_NOREC(bp)		lockdisablerecurse(&(bp)->b_lock)
1014
1015/*
1016 * Worklist queue management.
1017 * These routines require that the lock be held.
1018 */
1019#ifndef /* NOT */ INVARIANTS
1020#define WORKLIST_INSERT(head, item) do {	\
1021	(item)->wk_state |= ONWORKLIST;		\
1022	LIST_INSERT_HEAD(head, item, wk_list);	\
1023} while (0)
1024#define WORKLIST_REMOVE(item) do {		\
1025	(item)->wk_state &= ~ONWORKLIST;	\
1026	LIST_REMOVE(item, wk_list);		\
1027} while (0)
1028#define WORKLIST_INSERT_UNLOCKED	WORKLIST_INSERT
1029#define WORKLIST_REMOVE_UNLOCKED	WORKLIST_REMOVE
1030
1031#else /* INVARIANTS */
1032static	void worklist_insert(struct workhead *, struct worklist *, int,
1033	const char *, int);
1034static	void worklist_remove(struct worklist *, int, const char *, int);
1035
1036#define WORKLIST_INSERT(head, item) \
1037	worklist_insert(head, item, 1, __func__, __LINE__)
1038#define WORKLIST_INSERT_UNLOCKED(head, item)\
1039	worklist_insert(head, item, 0, __func__, __LINE__)
1040#define WORKLIST_REMOVE(item)\
1041	worklist_remove(item, 1, __func__, __LINE__)
1042#define WORKLIST_REMOVE_UNLOCKED(item)\
1043	worklist_remove(item, 0, __func__, __LINE__)
1044
1045static void
1046worklist_insert(head, item, locked, func, line)
1047	struct workhead *head;
1048	struct worklist *item;
1049	int locked;
1050	const char *func;
1051	int line;
1052{
1053
1054	if (locked)
1055		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1056	if (item->wk_state & ONWORKLIST)
1057		panic("worklist_insert: %p %s(0x%X) already on list, "
1058		    "added in function %s at line %d",
1059		    item, TYPENAME(item->wk_type), item->wk_state,
1060		    item->wk_func, item->wk_line);
1061	item->wk_state |= ONWORKLIST;
1062	item->wk_func = func;
1063	item->wk_line = line;
1064	LIST_INSERT_HEAD(head, item, wk_list);
1065}
1066
1067static void
1068worklist_remove(item, locked, func, line)
1069	struct worklist *item;
1070	int locked;
1071	const char *func;
1072	int line;
1073{
1074
1075	if (locked)
1076		LOCK_OWNED(VFSTOUFS(item->wk_mp));
1077	if ((item->wk_state & ONWORKLIST) == 0)
1078		panic("worklist_remove: %p %s(0x%X) not on list, "
1079		    "removed in function %s at line %d",
1080		    item, TYPENAME(item->wk_type), item->wk_state,
1081		    item->wk_func, item->wk_line);
1082	item->wk_state &= ~ONWORKLIST;
1083	item->wk_func = func;
1084	item->wk_line = line;
1085	LIST_REMOVE(item, wk_list);
1086}
1087#endif /* INVARIANTS */
1088
1089/*
1090 * Merge two jsegdeps, keeping only the older one, as newer references
1091 * cannot be discarded until after the older ones have been.
1092 */
1093static inline struct jsegdep *
1094jsegdep_merge(struct jsegdep *one, struct jsegdep *two)
1095{
1096	struct jsegdep *swp;
1097
1098	if (two == NULL)
1099		return (one);
1100
1101	if (one->jd_seg->js_seq > two->jd_seg->js_seq) {
1102		swp = one;
1103		one = two;
1104		two = swp;
1105	}
1106	WORKLIST_REMOVE(&two->jd_list);
1107	free_jsegdep(two);
1108
1109	return (one);
1110}
1111
1112/*
1113 * If two freedeps are compatible, free one to reduce the list size.
1114 */
1115static inline struct freedep *
1116freedep_merge(struct freedep *one, struct freedep *two)
1117{
1118	if (two == NULL)
1119		return (one);
1120
1121	if (one->fd_freework == two->fd_freework) {
1122		WORKLIST_REMOVE(&two->fd_list);
1123		free_freedep(two);
1124	}
1125	return (one);
1126}
1127
1128/*
1129 * Move journal work from one list to another.  Duplicate freedeps and
1130 * jsegdeps are coalesced to keep the lists as small as possible.
1131 */
1132static void
1133jwork_move(dst, src)
1134	struct workhead *dst;
1135	struct workhead *src;
1136{
1137	struct freedep *freedep;
1138	struct jsegdep *jsegdep;
1139	struct worklist *wkn;
1140	struct worklist *wk;
1141
1142	KASSERT(dst != src,
1143	    ("jwork_move: dst == src"));
1144	freedep = NULL;
1145	jsegdep = NULL;
1146	LIST_FOREACH_SAFE(wk, dst, wk_list, wkn) {
1147		if (wk->wk_type == D_JSEGDEP)
1148			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1149		else if (wk->wk_type == D_FREEDEP)
1150			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1151	}
1152
1153	while ((wk = LIST_FIRST(src)) != NULL) {
1154		WORKLIST_REMOVE(wk);
1155		WORKLIST_INSERT(dst, wk);
1156		if (wk->wk_type == D_JSEGDEP) {
1157			jsegdep = jsegdep_merge(WK_JSEGDEP(wk), jsegdep);
1158			continue;
1159		}
1160		if (wk->wk_type == D_FREEDEP)
1161			freedep = freedep_merge(WK_FREEDEP(wk), freedep);
1162	}
1163}
1164
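/*
 * Insert a single jsegdep onto a work list.  If the list already holds
 * a jsegdep, keep only the one referencing the older journal segment
 * and free the other.
 */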
1165static void
1166jwork_insert(dst, jsegdep)
1167	struct workhead *dst;
1168	struct jsegdep *jsegdep;
1169{
1170	struct jsegdep *jsegdepn;
1171	struct worklist *wk;
1172
1173	LIST_FOREACH(wk, dst, wk_list)
1174		if (wk->wk_type == D_JSEGDEP)
1175			break;
1176	if (wk == NULL) {
1177		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1178		return;
1179	}
1180	jsegdepn = WK_JSEGDEP(wk);
1181	if (jsegdep->jd_seg->js_seq < jsegdepn->jd_seg->js_seq) {
1182		WORKLIST_REMOVE(wk);
1183		free_jsegdep(jsegdepn);
1184		WORKLIST_INSERT(dst, &jsegdep->jd_list);
1185	} else
1186		free_jsegdep(jsegdep);
1187}
1188
1189/*
1190 * Routines for tracking and managing workitems.
1191 */
1192static	void workitem_free(struct worklist *, int);
1193static	void workitem_alloc(struct worklist *, int, struct mount *);
1194static	void workitem_reassign(struct worklist *, int);
1195
1196#define	WORKITEM_FREE(item, type) \
1197	workitem_free((struct worklist *)(item), (type))
1198#define	WORKITEM_REASSIGN(item, type) \
1199	workitem_reassign((struct worklist *)(item), (type))
1200
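/*
 * Release a work item: update the global and per-mount dependency
 * accounting, wake up any thread waiting on the item or for the
 * per-mount dependency count to drain, and free the memory.
 */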
1201static void
1202workitem_free(item, type)
1203	struct worklist *item;
1204	int type;
1205{
1206	struct ufsmount *ump;
1207
1208#ifdef INVARIANTS
1209	if (item->wk_state & ONWORKLIST)
1210		panic("workitem_free: %s(0x%X) still on list, "
1211		    "added in function %s at line %d",
1212		    TYPENAME(item->wk_type), item->wk_state,
1213		    item->wk_func, item->wk_line);
1214	if (item->wk_type != type && type != D_NEWBLK)
1215		panic("workitem_free: type mismatch %s != %s",
1216		    TYPENAME(item->wk_type), TYPENAME(type));
1217#endif
1218	if (item->wk_state & IOWAITING)
1219		wakeup(item);
1220	ump = VFSTOUFS(item->wk_mp);
1221	LOCK_OWNED(ump);
1222	KASSERT(ump->softdep_deps > 0,
1223	    ("workitem_free: %s: softdep_deps going negative",
1224	    ump->um_fs->fs_fsmnt));
1225	if (--ump->softdep_deps == 0 && ump->softdep_req)
1226		wakeup(&ump->softdep_deps);
1227	KASSERT(dep_current[item->wk_type] > 0,
1228	    ("workitem_free: %s: dep_current[%s] going negative",
1229	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1230	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1231	    ("workitem_free: %s: softdep_curdeps[%s] going negative",
1232	    ump->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1233	atomic_subtract_long(&dep_current[item->wk_type], 1);
1234	ump->softdep_curdeps[item->wk_type] -= 1;
1235#ifdef INVARIANTS
1236	LIST_REMOVE(item, wk_all);
1237#endif
1238	free(item, DtoM(type));
1239}
1240
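/*
 * Initialize a work item of the given type for mount mp and charge it
 * to the global and per-mount dependency statistics.
 */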
1241static void
1242workitem_alloc(item, type, mp)
1243	struct worklist *item;
1244	int type;
1245	struct mount *mp;
1246{
1247	struct ufsmount *ump;
1248
1249	item->wk_type = type;
1250	item->wk_mp = mp;
1251	item->wk_state = 0;
1252
1253	ump = VFSTOUFS(mp);
1254	ACQUIRE_GBLLOCK(&lk);
1255	dep_current[type]++;
1256	if (dep_current[type] > dep_highuse[type])
1257		dep_highuse[type] = dep_current[type];
1258	dep_total[type]++;
1259	FREE_GBLLOCK(&lk);
1260	ACQUIRE_LOCK(ump);
1261	ump->softdep_curdeps[type] += 1;
1262	ump->softdep_deps++;
1263	ump->softdep_accdeps++;
1264#ifdef INVARIANTS
1265	LIST_INSERT_HEAD(&ump->softdep_alldeps[type], item, wk_all);
1266#endif
1267	FREE_LOCK(ump);
1268}
1269
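/*
 * Change the type of an allocated work item, moving its accounting
 * from the old type to the new one.
 */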
1270static void
1271workitem_reassign(item, newtype)
1272	struct worklist *item;
1273	int newtype;
1274{
1275	struct ufsmount *ump;
1276
1277	ump = VFSTOUFS(item->wk_mp);
1278	LOCK_OWNED(ump);
1279	KASSERT(ump->softdep_curdeps[item->wk_type] > 0,
1280	    ("workitem_reassign: %s: softdep_curdeps[%s] going negative",
1281	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1282	ump->softdep_curdeps[item->wk_type] -= 1;
1283	ump->softdep_curdeps[newtype] += 1;
1284	KASSERT(dep_current[item->wk_type] > 0,
1285	    ("workitem_reassign: %s: dep_current[%s] going negative",
1286	    VFSTOUFS(item->wk_mp)->um_fs->fs_fsmnt, TYPENAME(item->wk_type)));
1287	ACQUIRE_GBLLOCK(&lk);
1288	dep_current[newtype]++;
1289	dep_current[item->wk_type]--;
1290	if (dep_current[newtype] > dep_highuse[newtype])
1291		dep_highuse[newtype] = dep_current[newtype];
1292	dep_total[newtype]++;
1293	FREE_GBLLOCK(&lk);
1294	item->wk_type = newtype;
1295}
1296
1297/*
1298 * Workitem queue management
1299 */
1300static int max_softdeps;	/* maximum number of structs before slowdown */
1301static int tickdelay = 2;	/* number of ticks to pause during slowdown */
1302static int proc_waiting;	/* tracks whether we have a timeout posted */
1303static int *stat_countp;	/* statistic to count in proc_waiting timeout */
1304static struct callout softdep_callout;
1305static int req_clear_inodedeps;	/* syncer process flush some inodedeps */
1306static int req_clear_remove;	/* syncer process flush some freeblks */
1307static int softdep_flushcache = 0; /* Should we do BIO_FLUSH? */
1308
1309/*
1310 * runtime statistics
1311 */
1312static int stat_flush_threads;	/* number of softdep flushing threads */
1313static int stat_worklist_push;	/* number of worklist cleanups */
1314static int stat_delayed_inact;	/* number of delayed inactivation cleanups */
1315static int stat_blk_limit_push;	/* number of times block limit neared */
1316static int stat_ino_limit_push;	/* number of times inode limit neared */
1317static int stat_blk_limit_hit;	/* number of times block slowdown imposed */
1318static int stat_ino_limit_hit;	/* number of times inode slowdown imposed */
1319static int stat_sync_limit_hit;	/* number of synchronous slowdowns imposed */
1320static int stat_indir_blk_ptrs;	/* bufs redirtied as indir ptrs not written */
1321static int stat_inode_bitmap;	/* bufs redirtied as inode bitmap not written */
1322static int stat_direct_blk_ptrs;/* bufs redirtied as direct ptrs not written */
1323static int stat_dir_entry;	/* bufs redirtied as dir entry cannot write */
1324static int stat_jaddref;	/* bufs redirtied as ino bitmap can not write */
1325static int stat_jnewblk;	/* bufs redirtied as blk bitmap can not write */
1326static int stat_journal_min;	/* Times hit journal min threshold */
1327static int stat_journal_low;	/* Times hit journal low threshold */
1328static int stat_journal_wait;	/* Times blocked in jwait(). */
1329static int stat_jwait_filepage;	/* Times blocked in jwait() for filepage. */
1330static int stat_jwait_freeblks;	/* Times blocked in jwait() for freeblks. */
1331static int stat_jwait_inode;	/* Times blocked in jwait() for inodes. */
1332static int stat_jwait_newblk;	/* Times blocked in jwait() for newblks. */
1333static int stat_cleanup_high_delay; /* Maximum cleanup delay (in ticks) */
1334static int stat_cleanup_blkrequests; /* Number of block cleanup requests */
1335static int stat_cleanup_inorequests; /* Number of inode cleanup requests */
1336static int stat_cleanup_retries; /* Number of cleanups that needed to flush */
1337static int stat_cleanup_failures; /* Number of cleanup requests that failed */
1338static int stat_emptyjblocks; /* Number of potentially empty journal blocks */
1339
1340SYSCTL_INT(_debug_softdep, OID_AUTO, max_softdeps, CTLFLAG_RW,
1341    &max_softdeps, 0, "");
1342SYSCTL_INT(_debug_softdep, OID_AUTO, tickdelay, CTLFLAG_RW,
1343    &tickdelay, 0, "");
1344SYSCTL_INT(_debug_softdep, OID_AUTO, flush_threads, CTLFLAG_RD,
1345    &stat_flush_threads, 0, "");
1346SYSCTL_INT(_debug_softdep, OID_AUTO, worklist_push,
1347    CTLFLAG_RW | CTLFLAG_STATS, &stat_worklist_push, 0,"");
1348SYSCTL_INT(_debug_softdep, OID_AUTO, delayed_inactivations, CTLFLAG_RD,
1349    &stat_delayed_inact, 0, "");
1350SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_push,
1351    CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_push, 0,"");
1352SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_push,
1353    CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_push, 0,"");
1354SYSCTL_INT(_debug_softdep, OID_AUTO, blk_limit_hit,
1355    CTLFLAG_RW | CTLFLAG_STATS, &stat_blk_limit_hit, 0, "");
1356SYSCTL_INT(_debug_softdep, OID_AUTO, ino_limit_hit,
1357    CTLFLAG_RW | CTLFLAG_STATS, &stat_ino_limit_hit, 0, "");
1358SYSCTL_INT(_debug_softdep, OID_AUTO, sync_limit_hit,
1359    CTLFLAG_RW | CTLFLAG_STATS, &stat_sync_limit_hit, 0, "");
1360SYSCTL_INT(_debug_softdep, OID_AUTO, indir_blk_ptrs,
1361    CTLFLAG_RW | CTLFLAG_STATS, &stat_indir_blk_ptrs, 0, "");
1362SYSCTL_INT(_debug_softdep, OID_AUTO, inode_bitmap,
1363    CTLFLAG_RW | CTLFLAG_STATS, &stat_inode_bitmap, 0, "");
1364SYSCTL_INT(_debug_softdep, OID_AUTO, direct_blk_ptrs,
1365    CTLFLAG_RW | CTLFLAG_STATS, &stat_direct_blk_ptrs, 0, "");
1366SYSCTL_INT(_debug_softdep, OID_AUTO, dir_entry,
1367    CTLFLAG_RW | CTLFLAG_STATS, &stat_dir_entry, 0, "");
1368SYSCTL_INT(_debug_softdep, OID_AUTO, jaddref_rollback,
1369    CTLFLAG_RW | CTLFLAG_STATS, &stat_jaddref, 0, "");
1370SYSCTL_INT(_debug_softdep, OID_AUTO, jnewblk_rollback,
1371    CTLFLAG_RW | CTLFLAG_STATS, &stat_jnewblk, 0, "");
1372SYSCTL_INT(_debug_softdep, OID_AUTO, journal_low,
1373    CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_low, 0, "");
1374SYSCTL_INT(_debug_softdep, OID_AUTO, journal_min,
1375    CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_min, 0, "");
1376SYSCTL_INT(_debug_softdep, OID_AUTO, journal_wait,
1377    CTLFLAG_RW | CTLFLAG_STATS, &stat_journal_wait, 0, "");
1378SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_filepage,
1379    CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_filepage, 0, "");
1380SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_freeblks,
1381    CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_freeblks, 0, "");
1382SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_inode,
1383    CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_inode, 0, "");
1384SYSCTL_INT(_debug_softdep, OID_AUTO, jwait_newblk,
1385    CTLFLAG_RW | CTLFLAG_STATS, &stat_jwait_newblk, 0, "");
1386SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_blkrequests,
1387    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_blkrequests, 0, "");
1388SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_inorequests,
1389    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_inorequests, 0, "");
1390SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_high_delay,
1391    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_high_delay, 0, "");
1392SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_retries,
1393    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_retries, 0, "");
1394SYSCTL_INT(_debug_softdep, OID_AUTO, cleanup_failures,
1395    CTLFLAG_RW | CTLFLAG_STATS, &stat_cleanup_failures, 0, "");
1396
1397SYSCTL_INT(_debug_softdep, OID_AUTO, flushcache, CTLFLAG_RW,
1398    &softdep_flushcache, 0, "");
1399SYSCTL_INT(_debug_softdep, OID_AUTO, emptyjblocks, CTLFLAG_RD,
1400    &stat_emptyjblocks, 0, "");
1401
1402SYSCTL_DECL(_vfs_ffs);
1403
1404/* Whether to recompute the summary at mount time */
1405static int compute_summary_at_mount = 0;
1406SYSCTL_INT(_vfs_ffs, OID_AUTO, compute_summary_at_mount, CTLFLAG_RW,
1407	   &compute_summary_at_mount, 0, "Recompute summary at mount");
1408static int print_threads = 0;
1409SYSCTL_INT(_debug_softdep, OID_AUTO, print_threads, CTLFLAG_RW,
1410    &print_threads, 0, "Notify flusher thread start/stop");
1411
1412/* List of all filesystems mounted with soft updates */
1413static TAILQ_HEAD(, mount_softdeps) softdepmounts;
1414
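/*
 * Helper for get_parent_vp() below: requeue any dependencies still on
 * the unfinished list back onto diraddhdp, then clear the scan flag and
 * release the buffer before the caller blocks waiting for the parent
 * vnode.
 */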
1415static void
1416get_parent_vp_unlock_bp(struct mount *mp, struct buf *bp,
1417    struct diraddhd *diraddhdp, struct diraddhd *unfinishedp)
1418{
1419	struct diradd *dap;
1420
1421	/*
1422	 * Requeue unfinished dependencies before
1423	 * unlocking the buffer, which could make
1424	 * diraddhdp invalid.
1425	 */
1426	ACQUIRE_LOCK(VFSTOUFS(mp));
1427	while ((dap = LIST_FIRST(unfinishedp)) != NULL) {
1428		LIST_REMOVE(dap, da_pdlist);
1429		LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
1430	}
1431	FREE_LOCK(VFSTOUFS(mp));
1432
1433	bp->b_vflags &= ~BV_SCANNED;
1434	BUF_NOREC(bp);
1435	BUF_UNLOCK(bp);
1436}
1437
1438/*
1439 * This function fetches inode inum on mount point mp.  We already
1440 * hold a locked vnode vp, and might have a locked buffer bp belonging
1441 * to vp.
1442 *
1443 * We must not block on acquiring the new inode lock as we will get
1444 * into a lock-order reversal with the buffer lock and possibly get a
1445 * deadlock.  Thus if we cannot instantiate the requested vnode
1446 * without sleeping on its lock, we must unlock the vnode and the
1447 * buffer before blocking on the vnode lock.  We return
1448 * ERELOOKUP if we have had to unlock either the vnode or the buffer so
1449 * that the caller can reassess its state.
1450 *
1451 * Top-level VFS code (for syscalls and other consumers, e.g. callers
1452 * of VOP_FSYNC() in the syncer) checks for ERELOOKUP and restarts at a
1453 * safe point.
1454 *
1455 * Since callers expect to operate on a fully constructed vnode, we
1456 * also recheck v_data after relocking, and return ENOENT if it is NULL.
1457 *
1458 * If we unlock bp, we must undo the dequeueing of its unfinished
1459 * dependencies and clear the scan flag before unlocking.  If we unlock
1460 * vp while it is under deactivation, we requeue the deactivation.
1461 */
1462static int
1463get_parent_vp(struct vnode *vp, struct mount *mp, ino_t inum, struct buf *bp,
1464    struct diraddhd *diraddhdp, struct diraddhd *unfinishedp,
1465    struct vnode **rvp)
1466{
1467	struct vnode *pvp;
1468	int error;
1469	bool bplocked;
1470
1471	ASSERT_VOP_ELOCKED(vp, "child vnode must be locked");
1472	for (bplocked = true, pvp = NULL;;) {
1473		error = ffs_vgetf(mp, inum, LK_EXCLUSIVE | LK_NOWAIT, &pvp,
1474		    FFSV_FORCEINSMQ);
1475		if (error == 0) {
1476			/*
1477			 * Since we could have unlocked vp, the inode
1478			 * number could no longer indicate a
1479			 * constructed node.  In this case, we must
1480			 * restart the syscall.
1481			 */
1482			if (VTOI(pvp)->i_mode == 0 || !bplocked) {
1483				if (bp != NULL && bplocked)
1484					get_parent_vp_unlock_bp(mp, bp,
1485					    diraddhdp, unfinishedp);
1486				if (VTOI(pvp)->i_mode == 0)
1487					vgone(pvp);
1488				error = ERELOOKUP;
1489				goto out2;
1490			}
1491			goto out1;
1492		}
1493		if (bp != NULL && bplocked) {
1494			get_parent_vp_unlock_bp(mp, bp, diraddhdp, unfinishedp);
1495			bplocked = false;
1496		}
1497
1498		/*
1499		 * Do not drop vnode lock while inactivating during
1500		 * vunref.  This would result in leaked VI flags and
1501		 * reclamation of a non-truncated vnode.  Instead,
1502		 * reschedule inactivation in the hope that we will be
1503		 * able to sync the inode later.
1504		 */
1505		if ((vp->v_iflag & VI_DOINGINACT) != 0 &&
1506		    (vp->v_vflag & VV_UNREF) != 0) {
1507			VI_LOCK(vp);
1508			vp->v_iflag |= VI_OWEINACT;
1509			VI_UNLOCK(vp);
1510			return (ERELOOKUP);
1511		}
1512
1513		VOP_UNLOCK(vp);
1514		error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &pvp,
1515		    FFSV_FORCEINSMQ);
1516		if (error != 0) {
1517			MPASS(error != ERELOOKUP);
1518			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1519			break;
1520		}
1521		if (VTOI(pvp)->i_mode == 0) {
1522			vgone(pvp);
1523			vput(pvp);
1524			pvp = NULL;
1525			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1526			error = ERELOOKUP;
1527			break;
1528		}
1529		error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
1530		if (error == 0)
1531			break;
1532		vput(pvp);
1533		pvp = NULL;
1534		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
1535		if (vp->v_data == NULL) {
1536			error = ENOENT;
1537			break;
1538		}
1539	}
1540	if (bp != NULL) {
1541		MPASS(!bplocked);
1542		error = ERELOOKUP;
1543	}
1544out2:
1545	if (error != 0 && pvp != NULL) {
1546		vput(pvp);
1547		pvp = NULL;
1548	}
1549out1:
1550	*rvp = pvp;
1551	ASSERT_VOP_ELOCKED(vp, "child vnode must be locked on return");
1552	return (error);
1553}
1554
1555/*
1556 * This function cleans the worklist for a filesystem.
1557 * Each filesystem running with soft dependencies gets its own
1558 * thread to run in this function. The thread is started up in
1559 * softdep_mount and shut down in softdep_unmount. They show up
1560 * as part of the kernel "bufdaemon" process whose process
1561 * entry is available in bufdaemonproc.
1562 */
1563static int searchfailed;
1564extern struct proc *bufdaemonproc;
1565static void
1566softdep_flush(addr)
1567	void *addr;
1568{
1569	struct mount *mp;
1570	struct thread *td;
1571	struct ufsmount *ump;
1572
1573	td = curthread;
1574	td->td_pflags |= TDP_NORUNNINGBUF;
1575	mp = (struct mount *)addr;
1576	ump = VFSTOUFS(mp);
1577	atomic_add_int(&stat_flush_threads, 1);
1578	ACQUIRE_LOCK(ump);
1579	ump->softdep_flags &= ~FLUSH_STARTING;
1580	wakeup(&ump->softdep_flushtd);
1581	FREE_LOCK(ump);
1582	if (print_threads) {
1583		if (stat_flush_threads == 1)
1584			printf("Running %s at pid %d\n", bufdaemonproc->p_comm,
1585			    bufdaemonproc->p_pid);
1586		printf("Start thread %s\n", td->td_name);
1587	}
1588	for (;;) {
1589		while (softdep_process_worklist(mp, 0) > 0 ||
1590		    (MOUNTEDSUJ(mp) &&
1591		    VFSTOUFS(mp)->softdep_jblocks->jb_suspended))
1592			kthread_suspend_check();
1593		ACQUIRE_LOCK(ump);
1594		if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1595			msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM,
1596			    "sdflush", hz / 2);
1597		ump->softdep_flags &= ~FLUSH_CLEANUP;
1598		/*
1599		 * Check to see if we are done and need to exit.
1600		 */
1601		if ((ump->softdep_flags & FLUSH_EXIT) == 0) {
1602			FREE_LOCK(ump);
1603			continue;
1604		}
1605		ump->softdep_flags &= ~FLUSH_EXIT;
1606		FREE_LOCK(ump);
1607		wakeup(&ump->softdep_flags);
1608		if (print_threads)
1609			printf("Stop thread %s: searchfailed %d, did cleanups %d\n", td->td_name, searchfailed, ump->um_softdep->sd_cleanups);
1610		atomic_subtract_int(&stat_flush_threads, 1);
1611		kthread_exit();
1612		panic("kthread_exit failed");
1613	}
1614}
1615
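/*
 * Request a run of the per-mount softdep flusher thread.  The
 * per-filesystem softdep lock must be held.
 */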
1616static void
1617worklist_speedup(mp)
1618	struct mount *mp;
1619{
1620	struct ufsmount *ump;
1621
1622	ump = VFSTOUFS(mp);
1623	LOCK_OWNED(ump);
1624	if ((ump->softdep_flags & (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1625		ump->softdep_flags |= FLUSH_CLEANUP;
1626	wakeup(&ump->softdep_flushtd);
1627}
1628
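/*
 * If the underlying device supports it (UM_CANSPEEDUP), issue a
 * synchronous BIO_SPEEDUP request asking the lower layers to hurry
 * along roughly "shortage" bytes worth of cleanup.
 */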
1629static void
1630softdep_send_speedup(struct ufsmount *ump, off_t shortage, u_int flags)
1631{
1632	struct buf *bp;
1633
1634	if ((ump->um_flags & UM_CANSPEEDUP) == 0)
1635		return;
1636
1637	bp = malloc(sizeof(*bp), M_TRIM, M_WAITOK | M_ZERO);
1638	bp->b_iocmd = BIO_SPEEDUP;
1639	bp->b_ioflags = flags;
1640	bp->b_bcount = omin(shortage, LONG_MAX);
1641	g_vfs_strategy(ump->um_bo, bp);
1642	bufwait(bp);
1643	free(bp, M_TRIM);
1644}
1645
1646static int
1647softdep_speedup(ump)
1648	struct ufsmount *ump;
1649{
1650	struct ufsmount *altump;
1651	struct mount_softdeps *sdp;
1652
1653	LOCK_OWNED(ump);
1654	worklist_speedup(ump->um_mountp);
1655	bd_speedup();
1656	/*
1657	 * If we have global shortages, then we need other
1658	 * filesystems to help with the cleanup. Here we wake up a
1659	 * flusher thread for a filesystem that is over its fair
1660	 * share of resources.
1661	 */
1662	if (req_clear_inodedeps || req_clear_remove) {
1663		ACQUIRE_GBLLOCK(&lk);
1664		TAILQ_FOREACH(sdp, &softdepmounts, sd_next) {
1665			if ((altump = sdp->sd_ump) == ump)
1666				continue;
1667			if (((req_clear_inodedeps &&
1668			    altump->softdep_curdeps[D_INODEDEP] >
1669			    max_softdeps / stat_flush_threads) ||
1670			    (req_clear_remove &&
1671			    altump->softdep_curdeps[D_DIRREM] >
1672			    (max_softdeps / 2) / stat_flush_threads)) &&
1673			    TRY_ACQUIRE_LOCK(altump))
1674				break;
1675		}
1676		if (sdp == NULL) {
1677			searchfailed++;
1678			FREE_GBLLOCK(&lk);
1679		} else {
1680			/*
1681			 * Move to the end of the list so we pick a
1682			 * different one on our next try.
1683			 */
1684			TAILQ_REMOVE(&softdepmounts, sdp, sd_next);
1685			TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
1686			FREE_GBLLOCK(&lk);
1687			if ((altump->softdep_flags &
1688			    (FLUSH_CLEANUP | FLUSH_EXIT)) == 0)
1689				altump->softdep_flags |= FLUSH_CLEANUP;
1690			altump->um_softdep->sd_cleanups++;
1691			wakeup(&altump->softdep_flushtd);
1692			FREE_LOCK(altump);
1693		}
1694	}
1695	return (speedup_syncer());
1696}
1697
1698/*
1699 * Add an item to the end of the work queue.
1700 * This routine requires that the lock be held.
1701 * This is the only routine that adds items to the list.
1702 * The following routine is the only one that removes items
1703 * and does so in order from first to last.
1704 */
1705
1706#define	WK_HEAD		0x0001	/* Add to HEAD. */
1707#define	WK_NODELAY	0x0002	/* Process immediately. */
1708
1709static void
1710add_to_worklist(wk, flags)
1711	struct worklist *wk;
1712	int flags;
1713{
1714	struct ufsmount *ump;
1715
1716	ump = VFSTOUFS(wk->wk_mp);
1717	LOCK_OWNED(ump);
1718	if (wk->wk_state & ONWORKLIST)
1719		panic("add_to_worklist: %s(0x%X) already on list",
1720		    TYPENAME(wk->wk_type), wk->wk_state);
1721	wk->wk_state |= ONWORKLIST;
1722	if (ump->softdep_on_worklist == 0) {
1723		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1724		ump->softdep_worklist_tail = wk;
1725	} else if (flags & WK_HEAD) {
1726		LIST_INSERT_HEAD(&ump->softdep_workitem_pending, wk, wk_list);
1727	} else {
1728		LIST_INSERT_AFTER(ump->softdep_worklist_tail, wk, wk_list);
1729		ump->softdep_worklist_tail = wk;
1730	}
1731	ump->softdep_on_worklist += 1;
1732	if (flags & WK_NODELAY)
1733		worklist_speedup(wk->wk_mp);
1734}
1735
1736/*
1737 * Remove the item to be processed. If we are removing the last
1738 * item on the list, we need to recalculate the tail pointer.
1739 */
1740static void
1741remove_from_worklist(wk)
1742	struct worklist *wk;
1743{
1744	struct ufsmount *ump;
1745
1746	ump = VFSTOUFS(wk->wk_mp);
1747	if (ump->softdep_worklist_tail == wk)
1748		ump->softdep_worklist_tail =
1749		    (struct worklist *)wk->wk_list.le_prev;
1750	WORKLIST_REMOVE(wk);
1751	ump->softdep_on_worklist -= 1;
1752}
1753
1754static void
1755wake_worklist(wk)
1756	struct worklist *wk;
1757{
1758	if (wk->wk_state & IOWAITING) {
1759		wk->wk_state &= ~IOWAITING;
1760		wakeup(wk);
1761	}
1762}
1763
1764static void
1765wait_worklist(wk, wmesg)
1766	struct worklist *wk;
1767	char *wmesg;
1768{
1769	struct ufsmount *ump;
1770
1771	ump = VFSTOUFS(wk->wk_mp);
1772	wk->wk_state |= IOWAITING;
1773	msleep(wk, LOCK_PTR(ump), PVM, wmesg, 0);
1774}
1775
1776/*
1777 * Process that runs once per second to handle items in the background queue.
1778 *
1779 * Note that we ensure that items are processed in the order in which they
1780 * appear in the queue. The code below depends on this property to ensure
1781 * that blocks of a file are freed before the inode itself is freed. This
1782 * ordering ensures that no new <vfsid, inum, lbn> triples will be generated
1783 * until all the old ones have been purged from the dependency lists.
1784 */
1785static int
1786softdep_process_worklist(mp, full)
1787	struct mount *mp;
1788	int full;
1789{
1790	int cnt, matchcnt;
1791	struct ufsmount *ump;
1792	long starttime;
1793
1794	KASSERT(mp != NULL, ("softdep_process_worklist: NULL mp"));
1795	if (MOUNTEDSOFTDEP(mp) == 0)
1796		return (0);
1797	matchcnt = 0;
1798	ump = VFSTOUFS(mp);
1799	ACQUIRE_LOCK(ump);
1800	starttime = time_second;
1801	softdep_process_journal(mp, NULL, full ? MNT_WAIT : 0);
1802	check_clear_deps(mp);
1803	while (ump->softdep_on_worklist > 0) {
1804		if ((cnt = process_worklist_item(mp, 10, LK_NOWAIT)) == 0)
1805			break;
1806		else
1807			matchcnt += cnt;
1808		check_clear_deps(mp);
1809		/*
1810		 * We do not generally want to stop for buffer space, but if
1811		 * we are really being a buffer hog, we will stop and wait.
1812		 */
1813		if (should_yield()) {
1814			FREE_LOCK(ump);
1815			kern_yield(PRI_USER);
1816			bwillwrite();
1817			ACQUIRE_LOCK(ump);
1818		}
1819		/*
1820		 * Never allow processing to run for more than one
1821		 * second. This gives the syncer thread the opportunity
1822		 * to pause if appropriate.
1823		 */
1824		if (!full && starttime != time_second)
1825			break;
1826	}
1827	if (full == 0)
1828		journal_unsuspend(ump);
1829	FREE_LOCK(ump);
1830	return (matchcnt);
1831}
1832
1833/*
1834 * Process all removes associated with a vnode if we are running out of
1835 * journal space.  Any other process which attempts to flush these will
1836 * be unable to do so because we hold the vnode locked.
1837 */
1838static void
1839process_removes(vp)
1840	struct vnode *vp;
1841{
1842	struct inodedep *inodedep;
1843	struct dirrem *dirrem;
1844	struct ufsmount *ump;
1845	struct mount *mp;
1846	ino_t inum;
1847
1848	mp = vp->v_mount;
1849	ump = VFSTOUFS(mp);
1850	LOCK_OWNED(ump);
1851	inum = VTOI(vp)->i_number;
1852	for (;;) {
1853top:
1854		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1855			return;
1856		LIST_FOREACH(dirrem, &inodedep->id_dirremhd, dm_inonext) {
1857			/*
1858			 * If another thread is trying to lock this vnode
1859			 * it will fail but we must wait for it to do so
1860			 * before we can proceed.
1861			 */
1862			if (dirrem->dm_state & INPROGRESS) {
1863				wait_worklist(&dirrem->dm_list, "pwrwait");
1864				goto top;
1865			}
1866			if ((dirrem->dm_state & (COMPLETE | ONWORKLIST)) ==
1867			    (COMPLETE | ONWORKLIST))
1868				break;
1869		}
1870		if (dirrem == NULL)
1871			return;
1872		remove_from_worklist(&dirrem->dm_list);
1873		FREE_LOCK(ump);
1874		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
1875			panic("process_removes: suspended filesystem");
1876		handle_workitem_remove(dirrem, 0);
1877		vn_finished_secondary_write(mp);
1878		ACQUIRE_LOCK(ump);
1879	}
1880}
1881
1882/*
1883 * Process all truncations associated with a vnode if we are running out
1884 * of journal space.  This is called when the vnode lock is already held
1885 * and no other process can clear the truncation.
1887 */
1888static void
1889process_truncates(vp)
1890	struct vnode *vp;
1891{
1892	struct inodedep *inodedep;
1893	struct freeblks *freeblks;
1894	struct ufsmount *ump;
1895	struct mount *mp;
1896	ino_t inum;
1897	int cgwait;
1898
1899	mp = vp->v_mount;
1900	ump = VFSTOUFS(mp);
1901	LOCK_OWNED(ump);
1902	inum = VTOI(vp)->i_number;
1903	for (;;) {
1904		if (inodedep_lookup(mp, inum, 0, &inodedep) == 0)
1905			return;
1906		cgwait = 0;
1907		TAILQ_FOREACH(freeblks, &inodedep->id_freeblklst, fb_next) {
1908			/* Journal entries not yet written.  */
1909			if (!LIST_EMPTY(&freeblks->fb_jblkdephd)) {
1910				jwait(&LIST_FIRST(
1911				    &freeblks->fb_jblkdephd)->jb_list,
1912				    MNT_WAIT);
1913				break;
1914			}
1915			/* Another thread is executing this item. */
1916			if (freeblks->fb_state & INPROGRESS) {
1917				wait_worklist(&freeblks->fb_list, "ptrwait");
1918				break;
1919			}
1920			/* Freeblks is waiting on an inode write. */
1921			if ((freeblks->fb_state & COMPLETE) == 0) {
1922				FREE_LOCK(ump);
1923				ffs_update(vp, 1);
1924				ACQUIRE_LOCK(ump);
1925				break;
1926			}
1927			if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST)) ==
1928			    (ALLCOMPLETE | ONWORKLIST)) {
1929				remove_from_worklist(&freeblks->fb_list);
1930				freeblks->fb_state |= INPROGRESS;
1931				FREE_LOCK(ump);
1932				if (vn_start_secondary_write(NULL, &mp,
1933				    V_NOWAIT))
1934					panic("process_truncates: "
1935					    "suspended filesystem");
1936				handle_workitem_freeblocks(freeblks, 0);
1937				vn_finished_secondary_write(mp);
1938				ACQUIRE_LOCK(ump);
1939				break;
1940			}
1941			if (freeblks->fb_cgwait)
1942				cgwait++;
1943		}
1944		if (cgwait) {
1945			FREE_LOCK(ump);
1946			sync_cgs(mp, MNT_WAIT);
1947			ffs_sync_snap(mp, MNT_WAIT);
1948			ACQUIRE_LOCK(ump);
1949			continue;
1950		}
1951		if (freeblks == NULL)
1952			break;
1953	}
1954	return;
1955}
1956
1957/*
1958 * Process one item on the worklist.
1959 */
1960static int
1961process_worklist_item(mp, target, flags)
1962	struct mount *mp;
1963	int target;
1964	int flags;
1965{
1966	struct worklist sentinel;
1967	struct worklist *wk;
1968	struct ufsmount *ump;
1969	int matchcnt;
1970	int error;
1971
1972	KASSERT(mp != NULL, ("process_worklist_item: NULL mp"));
1973	/*
1974	 * If we are being called because of a process doing a
1975	 * copy-on-write, then it is not safe to write as we may
1976	 * recurse into the copy-on-write routine.
1977	 */
1978	if (curthread->td_pflags & TDP_COWINPROGRESS)
1979		return (-1);
1980	PHOLD(curproc);	/* Don't let the stack go away. */
1981	ump = VFSTOUFS(mp);
1982	LOCK_OWNED(ump);
1983	matchcnt = 0;
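	/*
	 * A sentinel entry marks our position in the worklist so that the
	 * softdep lock can be dropped while an item is processed: the item
	 * in front of the sentinel is removed and handled, while other
	 * threads' sentinels encountered along the way are simply skipped
	 * over.
	 */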
1984	sentinel.wk_mp = NULL;
1985	sentinel.wk_type = D_SENTINEL;
1986	LIST_INSERT_HEAD(&ump->softdep_workitem_pending, &sentinel, wk_list);
1987	for (wk = LIST_NEXT(&sentinel, wk_list); wk != NULL;
1988	    wk = LIST_NEXT(&sentinel, wk_list)) {
1989		if (wk->wk_type == D_SENTINEL) {
1990			LIST_REMOVE(&sentinel, wk_list);
1991			LIST_INSERT_AFTER(wk, &sentinel, wk_list);
1992			continue;
1993		}
1994		if (wk->wk_state & INPROGRESS)
1995			panic("process_worklist_item: %p already in progress.",
1996			    wk);
1997		wk->wk_state |= INPROGRESS;
1998		remove_from_worklist(wk);
1999		FREE_LOCK(ump);
2000		if (vn_start_secondary_write(NULL, &mp, V_NOWAIT))
2001			panic("process_worklist_item: suspended filesystem");
2002		switch (wk->wk_type) {
2003		case D_DIRREM:
2004			/* removal of a directory entry */
2005			error = handle_workitem_remove(WK_DIRREM(wk), flags);
2006			break;
2007
2008		case D_FREEBLKS:
2009			/* releasing blocks and/or fragments from a file */
2010			error = handle_workitem_freeblocks(WK_FREEBLKS(wk),
2011			    flags);
2012			break;
2013
2014		case D_FREEFRAG:
2015			/* releasing a fragment when replaced as a file grows */
2016			handle_workitem_freefrag(WK_FREEFRAG(wk));
2017			error = 0;
2018			break;
2019
2020		case D_FREEFILE:
2021			/* releasing an inode when its link count drops to 0 */
2022			handle_workitem_freefile(WK_FREEFILE(wk));
2023			error = 0;
2024			break;
2025
2026		default:
2027			panic("%s_process_worklist: Unknown type %s",
2028			    "softdep", TYPENAME(wk->wk_type));
2029			/* NOTREACHED */
2030		}
2031		vn_finished_secondary_write(mp);
2032		ACQUIRE_LOCK(ump);
2033		if (error == 0) {
2034			if (++matchcnt == target)
2035				break;
2036			continue;
2037		}
2038		/*
2039		 * We have to retry the worklist item later.  Wake up any
2040		 * waiters who may be able to complete it immediately and
2041		 * add the item back to the head so we don't try to execute
2042		 * it again.
2043		 */
2044		wk->wk_state &= ~INPROGRESS;
2045		wake_worklist(wk);
2046		add_to_worklist(wk, WK_HEAD);
2047	}
2048	/* The sentinel may have become the tail via remove_from_worklist(). */
2049	if (ump->softdep_worklist_tail == &sentinel)
2050		ump->softdep_worklist_tail =
2051		    (struct worklist *)sentinel.wk_list.le_prev;
2052	LIST_REMOVE(&sentinel, wk_list);
2053	PRELE(curproc);
2054	return (matchcnt);
2055}
2056
2057/*
2058 * Move dependencies from one buffer to another.
2059 */
2060int
2061softdep_move_dependencies(oldbp, newbp)
2062	struct buf *oldbp;
2063	struct buf *newbp;
2064{
2065	struct worklist *wk, *wktail;
2066	struct ufsmount *ump;
2067	int dirty;
2068
2069	if ((wk = LIST_FIRST(&oldbp->b_dep)) == NULL)
2070		return (0);
2071	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
2072	    ("softdep_move_dependencies called on non-softdep filesystem"));
2073	dirty = 0;
2074	wktail = NULL;
2075	ump = VFSTOUFS(wk->wk_mp);
2076	ACQUIRE_LOCK(ump);
2077	while ((wk = LIST_FIRST(&oldbp->b_dep)) != NULL) {
2078		LIST_REMOVE(wk, wk_list);
2079		if (wk->wk_type == D_BMSAFEMAP &&
2080		    bmsafemap_backgroundwrite(WK_BMSAFEMAP(wk), newbp))
2081			dirty = 1;
2082		if (wktail == NULL)
2083			LIST_INSERT_HEAD(&newbp->b_dep, wk, wk_list);
2084		else
2085			LIST_INSERT_AFTER(wktail, wk, wk_list);
2086		wktail = wk;
2087	}
2088	FREE_LOCK(ump);
2089
2090	return (dirty);
2091}
2092
2093/*
2094 * Purge the work list of all items associated with a particular mount point.
2095 */
2096int
2097softdep_flushworklist(oldmnt, countp, td)
2098	struct mount *oldmnt;
2099	int *countp;
2100	struct thread *td;
2101{
2102	struct vnode *devvp;
2103	struct ufsmount *ump;
2104	int count, error;
2105
2106	/*
2107	 * Alternately flush the block device associated with the mount
2108	 * point and process any dependencies that the flushing
2109	 * creates. We continue until no more worklist dependencies
2110	 * are found.
2111	 */
2112	*countp = 0;
2113	error = 0;
2114	ump = VFSTOUFS(oldmnt);
2115	devvp = ump->um_devvp;
2116	while ((count = softdep_process_worklist(oldmnt, 1)) > 0) {
2117		*countp += count;
2118		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
2119		error = VOP_FSYNC(devvp, MNT_WAIT, td);
2120		VOP_UNLOCK(devvp);
2121		if (error != 0)
2122			break;
2123	}
2124	return (error);
2125}
2126
2127#define	SU_WAITIDLE_RETRIES	20
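/*
 * Wait for the outstanding dependency count on a mount to drain to zero,
 * flushing the underlying device between waits.  Gives up and returns
 * EBUSY after SU_WAITIDLE_RETRIES attempts.
 */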
2128static int
2129softdep_waitidle(struct mount *mp, int flags __unused)
2130{
2131	struct ufsmount *ump;
2132	struct vnode *devvp;
2133	struct thread *td;
2134	int error, i;
2135
2136	ump = VFSTOUFS(mp);
2137	devvp = ump->um_devvp;
2138	td = curthread;
2139	error = 0;
2140	ACQUIRE_LOCK(ump);
2141	for (i = 0; i < SU_WAITIDLE_RETRIES && ump->softdep_deps != 0; i++) {
2142		ump->softdep_req = 1;
2143		KASSERT((flags & FORCECLOSE) == 0 ||
2144		    ump->softdep_on_worklist == 0,
2145		    ("softdep_waitidle: work added after flush"));
2146		msleep(&ump->softdep_deps, LOCK_PTR(ump), PVM | PDROP,
2147		    "softdeps", 10 * hz);
2148		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
2149		error = VOP_FSYNC(devvp, MNT_WAIT, td);
2150		VOP_UNLOCK(devvp);
2151		ACQUIRE_LOCK(ump);
2152		if (error != 0)
2153			break;
2154	}
2155	ump->softdep_req = 0;
2156	if (i == SU_WAITIDLE_RETRIES && error == 0 && ump->softdep_deps != 0) {
2157		error = EBUSY;
2158		printf("softdep_waitidle: Failed to flush worklist for %p\n",
2159		    mp);
2160	}
2161	FREE_LOCK(ump);
2162	return (error);
2163}
2164
2165/*
2166 * Flush all vnodes and worklist items associated with a specified mount point.
2167 */
2168int
2169softdep_flushfiles(oldmnt, flags, td)
2170	struct mount *oldmnt;
2171	int flags;
2172	struct thread *td;
2173{
2174#ifdef QUOTA
2175	struct ufsmount *ump;
2176	int i;
2177#endif
2178	int error, early, depcount, loopcnt, retry_flush_count, retry;
2179	int morework;
2180
2181	KASSERT(MOUNTEDSOFTDEP(oldmnt) != 0,
2182	    ("softdep_flushfiles called on non-softdep filesystem"));
2183	loopcnt = 10;
2184	retry_flush_count = 3;
2185retry_flush:
2186	error = 0;
2187
2188	/*
2189	 * Alternately flush the vnodes associated with the mount
2190	 * point and process any dependencies that the flushing
2191	 * creates. In theory, this loop should run at most twice,
2192	 * but we give it a few extra iterations just to be sure.
2193	 */
2194	for (; loopcnt > 0; loopcnt--) {
2195		/*
2196		 * Do another flush in case any vnodes were brought in
2197		 * as part of the cleanup operations.
2198		 */
2199		early = retry_flush_count == 1 || (oldmnt->mnt_kern_flag &
2200		    MNTK_UNMOUNT) == 0 ? 0 : EARLYFLUSH;
2201		if ((error = ffs_flushfiles(oldmnt, flags | early, td)) != 0)
2202			break;
2203		if ((error = softdep_flushworklist(oldmnt, &depcount, td)) != 0 ||
2204		    depcount == 0)
2205			break;
2206	}
2207	/*
2208	 * If we are unmounting then it is an error to fail. If we
2209	 * are simply trying to downgrade to read-only, then filesystem
2210	 * activity can keep us busy forever, so we just fail with EBUSY.
2211	 */
2212	if (loopcnt == 0) {
2213		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT)
2214			panic("softdep_flushfiles: looping");
2215		error = EBUSY;
2216	}
2217	if (!error)
2218		error = softdep_waitidle(oldmnt, flags);
2219	if (!error) {
2220		if (oldmnt->mnt_kern_flag & MNTK_UNMOUNT) {
2221			retry = 0;
2222			MNT_ILOCK(oldmnt);
2223			morework = oldmnt->mnt_nvnodelistsize > 0;
2224#ifdef QUOTA
2225			ump = VFSTOUFS(oldmnt);
2226			UFS_LOCK(ump);
2227			for (i = 0; i < MAXQUOTAS; i++) {
2228				if (ump->um_quotas[i] != NULLVP)
2229					morework = 1;
2230			}
2231			UFS_UNLOCK(ump);
2232#endif
2233			if (morework) {
2234				if (--retry_flush_count > 0) {
2235					retry = 1;
2236					loopcnt = 3;
2237				} else
2238					error = EBUSY;
2239			}
2240			MNT_IUNLOCK(oldmnt);
2241			if (retry)
2242				goto retry_flush;
2243		}
2244	}
2245	return (error);
2246}
2247
2248/*
2249 * Structure hashing.
2250 *
2251 * There are four types of structures that can be looked up:
2252 *	1) pagedep structures identified by mount point, inode number,
2253 *	   and logical block.
2254 *	2) inodedep structures identified by mount point and inode number.
2255 *	3) newblk structures identified by mount point and
2256 *	   physical block number.
2257 *	4) bmsafemap structures identified by mount point and
2258 *	   cylinder group number.
2259 *
2260 * The "pagedep" and "inodedep" dependency structures are hashed
2261 * separately from the file blocks and inodes to which they correspond.
2262 * This separation helps when the in-memory copy of an inode or
2263 * file block must be replaced. It also obviates the need to access
2264 * an inode or file page when simply updating (or de-allocating)
2265 * dependency structures. Lookup of newblk structures is needed to
2266 * find newly allocated blocks when trying to associate them with
2267 * their allocdirect or allocindir structure.
2268 *
2269 * The lookup routines optionally create and hash a new instance when
2270 * an existing entry is not found. The bmsafemap lookup routine always
2271 * allocates a new structure if an existing one is not found.
2272 */
2273#define DEPALLOC	0x0001	/* allocate structure if lookup fails */
2274
2275/*
2276 * Structures and routines associated with pagedep caching.
2277 */
2278#define	PAGEDEP_HASH(ump, inum, lbn) \
2279	(&(ump)->pagedep_hashtbl[((inum) + (lbn)) & (ump)->pagedep_hash_size])
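/*
 * The *_hash_size fields hold the masks returned by hashinit() (a power of
 * two minus one), so the '&' above selects a hash bucket directly.
 */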
2280
2281static int
2282pagedep_find(pagedephd, ino, lbn, pagedeppp)
2283	struct pagedep_hashhead *pagedephd;
2284	ino_t ino;
2285	ufs_lbn_t lbn;
2286	struct pagedep **pagedeppp;
2287{
2288	struct pagedep *pagedep;
2289
2290	LIST_FOREACH(pagedep, pagedephd, pd_hash) {
2291		if (ino == pagedep->pd_ino && lbn == pagedep->pd_lbn) {
2292			*pagedeppp = pagedep;
2293			return (1);
2294		}
2295	}
2296	*pagedeppp = NULL;
2297	return (0);
2298}
2299/*
2300 * Look up a pagedep. Return 1 if found, 0 otherwise.
2301 * If not found, allocate if DEPALLOC flag is passed.
2302 * Found or allocated entry is returned in pagedeppp.
2303 */
2304static int
2305pagedep_lookup(mp, bp, ino, lbn, flags, pagedeppp)
2306	struct mount *mp;
2307	struct buf *bp;
2308	ino_t ino;
2309	ufs_lbn_t lbn;
2310	int flags;
2311	struct pagedep **pagedeppp;
2312{
2313	struct pagedep *pagedep;
2314	struct pagedep_hashhead *pagedephd;
2315	struct worklist *wk;
2316	struct ufsmount *ump;
2317	int ret;
2318	int i;
2319
2320	ump = VFSTOUFS(mp);
2321	LOCK_OWNED(ump);
2322	if (bp) {
2323		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
2324			if (wk->wk_type == D_PAGEDEP) {
2325				*pagedeppp = WK_PAGEDEP(wk);
2326				return (1);
2327			}
2328		}
2329	}
2330	pagedephd = PAGEDEP_HASH(ump, ino, lbn);
2331	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2332	if (ret) {
2333		if (((*pagedeppp)->pd_state & ONWORKLIST) == 0 && bp)
2334			WORKLIST_INSERT(&bp->b_dep, &(*pagedeppp)->pd_list);
2335		return (1);
2336	}
2337	if ((flags & DEPALLOC) == 0)
2338		return (0);
2339	FREE_LOCK(ump);
2340	pagedep = malloc(sizeof(struct pagedep),
2341	    M_PAGEDEP, M_SOFTDEP_FLAGS|M_ZERO);
2342	workitem_alloc(&pagedep->pd_list, D_PAGEDEP, mp);
2343	ACQUIRE_LOCK(ump);
2344	ret = pagedep_find(pagedephd, ino, lbn, pagedeppp);
2345	if (*pagedeppp) {
2346		/*
2347		 * This should never happen since we only create pagedeps
2348		 * with the vnode lock held.  Could be an assert.
2349		 */
2350		WORKITEM_FREE(pagedep, D_PAGEDEP);
2351		return (ret);
2352	}
2353	pagedep->pd_ino = ino;
2354	pagedep->pd_lbn = lbn;
2355	LIST_INIT(&pagedep->pd_dirremhd);
2356	LIST_INIT(&pagedep->pd_pendinghd);
2357	for (i = 0; i < DAHASHSZ; i++)
2358		LIST_INIT(&pagedep->pd_diraddhd[i]);
2359	LIST_INSERT_HEAD(pagedephd, pagedep, pd_hash);
2360	WORKLIST_INSERT(&bp->b_dep, &pagedep->pd_list);
2361	*pagedeppp = pagedep;
2362	return (0);
2363}
2364
2365/*
2366 * Structures and routines associated with inodedep caching.
2367 */
2368#define	INODEDEP_HASH(ump, inum) \
2369      (&(ump)->inodedep_hashtbl[(inum) & (ump)->inodedep_hash_size])
2370
2371static int
2372inodedep_find(inodedephd, inum, inodedeppp)
2373	struct inodedep_hashhead *inodedephd;
2374	ino_t inum;
2375	struct inodedep **inodedeppp;
2376{
2377	struct inodedep *inodedep;
2378
2379	LIST_FOREACH(inodedep, inodedephd, id_hash)
2380		if (inum == inodedep->id_ino)
2381			break;
2382	if (inodedep) {
2383		*inodedeppp = inodedep;
2384		return (1);
2385	}
2386	*inodedeppp = NULL;
2387
2388	return (0);
2389}
2390/*
2391 * Look up an inodedep. Return 1 if found, 0 if not found.
2392 * If not found, allocate if DEPALLOC flag is passed.
2393 * Found or allocated entry is returned in inodedeppp.
2394 */
2395static int
2396inodedep_lookup(mp, inum, flags, inodedeppp)
2397	struct mount *mp;
2398	ino_t inum;
2399	int flags;
2400	struct inodedep **inodedeppp;
2401{
2402	struct inodedep *inodedep;
2403	struct inodedep_hashhead *inodedephd;
2404	struct ufsmount *ump;
2405	struct fs *fs;
2406
2407	ump = VFSTOUFS(mp);
2408	LOCK_OWNED(ump);
2409	fs = ump->um_fs;
2410	inodedephd = INODEDEP_HASH(ump, inum);
2411
2412	if (inodedep_find(inodedephd, inum, inodedeppp))
2413		return (1);
2414	if ((flags & DEPALLOC) == 0)
2415		return (0);
2416	/*
2417	 * If the system is over its limit and our filesystem is
2418	 * responsible for more than our share of that usage and
2419	 * we are not in a rush, request some inodedep cleanup.
2420	 */
2421	if (softdep_excess_items(ump, D_INODEDEP))
2422		schedule_cleanup(mp);
2423	else
2424		FREE_LOCK(ump);
2425	inodedep = malloc(sizeof(struct inodedep),
2426		M_INODEDEP, M_SOFTDEP_FLAGS);
2427	workitem_alloc(&inodedep->id_list, D_INODEDEP, mp);
2428	ACQUIRE_LOCK(ump);
2429	if (inodedep_find(inodedephd, inum, inodedeppp)) {
2430		WORKITEM_FREE(inodedep, D_INODEDEP);
2431		return (1);
2432	}
2433	inodedep->id_fs = fs;
2434	inodedep->id_ino = inum;
2435	inodedep->id_state = ALLCOMPLETE;
2436	inodedep->id_nlinkdelta = 0;
2437	inodedep->id_nlinkwrote = -1;
2438	inodedep->id_savedino1 = NULL;
2439	inodedep->id_savedsize = -1;
2440	inodedep->id_savedextsize = -1;
2441	inodedep->id_savednlink = -1;
2442	inodedep->id_bmsafemap = NULL;
2443	inodedep->id_mkdiradd = NULL;
2444	LIST_INIT(&inodedep->id_dirremhd);
2445	LIST_INIT(&inodedep->id_pendinghd);
2446	LIST_INIT(&inodedep->id_inowait);
2447	LIST_INIT(&inodedep->id_bufwait);
2448	TAILQ_INIT(&inodedep->id_inoreflst);
2449	TAILQ_INIT(&inodedep->id_inoupdt);
2450	TAILQ_INIT(&inodedep->id_newinoupdt);
2451	TAILQ_INIT(&inodedep->id_extupdt);
2452	TAILQ_INIT(&inodedep->id_newextupdt);
2453	TAILQ_INIT(&inodedep->id_freeblklst);
2454	LIST_INSERT_HEAD(inodedephd, inodedep, id_hash);
2455	*inodedeppp = inodedep;
2456	return (0);
2457}
2458
2459/*
2460 * Structures and routines associated with newblk caching.
2461 */
2462#define	NEWBLK_HASH(ump, inum) \
2463	(&(ump)->newblk_hashtbl[(inum) & (ump)->newblk_hash_size])
2464
2465static int
2466newblk_find(newblkhd, newblkno, flags, newblkpp)
2467	struct newblk_hashhead *newblkhd;
2468	ufs2_daddr_t newblkno;
2469	int flags;
2470	struct newblk **newblkpp;
2471{
2472	struct newblk *newblk;
2473
2474	LIST_FOREACH(newblk, newblkhd, nb_hash) {
2475		if (newblkno != newblk->nb_newblkno)
2476			continue;
2477		/*
2478		 * If we're creating a new dependency, don't match those that
2479		 * have already been converted to allocdirects.  This is for
2480		 * a frag extend.
2481		 */
2482		if ((flags & DEPALLOC) && newblk->nb_list.wk_type != D_NEWBLK)
2483			continue;
2484		break;
2485	}
2486	if (newblk) {
2487		*newblkpp = newblk;
2488		return (1);
2489	}
2490	*newblkpp = NULL;
2491	return (0);
2492}
2493
2494/*
2495 * Look up a newblk. Return 1 if found, 0 if not found.
2496 * If not found, allocate if DEPALLOC flag is passed.
2497 * Found or allocated entry is returned in newblkpp.
2498 */
2499static int
2500newblk_lookup(mp, newblkno, flags, newblkpp)
2501	struct mount *mp;
2502	ufs2_daddr_t newblkno;
2503	int flags;
2504	struct newblk **newblkpp;
2505{
2506	struct newblk *newblk;
2507	struct newblk_hashhead *newblkhd;
2508	struct ufsmount *ump;
2509
2510	ump = VFSTOUFS(mp);
2511	LOCK_OWNED(ump);
2512	newblkhd = NEWBLK_HASH(ump, newblkno);
2513	if (newblk_find(newblkhd, newblkno, flags, newblkpp))
2514		return (1);
2515	if ((flags & DEPALLOC) == 0)
2516		return (0);
2517	if (softdep_excess_items(ump, D_NEWBLK) ||
2518	    softdep_excess_items(ump, D_ALLOCDIRECT) ||
2519	    softdep_excess_items(ump, D_ALLOCINDIR))
2520		schedule_cleanup(mp);
2521	else
2522		FREE_LOCK(ump);
2523	newblk = malloc(sizeof(union allblk), M_NEWBLK,
2524	    M_SOFTDEP_FLAGS | M_ZERO);
2525	workitem_alloc(&newblk->nb_list, D_NEWBLK, mp);
2526	ACQUIRE_LOCK(ump);
2527	if (newblk_find(newblkhd, newblkno, flags, newblkpp)) {
2528		WORKITEM_FREE(newblk, D_NEWBLK);
2529		return (1);
2530	}
2531	newblk->nb_freefrag = NULL;
2532	LIST_INIT(&newblk->nb_indirdeps);
2533	LIST_INIT(&newblk->nb_newdirblk);
2534	LIST_INIT(&newblk->nb_jwork);
2535	newblk->nb_state = ATTACHED;
2536	newblk->nb_newblkno = newblkno;
2537	LIST_INSERT_HEAD(newblkhd, newblk, nb_hash);
2538	*newblkpp = newblk;
2539	return (0);
2540}
2541
2542/*
2543 * Structures and routines associated with freed indirect block caching.
2544 */
2545#define	INDIR_HASH(ump, blkno) \
2546	(&(ump)->indir_hashtbl[(blkno) & (ump)->indir_hash_size])
2547
2548/*
2549 * Look up an indirect block in the indir hash table.  The freework is
2550 * removed and potentially freed.  The caller must do a blocking journal
2551 * write before writing to the blkno.
2552 */
2553static int
2554indirblk_lookup(mp, blkno)
2555	struct mount *mp;
2556	ufs2_daddr_t blkno;
2557{
2558	struct freework *freework;
2559	struct indir_hashhead *wkhd;
2560	struct ufsmount *ump;
2561
2562	ump = VFSTOUFS(mp);
2563	wkhd = INDIR_HASH(ump, blkno);
2564	TAILQ_FOREACH(freework, wkhd, fw_next) {
2565		if (freework->fw_blkno != blkno)
2566			continue;
2567		indirblk_remove(freework);
2568		return (1);
2569	}
2570	return (0);
2571}
2572
2573/*
2574 * Insert an indirect block represented by freework into the indirblk
2575 * hash table so that it may prevent the block from being re-used prior
2576 * to the journal being written.
2577 */
2578static void
2579indirblk_insert(freework)
2580	struct freework *freework;
2581{
2582	struct jblocks *jblocks;
2583	struct jseg *jseg;
2584	struct ufsmount *ump;
2585
2586	ump = VFSTOUFS(freework->fw_list.wk_mp);
2587	jblocks = ump->softdep_jblocks;
2588	jseg = TAILQ_LAST(&jblocks->jb_segs, jseglst);
2589	if (jseg == NULL)
2590		return;
2591
2592	LIST_INSERT_HEAD(&jseg->js_indirs, freework, fw_segs);
2593	TAILQ_INSERT_HEAD(INDIR_HASH(ump, freework->fw_blkno), freework,
2594	    fw_next);
2595	freework->fw_state &= ~DEPCOMPLETE;
2596}
2597
2598static void
2599indirblk_remove(freework)
2600	struct freework *freework;
2601{
2602	struct ufsmount *ump;
2603
2604	ump = VFSTOUFS(freework->fw_list.wk_mp);
2605	LIST_REMOVE(freework, fw_segs);
2606	TAILQ_REMOVE(INDIR_HASH(ump, freework->fw_blkno), freework, fw_next);
2607	freework->fw_state |= DEPCOMPLETE;
2608	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
2609		WORKITEM_FREE(freework, D_FREEWORK);
2610}
2611
2612/*
2613 * Executed during filesystem subsystem initialization before
2614 * mounting any filesystems.
2615 */
2616void
2617softdep_initialize()
2618{
2619
2620	TAILQ_INIT(&softdepmounts);
2621#ifdef __LP64__
2622	max_softdeps = desiredvnodes * 4;
2623#else
2624	max_softdeps = desiredvnodes * 2;
2625#endif
2626
2627	/* initialise bioops hack */
2628	bioops.io_start = softdep_disk_io_initiation;
2629	bioops.io_complete = softdep_disk_write_complete;
2630	bioops.io_deallocate = softdep_deallocate_dependencies;
2631	bioops.io_countdeps = softdep_count_dependencies;
2632	softdep_ast_cleanup = softdep_ast_cleanup_proc;
2633
2634	/* Initialize the callout with an mtx. */
2635	callout_init_mtx(&softdep_callout, &lk, 0);
2636}
2637
2638/*
2639 * Executed after all filesystems have been unmounted during
2640 * filesystem module unload.
2641 */
2642void
2643softdep_uninitialize()
2644{
2645
2646	/* clear bioops hack */
2647	bioops.io_start = NULL;
2648	bioops.io_complete = NULL;
2649	bioops.io_deallocate = NULL;
2650	bioops.io_countdeps = NULL;
2651	softdep_ast_cleanup = NULL;
2652
2653	callout_drain(&softdep_callout);
2654}
2655
2656/*
2657 * Called at mount time to notify the dependency code that a
2658 * filesystem wishes to use it.
2659 */
2660int
2661softdep_mount(devvp, mp, fs, cred)
2662	struct vnode *devvp;
2663	struct mount *mp;
2664	struct fs *fs;
2665	struct ucred *cred;
2666{
2667	struct csum_total cstotal;
2668	struct mount_softdeps *sdp;
2669	struct ufsmount *ump;
2670	struct cg *cgp;
2671	struct buf *bp;
2672	u_int cyl, i;
2673	int error;
2674
2675	sdp = malloc(sizeof(struct mount_softdeps), M_MOUNTDATA,
2676	    M_WAITOK | M_ZERO);
2677	MNT_ILOCK(mp);
2678	mp->mnt_flag = (mp->mnt_flag & ~MNT_ASYNC) | MNT_SOFTDEP;
2679	if ((mp->mnt_kern_flag & MNTK_SOFTDEP) == 0) {
2680		mp->mnt_kern_flag = (mp->mnt_kern_flag & ~MNTK_ASYNC) |
2681			MNTK_SOFTDEP | MNTK_NOASYNC;
2682	}
2683	ump = VFSTOUFS(mp);
2684	ump->um_softdep = sdp;
2685	MNT_IUNLOCK(mp);
2686	rw_init(LOCK_PTR(ump), "per-fs softdep");
2687	sdp->sd_ump = ump;
2688	LIST_INIT(&ump->softdep_workitem_pending);
2689	LIST_INIT(&ump->softdep_journal_pending);
2690	TAILQ_INIT(&ump->softdep_unlinked);
2691	LIST_INIT(&ump->softdep_dirtycg);
2692	ump->softdep_worklist_tail = NULL;
2693	ump->softdep_on_worklist = 0;
2694	ump->softdep_deps = 0;
2695	LIST_INIT(&ump->softdep_mkdirlisthd);
2696	ump->pagedep_hashtbl = hashinit(desiredvnodes / 5, M_PAGEDEP,
2697	    &ump->pagedep_hash_size);
2698	ump->pagedep_nextclean = 0;
2699	ump->inodedep_hashtbl = hashinit(desiredvnodes, M_INODEDEP,
2700	    &ump->inodedep_hash_size);
2701	ump->inodedep_nextclean = 0;
2702	ump->newblk_hashtbl = hashinit(max_softdeps / 2,  M_NEWBLK,
2703	    &ump->newblk_hash_size);
2704	ump->bmsafemap_hashtbl = hashinit(1024, M_BMSAFEMAP,
2705	    &ump->bmsafemap_hash_size);
2706	i = 1 << (ffs(desiredvnodes / 10) - 1);
2707	ump->indir_hashtbl = malloc(i * sizeof(struct indir_hashhead),
2708	    M_FREEWORK, M_WAITOK);
2709	ump->indir_hash_size = i - 1;
2710	for (i = 0; i <= ump->indir_hash_size; i++)
2711		TAILQ_INIT(&ump->indir_hashtbl[i]);
2712#ifdef INVARIANTS
2713	for (i = 0; i <= D_LAST; i++)
2714		LIST_INIT(&ump->softdep_alldeps[i]);
2715#endif
2716	ACQUIRE_GBLLOCK(&lk);
2717	TAILQ_INSERT_TAIL(&softdepmounts, sdp, sd_next);
2718	FREE_GBLLOCK(&lk);
2719	if ((fs->fs_flags & FS_SUJ) &&
2720	    (error = journal_mount(mp, fs, cred)) != 0) {
2721		printf("Failed to start journal: %d\n", error);
2722		softdep_unmount(mp);
2723		return (error);
2724	}
2725	/*
2726	 * Start our flushing thread in the bufdaemon process.
2727	 */
2728	ACQUIRE_LOCK(ump);
2729	ump->softdep_flags |= FLUSH_STARTING;
2730	FREE_LOCK(ump);
2731	kproc_kthread_add(&softdep_flush, mp, &bufdaemonproc,
2732	    &ump->softdep_flushtd, 0, 0, "softdepflush", "%s worker",
2733	    mp->mnt_stat.f_mntonname);
2734	ACQUIRE_LOCK(ump);
2735	while ((ump->softdep_flags & FLUSH_STARTING) != 0) {
2736		msleep(&ump->softdep_flushtd, LOCK_PTR(ump), PVM, "sdstart",
2737		    hz / 2);
2738	}
2739	FREE_LOCK(ump);
2740	/*
2741	 * When doing soft updates, the counters in the
2742	 * superblock may have gotten out of sync. Recomputation
2743	 * can take a long time and can be deferred for background
2744	 * fsck.  However, the old behavior of scanning the cylinder
2745	 * groups and recalculating them at mount time is available
2746	 * by setting vfs.ffs.compute_summary_at_mount to one.
2747	 */
2748	if (compute_summary_at_mount == 0 || fs->fs_clean != 0)
2749		return (0);
2750	bzero(&cstotal, sizeof cstotal);
2751	for (cyl = 0; cyl < fs->fs_ncg; cyl++) {
2752		if ((error = bread(devvp, fsbtodb(fs, cgtod(fs, cyl)),
2753		    fs->fs_cgsize, cred, &bp)) != 0) {
2754			brelse(bp);
2755			softdep_unmount(mp);
2756			return (error);
2757		}
2758		cgp = (struct cg *)bp->b_data;
2759		cstotal.cs_nffree += cgp->cg_cs.cs_nffree;
2760		cstotal.cs_nbfree += cgp->cg_cs.cs_nbfree;
2761		cstotal.cs_nifree += cgp->cg_cs.cs_nifree;
2762		cstotal.cs_ndir += cgp->cg_cs.cs_ndir;
2763		fs->fs_cs(fs, cyl) = cgp->cg_cs;
2764		brelse(bp);
2765	}
2766#ifdef INVARIANTS
2767	if (bcmp(&cstotal, &fs->fs_cstotal, sizeof cstotal))
2768		printf("%s: superblock summary recomputed\n", fs->fs_fsmnt);
2769#endif
2770	bcopy(&cstotal, &fs->fs_cstotal, sizeof cstotal);
2771	return (0);
2772}
2773
2774void
2775softdep_unmount(mp)
2776	struct mount *mp;
2777{
2778	struct ufsmount *ump;
2779#ifdef INVARIANTS
2780	int i;
2781#endif
2782
2783	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
2784	    ("softdep_unmount called on non-softdep filesystem"));
2785	ump = VFSTOUFS(mp);
2786	MNT_ILOCK(mp);
2787	mp->mnt_flag &= ~MNT_SOFTDEP;
2788	if (MOUNTEDSUJ(mp) == 0) {
2789		MNT_IUNLOCK(mp);
2790	} else {
2791		mp->mnt_flag &= ~MNT_SUJ;
2792		MNT_IUNLOCK(mp);
2793		journal_unmount(ump);
2794	}
2795	/*
2796	 * Shut down our flushing thread. The NULL check handles the case
2797	 * where softdep_mount errored out before the thread was created.
2798	 */
2799	if (ump->softdep_flushtd != NULL) {
2800		ACQUIRE_LOCK(ump);
2801		ump->softdep_flags |= FLUSH_EXIT;
2802		wakeup(&ump->softdep_flushtd);
2803		msleep(&ump->softdep_flags, LOCK_PTR(ump), PVM | PDROP,
2804		    "sdwait", 0);
2805		KASSERT((ump->softdep_flags & FLUSH_EXIT) == 0,
2806		    ("Thread shutdown failed"));
2807	}
2808	/*
2809	 * Free up our resources.
2810	 */
2811	ACQUIRE_GBLLOCK(&lk);
2812	TAILQ_REMOVE(&softdepmounts, ump->um_softdep, sd_next);
2813	FREE_GBLLOCK(&lk);
2814	rw_destroy(LOCK_PTR(ump));
2815	hashdestroy(ump->pagedep_hashtbl, M_PAGEDEP, ump->pagedep_hash_size);
2816	hashdestroy(ump->inodedep_hashtbl, M_INODEDEP, ump->inodedep_hash_size);
2817	hashdestroy(ump->newblk_hashtbl, M_NEWBLK, ump->newblk_hash_size);
2818	hashdestroy(ump->bmsafemap_hashtbl, M_BMSAFEMAP,
2819	    ump->bmsafemap_hash_size);
2820	free(ump->indir_hashtbl, M_FREEWORK);
2821#ifdef INVARIANTS
2822	for (i = 0; i <= D_LAST; i++) {
2823		KASSERT(ump->softdep_curdeps[i] == 0,
2824		    ("Unmount %s: Dep type %s != 0 (%ld)", ump->um_fs->fs_fsmnt,
2825		    TYPENAME(i), ump->softdep_curdeps[i]));
2826		KASSERT(LIST_EMPTY(&ump->softdep_alldeps[i]),
2827		    ("Unmount %s: Dep type %s not empty (%p)", ump->um_fs->fs_fsmnt,
2828		    TYPENAME(i), LIST_FIRST(&ump->softdep_alldeps[i])));
2829	}
2830#endif
2831	free(ump->um_softdep, M_MOUNTDATA);
2832}
2833
2834static struct jblocks *
2835jblocks_create(void)
2836{
2837	struct jblocks *jblocks;
2838
2839	jblocks = malloc(sizeof(*jblocks), M_JBLOCKS, M_WAITOK | M_ZERO);
2840	TAILQ_INIT(&jblocks->jb_segs);
2841	jblocks->jb_avail = 10;
2842	jblocks->jb_extent = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2843	    M_JBLOCKS, M_WAITOK | M_ZERO);
2844
2845	return (jblocks);
2846}
2847
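/*
 * Allocate up to 'bytes' worth of journal blocks from the current extent,
 * advancing to the next extent (wrapping around to the first) once the
 * current one is exhausted.  The number of bytes actually allocated, which
 * may be less than requested, is returned via 'actual'.
 */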
2848static ufs2_daddr_t
2849jblocks_alloc(jblocks, bytes, actual)
2850	struct jblocks *jblocks;
2851	int bytes;
2852	int *actual;
2853{
2854	ufs2_daddr_t daddr;
2855	struct jextent *jext;
2856	int freecnt;
2857	int blocks;
2858
2859	blocks = bytes / DEV_BSIZE;
2860	jext = &jblocks->jb_extent[jblocks->jb_head];
2861	freecnt = jext->je_blocks - jblocks->jb_off;
2862	if (freecnt == 0) {
2863		jblocks->jb_off = 0;
2864		if (++jblocks->jb_head > jblocks->jb_used)
2865			jblocks->jb_head = 0;
2866		jext = &jblocks->jb_extent[jblocks->jb_head];
2867		freecnt = jext->je_blocks;
2868	}
2869	if (freecnt > blocks)
2870		freecnt = blocks;
2871	*actual = freecnt * DEV_BSIZE;
2872	daddr = jext->je_daddr + jblocks->jb_off;
2873	jblocks->jb_off += freecnt;
2874	jblocks->jb_free -= freecnt;
2875
2876	return (daddr);
2877}
2878
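/*
 * Return journal space to the free pool and, if writers are suspended
 * waiting on journal space, nudge the worklist flusher along.
 */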
2879static void
2880jblocks_free(jblocks, mp, bytes)
2881	struct jblocks *jblocks;
2882	struct mount *mp;
2883	int bytes;
2884{
2885
2886	LOCK_OWNED(VFSTOUFS(mp));
2887	jblocks->jb_free += bytes / DEV_BSIZE;
2888	if (jblocks->jb_suspended)
2889		worklist_speedup(mp);
2890	wakeup(jblocks);
2891}
2892
2893static void
2894jblocks_destroy(jblocks)
2895	struct jblocks *jblocks;
2896{
2897
2898	if (jblocks->jb_extent)
2899		free(jblocks->jb_extent, M_JBLOCKS);
2900	free(jblocks, M_JBLOCKS);
2901}
2902
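/*
 * Record a run of physical journal blocks in the extent map, merging it
 * into the last extent when physically contiguous and doubling the extent
 * array when a new extent must be added.
 */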
2903static void
2904jblocks_add(jblocks, daddr, blocks)
2905	struct jblocks *jblocks;
2906	ufs2_daddr_t daddr;
2907	int blocks;
2908{
2909	struct jextent *jext;
2910
2911	jblocks->jb_blocks += blocks;
2912	jblocks->jb_free += blocks;
2913	jext = &jblocks->jb_extent[jblocks->jb_used];
2914	/* Adding the first block. */
2915	if (jext->je_daddr == 0) {
2916		jext->je_daddr = daddr;
2917		jext->je_blocks = blocks;
2918		return;
2919	}
2920	/* Extending the last extent. */
2921	if (jext->je_daddr + jext->je_blocks == daddr) {
2922		jext->je_blocks += blocks;
2923		return;
2924	}
2925	/* Adding a new extent. */
2926	if (++jblocks->jb_used == jblocks->jb_avail) {
2927		jblocks->jb_avail *= 2;
2928		jext = malloc(sizeof(struct jextent) * jblocks->jb_avail,
2929		    M_JBLOCKS, M_WAITOK | M_ZERO);
2930		memcpy(jext, jblocks->jb_extent,
2931		    sizeof(struct jextent) * jblocks->jb_used);
2932		free(jblocks->jb_extent, M_JBLOCKS);
2933		jblocks->jb_extent = jext;
2934	}
2935	jext = &jblocks->jb_extent[jblocks->jb_used];
2936	jext->je_daddr = daddr;
2937	jext->je_blocks = blocks;
2938	return;
2939}
2940
2941int
2942softdep_journal_lookup(mp, vpp)
2943	struct mount *mp;
2944	struct vnode **vpp;
2945{
2946	struct componentname cnp;
2947	struct vnode *dvp;
2948	ino_t sujournal;
2949	int error;
2950
2951	error = VFS_VGET(mp, UFS_ROOTINO, LK_EXCLUSIVE, &dvp);
2952	if (error)
2953		return (error);
2954	bzero(&cnp, sizeof(cnp));
2955	cnp.cn_nameiop = LOOKUP;
2956	cnp.cn_flags = ISLASTCN;
2957	cnp.cn_thread = curthread;
2958	cnp.cn_cred = curthread->td_ucred;
2959	cnp.cn_pnbuf = SUJ_FILE;
2960	cnp.cn_nameptr = SUJ_FILE;
2961	cnp.cn_namelen = strlen(SUJ_FILE);
2962	error = ufs_lookup_ino(dvp, NULL, &cnp, &sujournal);
2963	vput(dvp);
2964	if (error != 0)
2965		return (error);
2966	error = VFS_VGET(mp, sujournal, LK_EXCLUSIVE, vpp);
2967	return (error);
2968}
2969
2970/*
2971 * Open and verify the journal file.
2972 */
2973static int
2974journal_mount(mp, fs, cred)
2975	struct mount *mp;
2976	struct fs *fs;
2977	struct ucred *cred;
2978{
2979	struct jblocks *jblocks;
2980	struct ufsmount *ump;
2981	struct vnode *vp;
2982	struct inode *ip;
2983	ufs2_daddr_t blkno;
2984	int bcount;
2985	int error;
2986	int i;
2987
2988	ump = VFSTOUFS(mp);
2989	ump->softdep_journal_tail = NULL;
2990	ump->softdep_on_journal = 0;
2991	ump->softdep_accdeps = 0;
2992	ump->softdep_req = 0;
2993	ump->softdep_jblocks = NULL;
2994	error = softdep_journal_lookup(mp, &vp);
2995	if (error != 0) {
2996		printf("Failed to find journal.  Use tunefs to create one\n");
2997		return (error);
2998	}
2999	ip = VTOI(vp);
3000	if (ip->i_size < SUJ_MIN) {
3001		error = ENOSPC;
3002		goto out;
3003	}
3004	bcount = lblkno(fs, ip->i_size);	/* Only use whole blocks. */
3005	jblocks = jblocks_create();
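	/*
	 * Walk the journal file one filesystem block at a time, translating
	 * each logical block number to its physical address and adding the
	 * run to the extent map.
	 */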
3006	for (i = 0; i < bcount; i++) {
3007		error = ufs_bmaparray(vp, i, &blkno, NULL, NULL, NULL);
3008		if (error)
3009			break;
3010		jblocks_add(jblocks, blkno, fsbtodb(fs, fs->fs_frag));
3011	}
3012	if (error) {
3013		jblocks_destroy(jblocks);
3014		goto out;
3015	}
3016	jblocks->jb_low = jblocks->jb_free / 3;	/* Reserve 33%. */
3017	jblocks->jb_min = jblocks->jb_free / 10; /* Suspend at 10%. */
3018	ump->softdep_jblocks = jblocks;
3019out:
3020	if (error == 0) {
3021		MNT_ILOCK(mp);
3022		mp->mnt_flag |= MNT_SUJ;
3023		mp->mnt_flag &= ~MNT_SOFTDEP;
3024		MNT_IUNLOCK(mp);
3025		/*
3026		 * Only validate the journal contents if the
3027		 * filesystem is clean, otherwise we write the logs
3028		 * but they'll never be used.  If the filesystem was
3029		 * still dirty when we mounted it the journal is
3030		 * invalid and a new journal can only be valid if it
3031		 * starts from a clean mount.
3032		 */
3033		if (fs->fs_clean) {
3034			DIP_SET(ip, i_modrev, fs->fs_mtime);
3035			ip->i_flags |= IN_MODIFIED;
3036			ffs_update(vp, 1);
3037		}
3038	}
3039	vput(vp);
3040	return (error);
3041}
3042
3043static void
3044journal_unmount(ump)
3045	struct ufsmount *ump;
3046{
3047
3048	if (ump->softdep_jblocks)
3049		jblocks_destroy(ump->softdep_jblocks);
3050	ump->softdep_jblocks = NULL;
3051}
3052
3053/*
3054 * Called when a journal record is ready to be written.  Space is allocated
3055 * and the journal entry is created when the journal is flushed to stable
3056 * store.
3057 */
3058static void
3059add_to_journal(wk)
3060	struct worklist *wk;
3061{
3062	struct ufsmount *ump;
3063
3064	ump = VFSTOUFS(wk->wk_mp);
3065	LOCK_OWNED(ump);
3066	if (wk->wk_state & ONWORKLIST)
3067		panic("add_to_journal: %s(0x%X) already on list",
3068		    TYPENAME(wk->wk_type), wk->wk_state);
3069	wk->wk_state |= ONWORKLIST | DEPCOMPLETE;
3070	if (LIST_EMPTY(&ump->softdep_journal_pending)) {
3071		ump->softdep_jblocks->jb_age = ticks;
3072		LIST_INSERT_HEAD(&ump->softdep_journal_pending, wk, wk_list);
3073	} else
3074		LIST_INSERT_AFTER(ump->softdep_journal_tail, wk, wk_list);
3075	ump->softdep_journal_tail = wk;
3076	ump->softdep_on_journal += 1;
3077}
3078
3079/*
3080 * Remove an arbitrary item from the journal worklist, maintaining the tail
3081 * pointer.  This happens when a new operation obviates the need to
3082 * journal an old operation.
3083 */
3084static void
3085remove_from_journal(wk)
3086	struct worklist *wk;
3087{
3088	struct ufsmount *ump;
3089
3090	ump = VFSTOUFS(wk->wk_mp);
3091	LOCK_OWNED(ump);
3092#ifdef INVARIANTS
3093	{
3094		struct worklist *wkn;
3095
3096		LIST_FOREACH(wkn, &ump->softdep_journal_pending, wk_list)
3097			if (wkn == wk)
3098				break;
3099		if (wkn == NULL)
3100			panic("remove_from_journal: %p is not in journal", wk);
3101	}
3102#endif
3103	/*
3104	 * We emulate a TAILQ to save space in most structures which do not
3105	 * require TAILQ semantics.  Here we must update the tail position
3106	 * when removing the tail which is not the final entry. This works
3107	 * only if the worklist linkage is at the beginning of the structure.
3108	 */
3109	if (ump->softdep_journal_tail == wk)
3110		ump->softdep_journal_tail =
3111		    (struct worklist *)wk->wk_list.le_prev;
3112	WORKLIST_REMOVE(wk);
3113	ump->softdep_on_journal -= 1;
3114}
3115
3116/*
3117 * Check for journal space as well as dependency limits so the prelink
3118 * code can throttle both journaled and non-journaled filesystems.
3119 * Threshold is 0 for low and 1 for min.
3120 */
3121static int
3122journal_space(ump, thresh)
3123	struct ufsmount *ump;
3124	int thresh;
3125{
3126	struct jblocks *jblocks;
3127	int limit, avail;
3128
3129	jblocks = ump->softdep_jblocks;
3130	if (jblocks == NULL)
3131		return (1);
3132	/*
3133	 * We use a tighter restriction here to prevent request_cleanup(),
3134	 * running in other threads, from running into locks we currently hold.
3135	 * We have to be over the limit and our filesystem has to be
3136	 * responsible for more than our share of that usage.
3137	 */
3138	limit = (max_softdeps / 10) * 9;
3139	if (dep_current[D_INODEDEP] > limit &&
3140	    ump->softdep_curdeps[D_INODEDEP] > limit / stat_flush_threads)
3141		return (0);
3142	if (thresh)
3143		thresh = jblocks->jb_min;
3144	else
3145		thresh = jblocks->jb_low;
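	/*
	 * Records already queued in memory will consume journal space once
	 * written; subtract the space they will occupy (in DEV_BSIZE units)
	 * from the free block count before comparing against the threshold.
	 */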
3146	avail = (ump->softdep_on_journal * JREC_SIZE) / DEV_BSIZE;
3147	avail = jblocks->jb_free - avail;
3148
3149	return (avail > thresh);
3150}
3151
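/*
 * Suspend new writes to the filesystem when the journal is running low.
 * The softdep flusher is made the suspension owner so that it can continue
 * to make progress while ordinary writers wait for space to be reclaimed.
 */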
3152static void
3153journal_suspend(ump)
3154	struct ufsmount *ump;
3155{
3156	struct jblocks *jblocks;
3157	struct mount *mp;
3158	bool set;
3159
3160	mp = UFSTOVFS(ump);
3161	if ((mp->mnt_kern_flag & MNTK_SUSPEND) != 0)
3162		return;
3163
3164	jblocks = ump->softdep_jblocks;
3165	vfs_op_enter(mp);
3166	set = false;
3167	MNT_ILOCK(mp);
3168	if ((mp->mnt_kern_flag & MNTK_SUSPEND) == 0) {
3169		stat_journal_min++;
3170		mp->mnt_kern_flag |= MNTK_SUSPEND;
3171		mp->mnt_susp_owner = ump->softdep_flushtd;
3172		set = true;
3173	}
3174	jblocks->jb_suspended = 1;
3175	MNT_IUNLOCK(mp);
3176	if (!set)
3177		vfs_op_exit(mp);
3178}
3179
3180static int
3181journal_unsuspend(struct ufsmount *ump)
3182{
3183	struct jblocks *jblocks;
3184	struct mount *mp;
3185
3186	mp = UFSTOVFS(ump);
3187	jblocks = ump->softdep_jblocks;
3188
3189	if (jblocks != NULL && jblocks->jb_suspended &&
3190	    journal_space(ump, jblocks->jb_min)) {
3191		jblocks->jb_suspended = 0;
3192		FREE_LOCK(ump);
3193		mp->mnt_susp_owner = curthread;
3194		vfs_write_resume(mp, 0);
3195		ACQUIRE_LOCK(ump);
3196		return (1);
3197	}
3198	return (0);
3199}
3200
3201/*
3202 * Called before any allocation function to be certain that there is
3203 * sufficient space in the journal prior to creating any new records.
3204 * Since in the case of block allocation we may have multiple locked
3205 * buffers at the time of the actual allocation, we cannot block
3206 * when the journal records are created.  Doing so would create a deadlock
3207 * if any of these buffers needed to be flushed to reclaim space.  Instead
3208 * we require a sufficiently large amount of available space such that
3209 * each thread in the system could have passed this allocation check and
3210 * still have sufficient free space.  With 20% of a minimum journal size
3211 * of 1MB we have 6553 records available.
3212 */
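/*
 * For reference, the arithmetic behind the figure above, assuming the
 * 32-byte journal record size (JREC_SIZE):
 *
 *	0.20 * 1MB / 32 bytes per record ~= 6553 records
 */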
3213int
3214softdep_prealloc(vp, waitok)
3215	struct vnode *vp;
3216	int waitok;
3217{
3218	struct ufsmount *ump;
3219
3220	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
3221	    ("softdep_prealloc called on non-softdep filesystem"));
3222	/*
3223	 * Nothing to do if we are not running journaled soft updates.
3224	 * If we currently hold the snapshot lock, we must avoid
3225	 * handling other resources that could cause deadlock.  Do not
3226	 * touch the quota vnodes since they are typically locked with
3227	 * other vnode locks already held.
3228	 */
3229	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)) ||
3230	    (vp->v_vflag & VV_SYSTEM) != 0)
3231		return (0);
3232	ump = VFSTOUFS(vp->v_mount);
3233	ACQUIRE_LOCK(ump);
3234	if (journal_space(ump, 0)) {
3235		FREE_LOCK(ump);
3236		return (0);
3237	}
3238	stat_journal_low++;
3239	FREE_LOCK(ump);
3240	if (waitok == MNT_NOWAIT)
3241		return (ENOSPC);
3242	/*
3243	 * Attempt to sync this vnode once to flush any journal
3244	 * work attached to it.
3245	 */
3246	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
3247		ffs_syncvnode(vp, waitok, 0);
3248	ACQUIRE_LOCK(ump);
3249	process_removes(vp);
3250	process_truncates(vp);
3251	if (journal_space(ump, 0) == 0) {
3252		softdep_speedup(ump);
3253		if (journal_space(ump, 1) == 0)
3254			journal_suspend(ump);
3255	}
3256	FREE_LOCK(ump);
3257
3258	return (0);
3259}
3260
3261/*
3262 * Try hard to sync all data and metadata for the vnode, and to flush
3263 * workitems which might conflict with the vnode lock.  This is a
3264 * helper for softdep_prerename().
3265 */
3266static int
3267softdep_prerename_vnode(ump, vp)
3268	struct ufsmount *ump;
3269	struct vnode *vp;
3270{
3271	int error;
3272
3273	ASSERT_VOP_ELOCKED(vp, "prehandle");
3274	if (vp->v_data == NULL)
3275		return (0);
3276	error = VOP_FSYNC(vp, MNT_WAIT, curthread);
3277	if (error != 0)
3278		return (error);
3279	ACQUIRE_LOCK(ump);
3280	process_removes(vp);
3281	process_truncates(vp);
3282	FREE_LOCK(ump);
3283	return (0);
3284}
3285
3286/*
3287 * Must be called from VOP_RENAME() after all vnodes are locked.
3288 * Ensures that there is enough journal space for rename.  It differs
3289 * from softdep_prelink() in that it has to handle four vnodes.
3291 */
3292int
3293softdep_prerename(fdvp, fvp, tdvp, tvp)
3294	struct vnode *fdvp;
3295	struct vnode *fvp;
3296	struct vnode *tdvp;
3297	struct vnode *tvp;
3298{
3299	struct ufsmount *ump;
3300	int error;
3301
3302	ump = VFSTOUFS(fdvp->v_mount);
3303
3304	if (journal_space(ump, 0))
3305		return (0);
3306
3307	VOP_UNLOCK(tdvp);
3308	VOP_UNLOCK(fvp);
3309	if (tvp != NULL && tvp != tdvp)
3310		VOP_UNLOCK(tvp);
3311
3312	error = softdep_prerename_vnode(ump, fdvp);
3313	VOP_UNLOCK(fdvp);
3314	if (error != 0)
3315		return (error);
3316
3317	VOP_LOCK(fvp, LK_EXCLUSIVE | LK_RETRY);
3318	error = softdep_prerename_vnode(ump, fvp);
3319	VOP_UNLOCK(fvp);
3320	if (error != 0)
3321		return (error);
3322
3323	if (tdvp != fdvp) {
3324		VOP_LOCK(tdvp, LK_EXCLUSIVE | LK_RETRY);
3325		error = softdep_prerename_vnode(ump, tdvp);
3326		VOP_UNLOCK(tdvp);
3327		if (error != 0)
3328			return (error);
3329	}
3330
3331	if (tvp != fvp && tvp != NULL) {
3332		VOP_LOCK(tvp, LK_EXCLUSIVE | LK_RETRY);
3333		error = softdep_prerename_vnode(ump, tvp);
3334		VOP_UNLOCK(tvp);
3335		if (error != 0)
3336			return (error);
3337	}
3338
3339	ACQUIRE_LOCK(ump);
3340	softdep_speedup(ump);
3341	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3342	if (journal_space(ump, 0) == 0) {
3343		softdep_speedup(ump);
3344		if (journal_space(ump, 1) == 0)
3345			journal_suspend(ump);
3346	}
3347	FREE_LOCK(ump);
3348	return (ERELOOKUP);
3349}
3350
3351/*
3352 * Before adjusting a link count on a vnode verify that we have sufficient
3353 * journal space.  If not, process operations that depend on the currently
3354 * locked pair of vnodes to try to flush space as the syncer, buf daemon,
3355 * and softdep flush threads cannot acquire these locks to reclaim space.
3356 *
3357 * Returns 0 if all owned locks are still valid and were not dropped
3358 * in the process; otherwise it returns either an error from sync,
3359 * or ERELOOKUP if any of the locks were re-acquired.  In the latter
3360 * case, the state of the vnodes cannot be relied upon and our VFS
3361 * syscall must be restarted at top level from the lookup.
3362 */
3363int
3364softdep_prelink(dvp, vp)
3365	struct vnode *dvp;
3366	struct vnode *vp;
3367{
3368	struct ufsmount *ump;
3369
3370	ASSERT_VOP_ELOCKED(dvp, "prelink dvp");
3371	if (vp != NULL)
3372		ASSERT_VOP_ELOCKED(vp, "prelink vp");
3373	ump = VFSTOUFS(dvp->v_mount);
3374
3375	/*
3376	 * Nothing to do if we have sufficient journal space.  We skip
3377	 * flushing when vp is a snapshot to avoid deadlock where
3378	 * another thread is trying to update the inodeblock for dvp
3379	 * and is waiting on snaplk that vp holds.
3380	 */
3381	if (journal_space(ump, 0) || (vp != NULL && IS_SNAPSHOT(VTOI(vp))))
3382		return (0);
3383
3384	stat_journal_low++;
3385	if (vp != NULL) {
3386		VOP_UNLOCK(dvp);
3387		ffs_syncvnode(vp, MNT_NOWAIT, 0);
3388		vn_lock_pair(dvp, false, vp, true);
3389		if (dvp->v_data == NULL)
3390			return (ERELOOKUP);
3391	}
3392	if (vp != NULL)
3393		VOP_UNLOCK(vp);
3394	ffs_syncvnode(dvp, MNT_WAIT, 0);
3395	VOP_UNLOCK(dvp);
3396
3397	/* Process vp before dvp as it may create .. removes. */
3398	if (vp != NULL) {
3399		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
3400		if (vp->v_data == NULL) {
3401			vn_lock_pair(dvp, false, vp, true);
3402			return (ERELOOKUP);
3403		}
3404		ACQUIRE_LOCK(ump);
3405		process_removes(vp);
3406		process_truncates(vp);
3407		FREE_LOCK(ump);
3408		VOP_UNLOCK(vp);
3409	}
3410
3411	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
3412	if (dvp->v_data == NULL) {
3413		vn_lock_pair(dvp, true, vp, false);
3414		return (ERELOOKUP);
3415	}
3416
3417	ACQUIRE_LOCK(ump);
3418	process_removes(dvp);
3419	process_truncates(dvp);
3420	VOP_UNLOCK(dvp);
3421	softdep_speedup(ump);
3422
3423	process_worklist_item(UFSTOVFS(ump), 2, LK_NOWAIT);
3424	if (journal_space(ump, 0) == 0) {
3425		softdep_speedup(ump);
3426		if (journal_space(ump, 1) == 0)
3427			journal_suspend(ump);
3428	}
3429	FREE_LOCK(ump);
3430
3431	vn_lock_pair(dvp, false, vp, false);
3432	return (ERELOOKUP);
3433}
3434
3435static void
3436jseg_write(ump, jseg, data)
3437	struct ufsmount *ump;
3438	struct jseg *jseg;
3439	uint8_t *data;
3440{
3441	struct jsegrec *rec;
3442
3443	rec = (struct jsegrec *)data;
3444	rec->jsr_seq = jseg->js_seq;
3445	rec->jsr_oldest = jseg->js_oldseq;
3446	rec->jsr_cnt = jseg->js_cnt;
3447	rec->jsr_blocks = jseg->js_size / ump->um_devvp->v_bufobj.bo_bsize;
3448	rec->jsr_crc = 0;
3449	rec->jsr_time = ump->um_fs->fs_mtime;
3450}
3451
3452static inline void
3453inoref_write(inoref, jseg, rec)
3454	struct inoref *inoref;
3455	struct jseg *jseg;
3456	struct jrefrec *rec;
3457{
3458
3459	inoref->if_jsegdep->jd_seg = jseg;
3460	rec->jr_ino = inoref->if_ino;
3461	rec->jr_parent = inoref->if_parent;
3462	rec->jr_nlink = inoref->if_nlink;
3463	rec->jr_mode = inoref->if_mode;
3464	rec->jr_diroff = inoref->if_diroff;
3465}
3466
3467static void
3468jaddref_write(jaddref, jseg, data)
3469	struct jaddref *jaddref;
3470	struct jseg *jseg;
3471	uint8_t *data;
3472{
3473	struct jrefrec *rec;
3474
3475	rec = (struct jrefrec *)data;
3476	rec->jr_op = JOP_ADDREF;
3477	inoref_write(&jaddref->ja_ref, jseg, rec);
3478}
3479
3480static void
3481jremref_write(jremref, jseg, data)
3482	struct jremref *jremref;
3483	struct jseg *jseg;
3484	uint8_t *data;
3485{
3486	struct jrefrec *rec;
3487
3488	rec = (struct jrefrec *)data;
3489	rec->jr_op = JOP_REMREF;
3490	inoref_write(&jremref->jr_ref, jseg, rec);
3491}
3492
3493static void
3494jmvref_write(jmvref, jseg, data)
3495	struct jmvref *jmvref;
3496	struct jseg *jseg;
3497	uint8_t *data;
3498{
3499	struct jmvrec *rec;
3500
3501	rec = (struct jmvrec *)data;
3502	rec->jm_op = JOP_MVREF;
3503	rec->jm_ino = jmvref->jm_ino;
3504	rec->jm_parent = jmvref->jm_parent;
3505	rec->jm_oldoff = jmvref->jm_oldoff;
3506	rec->jm_newoff = jmvref->jm_newoff;
3507}
3508
3509static void
3510jnewblk_write(jnewblk, jseg, data)
3511	struct jnewblk *jnewblk;
3512	struct jseg *jseg;
3513	uint8_t *data;
3514{
3515	struct jblkrec *rec;
3516
3517	jnewblk->jn_jsegdep->jd_seg = jseg;
3518	rec = (struct jblkrec *)data;
3519	rec->jb_op = JOP_NEWBLK;
3520	rec->jb_ino = jnewblk->jn_ino;
3521	rec->jb_blkno = jnewblk->jn_blkno;
3522	rec->jb_lbn = jnewblk->jn_lbn;
3523	rec->jb_frags = jnewblk->jn_frags;
3524	rec->jb_oldfrags = jnewblk->jn_oldfrags;
3525}
3526
3527static void
3528jfreeblk_write(jfreeblk, jseg, data)
3529	struct jfreeblk *jfreeblk;
3530	struct jseg *jseg;
3531	uint8_t *data;
3532{
3533	struct jblkrec *rec;
3534
3535	jfreeblk->jf_dep.jb_jsegdep->jd_seg = jseg;
3536	rec = (struct jblkrec *)data;
3537	rec->jb_op = JOP_FREEBLK;
3538	rec->jb_ino = jfreeblk->jf_ino;
3539	rec->jb_blkno = jfreeblk->jf_blkno;
3540	rec->jb_lbn = jfreeblk->jf_lbn;
3541	rec->jb_frags = jfreeblk->jf_frags;
3542	rec->jb_oldfrags = 0;
3543}
3544
3545static void
3546jfreefrag_write(jfreefrag, jseg, data)
3547	struct jfreefrag *jfreefrag;
3548	struct jseg *jseg;
3549	uint8_t *data;
3550{
3551	struct jblkrec *rec;
3552
3553	jfreefrag->fr_jsegdep->jd_seg = jseg;
3554	rec = (struct jblkrec *)data;
3555	rec->jb_op = JOP_FREEBLK;
3556	rec->jb_ino = jfreefrag->fr_ino;
3557	rec->jb_blkno = jfreefrag->fr_blkno;
3558	rec->jb_lbn = jfreefrag->fr_lbn;
3559	rec->jb_frags = jfreefrag->fr_frags;
3560	rec->jb_oldfrags = 0;
3561}
3562
3563static void
3564jtrunc_write(jtrunc, jseg, data)
3565	struct jtrunc *jtrunc;
3566	struct jseg *jseg;
3567	uint8_t *data;
3568{
3569	struct jtrncrec *rec;
3570
3571	jtrunc->jt_dep.jb_jsegdep->jd_seg = jseg;
3572	rec = (struct jtrncrec *)data;
3573	rec->jt_op = JOP_TRUNC;
3574	rec->jt_ino = jtrunc->jt_ino;
3575	rec->jt_size = jtrunc->jt_size;
3576	rec->jt_extsize = jtrunc->jt_extsize;
3577}
3578
3579static void
3580jfsync_write(jfsync, jseg, data)
3581	struct jfsync *jfsync;
3582	struct jseg *jseg;
3583	uint8_t *data;
3584{
3585	struct jtrncrec *rec;
3586
3587	rec = (struct jtrncrec *)data;
3588	rec->jt_op = JOP_SYNC;
3589	rec->jt_ino = jfsync->jfs_ino;
3590	rec->jt_size = jfsync->jfs_size;
3591	rec->jt_extsize = jfsync->jfs_extsize;
3592}
3593
3594static void
3595softdep_flushjournal(mp)
3596	struct mount *mp;
3597{
3598	struct jblocks *jblocks;
3599	struct ufsmount *ump;
3600
3601	if (MOUNTEDSUJ(mp) == 0)
3602		return;
3603	ump = VFSTOUFS(mp);
3604	jblocks = ump->softdep_jblocks;
3605	ACQUIRE_LOCK(ump);
3606	while (ump->softdep_on_journal) {
3607		jblocks->jb_needseg = 1;
3608		softdep_process_journal(mp, NULL, MNT_WAIT);
3609	}
3610	FREE_LOCK(ump);
3611}
3612
3613static void softdep_synchronize_completed(struct bio *);
3614static void softdep_synchronize(struct bio *, struct ufsmount *, void *);
3615
3616static void
3617softdep_synchronize_completed(bp)
3618        struct bio *bp;
3619{
3620	struct jseg *oldest;
3621	struct jseg *jseg;
3622	struct ufsmount *ump;
3623
3624	/*
3625	 * caller1 marks the last segment written before we issued the
3626	 * synchronize cache.
3627	 */
3628	jseg = bp->bio_caller1;
3629	if (jseg == NULL) {
3630		g_destroy_bio(bp);
3631		return;
3632	}
3633	ump = VFSTOUFS(jseg->js_list.wk_mp);
3634	ACQUIRE_LOCK(ump);
3635	oldest = NULL;
3636	/*
3637	 * Mark all the journal entries waiting on the synchronize cache
3638	 * as completed so they may continue on.
3639	 */
3640	while (jseg != NULL && (jseg->js_state & COMPLETE) == 0) {
3641		jseg->js_state |= COMPLETE;
3642		oldest = jseg;
3643		jseg = TAILQ_PREV(jseg, jseglst, js_next);
3644	}
3645	/*
3646	 * Restart deferred journal entry processing from the oldest
3647	 * completed jseg.
3648	 */
3649	if (oldest)
3650		complete_jsegs(oldest);
3651
3652	FREE_LOCK(ump);
3653	g_destroy_bio(bp);
3654}
3655
3656/*
3657 * Send BIO_FLUSH/SYNCHRONIZE CACHE to the device to enforce write ordering
3658 * barriers.  The journal must be written prior to any blocks that depend
3659 * on it, and the journal cannot be released until the blocks have been
3660 * written.  This code handles both barriers simultaneously.
3661 */
3662static void
3663softdep_synchronize(bp, ump, caller1)
3664	struct bio *bp;
3665	struct ufsmount *ump;
3666	void *caller1;
3667{
3668
3669	bp->bio_cmd = BIO_FLUSH;
3670	bp->bio_flags |= BIO_ORDERED;
3671	bp->bio_data = NULL;
3672	bp->bio_offset = ump->um_cp->provider->mediasize;
3673	bp->bio_length = 0;
3674	bp->bio_done = softdep_synchronize_completed;
3675	bp->bio_caller1 = caller1;
3676	g_io_request(bp, ump->um_cp);
3677}
3678
3679/*
3680 * Flush some journal records to disk.
3681 */
3682static void
3683softdep_process_journal(mp, needwk, flags)
3684	struct mount *mp;
3685	struct worklist *needwk;
3686	int flags;
3687{
3688	struct jblocks *jblocks;
3689	struct ufsmount *ump;
3690	struct worklist *wk;
3691	struct jseg *jseg;
3692	struct buf *bp;
3693	struct bio *bio;
3694	uint8_t *data;
3695	struct fs *fs;
3696	int shouldflush;
3697	int segwritten;
3698	int jrecmin;	/* Minimum records per block. */
3699	int jrecmax;	/* Maximum records per block. */
3700	int size;
3701	int cnt;
3702	int off;
3703	int devbsize;
3704
3705	if (MOUNTEDSUJ(mp) == 0)
3706		return;
3707	shouldflush = softdep_flushcache;
3708	bio = NULL;
3709	jseg = NULL;
3710	ump = VFSTOUFS(mp);
3711	LOCK_OWNED(ump);
3712	fs = ump->um_fs;
3713	jblocks = ump->softdep_jblocks;
3714	devbsize = ump->um_devvp->v_bufobj.bo_bsize;
3715	/*
3716	 * We write anywhere between a disk block and fs block.  The upper
3717	 * bound is picked to prevent buffer cache fragmentation and limit
3718	 * processing time per I/O.
3719	 */
3720	jrecmin = (devbsize / JREC_SIZE) - 1; /* -1 for seg header */
3721	jrecmax = (fs->fs_bsize / devbsize) * jrecmin;
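	/*
	 * For example, with 32-byte journal records and a 512-byte device
	 * block, jrecmin would be 15 (one slot per device block is reserved
	 * for the segment header), and a 32K filesystem block would then
	 * give a jrecmax of 64 * 15 = 960 records.  These figures are only
	 * illustrative; the real values depend on the device and filesystem
	 * block sizes in use.
	 */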
3722	segwritten = 0;
3723	for (;;) {
3724		cnt = ump->softdep_on_journal;
3725		/*
3726		 * Criteria for writing a segment:
3727		 * 1) We have a full block.
3728		 * 2) We're called from jwait() and haven't found the
3729		 *    journal item yet.
3730		 * 3) Always write if needseg is set.
3731		 * 4) If we are called from process_worklist and have
3732		 *    not yet written anything we write a partial block
3733		 *    to enforce a 1 second maximum latency on journal
3734		 *    entries.
3735		 */
3736		if (cnt < (jrecmax - 1) && needwk == NULL &&
3737		    jblocks->jb_needseg == 0 && (segwritten || cnt == 0))
3738			break;
3739		cnt++;
3740		/*
3741		 * Verify some free journal space.  softdep_prealloc() should
3742		 * guarantee that we don't run out so this is indicative of
3743		 * a problem with the flow control.  Try to recover
3744		 * gracefully in any event.
3745		 */
3746		while (jblocks->jb_free == 0) {
3747			if (flags != MNT_WAIT)
3748				break;
3749			printf("softdep: Out of journal space!\n");
3750			softdep_speedup(ump);
3751			msleep(jblocks, LOCK_PTR(ump), PRIBIO, "jblocks", hz);
3752		}
3753		FREE_LOCK(ump);
3754		jseg = malloc(sizeof(*jseg), M_JSEG, M_SOFTDEP_FLAGS);
3755		workitem_alloc(&jseg->js_list, D_JSEG, mp);
3756		LIST_INIT(&jseg->js_entries);
3757		LIST_INIT(&jseg->js_indirs);
3758		jseg->js_state = ATTACHED;
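		/*
		 * If cache flushing is disabled the segment is treated as
		 * complete as soon as its write finishes; otherwise set
		 * aside a bio now for the BIO_FLUSH that
		 * softdep_synchronize() issues once a segment has been
		 * written.
		 */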
3759		if (shouldflush == 0)
3760			jseg->js_state |= COMPLETE;
3761		else if (bio == NULL)
3762			bio = g_alloc_bio();
3763		jseg->js_jblocks = jblocks;
3764		bp = geteblk(fs->fs_bsize, 0);
3765		ACQUIRE_LOCK(ump);
3766		/*
3767		 * If there was a race while we were allocating the block
3768		 * and jseg, the entry we care about was likely written.
3769		 * We bail out in both the WAIT and NOWAIT case and assume
3770		 * the caller will loop if the entry it cares about is
3771		 * not written.
3772		 */
3773		cnt = ump->softdep_on_journal;
3774		if (cnt + jblocks->jb_needseg == 0 || jblocks->jb_free == 0) {
3775			bp->b_flags |= B_INVAL | B_NOCACHE;
3776			WORKITEM_FREE(jseg, D_JSEG);
3777			FREE_LOCK(ump);
3778			brelse(bp);
3779			ACQUIRE_LOCK(ump);
3780			break;
3781		}
3782		/*
3783		 * Calculate the disk block size required for the available
3784		 * records rounded to the min size.
3785		 */
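		/*
		 * E.g., with a jrecmin of 15, twenty pending records round
		 * up to two device blocks (howmany(20, 15) == 2), while
		 * jrecmax or more records always get a full filesystem
		 * block.  (Illustrative figures; see the jrecmin/jrecmax
		 * calculation above.)
		 */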
3786		if (cnt == 0)
3787			size = devbsize;
3788		else if (cnt < jrecmax)
3789			size = howmany(cnt, jrecmin) * devbsize;
3790		else
3791			size = fs->fs_bsize;
3792		/*
3793		 * Allocate a disk block for this journal data and account
3794		 * for truncation of the requested size if enough contiguous
3795		 * space was not available.
3796		 */
3797		bp->b_blkno = jblocks_alloc(jblocks, size, &size);
3798		bp->b_lblkno = bp->b_blkno;
3799		bp->b_offset = bp->b_blkno * DEV_BSIZE;
3800		bp->b_bcount = size;
3801		bp->b_flags &= ~B_INVAL;
3802		bp->b_flags |= B_VALIDSUSPWRT | B_NOCOPY;
3803		/*
3804		 * Initialize our jseg with cnt records.  Assign the next
3805		 * sequence number to it and link it in-order.
3806		 */
3807		cnt = MIN(cnt, (size / devbsize) * jrecmin);
3808		jseg->js_buf = bp;
3809		jseg->js_cnt = cnt;
3810		jseg->js_refs = cnt + 1;	/* Self ref. */
3811		jseg->js_size = size;
3812		jseg->js_seq = jblocks->jb_nextseq++;
3813		if (jblocks->jb_oldestseg == NULL)
3814			jblocks->jb_oldestseg = jseg;
3815		jseg->js_oldseq = jblocks->jb_oldestseg->js_seq;
3816		TAILQ_INSERT_TAIL(&jblocks->jb_segs, jseg, js_next);
3817		if (jblocks->jb_writeseg == NULL)
3818			jblocks->jb_writeseg = jseg;
3819		/*
3820		 * Start filling in records from the pending list.
3821		 */
3822		data = bp->b_data;
3823		off = 0;
3824
3825		/*
3826		 * Always put a header on the first block.
3827		 * XXX As with below, there might not be a chance to get
3828		 * into the loop.  Ensure that something valid is written.
3829		 */
3830		jseg_write(ump, jseg, data);
3831		off += JREC_SIZE;
3832		data = bp->b_data + off;
3833
3834		/*
3835		 * XXX Something is wrong here.  There's no work to do,
3836		 * but we need to perform an I/O and allow it to complete
3837		 * anyway.
3838		 */
3839		if (LIST_EMPTY(&ump->softdep_journal_pending))
3840			stat_emptyjblocks++;
3841
3842		while ((wk = LIST_FIRST(&ump->softdep_journal_pending))
3843		    != NULL) {
3844			if (cnt == 0)
3845				break;
3846			/* Place a segment header on every device block. */
3847			if ((off % devbsize) == 0) {
3848				jseg_write(ump, jseg, data);
3849				off += JREC_SIZE;
3850				data = bp->b_data + off;
3851			}
3852			if (wk == needwk)
3853				needwk = NULL;
3854			remove_from_journal(wk);
3855			wk->wk_state |= INPROGRESS;
3856			WORKLIST_INSERT(&jseg->js_entries, wk);
3857			switch (wk->wk_type) {
3858			case D_JADDREF:
3859				jaddref_write(WK_JADDREF(wk), jseg, data);
3860				break;
3861			case D_JREMREF:
3862				jremref_write(WK_JREMREF(wk), jseg, data);
3863				break;
3864			case D_JMVREF:
3865				jmvref_write(WK_JMVREF(wk), jseg, data);
3866				break;
3867			case D_JNEWBLK:
3868				jnewblk_write(WK_JNEWBLK(wk), jseg, data);
3869				break;
3870			case D_JFREEBLK:
3871				jfreeblk_write(WK_JFREEBLK(wk), jseg, data);
3872				break;
3873			case D_JFREEFRAG:
3874				jfreefrag_write(WK_JFREEFRAG(wk), jseg, data);
3875				break;
3876			case D_JTRUNC:
3877				jtrunc_write(WK_JTRUNC(wk), jseg, data);
3878				break;
3879			case D_JFSYNC:
3880				jfsync_write(WK_JFSYNC(wk), jseg, data);
3881				break;
3882			default:
3883				panic("process_journal: Unknown type %s",
3884				    TYPENAME(wk->wk_type));
3885				/* NOTREACHED */
3886			}
3887			off += JREC_SIZE;
3888			data = bp->b_data + off;
3889			cnt--;
3890		}
3891
3892		/* Clear any remaining space so we don't leak kernel data */
3893		if (size > off)
3894			bzero(data, size - off);
3895
3896		/*
3897		 * Write this one buffer and continue.
3898		 */
3899		segwritten = 1;
3900		jblocks->jb_needseg = 0;
3901		WORKLIST_INSERT(&bp->b_dep, &jseg->js_list);
3902		FREE_LOCK(ump);
3903		bp->b_xflags |= BX_CVTENXIO;
3904		pbgetvp(ump->um_devvp, bp);
3905		/*
3906		 * We only do the blocking wait once we find the journal
3907		 * entry we're looking for.
3908		 */
3909		if (needwk == NULL && flags == MNT_WAIT)
3910			bwrite(bp);
3911		else
3912			bawrite(bp);
3913		ACQUIRE_LOCK(ump);
3914	}
3915	/*
3916	 * If we wrote a segment issue a synchronize cache so the journal
3917	 * is reflected on disk before the data is written.  Since reclaiming
3918	 * journal space also requires writing a journal record this
3919	 * process also enforces a barrier before reclamation.
3920	 */
3921	if (segwritten && shouldflush) {
3922		softdep_synchronize(bio, ump,
3923		    TAILQ_LAST(&jblocks->jb_segs, jseglst));
3924	} else if (bio)
3925		g_destroy_bio(bio);
3926	/*
3927	 * If we've suspended the filesystem because we ran out of journal
3928	 * space, either try to sync it here to make some progress or
3929	 * unsuspend it if we already have.
3930	 */
3931	if (flags == 0 && jblocks->jb_suspended) {
3932		if (journal_unsuspend(ump))
3933			return;
3934		FREE_LOCK(ump);
3935		VFS_SYNC(mp, MNT_NOWAIT);
3936		ffs_sbupdate(ump, MNT_WAIT, 0);
3937		ACQUIRE_LOCK(ump);
3938	}
3939}
3940
3941/*
3942 * Complete a jseg, allowing all dependencies awaiting journal writes
3943 * to proceed.  Each journal dependency also attaches a jsegdep to dependent
3944 * structures so that the journal segment can be freed to reclaim space.
3945 */
3946static void
3947complete_jseg(jseg)
3948	struct jseg *jseg;
3949{
3950	struct worklist *wk;
3951	struct jmvref *jmvref;
3952#ifdef INVARIANTS
3953	int i = 0;
3954#endif
3955
3956	while ((wk = LIST_FIRST(&jseg->js_entries)) != NULL) {
3957		WORKLIST_REMOVE(wk);
3958		wk->wk_state &= ~INPROGRESS;
3959		wk->wk_state |= COMPLETE;
3960		KASSERT(i++ < jseg->js_cnt,
3961		    ("handle_written_jseg: overflow %d >= %d",
3962		    i - 1, jseg->js_cnt));
3963		switch (wk->wk_type) {
3964		case D_JADDREF:
3965			handle_written_jaddref(WK_JADDREF(wk));
3966			break;
3967		case D_JREMREF:
3968			handle_written_jremref(WK_JREMREF(wk));
3969			break;
3970		case D_JMVREF:
3971			rele_jseg(jseg);	/* No jsegdep. */
3972			jmvref = WK_JMVREF(wk);
3973			LIST_REMOVE(jmvref, jm_deps);
3974			if ((jmvref->jm_pagedep->pd_state & ONWORKLIST) == 0)
3975				free_pagedep(jmvref->jm_pagedep);
3976			WORKITEM_FREE(jmvref, D_JMVREF);
3977			break;
3978		case D_JNEWBLK:
3979			handle_written_jnewblk(WK_JNEWBLK(wk));
3980			break;
3981		case D_JFREEBLK:
3982			handle_written_jblkdep(&WK_JFREEBLK(wk)->jf_dep);
3983			break;
3984		case D_JTRUNC:
3985			handle_written_jblkdep(&WK_JTRUNC(wk)->jt_dep);
3986			break;
3987		case D_JFSYNC:
3988			rele_jseg(jseg);	/* No jsegdep. */
3989			WORKITEM_FREE(wk, D_JFSYNC);
3990			break;
3991		case D_JFREEFRAG:
3992			handle_written_jfreefrag(WK_JFREEFRAG(wk));
3993			break;
3994		default:
3995			panic("handle_written_jseg: Unknown type %s",
3996			    TYPENAME(wk->wk_type));
3997			/* NOTREACHED */
3998		}
3999	}
4000	/* Release the self reference so the structure may be freed. */
4001	rele_jseg(jseg);
4002}
4003
4004/*
4005 * Determine which jsegs are ready for completion processing.  Waits for
4006 * synchronize cache to complete as well as forcing in-order completion
4007 * of journal entries.
4008 */
4009static void
4010complete_jsegs(jseg)
4011	struct jseg *jseg;
4012{
4013	struct jblocks *jblocks;
4014	struct jseg *jsegn;
4015
4016	jblocks = jseg->js_jblocks;
4017	/*
4018	 * Don't allow out-of-order completions.  If this isn't the first
4019	 * block, wait for it to write before we're done.
4020	 */
4021	if (jseg != jblocks->jb_writeseg)
4022		return;
4023	/* Iterate through available jsegs processing their entries. */
4024	while (jseg && (jseg->js_state & ALLCOMPLETE) == ALLCOMPLETE) {
4025		jblocks->jb_oldestwrseq = jseg->js_oldseq;
4026		jsegn = TAILQ_NEXT(jseg, js_next);
4027		complete_jseg(jseg);
4028		jseg = jsegn;
4029	}
4030	jblocks->jb_writeseg = jseg;
4031	/*
4032	 * Attempt to free jsegs now that oldestwrseq may have advanced.
4033	 */
4034	free_jsegs(jblocks);
4035}
4036
4037/*
4038 * Mark a jseg as DEPCOMPLETE and throw away the buffer.  Attempt to handle
4039 * the final completions.
4040 */
4041static void
4042handle_written_jseg(jseg, bp)
4043	struct jseg *jseg;
4044	struct buf *bp;
4045{
4046
4047	if (jseg->js_refs == 0)
4048		panic("handle_written_jseg: No self-reference on %p", jseg);
4049	jseg->js_state |= DEPCOMPLETE;
4050	/*
4051	 * We'll never need this buffer again, set flags so it will be
4052	 * discarded.
4053	 */
4054	bp->b_flags |= B_INVAL | B_NOCACHE;
4055	pbrelvp(bp);
4056	complete_jsegs(jseg);
4057}
4058
4059static inline struct jsegdep *
4060inoref_jseg(inoref)
4061	struct inoref *inoref;
4062{
4063	struct jsegdep *jsegdep;
4064
4065	jsegdep = inoref->if_jsegdep;
4066	inoref->if_jsegdep = NULL;
4067
4068	return (jsegdep);
4069}
4070
4071/*
4072 * Called once a jremref has made it to stable store.  The jremref is marked
4073 * complete and we attempt to free it.  Any pagedep writes sleeping while
4074 * waiting for the jremref to complete will be awoken by free_jremref.
4075 */
4076static void
4077handle_written_jremref(jremref)
4078	struct jremref *jremref;
4079{
4080	struct inodedep *inodedep;
4081	struct jsegdep *jsegdep;
4082	struct dirrem *dirrem;
4083
4084	/* Grab the jsegdep. */
4085	jsegdep = inoref_jseg(&jremref->jr_ref);
4086	/*
4087	 * Remove us from the inoref list.
4088	 */
4089	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino,
4090	    0, &inodedep) == 0)
4091		panic("handle_written_jremref: Lost inodedep");
4092	TAILQ_REMOVE(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
4093	/*
4094	 * Complete the dirrem.
4095	 */
4096	dirrem = jremref->jr_dirrem;
4097	jremref->jr_dirrem = NULL;
4098	LIST_REMOVE(jremref, jr_deps);
4099	jsegdep->jd_state |= jremref->jr_state & MKDIR_PARENT;
4100	jwork_insert(&dirrem->dm_jwork, jsegdep);
4101	if (LIST_EMPTY(&dirrem->dm_jremrefhd) &&
4102	    (dirrem->dm_state & COMPLETE) != 0)
4103		add_to_worklist(&dirrem->dm_list, 0);
4104	free_jremref(jremref);
4105}
4106
4107/*
4108 * Called once a jaddref has made it to stable store.  The dependency is
4109 * marked complete and any dependent structures are added to the inode
4110 * bufwait list to be completed as soon as it is written.  If a bitmap write
4111 * depends on this entry we move the inode into the inodedephd of the
4112 * bmsafemap dependency and attempt to remove the jaddref from the bmsafemap.
4113 */
4114static void
4115handle_written_jaddref(jaddref)
4116	struct jaddref *jaddref;
4117{
4118	struct jsegdep *jsegdep;
4119	struct inodedep *inodedep;
4120	struct diradd *diradd;
4121	struct mkdir *mkdir;
4122
4123	/* Grab the jsegdep. */
4124	jsegdep = inoref_jseg(&jaddref->ja_ref);
4125	mkdir = NULL;
4126	diradd = NULL;
4127	if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4128	    0, &inodedep) == 0)
4129		panic("handle_written_jaddref: Lost inodedep.");
4130	if (jaddref->ja_diradd == NULL)
4131		panic("handle_written_jaddref: No dependency");
4132	if (jaddref->ja_diradd->da_list.wk_type == D_DIRADD) {
4133		diradd = jaddref->ja_diradd;
4134		WORKLIST_INSERT(&inodedep->id_bufwait, &diradd->da_list);
4135	} else if (jaddref->ja_state & MKDIR_PARENT) {
4136		mkdir = jaddref->ja_mkdir;
4137		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir->md_list);
4138	} else if (jaddref->ja_state & MKDIR_BODY)
4139		mkdir = jaddref->ja_mkdir;
4140	else
4141		panic("handle_written_jaddref: Unknown dependency %p",
4142		    jaddref->ja_diradd);
4143	jaddref->ja_diradd = NULL;	/* also clears ja_mkdir */
4144	/*
4145	 * Remove us from the inode list.
4146	 */
4147	TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref, if_deps);
4148	/*
4149	 * The mkdir may be waiting on the jaddref to clear before freeing.
4150	 */
4151	if (mkdir) {
4152		KASSERT(mkdir->md_list.wk_type == D_MKDIR,
4153		    ("handle_written_jaddref: Incorrect type for mkdir %s",
4154		    TYPENAME(mkdir->md_list.wk_type)));
4155		mkdir->md_jaddref = NULL;
4156		diradd = mkdir->md_diradd;
4157		mkdir->md_state |= DEPCOMPLETE;
4158		complete_mkdir(mkdir);
4159	}
4160	jwork_insert(&diradd->da_jwork, jsegdep);
4161	if (jaddref->ja_state & NEWBLOCK) {
4162		inodedep->id_state |= ONDEPLIST;
4163		LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_inodedephd,
4164		    inodedep, id_deps);
4165	}
4166	free_jaddref(jaddref);
4167}
4168
4169/*
4170 * Called once a jnewblk journal is written.  The allocdirect or allocindir
4171 * is placed in the bmsafemap to await notification of a written bitmap.  If
4172 * the operation was canceled we add the segdep to the appropriate
4173 * dependency to free the journal space once the canceling operation
4174 * completes.
4175 */
4176static void
4177handle_written_jnewblk(jnewblk)
4178	struct jnewblk *jnewblk;
4179{
4180	struct bmsafemap *bmsafemap;
4181	struct freefrag *freefrag;
4182	struct freework *freework;
4183	struct jsegdep *jsegdep;
4184	struct newblk *newblk;
4185
4186	/* Grab the jsegdep. */
4187	jsegdep = jnewblk->jn_jsegdep;
4188	jnewblk->jn_jsegdep = NULL;
4189	if (jnewblk->jn_dep == NULL)
4190		panic("handle_written_jnewblk: No dependency for the segdep.");
4191	switch (jnewblk->jn_dep->wk_type) {
4192	case D_NEWBLK:
4193	case D_ALLOCDIRECT:
4194	case D_ALLOCINDIR:
4195		/*
4196		 * Add the written block to the bmsafemap so it can
4197		 * be notified when the bitmap is on disk.
4198		 */
4199		newblk = WK_NEWBLK(jnewblk->jn_dep);
4200		newblk->nb_jnewblk = NULL;
4201		if ((newblk->nb_state & GOINGAWAY) == 0) {
4202			bmsafemap = newblk->nb_bmsafemap;
4203			newblk->nb_state |= ONDEPLIST;
4204			LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk,
4205			    nb_deps);
4206		}
4207		jwork_insert(&newblk->nb_jwork, jsegdep);
4208		break;
4209	case D_FREEFRAG:
4210		/*
4211		 * A new block is being removed by a freefrag because it was
4212		 * replaced by a fragment extension.
4213		 */
4214		freefrag = WK_FREEFRAG(jnewblk->jn_dep);
4215		freefrag->ff_jdep = NULL;
4216		jwork_insert(&freefrag->ff_jwork, jsegdep);
4217		break;
4218	case D_FREEWORK:
4219		/*
4220		 * A direct block was removed by truncate.
4221		 */
4222		freework = WK_FREEWORK(jnewblk->jn_dep);
4223		freework->fw_jnewblk = NULL;
4224		jwork_insert(&freework->fw_freeblks->fb_jwork, jsegdep);
4225		break;
4226	default:
4227		panic("handle_written_jnewblk: Unknown type %d.",
4228		    jnewblk->jn_dep->wk_type);
4229	}
4230	jnewblk->jn_dep = NULL;
4231	free_jnewblk(jnewblk);
4232}
4233
4234/*
4235 * Cancel a jfreefrag that won't be needed, probably due to colliding with
4236 * an in-flight allocation that has not yet been committed.  Divorce us
4237 * from the freefrag and mark it DEPCOMPLETE so that it may be added
4238 * to the worklist.
4239 */
4240static void
4241cancel_jfreefrag(jfreefrag)
4242	struct jfreefrag *jfreefrag;
4243{
4244	struct freefrag *freefrag;
4245
4246	if (jfreefrag->fr_jsegdep) {
4247		free_jsegdep(jfreefrag->fr_jsegdep);
4248		jfreefrag->fr_jsegdep = NULL;
4249	}
4250	freefrag = jfreefrag->fr_freefrag;
4251	jfreefrag->fr_freefrag = NULL;
4252	free_jfreefrag(jfreefrag);
4253	freefrag->ff_state |= DEPCOMPLETE;
4254	CTR1(KTR_SUJ, "cancel_jfreefrag: blkno %jd", freefrag->ff_blkno);
4255}
4256
4257/*
4258 * Free a jfreefrag when the parent freefrag is rendered obsolete.
4259 */
4260static void
4261free_jfreefrag(jfreefrag)
4262	struct jfreefrag *jfreefrag;
4263{
4264
4265	if (jfreefrag->fr_state & INPROGRESS)
4266		WORKLIST_REMOVE(&jfreefrag->fr_list);
4267	else if (jfreefrag->fr_state & ONWORKLIST)
4268		remove_from_journal(&jfreefrag->fr_list);
4269	if (jfreefrag->fr_freefrag != NULL)
4270		panic("free_jfreefrag:  Still attached to a freefrag.");
4271	WORKITEM_FREE(jfreefrag, D_JFREEFRAG);
4272}
4273
4274/*
4275 * Called when the journal write for a jfreefrag completes.  The parent
4276 * freefrag is added to the worklist if this completes its dependencies.
4277 */
4278static void
4279handle_written_jfreefrag(jfreefrag)
4280	struct jfreefrag *jfreefrag;
4281{
4282	struct jsegdep *jsegdep;
4283	struct freefrag *freefrag;
4284
4285	/* Grab the jsegdep. */
4286	jsegdep = jfreefrag->fr_jsegdep;
4287	jfreefrag->fr_jsegdep = NULL;
4288	freefrag = jfreefrag->fr_freefrag;
4289	if (freefrag == NULL)
4290		panic("handle_written_jfreefrag: No freefrag.");
4291	freefrag->ff_state |= DEPCOMPLETE;
4292	freefrag->ff_jdep = NULL;
4293	jwork_insert(&freefrag->ff_jwork, jsegdep);
4294	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
4295		add_to_worklist(&freefrag->ff_list, 0);
4296	jfreefrag->fr_freefrag = NULL;
4297	free_jfreefrag(jfreefrag);
4298}
4299
4300/*
4301 * Called when the journal write for a jfreeblk completes.  The jfreeblk
4302 * is removed from the freeblks list of pending journal writes and the
4303 * jsegdep is moved to the freeblks jwork to be completed when all blocks
4304 * have been reclaimed.
4305 */
4306static void
4307handle_written_jblkdep(jblkdep)
4308	struct jblkdep *jblkdep;
4309{
4310	struct freeblks *freeblks;
4311	struct jsegdep *jsegdep;
4312
4313	/* Grab the jsegdep. */
4314	jsegdep = jblkdep->jb_jsegdep;
4315	jblkdep->jb_jsegdep = NULL;
4316	freeblks = jblkdep->jb_freeblks;
4317	LIST_REMOVE(jblkdep, jb_deps);
4318	jwork_insert(&freeblks->fb_jwork, jsegdep);
4319	/*
4320	 * If the freeblks is all journaled, we can add it to the worklist.
4321	 */
4322	if (LIST_EMPTY(&freeblks->fb_jblkdephd) &&
4323	    (freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
4324		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
4325
4326	free_jblkdep(jblkdep);
4327}
4328
4329static struct jsegdep *
4330newjsegdep(struct worklist *wk)
4331{
4332	struct jsegdep *jsegdep;
4333
4334	jsegdep = malloc(sizeof(*jsegdep), M_JSEGDEP, M_SOFTDEP_FLAGS);
4335	workitem_alloc(&jsegdep->jd_list, D_JSEGDEP, wk->wk_mp);
4336	jsegdep->jd_seg = NULL;
4337
4338	return (jsegdep);
4339}
4340
4341static struct jmvref *
4342newjmvref(dp, ino, oldoff, newoff)
4343	struct inode *dp;
4344	ino_t ino;
4345	off_t oldoff;
4346	off_t newoff;
4347{
4348	struct jmvref *jmvref;
4349
4350	jmvref = malloc(sizeof(*jmvref), M_JMVREF, M_SOFTDEP_FLAGS);
4351	workitem_alloc(&jmvref->jm_list, D_JMVREF, ITOVFS(dp));
4352	jmvref->jm_list.wk_state = ATTACHED | DEPCOMPLETE;
4353	jmvref->jm_parent = dp->i_number;
4354	jmvref->jm_ino = ino;
4355	jmvref->jm_oldoff = oldoff;
4356	jmvref->jm_newoff = newoff;
4357
4358	return (jmvref);
4359}
4360
4361/*
4362 * Allocate a new jremref that tracks the removal of ip from dp with the
4363 * directory entry offset of diroff.  Mark the entry as ATTACHED and
4364 * DEPCOMPLETE as we have all the information required for the journal write
4365 * and the directory has already been removed from the buffer.  The caller
4366 * is responsible for linking the jremref into the pagedep and adding it
4367 * to the journal to write.  The MKDIR_PARENT flag is set if we're doing
4368 * a DOTDOT addition so handle_workitem_remove() can properly assign
4369 * the jsegdep when we're done.
4370 */
4371static struct jremref *
4372newjremref(struct dirrem *dirrem, struct inode *dp, struct inode *ip,
4373    off_t diroff, nlink_t nlink)
4374{
4375	struct jremref *jremref;
4376
4377	jremref = malloc(sizeof(*jremref), M_JREMREF, M_SOFTDEP_FLAGS);
4378	workitem_alloc(&jremref->jr_list, D_JREMREF, ITOVFS(dp));
4379	jremref->jr_state = ATTACHED;
4380	newinoref(&jremref->jr_ref, ip->i_number, dp->i_number, diroff,
4381	   nlink, ip->i_mode);
4382	jremref->jr_dirrem = dirrem;
4383
4384	return (jremref);
4385}
4386
4387static inline void
4388newinoref(struct inoref *inoref, ino_t ino, ino_t parent, off_t diroff,
4389    nlink_t nlink, uint16_t mode)
4390{
4391
4392	inoref->if_jsegdep = newjsegdep(&inoref->if_list);
4393	inoref->if_diroff = diroff;
4394	inoref->if_ino = ino;
4395	inoref->if_parent = parent;
4396	inoref->if_nlink = nlink;
4397	inoref->if_mode = mode;
4398}
4399
4400/*
4401 * Allocate a new jaddref to track the addition of ino to dp at diroff.  The
4402 * directory offset may not be known until later.  The caller is responsible
4403 * for adding the entry to the journal when this information is available.
4404 * nlink should be the link count prior to the addition and mode is only
4405 * required to have the correct FMT.
4406 */
4407static struct jaddref *
4408newjaddref(struct inode *dp, ino_t ino, off_t diroff, int16_t nlink,
4409    uint16_t mode)
4410{
4411	struct jaddref *jaddref;
4412
4413	jaddref = malloc(sizeof(*jaddref), M_JADDREF, M_SOFTDEP_FLAGS);
4414	workitem_alloc(&jaddref->ja_list, D_JADDREF, ITOVFS(dp));
4415	jaddref->ja_state = ATTACHED;
4416	jaddref->ja_mkdir = NULL;
4417	newinoref(&jaddref->ja_ref, ino, dp->i_number, diroff, nlink, mode);
4418
4419	return (jaddref);
4420}
4421
4422/*
4423 * Create a new free dependency for a freework.  The caller is responsible
4424 * for adjusting the reference count when it has the lock held.  The freedep
4425 * will track an outstanding bitmap write that will ultimately allow the
4426 * freework to continue.
4427 */
4428static struct freedep *
4429newfreedep(struct freework *freework)
4430{
4431	struct freedep *freedep;
4432
4433	freedep = malloc(sizeof(*freedep), M_FREEDEP, M_SOFTDEP_FLAGS);
4434	workitem_alloc(&freedep->fd_list, D_FREEDEP, freework->fw_list.wk_mp);
4435	freedep->fd_freework = freework;
4436
4437	return (freedep);
4438}
4439
4440/*
4441 * Free a freedep structure once the buffer it is linked to is written.  If
4442 * this is the last reference to the freework schedule it for completion.
4443 */
4444static void
4445free_freedep(freedep)
4446	struct freedep *freedep;
4447{
4448	struct freework *freework;
4449
4450	freework = freedep->fd_freework;
4451	freework->fw_freeblks->fb_cgwait--;
4452	if (--freework->fw_ref == 0)
4453		freework_enqueue(freework);
4454	WORKITEM_FREE(freedep, D_FREEDEP);
4455}
4456
4457/*
4458 * Allocate a new freework structure that may be a level in an indirect
4459 * chain when parent is not NULL, or a top-level block when it is.  The top
4460 * level freework structures are allocated without the per-filesystem lock held
4461 * and before the freeblks is visible outside of softdep_setup_freeblocks().
4462 */
4463static struct freework *
4464newfreework(ump, freeblks, parent, lbn, nb, frags, off, journal)
4465	struct ufsmount *ump;
4466	struct freeblks *freeblks;
4467	struct freework *parent;
4468	ufs_lbn_t lbn;
4469	ufs2_daddr_t nb;
4470	int frags;
4471	int off;
4472	int journal;
4473{
4474	struct freework *freework;
4475
4476	freework = malloc(sizeof(*freework), M_FREEWORK, M_SOFTDEP_FLAGS);
4477	workitem_alloc(&freework->fw_list, D_FREEWORK, freeblks->fb_list.wk_mp);
4478	freework->fw_state = ATTACHED;
4479	freework->fw_jnewblk = NULL;
4480	freework->fw_freeblks = freeblks;
4481	freework->fw_parent = parent;
4482	freework->fw_lbn = lbn;
4483	freework->fw_blkno = nb;
4484	freework->fw_frags = frags;
4485	freework->fw_indir = NULL;
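	/*
	 * For journaled mounts an indirect block (lbn below -UFS_NXADDR)
	 * gets one reference per indirect pointer plus a self reference,
	 * presumably so the freework persists until every child block has
	 * been dealt with; direct and external attribute blocks start at 0.
	 */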
4486	freework->fw_ref = (MOUNTEDSUJ(UFSTOVFS(ump)) == 0 ||
4487	    lbn >= -UFS_NXADDR) ? 0 : NINDIR(ump->um_fs) + 1;
4488	freework->fw_start = freework->fw_off = off;
4489	if (journal)
4490		newjfreeblk(freeblks, lbn, nb, frags);
4491	if (parent == NULL) {
4492		ACQUIRE_LOCK(ump);
4493		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
4494		freeblks->fb_ref++;
4495		FREE_LOCK(ump);
4496	}
4497
4498	return (freework);
4499}
4500
4501/*
4502 * Eliminate a jfreeblk for a block that does not need journaling.
4503 */
4504static void
4505cancel_jfreeblk(freeblks, blkno)
4506	struct freeblks *freeblks;
4507	ufs2_daddr_t blkno;
4508{
4509	struct jfreeblk *jfreeblk;
4510	struct jblkdep *jblkdep;
4511
4512	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps) {
4513		if (jblkdep->jb_list.wk_type != D_JFREEBLK)
4514			continue;
4515		jfreeblk = WK_JFREEBLK(&jblkdep->jb_list);
4516		if (jfreeblk->jf_blkno == blkno)
4517			break;
4518	}
4519	if (jblkdep == NULL)
4520		return;
4521	CTR1(KTR_SUJ, "cancel_jfreeblk: blkno %jd", blkno);
4522	free_jsegdep(jblkdep->jb_jsegdep);
4523	LIST_REMOVE(jblkdep, jb_deps);
4524	WORKITEM_FREE(jfreeblk, D_JFREEBLK);
4525}
4526
4527/*
4528 * Allocate a new jfreeblk to journal top level block pointer when truncating
4529 * a file.  The caller must add this to the worklist when the per-filesystem
4530 * lock is held.
4531 */
4532static struct jfreeblk *
4533newjfreeblk(freeblks, lbn, blkno, frags)
4534	struct freeblks *freeblks;
4535	ufs_lbn_t lbn;
4536	ufs2_daddr_t blkno;
4537	int frags;
4538{
4539	struct jfreeblk *jfreeblk;
4540
4541	jfreeblk = malloc(sizeof(*jfreeblk), M_JFREEBLK, M_SOFTDEP_FLAGS);
4542	workitem_alloc(&jfreeblk->jf_dep.jb_list, D_JFREEBLK,
4543	    freeblks->fb_list.wk_mp);
4544	jfreeblk->jf_dep.jb_jsegdep = newjsegdep(&jfreeblk->jf_dep.jb_list);
4545	jfreeblk->jf_dep.jb_freeblks = freeblks;
4546	jfreeblk->jf_ino = freeblks->fb_inum;
4547	jfreeblk->jf_lbn = lbn;
4548	jfreeblk->jf_blkno = blkno;
4549	jfreeblk->jf_frags = frags;
4550	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jfreeblk->jf_dep, jb_deps);
4551
4552	return (jfreeblk);
4553}
4554
4555/*
4556 * The journal is only prepared to handle full-size block numbers, so we
4557 * have to adjust the record to reflect the change to a full-size block.
4558 * For example, suppose we have a block made up of fragments 8-15 and
4559 * want to free its last two fragments. We are given a request that says:
4560 *     FREEBLK ino=5, blkno=14, lbn=0, frags=2, oldfrags=0
4561 * where frags are the number of fragments to free and oldfrags are the
4562 * number of fragments to keep. To block align it, we have to change it to
4563 * have a valid full-size blkno, so it becomes:
4564 *     FREEBLK ino=5, blkno=8, lbn=0, frags=2, oldfrags=6
4565 */
4566static void
4567adjust_newfreework(freeblks, frag_offset)
4568	struct freeblks *freeblks;
4569	int frag_offset;
4570{
4571	struct jfreeblk *jfreeblk;
4572
4573	KASSERT((LIST_FIRST(&freeblks->fb_jblkdephd) != NULL &&
4574	    LIST_FIRST(&freeblks->fb_jblkdephd)->jb_list.wk_type == D_JFREEBLK),
4575	    ("adjust_newfreework: Missing freeblks dependency"));
4576
4577	jfreeblk = WK_JFREEBLK(LIST_FIRST(&freeblks->fb_jblkdephd));
4578	jfreeblk->jf_blkno -= frag_offset;
4579	jfreeblk->jf_frags += frag_offset;
4580}
4581
4582/*
4583 * Allocate a new jtrunc to track a partial truncation.
4584 */
4585static struct jtrunc *
4586newjtrunc(freeblks, size, extsize)
4587	struct freeblks *freeblks;
4588	off_t size;
4589	int extsize;
4590{
4591	struct jtrunc *jtrunc;
4592
4593	jtrunc = malloc(sizeof(*jtrunc), M_JTRUNC, M_SOFTDEP_FLAGS);
4594	workitem_alloc(&jtrunc->jt_dep.jb_list, D_JTRUNC,
4595	    freeblks->fb_list.wk_mp);
4596	jtrunc->jt_dep.jb_jsegdep = newjsegdep(&jtrunc->jt_dep.jb_list);
4597	jtrunc->jt_dep.jb_freeblks = freeblks;
4598	jtrunc->jt_ino = freeblks->fb_inum;
4599	jtrunc->jt_size = size;
4600	jtrunc->jt_extsize = extsize;
4601	LIST_INSERT_HEAD(&freeblks->fb_jblkdephd, &jtrunc->jt_dep, jb_deps);
4602
4603	return (jtrunc);
4604}
4605
4606/*
4607 * If we're canceling a new bitmap we have to search for another ref
4608 * to move into the bmsafemap dep.  This might be better expressed
4609 * with another structure.
4610 */
4611static void
4612move_newblock_dep(jaddref, inodedep)
4613	struct jaddref *jaddref;
4614	struct inodedep *inodedep;
4615{
4616	struct inoref *inoref;
4617	struct jaddref *jaddrefn;
4618
4619	jaddrefn = NULL;
4620	for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4621	    inoref = TAILQ_NEXT(inoref, if_deps)) {
4622		if ((jaddref->ja_state & NEWBLOCK) &&
4623		    inoref->if_list.wk_type == D_JADDREF) {
4624			jaddrefn = (struct jaddref *)inoref;
4625			break;
4626		}
4627	}
4628	if (jaddrefn == NULL)
4629		return;
4630	jaddrefn->ja_state &= ~(ATTACHED | UNDONE);
4631	jaddrefn->ja_state |= jaddref->ja_state &
4632	    (ATTACHED | UNDONE | NEWBLOCK);
4633	jaddref->ja_state &= ~(ATTACHED | UNDONE | NEWBLOCK);
4634	jaddref->ja_state |= ATTACHED;
4635	LIST_REMOVE(jaddref, ja_bmdeps);
4636	LIST_INSERT_HEAD(&inodedep->id_bmsafemap->sm_jaddrefhd, jaddrefn,
4637	    ja_bmdeps);
4638}
4639
4640/*
4641 * Cancel a jaddref either before it has been written or while it is being
4642 * written.  This happens when a link is removed before the add reaches
4643 * the disk.  The jaddref dependency is kept linked into the bmsafemap
4644 * and inode to prevent the link count or bitmap from reaching the disk
4645 * until handle_workitem_remove() re-adjusts the counts and bitmaps as
4646 * required.
4647 *
4648 * Returns 1 if the canceled addref requires journaling of the remove and
4649 * 0 otherwise.
4650 */
4651static int
4652cancel_jaddref(jaddref, inodedep, wkhd)
4653	struct jaddref *jaddref;
4654	struct inodedep *inodedep;
4655	struct workhead *wkhd;
4656{
4657	struct inoref *inoref;
4658	struct jsegdep *jsegdep;
4659	int needsj;
4660
4661	KASSERT((jaddref->ja_state & COMPLETE) == 0,
4662	    ("cancel_jaddref: Canceling complete jaddref"));
4663	if (jaddref->ja_state & (INPROGRESS | COMPLETE))
4664		needsj = 1;
4665	else
4666		needsj = 0;
4667	if (inodedep == NULL)
4668		if (inodedep_lookup(jaddref->ja_list.wk_mp, jaddref->ja_ino,
4669		    0, &inodedep) == 0)
4670			panic("cancel_jaddref: Lost inodedep");
4671	/*
4672	 * We must adjust the nlink of any reference operation that follows
4673	 * us so that it is consistent with the in-memory reference.  This
4674	 * ensures that inode nlink rollbacks always have the correct link.
4675	 */
4676	if (needsj == 0) {
4677		for (inoref = TAILQ_NEXT(&jaddref->ja_ref, if_deps); inoref;
4678		    inoref = TAILQ_NEXT(inoref, if_deps)) {
4679			if (inoref->if_state & GOINGAWAY)
4680				break;
4681			inoref->if_nlink--;
4682		}
4683	}
4684	jsegdep = inoref_jseg(&jaddref->ja_ref);
4685	if (jaddref->ja_state & NEWBLOCK)
4686		move_newblock_dep(jaddref, inodedep);
4687	wake_worklist(&jaddref->ja_list);
4688	jaddref->ja_mkdir = NULL;
4689	if (jaddref->ja_state & INPROGRESS) {
4690		jaddref->ja_state &= ~INPROGRESS;
4691		WORKLIST_REMOVE(&jaddref->ja_list);
4692		jwork_insert(wkhd, jsegdep);
4693	} else {
4694		free_jsegdep(jsegdep);
4695		if (jaddref->ja_state & DEPCOMPLETE)
4696			remove_from_journal(&jaddref->ja_list);
4697	}
4698	jaddref->ja_state |= (GOINGAWAY | DEPCOMPLETE);
4699	/*
4700	 * Leave NEWBLOCK jaddrefs on the inodedep so handle_workitem_remove
4701	 * can arrange for them to be freed with the bitmap.  Otherwise we
4702	 * no longer need this addref attached to the inoreflst and it
4703	 * will incorrectly adjust nlink if we leave it.
4704	 */
4705	if ((jaddref->ja_state & NEWBLOCK) == 0) {
4706		TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
4707		    if_deps);
4708		jaddref->ja_state |= COMPLETE;
4709		free_jaddref(jaddref);
4710		return (needsj);
4711	}
4712	/*
4713	 * Leave the head of the list for jsegdeps so they can be merged quickly.
4714	 */
4715	if (LIST_FIRST(wkhd) != NULL) {
4716		jaddref->ja_state |= ONWORKLIST;
4717		LIST_INSERT_AFTER(LIST_FIRST(wkhd), &jaddref->ja_list, wk_list);
4718	} else
4719		WORKLIST_INSERT(wkhd, &jaddref->ja_list);
4720
4721	return (needsj);
4722}
4723
4724/*
4725 * Attempt to free a jaddref structure when some work completes.  This
4726 * should only succeed once the entry is written and all dependencies have
4727 * been notified.
4728 */
4729static void
4730free_jaddref(jaddref)
4731	struct jaddref *jaddref;
4732{
4733
4734	if ((jaddref->ja_state & ALLCOMPLETE) != ALLCOMPLETE)
4735		return;
4736	if (jaddref->ja_ref.if_jsegdep)
4737		panic("free_jaddref: segdep attached to jaddref %p(0x%X)\n",
4738		    jaddref, jaddref->ja_state);
4739	if (jaddref->ja_state & NEWBLOCK)
4740		LIST_REMOVE(jaddref, ja_bmdeps);
4741	if (jaddref->ja_state & (INPROGRESS | ONWORKLIST))
4742		panic("free_jaddref: Bad state %p(0x%X)",
4743		    jaddref, jaddref->ja_state);
4744	if (jaddref->ja_mkdir != NULL)
4745		panic("free_jaddref: Work pending, 0x%X\n", jaddref->ja_state);
4746	WORKITEM_FREE(jaddref, D_JADDREF);
4747}
4748
4749/*
4750 * Free a jremref structure once it has been written or discarded.
4751 */
4752static void
4753free_jremref(jremref)
4754	struct jremref *jremref;
4755{
4756
4757	if (jremref->jr_ref.if_jsegdep)
4758		free_jsegdep(jremref->jr_ref.if_jsegdep);
4759	if (jremref->jr_state & INPROGRESS)
4760		panic("free_jremref: IO still pending");
4761	WORKITEM_FREE(jremref, D_JREMREF);
4762}
4763
4764/*
4765 * Free a jnewblk structure.
4766 */
4767static void
4768free_jnewblk(jnewblk)
4769	struct jnewblk *jnewblk;
4770{
4771
4772	if ((jnewblk->jn_state & ALLCOMPLETE) != ALLCOMPLETE)
4773		return;
4774	LIST_REMOVE(jnewblk, jn_deps);
4775	if (jnewblk->jn_dep != NULL)
4776		panic("free_jnewblk: Dependency still attached.");
4777	WORKITEM_FREE(jnewblk, D_JNEWBLK);
4778}
4779
4780/*
4781 * Cancel a jnewblk which has been made redundant by frag extension.
4782 */
4783static void
4784cancel_jnewblk(jnewblk, wkhd)
4785	struct jnewblk *jnewblk;
4786	struct workhead *wkhd;
4787{
4788	struct jsegdep *jsegdep;
4789
4790	CTR1(KTR_SUJ, "cancel_jnewblk: blkno %jd", jnewblk->jn_blkno);
4791	jsegdep = jnewblk->jn_jsegdep;
4792	if (jnewblk->jn_jsegdep == NULL || jnewblk->jn_dep == NULL)
4793		panic("cancel_jnewblk: Invalid state");
4794	jnewblk->jn_jsegdep = NULL;
4795	jnewblk->jn_dep = NULL;
4796	jnewblk->jn_state |= GOINGAWAY;
4797	if (jnewblk->jn_state & INPROGRESS) {
4798		jnewblk->jn_state &= ~INPROGRESS;
4799		WORKLIST_REMOVE(&jnewblk->jn_list);
4800		jwork_insert(wkhd, jsegdep);
4801	} else {
4802		free_jsegdep(jsegdep);
4803		remove_from_journal(&jnewblk->jn_list);
4804	}
4805	wake_worklist(&jnewblk->jn_list);
4806	WORKLIST_INSERT(wkhd, &jnewblk->jn_list);
4807}
4808
4809static void
4810free_jblkdep(jblkdep)
4811	struct jblkdep *jblkdep;
4812{
4813
4814	if (jblkdep->jb_list.wk_type == D_JFREEBLK)
4815		WORKITEM_FREE(jblkdep, D_JFREEBLK);
4816	else if (jblkdep->jb_list.wk_type == D_JTRUNC)
4817		WORKITEM_FREE(jblkdep, D_JTRUNC);
4818	else
4819		panic("free_jblkdep: Unexpected type %s",
4820		    TYPENAME(jblkdep->jb_list.wk_type));
4821}
4822
4823/*
4824 * Free a single jseg once it is no longer referenced in memory or on
4825 * disk.  Reclaim journal blocks and dependencies waiting for the segment
4826 * to disappear.
4827 */
4828static void
4829free_jseg(jseg, jblocks)
4830	struct jseg *jseg;
4831	struct jblocks *jblocks;
4832{
4833	struct freework *freework;
4834
4835	/*
4836	 * Free freework structures that were lingering to indicate freed
4837	 * indirect blocks that forced journal write ordering on reallocate.
4838	 */
4839	while ((freework = LIST_FIRST(&jseg->js_indirs)) != NULL)
4840		indirblk_remove(freework);
4841	if (jblocks->jb_oldestseg == jseg)
4842		jblocks->jb_oldestseg = TAILQ_NEXT(jseg, js_next);
4843	TAILQ_REMOVE(&jblocks->jb_segs, jseg, js_next);
4844	jblocks_free(jblocks, jseg->js_list.wk_mp, jseg->js_size);
4845	KASSERT(LIST_EMPTY(&jseg->js_entries),
4846	    ("free_jseg: Freed jseg has valid entries."));
4847	WORKITEM_FREE(jseg, D_JSEG);
4848}
4849
4850/*
4851 * Free all jsegs that meet the criteria for being reclaimed and update
4852 * oldestseg.
4853 */
4854static void
4855free_jsegs(jblocks)
4856	struct jblocks *jblocks;
4857{
4858	struct jseg *jseg;
4859
4860	/*
4861	 * Free only those jsegs which have no jsegs still allocated before
4862	 * them, to preserve the journal space ordering.
4863	 */
4864	while ((jseg = TAILQ_FIRST(&jblocks->jb_segs)) != NULL) {
4865		/*
4866		 * Only reclaim space when nothing depends on this journal
4867		 * set and another set has written that it is no longer
4868		 * valid.
4869		 */
4870		if (jseg->js_refs != 0) {
4871			jblocks->jb_oldestseg = jseg;
4872			return;
4873		}
4874		if ((jseg->js_state & ALLCOMPLETE) != ALLCOMPLETE)
4875			break;
4876		if (jseg->js_seq > jblocks->jb_oldestwrseq)
4877			break;
4878		/*
4879		 * We can free jsegs that didn't write entries when
4880		 * oldestwrseq == js_seq.
4881		 */
4882		if (jseg->js_seq == jblocks->jb_oldestwrseq &&
4883		    jseg->js_cnt != 0)
4884			break;
4885		free_jseg(jseg, jblocks);
4886	}
4887	/*
4888	 * If we exited the loop above we still must discover the
4889	 * oldest valid segment.
4890	 */
4891	if (jseg)
4892		for (jseg = jblocks->jb_oldestseg; jseg != NULL;
4893		     jseg = TAILQ_NEXT(jseg, js_next))
4894			if (jseg->js_refs != 0)
4895				break;
4896	jblocks->jb_oldestseg = jseg;
4897	/*
4898	 * The journal has no valid records but some jsegs may still be
4899	 * waiting on oldestwrseq to advance.  We force a small record
4900	 * out to permit these lingering records to be reclaimed.
4901	 */
4902	if (jblocks->jb_oldestseg == NULL && !TAILQ_EMPTY(&jblocks->jb_segs))
4903		jblocks->jb_needseg = 1;
4904}
4905
4906/*
4907 * Release one reference to a jseg and free it if the count reaches 0.  This
4908 * should eventually reclaim journal space as well.
4909 */
4910static void
4911rele_jseg(jseg)
4912	struct jseg *jseg;
4913{
4914
4915	KASSERT(jseg->js_refs > 0,
4916	    ("free_jseg: Invalid refcnt %d", jseg->js_refs));
4917	if (--jseg->js_refs != 0)
4918		return;
4919	free_jsegs(jseg->js_jblocks);
4920}
4921
4922/*
4923 * Release a jsegdep and decrement the jseg count.
4924 */
4925static void
4926free_jsegdep(jsegdep)
4927	struct jsegdep *jsegdep;
4928{
4929
4930	if (jsegdep->jd_seg)
4931		rele_jseg(jsegdep->jd_seg);
4932	WORKITEM_FREE(jsegdep, D_JSEGDEP);
4933}
4934
4935/*
4936 * Wait for a journal item to make it to disk.  Initiate journal processing
4937 * if required.
4938 */
4939static int
4940jwait(wk, waitfor)
4941	struct worklist *wk;
4942	int waitfor;
4943{
4944
4945	LOCK_OWNED(VFSTOUFS(wk->wk_mp));
4946	/*
4947	 * Blocking journal waits cause slow synchronous behavior.  Record
4948	 * stats on the frequency of these blocking operations.
4949	 */
4950	if (waitfor == MNT_WAIT) {
4951		stat_journal_wait++;
4952		switch (wk->wk_type) {
4953		case D_JREMREF:
4954		case D_JMVREF:
4955			stat_jwait_filepage++;
4956			break;
4957		case D_JTRUNC:
4958		case D_JFREEBLK:
4959			stat_jwait_freeblks++;
4960			break;
4961		case D_JNEWBLK:
4962			stat_jwait_newblk++;
4963			break;
4964		case D_JADDREF:
4965			stat_jwait_inode++;
4966			break;
4967		default:
4968			break;
4969		}
4970	}
4971	/*
4972	 * If IO has not started we process the journal.  We can't mark the
4973	 * worklist item as IOWAITING because we drop the lock while
4974	 * processing the journal and the worklist entry may be freed after
4975	 * this point.  The caller may call back in and re-issue the request.
4976	 */
4977	if ((wk->wk_state & INPROGRESS) == 0) {
4978		softdep_process_journal(wk->wk_mp, wk, waitfor);
4979		if (waitfor != MNT_WAIT)
4980			return (EBUSY);
4981		return (0);
4982	}
4983	if (waitfor != MNT_WAIT)
4984		return (EBUSY);
4985	wait_worklist(wk, "jwait");
4986	return (0);
4987}
4988
4989/*
4990 * Lookup an inodedep based on an inode pointer and set the nlinkdelta as
4991 * appropriate.  This is a convenience function to reduce duplicate code
4992 * for the setup and revert functions below.
4993 */
4994static struct inodedep *
4995inodedep_lookup_ip(ip)
4996	struct inode *ip;
4997{
4998	struct inodedep *inodedep;
4999
5000	KASSERT(ip->i_nlink >= ip->i_effnlink,
5001	    ("inodedep_lookup_ip: bad delta"));
5002	(void) inodedep_lookup(ITOVFS(ip), ip->i_number, DEPALLOC,
5003	    &inodedep);
5004	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
5005	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
5006
5007	return (inodedep);
5008}
5009
5010/*
5011 * Called prior to creating a new inode and linking it to a directory.  The
5012 * jaddref structure must already be allocated by softdep_setup_inomapdep
5013 * and it is discovered here so we can initialize the mode and update
5014 * nlinkdelta.
5015 */
5016void
5017softdep_setup_create(dp, ip)
5018	struct inode *dp;
5019	struct inode *ip;
5020{
5021	struct inodedep *inodedep;
5022	struct jaddref *jaddref;
5023	struct vnode *dvp;
5024
5025	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5026	    ("softdep_setup_create called on non-softdep filesystem"));
5027	KASSERT(ip->i_nlink == 1,
5028	    ("softdep_setup_create: Invalid link count."));
5029	dvp = ITOV(dp);
5030	ACQUIRE_LOCK(ITOUMP(dp));
5031	inodedep = inodedep_lookup_ip(ip);
5032	if (DOINGSUJ(dvp)) {
5033		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5034		    inoreflst);
5035		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
5036		    ("softdep_setup_create: No addref structure present."));
5037	}
5038	FREE_LOCK(ITOUMP(dp));
5039}
5040
5041/*
5042 * Create a jaddref structure to track the addition of a DOTDOT link when
5043 * we are reparenting an inode as part of a rename.  This jaddref will be
5044 * found by softdep_setup_directory_change.  Adjusts nlinkdelta for
5045 * non-journaling softdep.
5046 */
5047void
5048softdep_setup_dotdot_link(dp, ip)
5049	struct inode *dp;
5050	struct inode *ip;
5051{
5052	struct inodedep *inodedep;
5053	struct jaddref *jaddref;
5054	struct vnode *dvp;
5055
5056	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5057	    ("softdep_setup_dotdot_link called on non-softdep filesystem"));
5058	dvp = ITOV(dp);
5059	jaddref = NULL;
5060	/*
5061	 * We don't set MKDIR_PARENT as this is not tied to a mkdir and
5062	 * is used as a normal link would be.
5063	 */
5064	if (DOINGSUJ(dvp))
5065		jaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
5066		    dp->i_effnlink - 1, dp->i_mode);
5067	ACQUIRE_LOCK(ITOUMP(dp));
5068	inodedep = inodedep_lookup_ip(dp);
5069	if (jaddref)
5070		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5071		    if_deps);
5072	FREE_LOCK(ITOUMP(dp));
5073}
5074
5075/*
5076 * Create a jaddref structure to track a new link to an inode.  The directory
5077 * offset is not known until softdep_setup_directory_add or
5078 * softdep_setup_directory_change.  Adjusts nlinkdelta for non-journaling
5079 * softdep.
5080 */
5081void
5082softdep_setup_link(dp, ip)
5083	struct inode *dp;
5084	struct inode *ip;
5085{
5086	struct inodedep *inodedep;
5087	struct jaddref *jaddref;
5088	struct vnode *dvp;
5089
5090	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5091	    ("softdep_setup_link called on non-softdep filesystem"));
5092	dvp = ITOV(dp);
5093	jaddref = NULL;
5094	if (DOINGSUJ(dvp))
5095		jaddref = newjaddref(dp, ip->i_number, 0, ip->i_effnlink - 1,
5096		    ip->i_mode);
5097	ACQUIRE_LOCK(ITOUMP(dp));
5098	inodedep = inodedep_lookup_ip(ip);
5099	if (jaddref)
5100		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5101		    if_deps);
5102	FREE_LOCK(ITOUMP(dp));
5103}
5104
5105/*
5106 * Called to create the jaddref structures to track . and .. references as
5107 * well as lookup and further initialize the incomplete jaddref created
5108 * by softdep_setup_inomapdep when the inode was allocated.  Adjusts
5109 * nlinkdelta for non-journaling softdep.
5110 */
5111void
5112softdep_setup_mkdir(dp, ip)
5113	struct inode *dp;
5114	struct inode *ip;
5115{
5116	struct inodedep *inodedep;
5117	struct jaddref *dotdotaddref;
5118	struct jaddref *dotaddref;
5119	struct jaddref *jaddref;
5120	struct vnode *dvp;
5121
5122	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5123	    ("softdep_setup_mkdir called on non-softdep filesystem"));
5124	dvp = ITOV(dp);
5125	dotaddref = dotdotaddref = NULL;
5126	if (DOINGSUJ(dvp)) {
5127		dotaddref = newjaddref(ip, ip->i_number, DOT_OFFSET, 1,
5128		    ip->i_mode);
5129		dotaddref->ja_state |= MKDIR_BODY;
5130		dotdotaddref = newjaddref(ip, dp->i_number, DOTDOT_OFFSET,
5131		    dp->i_effnlink - 1, dp->i_mode);
5132		dotdotaddref->ja_state |= MKDIR_PARENT;
5133	}
5134	ACQUIRE_LOCK(ITOUMP(dp));
5135	inodedep = inodedep_lookup_ip(ip);
5136	if (DOINGSUJ(dvp)) {
5137		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5138		    inoreflst);
5139		KASSERT(jaddref != NULL,
5140		    ("softdep_setup_mkdir: No addref structure present."));
5141		KASSERT(jaddref->ja_parent == dp->i_number,
5142		    ("softdep_setup_mkdir: bad parent %ju",
5143		    (uintmax_t)jaddref->ja_parent));
5144		TAILQ_INSERT_BEFORE(&jaddref->ja_ref, &dotaddref->ja_ref,
5145		    if_deps);
5146	}
5147	inodedep = inodedep_lookup_ip(dp);
5148	if (DOINGSUJ(dvp))
5149		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst,
5150		    &dotdotaddref->ja_ref, if_deps);
5151	FREE_LOCK(ITOUMP(dp));
5152}
5153
5154/*
5155 * Called to track nlinkdelta of the inode and parent directories prior to
5156 * unlinking a directory.
5157 */
5158void
5159softdep_setup_rmdir(dp, ip)
5160	struct inode *dp;
5161	struct inode *ip;
5162{
5163	struct vnode *dvp;
5164
5165	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5166	    ("softdep_setup_rmdir called on non-softdep filesystem"));
5167	dvp = ITOV(dp);
5168	ACQUIRE_LOCK(ITOUMP(dp));
5169	(void) inodedep_lookup_ip(ip);
5170	(void) inodedep_lookup_ip(dp);
5171	FREE_LOCK(ITOUMP(dp));
5172}
5173
5174/*
5175 * Called to track nlinkdelta of the inode and parent directories prior to
5176 * unlink.
5177 */
5178void
5179softdep_setup_unlink(dp, ip)
5180	struct inode *dp;
5181	struct inode *ip;
5182{
5183	struct vnode *dvp;
5184
5185	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5186	    ("softdep_setup_unlink called on non-softdep filesystem"));
5187	dvp = ITOV(dp);
5188	ACQUIRE_LOCK(ITOUMP(dp));
5189	(void) inodedep_lookup_ip(ip);
5190	(void) inodedep_lookup_ip(dp);
5191	FREE_LOCK(ITOUMP(dp));
5192}
5193
5194/*
5195 * Called to release the journal structures created by a failed non-directory
5196 * creation.  Adjusts nlinkdelta for non-journaling softdep.
5197 */
5198void
5199softdep_revert_create(dp, ip)
5200	struct inode *dp;
5201	struct inode *ip;
5202{
5203	struct inodedep *inodedep;
5204	struct jaddref *jaddref;
5205	struct vnode *dvp;
5206
5207	KASSERT(MOUNTEDSOFTDEP(ITOVFS((dp))) != 0,
5208	    ("softdep_revert_create called on non-softdep filesystem"));
5209	dvp = ITOV(dp);
5210	ACQUIRE_LOCK(ITOUMP(dp));
5211	inodedep = inodedep_lookup_ip(ip);
5212	if (DOINGSUJ(dvp)) {
5213		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5214		    inoreflst);
5215		KASSERT(jaddref->ja_parent == dp->i_number,
5216		    ("softdep_revert_create: addref parent mismatch"));
5217		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
5218	}
5219	FREE_LOCK(ITOUMP(dp));
5220}
5221
5222/*
5223 * Called to release the journal structures created by a failed link
5224 * addition.  Adjusts nlinkdelta for non-journaling softdep.
5225 */
5226void
5227softdep_revert_link(dp, ip)
5228	struct inode *dp;
5229	struct inode *ip;
5230{
5231	struct inodedep *inodedep;
5232	struct jaddref *jaddref;
5233	struct vnode *dvp;
5234
5235	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5236	    ("softdep_revert_link called on non-softdep filesystem"));
5237	dvp = ITOV(dp);
5238	ACQUIRE_LOCK(ITOUMP(dp));
5239	inodedep = inodedep_lookup_ip(ip);
5240	if (DOINGSUJ(dvp)) {
5241		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5242		    inoreflst);
5243		KASSERT(jaddref->ja_parent == dp->i_number,
5244		    ("softdep_revert_link: addref parent mismatch"));
5245		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
5246	}
5247	FREE_LOCK(ITOUMP(dp));
5248}
5249
5250/*
5251 * Called to release the journal structures created by a failed mkdir
5252 * attempt.  Adjusts nlinkdelta for non-journaling softdep.
5253 */
5254void
5255softdep_revert_mkdir(dp, ip)
5256	struct inode *dp;
5257	struct inode *ip;
5258{
5259	struct inodedep *inodedep;
5260	struct jaddref *jaddref;
5261	struct jaddref *dotaddref;
5262	struct vnode *dvp;
5263
5264	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5265	    ("softdep_revert_mkdir called on non-softdep filesystem"));
5266	dvp = ITOV(dp);
5267
5268	ACQUIRE_LOCK(ITOUMP(dp));
5269	inodedep = inodedep_lookup_ip(dp);
5270	if (DOINGSUJ(dvp)) {
5271		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5272		    inoreflst);
5273		KASSERT(jaddref->ja_parent == ip->i_number,
5274		    ("softdep_revert_mkdir: dotdot addref parent mismatch"));
5275		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
5276	}
5277	inodedep = inodedep_lookup_ip(ip);
5278	if (DOINGSUJ(dvp)) {
5279		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
5280		    inoreflst);
5281		KASSERT(jaddref->ja_parent == dp->i_number,
5282		    ("softdep_revert_mkdir: addref parent mismatch"));
5283		dotaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
5284		    inoreflst, if_deps);
5285		cancel_jaddref(jaddref, inodedep, &inodedep->id_inowait);
5286		KASSERT(dotaddref->ja_parent == ip->i_number,
5287		    ("softdep_revert_mkdir: dot addref parent mismatch"));
5288		cancel_jaddref(dotaddref, inodedep, &inodedep->id_inowait);
5289	}
5290	FREE_LOCK(ITOUMP(dp));
5291}
5292
5293/*
5294 * Called to correct nlinkdelta after a failed rmdir.
5295 */
5296void
5297softdep_revert_rmdir(dp, ip)
5298	struct inode *dp;
5299	struct inode *ip;
5300{
5301
5302	KASSERT(MOUNTEDSOFTDEP(ITOVFS(dp)) != 0,
5303	    ("softdep_revert_rmdir called on non-softdep filesystem"));
5304	ACQUIRE_LOCK(ITOUMP(dp));
5305	(void) inodedep_lookup_ip(ip);
5306	(void) inodedep_lookup_ip(dp);
5307	FREE_LOCK(ITOUMP(dp));
5308}
5309
5310/*
5311 * Protecting the freemaps (or bitmaps).
5312 *
5313 * To eliminate the need to execute fsck before mounting a filesystem
5314 * after a power failure, one must (conservatively) guarantee that the
5315 * on-disk copy of the bitmaps never indicate that a live inode or block is
5316 * free.  So, when a block or inode is allocated, the bitmap should be
5317 * updated (on disk) before any new pointers.  When a block or inode is
5318 * freed, the bitmap should not be updated until all pointers have been
5319 * reset.  The latter dependency is handled by the delayed de-allocation
5320 * approach described below for block and inode de-allocation.  The former
5321 * dependency is handled by calling the following procedure when a block or
5322 * inode is allocated. When an inode is allocated an "inodedep" is created
5323 * with its DEPCOMPLETE flag cleared until its bitmap is written to disk.
5324 * Each "inodedep" is also inserted into the hash indexing structure so
5325 * that any additional link additions can be made dependent on the inode
5326 * allocation.
5327 *
5328 * The ufs filesystem maintains a number of free block counts (e.g., per
5329 * cylinder group, per cylinder and per <cylinder, rotational position> pair)
5330 * in addition to the bitmaps.  These counts are used to improve efficiency
5331 * during allocation and therefore must be consistent with the bitmaps.
5332 * There is no convenient way to guarantee post-crash consistency of these
5333 * counts with simple update ordering, for two main reasons: (1) The counts
5334 * and bitmaps for a single cylinder group block are not in the same disk
5335 * sector.  If a disk write is interrupted (e.g., by power failure), one may
5336 * be written and the other not.  (2) Some of the counts are located in the
5337 * superblock rather than the cylinder group block. So, we focus our soft
5338 * updates implementation on protecting the bitmaps. When mounting a
5339 * filesystem, we recompute the auxiliary counts from the bitmaps.
5340 */
5341
5342/*
5343 * Called just after updating the cylinder group block to allocate an inode.
5344 */
5345void
5346softdep_setup_inomapdep(bp, ip, newinum, mode)
5347	struct buf *bp;		/* buffer for cylgroup block with inode map */
5348	struct inode *ip;	/* inode related to allocation */
5349	ino_t newinum;		/* new inode number being allocated */
5350	int mode;
5351{
5352	struct inodedep *inodedep;
5353	struct bmsafemap *bmsafemap;
5354	struct jaddref *jaddref;
5355	struct mount *mp;
5356	struct fs *fs;
5357
5358	mp = ITOVFS(ip);
5359	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5360	    ("softdep_setup_inomapdep called on non-softdep filesystem"));
5361	fs = VFSTOUFS(mp)->um_fs;
5362	jaddref = NULL;
5363
5364	/*
5365	 * Allocate the journal reference add structure so that the bitmap
5366	 * can be dependent on it.
5367	 */
5368	if (MOUNTEDSUJ(mp)) {
5369		jaddref = newjaddref(ip, newinum, 0, 0, mode);
5370		jaddref->ja_state |= NEWBLOCK;
5371	}
5372
5373	/*
5374	 * Create a dependency for the newly allocated inode.
5375	 * Panic if it already exists as something is seriously wrong.
5376	 * Otherwise add it to the dependency list for the buffer holding
5377	 * the cylinder group map from which it was allocated.
5378	 *
5379	 * We have to preallocate a bmsafemap entry in case it is needed
5380	 * in bmsafemap_lookup since once we allocate the inodedep, we
5381	 * have to finish initializing it before we can FREE_LOCK().
5382	 * By preallocating, we avoid FREE_LOCK() while doing a malloc
5383	 * in bmsafemap_lookup. We cannot call bmsafemap_lookup before
5384	 * creating the inodedep as it can be freed during the time
5385	 * that we FREE_LOCK() while allocating the inodedep. We must
5386	 * call workitem_alloc() before entering the locked section as
5387	 * it also acquires the lock and we must avoid trying to do so
5388	 * recursively.
5389	 */
5390	bmsafemap = malloc(sizeof(struct bmsafemap),
5391	    M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5392	workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5393	ACQUIRE_LOCK(ITOUMP(ip));
5394	if ((inodedep_lookup(mp, newinum, DEPALLOC, &inodedep)))
5395		panic("softdep_setup_inomapdep: dependency %p for new "
5396		    "inode already exists", inodedep);
5397	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum), bmsafemap);
5398	if (jaddref) {
5399		LIST_INSERT_HEAD(&bmsafemap->sm_jaddrefhd, jaddref, ja_bmdeps);
5400		TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jaddref->ja_ref,
5401		    if_deps);
5402	} else {
5403		inodedep->id_state |= ONDEPLIST;
5404		LIST_INSERT_HEAD(&bmsafemap->sm_inodedephd, inodedep, id_deps);
5405	}
5406	inodedep->id_bmsafemap = bmsafemap;
5407	inodedep->id_state &= ~DEPCOMPLETE;
5408	FREE_LOCK(ITOUMP(ip));
5409}
5410
5411/*
5412 * Called just after updating the cylinder group block to
5413 * allocate block or fragment.
5414 */
5415void
5416softdep_setup_blkmapdep(bp, mp, newblkno, frags, oldfrags)
5417	struct buf *bp;		/* buffer for cylgroup block with block map */
5418	struct mount *mp;	/* filesystem doing allocation */
5419	ufs2_daddr_t newblkno;	/* number of newly allocated block */
5420	int frags;		/* Number of fragments. */
5421	int oldfrags;		/* Previous number of fragments for extend. */
5422{
5423	struct newblk *newblk;
5424	struct bmsafemap *bmsafemap;
5425	struct jnewblk *jnewblk;
5426	struct ufsmount *ump;
5427	struct fs *fs;
5428
5429	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5430	    ("softdep_setup_blkmapdep called on non-softdep filesystem"));
5431	ump = VFSTOUFS(mp);
5432	fs = ump->um_fs;
5433	jnewblk = NULL;
5434	/*
5435	 * Create a dependency for the newly allocated block.
5436	 * Add it to the dependency list for the buffer holding
5437	 * the cylinder group map from which it was allocated.
5438	 */
5439	if (MOUNTEDSUJ(mp)) {
5440		jnewblk = malloc(sizeof(*jnewblk), M_JNEWBLK, M_SOFTDEP_FLAGS);
5441		workitem_alloc(&jnewblk->jn_list, D_JNEWBLK, mp);
5442		jnewblk->jn_jsegdep = newjsegdep(&jnewblk->jn_list);
5443		jnewblk->jn_state = ATTACHED;
5444		jnewblk->jn_blkno = newblkno;
5445		jnewblk->jn_frags = frags;
5446		jnewblk->jn_oldfrags = oldfrags;
5447#ifdef INVARIANTS
5448		{
5449			struct cg *cgp;
5450			uint8_t *blksfree;
5451			long bno;
5452			int i;
5453
5454			cgp = (struct cg *)bp->b_data;
5455			blksfree = cg_blksfree(cgp);
5456			bno = dtogd(fs, jnewblk->jn_blkno);
5457			for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags;
5458			    i++) {
5459				if (isset(blksfree, bno + i))
5460					panic("softdep_setup_blkmapdep: "
5461					    "free fragment %d from %d-%d "
5462					    "state 0x%X dep %p", i,
5463					    jnewblk->jn_oldfrags,
5464					    jnewblk->jn_frags,
5465					    jnewblk->jn_state,
5466					    jnewblk->jn_dep);
5467			}
5468		}
5469#endif
5470	}
5471
5472	CTR3(KTR_SUJ,
5473	    "softdep_setup_blkmapdep: blkno %jd frags %d oldfrags %d",
5474	    newblkno, frags, oldfrags);
5475	ACQUIRE_LOCK(ump);
5476	if (newblk_lookup(mp, newblkno, DEPALLOC, &newblk) != 0)
5477		panic("softdep_setup_blkmapdep: found block");
5478	newblk->nb_bmsafemap = bmsafemap = bmsafemap_lookup(mp, bp,
5479	    dtog(fs, newblkno), NULL);
5480	if (jnewblk) {
5481		jnewblk->jn_dep = (struct worklist *)newblk;
5482		LIST_INSERT_HEAD(&bmsafemap->sm_jnewblkhd, jnewblk, jn_deps);
5483	} else {
5484		newblk->nb_state |= ONDEPLIST;
5485		LIST_INSERT_HEAD(&bmsafemap->sm_newblkhd, newblk, nb_deps);
5486	}
5487	newblk->nb_bmsafemap = bmsafemap;
5488	newblk->nb_jnewblk = jnewblk;
5489	FREE_LOCK(ump);
5490}
5491
5492#define	BMSAFEMAP_HASH(ump, cg) \
5493      (&(ump)->bmsafemap_hashtbl[(cg) & (ump)->bmsafemap_hash_size])
5494
5495static int
5496bmsafemap_find(bmsafemaphd, cg, bmsafemapp)
5497	struct bmsafemap_hashhead *bmsafemaphd;
5498	int cg;
5499	struct bmsafemap **bmsafemapp;
5500{
5501	struct bmsafemap *bmsafemap;
5502
5503	LIST_FOREACH(bmsafemap, bmsafemaphd, sm_hash)
5504		if (bmsafemap->sm_cg == cg)
5505			break;
5506	if (bmsafemap) {
5507		*bmsafemapp = bmsafemap;
5508		return (1);
5509	}
5510	*bmsafemapp = NULL;
5511
5512	return (0);
5513}
5514
5515/*
5516 * Find the bmsafemap associated with a cylinder group buffer.
5517 * If none exists, create one. The buffer must be locked when
5518 * this routine is called and this routine must be called with
5519 * the softdep lock held. To avoid giving up the lock while
5520 * allocating a new bmsafemap, a preallocated bmsafemap may be
5521 * provided. If it is provided but not needed, it is freed.
5522 */
5523static struct bmsafemap *
5524bmsafemap_lookup(mp, bp, cg, newbmsafemap)
5525	struct mount *mp;
5526	struct buf *bp;
5527	int cg;
5528	struct bmsafemap *newbmsafemap;
5529{
5530	struct bmsafemap_hashhead *bmsafemaphd;
5531	struct bmsafemap *bmsafemap, *collision;
5532	struct worklist *wk;
5533	struct ufsmount *ump;
5534
5535	ump = VFSTOUFS(mp);
5536	LOCK_OWNED(ump);
5537	KASSERT(bp != NULL, ("bmsafemap_lookup: missing buffer"));
5538	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
5539		if (wk->wk_type == D_BMSAFEMAP) {
5540			if (newbmsafemap)
5541				WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5542			return (WK_BMSAFEMAP(wk));
5543		}
5544	}
5545	bmsafemaphd = BMSAFEMAP_HASH(ump, cg);
5546	if (bmsafemap_find(bmsafemaphd, cg, &bmsafemap) == 1) {
5547		if (newbmsafemap)
5548			WORKITEM_FREE(newbmsafemap, D_BMSAFEMAP);
5549		return (bmsafemap);
5550	}
5551	if (newbmsafemap) {
5552		bmsafemap = newbmsafemap;
5553	} else {
5554		FREE_LOCK(ump);
5555		bmsafemap = malloc(sizeof(struct bmsafemap),
5556			M_BMSAFEMAP, M_SOFTDEP_FLAGS);
5557		workitem_alloc(&bmsafemap->sm_list, D_BMSAFEMAP, mp);
5558		ACQUIRE_LOCK(ump);
5559	}
5560	bmsafemap->sm_buf = bp;
5561	LIST_INIT(&bmsafemap->sm_inodedephd);
5562	LIST_INIT(&bmsafemap->sm_inodedepwr);
5563	LIST_INIT(&bmsafemap->sm_newblkhd);
5564	LIST_INIT(&bmsafemap->sm_newblkwr);
5565	LIST_INIT(&bmsafemap->sm_jaddrefhd);
5566	LIST_INIT(&bmsafemap->sm_jnewblkhd);
5567	LIST_INIT(&bmsafemap->sm_freehd);
5568	LIST_INIT(&bmsafemap->sm_freewr);
5569	if (bmsafemap_find(bmsafemaphd, cg, &collision) == 1) {
5570		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
5571		return (collision);
5572	}
5573	bmsafemap->sm_cg = cg;
5574	LIST_INSERT_HEAD(bmsafemaphd, bmsafemap, sm_hash);
5575	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
5576	WORKLIST_INSERT(&bp->b_dep, &bmsafemap->sm_list);
5577	return (bmsafemap);
5578}
5579
5580/*
5581 * Direct block allocation dependencies.
5582 *
5583 * When a new block is allocated, the corresponding disk locations must be
5584 * initialized (with zeros or new data) before the on-disk inode points to
5585 * them.  Also, the freemap from which the block was allocated must be
5586 * updated (on disk) before the inode's pointer. These two dependencies are
5587 * independent of each other and are needed for all file blocks and indirect
5588 * blocks that are pointed to directly by the inode.  Just before the
5589 * "in-core" version of the inode is updated with a newly allocated block
5590 * number, a procedure (below) is called to setup allocation dependency
5591 * structures.  These structures are removed when the corresponding
5592 * dependencies are satisfied or when the block allocation becomes obsolete
5593 * (i.e., the file is deleted, the block is de-allocated, or the block is a
5594 * fragment that gets upgraded).  All of these cases are handled in
5595 * procedures described later.
5596 *
5597 * When a file extension causes a fragment to be upgraded, either to a larger
5598 * fragment or to a full block, the on-disk location may change (if the
5599 * previous fragment could not simply be extended). In this case, the old
5600 * fragment must be de-allocated, but not until after the inode's pointer has
5601 * been updated. In most cases, this is handled by later procedures, which
5602 * will construct a "freefrag" structure to be added to the workitem queue
5603 * when the inode update is complete (or obsolete).  The main exception to
5604 * this is when an allocation occurs while a pending allocation dependency
5605 * (for the same block pointer) remains.  This case is handled in the main
5606 * allocation dependency setup procedure by immediately freeing the
5607 * unreferenced fragments.
5608 */
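
/*
 * To illustrate the expected ordering at the call site (a simplified,
 * hedged sketch of the block-allocation path, not the literal
 * ffs_balloc() code), the caller allocates the block, sets up the
 * dependency on the buffer, and only then updates the in-core inode
 * pointer:
 *
 *	error = ffs_alloc(ip, lbn, pref, fs->fs_bsize, flags, cred, &newb);
 *	if (error)
 *		return (error);
 *	bp = getblk(vp, lbn, fs->fs_bsize, 0, 0, gbflags);
 *	bp->b_blkno = fsbtodb(fs, newb);
 *	if (DOINGSOFTDEP(vp))
 *		softdep_setup_allocdirect(ip, lbn, newb, 0,
 *		    fs->fs_bsize, 0, bp);
 *	dp->di_db[lbn] = newb;		<- in-core pointer updated last
 *	UFS_INODE_SET_FLAG(ip, IN_CHANGE | IN_UPDATE);
 *
 * The freemap side of the ordering is established separately by
 * softdep_setup_blkmapdep() above, which the cylinder-group allocator
 * invokes when it marks the block in use.
 */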
5609void
5610softdep_setup_allocdirect(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
5611	struct inode *ip;	/* inode to which block is being added */
5612	ufs_lbn_t off;		/* block pointer within inode */
5613	ufs2_daddr_t newblkno;	/* disk block number being added */
5614	ufs2_daddr_t oldblkno;	/* previous block number, 0 unless frag */
5615	long newsize;		/* size of new block */
5616	long oldsize;		/* size of old block */
5617	struct buf *bp;		/* bp for allocated block */
5618{
5619	struct allocdirect *adp, *oldadp;
5620	struct allocdirectlst *adphead;
5621	struct freefrag *freefrag;
5622	struct inodedep *inodedep;
5623	struct pagedep *pagedep;
5624	struct jnewblk *jnewblk;
5625	struct newblk *newblk;
5626	struct mount *mp;
5627	ufs_lbn_t lbn;
5628
5629	lbn = bp->b_lblkno;
5630	mp = ITOVFS(ip);
5631	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
5632	    ("softdep_setup_allocdirect called on non-softdep filesystem"));
5633	if (oldblkno && oldblkno != newblkno)
5634		/*
5635		 * The usual case is that a smaller fragment that
5636		 * was just allocated has been replaced with a bigger
5637		 * fragment or a full-size block. If it is marked as
5638		 * B_DELWRI, the current contents have not been written
5639		 * to disk. It is possible that the block was written
5640		 * earlier, but very uncommon. If the block has never
5641		 * been written, there is no need to send a BIO_DELETE
5642		 * for it when it is freed. The gain from avoiding the
5643		 * TRIMs for the common case of unwritten blocks far
5644		 * exceeds the cost of the write amplification for the
5645		 * uncommon case of failing to send a TRIM for a block
5646		 * that had been written.
5647		 */
5648		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
5649		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
5650	else
5651		freefrag = NULL;
5652
5653	CTR6(KTR_SUJ,
5654	    "softdep_setup_allocdirect: ino %d blkno %jd oldblkno %jd "
5655	    "off %jd newsize %ld oldsize %ld",
5656	    ip->i_number, newblkno, oldblkno, off, newsize, oldsize);
5657	ACQUIRE_LOCK(ITOUMP(ip));
5658	if (off >= UFS_NDADDR) {
5659		if (lbn > 0)
5660			panic("softdep_setup_allocdirect: bad lbn %jd, off %jd",
5661			    lbn, off);
5662		/* allocating an indirect block */
5663		if (oldblkno != 0)
5664			panic("softdep_setup_allocdirect: non-zero indir");
5665	} else {
5666		if (off != lbn)
5667			panic("softdep_setup_allocdirect: lbn %jd != off %jd",
5668			    lbn, off);
5669		/*
5670		 * Allocating a direct block.
5671		 *
5672		 * If we are allocating a directory block, then we must
5673		 * allocate an associated pagedep to track additions and
5674		 * deletions.
5675		 */
5676		if ((ip->i_mode & IFMT) == IFDIR)
5677			pagedep_lookup(mp, bp, ip->i_number, off, DEPALLOC,
5678			    &pagedep);
5679	}
5680	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
5681		panic("softdep_setup_allocdirect: lost block");
5682	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
5683	    ("softdep_setup_allocdirect: newblk already initialized"));
5684	/*
5685	 * Convert the newblk to an allocdirect.
5686	 */
5687	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
5688	adp = (struct allocdirect *)newblk;
5689	newblk->nb_freefrag = freefrag;
5690	adp->ad_offset = off;
5691	adp->ad_oldblkno = oldblkno;
5692	adp->ad_newsize = newsize;
5693	adp->ad_oldsize = oldsize;
5694
5695	/*
5696	 * Finish initializing the journal.
5697	 */
5698	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
5699		jnewblk->jn_ino = ip->i_number;
5700		jnewblk->jn_lbn = lbn;
5701		add_to_journal(&jnewblk->jn_list);
5702	}
5703	if (freefrag && freefrag->ff_jdep != NULL &&
5704	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
5705		add_to_journal(freefrag->ff_jdep);
5706	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
5707	adp->ad_inodedep = inodedep;
5708
5709	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
5710	/*
5711	 * The list of allocdirects must be kept sorted in ascending
5712	 * order so that the rollback routines can quickly determine the
5713	 * first uncommitted block (the size of the file stored on disk
5714	 * ends at the end of the lowest committed fragment, or if there
5715	 * are no fragments, at the end of the highest committed block).
5716	 * Since files generally grow, the typical case is that the new
5717	 * block is to be added at the end of the list. We speed this
5718	 * special case by checking against the last allocdirect in the
5719	 * list before laboriously traversing the list looking for the
5720	 * insertion point.
5721	 */
5722	adphead = &inodedep->id_newinoupdt;
5723	oldadp = TAILQ_LAST(adphead, allocdirectlst);
5724	if (oldadp == NULL || oldadp->ad_offset <= off) {
5725		/* insert at end of list */
5726		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
5727		if (oldadp != NULL && oldadp->ad_offset == off)
5728			allocdirect_merge(adphead, adp, oldadp);
5729		FREE_LOCK(ITOUMP(ip));
5730		return;
5731	}
5732	TAILQ_FOREACH(oldadp, adphead, ad_next) {
5733		if (oldadp->ad_offset >= off)
5734			break;
5735	}
5736	if (oldadp == NULL)
5737		panic("softdep_setup_allocdirect: lost entry");
5738	/* insert in middle of list */
5739	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
5740	if (oldadp->ad_offset == off)
5741		allocdirect_merge(adphead, adp, oldadp);
5742
5743	FREE_LOCK(ITOUMP(ip));
5744}
5745
5746/*
5747 * Merge a newer and older journal record to be stored either in a
5748 * newblock or freefrag.  This handles aggregating journal records for
5749 * fragment allocation into a second record as well as replacing a
5750 * journal free with an aborted journal allocation.  A segment for the
5751 * oldest record will be placed on wkhd if it has been written.  If not
5752 * the segment for the newer record will suffice.
5753 */
5754static struct worklist *
5755jnewblk_merge(new, old, wkhd)
5756	struct worklist *new;
5757	struct worklist *old;
5758	struct workhead *wkhd;
5759{
5760	struct jnewblk *njnewblk;
5761	struct jnewblk *jnewblk;
5762
5763	/* Handle NULLs to simplify callers. */
5764	if (new == NULL)
5765		return (old);
5766	if (old == NULL)
5767		return (new);
5768	/* Replace a jfreefrag with a jnewblk. */
5769	if (new->wk_type == D_JFREEFRAG) {
5770		if (WK_JNEWBLK(old)->jn_blkno != WK_JFREEFRAG(new)->fr_blkno)
5771			panic("jnewblk_merge: blkno mismatch: %p, %p",
5772			    old, new);
5773		cancel_jfreefrag(WK_JFREEFRAG(new));
5774		return (old);
5775	}
5776	if (old->wk_type != D_JNEWBLK || new->wk_type != D_JNEWBLK)
5777		panic("jnewblk_merge: Bad type: old %d new %d\n",
5778		    old->wk_type, new->wk_type);
5779	/*
5780	 * Handle merging of two jnewblk records that describe
5781	 * different sets of fragments in the same block.
5782	 */
5783	jnewblk = WK_JNEWBLK(old);
5784	njnewblk = WK_JNEWBLK(new);
5785	if (jnewblk->jn_blkno != njnewblk->jn_blkno)
5786		panic("jnewblk_merge: Merging disparate blocks.");
5787	/*
5788	 * The record may be rolled back in the cg.
5789	 */
5790	if (jnewblk->jn_state & UNDONE) {
5791		jnewblk->jn_state &= ~UNDONE;
5792		njnewblk->jn_state |= UNDONE;
5793		njnewblk->jn_state &= ~ATTACHED;
5794	}
5795	/*
5796	 * We modify the newer jnewblk and free the older so that if neither
5797	 * has been written the most up-to-date copy will be on disk.  If
5798	 * both have been written but rolled back we only temporarily need
5799	 * one of them to fix the bits when the cg write completes.
5800	 */
5801	jnewblk->jn_state |= ATTACHED | COMPLETE;
5802	njnewblk->jn_oldfrags = jnewblk->jn_oldfrags;
5803	cancel_jnewblk(jnewblk, wkhd);
5804	WORKLIST_REMOVE(&jnewblk->jn_list);
5805	free_jnewblk(jnewblk);
5806	return (new);
5807}
5808
5809/*
5810 * Replace an old allocdirect dependency with a newer one.
5811 */
5812static void
5813allocdirect_merge(adphead, newadp, oldadp)
5814	struct allocdirectlst *adphead;	/* head of list holding allocdirects */
5815	struct allocdirect *newadp;	/* allocdirect being added */
5816	struct allocdirect *oldadp;	/* existing allocdirect being checked */
5817{
5818	struct worklist *wk;
5819	struct freefrag *freefrag;
5820
5821	freefrag = NULL;
5822	LOCK_OWNED(VFSTOUFS(newadp->ad_list.wk_mp));
5823	if (newadp->ad_oldblkno != oldadp->ad_newblkno ||
5824	    newadp->ad_oldsize != oldadp->ad_newsize ||
5825	    newadp->ad_offset >= UFS_NDADDR)
5826		panic("%s %jd != new %jd || old size %ld != new %ld",
5827		    "allocdirect_merge: old blkno",
5828		    (intmax_t)newadp->ad_oldblkno,
5829		    (intmax_t)oldadp->ad_newblkno,
5830		    newadp->ad_oldsize, oldadp->ad_newsize);
5831	newadp->ad_oldblkno = oldadp->ad_oldblkno;
5832	newadp->ad_oldsize = oldadp->ad_oldsize;
5833	/*
5834	 * If the old dependency had a fragment to free or had never
5835	 * previously had a block allocated, then the new dependency
5836	 * can immediately post its freefrag and adopt the old freefrag.
5837	 * This action is done by swapping the freefrag dependencies.
5838	 * The new dependency gains the old one's freefrag, and the
5839	 * old one gets the new one and then immediately puts it on
5840	 * the worklist when it is freed by free_newblk. It is
5841	 * not possible to do this swap when the old dependency had a
5842	 * non-zero size but no previous fragment to free. This condition
5843	 * arises when the new block is an extension of the old block.
5844	 * Here, the first part of the fragment allocated to the new
5845	 * dependency is part of the block currently claimed on disk by
5846	 * the old dependency, so cannot legitimately be freed until the
5847	 * conditions for the new dependency are fulfilled.
5848	 */
5849	freefrag = newadp->ad_freefrag;
5850	if (oldadp->ad_freefrag != NULL || oldadp->ad_oldblkno == 0) {
5851		newadp->ad_freefrag = oldadp->ad_freefrag;
5852		oldadp->ad_freefrag = freefrag;
5853	}
5854	/*
5855	 * If we are tracking a new directory-block allocation,
5856	 * move it from the old allocdirect to the new allocdirect.
5857	 */
5858	if ((wk = LIST_FIRST(&oldadp->ad_newdirblk)) != NULL) {
5859		WORKLIST_REMOVE(wk);
5860		if (!LIST_EMPTY(&oldadp->ad_newdirblk))
5861			panic("allocdirect_merge: extra newdirblk");
5862		WORKLIST_INSERT(&newadp->ad_newdirblk, wk);
5863	}
5864	TAILQ_REMOVE(adphead, oldadp, ad_next);
5865	/*
5866	 * We need to move any journal dependencies over to the freefrag
5867	 * that releases this block if it exists.  Otherwise we are
5868	 * extending an existing block and we'll wait until that is
5869	 * complete to release the journal space and extend the
5870	 * new journal to cover this old space as well.
5871	 */
5872	if (freefrag == NULL) {
5873		if (oldadp->ad_newblkno != newadp->ad_newblkno)
5874			panic("allocdirect_merge: %jd != %jd",
5875			    oldadp->ad_newblkno, newadp->ad_newblkno);
5876		newadp->ad_block.nb_jnewblk = (struct jnewblk *)
5877		    jnewblk_merge(&newadp->ad_block.nb_jnewblk->jn_list,
5878		    &oldadp->ad_block.nb_jnewblk->jn_list,
5879		    &newadp->ad_block.nb_jwork);
5880		oldadp->ad_block.nb_jnewblk = NULL;
5881		cancel_newblk(&oldadp->ad_block, NULL,
5882		    &newadp->ad_block.nb_jwork);
5883	} else {
5884		wk = (struct worklist *) cancel_newblk(&oldadp->ad_block,
5885		    &freefrag->ff_list, &freefrag->ff_jwork);
5886		freefrag->ff_jdep = jnewblk_merge(freefrag->ff_jdep, wk,
5887		    &freefrag->ff_jwork);
5888	}
5889	free_newblk(&oldadp->ad_block);
5890}
5891
5892/*
5893 * Allocate a jfreefrag structure to journal a single block free.
5894 */
5895static struct jfreefrag *
5896newjfreefrag(freefrag, ip, blkno, size, lbn)
5897	struct freefrag *freefrag;
5898	struct inode *ip;
5899	ufs2_daddr_t blkno;
5900	long size;
5901	ufs_lbn_t lbn;
5902{
5903	struct jfreefrag *jfreefrag;
5904	struct fs *fs;
5905
5906	fs = ITOFS(ip);
5907	jfreefrag = malloc(sizeof(struct jfreefrag), M_JFREEFRAG,
5908	    M_SOFTDEP_FLAGS);
5909	workitem_alloc(&jfreefrag->fr_list, D_JFREEFRAG, ITOVFS(ip));
5910	jfreefrag->fr_jsegdep = newjsegdep(&jfreefrag->fr_list);
5911	jfreefrag->fr_state = ATTACHED | DEPCOMPLETE;
5912	jfreefrag->fr_ino = ip->i_number;
5913	jfreefrag->fr_lbn = lbn;
5914	jfreefrag->fr_blkno = blkno;
5915	jfreefrag->fr_frags = numfrags(fs, size);
5916	jfreefrag->fr_freefrag = freefrag;
5917
5918	return (jfreefrag);
5919}
5920
5921/*
5922 * Allocate a new freefrag structure.
5923 */
5924static struct freefrag *
5925newfreefrag(ip, blkno, size, lbn, key)
5926	struct inode *ip;
5927	ufs2_daddr_t blkno;
5928	long size;
5929	ufs_lbn_t lbn;
5930	u_long key;
5931{
5932	struct freefrag *freefrag;
5933	struct ufsmount *ump;
5934	struct fs *fs;
5935
5936	CTR4(KTR_SUJ, "newfreefrag: ino %d blkno %jd size %ld lbn %jd",
5937	    ip->i_number, blkno, size, lbn);
5938	ump = ITOUMP(ip);
5939	fs = ump->um_fs;
5940	if (fragnum(fs, blkno) + numfrags(fs, size) > fs->fs_frag)
5941		panic("newfreefrag: frag size");
5942	freefrag = malloc(sizeof(struct freefrag),
5943	    M_FREEFRAG, M_SOFTDEP_FLAGS);
5944	workitem_alloc(&freefrag->ff_list, D_FREEFRAG, UFSTOVFS(ump));
5945	freefrag->ff_state = ATTACHED;
5946	LIST_INIT(&freefrag->ff_jwork);
5947	freefrag->ff_inum = ip->i_number;
5948	freefrag->ff_vtype = ITOV(ip)->v_type;
5949	freefrag->ff_blkno = blkno;
5950	freefrag->ff_fragsize = size;
5951	freefrag->ff_key = key;
5952
5953	if (MOUNTEDSUJ(UFSTOVFS(ump))) {
5954		freefrag->ff_jdep = (struct worklist *)
5955		    newjfreefrag(freefrag, ip, blkno, size, lbn);
5956	} else {
5957		freefrag->ff_state |= DEPCOMPLETE;
5958		freefrag->ff_jdep = NULL;
5959	}
5960
5961	return (freefrag);
5962}
5963
5964/*
5965 * This workitem de-allocates fragments that were replaced during
5966 * file block allocation.
5967 */
5968static void
5969handle_workitem_freefrag(freefrag)
5970	struct freefrag *freefrag;
5971{
5972	struct ufsmount *ump = VFSTOUFS(freefrag->ff_list.wk_mp);
5973	struct workhead wkhd;
5974
5975	CTR3(KTR_SUJ,
5976	    "handle_workitem_freefrag: ino %d blkno %jd size %ld",
5977	    freefrag->ff_inum, freefrag->ff_blkno, freefrag->ff_fragsize);
5978	/*
5979	 * It would be illegal to add new completion items to the
5980	 * freefrag after it was scheduled to be done, so it must be
5981	 * safe to modify the list head here.
5982	 */
5983	LIST_INIT(&wkhd);
5984	ACQUIRE_LOCK(ump);
5985	LIST_SWAP(&freefrag->ff_jwork, &wkhd, worklist, wk_list);
5986	/*
5987	 * If the journal has not been written we must cancel it here.
5988	 */
5989	if (freefrag->ff_jdep) {
5990		if (freefrag->ff_jdep->wk_type != D_JNEWBLK)
5991			panic("handle_workitem_freefrag: Unexpected type %d\n",
5992			    freefrag->ff_jdep->wk_type);
5993		cancel_jnewblk(WK_JNEWBLK(freefrag->ff_jdep), &wkhd);
5994	}
5995	FREE_LOCK(ump);
5996	ffs_blkfree(ump, ump->um_fs, ump->um_devvp, freefrag->ff_blkno,
5997	   freefrag->ff_fragsize, freefrag->ff_inum, freefrag->ff_vtype,
5998	   &wkhd, freefrag->ff_key);
5999	ACQUIRE_LOCK(ump);
6000	WORKITEM_FREE(freefrag, D_FREEFRAG);
6001	FREE_LOCK(ump);
6002}
6003
6004/*
6005 * Set up a dependency structure for an external attributes data block.
6006 * This routine follows much of the structure of softdep_setup_allocdirect.
6007 * See the description of softdep_setup_allocdirect above for details.
6008 */
6009void
6010softdep_setup_allocext(ip, off, newblkno, oldblkno, newsize, oldsize, bp)
6011	struct inode *ip;
6012	ufs_lbn_t off;
6013	ufs2_daddr_t newblkno;
6014	ufs2_daddr_t oldblkno;
6015	long newsize;
6016	long oldsize;
6017	struct buf *bp;
6018{
6019	struct allocdirect *adp, *oldadp;
6020	struct allocdirectlst *adphead;
6021	struct freefrag *freefrag;
6022	struct inodedep *inodedep;
6023	struct jnewblk *jnewblk;
6024	struct newblk *newblk;
6025	struct mount *mp;
6026	struct ufsmount *ump;
6027	ufs_lbn_t lbn;
6028
6029	mp = ITOVFS(ip);
6030	ump = VFSTOUFS(mp);
6031	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6032	    ("softdep_setup_allocext called on non-softdep filesystem"));
6033	KASSERT(off < UFS_NXADDR,
6034	    ("softdep_setup_allocext: lbn %lld >= UFS_NXADDR", (long long)off));
6035
6036	lbn = bp->b_lblkno;
6037	if (oldblkno && oldblkno != newblkno)
6038		/*
6039		 * The usual case is that a smaller fragment that
6040		 * was just allocated has been replaced with a bigger
6041		 * fragment or a full-size block. If it is marked as
6042		 * B_DELWRI, the current contents have not been written
6043		 * to disk. It is possible that the block was written
6044		 * earlier, but very uncommon. If the block has never
6045		 * been written, there is no need to send a BIO_DELETE
6046		 * for it when it is freed. The gain from avoiding the
6047		 * TRIMs for the common case of unwritten blocks far
6048		 * exceeds the cost of the write amplification for the
6049		 * uncommon case of failing to send a TRIM for a block
6050		 * that had been written.
6051		 */
6052		freefrag = newfreefrag(ip, oldblkno, oldsize, lbn,
6053		    (bp->b_flags & B_DELWRI) != 0 ? NOTRIM_KEY : SINGLETON_KEY);
6054	else
6055		freefrag = NULL;
6056
6057	ACQUIRE_LOCK(ump);
6058	if (newblk_lookup(mp, newblkno, 0, &newblk) == 0)
6059		panic("softdep_setup_allocext: lost block");
6060	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
6061	    ("softdep_setup_allocext: newblk already initialized"));
6062	/*
6063	 * Convert the newblk to an allocdirect.
6064	 */
6065	WORKITEM_REASSIGN(newblk, D_ALLOCDIRECT);
6066	adp = (struct allocdirect *)newblk;
6067	newblk->nb_freefrag = freefrag;
6068	adp->ad_offset = off;
6069	adp->ad_oldblkno = oldblkno;
6070	adp->ad_newsize = newsize;
6071	adp->ad_oldsize = oldsize;
6072	adp->ad_state |=  EXTDATA;
6073
6074	/*
6075	 * Finish initializing the journal.
6076	 */
6077	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
6078		jnewblk->jn_ino = ip->i_number;
6079		jnewblk->jn_lbn = lbn;
6080		add_to_journal(&jnewblk->jn_list);
6081	}
6082	if (freefrag && freefrag->ff_jdep != NULL &&
6083	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
6084		add_to_journal(freefrag->ff_jdep);
6085	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6086	adp->ad_inodedep = inodedep;
6087
6088	WORKLIST_INSERT(&bp->b_dep, &newblk->nb_list);
6089	/*
6090	 * The list of allocdirects must be kept sorted in ascending
6091	 * order so that the rollback routines can quickly determine the
6092	 * first uncommitted block (the size of the file stored on disk
6093	 * ends at the end of the lowest committed fragment, or if there
6094	 * are no fragments, at the end of the highest committed block).
6095	 * Since files generally grow, the typical case is that the new
6096	 * block is to be added at the end of the list. We speed this
6097	 * special case by checking against the last allocdirect in the
6098	 * list before laboriously traversing the list looking for the
6099	 * insertion point.
6100	 */
6101	adphead = &inodedep->id_newextupdt;
6102	oldadp = TAILQ_LAST(adphead, allocdirectlst);
6103	if (oldadp == NULL || oldadp->ad_offset <= off) {
6104		/* insert at end of list */
6105		TAILQ_INSERT_TAIL(adphead, adp, ad_next);
6106		if (oldadp != NULL && oldadp->ad_offset == off)
6107			allocdirect_merge(adphead, adp, oldadp);
6108		FREE_LOCK(ump);
6109		return;
6110	}
6111	TAILQ_FOREACH(oldadp, adphead, ad_next) {
6112		if (oldadp->ad_offset >= off)
6113			break;
6114	}
6115	if (oldadp == NULL)
6116		panic("softdep_setup_allocext: lost entry");
6117	/* insert in middle of list */
6118	TAILQ_INSERT_BEFORE(oldadp, adp, ad_next);
6119	if (oldadp->ad_offset == off)
6120		allocdirect_merge(adphead, adp, oldadp);
6121	FREE_LOCK(ump);
6122}
6123
6124/*
6125 * Indirect block allocation dependencies.
6126 *
6127 * The same dependencies that exist for a direct block also exist when
6128 * a new block is allocated and pointed to by an entry in a block of
6129 * indirect pointers. The undo/redo states described above are also
6130 * used here. Because an indirect block contains many pointers that
6131 * may have dependencies, a second copy of the entire in-memory indirect
6132 * block is kept. The buffer cache copy is always completely up-to-date.
6133 * The second copy, which is used only as a source for disk writes,
6134 * contains only the safe pointers (i.e., those that have no remaining
6135 * update dependencies). The second copy is freed when all pointers
6136 * are safe. The cache is not allowed to replace indirect blocks with
6137 * pending update dependencies. If a buffer containing an indirect
6138 * block with dependencies is written, these routines will mark it
6139 * dirty again. It can only be successfully written once all the
6140 * dependencies are removed. The ffs_fsync routine works in conjunction
6141 * with softdep_sync_metadata to get all the dependencies
6142 * removed so that a file can be successfully written to disk. Three
6143 * procedures are used when setting up indirect block pointer
6144 * dependencies. The division is necessary because of the organization
6145 * of the "balloc" routine and because of the distinction between file
6146 * pages and file metadata blocks.
6147 */
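
/*
 * To make the rollback concrete (a sketch of the idea only; the
 * details are handled by initiate_write_indirdep() and
 * handle_written_indirdep() later in this file): just before a buffer
 * holding an indirect block with outstanding dependencies is written,
 * its up-to-date pointers are set aside and replaced by the safe copy,
 * and once the write completes they are restored and the buffer is
 * redirtied:
 *
 *	// before the disk write
 *	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
 *	bcopy(indirdep->ir_savebp->b_data, bp->b_data, bp->b_bcount);
 *
 *	// after the write completes
 *	bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
 *	bdirty(bp);
 *
 * In this way the on-disk image never contains a pointer whose freemap
 * and data-initialization dependencies are still unsatisfied.
 */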
6148
6149/*
6150 * Allocate a new allocindir structure.
6151 */
6152static struct allocindir *
6153newallocindir(ip, ptrno, newblkno, oldblkno, lbn)
6154	struct inode *ip;	/* inode for file being extended */
6155	int ptrno;		/* offset of pointer in indirect block */
6156	ufs2_daddr_t newblkno;	/* disk block number being added */
6157	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
6158	ufs_lbn_t lbn;
6159{
6160	struct newblk *newblk;
6161	struct allocindir *aip;
6162	struct freefrag *freefrag;
6163	struct jnewblk *jnewblk;
6164
6165	if (oldblkno)
6166		freefrag = newfreefrag(ip, oldblkno, ITOFS(ip)->fs_bsize, lbn,
6167		    SINGLETON_KEY);
6168	else
6169		freefrag = NULL;
6170	ACQUIRE_LOCK(ITOUMP(ip));
6171	if (newblk_lookup(ITOVFS(ip), newblkno, 0, &newblk) == 0)
6172		panic("newallocindir: lost block");
6173	KASSERT(newblk->nb_list.wk_type == D_NEWBLK,
6174	    ("newallocindir: newblk already initialized"));
6175	WORKITEM_REASSIGN(newblk, D_ALLOCINDIR);
6176	newblk->nb_freefrag = freefrag;
6177	aip = (struct allocindir *)newblk;
6178	aip->ai_offset = ptrno;
6179	aip->ai_oldblkno = oldblkno;
6180	aip->ai_lbn = lbn;
6181	if ((jnewblk = newblk->nb_jnewblk) != NULL) {
6182		jnewblk->jn_ino = ip->i_number;
6183		jnewblk->jn_lbn = lbn;
6184		add_to_journal(&jnewblk->jn_list);
6185	}
6186	if (freefrag && freefrag->ff_jdep != NULL &&
6187	    freefrag->ff_jdep->wk_type == D_JFREEFRAG)
6188		add_to_journal(freefrag->ff_jdep);
6189	return (aip);
6190}
6191
6192/*
6193 * Called just before setting an indirect block pointer
6194 * to a newly allocated file page.
6195 */
6196void
6197softdep_setup_allocindir_page(ip, lbn, bp, ptrno, newblkno, oldblkno, nbp)
6198	struct inode *ip;	/* inode for file being extended */
6199	ufs_lbn_t lbn;		/* allocated block number within file */
6200	struct buf *bp;		/* buffer with indirect blk referencing page */
6201	int ptrno;		/* offset of pointer in indirect block */
6202	ufs2_daddr_t newblkno;	/* disk block number being added */
6203	ufs2_daddr_t oldblkno;	/* previous block number, 0 if none */
6204	struct buf *nbp;	/* buffer holding allocated page */
6205{
6206	struct inodedep *inodedep;
6207	struct freefrag *freefrag;
6208	struct allocindir *aip;
6209	struct pagedep *pagedep;
6210	struct mount *mp;
6211	struct ufsmount *ump;
6212
6213	mp = ITOVFS(ip);
6214	ump = VFSTOUFS(mp);
6215	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6216	    ("softdep_setup_allocindir_page called on non-softdep filesystem"));
6217	KASSERT(lbn == nbp->b_lblkno,
6218	    ("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
6219	    lbn, nbp->b_lblkno));
6220	CTR4(KTR_SUJ,
6221	    "softdep_setup_allocindir_page: ino %d blkno %jd oldblkno %jd "
6222	    "lbn %jd", ip->i_number, newblkno, oldblkno, lbn);
6223	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
6224	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
6225	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6226	/*
6227	 * If we are allocating a directory page, then we must
6228	 * allocate an associated pagedep to track additions and
6229	 * deletions.
6230	 */
6231	if ((ip->i_mode & IFMT) == IFDIR)
6232		pagedep_lookup(mp, nbp, ip->i_number, lbn, DEPALLOC, &pagedep);
6233	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
6234	freefrag = setup_allocindir_phase2(bp, ip, inodedep, aip, lbn);
6235	FREE_LOCK(ump);
6236	if (freefrag)
6237		handle_workitem_freefrag(freefrag);
6238}
6239
6240/*
6241 * Called just before setting an indirect block pointer to a
6242 * newly allocated indirect block.
6243 */
6244void
6245softdep_setup_allocindir_meta(nbp, ip, bp, ptrno, newblkno)
6246	struct buf *nbp;	/* newly allocated indirect block */
6247	struct inode *ip;	/* inode for file being extended */
6248	struct buf *bp;		/* indirect block referencing allocated block */
6249	int ptrno;		/* offset of pointer in indirect block */
6250	ufs2_daddr_t newblkno;	/* disk block number being added */
6251{
6252	struct inodedep *inodedep;
6253	struct allocindir *aip;
6254	struct ufsmount *ump;
6255	ufs_lbn_t lbn;
6256
6257	ump = ITOUMP(ip);
6258	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
6259	    ("softdep_setup_allocindir_meta called on non-softdep filesystem"));
6260	CTR3(KTR_SUJ,
6261	    "softdep_setup_allocindir_meta: ino %d blkno %jd ptrno %d",
6262	    ip->i_number, newblkno, ptrno);
6263	lbn = nbp->b_lblkno;
6264	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
6265	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
6266	inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
6267	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
6268	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
6269		panic("softdep_setup_allocindir_meta: Block already existed");
6270	FREE_LOCK(ump);
6271}
6272
6273static void
6274indirdep_complete(indirdep)
6275	struct indirdep *indirdep;
6276{
6277	struct allocindir *aip;
6278
6279	LIST_REMOVE(indirdep, ir_next);
6280	indirdep->ir_state |= DEPCOMPLETE;
6281
6282	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL) {
6283		LIST_REMOVE(aip, ai_next);
6284		free_newblk(&aip->ai_block);
6285	}
6286	/*
6287	 * If this indirdep is not attached to a buf it was simply waiting
6288	 * on completion to clear completehd.  free_indirdep() asserts
6289	 * that nothing is dangling.
6290	 */
6291	if ((indirdep->ir_state & ONWORKLIST) == 0)
6292		free_indirdep(indirdep);
6293}
6294
6295static struct indirdep *
6296indirdep_lookup(mp, ip, bp)
6297	struct mount *mp;
6298	struct inode *ip;
6299	struct buf *bp;
6300{
6301	struct indirdep *indirdep, *newindirdep;
6302	struct newblk *newblk;
6303	struct ufsmount *ump;
6304	struct worklist *wk;
6305	struct fs *fs;
6306	ufs2_daddr_t blkno;
6307
6308	ump = VFSTOUFS(mp);
6309	LOCK_OWNED(ump);
6310	indirdep = NULL;
6311	newindirdep = NULL;
6312	fs = ump->um_fs;
6313	for (;;) {
6314		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
6315			if (wk->wk_type != D_INDIRDEP)
6316				continue;
6317			indirdep = WK_INDIRDEP(wk);
6318			break;
6319		}
6320		/* Found on the buffer worklist, no new structure to free. */
6321		if (indirdep != NULL && newindirdep == NULL)
6322			return (indirdep);
6323		if (indirdep != NULL && newindirdep != NULL)
6324			panic("indirdep_lookup: simultaneous create");
6325		/* None found on the buffer and a new structure is ready. */
6326		if (indirdep == NULL && newindirdep != NULL)
6327			break;
6328		/* None found and no new structure available. */
6329		FREE_LOCK(ump);
6330		newindirdep = malloc(sizeof(struct indirdep),
6331		    M_INDIRDEP, M_SOFTDEP_FLAGS);
6332		workitem_alloc(&newindirdep->ir_list, D_INDIRDEP, mp);
6333		newindirdep->ir_state = ATTACHED;
6334		if (I_IS_UFS1(ip))
6335			newindirdep->ir_state |= UFS1FMT;
6336		TAILQ_INIT(&newindirdep->ir_trunc);
6337		newindirdep->ir_saveddata = NULL;
6338		LIST_INIT(&newindirdep->ir_deplisthd);
6339		LIST_INIT(&newindirdep->ir_donehd);
6340		LIST_INIT(&newindirdep->ir_writehd);
6341		LIST_INIT(&newindirdep->ir_completehd);
6342		if (bp->b_blkno == bp->b_lblkno) {
6343			ufs_bmaparray(bp->b_vp, bp->b_lblkno, &blkno, bp,
6344			    NULL, NULL);
6345			bp->b_blkno = blkno;
6346		}
6347		newindirdep->ir_freeblks = NULL;
6348		newindirdep->ir_savebp =
6349		    getblk(ump->um_devvp, bp->b_blkno, bp->b_bcount, 0, 0, 0);
6350		newindirdep->ir_bp = bp;
6351		BUF_KERNPROC(newindirdep->ir_savebp);
6352		bcopy(bp->b_data, newindirdep->ir_savebp->b_data, bp->b_bcount);
6353		ACQUIRE_LOCK(ump);
6354	}
6355	indirdep = newindirdep;
6356	WORKLIST_INSERT(&bp->b_dep, &indirdep->ir_list);
6357	/*
6358	 * If the block is not yet allocated we don't set DEPCOMPLETE so
6359	 * that we don't free dependencies until the pointers are valid.
6360	 * This could search b_dep for D_ALLOCDIRECT/D_ALLOCINDIR rather
6361	 * than using the hash.
6362	 */
6363	if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk))
6364		LIST_INSERT_HEAD(&newblk->nb_indirdeps, indirdep, ir_next);
6365	else
6366		indirdep->ir_state |= DEPCOMPLETE;
6367	return (indirdep);
6368}
6369
6370/*
6371 * Called to finish the allocation of the "aip" allocated
6372 * by one of the two routines above.
6373 */
6374static struct freefrag *
6375setup_allocindir_phase2(bp, ip, inodedep, aip, lbn)
6376	struct buf *bp;		/* in-memory copy of the indirect block */
6377	struct inode *ip;	/* inode for file being extended */
6378	struct inodedep *inodedep; /* Inodedep for ip */
6379	struct allocindir *aip;	/* allocindir allocated by the above routines */
6380	ufs_lbn_t lbn;		/* Logical block number for this block. */
6381{
6382	struct fs *fs;
6383	struct indirdep *indirdep;
6384	struct allocindir *oldaip;
6385	struct freefrag *freefrag;
6386	struct mount *mp;
6387	struct ufsmount *ump;
6388
6389	mp = ITOVFS(ip);
6390	ump = VFSTOUFS(mp);
6391	LOCK_OWNED(ump);
6392	fs = ump->um_fs;
6393	if (bp->b_lblkno >= 0)
6394		panic("setup_allocindir_phase2: not indir blk");
6395	KASSERT(aip->ai_offset >= 0 && aip->ai_offset < NINDIR(fs),
6396	    ("setup_allocindir_phase2: Bad offset %d", aip->ai_offset));
6397	indirdep = indirdep_lookup(mp, ip, bp);
6398	KASSERT(indirdep->ir_savebp != NULL,
6399	    ("setup_allocindir_phase2 NULL ir_savebp"));
6400	aip->ai_indirdep = indirdep;
6401	/*
6402	 * Check for an unwritten dependency for this indirect offset.  If
6403	 * there is, merge the old dependency into the new one.  This happens
6404	 * as a result of reallocblk only.
6405	 */
6406	freefrag = NULL;
6407	if (aip->ai_oldblkno != 0) {
6408		LIST_FOREACH(oldaip, &indirdep->ir_deplisthd, ai_next) {
6409			if (oldaip->ai_offset == aip->ai_offset) {
6410				freefrag = allocindir_merge(aip, oldaip);
6411				goto done;
6412			}
6413		}
6414		LIST_FOREACH(oldaip, &indirdep->ir_donehd, ai_next) {
6415			if (oldaip->ai_offset == aip->ai_offset) {
6416				freefrag = allocindir_merge(aip, oldaip);
6417				goto done;
6418			}
6419		}
6420	}
6421done:
6422	LIST_INSERT_HEAD(&indirdep->ir_deplisthd, aip, ai_next);
6423	return (freefrag);
6424}
6425
6426/*
6427 * Merge two allocindirs which refer to the same block.  Move newblock
6428 * dependencies and setup the freefrags appropriately.
6429 */
6430static struct freefrag *
6431allocindir_merge(aip, oldaip)
6432	struct allocindir *aip;
6433	struct allocindir *oldaip;
6434{
6435	struct freefrag *freefrag;
6436	struct worklist *wk;
6437
6438	if (oldaip->ai_newblkno != aip->ai_oldblkno)
6439		panic("allocindir_merge: blkno");
6440	aip->ai_oldblkno = oldaip->ai_oldblkno;
6441	freefrag = aip->ai_freefrag;
6442	aip->ai_freefrag = oldaip->ai_freefrag;
6443	oldaip->ai_freefrag = NULL;
6444	KASSERT(freefrag != NULL, ("allocindir_merge: No freefrag"));
6445	/*
6446	 * If we are tracking a new directory-block allocation,
6447	 * move it from the old allocindir to the new allocindir.
6448	 */
6449	if ((wk = LIST_FIRST(&oldaip->ai_newdirblk)) != NULL) {
6450		WORKLIST_REMOVE(wk);
6451		if (!LIST_EMPTY(&oldaip->ai_newdirblk))
6452			panic("allocindir_merge: extra newdirblk");
6453		WORKLIST_INSERT(&aip->ai_newdirblk, wk);
6454	}
6455	/*
6456	 * We can skip journaling for this freefrag and just complete
6457	 * any pending journal work for the allocindir that is being
6458	 * removed after the freefrag completes.
6459	 */
6460	if (freefrag->ff_jdep)
6461		cancel_jfreefrag(WK_JFREEFRAG(freefrag->ff_jdep));
6462	LIST_REMOVE(oldaip, ai_next);
6463	freefrag->ff_jdep = (struct worklist *)cancel_newblk(&oldaip->ai_block,
6464	    &freefrag->ff_list, &freefrag->ff_jwork);
6465	free_newblk(&oldaip->ai_block);
6466
6467	return (freefrag);
6468}
6469
6470static inline void
6471setup_freedirect(freeblks, ip, i, needj)
6472	struct freeblks *freeblks;
6473	struct inode *ip;
6474	int i;
6475	int needj;
6476{
6477	struct ufsmount *ump;
6478	ufs2_daddr_t blkno;
6479	int frags;
6480
6481	blkno = DIP(ip, i_db[i]);
6482	if (blkno == 0)
6483		return;
6484	DIP_SET(ip, i_db[i], 0);
6485	ump = ITOUMP(ip);
6486	frags = sblksize(ump->um_fs, ip->i_size, i);
6487	frags = numfrags(ump->um_fs, frags);
6488	newfreework(ump, freeblks, NULL, i, blkno, frags, 0, needj);
6489}
6490
6491static inline void
6492setup_freeext(freeblks, ip, i, needj)
6493	struct freeblks *freeblks;
6494	struct inode *ip;
6495	int i;
6496	int needj;
6497{
6498	struct ufsmount *ump;
6499	ufs2_daddr_t blkno;
6500	int frags;
6501
6502	blkno = ip->i_din2->di_extb[i];
6503	if (blkno == 0)
6504		return;
6505	ip->i_din2->di_extb[i] = 0;
6506	ump = ITOUMP(ip);
6507	frags = sblksize(ump->um_fs, ip->i_din2->di_extsize, i);
6508	frags = numfrags(ump->um_fs, frags);
6509	newfreework(ump, freeblks, NULL, -1 - i, blkno, frags, 0, needj);
6510}
6511
6512static inline void
6513setup_freeindir(freeblks, ip, i, lbn, needj)
6514	struct freeblks *freeblks;
6515	struct inode *ip;
6516	int i;
6517	ufs_lbn_t lbn;
6518	int needj;
6519{
6520	struct ufsmount *ump;
6521	ufs2_daddr_t blkno;
6522
6523	blkno = DIP(ip, i_ib[i]);
6524	if (blkno == 0)
6525		return;
6526	DIP_SET(ip, i_ib[i], 0);
6527	ump = ITOUMP(ip);
6528	newfreework(ump, freeblks, NULL, lbn, blkno, ump->um_fs->fs_frag,
6529	    0, needj);
6530}
6531
6532static inline struct freeblks *
6533newfreeblks(mp, ip)
6534	struct mount *mp;
6535	struct inode *ip;
6536{
6537	struct freeblks *freeblks;
6538
6539	freeblks = malloc(sizeof(struct freeblks),
6540		M_FREEBLKS, M_SOFTDEP_FLAGS|M_ZERO);
6541	workitem_alloc(&freeblks->fb_list, D_FREEBLKS, mp);
6542	LIST_INIT(&freeblks->fb_jblkdephd);
6543	LIST_INIT(&freeblks->fb_jwork);
6544	freeblks->fb_ref = 0;
6545	freeblks->fb_cgwait = 0;
6546	freeblks->fb_state = ATTACHED;
6547	freeblks->fb_uid = ip->i_uid;
6548	freeblks->fb_inum = ip->i_number;
6549	freeblks->fb_vtype = ITOV(ip)->v_type;
6550	freeblks->fb_modrev = DIP(ip, i_modrev);
6551	freeblks->fb_devvp = ITODEVVP(ip);
6552	freeblks->fb_chkcnt = 0;
6553	freeblks->fb_len = 0;
6554
6555	return (freeblks);
6556}
6557
6558static void
6559trunc_indirdep(indirdep, freeblks, bp, off)
6560	struct indirdep *indirdep;
6561	struct freeblks *freeblks;
6562	struct buf *bp;
6563	int off;
6564{
6565	struct allocindir *aip, *aipn;
6566
6567	/*
6568	 * The first set of allocindirs won't be in savedbp.
6569	 */
6570	LIST_FOREACH_SAFE(aip, &indirdep->ir_deplisthd, ai_next, aipn)
6571		if (aip->ai_offset > off)
6572			cancel_allocindir(aip, bp, freeblks, 1);
6573	LIST_FOREACH_SAFE(aip, &indirdep->ir_donehd, ai_next, aipn)
6574		if (aip->ai_offset > off)
6575			cancel_allocindir(aip, bp, freeblks, 1);
6576	/*
6577	 * These will exist in savedbp.
6578	 */
6579	LIST_FOREACH_SAFE(aip, &indirdep->ir_writehd, ai_next, aipn)
6580		if (aip->ai_offset > off)
6581			cancel_allocindir(aip, NULL, freeblks, 0);
6582	LIST_FOREACH_SAFE(aip, &indirdep->ir_completehd, ai_next, aipn)
6583		if (aip->ai_offset > off)
6584			cancel_allocindir(aip, NULL, freeblks, 0);
6585}
6586
6587/*
6588 * Follow the chain of indirects down to lastlbn creating a freework
6589 * structure for each.  This will be used to start indir_trunc() at
6590 * the right offset and create the journal records for the partial
6591 * truncation.  A second step will handle the truncated dependencies.
6592 */
6593static int
6594setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno)
6595	struct freeblks *freeblks;
6596	struct inode *ip;
6597	ufs_lbn_t lbn;
6598	ufs_lbn_t lastlbn;
6599	ufs2_daddr_t blkno;
6600{
6601	struct indirdep *indirdep;
6602	struct indirdep *indirn;
6603	struct freework *freework;
6604	struct newblk *newblk;
6605	struct mount *mp;
6606	struct ufsmount *ump;
6607	struct buf *bp;
6608	uint8_t *start;
6609	uint8_t *end;
6610	ufs_lbn_t lbnadd;
6611	int level;
6612	int error;
6613	int off;
6614
6615	freework = NULL;
6616	if (blkno == 0)
6617		return (0);
6618	mp = freeblks->fb_list.wk_mp;
6619	ump = VFSTOUFS(mp);
6620	/*
6621	 * Here, calls to VOP_BMAP() will fail.  However, we already have
6622	 * the on-disk address, so we just pass it to bread() instead of
6623	 * having bread() attempt to calculate it using VOP_BMAP().
6624	 */
6625	error = ffs_breadz(ump, ITOV(ip), lbn, blkptrtodb(ump, blkno),
6626	    (int)mp->mnt_stat.f_iosize, NULL, NULL, 0, NOCRED, 0, NULL, &bp);
6627	if (error)
6628		return (error);
6629	level = lbn_level(lbn);
6630	lbnadd = lbn_offset(ump->um_fs, level);
6631	/*
6632	 * Compute the offset of the last block we want to keep.  Store
6633	 * in the freework the first block we want to completely free.
6634	 */
6635	off = (lastlbn - -(lbn + level)) / lbnadd;
6636	if (off + 1 == NINDIR(ump->um_fs))
6637		goto nowork;
6638	freework = newfreework(ump, freeblks, NULL, lbn, blkno, 0, off + 1, 0);
6639	/*
6640	 * Link the freework into the indirdep.  This will prevent any new
6641	 * allocations from proceeding until we are finished with the
6642	 * truncate and the block is written.
6643	 */
6644	ACQUIRE_LOCK(ump);
6645	indirdep = indirdep_lookup(mp, ip, bp);
6646	if (indirdep->ir_freeblks)
6647		panic("setup_trunc_indir: indirdep already truncated.");
6648	TAILQ_INSERT_TAIL(&indirdep->ir_trunc, freework, fw_next);
6649	freework->fw_indir = indirdep;
6650	/*
6651	 * Cancel any allocindirs that will not make it to disk.
6652	 * We have to do this for all copies of the indirdep that
6653	 * live on this newblk.
6654	 */
6655	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
6656		if (newblk_lookup(mp, dbtofsb(ump->um_fs, bp->b_blkno), 0,
6657		    &newblk) == 0)
6658			panic("setup_trunc_indir: lost block");
6659		LIST_FOREACH(indirn, &newblk->nb_indirdeps, ir_next)
6660			trunc_indirdep(indirn, freeblks, bp, off);
6661	} else
6662		trunc_indirdep(indirdep, freeblks, bp, off);
6663	FREE_LOCK(ump);
6664	/*
6665	 * Creation is protected by the buf lock. The saveddata is only
6666	 * needed if a full truncation follows a partial truncation, but it
6667	 * is difficult to allocate in that case, so we fetch it anyway.
6668	 */
6669	if (indirdep->ir_saveddata == NULL)
6670		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
6671		    M_SOFTDEP_FLAGS);
6672nowork:
6673	/* Fetch the blkno of the child and the offset at which to start zeroing. */
6674	if (I_IS_UFS1(ip)) {
6675		blkno = ((ufs1_daddr_t *)bp->b_data)[off];
6676		start = (uint8_t *)&((ufs1_daddr_t *)bp->b_data)[off+1];
6677	} else {
6678		blkno = ((ufs2_daddr_t *)bp->b_data)[off];
6679		start = (uint8_t *)&((ufs2_daddr_t *)bp->b_data)[off+1];
6680	}
6681	if (freework) {
6682		/* Zero the truncated pointers. */
6683		end = bp->b_data + bp->b_bcount;
6684		bzero(start, end - start);
6685		bdwrite(bp);
6686	} else
6687		bqrelse(bp);
6688	if (level == 0)
6689		return (0);
6690	lbn++; /* adjust level */
6691	lbn -= (off * lbnadd);
6692	return setup_trunc_indir(freeblks, ip, lbn, lastlbn, blkno);
6693}
6694
6695/*
6696 * Complete the partial truncation of an indirect block setup by
6697 * setup_trunc_indir().  This zeros the truncated pointers in the saved
6698 * copy and writes them to disk before the freeblks is allowed to complete.
6699 */
6700static void
6701complete_trunc_indir(freework)
6702	struct freework *freework;
6703{
6704	struct freework *fwn;
6705	struct indirdep *indirdep;
6706	struct ufsmount *ump;
6707	struct buf *bp;
6708	uintptr_t start;
6709	int count;
6710
6711	ump = VFSTOUFS(freework->fw_list.wk_mp);
6712	LOCK_OWNED(ump);
6713	indirdep = freework->fw_indir;
6714	for (;;) {
6715		bp = indirdep->ir_bp;
6716		/* See if the block was discarded. */
6717		if (bp == NULL)
6718			break;
6719		/* Inline part of getdirtybuf().  We don't want bremfree. */
6720		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) == 0)
6721			break;
6722		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
6723		    LOCK_PTR(ump)) == 0)
6724			BUF_UNLOCK(bp);
6725		ACQUIRE_LOCK(ump);
6726	}
6727	freework->fw_state |= DEPCOMPLETE;
6728	TAILQ_REMOVE(&indirdep->ir_trunc, freework, fw_next);
6729	/*
6730	 * Zero the pointers in the saved copy.
6731	 */
6732	if (indirdep->ir_state & UFS1FMT)
6733		start = sizeof(ufs1_daddr_t);
6734	else
6735		start = sizeof(ufs2_daddr_t);
6736	start *= freework->fw_start;
6737	count = indirdep->ir_savebp->b_bcount - start;
6738	start += (uintptr_t)indirdep->ir_savebp->b_data;
6739	bzero((char *)start, count);
6740	/*
6741	 * We need to start the next truncation in the list if it has not
6742	 * been started yet.
6743	 */
6744	fwn = TAILQ_FIRST(&indirdep->ir_trunc);
6745	if (fwn != NULL) {
6746		if (fwn->fw_freeblks == indirdep->ir_freeblks)
6747			TAILQ_REMOVE(&indirdep->ir_trunc, fwn, fw_next);
6748		if ((fwn->fw_state & ONWORKLIST) == 0)
6749			freework_enqueue(fwn);
6750	}
6751	/*
6752	 * If bp is NULL the block was fully truncated, so restore
6753	 * the saved copy of the pointers; in either case the saved
6754	 * data is then freed as it is no longer needed.
6755	 */
6756	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
6757		if (bp == NULL)
6758			bcopy(indirdep->ir_saveddata,
6759			    indirdep->ir_savebp->b_data,
6760			    indirdep->ir_savebp->b_bcount);
6761		free(indirdep->ir_saveddata, M_INDIRDEP);
6762		indirdep->ir_saveddata = NULL;
6763	}
6764	/*
6765	 * When bp is NULL there is a full truncation pending.  We
6766	 * must wait for this full truncation to be journaled before
6767	 * we can release this freework because the disk pointers will
6768	 * never be written as zero.
6769	 */
6770	if (bp == NULL)  {
6771		if (LIST_EMPTY(&indirdep->ir_freeblks->fb_jblkdephd))
6772			handle_written_freework(freework);
6773		else
6774			WORKLIST_INSERT(&indirdep->ir_freeblks->fb_freeworkhd,
6775			   &freework->fw_list);
6776		if (fwn == NULL) {
6777			freework->fw_indir = (void *)0x0000deadbeef0000;
6778			bp = indirdep->ir_savebp;
6779			indirdep->ir_savebp = NULL;
6780			free_indirdep(indirdep);
6781			FREE_LOCK(ump);
6782			brelse(bp);
6783			ACQUIRE_LOCK(ump);
6784		}
6785	} else {
6786		/* Complete when the real copy is written. */
6787		WORKLIST_INSERT(&bp->b_dep, &freework->fw_list);
6788		BUF_UNLOCK(bp);
6789	}
6790}
6791
6792/*
6793 * Calculate the number of blocks we are going to release where datablocks
6794 * is the current total and length is the new file size.
6795 */
6796static ufs2_daddr_t
6797blkcount(fs, datablocks, length)
6798	struct fs *fs;
6799	ufs2_daddr_t datablocks;
6800	off_t length;
6801{
6802	off_t totblks, numblks;
6803
6804	totblks = 0;
6805	numblks = howmany(length, fs->fs_bsize);
6806	if (numblks <= UFS_NDADDR) {
6807		totblks = howmany(length, fs->fs_fsize);
6808		goto out;
6809	}
6810	totblks = blkstofrags(fs, numblks);
6811	numblks -= UFS_NDADDR;
6812	/*
6813	 * Count all single, then double, then triple indirects required.
6814	 * Subtracting one indirect's worth of blocks for each pass
6815	 * acknowledges the one of each level pointed to directly by the inode.
6816	 */
6817	for (;;) {
6818		totblks += blkstofrags(fs, howmany(numblks, NINDIR(fs)));
6819		numblks -= NINDIR(fs);
6820		if (numblks <= 0)
6821			break;
6822		numblks = howmany(numblks, NINDIR(fs));
6823	}
6824out:
6825	totblks = fsbtodb(fs, totblks);
6826	/*
6827	 * Handle sparse files.  We can't reclaim more blocks than the inode
6828	 * references.  We will correct it later in handle_complete_freeblks()
6829	 * when we know the real count.
6830	 */
6831	if (totblks > datablocks)
6832		return (0);
6833	return (datablocks - totblks);
6834}
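
/*
 * A worked example of the computation above, assuming a UFS2
 * filesystem with 32K blocks and 4K fragments (so 8 frags per block,
 * NINDIR(fs) == 4096, and fsbtodb() multiplies frag counts by 8 to
 * get DEV_BSIZE sectors).  Truncating to a length of 1MB:
 *
 *	numblks = howmany(1048576, 32768) = 32	(> UFS_NDADDR)
 *	totblks = blkstofrags(fs, 32) = 256	frags of data
 *	numblks = 32 - UFS_NDADDR = 20
 *	totblks += blkstofrags(fs, howmany(20, 4096)) = 256 + 8 = 264
 *						(one single indirect)
 *	totblks = fsbtodb(fs, 264) = 2112	sectors retained
 *
 * so the caller charges datablocks - 2112 sectors as pending frees
 * (or 0 for a sparse file that holds fewer than that).
 */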
6835
6836/*
6837 * Handle freeblocks for journaled softupdate filesystems.
6838 *
6839 * Contrary to normal softupdates, we must preserve the block pointers in
6840 * indirects until their subordinates are free.  This is to avoid journaling
6841 * every block that is freed which may consume more space than the journal
6842 * itself.  The recovery program will see the free block journals at the
6843 * base of the truncated area and traverse them to reclaim space.  The
6844 * pointers in the inode may be cleared immediately after the journal
6845 * records are written because each direct and indirect pointer in the
6846 * inode is recorded in a journal.  This permits full truncation to proceed
6847 * asynchronously.  The write order is journal -> inode -> cgs -> indirects.
6848 *
6849 * The algorithm is as follows:
6850 * 1) Traverse the in-memory state and create journal entries to release
6851 *    the relevant blocks and full indirect trees.
6852 * 2) Traverse the indirect block chain adding partial truncation freework
6853 *    records to indirects in the path to lastlbn.  The freework will
6854 *    prevent new allocation dependencies from being satisfied in this
6855 *    indirect until the truncation completes.
6856 * 3) Read and lock the inode block, performing an update with the new size
6857 *    and pointers.  This prevents truncated data from becoming valid on
6858 *    disk through step 4.
6859 * 4) Reap unsatisfied dependencies that are beyond the truncated area,
6860 *    and eliminate journal work for those records that do not require it.
6861 * 5) Schedule the journal records to be written followed by the inode block.
6862 * 6) Allocate any necessary frags for the end of file.
6863 * 7) Zero any partially truncated blocks.
6864 *
6865 * From this point truncation proceeds asynchronously using the freework and
6866 * indir_trunc machinery.  The file will not be extended again into a
6867 * partially truncated indirect block until all work is completed but
6868 * the normal dependency mechanism ensures that it is rolled back/forward
6869 * as appropriate.  Further truncation may occur without delay and is
6870 * serialized in indir_trunc().
6871 */
6872void
6873softdep_journal_freeblocks(ip, cred, length, flags)
6874	struct inode *ip;	/* The inode whose length is to be reduced */
6875	struct ucred *cred;
6876	off_t length;		/* The new length for the file */
6877	int flags;		/* IO_EXT and/or IO_NORMAL */
6878{
6879	struct freeblks *freeblks, *fbn;
6880	struct worklist *wk, *wkn;
6881	struct inodedep *inodedep;
6882	struct jblkdep *jblkdep;
6883	struct allocdirect *adp, *adpn;
6884	struct ufsmount *ump;
6885	struct fs *fs;
6886	struct buf *bp;
6887	struct vnode *vp;
6888	struct mount *mp;
6889	daddr_t dbn;
6890	ufs2_daddr_t extblocks, datablocks;
6891	ufs_lbn_t tmpval, lbn, lastlbn;
6892	int frags, lastoff, iboff, allocblock, needj, error, i;
6893
6894	ump = ITOUMP(ip);
6895	mp = UFSTOVFS(ump);
6896	fs = ump->um_fs;
6897	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
6898	    ("softdep_journal_freeblocks called on non-softdep filesystem"));
6899	vp = ITOV(ip);
6900	needj = 1;
6901	iboff = -1;
6902	allocblock = 0;
6903	extblocks = 0;
6904	datablocks = 0;
6905	frags = 0;
6906	freeblks = newfreeblks(mp, ip);
6907	ACQUIRE_LOCK(ump);
6908	/*
6909	 * If we're truncating a removed file that will never be written
6910	 * we don't need to journal the block frees.  The canceled journals
6911	 * for the allocations will suffice.
6912	 */
6913	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
6914	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
6915	    length == 0)
6916		needj = 0;
6917	CTR3(KTR_SUJ, "softdep_journal_freeblks: ip %d length %ld needj %d",
6918	    ip->i_number, length, needj);
6919	FREE_LOCK(ump);
6920	/*
6921	 * Calculate the lbn that we are truncating to.  This results in -1
6922	 * if we're truncating to 0 bytes.  So it is the last lbn we want
6923	 * to keep, not the first lbn we want to truncate.
6924	 */
6925	lastlbn = lblkno(fs, length + fs->fs_bsize - 1) - 1;
6926	lastoff = blkoff(fs, length);
6927	/*
6928	 * Compute frags we are keeping in lastlbn.  0 means all.
6929	 */
6930	if (lastlbn >= 0 && lastlbn < UFS_NDADDR) {
6931		frags = fragroundup(fs, lastoff);
6932		/* adp offset of last valid allocdirect. */
6933		iboff = lastlbn;
6934	} else if (lastlbn > 0)
6935		iboff = UFS_NDADDR;
6936	if (fs->fs_magic == FS_UFS2_MAGIC)
6937		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
6938	/*
6939	 * Handle normal data blocks and indirects.  This section saves
6940	 * values used after the inode update to complete frag and indirect
6941	 * truncation.
6942	 */
6943	if ((flags & IO_NORMAL) != 0) {
6944		/*
6945		 * Handle truncation of whole direct and indirect blocks.
6946		 */
6947		for (i = iboff + 1; i < UFS_NDADDR; i++)
6948			setup_freedirect(freeblks, ip, i, needj);
6949		for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
6950		    i < UFS_NIADDR;
6951		    i++, lbn += tmpval, tmpval *= NINDIR(fs)) {
6952			/* Release a whole indirect tree. */
6953			if (lbn > lastlbn) {
6954				setup_freeindir(freeblks, ip, i, -lbn - i,
6955				    needj);
6956				continue;
6957			}
6958			iboff = i + UFS_NDADDR;
6959			/*
6960			 * Traverse partially truncated indirect tree.
6961			 */
6962			if (lbn <= lastlbn && lbn + tmpval - 1 > lastlbn)
6963				setup_trunc_indir(freeblks, ip, -lbn - i,
6964				    lastlbn, DIP(ip, i_ib[i]));
6965		}
6966		/*
6967		 * Handle partial truncation to a frag boundary.
6968		 */
6969		if (frags) {
6970			ufs2_daddr_t blkno;
6971			long oldfrags;
6972
6973			oldfrags = blksize(fs, ip, lastlbn);
6974			blkno = DIP(ip, i_db[lastlbn]);
6975			if (blkno && oldfrags != frags) {
6976				oldfrags -= frags;
6977				oldfrags = numfrags(fs, oldfrags);
6978				blkno += numfrags(fs, frags);
6979				newfreework(ump, freeblks, NULL, lastlbn,
6980				    blkno, oldfrags, 0, needj);
6981				if (needj)
6982					adjust_newfreework(freeblks,
6983					    numfrags(fs, frags));
6984			} else if (blkno == 0)
6985				allocblock = 1;
6986		}
6987		/*
6988		 * Add a journal record for partial truncate if we are
6989		 * handling indirect blocks.  Non-indirects need no extra
6990		 * journaling.
6991		 */
6992		if (length != 0 && lastlbn >= UFS_NDADDR) {
6993			UFS_INODE_SET_FLAG(ip, IN_TRUNCATED);
6994			newjtrunc(freeblks, length, 0);
6995		}
6996		ip->i_size = length;
6997		DIP_SET(ip, i_size, ip->i_size);
6998		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
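		/*
		 * Compute the number of data blocks being released.  For
		 * a partial truncation blkcount() limits this to the
		 * blocks that lie beyond the new length.
		 */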
6999		datablocks = DIP(ip, i_blocks) - extblocks;
7000		if (length != 0)
7001			datablocks = blkcount(fs, datablocks, length);
7002		freeblks->fb_len = length;
7003	}
7004	if ((flags & IO_EXT) != 0) {
7005		for (i = 0; i < UFS_NXADDR; i++)
7006			setup_freeext(freeblks, ip, i, needj);
7007		ip->i_din2->di_extsize = 0;
7008		datablocks += extblocks;
7009		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
7010	}
7011#ifdef QUOTA
7012	/* Reference the quotas in case the block count is wrong in the end. */
7013	quotaref(vp, freeblks->fb_quota);
7014	(void) chkdq(ip, -datablocks, NOCRED, FORCE);
7015#endif
7016	freeblks->fb_chkcnt = -datablocks;
7017	UFS_LOCK(ump);
7018	fs->fs_pendingblocks += datablocks;
7019	UFS_UNLOCK(ump);
7020	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
7021	/*
7022	 * Handle truncation of incomplete alloc direct dependencies.  We
7023	 * hold the inode block locked to prevent incomplete dependencies
7024	 * from reaching the disk while we are eliminating those that
7025	 * have been truncated.  This is a partially inlined ffs_update().
7026	 */
7027	ufs_itimes(vp);
7028	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
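	/*
	 * Read the buffer holding the on-disk inode and copy the updated
	 * in-core inode into it so that the truncated size and block
	 * pointers are what reach stable storage.
	 */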
7029	dbn = fsbtodb(fs, ino_to_fsba(fs, ip->i_number));
7030	error = ffs_breadz(ump, ump->um_devvp, dbn, dbn, (int)fs->fs_bsize,
7031	    NULL, NULL, 0, cred, 0, NULL, &bp);
7032	if (error) {
7033		softdep_error("softdep_journal_freeblocks", error);
7034		return;
7035	}
7036	if (bp->b_bufsize == fs->fs_bsize)
7037		bp->b_flags |= B_CLUSTEROK;
7038	softdep_update_inodeblock(ip, bp, 0);
7039	if (ump->um_fstype == UFS1) {
7040		*((struct ufs1_dinode *)bp->b_data +
7041		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din1;
7042	} else {
7043		ffs_update_dinode_ckhash(fs, ip->i_din2);
7044		*((struct ufs2_dinode *)bp->b_data +
7045		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
7046	}
7047	ACQUIRE_LOCK(ump);
7048	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
7049	if ((inodedep->id_state & IOSTARTED) != 0)
7050		panic("softdep_journal_freeblocks: inode busy");
7051	/*
7052	 * Add the freeblks structure to the list of operations that
7053	 * must await the zero'ed inode being written to disk. If we
7054	 * still have a bitmap dependency (needj), then the inode
7055	 * has never been written to disk, so we can process the
7056	 * freeblks below once we have deleted the dependencies.
7057	 */
7058	if (needj)
7059		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
7060	else
7061		freeblks->fb_state |= COMPLETE;
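	/*
	 * Cancel allocdirect dependencies for blocks beyond the new end
	 * of file.  A dependency on the new last block is only resized.
	 */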
7062	if ((flags & IO_NORMAL) != 0) {
7063		TAILQ_FOREACH_SAFE(adp, &inodedep->id_inoupdt, ad_next, adpn) {
7064			if (adp->ad_offset > iboff)
7065				cancel_allocdirect(&inodedep->id_inoupdt, adp,
7066				    freeblks);
7067			/*
7068			 * Truncate the allocdirect.  We could eliminate
7069			 * or modify journal records as well.
7070			 */
7071			else if (adp->ad_offset == iboff && frags)
7072				adp->ad_newsize = frags;
7073		}
7074	}
7075	if ((flags & IO_EXT) != 0)
7076		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
7077			cancel_allocdirect(&inodedep->id_extupdt, adp,
7078			    freeblks);
7079	/*
7080	 * Scan the bufwait list for newblock dependencies that will never
7081	 * make it to disk.
7082	 */
7083	LIST_FOREACH_SAFE(wk, &inodedep->id_bufwait, wk_list, wkn) {
7084		if (wk->wk_type != D_ALLOCDIRECT)
7085			continue;
7086		adp = WK_ALLOCDIRECT(wk);
7087		if (((flags & IO_NORMAL) != 0 && (adp->ad_offset > iboff)) ||
7088		    ((flags & IO_EXT) != 0 && (adp->ad_state & EXTDATA))) {
7089			cancel_jfreeblk(freeblks, adp->ad_newblkno);
7090			cancel_newblk(WK_NEWBLK(wk), NULL, &freeblks->fb_jwork);
7091			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7092		}
7093	}
7094	/*
7095	 * Add journal work.
7096	 */
7097	LIST_FOREACH(jblkdep, &freeblks->fb_jblkdephd, jb_deps)
7098		add_to_journal(&jblkdep->jb_list);
7099	FREE_LOCK(ump);
7100	bdwrite(bp);
7101	/*
7102	 * Truncate dependency structures beyond length.
7103	 */
7104	trunc_dependencies(ip, freeblks, lastlbn, frags, flags);
7105	/*
7106	 * This is only set when we need to allocate a fragment because
7107	 * none existed at the end of a frag-sized file.  It handles only
7108	 * allocating a new, zero filled block.
7109	 */
7110	if (allocblock) {
7111		ip->i_size = length - lastoff;
7112		DIP_SET(ip, i_size, ip->i_size);
7113		error = UFS_BALLOC(vp, length - 1, 1, cred, BA_CLRBUF, &bp);
7114		if (error != 0) {
7115			softdep_error("softdep_journal_freeblks", error);
7116			return;
7117		}
7118		ip->i_size = length;
7119		DIP_SET(ip, i_size, length);
7120		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE | IN_UPDATE);
7121		allocbuf(bp, frags);
7122		ffs_update(vp, 0);
7123		bawrite(bp);
7124	} else if (lastoff != 0 && vp->v_type != VDIR) {
7125		int size;
7126
7127		/*
7128		 * Zero the end of a truncated frag or block.
7129		 */
7130		size = sblksize(fs, length, lastlbn);
7131		error = bread(vp, lastlbn, size, cred, &bp);
7132		if (error == 0) {
7133			bzero((char *)bp->b_data + lastoff, size - lastoff);
7134			bawrite(bp);
7135		} else if (!ffs_fsfail_cleanup(ump, error)) {
7136			softdep_error("softdep_journal_freeblks", error);
7137			return;
7138		}
7139	}
7140	ACQUIRE_LOCK(ump);
7141	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
7142	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
7143	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
7144	/*
7145	 * We zero earlier truncations so they don't erroneously
7146	 * update i_blocks.
7147	 */
7148	if (freeblks->fb_len == 0 && (flags & IO_NORMAL) != 0)
7149		TAILQ_FOREACH(fbn, &inodedep->id_freeblklst, fb_next)
7150			fbn->fb_len = 0;
7151	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE &&
7152	    LIST_EMPTY(&freeblks->fb_jblkdephd))
7153		freeblks->fb_state |= INPROGRESS;
7154	else
7155		freeblks = NULL;
7156	FREE_LOCK(ump);
7157	if (freeblks)
7158		handle_workitem_freeblocks(freeblks, 0);
7159	trunc_pages(ip, length, extblocks, flags);
7160
7161}
7162
7163/*
7164 * Flush a JOP_SYNC to the journal.
7165 */
7166void
7167softdep_journal_fsync(ip)
7168	struct inode *ip;
7169{
7170	struct jfsync *jfsync;
7171	struct ufsmount *ump;
7172
7173	ump = ITOUMP(ip);
7174	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7175	    ("softdep_journal_fsync called on non-softdep filesystem"));
7176	if ((ip->i_flag & IN_TRUNCATED) == 0)
7177		return;
7178	ip->i_flag &= ~IN_TRUNCATED;
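	/*
	 * Record the new file size in a jfsync work item and wait for it
	 * to reach the journal.
	 */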
7179	jfsync = malloc(sizeof(*jfsync), M_JFSYNC, M_SOFTDEP_FLAGS | M_ZERO);
7180	workitem_alloc(&jfsync->jfs_list, D_JFSYNC, UFSTOVFS(ump));
7181	jfsync->jfs_size = ip->i_size;
7182	jfsync->jfs_ino = ip->i_number;
7183	ACQUIRE_LOCK(ump);
7184	add_to_journal(&jfsync->jfs_list);
7185	jwait(&jfsync->jfs_list, MNT_WAIT);
7186	FREE_LOCK(ump);
7187}
7188
7189/*
7190 * Block de-allocation dependencies.
7191 *
7192 * When blocks are de-allocated, the on-disk pointers must be nullified before
7193 * the blocks are made available for use by other files.  (The true
7194 * requirement is that old pointers must be nullified before new on-disk
7195 * pointers are set.  We chose this slightly more stringent requirement to
7196 * reduce complexity.) Our implementation handles this dependency by updating
7197 * the inode (or indirect block) appropriately but delaying the actual block
7198 * de-allocation (i.e., freemap and free space count manipulation) until
7199 * after the updated versions reach stable storage.  After the disk is
7200 * updated, the blocks can be safely de-allocated whenever it is convenient.
7201 * This implementation handles only the common case of reducing a file's
7202 * length to zero. Other cases are handled by the conventional synchronous
7203 * write approach.
7204 *
7205 * The ffs implementation with which we worked double-checks
7206 * the state of the block pointers and file size as it reduces
7207 * a file's length.  Some of this code is replicated here in our
7208 * soft updates implementation.  The freeblks->fb_chkcnt field is
7209 * used to transfer a part of this information to the procedure
7210 * that eventually de-allocates the blocks.
7211 *
7212 * This routine should be called from the routine that shortens
7213 * a file's length, before the inode's size or block pointers
7214 * are modified. It will save the block pointer information for
7215 * later release and zero the inode so that the calling routine
7216 * can release it.
7217 */
7218void
7219softdep_setup_freeblocks(ip, length, flags)
7220	struct inode *ip;	/* The inode whose length is to be reduced */
7221	off_t length;		/* The new length for the file */
7222	int flags;		/* IO_EXT and/or IO_NORMAL */
7223{
7224	struct ufs1_dinode *dp1;
7225	struct ufs2_dinode *dp2;
7226	struct freeblks *freeblks;
7227	struct inodedep *inodedep;
7228	struct allocdirect *adp;
7229	struct ufsmount *ump;
7230	struct buf *bp;
7231	struct fs *fs;
7232	ufs2_daddr_t extblocks, datablocks;
7233	struct mount *mp;
7234	int i, delay, error;
7235	ufs_lbn_t tmpval;
7236	ufs_lbn_t lbn;
7237
7238	ump = ITOUMP(ip);
7239	mp = UFSTOVFS(ump);
7240	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
7241	    ("softdep_setup_freeblocks called on non-softdep filesystem"));
7242	CTR2(KTR_SUJ, "softdep_setup_freeblks: ip %d length %ld",
7243	    ip->i_number, length);
7244	KASSERT(length == 0, ("softdep_setup_freeblocks: non-zero length"));
7245	fs = ump->um_fs;
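	/*
	 * Read the buffer containing the on-disk copy of the inode so
	 * that the zero'ed inode can be pushed to it below.
	 */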
7246	if ((error = bread(ump->um_devvp,
7247	    fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
7248	    (int)fs->fs_bsize, NOCRED, &bp)) != 0) {
7249		if (!ffs_fsfail_cleanup(ump, error))
7250			softdep_error("softdep_setup_freeblocks", error);
7251		return;
7252	}
7253	freeblks = newfreeblks(mp, ip);
7254	extblocks = 0;
7255	datablocks = 0;
7256	if (fs->fs_magic == FS_UFS2_MAGIC)
7257		extblocks = btodb(fragroundup(fs, ip->i_din2->di_extsize));
7258	if ((flags & IO_NORMAL) != 0) {
7259		for (i = 0; i < UFS_NDADDR; i++)
7260			setup_freedirect(freeblks, ip, i, 0);
7261		for (i = 0, tmpval = NINDIR(fs), lbn = UFS_NDADDR;
7262		    i < UFS_NIADDR;
7263		    i++, lbn += tmpval, tmpval *= NINDIR(fs))
7264			setup_freeindir(freeblks, ip, i, -lbn - i, 0);
7265		ip->i_size = 0;
7266		DIP_SET(ip, i_size, 0);
7267		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
7268		datablocks = DIP(ip, i_blocks) - extblocks;
7269	}
7270	if ((flags & IO_EXT) != 0) {
7271		for (i = 0; i < UFS_NXADDR; i++)
7272			setup_freeext(freeblks, ip, i, 0);
7273		ip->i_din2->di_extsize = 0;
7274		datablocks += extblocks;
7275		UFS_INODE_SET_FLAG(ip, IN_SIZEMOD | IN_CHANGE);
7276	}
7277#ifdef QUOTA
7278	/* Reference the quotas in case the block count is wrong in the end. */
7279	quotaref(ITOV(ip), freeblks->fb_quota);
7280	(void) chkdq(ip, -datablocks, NOCRED, FORCE);
7281#endif
7282	freeblks->fb_chkcnt = -datablocks;
7283	UFS_LOCK(ump);
7284	fs->fs_pendingblocks += datablocks;
7285	UFS_UNLOCK(ump);
7286	DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - datablocks);
7287	/*
7288	 * Push the zero'ed inode to its disk buffer so that we are free
7289	 * to delete its dependencies below. Once the dependencies are gone
7290	 * the buffer can be safely released.
7291	 */
7292	if (ump->um_fstype == UFS1) {
7293		dp1 = ((struct ufs1_dinode *)bp->b_data +
7294		    ino_to_fsbo(fs, ip->i_number));
7295		ip->i_din1->di_freelink = dp1->di_freelink;
7296		*dp1 = *ip->i_din1;
7297	} else {
7298		dp2 = ((struct ufs2_dinode *)bp->b_data +
7299		    ino_to_fsbo(fs, ip->i_number));
7300		ip->i_din2->di_freelink = dp2->di_freelink;
7301		ffs_update_dinode_ckhash(fs, ip->i_din2);
7302		*dp2 = *ip->i_din2;
7303	}
7304	/*
7305	 * Find and eliminate any inode dependencies.
7306	 */
7307	ACQUIRE_LOCK(ump);
7308	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
7309	if ((inodedep->id_state & IOSTARTED) != 0)
7310		panic("softdep_setup_freeblocks: inode busy");
7311	/*
7312	 * Add the freeblks structure to the list of operations that
7313	 * must await the zero'ed inode being written to disk. If we
7314	 * still have a bitmap dependency (delay == 0), then the inode
7315	 * has never been written to disk, so we can process the
7316	 * freeblks below once we have deleted the dependencies.
7317	 */
7318	delay = (inodedep->id_state & DEPCOMPLETE);
7319	if (delay)
7320		WORKLIST_INSERT(&bp->b_dep, &freeblks->fb_list);
7321	else
7322		freeblks->fb_state |= COMPLETE;
7323	/*
7324	 * Because the file length has been truncated to zero, any
7325	 * pending block allocation dependency structures associated
7326	 * with this inode are obsolete and can simply be de-allocated.
7327	 * We must first merge the two dependency lists to get rid of
7328	 * any duplicate freefrag structures, then purge the merged list.
7329	 * If we still have a bitmap dependency, then the inode has never
7330	 * been written to disk, so we can free any fragments without delay.
7331	 */
7332	if (flags & IO_NORMAL) {
7333		merge_inode_lists(&inodedep->id_newinoupdt,
7334		    &inodedep->id_inoupdt);
7335		while ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
7336			cancel_allocdirect(&inodedep->id_inoupdt, adp,
7337			    freeblks);
7338	}
7339	if (flags & IO_EXT) {
7340		merge_inode_lists(&inodedep->id_newextupdt,
7341		    &inodedep->id_extupdt);
7342		while ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
7343			cancel_allocdirect(&inodedep->id_extupdt, adp,
7344			    freeblks);
7345	}
7346	FREE_LOCK(ump);
7347	bdwrite(bp);
7348	trunc_dependencies(ip, freeblks, -1, 0, flags);
7349	ACQUIRE_LOCK(ump);
7350	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
7351		(void) free_inodedep(inodedep);
7352	freeblks->fb_state |= DEPCOMPLETE;
7353	/*
7354	 * If the inode with zeroed block pointers is now on disk
7355	 * we can start freeing blocks.
7356	 */
7357	if ((freeblks->fb_state & ALLCOMPLETE) == ALLCOMPLETE)
7358		freeblks->fb_state |= INPROGRESS;
7359	else
7360		freeblks = NULL;
7361	FREE_LOCK(ump);
7362	if (freeblks)
7363		handle_workitem_freeblocks(freeblks, 0);
7364	trunc_pages(ip, length, extblocks, flags);
7365}
7366
7367/*
7368 * Eliminate pages from the page cache that back parts of this inode and
7369 * adjust the vnode pager's idea of our size.  This prevents stale data
7370 * from hanging around in the page cache.
7371 */
7372static void
7373trunc_pages(ip, length, extblocks, flags)
7374	struct inode *ip;
7375	off_t length;
7376	ufs2_daddr_t extblocks;
7377	int flags;
7378{
7379	struct vnode *vp;
7380	struct fs *fs;
7381	ufs_lbn_t lbn;
7382	off_t end, extend;
7383
7384	vp = ITOV(ip);
7385	fs = ITOFS(ip);
7386	extend = OFF_TO_IDX(lblktosize(fs, -extblocks));
7387	if ((flags & IO_EXT) != 0)
7388		vn_pages_remove(vp, extend, 0);
7389	if ((flags & IO_NORMAL) == 0)
7390		return;
7391	BO_LOCK(&vp->v_bufobj);
7392	drain_output(vp);
7393	BO_UNLOCK(&vp->v_bufobj);
7394	/*
7395	 * The vnode pager eliminates file pages; we eliminate indirects
7396	 * below.
7397	 */
7398	vnode_pager_setsize(vp, length);
7399	/*
7400	 * Calculate the end based on the last indirect we want to keep.  If
7401	 * the block extends into indirects we can just use the negative of
7402	 * its lbn.  Doubles and triples exist at lower numbers so we must
7403	 * be careful not to remove those, if they exist.  double and triple
7404	 * be careful not to remove those, if they exist.  Double and triple
7405	 * to verify how many levels are required.
7406	 */
7407	lbn = lblkno(fs, length);
7408	if (lbn >= UFS_NDADDR) {
7409		/* Calculate the virtual lbn of the triple indirect. */
7410		lbn = -lbn - (UFS_NIADDR - 1);
7411		end = OFF_TO_IDX(lblktosize(fs, lbn));
7412	} else
7413		end = extend;
7414	vn_pages_remove(vp, OFF_TO_IDX(OFF_MAX), end);
7415}
7416
7417/*
7418 * See if the buf bp is in the range eliminated by truncation.
7419 */
7420static int
7421trunc_check_buf(bp, blkoffp, lastlbn, lastoff, flags)
7422	struct buf *bp;
7423	int *blkoffp;
7424	ufs_lbn_t lastlbn;
7425	int lastoff;
7426	int flags;
7427{
7428	ufs_lbn_t lbn;
7429
7430	*blkoffp = 0;
7431	/* Only match ext/normal blocks as appropriate. */
7432	if (((flags & IO_EXT) == 0 && (bp->b_xflags & BX_ALTDATA)) ||
7433	    ((flags & IO_NORMAL) == 0 && (bp->b_xflags & BX_ALTDATA) == 0))
7434		return (0);
7435	/* ALTDATA is always a full truncation. */
7436	if ((bp->b_xflags & BX_ALTDATA) != 0)
7437		return (1);
7438	/* -1 is full truncation. */
7439	if (lastlbn == -1)
7440		return (1);
7441	/*
7442	 * If this is a partial truncate we only want those
7443	 * blocks and indirect blocks that cover the range
7444	 * we're after.
7445	 */
7446	lbn = bp->b_lblkno;
7447	if (lbn < 0)
7448		lbn = -(lbn + lbn_level(lbn));
7449	if (lbn < lastlbn)
7450		return (0);
7451	/* Here we only truncate lblkno if it's partial. */
7452	if (lbn == lastlbn) {
7453		if (lastoff == 0)
7454			return (0);
7455		*blkoffp = lastoff;
7456	}
7457	return (1);
7458}
7459
7460/*
7461 * Eliminate any dependencies that exist in memory beyond lblkno:off
7462 */
7463static void
7464trunc_dependencies(ip, freeblks, lastlbn, lastoff, flags)
7465	struct inode *ip;
7466	struct freeblks *freeblks;
7467	ufs_lbn_t lastlbn;
7468	int lastoff;
7469	int flags;
7470{
7471	struct bufobj *bo;
7472	struct vnode *vp;
7473	struct buf *bp;
7474	int blkoff;
7475
7476	/*
7477	 * We must wait for any I/O in progress to finish so that
7478	 * all potential buffers on the dirty list will be visible.
7479	 * Once they are all there, walk the list and get rid of
7480	 * any dependencies.
7481	 */
7482	vp = ITOV(ip);
7483	bo = &vp->v_bufobj;
7484	BO_LOCK(bo);
7485	drain_output(vp);
7486	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
7487		bp->b_vflags &= ~BV_SCANNED;
7488restart:
7489	TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs) {
7490		if (bp->b_vflags & BV_SCANNED)
7491			continue;
7492		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7493			bp->b_vflags |= BV_SCANNED;
7494			continue;
7495		}
7496		KASSERT(bp->b_bufobj == bo, ("Wrong object in buffer"));
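		/*
		 * getdirtybuf() may sleep and drop the bufobj lock; if the
		 * buffer is gone rescan the dirty list from the top.
		 */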
7497		if ((bp = getdirtybuf(bp, BO_LOCKPTR(bo), MNT_WAIT)) == NULL)
7498			goto restart;
7499		BO_UNLOCK(bo);
7500		if (deallocate_dependencies(bp, freeblks, blkoff))
7501			bqrelse(bp);
7502		else
7503			brelse(bp);
7504		BO_LOCK(bo);
7505		goto restart;
7506	}
7507	/*
7508	 * Now do the work of vtruncbuf while also matching indirect blocks.
7509	 */
7510	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs)
7511		bp->b_vflags &= ~BV_SCANNED;
7512cleanrestart:
7513	TAILQ_FOREACH(bp, &bo->bo_clean.bv_hd, b_bobufs) {
7514		if (bp->b_vflags & BV_SCANNED)
7515			continue;
7516		if (!trunc_check_buf(bp, &blkoff, lastlbn, lastoff, flags)) {
7517			bp->b_vflags |= BV_SCANNED;
7518			continue;
7519		}
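		/*
		 * With LK_SLEEPFAIL the lock attempt returns ENOLCK if it
		 * had to sleep; in that case re-take the bufobj lock and
		 * rescan the clean list.
		 */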
7520		if (BUF_LOCK(bp,
7521		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
7522		    BO_LOCKPTR(bo)) == ENOLCK) {
7523			BO_LOCK(bo);
7524			goto cleanrestart;
7525		}
7526		BO_LOCK(bo);
7527		bp->b_vflags |= BV_SCANNED;
7528		BO_UNLOCK(bo);
7529		bremfree(bp);
7530		if (blkoff != 0) {
7531			allocbuf(bp, blkoff);
7532			bqrelse(bp);
7533		} else {
7534			bp->b_flags |= B_INVAL | B_NOCACHE | B_RELBUF;
7535			brelse(bp);
7536		}
7537		BO_LOCK(bo);
7538		goto cleanrestart;
7539	}
7540	drain_output(vp);
7541	BO_UNLOCK(bo);
7542}
7543
7544static int
7545cancel_pagedep(pagedep, freeblks, blkoff)
7546	struct pagedep *pagedep;
7547	struct freeblks *freeblks;
7548	int blkoff;
7549{
7550	struct jremref *jremref;
7551	struct jmvref *jmvref;
7552	struct dirrem *dirrem, *tmp;
7553	int i;
7554
7555	/*
7556	 * Copy any directory remove dependencies to the list
7557	 * to be processed after the freeblks proceeds.  If
7558	 * the directory entries never made it to disk they
7559	 * can be dumped directly onto the work list.
7560	 */
7561	LIST_FOREACH_SAFE(dirrem, &pagedep->pd_dirremhd, dm_next, tmp) {
7562		/* Skip this directory removal if it is intended to remain. */
7563		if (dirrem->dm_offset < blkoff)
7564			continue;
7565		/*
7566		 * If there are any dirrems we wait for the journal write
7567		 * to complete and then restart the buf scan as the lock
7568		 * has been dropped.
7569		 */
7570		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL) {
7571			jwait(&jremref->jr_list, MNT_WAIT);
7572			return (ERESTART);
7573		}
7574		LIST_REMOVE(dirrem, dm_next);
7575		dirrem->dm_dirinum = pagedep->pd_ino;
7576		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &dirrem->dm_list);
7577	}
7578	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL) {
7579		jwait(&jmvref->jm_list, MNT_WAIT);
7580		return (ERESTART);
7581	}
7582	/*
7583	 * When we're partially truncating a pagedep we just want to flush
7584	 * journal entries and return.  There can not be any adds in the
7585	 * truncated portion of the directory and newblk must remain if
7586	 * part of the block remains.
7587	 */
7588	if (blkoff != 0) {
7589		struct diradd *dap;
7590
7591		LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
7592			if (dap->da_offset > blkoff)
7593				panic("cancel_pagedep: diradd %p off %d > %d",
7594				    dap, dap->da_offset, blkoff);
7595		for (i = 0; i < DAHASHSZ; i++)
7596			LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist)
7597				if (dap->da_offset > blkoff)
7598					panic("cancel_pagedep: diradd %p off %d > %d",
7599					    dap, dap->da_offset, blkoff);
7600		return (0);
7601	}
7602	/*
7603	 * There should be no directory add dependencies present
7604	 * as the directory could not be truncated until all
7605	 * children were removed.
7606	 */
7607	KASSERT(LIST_FIRST(&pagedep->pd_pendinghd) == NULL,
7608	    ("deallocate_dependencies: pendinghd != NULL"));
7609	for (i = 0; i < DAHASHSZ; i++)
7610		KASSERT(LIST_FIRST(&pagedep->pd_diraddhd[i]) == NULL,
7611		    ("deallocate_dependencies: diraddhd != NULL"));
7612	if ((pagedep->pd_state & NEWBLOCK) != 0)
7613		free_newdirblk(pagedep->pd_newdirblk);
7614	if (free_pagedep(pagedep) == 0)
7615		panic("Failed to free pagedep %p", pagedep);
7616	return (0);
7617}
7618
7619/*
7620 * Reclaim any dependency structures from a buffer that is about to
7621 * be reallocated to a new vnode. The buffer must be locked, thus,
7622 * no I/O completion operations can occur while we are manipulating
7623 * its associated dependencies. The mutex is held so that other I/O's
7624 * associated with related dependencies do not occur.
7625 */
7626static int
7627deallocate_dependencies(bp, freeblks, off)
7628	struct buf *bp;
7629	struct freeblks *freeblks;
7630	int off;
7631{
7632	struct indirdep *indirdep;
7633	struct pagedep *pagedep;
7634	struct worklist *wk, *wkn;
7635	struct ufsmount *ump;
7636
7637	ump = softdep_bp_to_mp(bp);
7638	if (ump == NULL)
7639		goto done;
7640	ACQUIRE_LOCK(ump);
7641	LIST_FOREACH_SAFE(wk, &bp->b_dep, wk_list, wkn) {
7642		switch (wk->wk_type) {
7643		case D_INDIRDEP:
7644			indirdep = WK_INDIRDEP(wk);
7645			if (bp->b_lblkno >= 0 ||
7646			    bp->b_blkno != indirdep->ir_savebp->b_lblkno)
7647				panic("deallocate_dependencies: not indir");
7648			cancel_indirdep(indirdep, bp, freeblks);
7649			continue;
7650
7651		case D_PAGEDEP:
7652			pagedep = WK_PAGEDEP(wk);
7653			if (cancel_pagedep(pagedep, freeblks, off)) {
7654				FREE_LOCK(ump);
7655				return (ERESTART);
7656			}
7657			continue;
7658
7659		case D_ALLOCINDIR:
7660			/*
7661			 * Simply remove the allocindir, we'll find it via
7662			 * the indirdep where we can clear pointers if
7663			 * needed.
7664			 */
7665			WORKLIST_REMOVE(wk);
7666			continue;
7667
7668		case D_FREEWORK:
7669			/*
7670			 * A truncation is waiting for the zero'd pointers
7671			 * to be written.  It can be freed when the freeblks
7672			 * is journaled.
7673			 */
7674			WORKLIST_REMOVE(wk);
7675			wk->wk_state |= ONDEPLIST;
7676			WORKLIST_INSERT(&freeblks->fb_freeworkhd, wk);
7677			break;
7678
7679		case D_ALLOCDIRECT:
7680			if (off != 0)
7681				continue;
7682			/* FALLTHROUGH */
7683		default:
7684			panic("deallocate_dependencies: Unexpected type %s",
7685			    TYPENAME(wk->wk_type));
7686			/* NOTREACHED */
7687		}
7688	}
7689	FREE_LOCK(ump);
7690done:
7691	/*
7692	 * Don't throw away this buf; we were partially truncating and
7693	 * some deps may legitimately remain.
7694	 */
7695	if (off) {
7696		allocbuf(bp, off);
7697		bp->b_vflags |= BV_SCANNED;
7698		return (EBUSY);
7699	}
7700	bp->b_flags |= B_INVAL | B_NOCACHE;
7701
7702	return (0);
7703}
7704
7705/*
7706 * An allocdirect is being canceled due to a truncate.  We must make sure
7707 * the journal entry is released in concert with the blkfree that releases
7708 * the storage.  Completed journal entries must not be released until the
7709 * space is no longer pointed to by the inode or in the bitmap.
7710 */
7711static void
7712cancel_allocdirect(adphead, adp, freeblks)
7713	struct allocdirectlst *adphead;
7714	struct allocdirect *adp;
7715	struct freeblks *freeblks;
7716{
7717	struct freework *freework;
7718	struct newblk *newblk;
7719	struct worklist *wk;
7720
7721	TAILQ_REMOVE(adphead, adp, ad_next);
7722	newblk = (struct newblk *)adp;
7723	freework = NULL;
7724	/*
7725	 * Find the correct freework structure.
7726	 */
7727	LIST_FOREACH(wk, &freeblks->fb_freeworkhd, wk_list) {
7728		if (wk->wk_type != D_FREEWORK)
7729			continue;
7730		freework = WK_FREEWORK(wk);
7731		if (freework->fw_blkno == newblk->nb_newblkno)
7732			break;
7733	}
7734	if (freework == NULL)
7735		panic("cancel_allocdirect: Freework not found");
7736	/*
7737	 * If a newblk exists at all we still have the journal entry that
7738	 * initiated the allocation so we do not need to journal the free.
7739	 */
7740	cancel_jfreeblk(freeblks, freework->fw_blkno);
7741	/*
7742	 * If the journal hasn't been written the jnewblk must be passed
7743	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
7744	 * this by linking the journal dependency into the freework to be
7745	 * freed when freework_freeblock() is called.  If the journal has
7746	 * been written we can simply reclaim the journal space when the
7747	 * freeblks work is complete.
7748	 */
7749	freework->fw_jnewblk = cancel_newblk(newblk, &freework->fw_list,
7750	    &freeblks->fb_jwork);
7751	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
7752}
7753
7754/*
7755 * Cancel a new block allocation.  May be an indirect or direct block.  We
7756 * remove it from various lists and return any journal record that needs to
7757 * be resolved by the caller.
7758 *
7759 * A special consideration is made for indirects which were never pointed
7760 * at on disk and will never be found once this block is released.
7761 */
7762static struct jnewblk *
7763cancel_newblk(newblk, wk, wkhd)
7764	struct newblk *newblk;
7765	struct worklist *wk;
7766	struct workhead *wkhd;
7767{
7768	struct jnewblk *jnewblk;
7769
7770	CTR1(KTR_SUJ, "cancel_newblk: blkno %jd", newblk->nb_newblkno);
7771
7772	newblk->nb_state |= GOINGAWAY;
7773	/*
7774	 * Previously we traversed the completedhd on each indirdep
7775	 * attached to this newblk to cancel them and gather journal
7776	 * work.  Since we need only the oldest journal segment and
7777	 * the lowest point on the tree will always have the oldest
7778	 * journal segment we are free to release the segments
7779	 * of any subordinates and may leave the indirdep list to
7780	 * indirdep_complete() when this newblk is freed.
7781	 */
7782	if (newblk->nb_state & ONDEPLIST) {
7783		newblk->nb_state &= ~ONDEPLIST;
7784		LIST_REMOVE(newblk, nb_deps);
7785	}
7786	if (newblk->nb_state & ONWORKLIST)
7787		WORKLIST_REMOVE(&newblk->nb_list);
7788	/*
7789	 * If the journal entry hasn't been written we save a pointer to
7790	 * the dependency that frees it until it is written or the
7791	 * superseding operation completes.
7792	 */
7793	jnewblk = newblk->nb_jnewblk;
7794	if (jnewblk != NULL && wk != NULL) {
7795		newblk->nb_jnewblk = NULL;
7796		jnewblk->jn_dep = wk;
7797	}
7798	if (!LIST_EMPTY(&newblk->nb_jwork))
7799		jwork_move(wkhd, &newblk->nb_jwork);
7800	/*
7801	 * When truncating we must free the newdirblk early to remove
7802	 * the pagedep from the hash before returning.
7803	 */
7804	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7805		free_newdirblk(WK_NEWDIRBLK(wk));
7806	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7807		panic("cancel_newblk: extra newdirblk");
7808
7809	return (jnewblk);
7810}
7811
7812/*
7813 * Schedule the freefrag associated with a newblk to be released once
7814 * the pointers are written and the previous block is no longer needed.
7815 */
7816static void
7817newblk_freefrag(newblk)
7818	struct newblk *newblk;
7819{
7820	struct freefrag *freefrag;
7821
7822	if (newblk->nb_freefrag == NULL)
7823		return;
7824	freefrag = newblk->nb_freefrag;
7825	newblk->nb_freefrag = NULL;
7826	freefrag->ff_state |= COMPLETE;
7827	if ((freefrag->ff_state & ALLCOMPLETE) == ALLCOMPLETE)
7828		add_to_worklist(&freefrag->ff_list, 0);
7829}
7830
7831/*
7832 * Free a newblk. Generate a new freefrag work request if appropriate.
7833 * This must be called after the inode pointer and any direct block pointers
7834 * are valid or fully removed via truncate or frag extension.
7835 */
7836static void
7837free_newblk(newblk)
7838	struct newblk *newblk;
7839{
7840	struct indirdep *indirdep;
7841	struct worklist *wk;
7842
7843	KASSERT(newblk->nb_jnewblk == NULL,
7844	    ("free_newblk: jnewblk %p still attached", newblk->nb_jnewblk));
7845	KASSERT(newblk->nb_list.wk_type != D_NEWBLK,
7846	    ("free_newblk: unclaimed newblk"));
7847	LOCK_OWNED(VFSTOUFS(newblk->nb_list.wk_mp));
7848	newblk_freefrag(newblk);
7849	if (newblk->nb_state & ONDEPLIST)
7850		LIST_REMOVE(newblk, nb_deps);
7851	if (newblk->nb_state & ONWORKLIST)
7852		WORKLIST_REMOVE(&newblk->nb_list);
7853	LIST_REMOVE(newblk, nb_hash);
7854	if ((wk = LIST_FIRST(&newblk->nb_newdirblk)) != NULL)
7855		free_newdirblk(WK_NEWDIRBLK(wk));
7856	if (!LIST_EMPTY(&newblk->nb_newdirblk))
7857		panic("free_newblk: extra newdirblk");
7858	while ((indirdep = LIST_FIRST(&newblk->nb_indirdeps)) != NULL)
7859		indirdep_complete(indirdep);
7860	handle_jwork(&newblk->nb_jwork);
7861	WORKITEM_FREE(newblk, D_NEWBLK);
7862}
7863
7864/*
7865 * Free a newdirblk. Clear the NEWBLOCK flag on its associated pagedep.
7866 */
7867static void
7868free_newdirblk(newdirblk)
7869	struct newdirblk *newdirblk;
7870{
7871	struct pagedep *pagedep;
7872	struct diradd *dap;
7873	struct worklist *wk;
7874
7875	LOCK_OWNED(VFSTOUFS(newdirblk->db_list.wk_mp));
7876	WORKLIST_REMOVE(&newdirblk->db_list);
7877	/*
7878	 * If the pagedep is still linked onto the directory buffer
7879	 * dependency chain, then some of the entries on the
7880	 * pd_pendinghd list may not be committed to disk yet. In
7881	 * this case, we will simply clear the NEWBLOCK flag and
7882	 * let the pd_pendinghd list be processed when the pagedep
7883	 * is next written. If the pagedep is no longer on the buffer
7884	 * dependency chain, then all the entries on the pd_pending
7885	 * list are committed to disk and we can free them here.
7886	 */
7887	pagedep = newdirblk->db_pagedep;
7888	pagedep->pd_state &= ~NEWBLOCK;
7889	if ((pagedep->pd_state & ONWORKLIST) == 0) {
7890		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
7891			free_diradd(dap, NULL);
7892		/*
7893		 * If no dependencies remain, the pagedep will be freed.
7894		 */
7895		free_pagedep(pagedep);
7896	}
7897	/* Should only ever be one item in the list. */
7898	while ((wk = LIST_FIRST(&newdirblk->db_mkdir)) != NULL) {
7899		WORKLIST_REMOVE(wk);
7900		handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
7901	}
7902	WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
7903}
7904
7905/*
7906 * Prepare an inode to be freed. The actual free operation is not
7907 * done until the zero'ed inode has been written to disk.
7908 */
7909void
7910softdep_freefile(pvp, ino, mode)
7911	struct vnode *pvp;
7912	ino_t ino;
7913	int mode;
7914{
7915	struct inode *ip = VTOI(pvp);
7916	struct inodedep *inodedep;
7917	struct freefile *freefile;
7918	struct freeblks *freeblks;
7919	struct ufsmount *ump;
7920
7921	ump = ITOUMP(ip);
7922	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
7923	    ("softdep_freefile called on non-softdep filesystem"));
7924	/*
7925	 * This sets up the inode de-allocation dependency.
7926	 */
7927	freefile = malloc(sizeof(struct freefile),
7928		M_FREEFILE, M_SOFTDEP_FLAGS);
7929	workitem_alloc(&freefile->fx_list, D_FREEFILE, pvp->v_mount);
7930	freefile->fx_mode = mode;
7931	freefile->fx_oldinum = ino;
7932	freefile->fx_devvp = ump->um_devvp;
7933	LIST_INIT(&freefile->fx_jwork);
7934	UFS_LOCK(ump);
7935	ump->um_fs->fs_pendinginodes += 1;
7936	UFS_UNLOCK(ump);
7937
7938	/*
7939	 * If the inodedep does not exist, then the zero'ed inode has
7940	 * been written to disk. If the allocated inode has never been
7941	 * written to disk, then the on-disk inode is zero'ed. In either
7942	 * case we can free the file immediately.  If the journal was
7943	 * canceled before being written the inode will never make it to
7944	 * disk and we must send the canceled journal entries to
7945	 * ffs_freefile() to be cleared in conjunction with the bitmap.
7946	 * Any blocks waiting on the inode to write can be safely freed
7947	 * here as it will never be written.
7948	 */
7949	ACQUIRE_LOCK(ump);
7950	inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7951	if (inodedep) {
7952		/*
7953		 * Clear out freeblks that no longer need to reference
7954		 * this inode.
7955		 */
7956		while ((freeblks =
7957		    TAILQ_FIRST(&inodedep->id_freeblklst)) != NULL) {
7958			TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks,
7959			    fb_next);
7960			freeblks->fb_state &= ~ONDEPLIST;
7961		}
7962		/*
7963		 * Remove this inode from the unlinked list.
7964		 */
7965		if (inodedep->id_state & UNLINKED) {
7966			/*
7967			 * Save the journal work to be freed with the bitmap
7968			 * before we clear UNLINKED.  Otherwise it can be lost
7969			 * if the inode block is written.
7970			 */
7971			handle_bufwait(inodedep, &freefile->fx_jwork);
7972			clear_unlinked_inodedep(inodedep);
7973			/*
7974			 * Re-acquire inodedep as we've dropped the
7975			 * per-filesystem lock in clear_unlinked_inodedep().
7976			 */
7977			inodedep_lookup(pvp->v_mount, ino, 0, &inodedep);
7978		}
7979	}
7980	if (inodedep == NULL || check_inode_unwritten(inodedep)) {
7981		FREE_LOCK(ump);
7982		handle_workitem_freefile(freefile);
7983		return;
7984	}
7985	if ((inodedep->id_state & DEPCOMPLETE) == 0)
7986		inodedep->id_state |= GOINGAWAY;
7987	WORKLIST_INSERT(&inodedep->id_inowait, &freefile->fx_list);
7988	FREE_LOCK(ump);
7989	if (ip->i_number == ino)
7990		UFS_INODE_SET_FLAG(ip, IN_MODIFIED);
7991}
7992
7993/*
7994 * Check to see if an inode has never been written to disk. If
7995 * so free the inodedep and return success, otherwise return failure.
7996 *
7997 * If we still have a bitmap dependency, then the inode has never
7998 * been written to disk. Drop the dependency as it is no longer
7999 * necessary since the inode is being deallocated. We set the
8000 * ALLCOMPLETE flags since the bitmap now properly shows that the
8001 * inode is not allocated. Even if the inode is actively being
8002 * written, it has been rolled back to its zero'ed state, so we
8003	 * are assured that a zero inode is what is on the disk. For short
8004 * lived files, this change will usually result in removing all the
8005 * dependencies from the inode so that it can be freed immediately.
8006 */
8007static int
8008check_inode_unwritten(inodedep)
8009	struct inodedep *inodedep;
8010{
8011
8012	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
8013
8014	if ((inodedep->id_state & (DEPCOMPLETE | UNLINKED)) != 0 ||
8015	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
8016	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
8017	    !LIST_EMPTY(&inodedep->id_bufwait) ||
8018	    !LIST_EMPTY(&inodedep->id_inowait) ||
8019	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
8020	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
8021	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
8022	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
8023	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
8024	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
8025	    inodedep->id_mkdiradd != NULL ||
8026	    inodedep->id_nlinkdelta != 0)
8027		return (0);
8028	/*
8029	 * Another process might be in initiate_write_inodeblock_ufs[12]
8030	 * trying to allocate memory without holding "Softdep Lock".
8031	 */
8032	if ((inodedep->id_state & IOSTARTED) != 0 &&
8033	    inodedep->id_savedino1 == NULL)
8034		return (0);
8035
8036	if (inodedep->id_state & ONDEPLIST)
8037		LIST_REMOVE(inodedep, id_deps);
8038	inodedep->id_state &= ~ONDEPLIST;
8039	inodedep->id_state |= ALLCOMPLETE;
8040	inodedep->id_bmsafemap = NULL;
8041	if (inodedep->id_state & ONWORKLIST)
8042		WORKLIST_REMOVE(&inodedep->id_list);
8043	if (inodedep->id_savedino1 != NULL) {
8044		free(inodedep->id_savedino1, M_SAVEDINO);
8045		inodedep->id_savedino1 = NULL;
8046	}
8047	if (free_inodedep(inodedep) == 0)
8048		panic("check_inode_unwritten: busy inode");
8049	return (1);
8050}
8051
8052static int
8053check_inodedep_free(inodedep)
8054	struct inodedep *inodedep;
8055{
8056
8057	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
8058	if ((inodedep->id_state & ALLCOMPLETE) != ALLCOMPLETE ||
8059	    !LIST_EMPTY(&inodedep->id_dirremhd) ||
8060	    !LIST_EMPTY(&inodedep->id_pendinghd) ||
8061	    !LIST_EMPTY(&inodedep->id_bufwait) ||
8062	    !LIST_EMPTY(&inodedep->id_inowait) ||
8063	    !TAILQ_EMPTY(&inodedep->id_inoreflst) ||
8064	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
8065	    !TAILQ_EMPTY(&inodedep->id_newinoupdt) ||
8066	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
8067	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
8068	    !TAILQ_EMPTY(&inodedep->id_freeblklst) ||
8069	    inodedep->id_mkdiradd != NULL ||
8070	    inodedep->id_nlinkdelta != 0 ||
8071	    inodedep->id_savedino1 != NULL)
8072		return (0);
8073	return (1);
8074}
8075
8076/*
8077 * Try to free an inodedep structure. Return 1 if it could be freed.
8078 */
8079static int
8080free_inodedep(inodedep)
8081	struct inodedep *inodedep;
8082{
8083
8084	LOCK_OWNED(VFSTOUFS(inodedep->id_list.wk_mp));
8085	if ((inodedep->id_state & (ONWORKLIST | UNLINKED)) != 0 ||
8086	    !check_inodedep_free(inodedep))
8087		return (0);
8088	if (inodedep->id_state & ONDEPLIST)
8089		LIST_REMOVE(inodedep, id_deps);
8090	LIST_REMOVE(inodedep, id_hash);
8091	WORKITEM_FREE(inodedep, D_INODEDEP);
8092	return (1);
8093}
8094
8095/*
8096 * Free the block referenced by a freework structure.  The parent freeblks
8097 * structure is released and completed when the final cg bitmap reaches
8098 * the disk.  This routine may be freeing a jnewblk which never made it to
8099 * disk in which case we do not have to wait as the operation is undone
8100 * in memory immediately.
8101 */
8102static void
8103freework_freeblock(freework, key)
8104	struct freework *freework;
8105	u_long key;
8106{
8107	struct freeblks *freeblks;
8108	struct jnewblk *jnewblk;
8109	struct ufsmount *ump;
8110	struct workhead wkhd;
8111	struct fs *fs;
8112	int bsize;
8113	int needj;
8114
8115	ump = VFSTOUFS(freework->fw_list.wk_mp);
8116	LOCK_OWNED(ump);
8117	/*
8118	 * Handle partial truncate separately.
8119	 */
8120	if (freework->fw_indir) {
8121		complete_trunc_indir(freework);
8122		return;
8123	}
8124	freeblks = freework->fw_freeblks;
8125	fs = ump->um_fs;
8126	needj = MOUNTEDSUJ(freeblks->fb_list.wk_mp) != 0;
8127	bsize = lfragtosize(fs, freework->fw_frags);
8128	LIST_INIT(&wkhd);
8129	/*
8130	 * DEPCOMPLETE is cleared in indirblk_insert() if the block lives
8131	 * on the indirblk hashtable and prevents premature freeing.
8132	 */
8133	freework->fw_state |= DEPCOMPLETE;
8134	/*
8135	 * SUJ needs to wait for the segment referencing freed indirect
8136	 * blocks to expire so that we know the checker will not confuse
8137	 * a re-allocated indirect block with its old contents.
8138	 */
8139	if (needj && freework->fw_lbn <= -UFS_NDADDR)
8140		indirblk_insert(freework);
8141	/*
8142	 * If we are canceling an existing jnewblk pass it to the free
8143	 * routine, otherwise pass the freeblk which will ultimately
8144	 * release the freeblks.  If we're not journaling, we can just
8145	 * free the freeblks immediately.
8146	 */
8147	jnewblk = freework->fw_jnewblk;
8148	if (jnewblk != NULL) {
8149		cancel_jnewblk(jnewblk, &wkhd);
8150		needj = 0;
8151	} else if (needj) {
8152		freework->fw_state |= DELAYEDFREE;
8153		freeblks->fb_cgwait++;
8154		WORKLIST_INSERT(&wkhd, &freework->fw_list);
8155	}
8156	FREE_LOCK(ump);
8157	freeblks_free(ump, freeblks, btodb(bsize));
8158	CTR4(KTR_SUJ,
8159	    "freework_freeblock: ino %jd blkno %jd lbn %jd size %d",
8160	    freeblks->fb_inum, freework->fw_blkno, freework->fw_lbn, bsize);
8161	ffs_blkfree(ump, fs, freeblks->fb_devvp, freework->fw_blkno, bsize,
8162	    freeblks->fb_inum, freeblks->fb_vtype, &wkhd, key);
8163	ACQUIRE_LOCK(ump);
8164	/*
8165	 * The jnewblk will be discarded and the bits in the map never
8166	 * made it to disk.  We can immediately free the freeblk.
8167	 */
8168	if (needj == 0)
8169		handle_written_freework(freework);
8170}
8171
8172/*
8173 * We enqueue freework items that need processing back on the freeblks and
8174 * add the freeblks to the worklist.  This makes it easier to find all work
8175 * required to flush a truncation in process_truncates().
8176 */
8177static void
8178freework_enqueue(freework)
8179	struct freework *freework;
8180{
8181	struct freeblks *freeblks;
8182
8183	freeblks = freework->fw_freeblks;
8184	if ((freework->fw_state & INPROGRESS) == 0)
8185		WORKLIST_INSERT(&freeblks->fb_freeworkhd, &freework->fw_list);
8186	if ((freeblks->fb_state &
8187	    (ONWORKLIST | INPROGRESS | ALLCOMPLETE)) == ALLCOMPLETE &&
8188	    LIST_EMPTY(&freeblks->fb_jblkdephd))
8189		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
8190}
8191
8192/*
8193 * Start, continue, or finish the process of freeing an indirect block tree.
8194 * The free operation may be paused at any point with fw_off containing the
8195 * offset to restart from.  This enables us to implement some flow control
8196 * for large truncates which may fan out and generate a huge number of
8197 * dependencies.
8198 */
8199static void
8200handle_workitem_indirblk(freework)
8201	struct freework *freework;
8202{
8203	struct freeblks *freeblks;
8204	struct ufsmount *ump;
8205	struct fs *fs;
8206
8207	freeblks = freework->fw_freeblks;
8208	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8209	fs = ump->um_fs;
8210	if (freework->fw_state & DEPCOMPLETE) {
8211		handle_written_freework(freework);
8212		return;
8213	}
8214	if (freework->fw_off == NINDIR(fs)) {
8215		freework_freeblock(freework, SINGLETON_KEY);
8216		return;
8217	}
8218	freework->fw_state |= INPROGRESS;
8219	FREE_LOCK(ump);
8220	indir_trunc(freework, fsbtodb(fs, freework->fw_blkno),
8221	    freework->fw_lbn);
8222	ACQUIRE_LOCK(ump);
8223}
8224
8225/*
8226 * Called when a freework structure attached to a cg buf is written.  The
8227 * ref on either the parent or the freeblks structure is released and
8228 * the freeblks is added back to the worklist if there is more work to do.
8229 */
8230static void
8231handle_written_freework(freework)
8232	struct freework *freework;
8233{
8234	struct freeblks *freeblks;
8235	struct freework *parent;
8236
8237	freeblks = freework->fw_freeblks;
8238	parent = freework->fw_parent;
8239	if (freework->fw_state & DELAYEDFREE)
8240		freeblks->fb_cgwait--;
8241	freework->fw_state |= COMPLETE;
8242	if ((freework->fw_state & ALLCOMPLETE) == ALLCOMPLETE)
8243		WORKITEM_FREE(freework, D_FREEWORK);
8244	if (parent) {
8245		if (--parent->fw_ref == 0)
8246			freework_enqueue(parent);
8247		return;
8248	}
8249	if (--freeblks->fb_ref != 0)
8250		return;
8251	if ((freeblks->fb_state & (ALLCOMPLETE | ONWORKLIST | INPROGRESS)) ==
8252	    ALLCOMPLETE && LIST_EMPTY(&freeblks->fb_jblkdephd))
8253		add_to_worklist(&freeblks->fb_list, WK_NODELAY);
8254}
8255
8256/*
8257 * This workitem routine performs the block de-allocation.
8258 * The workitem is added to the pending list after the updated
8259 * inode block has been written to disk.  As mentioned above,
8260 * checks regarding the number of blocks de-allocated (compared
8261 * to the number of blocks allocated for the file) are also
8262 * performed in this function.
8263 */
8264static int
8265handle_workitem_freeblocks(freeblks, flags)
8266	struct freeblks *freeblks;
8267	int flags;
8268{
8269	struct freework *freework;
8270	struct newblk *newblk;
8271	struct allocindir *aip;
8272	struct ufsmount *ump;
8273	struct worklist *wk;
8274	u_long key;
8275
8276	KASSERT(LIST_EMPTY(&freeblks->fb_jblkdephd),
8277	    ("handle_workitem_freeblocks: Journal entries not written."));
8278	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8279	key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
8280	ACQUIRE_LOCK(ump);
8281	while ((wk = LIST_FIRST(&freeblks->fb_freeworkhd)) != NULL) {
8282		WORKLIST_REMOVE(wk);
8283		switch (wk->wk_type) {
8284		case D_DIRREM:
8285			wk->wk_state |= COMPLETE;
8286			add_to_worklist(wk, 0);
8287			continue;
8288
8289		case D_ALLOCDIRECT:
8290			free_newblk(WK_NEWBLK(wk));
8291			continue;
8292
8293		case D_ALLOCINDIR:
8294			aip = WK_ALLOCINDIR(wk);
8295			freework = NULL;
8296			if (aip->ai_state & DELAYEDFREE) {
8297				FREE_LOCK(ump);
8298				freework = newfreework(ump, freeblks, NULL,
8299				    aip->ai_lbn, aip->ai_newblkno,
8300				    ump->um_fs->fs_frag, 0, 0);
8301				ACQUIRE_LOCK(ump);
8302			}
8303			newblk = WK_NEWBLK(wk);
8304			if (newblk->nb_jnewblk) {
8305				freework->fw_jnewblk = newblk->nb_jnewblk;
8306				newblk->nb_jnewblk->jn_dep = &freework->fw_list;
8307				newblk->nb_jnewblk = NULL;
8308			}
8309			free_newblk(newblk);
8310			continue;
8311
8312		case D_FREEWORK:
8313			freework = WK_FREEWORK(wk);
8314			if (freework->fw_lbn <= -UFS_NDADDR)
8315				handle_workitem_indirblk(freework);
8316			else
8317				freework_freeblock(freework, key);
8318			continue;
8319		default:
8320			panic("handle_workitem_freeblocks: Unknown type %s",
8321			    TYPENAME(wk->wk_type));
8322		}
8323	}
8324	if (freeblks->fb_ref != 0) {
8325		freeblks->fb_state &= ~INPROGRESS;
8326		wake_worklist(&freeblks->fb_list);
8327		freeblks = NULL;
8328	}
8329	FREE_LOCK(ump);
8330	ffs_blkrelease_finish(ump, key);
8331	if (freeblks)
8332		return handle_complete_freeblocks(freeblks, flags);
8333	return (0);
8334}
8335
8336/*
8337 * Handle completion of block free via truncate.  This allows fs_pending
8338 * to track the actual free block count more closely than if we only updated
8339 * it at the end.  We must be careful to handle cases where the block count
8340 * on free was incorrect.
8341 */
8342static void
8343freeblks_free(ump, freeblks, blocks)
8344	struct ufsmount *ump;
8345	struct freeblks *freeblks;
8346	int blocks;
8347{
8348	struct fs *fs;
8349	ufs2_daddr_t remain;
8350
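	/*
	 * fb_chkcnt holds the negative of the block count still expected
	 * to be freed.  Credit this free against it and trim
	 * fs_pendingblocks, clamping so it is not reduced by more than
	 * the amount still pending.
	 */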
8351	UFS_LOCK(ump);
8352	remain = -freeblks->fb_chkcnt;
8353	freeblks->fb_chkcnt += blocks;
8354	if (remain > 0) {
8355		if (remain < blocks)
8356			blocks = remain;
8357		fs = ump->um_fs;
8358		fs->fs_pendingblocks -= blocks;
8359	}
8360	UFS_UNLOCK(ump);
8361}
8362
8363/*
8364 * Once all of the freework workitems are complete we can retire the
8365 * freeblocks dependency and any journal work awaiting completion.  This
8366 * can not be called until all other dependencies are stable on disk.
8367 */
8368static int
8369handle_complete_freeblocks(freeblks, flags)
8370	struct freeblks *freeblks;
8371	int flags;
8372{
8373	struct inodedep *inodedep;
8374	struct inode *ip;
8375	struct vnode *vp;
8376	struct fs *fs;
8377	struct ufsmount *ump;
8378	ufs2_daddr_t spare;
8379
8380	ump = VFSTOUFS(freeblks->fb_list.wk_mp);
8381	fs = ump->um_fs;
8382	flags = LK_EXCLUSIVE | flags;
8383	spare = freeblks->fb_chkcnt;
8384
8385	/*
8386	 * If we did not release the expected number of blocks we may have
8387	 * to adjust the inode block count here.  Only do so if it wasn't
8388	 * a truncation to zero and the modrev still matches.
8389	 */
8390	if (spare && freeblks->fb_len != 0) {
8391		if (ffs_vgetf(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8392		    flags, &vp, FFSV_FORCEINSMQ) != 0)
8393			return (EBUSY);
8394		ip = VTOI(vp);
8395		if (ip->i_mode == 0) {
8396			vgone(vp);
8397		} else if (DIP(ip, i_modrev) == freeblks->fb_modrev) {
8398			DIP_SET(ip, i_blocks, DIP(ip, i_blocks) - spare);
8399			UFS_INODE_SET_FLAG(ip, IN_CHANGE);
8400			/*
8401			 * We must wait so this happens before the
8402			 * journal is reclaimed.
8403			 */
8404			ffs_update(vp, 1);
8405		}
8406		vput(vp);
8407	}
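	/*
	 * If fewer blocks were freed than were charged as pending,
	 * remove the leftover charge from fs_pendingblocks.
	 */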
8408	if (spare < 0) {
8409		UFS_LOCK(ump);
8410		fs->fs_pendingblocks += spare;
8411		UFS_UNLOCK(ump);
8412	}
8413#ifdef QUOTA
8414	/* Adjust the quotas by any block count discrepancy. */
8415	if (spare)
8416		quotaadj(freeblks->fb_quota, ump, -spare);
8417	quotarele(freeblks->fb_quota);
8418#endif
8419	ACQUIRE_LOCK(ump);
8420	if (freeblks->fb_state & ONDEPLIST) {
8421		inodedep_lookup(freeblks->fb_list.wk_mp, freeblks->fb_inum,
8422		    0, &inodedep);
8423		TAILQ_REMOVE(&inodedep->id_freeblklst, freeblks, fb_next);
8424		freeblks->fb_state &= ~ONDEPLIST;
8425		if (TAILQ_EMPTY(&inodedep->id_freeblklst))
8426			free_inodedep(inodedep);
8427	}
8428	/*
8429	 * All of the freeblock deps must be complete prior to this call
8430	 * so it's now safe to complete earlier outstanding journal entries.
8431	 */
8432	handle_jwork(&freeblks->fb_jwork);
8433	WORKITEM_FREE(freeblks, D_FREEBLKS);
8434	FREE_LOCK(ump);
8435	return (0);
8436}
8437
8438/*
8439 * Release blocks associated with the freeblks and stored in the indirect
8440 * block dbn. If level is greater than SINGLE, the block is an indirect block
8441 * and recursive calls to indir_trunc must be used to cleanse other indirect
8442 * blocks.
8443 *
8444 * This handles partial and complete truncation of blocks.  Partial is noted
8445 * with goingaway == 0.  In this case the freework is completed after the
8446 * zero'd indirects are written to disk.  For full truncation the freework
8447 * is completed after the block is freed.
8448 */
8449static void
8450indir_trunc(freework, dbn, lbn)
8451	struct freework *freework;
8452	ufs2_daddr_t dbn;
8453	ufs_lbn_t lbn;
8454{
8455	struct freework *nfreework;
8456	struct workhead wkhd;
8457	struct freeblks *freeblks;
8458	struct buf *bp;
8459	struct fs *fs;
8460	struct indirdep *indirdep;
8461	struct mount *mp;
8462	struct ufsmount *ump;
8463	ufs1_daddr_t *bap1;
8464	ufs2_daddr_t nb, nnb, *bap2;
8465	ufs_lbn_t lbnadd, nlbn;
8466	u_long key;
8467	int nblocks, ufs1fmt, freedblocks;
8468	int goingaway, freedeps, needj, level, cnt, i, error;
8469
8470	freeblks = freework->fw_freeblks;
8471	mp = freeblks->fb_list.wk_mp;
8472	ump = VFSTOUFS(mp);
8473	fs = ump->um_fs;
8474	/*
8475	 * Get buffer of block pointers to be freed.  There are three cases:
8476	 *
8477	 * 1) Partial truncate caches the indirdep pointer in the freework
8478	 *    which provides us a link back to the saved bp that holds the
8479	 *    pointers we want to clear.  When this completes the zero
8480	 *    pointers are written to the real copy.
8481	 * 2) The indirect is being completely truncated, cancel_indirdep()
8482	 *    eliminated the real copy and placed the indirdep on the saved
8483	 *    copy.  The indirdep and buf are discarded when this completes.
8484	 * 3) The indirect was not in memory, we read a copy off of the disk
8485	 *    using the devvp and drop and invalidate the buffer when we're
8486	 *    done.
8487	 */
8488	goingaway = 1;
8489	indirdep = NULL;
8490	if (freework->fw_indir != NULL) {
8491		goingaway = 0;
8492		indirdep = freework->fw_indir;
8493		bp = indirdep->ir_savebp;
8494		if (bp == NULL || bp->b_blkno != dbn)
8495			panic("indir_trunc: Bad saved buf %p blkno %jd",
8496			    bp, (intmax_t)dbn);
8497	} else if ((bp = incore(&freeblks->fb_devvp->v_bufobj, dbn)) != NULL) {
8498		/*
8499		 * The lock prevents the buf dep list from changing and
8500		 * indirects on devvp should only ever have one dependency.
8501		 */
8502		indirdep = WK_INDIRDEP(LIST_FIRST(&bp->b_dep));
8503		if (indirdep == NULL || (indirdep->ir_state & GOINGAWAY) == 0)
8504			panic("indir_trunc: Bad indirdep %p from buf %p",
8505			    indirdep, bp);
8506	} else {
8507		error = ffs_breadz(ump, freeblks->fb_devvp, dbn, dbn,
8508		    (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL, &bp);
8509		if (error)
8510			return;
8511	}
8512	ACQUIRE_LOCK(ump);
8513	/* Protects against a race with complete_trunc_indir(). */
8514	freework->fw_state &= ~INPROGRESS;
8515	/*
8516	 * If we have an indirdep we need to enforce the truncation order
8517	 * and discard it when it is complete.
8518	 */
8519	if (indirdep) {
8520		if (freework != TAILQ_FIRST(&indirdep->ir_trunc) &&
8521		    !TAILQ_EMPTY(&indirdep->ir_trunc)) {
8522			/*
8523			 * Add the complete truncate to the list on the
8524			 * indirdep to enforce in-order processing.
8525			 */
8526			if (freework->fw_indir == NULL)
8527				TAILQ_INSERT_TAIL(&indirdep->ir_trunc,
8528				    freework, fw_next);
8529			FREE_LOCK(ump);
8530			return;
8531		}
8532		/*
8533		 * If we're goingaway, free the indirdep.  Otherwise it will
8534		 * linger until the write completes.
8535		 */
8536		if (goingaway) {
8537			KASSERT(indirdep->ir_savebp == bp,
8538			    ("indir_trunc: losing ir_savebp %p",
8539			    indirdep->ir_savebp));
8540			indirdep->ir_savebp = NULL;
8541			free_indirdep(indirdep);
8542		}
8543	}
8544	FREE_LOCK(ump);
8545	/* Initialize pointers depending on the block pointer size (UFS1 vs UFS2). */
8546	if (ump->um_fstype == UFS1) {
8547		bap1 = (ufs1_daddr_t *)bp->b_data;
8548		nb = bap1[freework->fw_off];
8549		ufs1fmt = 1;
8550		bap2 = NULL;
8551	} else {
8552		bap2 = (ufs2_daddr_t *)bp->b_data;
8553		nb = bap2[freework->fw_off];
8554		ufs1fmt = 0;
8555		bap1 = NULL;
8556	}
8557	level = lbn_level(lbn);
8558	needj = MOUNTEDSUJ(UFSTOVFS(ump)) != 0;
8559	lbnadd = lbn_offset(fs, level);
8560	nblocks = btodb(fs->fs_bsize);
8561	nfreework = freework;
8562	freedeps = 0;
8563	cnt = 0;
8564	/*
8565	 * Reclaim blocks.  Traverses into nested indirect levels and
8566	 * arranges for the current level to be freed when subordinates
8567	 * are free when journaling.
8568	 */
8569	key = ffs_blkrelease_start(ump, freeblks->fb_devvp, freeblks->fb_inum);
8570	for (i = freework->fw_off; i < NINDIR(fs); i++, nb = nnb) {
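		/* Treat block pointers that fail validation as holes. */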
8571		if (UFS_CHECK_BLKNO(mp, freeblks->fb_inum, nb,
8572		    fs->fs_bsize) != 0)
8573			nb = 0;
8574		if (i != NINDIR(fs) - 1) {
8575			if (ufs1fmt)
8576				nnb = bap1[i+1];
8577			else
8578				nnb = bap2[i+1];
8579		} else
8580			nnb = 0;
8581		if (nb == 0)
8582			continue;
8583		cnt++;
8584		if (level != 0) {
8585			nlbn = (lbn + 1) - (i * lbnadd);
8586			if (needj != 0) {
8587				nfreework = newfreework(ump, freeblks, freework,
8588				    nlbn, nb, fs->fs_frag, 0, 0);
8589				freedeps++;
8590			}
8591			indir_trunc(nfreework, fsbtodb(fs, nb), nlbn);
8592		} else {
8593			struct freedep *freedep;
8594
8595			/*
8596			 * Attempt to aggregate freedep dependencies for
8597			 * all blocks being released to the same CG.
8598			 */
8599			LIST_INIT(&wkhd);
8600			if (needj != 0 &&
8601			    (nnb == 0 || (dtog(fs, nb) != dtog(fs, nnb)))) {
8602				freedep = newfreedep(freework);
8603				WORKLIST_INSERT_UNLOCKED(&wkhd,
8604				    &freedep->fd_list);
8605				freedeps++;
8606			}
8607			CTR3(KTR_SUJ,
8608			    "indir_trunc: ino %jd blkno %jd size %d",
8609			    freeblks->fb_inum, nb, fs->fs_bsize);
8610			ffs_blkfree(ump, fs, freeblks->fb_devvp, nb,
8611			    fs->fs_bsize, freeblks->fb_inum,
8612			    freeblks->fb_vtype, &wkhd, key);
8613		}
8614	}
8615	ffs_blkrelease_finish(ump, key);
8616	if (goingaway) {
8617		bp->b_flags |= B_INVAL | B_NOCACHE;
8618		brelse(bp);
8619	}
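	/*
	 * Account for blocks released so far: the data blocks freed in
	 * the loop above when this is a first-level indirect and, when
	 * not journaling, the indirect block itself freed below.
	 */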
8620	freedblocks = 0;
8621	if (level == 0)
8622		freedblocks = (nblocks * cnt);
8623	if (needj == 0)
8624		freedblocks += nblocks;
8625	freeblks_free(ump, freeblks, freedblocks);
8626	/*
8627	 * If we are journaling set up the ref counts and offset so this
8628	 * indirect can be completed when its children are free.
8629	 */
8630	if (needj) {
8631		ACQUIRE_LOCK(ump);
8632		freework->fw_off = i;
8633		freework->fw_ref += freedeps;
8634		freework->fw_ref -= NINDIR(fs) + 1;
8635		if (level == 0)
8636			freeblks->fb_cgwait += freedeps;
8637		if (freework->fw_ref == 0)
8638			freework_freeblock(freework, SINGLETON_KEY);
8639		FREE_LOCK(ump);
8640		return;
8641	}
8642	/*
8643	 * If we're not journaling we can free the indirect now.
8644	 */
8645	dbn = dbtofsb(fs, dbn);
8646	CTR3(KTR_SUJ,
8647	    "indir_trunc 2: ino %jd blkno %jd size %d",
8648	    freeblks->fb_inum, dbn, fs->fs_bsize);
8649	ffs_blkfree(ump, fs, freeblks->fb_devvp, dbn, fs->fs_bsize,
8650	    freeblks->fb_inum, freeblks->fb_vtype, NULL, SINGLETON_KEY);
	/* Non-SUJ softdep does single-threaded truncations. */
8652	if (freework->fw_blkno == dbn) {
8653		freework->fw_state |= ALLCOMPLETE;
8654		ACQUIRE_LOCK(ump);
8655		handle_written_freework(freework);
8656		FREE_LOCK(ump);
8657	}
8658	return;
8659}
8660
8661/*
8662 * Cancel an allocindir when it is removed via truncation.  When bp is not
8663 * NULL the indirect never appeared on disk and is scheduled to be freed
8664 * independently of the indir so we can more easily track journal work.
8665 */
8666static void
8667cancel_allocindir(aip, bp, freeblks, trunc)
8668	struct allocindir *aip;
8669	struct buf *bp;
8670	struct freeblks *freeblks;
8671	int trunc;
8672{
8673	struct indirdep *indirdep;
8674	struct freefrag *freefrag;
8675	struct newblk *newblk;
8676
8677	newblk = (struct newblk *)aip;
8678	LIST_REMOVE(aip, ai_next);
8679	/*
8680	 * We must eliminate the pointer in bp if it must be freed on its
8681	 * own due to partial truncate or pending journal work.
8682	 */
8683	if (bp && (trunc || newblk->nb_jnewblk)) {
8684		/*
8685		 * Clear the pointer and mark the aip to be freed
8686		 * directly if it never existed on disk.
8687		 */
8688		aip->ai_state |= DELAYEDFREE;
8689		indirdep = aip->ai_indirdep;
8690		if (indirdep->ir_state & UFS1FMT)
8691			((ufs1_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8692		else
8693			((ufs2_daddr_t *)bp->b_data)[aip->ai_offset] = 0;
8694	}
	/*
	 * When truncating, the previous pointer will be freed via
	 * savedbp.  Eliminate the freefrag, which would duplicate the free.
	 */
8699	if (trunc && (freefrag = newblk->nb_freefrag) != NULL) {
8700		newblk->nb_freefrag = NULL;
8701		if (freefrag->ff_jdep)
8702			cancel_jfreefrag(
8703			    WK_JFREEFRAG(freefrag->ff_jdep));
8704		jwork_move(&freeblks->fb_jwork, &freefrag->ff_jwork);
8705		WORKITEM_FREE(freefrag, D_FREEFRAG);
8706	}
8707	/*
8708	 * If the journal hasn't been written the jnewblk must be passed
8709	 * to the call to ffs_blkfree that reclaims the space.  We accomplish
8710	 * this by leaving the journal dependency on the newblk to be freed
8711	 * when a freework is created in handle_workitem_freeblocks().
8712	 */
8713	cancel_newblk(newblk, NULL, &freeblks->fb_jwork);
8714	WORKLIST_INSERT(&freeblks->fb_freeworkhd, &newblk->nb_list);
8715}
8716
8717/*
8718 * Create the mkdir dependencies for . and .. in a new directory.  Link them
 * into a newdirblk so any subsequent additions are tracked properly.  The
8720 * caller is responsible for adding the mkdir1 dependency to the journal
8721 * and updating id_mkdiradd.  This function returns with the per-filesystem
8722 * lock held.
8723 */
8724static struct mkdir *
8725setup_newdir(dap, newinum, dinum, newdirbp, mkdirp)
8726	struct diradd *dap;
8727	ino_t newinum;
8728	ino_t dinum;
8729	struct buf *newdirbp;
8730	struct mkdir **mkdirp;
8731{
8732	struct newblk *newblk;
8733	struct pagedep *pagedep;
8734	struct inodedep *inodedep;
8735	struct newdirblk *newdirblk;
8736	struct mkdir *mkdir1, *mkdir2;
8737	struct worklist *wk;
8738	struct jaddref *jaddref;
8739	struct ufsmount *ump;
8740	struct mount *mp;
8741
8742	mp = dap->da_list.wk_mp;
8743	ump = VFSTOUFS(mp);
8744	newdirblk = malloc(sizeof(struct newdirblk), M_NEWDIRBLK,
8745	    M_SOFTDEP_FLAGS);
8746	workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8747	LIST_INIT(&newdirblk->db_mkdir);
8748	mkdir1 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8749	workitem_alloc(&mkdir1->md_list, D_MKDIR, mp);
8750	mkdir1->md_state = ATTACHED | MKDIR_BODY;
8751	mkdir1->md_diradd = dap;
8752	mkdir1->md_jaddref = NULL;
8753	mkdir2 = malloc(sizeof(struct mkdir), M_MKDIR, M_SOFTDEP_FLAGS);
8754	workitem_alloc(&mkdir2->md_list, D_MKDIR, mp);
8755	mkdir2->md_state = ATTACHED | MKDIR_PARENT;
8756	mkdir2->md_diradd = dap;
8757	mkdir2->md_jaddref = NULL;
8758	if (MOUNTEDSUJ(mp) == 0) {
8759		mkdir1->md_state |= DEPCOMPLETE;
8760		mkdir2->md_state |= DEPCOMPLETE;
8761	}
8762	/*
8763	 * Dependency on "." and ".." being written to disk.
8764	 */
8765	mkdir1->md_buf = newdirbp;
8766	ACQUIRE_LOCK(VFSTOUFS(mp));
8767	LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir1, md_mkdirs);
8768	/*
8769	 * We must link the pagedep, allocdirect, and newdirblk for
8770	 * the initial file page so the pointer to the new directory
8771	 * is not written until the directory contents are live and
8772	 * any subsequent additions are not marked live until the
8773	 * block is reachable via the inode.
8774	 */
8775	if (pagedep_lookup(mp, newdirbp, newinum, 0, 0, &pagedep) == 0)
8776		panic("setup_newdir: lost pagedep");
8777	LIST_FOREACH(wk, &newdirbp->b_dep, wk_list)
8778		if (wk->wk_type == D_ALLOCDIRECT)
8779			break;
8780	if (wk == NULL)
8781		panic("setup_newdir: lost allocdirect");
8782	if (pagedep->pd_state & NEWBLOCK)
8783		panic("setup_newdir: NEWBLOCK already set");
8784	newblk = WK_NEWBLK(wk);
8785	pagedep->pd_state |= NEWBLOCK;
8786	pagedep->pd_newdirblk = newdirblk;
8787	newdirblk->db_pagedep = pagedep;
8788	WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8789	WORKLIST_INSERT(&newdirblk->db_mkdir, &mkdir1->md_list);
8790	/*
8791	 * Look up the inodedep for the parent directory so that we
8792	 * can link mkdir2 into the pending dotdot jaddref or
8793	 * the inode write if there is none.  If the inode is
8794	 * ALLCOMPLETE and no jaddref is present all dependencies have
8795	 * been satisfied and mkdir2 can be freed.
8796	 */
8797	inodedep_lookup(mp, dinum, 0, &inodedep);
8798	if (MOUNTEDSUJ(mp)) {
8799		if (inodedep == NULL)
8800			panic("setup_newdir: Lost parent.");
8801		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8802		    inoreflst);
8803		KASSERT(jaddref != NULL && jaddref->ja_parent == newinum &&
8804		    (jaddref->ja_state & MKDIR_PARENT),
8805		    ("setup_newdir: bad dotdot jaddref %p", jaddref));
8806		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8807		mkdir2->md_jaddref = jaddref;
8808		jaddref->ja_mkdir = mkdir2;
8809	} else if (inodedep == NULL ||
8810	    (inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
8811		dap->da_state &= ~MKDIR_PARENT;
8812		WORKITEM_FREE(mkdir2, D_MKDIR);
8813		mkdir2 = NULL;
8814	} else {
8815		LIST_INSERT_HEAD(&ump->softdep_mkdirlisthd, mkdir2, md_mkdirs);
8816		WORKLIST_INSERT(&inodedep->id_bufwait, &mkdir2->md_list);
8817	}
8818	*mkdirp = mkdir2;
8819
8820	return (mkdir1);
8821}
8822
8823/*
8824 * Directory entry addition dependencies.
8825 *
8826 * When adding a new directory entry, the inode (with its incremented link
8827 * count) must be written to disk before the directory entry's pointer to it.
8828 * Also, if the inode is newly allocated, the corresponding freemap must be
8829 * updated (on disk) before the directory entry's pointer. These requirements
8830 * are met via undo/redo on the directory entry's pointer, which consists
8831 * simply of the inode number.
8832 *
8833 * As directory entries are added and deleted, the free space within a
8834 * directory block can become fragmented.  The ufs filesystem will compact
8835 * a fragmented directory block to make space for a new entry. When this
8836 * occurs, the offsets of previously added entries change. Any "diradd"
8837 * dependency structures corresponding to these entries must be updated with
8838 * the new offsets.
8839 */
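
/*
 * In outline, the rollback works as follows: while the new inode (and,
 * for a newly allocated inode, its freemap update) is not yet stable on
 * disk, any write of the directory block temporarily clears the entry's
 * inode number back to zero and restores it once the write completes, so
 * an on-disk entry never points at an inode that has not been written.
 */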
8840
8841/*
8842 * This routine is called after the in-memory inode's link
8843 * count has been incremented, but before the directory entry's
8844 * pointer to the inode has been set.
8845 */
8846int
8847softdep_setup_directory_add(bp, dp, diroffset, newinum, newdirbp, isnewblk)
8848	struct buf *bp;		/* buffer containing directory block */
8849	struct inode *dp;	/* inode for directory */
8850	off_t diroffset;	/* offset of new entry in directory */
8851	ino_t newinum;		/* inode referenced by new directory entry */
8852	struct buf *newdirbp;	/* non-NULL => contents of new mkdir */
8853	int isnewblk;		/* entry is in a newly allocated block */
8854{
8855	int offset;		/* offset of new entry within directory block */
8856	ufs_lbn_t lbn;		/* block in directory containing new entry */
8857	struct fs *fs;
8858	struct diradd *dap;
8859	struct newblk *newblk;
8860	struct pagedep *pagedep;
8861	struct inodedep *inodedep;
8862	struct newdirblk *newdirblk;
8863	struct mkdir *mkdir1, *mkdir2;
8864	struct jaddref *jaddref;
8865	struct ufsmount *ump;
8866	struct mount *mp;
8867	int isindir;
8868
8869	mp = ITOVFS(dp);
8870	ump = VFSTOUFS(mp);
8871	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
8872	    ("softdep_setup_directory_add called on non-softdep filesystem"));
8873	/*
8874	 * Whiteouts have no dependencies.
8875	 */
8876	if (newinum == UFS_WINO) {
8877		if (newdirbp != NULL)
8878			bdwrite(newdirbp);
8879		return (0);
8880	}
8881	jaddref = NULL;
8882	mkdir1 = mkdir2 = NULL;
8883	fs = ump->um_fs;
8884	lbn = lblkno(fs, diroffset);
8885	offset = blkoff(fs, diroffset);
8886	dap = malloc(sizeof(struct diradd), M_DIRADD,
8887		M_SOFTDEP_FLAGS|M_ZERO);
8888	workitem_alloc(&dap->da_list, D_DIRADD, mp);
8889	dap->da_offset = offset;
8890	dap->da_newinum = newinum;
8891	dap->da_state = ATTACHED;
8892	LIST_INIT(&dap->da_jwork);
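	/*
	 * Directory space beyond the direct block pointers lives in
	 * indirect blocks, which are always allocated as full blocks;
	 * direct blocks may be fragment-sized, so the new-block test
	 * below differs accordingly.
	 */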
8893	isindir = bp->b_lblkno >= UFS_NDADDR;
8894	newdirblk = NULL;
8895	if (isnewblk &&
8896	    (isindir ? blkoff(fs, diroffset) : fragoff(fs, diroffset)) == 0) {
8897		newdirblk = malloc(sizeof(struct newdirblk),
8898		    M_NEWDIRBLK, M_SOFTDEP_FLAGS);
8899		workitem_alloc(&newdirblk->db_list, D_NEWDIRBLK, mp);
8900		LIST_INIT(&newdirblk->db_mkdir);
8901	}
	/*
	 * If we're creating a new directory, set up the dependencies and
	 * set the dap state to wait for them.  Otherwise it's DEPCOMPLETE
	 * and we can move on.
	 */
8907	if (newdirbp == NULL) {
8908		dap->da_state |= DEPCOMPLETE;
8909		ACQUIRE_LOCK(ump);
8910	} else {
8911		dap->da_state |= MKDIR_BODY | MKDIR_PARENT;
8912		mkdir1 = setup_newdir(dap, newinum, dp->i_number, newdirbp,
8913		    &mkdir2);
8914	}
8915	/*
8916	 * Link into parent directory pagedep to await its being written.
8917	 */
8918	pagedep_lookup(mp, bp, dp->i_number, lbn, DEPALLOC, &pagedep);
8919#ifdef INVARIANTS
8920	if (diradd_lookup(pagedep, offset) != NULL)
8921		panic("softdep_setup_directory_add: %p already at off %d\n",
8922		    diradd_lookup(pagedep, offset), offset);
8923#endif
8924	dap->da_pagedep = pagedep;
8925	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
8926	    da_pdlist);
8927	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
8928	/*
8929	 * If we're journaling, link the diradd into the jaddref so it
8930	 * may be completed after the journal entry is written.  Otherwise,
8931	 * link the diradd into its inodedep.  If the inode is not yet
8932	 * written place it on the bufwait list, otherwise do the post-inode
8933	 * write processing to put it on the id_pendinghd list.
8934	 */
8935	if (MOUNTEDSUJ(mp)) {
8936		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
8937		    inoreflst);
8938		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
8939		    ("softdep_setup_directory_add: bad jaddref %p", jaddref));
8940		jaddref->ja_diroff = diroffset;
8941		jaddref->ja_diradd = dap;
8942		add_to_journal(&jaddref->ja_list);
8943	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE)
8944		diradd_inode_written(dap, inodedep);
8945	else
8946		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
8947	/*
8948	 * Add the journal entries for . and .. links now that the primary
8949	 * link is written.
8950	 */
8951	if (mkdir1 != NULL && MOUNTEDSUJ(mp)) {
8952		jaddref = (struct jaddref *)TAILQ_PREV(&jaddref->ja_ref,
8953		    inoreflst, if_deps);
8954		KASSERT(jaddref != NULL &&
8955		    jaddref->ja_ino == jaddref->ja_parent &&
8956		    (jaddref->ja_state & MKDIR_BODY),
8957		    ("softdep_setup_directory_add: bad dot jaddref %p",
8958		    jaddref));
8959		mkdir1->md_jaddref = jaddref;
8960		jaddref->ja_mkdir = mkdir1;
8961		/*
8962		 * It is important that the dotdot journal entry
8963		 * is added prior to the dot entry since dot writes
8964		 * both the dot and dotdot links.  These both must
8965		 * be added after the primary link for the journal
8966		 * to remain consistent.
8967		 */
8968		add_to_journal(&mkdir2->md_jaddref->ja_list);
8969		add_to_journal(&jaddref->ja_list);
8970	}
8971	/*
8972	 * If we are adding a new directory remember this diradd so that if
8973	 * we rename it we can keep the dot and dotdot dependencies.  If
8974	 * we are adding a new name for an inode that has a mkdiradd we
8975	 * must be in rename and we have to move the dot and dotdot
8976	 * dependencies to this new name.  The old name is being orphaned
8977	 * soon.
8978	 */
8979	if (mkdir1 != NULL) {
8980		if (inodedep->id_mkdiradd != NULL)
8981			panic("softdep_setup_directory_add: Existing mkdir");
8982		inodedep->id_mkdiradd = dap;
8983	} else if (inodedep->id_mkdiradd)
8984		merge_diradd(inodedep, dap);
8985	if (newdirblk != NULL) {
8986		/*
8987		 * There is nothing to do if we are already tracking
8988		 * this block.
8989		 */
8990		if ((pagedep->pd_state & NEWBLOCK) != 0) {
8991			WORKITEM_FREE(newdirblk, D_NEWDIRBLK);
8992			FREE_LOCK(ump);
8993			return (0);
8994		}
8995		if (newblk_lookup(mp, dbtofsb(fs, bp->b_blkno), 0, &newblk)
8996		    == 0)
8997			panic("softdep_setup_directory_add: lost entry");
8998		WORKLIST_INSERT(&newblk->nb_newdirblk, &newdirblk->db_list);
8999		pagedep->pd_state |= NEWBLOCK;
9000		pagedep->pd_newdirblk = newdirblk;
9001		newdirblk->db_pagedep = pagedep;
9002		FREE_LOCK(ump);
		/*
		 * If we extended into an indirect, signal direnter to sync.
		 */
9006		if (isindir)
9007			return (1);
9008		return (0);
9009	}
9010	FREE_LOCK(ump);
9011	return (0);
9012}
9013
9014/*
9015 * This procedure is called to change the offset of a directory
9016 * entry when compacting a directory block which must be owned
9017 * exclusively by the caller. Note that the actual entry movement
9018 * must be done in this procedure to ensure that no I/O completions
9019 * occur while the move is in progress.
9020 */
9021void
9022softdep_change_directoryentry_offset(bp, dp, base, oldloc, newloc, entrysize)
9023	struct buf *bp;		/* Buffer holding directory block. */
9024	struct inode *dp;	/* inode for directory */
9025	caddr_t base;		/* address of dp->i_offset */
9026	caddr_t oldloc;		/* address of old directory location */
9027	caddr_t newloc;		/* address of new directory location */
9028	int entrysize;		/* size of directory entry */
9029{
9030	int offset, oldoffset, newoffset;
9031	struct pagedep *pagedep;
9032	struct jmvref *jmvref;
9033	struct diradd *dap;
9034	struct direct *de;
9035	struct mount *mp;
9036	struct ufsmount *ump;
9037	ufs_lbn_t lbn;
9038	int flags;
9039
9040	mp = ITOVFS(dp);
9041	ump = VFSTOUFS(mp);
9042	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
9043	    ("softdep_change_directoryentry_offset called on "
9044	     "non-softdep filesystem"));
9045	de = (struct direct *)oldloc;
9046	jmvref = NULL;
9047	flags = 0;
9048	/*
9049	 * Moves are always journaled as it would be too complex to
9050	 * determine if any affected adds or removes are present in the
9051	 * journal.
9052	 */
9053	if (MOUNTEDSUJ(mp)) {
9054		flags = DEPALLOC;
9055		jmvref = newjmvref(dp, de->d_ino,
9056		    I_OFFSET(dp) + (oldloc - base),
9057		    I_OFFSET(dp) + (newloc - base));
9058	}
9059	lbn = lblkno(ump->um_fs, I_OFFSET(dp));
9060	offset = blkoff(ump->um_fs, I_OFFSET(dp));
9061	oldoffset = offset + (oldloc - base);
9062	newoffset = offset + (newloc - base);
9063	ACQUIRE_LOCK(ump);
9064	if (pagedep_lookup(mp, bp, dp->i_number, lbn, flags, &pagedep) == 0)
9065		goto done;
9066	dap = diradd_lookup(pagedep, oldoffset);
9067	if (dap) {
9068		dap->da_offset = newoffset;
9069		newoffset = DIRADDHASH(newoffset);
9070		oldoffset = DIRADDHASH(oldoffset);
9071		if ((dap->da_state & ALLCOMPLETE) != ALLCOMPLETE &&
9072		    newoffset != oldoffset) {
9073			LIST_REMOVE(dap, da_pdlist);
9074			LIST_INSERT_HEAD(&pagedep->pd_diraddhd[newoffset],
9075			    dap, da_pdlist);
9076		}
9077	}
9078done:
9079	if (jmvref) {
9080		jmvref->jm_pagedep = pagedep;
9081		LIST_INSERT_HEAD(&pagedep->pd_jmvrefhd, jmvref, jm_deps);
9082		add_to_journal(&jmvref->jm_list);
9083	}
9084	bcopy(oldloc, newloc, entrysize);
9085	FREE_LOCK(ump);
9086}
9087
9088/*
9089 * Move the mkdir dependencies and journal work from one diradd to another
9090 * when renaming a directory.  The new name must depend on the mkdir deps
9091 * completing as the old name did.  Directories can only have one valid link
9092 * at a time so one must be canonical.
9093 */
9094static void
9095merge_diradd(inodedep, newdap)
9096	struct inodedep *inodedep;
9097	struct diradd *newdap;
9098{
9099	struct diradd *olddap;
9100	struct mkdir *mkdir, *nextmd;
9101	struct ufsmount *ump;
9102	short state;
9103
9104	olddap = inodedep->id_mkdiradd;
9105	inodedep->id_mkdiradd = newdap;
9106	if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
9107		newdap->da_state &= ~DEPCOMPLETE;
9108		ump = VFSTOUFS(inodedep->id_list.wk_mp);
9109		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9110		     mkdir = nextmd) {
9111			nextmd = LIST_NEXT(mkdir, md_mkdirs);
9112			if (mkdir->md_diradd != olddap)
9113				continue;
9114			mkdir->md_diradd = newdap;
9115			state = mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY);
9116			newdap->da_state |= state;
9117			olddap->da_state &= ~state;
9118			if ((olddap->da_state &
9119			    (MKDIR_PARENT | MKDIR_BODY)) == 0)
9120				break;
9121		}
9122		if ((olddap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
9123			panic("merge_diradd: unfound ref");
9124	}
9125	/*
9126	 * Any mkdir related journal items are not safe to be freed until
9127	 * the new name is stable.
9128	 */
9129	jwork_move(&newdap->da_jwork, &olddap->da_jwork);
9130	olddap->da_state |= DEPCOMPLETE;
9131	complete_diradd(olddap);
9132}
9133
9134/*
9135 * Move the diradd to the pending list when all diradd dependencies are
9136 * complete.
9137 */
9138static void
9139complete_diradd(dap)
9140	struct diradd *dap;
9141{
9142	struct pagedep *pagedep;
9143
9144	if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
9145		if (dap->da_state & DIRCHG)
9146			pagedep = dap->da_previous->dm_pagedep;
9147		else
9148			pagedep = dap->da_pagedep;
9149		LIST_REMOVE(dap, da_pdlist);
9150		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9151	}
9152}
9153
9154/*
9155 * Cancel a diradd when a dirrem overlaps with it.  We must cancel the journal
 * add entries and conditionally journal the remove.
9157 */
9158static void
9159cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref)
9160	struct diradd *dap;
9161	struct dirrem *dirrem;
9162	struct jremref *jremref;
9163	struct jremref *dotremref;
9164	struct jremref *dotdotremref;
9165{
9166	struct inodedep *inodedep;
9167	struct jaddref *jaddref;
9168	struct inoref *inoref;
9169	struct ufsmount *ump;
9170	struct mkdir *mkdir;
9171
9172	/*
9173	 * If no remove references were allocated we're on a non-journaled
9174	 * filesystem and can skip the cancel step.
9175	 */
9176	if (jremref == NULL) {
9177		free_diradd(dap, NULL);
9178		return;
9179	}
9180	/*
	 * Cancel the primary name and free it if it does not require
9182	 * journaling.
9183	 */
9184	if (inodedep_lookup(dap->da_list.wk_mp, dap->da_newinum,
9185	    0, &inodedep) != 0) {
		/* Abort the addref that references this diradd.  */
9187		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
9188			if (inoref->if_list.wk_type != D_JADDREF)
9189				continue;
9190			jaddref = (struct jaddref *)inoref;
9191			if (jaddref->ja_diradd != dap)
9192				continue;
9193			if (cancel_jaddref(jaddref, inodedep,
9194			    &dirrem->dm_jwork) == 0) {
9195				free_jremref(jremref);
9196				jremref = NULL;
9197			}
9198			break;
9199		}
9200	}
9201	/*
9202	 * Cancel subordinate names and free them if they do not require
9203	 * journaling.
9204	 */
9205	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
9206		ump = VFSTOUFS(dap->da_list.wk_mp);
9207		LIST_FOREACH(mkdir, &ump->softdep_mkdirlisthd, md_mkdirs) {
9208			if (mkdir->md_diradd != dap)
9209				continue;
9210			if ((jaddref = mkdir->md_jaddref) == NULL)
9211				continue;
9212			mkdir->md_jaddref = NULL;
9213			if (mkdir->md_state & MKDIR_PARENT) {
9214				if (cancel_jaddref(jaddref, NULL,
9215				    &dirrem->dm_jwork) == 0) {
9216					free_jremref(dotdotremref);
9217					dotdotremref = NULL;
9218				}
9219			} else {
9220				if (cancel_jaddref(jaddref, inodedep,
9221				    &dirrem->dm_jwork) == 0) {
9222					free_jremref(dotremref);
9223					dotremref = NULL;
9224				}
9225			}
9226		}
9227	}
9228
9229	if (jremref)
9230		journal_jremref(dirrem, jremref, inodedep);
9231	if (dotremref)
9232		journal_jremref(dirrem, dotremref, inodedep);
9233	if (dotdotremref)
9234		journal_jremref(dirrem, dotdotremref, NULL);
9235	jwork_move(&dirrem->dm_jwork, &dap->da_jwork);
9236	free_diradd(dap, &dirrem->dm_jwork);
9237}
9238
9239/*
9240 * Free a diradd dependency structure.
9241 */
9242static void
9243free_diradd(dap, wkhd)
9244	struct diradd *dap;
9245	struct workhead *wkhd;
9246{
9247	struct dirrem *dirrem;
9248	struct pagedep *pagedep;
9249	struct inodedep *inodedep;
9250	struct mkdir *mkdir, *nextmd;
9251	struct ufsmount *ump;
9252
9253	ump = VFSTOUFS(dap->da_list.wk_mp);
9254	LOCK_OWNED(ump);
9255	LIST_REMOVE(dap, da_pdlist);
9256	if (dap->da_state & ONWORKLIST)
9257		WORKLIST_REMOVE(&dap->da_list);
9258	if ((dap->da_state & DIRCHG) == 0) {
9259		pagedep = dap->da_pagedep;
9260	} else {
9261		dirrem = dap->da_previous;
9262		pagedep = dirrem->dm_pagedep;
9263		dirrem->dm_dirinum = pagedep->pd_ino;
9264		dirrem->dm_state |= COMPLETE;
9265		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9266			add_to_worklist(&dirrem->dm_list, 0);
9267	}
9268	if (inodedep_lookup(pagedep->pd_list.wk_mp, dap->da_newinum,
9269	    0, &inodedep) != 0)
9270		if (inodedep->id_mkdiradd == dap)
9271			inodedep->id_mkdiradd = NULL;
9272	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0) {
9273		for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9274		     mkdir = nextmd) {
9275			nextmd = LIST_NEXT(mkdir, md_mkdirs);
9276			if (mkdir->md_diradd != dap)
9277				continue;
9278			dap->da_state &=
9279			    ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
9280			LIST_REMOVE(mkdir, md_mkdirs);
9281			if (mkdir->md_state & ONWORKLIST)
9282				WORKLIST_REMOVE(&mkdir->md_list);
9283			if (mkdir->md_jaddref != NULL)
9284				panic("free_diradd: Unexpected jaddref");
9285			WORKITEM_FREE(mkdir, D_MKDIR);
9286			if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0)
9287				break;
9288		}
9289		if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) != 0)
9290			panic("free_diradd: unfound ref");
9291	}
9292	if (inodedep)
9293		free_inodedep(inodedep);
9294	/*
9295	 * Free any journal segments waiting for the directory write.
9296	 */
9297	handle_jwork(&dap->da_jwork);
9298	WORKITEM_FREE(dap, D_DIRADD);
9299}
9300
9301/*
9302 * Directory entry removal dependencies.
9303 *
9304 * When removing a directory entry, the entry's inode pointer must be
9305 * zero'ed on disk before the corresponding inode's link count is decremented
9306 * (possibly freeing the inode for re-use). This dependency is handled by
9307 * updating the directory entry but delaying the inode count reduction until
9308 * after the directory block has been written to disk. After this point, the
9309 * inode count can be decremented whenever it is convenient.
9310 */
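
/*
 * In outline: the entry's inode number is cleared in the directory block
 * first; only after that block reaches the disk does the dirrem work item
 * run (handle_workitem_remove() below) and decrement the inode's link
 * count, possibly allowing the inode and its blocks to be freed.
 */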
9311
9312/*
9313 * This routine should be called immediately after removing
9314 * a directory entry.  The inode's link count should not be
9315 * decremented by the calling procedure -- the soft updates
9316 * code will do this task when it is safe.
9317 */
9318void
9319softdep_setup_remove(bp, dp, ip, isrmdir)
9320	struct buf *bp;		/* buffer containing directory block */
9321	struct inode *dp;	/* inode for the directory being modified */
9322	struct inode *ip;	/* inode for directory entry being removed */
9323	int isrmdir;		/* indicates if doing RMDIR */
9324{
9325	struct dirrem *dirrem, *prevdirrem;
9326	struct inodedep *inodedep;
9327	struct ufsmount *ump;
9328	int direct;
9329
9330	ump = ITOUMP(ip);
9331	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9332	    ("softdep_setup_remove called on non-softdep filesystem"));
9333	/*
9334	 * Allocate a new dirrem if appropriate and ACQUIRE_LOCK.  We want
	 * newdirrem() to set up the full directory remove, which requires
9336	 * isrmdir > 1.
9337	 */
9338	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9339	/*
9340	 * Add the dirrem to the inodedep's pending remove list for quick
9341	 * discovery later.
9342	 */
9343	if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0)
9344		panic("softdep_setup_remove: Lost inodedep.");
9345	KASSERT((inodedep->id_state & UNLINKED) == 0, ("inode unlinked"));
9346	dirrem->dm_state |= ONDEPLIST;
9347	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9348
9349	/*
9350	 * If the COMPLETE flag is clear, then there were no active
9351	 * entries and we want to roll back to a zeroed entry until
9352	 * the new inode is committed to disk. If the COMPLETE flag is
9353	 * set then we have deleted an entry that never made it to
9354	 * disk. If the entry we deleted resulted from a name change,
9355	 * then the old name still resides on disk. We cannot delete
9356	 * its inode (returned to us in prevdirrem) until the zeroed
9357	 * directory entry gets to disk. The new inode has never been
9358	 * referenced on the disk, so can be deleted immediately.
9359	 */
9360	if ((dirrem->dm_state & COMPLETE) == 0) {
9361		LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd, dirrem,
9362		    dm_next);
9363		FREE_LOCK(ump);
9364	} else {
9365		if (prevdirrem != NULL)
9366			LIST_INSERT_HEAD(&dirrem->dm_pagedep->pd_dirremhd,
9367			    prevdirrem, dm_next);
9368		dirrem->dm_dirinum = dirrem->dm_pagedep->pd_ino;
9369		direct = LIST_EMPTY(&dirrem->dm_jremrefhd);
9370		FREE_LOCK(ump);
9371		if (direct)
9372			handle_workitem_remove(dirrem, 0);
9373	}
9374}
9375
9376/*
 * Check for an entry matching 'offset' on both the pd_diraddhd list and the
9378 * pd_pendinghd list of a pagedep.
9379 */
9380static struct diradd *
9381diradd_lookup(pagedep, offset)
9382	struct pagedep *pagedep;
9383	int offset;
9384{
9385	struct diradd *dap;
9386
9387	LIST_FOREACH(dap, &pagedep->pd_diraddhd[DIRADDHASH(offset)], da_pdlist)
9388		if (dap->da_offset == offset)
9389			return (dap);
9390	LIST_FOREACH(dap, &pagedep->pd_pendinghd, da_pdlist)
9391		if (dap->da_offset == offset)
9392			return (dap);
9393	return (NULL);
9394}
9395
9396/*
9397 * Search for a .. diradd dependency in a directory that is being removed.
9398 * If the directory was renamed to a new parent we have a diradd rather
9399 * than a mkdir for the .. entry.  We need to cancel it now before
9400 * it is found in truncate().
9401 */
9402static struct jremref *
9403cancel_diradd_dotdot(ip, dirrem, jremref)
9404	struct inode *ip;
9405	struct dirrem *dirrem;
9406	struct jremref *jremref;
9407{
9408	struct pagedep *pagedep;
9409	struct diradd *dap;
9410	struct worklist *wk;
9411
9412	if (pagedep_lookup(ITOVFS(ip), NULL, ip->i_number, 0, 0, &pagedep) == 0)
9413		return (jremref);
9414	dap = diradd_lookup(pagedep, DOTDOT_OFFSET);
9415	if (dap == NULL)
9416		return (jremref);
9417	cancel_diradd(dap, dirrem, jremref, NULL, NULL);
9418	/*
9419	 * Mark any journal work as belonging to the parent so it is freed
9420	 * with the .. reference.
9421	 */
9422	LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9423		wk->wk_state |= MKDIR_PARENT;
9424	return (NULL);
9425}
9426
9427/*
9428 * Cancel the MKDIR_PARENT mkdir component of a diradd when we're going to
9429 * replace it with a dirrem/diradd pair as a result of re-parenting a
9430 * directory.  This ensures that we don't simultaneously have a mkdir and
9431 * a diradd for the same .. entry.
9432 */
9433static struct jremref *
9434cancel_mkdir_dotdot(ip, dirrem, jremref)
9435	struct inode *ip;
9436	struct dirrem *dirrem;
9437	struct jremref *jremref;
9438{
9439	struct inodedep *inodedep;
9440	struct jaddref *jaddref;
9441	struct ufsmount *ump;
9442	struct mkdir *mkdir;
9443	struct diradd *dap;
9444	struct mount *mp;
9445
9446	mp = ITOVFS(ip);
9447	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9448		return (jremref);
9449	dap = inodedep->id_mkdiradd;
9450	if (dap == NULL || (dap->da_state & MKDIR_PARENT) == 0)
9451		return (jremref);
9452	ump = VFSTOUFS(inodedep->id_list.wk_mp);
9453	for (mkdir = LIST_FIRST(&ump->softdep_mkdirlisthd); mkdir;
9454	    mkdir = LIST_NEXT(mkdir, md_mkdirs))
9455		if (mkdir->md_diradd == dap && mkdir->md_state & MKDIR_PARENT)
9456			break;
9457	if (mkdir == NULL)
9458		panic("cancel_mkdir_dotdot: Unable to find mkdir\n");
9459	if ((jaddref = mkdir->md_jaddref) != NULL) {
9460		mkdir->md_jaddref = NULL;
9461		jaddref->ja_state &= ~MKDIR_PARENT;
9462		if (inodedep_lookup(mp, jaddref->ja_ino, 0, &inodedep) == 0)
9463			panic("cancel_mkdir_dotdot: Lost parent inodedep");
9464		if (cancel_jaddref(jaddref, inodedep, &dirrem->dm_jwork)) {
9465			journal_jremref(dirrem, jremref, inodedep);
9466			jremref = NULL;
9467		}
9468	}
9469	if (mkdir->md_state & ONWORKLIST)
9470		WORKLIST_REMOVE(&mkdir->md_list);
9471	mkdir->md_state |= ALLCOMPLETE;
9472	complete_mkdir(mkdir);
9473	return (jremref);
9474}
9475
9476static void
9477journal_jremref(dirrem, jremref, inodedep)
9478	struct dirrem *dirrem;
9479	struct jremref *jremref;
9480	struct inodedep *inodedep;
9481{
9482
9483	if (inodedep == NULL)
9484		if (inodedep_lookup(jremref->jr_list.wk_mp,
9485		    jremref->jr_ref.if_ino, 0, &inodedep) == 0)
9486			panic("journal_jremref: Lost inodedep");
9487	LIST_INSERT_HEAD(&dirrem->dm_jremrefhd, jremref, jr_deps);
9488	TAILQ_INSERT_TAIL(&inodedep->id_inoreflst, &jremref->jr_ref, if_deps);
9489	add_to_journal(&jremref->jr_list);
9490}
9491
9492static void
9493dirrem_journal(dirrem, jremref, dotremref, dotdotremref)
9494	struct dirrem *dirrem;
9495	struct jremref *jremref;
9496	struct jremref *dotremref;
9497	struct jremref *dotdotremref;
9498{
9499	struct inodedep *inodedep;
9500
9501	if (inodedep_lookup(jremref->jr_list.wk_mp, jremref->jr_ref.if_ino, 0,
9502	    &inodedep) == 0)
9503		panic("dirrem_journal: Lost inodedep");
9504	journal_jremref(dirrem, jremref, inodedep);
9505	if (dotremref)
9506		journal_jremref(dirrem, dotremref, inodedep);
9507	if (dotdotremref)
9508		journal_jremref(dirrem, dotdotremref, NULL);
9509}
9510
9511/*
9512 * Allocate a new dirrem if appropriate and return it along with
9513 * its associated pagedep. Called without a lock, returns with lock.
9514 */
9515static struct dirrem *
9516newdirrem(bp, dp, ip, isrmdir, prevdirremp)
9517	struct buf *bp;		/* buffer containing directory block */
9518	struct inode *dp;	/* inode for the directory being modified */
9519	struct inode *ip;	/* inode for directory entry being removed */
9520	int isrmdir;		/* indicates if doing RMDIR */
9521	struct dirrem **prevdirremp; /* previously referenced inode, if any */
9522{
9523	int offset;
9524	ufs_lbn_t lbn;
9525	struct diradd *dap;
9526	struct dirrem *dirrem;
9527	struct pagedep *pagedep;
9528	struct jremref *jremref;
9529	struct jremref *dotremref;
9530	struct jremref *dotdotremref;
9531	struct vnode *dvp;
9532	struct ufsmount *ump;
9533
9534	/*
9535	 * Whiteouts have no deletion dependencies.
9536	 */
9537	if (ip == NULL)
9538		panic("newdirrem: whiteout");
9539	dvp = ITOV(dp);
9540	ump = ITOUMP(dp);
9541
9542	/*
9543	 * If the system is over its limit and our filesystem is
9544	 * responsible for more than our share of that usage and
9545	 * we are not a snapshot, request some inodedep cleanup.
9546	 * Limiting the number of dirrem structures will also limit
9547	 * the number of freefile and freeblks structures.
9548	 */
9549	ACQUIRE_LOCK(ump);
9550	if (!IS_SNAPSHOT(ip) && softdep_excess_items(ump, D_DIRREM))
9551		schedule_cleanup(UFSTOVFS(ump));
9552	else
9553		FREE_LOCK(ump);
9554	dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS |
9555	    M_ZERO);
9556	workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount);
9557	LIST_INIT(&dirrem->dm_jremrefhd);
9558	LIST_INIT(&dirrem->dm_jwork);
9559	dirrem->dm_state = isrmdir ? RMDIR : 0;
9560	dirrem->dm_oldinum = ip->i_number;
9561	*prevdirremp = NULL;
9562	/*
9563	 * Allocate remove reference structures to track journal write
9564	 * dependencies.  We will always have one for the link and
9565	 * when doing directories we will always have one more for dot.
9566	 * When renaming a directory we skip the dotdot link change so
9567	 * this is not needed.
9568	 */
9569	jremref = dotremref = dotdotremref = NULL;
9570	if (DOINGSUJ(dvp)) {
9571		if (isrmdir) {
9572			jremref = newjremref(dirrem, dp, ip, I_OFFSET(dp),
9573			    ip->i_effnlink + 2);
9574			dotremref = newjremref(dirrem, ip, ip, DOT_OFFSET,
9575			    ip->i_effnlink + 1);
9576			dotdotremref = newjremref(dirrem, ip, dp, DOTDOT_OFFSET,
9577			    dp->i_effnlink + 1);
9578			dotdotremref->jr_state |= MKDIR_PARENT;
9579		} else
9580			jremref = newjremref(dirrem, dp, ip, I_OFFSET(dp),
9581			    ip->i_effnlink + 1);
9582	}
9583	ACQUIRE_LOCK(ump);
9584	lbn = lblkno(ump->um_fs, I_OFFSET(dp));
9585	offset = blkoff(ump->um_fs, I_OFFSET(dp));
9586	pagedep_lookup(UFSTOVFS(ump), bp, dp->i_number, lbn, DEPALLOC,
9587	    &pagedep);
9588	dirrem->dm_pagedep = pagedep;
9589	dirrem->dm_offset = offset;
9590	/*
9591	 * If we're renaming a .. link to a new directory, cancel any
	 * existing MKDIR_PARENT mkdir.  If it has already been canceled,
	 * the jremref is preserved for any potential diradd in this
	 * location.  This cannot coincide with a rmdir.
9595	 */
9596	if (I_OFFSET(dp) == DOTDOT_OFFSET) {
9597		if (isrmdir)
9598			panic("newdirrem: .. directory change during remove?");
9599		jremref = cancel_mkdir_dotdot(dp, dirrem, jremref);
9600	}
9601	/*
9602	 * If we're removing a directory search for the .. dependency now and
9603	 * cancel it.  Any pending journal work will be added to the dirrem
9604	 * to be completed when the workitem remove completes.
9605	 */
9606	if (isrmdir)
9607		dotdotremref = cancel_diradd_dotdot(ip, dirrem, dotdotremref);
9608	/*
9609	 * Check for a diradd dependency for the same directory entry.
9610	 * If present, then both dependencies become obsolete and can
9611	 * be de-allocated.
9612	 */
9613	dap = diradd_lookup(pagedep, offset);
9614	if (dap == NULL) {
9615		/*
9616		 * Link the jremref structures into the dirrem so they are
9617		 * written prior to the pagedep.
9618		 */
9619		if (jremref)
9620			dirrem_journal(dirrem, jremref, dotremref,
9621			    dotdotremref);
9622		return (dirrem);
9623	}
9624	/*
9625	 * Must be ATTACHED at this point.
9626	 */
9627	if ((dap->da_state & ATTACHED) == 0)
9628		panic("newdirrem: not ATTACHED");
9629	if (dap->da_newinum != ip->i_number)
9630		panic("newdirrem: inum %ju should be %ju",
9631		    (uintmax_t)ip->i_number, (uintmax_t)dap->da_newinum);
9632	/*
9633	 * If we are deleting a changed name that never made it to disk,
9634	 * then return the dirrem describing the previous inode (which
9635	 * represents the inode currently referenced from this entry on disk).
9636	 */
9637	if ((dap->da_state & DIRCHG) != 0) {
9638		*prevdirremp = dap->da_previous;
9639		dap->da_state &= ~DIRCHG;
9640		dap->da_pagedep = pagedep;
9641	}
9642	/*
9643	 * We are deleting an entry that never made it to disk.
9644	 * Mark it COMPLETE so we can delete its inode immediately.
9645	 */
9646	dirrem->dm_state |= COMPLETE;
9647	cancel_diradd(dap, dirrem, jremref, dotremref, dotdotremref);
9648#ifdef INVARIANTS
9649	if (isrmdir == 0) {
9650		struct worklist *wk;
9651
9652		LIST_FOREACH(wk, &dirrem->dm_jwork, wk_list)
9653			if (wk->wk_state & (MKDIR_BODY | MKDIR_PARENT))
9654				panic("bad wk %p (0x%X)\n", wk, wk->wk_state);
9655	}
9656#endif
9657
9658	return (dirrem);
9659}
9660
9661/*
9662 * Directory entry change dependencies.
9663 *
9664 * Changing an existing directory entry requires that an add operation
9665 * be completed first followed by a deletion. The semantics for the addition
9666 * are identical to the description of adding a new entry above except
9667 * that the rollback is to the old inode number rather than zero. Once
9668 * the addition dependency is completed, the removal is done as described
9669 * in the removal routine above.
9670 */
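
/*
 * Concretely, softdep_setup_directory_change() below pairs a DIRCHG diradd
 * carrying the new inode number with a dirrem for the old one; until the
 * new inode is stable, the entry rolls back to the old inode number rather
 * than to zero.
 */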
9671
9672/*
9673 * This routine should be called immediately after changing
9674 * a directory entry.  The inode's link count should not be
9675 * decremented by the calling procedure -- the soft updates
9676 * code will perform this task when it is safe.
9677 */
9678void
9679softdep_setup_directory_change(bp, dp, ip, newinum, isrmdir)
9680	struct buf *bp;		/* buffer containing directory block */
9681	struct inode *dp;	/* inode for the directory being modified */
9682	struct inode *ip;	/* inode for directory entry being removed */
9683	ino_t newinum;		/* new inode number for changed entry */
9684	int isrmdir;		/* indicates if doing RMDIR */
9685{
9686	int offset;
9687	struct diradd *dap = NULL;
9688	struct dirrem *dirrem, *prevdirrem;
9689	struct pagedep *pagedep;
9690	struct inodedep *inodedep;
9691	struct jaddref *jaddref;
9692	struct mount *mp;
9693	struct ufsmount *ump;
9694
9695	mp = ITOVFS(dp);
9696	ump = VFSTOUFS(mp);
9697	offset = blkoff(ump->um_fs, I_OFFSET(dp));
9698	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
9699	   ("softdep_setup_directory_change called on non-softdep filesystem"));
9700
9701	/*
9702	 * Whiteouts do not need diradd dependencies.
9703	 */
9704	if (newinum != UFS_WINO) {
9705		dap = malloc(sizeof(struct diradd),
9706		    M_DIRADD, M_SOFTDEP_FLAGS|M_ZERO);
9707		workitem_alloc(&dap->da_list, D_DIRADD, mp);
9708		dap->da_state = DIRCHG | ATTACHED | DEPCOMPLETE;
9709		dap->da_offset = offset;
9710		dap->da_newinum = newinum;
9711		LIST_INIT(&dap->da_jwork);
9712	}
9713
9714	/*
9715	 * Allocate a new dirrem and ACQUIRE_LOCK.
9716	 */
9717	dirrem = newdirrem(bp, dp, ip, isrmdir, &prevdirrem);
9718	pagedep = dirrem->dm_pagedep;
9719	/*
9720	 * The possible values for isrmdir:
9721	 *	0 - non-directory file rename
9722	 *	1 - directory rename within same directory
9723	 *   inum - directory rename to new directory of given inode number
9724	 * When renaming to a new directory, we are both deleting and
9725	 * creating a new directory entry, so the link count on the new
9726	 * directory should not change. Thus we do not need the followup
9727	 * dirrem which is usually done in handle_workitem_remove. We set
9728	 * the DIRCHG flag to tell handle_workitem_remove to skip the
9729	 * followup dirrem.
9730	 */
9731	if (isrmdir > 1)
9732		dirrem->dm_state |= DIRCHG;
9733
9734	/*
9735	 * Whiteouts have no additional dependencies,
9736	 * so just put the dirrem on the correct list.
9737	 */
9738	if (newinum == UFS_WINO) {
9739		if ((dirrem->dm_state & COMPLETE) == 0) {
9740			LIST_INSERT_HEAD(&pagedep->pd_dirremhd, dirrem,
9741			    dm_next);
9742		} else {
9743			dirrem->dm_dirinum = pagedep->pd_ino;
9744			if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9745				add_to_worklist(&dirrem->dm_list, 0);
9746		}
9747		FREE_LOCK(ump);
9748		return;
9749	}
9750	/*
9751	 * Add the dirrem to the inodedep's pending remove list for quick
9752	 * discovery later.  A valid nlinkdelta ensures that this lookup
9753	 * will not fail.
9754	 */
9755	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
9756		panic("softdep_setup_directory_change: Lost inodedep.");
9757	dirrem->dm_state |= ONDEPLIST;
9758	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
9759
9760	/*
9761	 * If the COMPLETE flag is clear, then there were no active
9762	 * entries and we want to roll back to the previous inode until
9763	 * the new inode is committed to disk. If the COMPLETE flag is
9764	 * set, then we have deleted an entry that never made it to disk.
9765	 * If the entry we deleted resulted from a name change, then the old
9766	 * inode reference still resides on disk. Any rollback that we do
9767	 * needs to be to that old inode (returned to us in prevdirrem). If
9768	 * the entry we deleted resulted from a create, then there is
9769	 * no entry on the disk, so we want to roll back to zero rather
9770	 * than the uncommitted inode. In either of the COMPLETE cases we
9771	 * want to immediately free the unwritten and unreferenced inode.
9772	 */
9773	if ((dirrem->dm_state & COMPLETE) == 0) {
9774		dap->da_previous = dirrem;
9775	} else {
9776		if (prevdirrem != NULL) {
9777			dap->da_previous = prevdirrem;
9778		} else {
9779			dap->da_state &= ~DIRCHG;
9780			dap->da_pagedep = pagedep;
9781		}
9782		dirrem->dm_dirinum = pagedep->pd_ino;
9783		if (LIST_EMPTY(&dirrem->dm_jremrefhd))
9784			add_to_worklist(&dirrem->dm_list, 0);
9785	}
9786	/*
9787	 * Lookup the jaddref for this journal entry.  We must finish
9788	 * initializing it and make the diradd write dependent on it.
9789	 * If we're not journaling, put it on the id_bufwait list if the
9790	 * inode is not yet written. If it is written, do the post-inode
9791	 * write processing to put it on the id_pendinghd list.
9792	 */
9793	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
9794	if (MOUNTEDSUJ(mp)) {
9795		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
9796		    inoreflst);
9797		KASSERT(jaddref != NULL && jaddref->ja_parent == dp->i_number,
9798		    ("softdep_setup_directory_change: bad jaddref %p",
9799		    jaddref));
9800		jaddref->ja_diroff = I_OFFSET(dp);
9801		jaddref->ja_diradd = dap;
9802		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9803		    dap, da_pdlist);
9804		add_to_journal(&jaddref->ja_list);
9805	} else if ((inodedep->id_state & ALLCOMPLETE) == ALLCOMPLETE) {
9806		dap->da_state |= COMPLETE;
9807		LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap, da_pdlist);
9808		WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
9809	} else {
9810		LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)],
9811		    dap, da_pdlist);
9812		WORKLIST_INSERT(&inodedep->id_bufwait, &dap->da_list);
9813	}
	/*
	 * If we're making a new name for a directory that has not been
	 * committed, we need to move the dot and dotdot references to
	 * this new name.
	 */
9819	if (inodedep->id_mkdiradd && I_OFFSET(dp) != DOTDOT_OFFSET)
9820		merge_diradd(inodedep, dap);
9821	FREE_LOCK(ump);
9822}
9823
9824/*
9825 * Called whenever the link count on an inode is changed.
9826 * It creates an inode dependency so that the new reference(s)
9827 * to the inode cannot be committed to disk until the updated
9828 * inode has been written.
9829 */
9830void
9831softdep_change_linkcnt(ip)
9832	struct inode *ip;	/* the inode with the increased link count */
9833{
9834	struct inodedep *inodedep;
9835	struct ufsmount *ump;
9836
9837	ump = ITOUMP(ip);
9838	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9839	    ("softdep_change_linkcnt called on non-softdep filesystem"));
9840	ACQUIRE_LOCK(ump);
9841	inodedep_lookup(UFSTOVFS(ump), ip->i_number, DEPALLOC, &inodedep);
9842	if (ip->i_nlink < ip->i_effnlink)
9843		panic("softdep_change_linkcnt: bad delta");
9844	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
9845	FREE_LOCK(ump);
9846}
9847
9848/*
9849 * Attach a sbdep dependency to the superblock buf so that we can keep
9850 * track of the head of the linked list of referenced but unlinked inodes.
9851 */
9852void
9853softdep_setup_sbupdate(ump, fs, bp)
9854	struct ufsmount *ump;
9855	struct fs *fs;
9856	struct buf *bp;
9857{
9858	struct sbdep *sbdep;
9859	struct worklist *wk;
9860
9861	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
9862	    ("softdep_setup_sbupdate called on non-softdep filesystem"));
9863	LIST_FOREACH(wk, &bp->b_dep, wk_list)
9864		if (wk->wk_type == D_SBDEP)
9865			break;
9866	if (wk != NULL)
9867		return;
9868	sbdep = malloc(sizeof(struct sbdep), M_SBDEP, M_SOFTDEP_FLAGS);
9869	workitem_alloc(&sbdep->sb_list, D_SBDEP, UFSTOVFS(ump));
9870	sbdep->sb_fs = fs;
9871	sbdep->sb_ump = ump;
9872	ACQUIRE_LOCK(ump);
9873	WORKLIST_INSERT(&bp->b_dep, &sbdep->sb_list);
9874	FREE_LOCK(ump);
9875}
9876
9877/*
9878 * Return the first unlinked inodedep which is ready to be the head of the
9879 * list.  The inodedep and all those after it must have valid next pointers.
9880 */
9881static struct inodedep *
9882first_unlinked_inodedep(ump)
9883	struct ufsmount *ump;
9884{
9885	struct inodedep *inodedep;
9886	struct inodedep *idp;
9887
9888	LOCK_OWNED(ump);
9889	for (inodedep = TAILQ_LAST(&ump->softdep_unlinked, inodedeplst);
9890	    inodedep; inodedep = idp) {
9891		if ((inodedep->id_state & UNLINKNEXT) == 0)
9892			return (NULL);
9893		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
9894		if (idp == NULL || (idp->id_state & UNLINKNEXT) == 0)
9895			break;
9896		if ((inodedep->id_state & UNLINKPREV) == 0)
9897			break;
9898	}
9899	return (inodedep);
9900}
9901
9902/*
9903 * Set the sujfree unlinked head pointer prior to writing a superblock.
9904 */
9905static void
9906initiate_write_sbdep(sbdep)
9907	struct sbdep *sbdep;
9908{
9909	struct inodedep *inodedep;
9910	struct fs *bpfs;
9911	struct fs *fs;
9912
9913	bpfs = sbdep->sb_fs;
9914	fs = sbdep->sb_ump->um_fs;
9915	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9916	if (inodedep) {
9917		fs->fs_sujfree = inodedep->id_ino;
9918		inodedep->id_state |= UNLINKPREV;
9919	} else
9920		fs->fs_sujfree = 0;
9921	bpfs->fs_sujfree = fs->fs_sujfree;
9922	/*
9923	 * Because we have made changes to the superblock, we need to
9924	 * recompute its check-hash.
9925	 */
9926	bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
9927}
9928
9929/*
 * After a superblock is written, determine whether it must be written again
9931 * due to a changing unlinked list head.
9932 */
9933static int
9934handle_written_sbdep(sbdep, bp)
9935	struct sbdep *sbdep;
9936	struct buf *bp;
9937{
9938	struct inodedep *inodedep;
9939	struct fs *fs;
9940
9941	LOCK_OWNED(sbdep->sb_ump);
9942	fs = sbdep->sb_fs;
9943	/*
	 * If the superblock doesn't match the in-memory list, start over.
9945	 */
9946	inodedep = first_unlinked_inodedep(sbdep->sb_ump);
9947	if ((inodedep && fs->fs_sujfree != inodedep->id_ino) ||
9948	    (inodedep == NULL && fs->fs_sujfree != 0)) {
9949		bdirty(bp);
9950		return (1);
9951	}
9952	WORKITEM_FREE(sbdep, D_SBDEP);
9953	if (fs->fs_sujfree == 0)
9954		return (0);
9955	/*
9956	 * Now that we have a record of this inode in stable store allow it
9957	 * to be written to free up pending work.  Inodes may see a lot of
9958	 * write activity after they are unlinked which we must not hold up.
9959	 */
9960	for (; inodedep != NULL; inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
9961		if ((inodedep->id_state & UNLINKLINKS) != UNLINKLINKS)
9962			panic("handle_written_sbdep: Bad inodedep %p (0x%X)",
9963			    inodedep, inodedep->id_state);
9964		if (inodedep->id_state & UNLINKONLIST)
9965			break;
9966		inodedep->id_state |= DEPCOMPLETE | UNLINKONLIST;
9967	}
9968
9969	return (0);
9970}
9971
9972/*
9973 * Mark an inodedep as unlinked and insert it into the in-memory unlinked list.
9974 */
9975static void
9976unlinked_inodedep(mp, inodedep)
9977	struct mount *mp;
9978	struct inodedep *inodedep;
9979{
9980	struct ufsmount *ump;
9981
9982	ump = VFSTOUFS(mp);
9983	LOCK_OWNED(ump);
9984	if (MOUNTEDSUJ(mp) == 0)
9985		return;
9986	ump->um_fs->fs_fmod = 1;
9987	if (inodedep->id_state & UNLINKED)
9988		panic("unlinked_inodedep: %p already unlinked\n", inodedep);
9989	inodedep->id_state |= UNLINKED;
9990	TAILQ_INSERT_HEAD(&ump->softdep_unlinked, inodedep, id_unlinked);
9991}
9992
9993/*
9994 * Remove an inodedep from the unlinked inodedep list.  This may require
9995 * disk writes if the inode has made it that far.
9996 */
9997static void
9998clear_unlinked_inodedep(inodedep)
9999	struct inodedep *inodedep;
10000{
10001	struct ufs2_dinode *dip;
10002	struct ufsmount *ump;
10003	struct inodedep *idp;
10004	struct inodedep *idn;
10005	struct fs *fs, *bpfs;
10006	struct buf *bp;
10007	daddr_t dbn;
10008	ino_t ino;
10009	ino_t nino;
10010	ino_t pino;
10011	int error;
10012
10013	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10014	fs = ump->um_fs;
10015	ino = inodedep->id_ino;
10016	error = 0;
10017	for (;;) {
10018		LOCK_OWNED(ump);
10019		KASSERT((inodedep->id_state & UNLINKED) != 0,
10020		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
10021		    inodedep));
10022		/*
10023		 * If nothing has yet been written simply remove us from
10024		 * the in memory list and return.  This is the most common
10025		 * case where handle_workitem_remove() loses the final
10026		 * reference.
10027		 */
10028		if ((inodedep->id_state & UNLINKLINKS) == 0)
10029			break;
10030		/*
10031		 * If we have a NEXT pointer and no PREV pointer we can simply
10032		 * clear NEXT's PREV and remove ourselves from the list.  Be
10033		 * careful not to clear PREV if the superblock points at
10034		 * next as well.
10035		 */
10036		idn = TAILQ_NEXT(inodedep, id_unlinked);
10037		if ((inodedep->id_state & UNLINKLINKS) == UNLINKNEXT) {
10038			if (idn && fs->fs_sujfree != idn->id_ino)
10039				idn->id_state &= ~UNLINKPREV;
10040			break;
10041		}
10042		/*
10043		 * Here we have an inodedep which is actually linked into
10044		 * the list.  We must remove it by forcing a write to the
10045		 * link before us, whether it be the superblock or an inode.
10046		 * Unfortunately the list may change while we're waiting
10047		 * on the buf lock for either resource so we must loop until
10048		 * we lock the right one.  If both the superblock and an
10049		 * inode point to this inode we must clear the inode first
10050		 * followed by the superblock.
10051		 */
10052		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
10053		pino = 0;
10054		if (idp && (idp->id_state & UNLINKNEXT))
10055			pino = idp->id_ino;
10056		FREE_LOCK(ump);
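		/*
		 * pino == 0 means the superblock's fs_sujfree points at us;
		 * otherwise the previous unlinked inode's di_freelink does.
		 * Get the buffer holding that pointer so it can be updated.
		 */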
10057		if (pino == 0) {
10058			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
10059			    (int)fs->fs_sbsize, 0, 0, 0);
10060		} else {
10061			dbn = fsbtodb(fs, ino_to_fsba(fs, pino));
10062			error = ffs_breadz(ump, ump->um_devvp, dbn, dbn,
10063			    (int)fs->fs_bsize, NULL, NULL, 0, NOCRED, 0, NULL,
10064			    &bp);
10065		}
10066		ACQUIRE_LOCK(ump);
10067		if (error)
10068			break;
10069		/* If the list has changed restart the loop. */
10070		idp = TAILQ_PREV(inodedep, inodedeplst, id_unlinked);
10071		nino = 0;
10072		if (idp && (idp->id_state & UNLINKNEXT))
10073			nino = idp->id_ino;
10074		if (nino != pino ||
10075		    (inodedep->id_state & UNLINKPREV) != UNLINKPREV) {
10076			FREE_LOCK(ump);
10077			brelse(bp);
10078			ACQUIRE_LOCK(ump);
10079			continue;
10080		}
10081		nino = 0;
10082		idn = TAILQ_NEXT(inodedep, id_unlinked);
10083		if (idn)
10084			nino = idn->id_ino;
10085		/*
10086		 * Remove us from the in memory list.  After this we cannot
10087		 * access the inodedep.
10088		 */
10089		KASSERT((inodedep->id_state & UNLINKED) != 0,
10090		    ("clear_unlinked_inodedep: inodedep %p not unlinked",
10091		    inodedep));
10092		inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
10093		TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
10094		FREE_LOCK(ump);
10095		/*
10096		 * The predecessor's next pointer is manually updated here
10097		 * so that the NEXT flag is never cleared for an element
10098		 * that is in the list.
10099		 */
10100		if (pino == 0) {
10101			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
10102			bpfs = (struct fs *)bp->b_data;
10103			ffs_oldfscompat_write(bpfs, ump);
10104			softdep_setup_sbupdate(ump, bpfs, bp);
10105			/*
10106			 * Because we may have made changes to the superblock,
10107			 * we need to recompute its check-hash.
10108			 */
10109			bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
10110		} else if (fs->fs_magic == FS_UFS1_MAGIC) {
10111			((struct ufs1_dinode *)bp->b_data +
10112			    ino_to_fsbo(fs, pino))->di_freelink = nino;
10113		} else {
10114			dip = (struct ufs2_dinode *)bp->b_data +
10115			    ino_to_fsbo(fs, pino);
10116			dip->di_freelink = nino;
10117			ffs_update_dinode_ckhash(fs, dip);
10118		}
10119		/*
10120		 * If the bwrite fails we have no recourse to recover.  The
10121		 * filesystem is corrupted already.
10122		 */
10123		bwrite(bp);
10124		ACQUIRE_LOCK(ump);
10125		/*
10126		 * If the superblock pointer still needs to be cleared force
10127		 * a write here.
10128		 */
10129		if (fs->fs_sujfree == ino) {
10130			FREE_LOCK(ump);
10131			bp = getblk(ump->um_devvp, btodb(fs->fs_sblockloc),
10132			    (int)fs->fs_sbsize, 0, 0, 0);
10133			bcopy((caddr_t)fs, bp->b_data, (u_int)fs->fs_sbsize);
10134			bpfs = (struct fs *)bp->b_data;
10135			ffs_oldfscompat_write(bpfs, ump);
10136			softdep_setup_sbupdate(ump, bpfs, bp);
10137			/*
10138			 * Because we may have made changes to the superblock,
10139			 * we need to recompute its check-hash.
10140			 */
10141			bpfs->fs_ckhash = ffs_calc_sbhash(bpfs);
10142			bwrite(bp);
10143			ACQUIRE_LOCK(ump);
10144		}
10145
10146		if (fs->fs_sujfree != ino)
10147			return;
10148		panic("clear_unlinked_inodedep: Failed to clear free head");
10149	}
10150	if (inodedep->id_ino == fs->fs_sujfree)
10151		panic("clear_unlinked_inodedep: Freeing head of free list");
10152	inodedep->id_state &= ~(UNLINKED | UNLINKLINKS | UNLINKONLIST);
10153	TAILQ_REMOVE(&ump->softdep_unlinked, inodedep, id_unlinked);
10154	return;
10155}
10156
10157/*
10158 * This workitem decrements the inode's link count.
10159 * If the link count reaches zero, the file is removed.
10160 */
10161static int
10162handle_workitem_remove(dirrem, flags)
10163	struct dirrem *dirrem;
10164	int flags;
10165{
10166	struct inodedep *inodedep;
10167	struct workhead dotdotwk;
10168	struct worklist *wk;
10169	struct ufsmount *ump;
10170	struct mount *mp;
10171	struct vnode *vp;
10172	struct inode *ip;
10173	ino_t oldinum;
10174
10175	if (dirrem->dm_state & ONWORKLIST)
10176		panic("handle_workitem_remove: dirrem %p still on worklist",
10177		    dirrem);
10178	oldinum = dirrem->dm_oldinum;
10179	mp = dirrem->dm_list.wk_mp;
10180	ump = VFSTOUFS(mp);
10181	flags |= LK_EXCLUSIVE;
10182	if (ffs_vgetf(mp, oldinum, flags, &vp, FFSV_FORCEINSMQ) != 0)
10183		return (EBUSY);
10184	ip = VTOI(vp);
10185	MPASS(ip->i_mode != 0);
10186	ACQUIRE_LOCK(ump);
10187	if ((inodedep_lookup(mp, oldinum, 0, &inodedep)) == 0)
10188		panic("handle_workitem_remove: lost inodedep");
10189	if (dirrem->dm_state & ONDEPLIST)
10190		LIST_REMOVE(dirrem, dm_inonext);
10191	KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
10192	    ("handle_workitem_remove:  Journal entries not written."));
10193
10194	/*
10195	 * Move all dependencies waiting on the remove to complete
10196	 * from the dirrem to the inode inowait list to be completed
10197	 * after the inode has been updated and written to disk.
10198	 *
	 * Any items marked MKDIR_PARENT are saved to be completed when the
	 * dotdot ref is removed, unless DIRCHG is specified.  For
10201	 * directory change operations there will be no further
10202	 * directory writes and the jsegdeps need to be moved along
10203	 * with the rest to be completed when the inode is free or
10204	 * stable in the inode free list.
10205	 */
10206	LIST_INIT(&dotdotwk);
10207	while ((wk = LIST_FIRST(&dirrem->dm_jwork)) != NULL) {
10208		WORKLIST_REMOVE(wk);
10209		if ((dirrem->dm_state & DIRCHG) == 0 &&
10210		    wk->wk_state & MKDIR_PARENT) {
10211			wk->wk_state &= ~MKDIR_PARENT;
10212			WORKLIST_INSERT(&dotdotwk, wk);
10213			continue;
10214		}
10215		WORKLIST_INSERT(&inodedep->id_inowait, wk);
10216	}
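	/*
	 * Move the saved MKDIR_PARENT items back onto dm_jwork; they are
	 * completed when the dotdot reference is removed.
	 */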
10217	LIST_SWAP(&dirrem->dm_jwork, &dotdotwk, worklist, wk_list);
10218	/*
10219	 * Normal file deletion.
10220	 */
10221	if ((dirrem->dm_state & RMDIR) == 0) {
10222		ip->i_nlink--;
10223		KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: file ino "
10224		    "%ju negative i_nlink %d", (intmax_t)ip->i_number,
10225		    ip->i_nlink));
10226		DIP_SET(ip, i_nlink, ip->i_nlink);
10227		UFS_INODE_SET_FLAG(ip, IN_CHANGE);
10228		if (ip->i_nlink < ip->i_effnlink)
10229			panic("handle_workitem_remove: bad file delta");
10230		if (ip->i_nlink == 0)
10231			unlinked_inodedep(mp, inodedep);
10232		inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
10233		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
10234		    ("handle_workitem_remove: worklist not empty. %s",
10235		    TYPENAME(LIST_FIRST(&dirrem->dm_jwork)->wk_type)));
10236		WORKITEM_FREE(dirrem, D_DIRREM);
10237		FREE_LOCK(ump);
10238		goto out;
10239	}
10240	/*
10241	 * Directory deletion. Decrement reference count for both the
10242	 * just deleted parent directory entry and the reference for ".".
10243	 * Arrange to have the reference count on the parent decremented
10244	 * to account for the loss of "..".
10245	 */
10246	ip->i_nlink -= 2;
10247	KASSERT(ip->i_nlink >= 0, ("handle_workitem_remove: directory ino "
10248	    "%ju negative i_nlink %d", (intmax_t)ip->i_number, ip->i_nlink));
10249	DIP_SET(ip, i_nlink, ip->i_nlink);
10250	UFS_INODE_SET_FLAG(ip, IN_CHANGE);
10251	if (ip->i_nlink < ip->i_effnlink)
10252		panic("handle_workitem_remove: bad dir delta");
10253	if (ip->i_nlink == 0)
10254		unlinked_inodedep(mp, inodedep);
10255	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
10256	/*
	 * Rename a directory to a new parent. Since we are both deleting
	 * and creating a new directory entry, the link count on the new
	 * directory should not change. Thus we skip the follow-up dirrem.
10260	 */
10261	if (dirrem->dm_state & DIRCHG) {
10262		KASSERT(LIST_EMPTY(&dirrem->dm_jwork),
10263		    ("handle_workitem_remove: DIRCHG and worklist not empty."));
10264		WORKITEM_FREE(dirrem, D_DIRREM);
10265		FREE_LOCK(ump);
10266		goto out;
10267	}
10268	dirrem->dm_state = ONDEPLIST;
10269	dirrem->dm_oldinum = dirrem->dm_dirinum;
10270	/*
10271	 * Place the dirrem on the parent's diremhd list.
10272	 */
10273	if (inodedep_lookup(mp, dirrem->dm_oldinum, 0, &inodedep) == 0)
10274		panic("handle_workitem_remove: lost dir inodedep");
10275	LIST_INSERT_HEAD(&inodedep->id_dirremhd, dirrem, dm_inonext);
10276	/*
10277	 * If the allocated inode has never been written to disk, then
10278	 * the on-disk inode is zero'ed and we can remove the file
	 * immediately.  When journaling, if the inode has been marked
	 * unlinked and is not DEPCOMPLETE, we know it can never be written.
10281	 */
10282	inodedep_lookup(mp, oldinum, 0, &inodedep);
10283	if (inodedep == NULL ||
10284	    (inodedep->id_state & (DEPCOMPLETE | UNLINKED)) == UNLINKED ||
10285	    check_inode_unwritten(inodedep)) {
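		/*
		 * Drop the parent's ".." reference right away: the dirrem
		 * was retargeted at the parent above and RMDIR is clear,
		 * so the recursive call takes the plain-file path.
		 */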
10286		FREE_LOCK(ump);
10287		vput(vp);
		return (handle_workitem_remove(dirrem, flags));
10289	}
10290	WORKLIST_INSERT(&inodedep->id_inowait, &dirrem->dm_list);
10291	FREE_LOCK(ump);
10292	UFS_INODE_SET_FLAG(ip, IN_CHANGE);
10293out:
10294	ffs_update(vp, 0);
10295	vput(vp);
10296	return (0);
10297}
10298
10299/*
10300 * Inode de-allocation dependencies.
10301 *
10302 * When an inode's link count is reduced to zero, it can be de-allocated. We
10303 * found it convenient to postpone de-allocation until after the inode is
10304 * written to disk with its new link count (zero).  At this point, all of the
10305 * on-disk inode's block pointers are nullified and, with careful dependency
10306 * list ordering, all dependencies related to the inode will be satisfied and
10307 * the corresponding dependency structures de-allocated.  So, if/when the
10308 * inode is reused, there will be no mixing of old dependencies with new
10309 * ones.  This artificial dependency is set up by the block de-allocation
10310 * procedure above (softdep_setup_freeblocks) and completed by the
10311 * following procedure.
10312 */
10313static void
10314handle_workitem_freefile(freefile)
10315	struct freefile *freefile;
10316{
10317	struct workhead wkhd;
10318	struct fs *fs;
10319	struct ufsmount *ump;
10320	int error;
10321#ifdef INVARIANTS
10322	struct inodedep *idp;
10323#endif
10324
10325	ump = VFSTOUFS(freefile->fx_list.wk_mp);
10326	fs = ump->um_fs;
10327#ifdef INVARIANTS
10328	ACQUIRE_LOCK(ump);
10329	error = inodedep_lookup(UFSTOVFS(ump), freefile->fx_oldinum, 0, &idp);
10330	FREE_LOCK(ump);
10331	if (error)
10332		panic("handle_workitem_freefile: inodedep %p survived", idp);
10333#endif
10334	UFS_LOCK(ump);
10335	fs->fs_pendinginodes -= 1;
10336	UFS_UNLOCK(ump);
10337	LIST_INIT(&wkhd);
10338	LIST_SWAP(&freefile->fx_jwork, &wkhd, worklist, wk_list);
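	/*
	 * Pass the freefile's journal work to ffs_freefile() along with
	 * the bitmap update so that it is not released before the
	 * updated inode bitmap reaches the disk.
	 */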
10339	if ((error = ffs_freefile(ump, fs, freefile->fx_devvp,
10340	    freefile->fx_oldinum, freefile->fx_mode, &wkhd)) != 0)
10341		softdep_error("handle_workitem_freefile", error);
10342	ACQUIRE_LOCK(ump);
10343	WORKITEM_FREE(freefile, D_FREEFILE);
10344	FREE_LOCK(ump);
10345}
10346
10347/*
 * Helper function that unlinks the marker element from the work list and
 * returns the next element on the list.
10350 */
10351static __inline struct worklist *
10352markernext(struct worklist *marker)
10353{
10354	struct worklist *next;
10355
10356	next = LIST_NEXT(marker, wk_list);
10357	LIST_REMOVE(marker, wk_list);
	return (next);
10359}
10360
10361/*
10362 * Disk writes.
10363 *
10364 * The dependency structures constructed above are most actively used when file
10365 * system blocks are written to disk.  No constraints are placed on when a
10366 * block can be written, but unsatisfied update dependencies are made safe by
10367 * modifying (or replacing) the source memory for the duration of the disk
10368 * write.  When the disk write completes, the memory block is again brought
10369 * up-to-date.
10370 *
10371 * In-core inode structure reclamation.
10372 *
10373 * Because there are a finite number of "in-core" inode structures, they are
10374 * reused regularly.  By transferring all inode-related dependencies to the
10375 * in-memory inode block and indexing them separately (via "inodedep"s), we
10376 * can allow "in-core" inode structures to be reused at any time and avoid
10377 * any increase in contention.
10378 *
10379 * Called just before entering the device driver to initiate a new disk I/O.
 * The buffer must be locked; thus, no I/O completion operations can occur
10381 * while we are manipulating its associated dependencies.
10382 */
10383static void
10384softdep_disk_io_initiation(bp)
10385	struct buf *bp;		/* structure describing disk write to occur */
10386{
10387	struct worklist *wk;
10388	struct worklist marker;
10389	struct inodedep *inodedep;
10390	struct freeblks *freeblks;
10391	struct jblkdep *jblkdep;
10392	struct newblk *newblk;
10393	struct ufsmount *ump;
10394
10395	/*
10396	 * We only care about write operations. There should never
10397	 * be dependencies for reads.
10398	 */
10399	if (bp->b_iocmd != BIO_WRITE)
10400		panic("softdep_disk_io_initiation: not write");
10401
10402	if (bp->b_vflags & BV_BKGRDINPROG)
10403		panic("softdep_disk_io_initiation: Writing buffer with "
10404		    "background write in progress: %p", bp);
10405
10406	ump = softdep_bp_to_mp(bp);
10407	if (ump == NULL)
10408		return;
10409
10410	marker.wk_type = D_LAST + 1;	/* Not a normal workitem */
10411	PHOLD(curproc);			/* Don't swap out kernel stack */
10412	ACQUIRE_LOCK(ump);
10413	/*
10414	 * Do any necessary pre-I/O processing.
10415	 */
10416	for (wk = LIST_FIRST(&bp->b_dep); wk != NULL;
10417	     wk = markernext(&marker)) {
10418		LIST_INSERT_AFTER(wk, &marker, wk_list);
10419		switch (wk->wk_type) {
10420		case D_PAGEDEP:
10421			initiate_write_filepage(WK_PAGEDEP(wk), bp);
10422			continue;
10423
10424		case D_INODEDEP:
10425			inodedep = WK_INODEDEP(wk);
10426			if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC)
10427				initiate_write_inodeblock_ufs1(inodedep, bp);
10428			else
10429				initiate_write_inodeblock_ufs2(inodedep, bp);
10430			continue;
10431
10432		case D_INDIRDEP:
10433			initiate_write_indirdep(WK_INDIRDEP(wk), bp);
10434			continue;
10435
10436		case D_BMSAFEMAP:
10437			initiate_write_bmsafemap(WK_BMSAFEMAP(wk), bp);
10438			continue;
10439
10440		case D_JSEG:
10441			WK_JSEG(wk)->js_buf = NULL;
10442			continue;
10443
10444		case D_FREEBLKS:
10445			freeblks = WK_FREEBLKS(wk);
10446			jblkdep = LIST_FIRST(&freeblks->fb_jblkdephd);
10447			/*
10448			 * We have to wait for the freeblks to be journaled
10449			 * before we can write an inodeblock with updated
10450			 * pointers.  Be careful to arrange the marker so
10451			 * we revisit the freeblks if it's not removed by
10452			 * the first jwait().
10453			 */
10454			if (jblkdep != NULL) {
10455				LIST_REMOVE(&marker, wk_list);
10456				LIST_INSERT_BEFORE(wk, &marker, wk_list);
10457				jwait(&jblkdep->jb_list, MNT_WAIT);
10458			}
10459			continue;
10460		case D_ALLOCDIRECT:
10461		case D_ALLOCINDIR:
10462			/*
10463			 * We have to wait for the jnewblk to be journaled
10464			 * before we can write to a block if the contents
			 * may be confused with an earlier file's indirect
			 * block at recovery time.  Handle the marker as
			 * described above.
10468			 */
10469			newblk = WK_NEWBLK(wk);
10470			if (newblk->nb_jnewblk != NULL &&
10471			    indirblk_lookup(newblk->nb_list.wk_mp,
10472			    newblk->nb_newblkno)) {
10473				LIST_REMOVE(&marker, wk_list);
10474				LIST_INSERT_BEFORE(wk, &marker, wk_list);
10475				jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
10476			}
10477			continue;
10478
10479		case D_SBDEP:
10480			initiate_write_sbdep(WK_SBDEP(wk));
10481			continue;
10482
10483		case D_MKDIR:
10484		case D_FREEWORK:
10485		case D_FREEDEP:
10486		case D_JSEGDEP:
10487			continue;
10488
10489		default:
10490			panic("handle_disk_io_initiation: Unexpected type %s",
10491			    TYPENAME(wk->wk_type));
10492			/* NOTREACHED */
10493		}
10494	}
10495	FREE_LOCK(ump);
10496	PRELE(curproc);			/* Allow swapout of kernel stack */
10497}
10498
10499/*
10500 * Called from within the procedure above to deal with unsatisfied
10501 * allocation dependencies in a directory. The buffer must be locked,
10502 * thus, no I/O completion operations can occur while we are
10503 * manipulating its associated dependencies.
10504 */
10505static void
10506initiate_write_filepage(pagedep, bp)
10507	struct pagedep *pagedep;
10508	struct buf *bp;
10509{
10510	struct jremref *jremref;
10511	struct jmvref *jmvref;
10512	struct dirrem *dirrem;
10513	struct diradd *dap;
10514	struct direct *ep;
10515	int i;
10516
10517	if (pagedep->pd_state & IOSTARTED) {
10518		/*
10519		 * This can only happen if there is a driver that does not
10520		 * understand chaining. Here biodone will reissue the call
10521		 * to strategy for the incomplete buffers.
10522		 */
10523		printf("initiate_write_filepage: already started\n");
10524		return;
10525	}
10526	pagedep->pd_state |= IOSTARTED;
10527	/*
10528	 * Wait for all journal remove dependencies to hit the disk.
	 * We cannot allow any potentially conflicting directory adds
	 * to be visible before removes, and rollback is too difficult.
	 * The per-filesystem lock may be dropped and re-acquired; however,
	 * we hold the buf locked so the dependency cannot go away.
10533	 */
10534	LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next)
10535		while ((jremref = LIST_FIRST(&dirrem->dm_jremrefhd)) != NULL)
10536			jwait(&jremref->jr_list, MNT_WAIT);
10537	while ((jmvref = LIST_FIRST(&pagedep->pd_jmvrefhd)) != NULL)
10538		jwait(&jmvref->jm_list, MNT_WAIT);
10539	for (i = 0; i < DAHASHSZ; i++) {
10540		LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
10541			ep = (struct direct *)
10542			    ((char *)bp->b_data + dap->da_offset);
10543			if (ep->d_ino != dap->da_newinum)
10544				panic("%s: dir inum %ju != new %ju",
10545				    "initiate_write_filepage",
10546				    (uintmax_t)ep->d_ino,
10547				    (uintmax_t)dap->da_newinum);
10548			if (dap->da_state & DIRCHG)
10549				ep->d_ino = dap->da_previous->dm_oldinum;
10550			else
10551				ep->d_ino = 0;
10552			dap->da_state &= ~ATTACHED;
10553			dap->da_state |= UNDONE;
10554		}
10555	}
10556}
10557
10558/*
10559 * Version of initiate_write_inodeblock that handles UFS1 dinodes.
10560 * Note that any bug fixes made to this routine must be done in the
10561 * version found below.
10562 *
10563 * Called from within the procedure above to deal with unsatisfied
10564 * allocation dependencies in an inodeblock. The buffer must be
 * locked; thus, no I/O completion operations can occur while we
10566 * are manipulating its associated dependencies.
10567 */
10568static void
10569initiate_write_inodeblock_ufs1(inodedep, bp)
10570	struct inodedep *inodedep;
10571	struct buf *bp;			/* The inode block */
10572{
10573	struct allocdirect *adp, *lastadp;
10574	struct ufs1_dinode *dp;
10575	struct ufs1_dinode *sip;
10576	struct inoref *inoref;
10577	struct ufsmount *ump;
10578	struct fs *fs;
10579	ufs_lbn_t i;
10580#ifdef INVARIANTS
10581	ufs_lbn_t prevlbn = 0;
10582#endif
10583	int deplist;
10584
10585	if (inodedep->id_state & IOSTARTED)
10586		panic("initiate_write_inodeblock_ufs1: already started");
10587	inodedep->id_state |= IOSTARTED;
10588	fs = inodedep->id_fs;
10589	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10590	LOCK_OWNED(ump);
10591	dp = (struct ufs1_dinode *)bp->b_data +
10592	    ino_to_fsbo(fs, inodedep->id_ino);
10593
10594	/*
10595	 * If we're on the unlinked list but have not yet written our
	 * next pointer, initialize it here.
10597	 */
10598	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10599		struct inodedep *inon;
10600
10601		inon = TAILQ_NEXT(inodedep, id_unlinked);
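		/*
		 * di_freelink chains this inode to the next inode on the
		 * on-disk unlinked list (headed by fs_sujfree) so that
		 * journal recovery can find inodes that were unlinked but
		 * still referenced at the time of a crash.
		 */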
10602		dp->di_freelink = inon ? inon->id_ino : 0;
10603	}
10604	/*
10605	 * If the bitmap is not yet written, then the allocated
10606	 * inode cannot be written to disk.
10607	 */
10608	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10609		if (inodedep->id_savedino1 != NULL)
10610			panic("initiate_write_inodeblock_ufs1: I/O underway");
10611		FREE_LOCK(ump);
10612		sip = malloc(sizeof(struct ufs1_dinode),
10613		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10614		ACQUIRE_LOCK(ump);
10615		inodedep->id_savedino1 = sip;
10616		*inodedep->id_savedino1 = *dp;
10617		bzero((caddr_t)dp, sizeof(struct ufs1_dinode));
10618		dp->di_gen = inodedep->id_savedino1->di_gen;
10619		dp->di_freelink = inodedep->id_savedino1->di_freelink;
10620		return;
10621	}
10622	/*
10623	 * If no dependencies, then there is nothing to roll back.
10624	 */
10625	inodedep->id_savedsize = dp->di_size;
10626	inodedep->id_savedextsize = 0;
10627	inodedep->id_savednlink = dp->di_nlink;
10628	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10629	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10630		return;
10631	/*
10632	 * Revert the link count to that of the first unwritten journal entry.
10633	 */
10634	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10635	if (inoref)
10636		dp->di_nlink = inoref->if_nlink;
10637	/*
10638	 * Set the dependencies to busy.
10639	 */
10640	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10641	     adp = TAILQ_NEXT(adp, ad_next)) {
10642#ifdef INVARIANTS
10643		if (deplist != 0 && prevlbn >= adp->ad_offset)
10644			panic("softdep_write_inodeblock: lbn order");
10645		prevlbn = adp->ad_offset;
10646		if (adp->ad_offset < UFS_NDADDR &&
10647		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10648			panic("initiate_write_inodeblock_ufs1: "
10649			    "direct pointer #%jd mismatch %d != %jd",
10650			    (intmax_t)adp->ad_offset,
10651			    dp->di_db[adp->ad_offset],
10652			    (intmax_t)adp->ad_newblkno);
10653		if (adp->ad_offset >= UFS_NDADDR &&
10654		    dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
10655			panic("initiate_write_inodeblock_ufs1: "
10656			    "indirect pointer #%jd mismatch %d != %jd",
10657			    (intmax_t)adp->ad_offset - UFS_NDADDR,
10658			    dp->di_ib[adp->ad_offset - UFS_NDADDR],
10659			    (intmax_t)adp->ad_newblkno);
10660		deplist |= 1 << adp->ad_offset;
10661		if ((adp->ad_state & ATTACHED) == 0)
10662			panic("initiate_write_inodeblock_ufs1: "
10663			    "Unknown state 0x%x", adp->ad_state);
10664#endif /* INVARIANTS */
10665		adp->ad_state &= ~ATTACHED;
10666		adp->ad_state |= UNDONE;
10667	}
10668	/*
10669	 * The on-disk inode cannot claim to be any larger than the last
10670	 * fragment that has been written. Otherwise, the on-disk inode
10671	 * might have fragments that were not the last block in the file
10672	 * which would corrupt the filesystem.
10673	 */
10674	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10675	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10676		if (adp->ad_offset >= UFS_NDADDR)
10677			break;
10678		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10679		/* keep going until hitting a rollback to a frag */
10680		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10681			continue;
10682		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10683		for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
10684#ifdef INVARIANTS
10685			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10686				panic("initiate_write_inodeblock_ufs1: "
10687				    "lost dep1");
10688#endif /* INVARIANTS */
10689			dp->di_db[i] = 0;
10690		}
10691		for (i = 0; i < UFS_NIADDR; i++) {
10692#ifdef INVARIANTS
10693			if (dp->di_ib[i] != 0 &&
10694			    (deplist & ((1 << UFS_NDADDR) << i)) == 0)
10695				panic("initiate_write_inodeblock_ufs1: "
10696				    "lost dep2");
10697#endif /* INVARIANTS */
10698			dp->di_ib[i] = 0;
10699		}
10700		return;
10701	}
10702	/*
10703	 * If we have zero'ed out the last allocated block of the file,
10704	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is full-sized, as
10706	 * we already checked for fragments in the loop above.
10707	 */
10708	if (lastadp != NULL &&
10709	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10710		for (i = lastadp->ad_offset; i >= 0; i--)
10711			if (dp->di_db[i] != 0)
10712				break;
10713		dp->di_size = (i + 1) * fs->fs_bsize;
10714	}
10715	/*
10716	 * The only dependencies are for indirect blocks.
10717	 *
10718	 * The file size for indirect block additions is not guaranteed.
10719	 * Such a guarantee would be non-trivial to achieve. The conventional
10720	 * synchronous write implementation also does not make this guarantee.
10721	 * Fsck should catch and fix discrepancies. Arguably, the file size
10722	 * can be over-estimated without destroying integrity when the file
10723	 * moves into the indirect blocks (i.e., is large). If we want to
10724	 * postpone fsck, we are stuck with this argument.
10725	 */
10726	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10727		dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
10728}
10729
10730/*
10731 * Version of initiate_write_inodeblock that handles UFS2 dinodes.
10732 * Note that any bug fixes made to this routine must be done in the
10733 * version found above.
10734 *
10735 * Called from within the procedure above to deal with unsatisfied
10736 * allocation dependencies in an inodeblock. The buffer must be
 * locked; thus, no I/O completion operations can occur while we
10738 * are manipulating its associated dependencies.
10739 */
10740static void
10741initiate_write_inodeblock_ufs2(inodedep, bp)
10742	struct inodedep *inodedep;
10743	struct buf *bp;			/* The inode block */
10744{
10745	struct allocdirect *adp, *lastadp;
10746	struct ufs2_dinode *dp;
10747	struct ufs2_dinode *sip;
10748	struct inoref *inoref;
10749	struct ufsmount *ump;
10750	struct fs *fs;
10751	ufs_lbn_t i;
10752#ifdef INVARIANTS
10753	ufs_lbn_t prevlbn = 0;
10754#endif
10755	int deplist;
10756
10757	if (inodedep->id_state & IOSTARTED)
10758		panic("initiate_write_inodeblock_ufs2: already started");
10759	inodedep->id_state |= IOSTARTED;
10760	fs = inodedep->id_fs;
10761	ump = VFSTOUFS(inodedep->id_list.wk_mp);
10762	LOCK_OWNED(ump);
10763	dp = (struct ufs2_dinode *)bp->b_data +
10764	    ino_to_fsbo(fs, inodedep->id_ino);
10765
10766	/*
10767	 * If we're on the unlinked list but have not yet written our
	 * next pointer, initialize it here.
10769	 */
10770	if ((inodedep->id_state & (UNLINKED | UNLINKNEXT)) == UNLINKED) {
10771		struct inodedep *inon;
10772
10773		inon = TAILQ_NEXT(inodedep, id_unlinked);
10774		dp->di_freelink = inon ? inon->id_ino : 0;
10775		ffs_update_dinode_ckhash(fs, dp);
10776	}
10777	/*
10778	 * If the bitmap is not yet written, then the allocated
10779	 * inode cannot be written to disk.
10780	 */
10781	if ((inodedep->id_state & DEPCOMPLETE) == 0) {
10782		if (inodedep->id_savedino2 != NULL)
10783			panic("initiate_write_inodeblock_ufs2: I/O underway");
10784		FREE_LOCK(ump);
10785		sip = malloc(sizeof(struct ufs2_dinode),
10786		    M_SAVEDINO, M_SOFTDEP_FLAGS);
10787		ACQUIRE_LOCK(ump);
10788		inodedep->id_savedino2 = sip;
10789		*inodedep->id_savedino2 = *dp;
10790		bzero((caddr_t)dp, sizeof(struct ufs2_dinode));
10791		dp->di_gen = inodedep->id_savedino2->di_gen;
10792		dp->di_freelink = inodedep->id_savedino2->di_freelink;
10793		return;
10794	}
10795	/*
10796	 * If no dependencies, then there is nothing to roll back.
10797	 */
10798	inodedep->id_savedsize = dp->di_size;
10799	inodedep->id_savedextsize = dp->di_extsize;
10800	inodedep->id_savednlink = dp->di_nlink;
10801	if (TAILQ_EMPTY(&inodedep->id_inoupdt) &&
10802	    TAILQ_EMPTY(&inodedep->id_extupdt) &&
10803	    TAILQ_EMPTY(&inodedep->id_inoreflst))
10804		return;
10805	/*
10806	 * Revert the link count to that of the first unwritten journal entry.
10807	 */
10808	inoref = TAILQ_FIRST(&inodedep->id_inoreflst);
10809	if (inoref)
10810		dp->di_nlink = inoref->if_nlink;
10811
10812	/*
10813	 * Set the ext data dependencies to busy.
10814	 */
10815	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10816	     adp = TAILQ_NEXT(adp, ad_next)) {
10817#ifdef INVARIANTS
10818		if (deplist != 0 && prevlbn >= adp->ad_offset)
10819			panic("initiate_write_inodeblock_ufs2: lbn order");
10820		prevlbn = adp->ad_offset;
10821		if (dp->di_extb[adp->ad_offset] != adp->ad_newblkno)
10822			panic("initiate_write_inodeblock_ufs2: "
10823			    "ext pointer #%jd mismatch %jd != %jd",
10824			    (intmax_t)adp->ad_offset,
10825			    (intmax_t)dp->di_extb[adp->ad_offset],
10826			    (intmax_t)adp->ad_newblkno);
10827		deplist |= 1 << adp->ad_offset;
10828		if ((adp->ad_state & ATTACHED) == 0)
10829			panic("initiate_write_inodeblock_ufs2: Unknown "
10830			    "state 0x%x", adp->ad_state);
10831#endif /* INVARIANTS */
10832		adp->ad_state &= ~ATTACHED;
10833		adp->ad_state |= UNDONE;
10834	}
10835	/*
10836	 * The on-disk inode cannot claim to be any larger than the last
10837	 * fragment that has been written. Otherwise, the on-disk inode
10838	 * might have fragments that were not the last block in the ext
10839	 * data which would corrupt the filesystem.
10840	 */
10841	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_extupdt); adp;
10842	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10843		dp->di_extb[adp->ad_offset] = adp->ad_oldblkno;
10844		/* keep going until hitting a rollback to a frag */
10845		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10846			continue;
		dp->di_extsize = fs->fs_bsize * adp->ad_offset +
		    adp->ad_oldsize;
10848		for (i = adp->ad_offset + 1; i < UFS_NXADDR; i++) {
10849#ifdef INVARIANTS
10850			if (dp->di_extb[i] != 0 && (deplist & (1 << i)) == 0)
10851				panic("initiate_write_inodeblock_ufs2: "
10852				    "lost dep1");
10853#endif /* INVARIANTS */
10854			dp->di_extb[i] = 0;
10855		}
10856		lastadp = NULL;
10857		break;
10858	}
10859	/*
10860	 * If we have zero'ed out the last allocated block of the ext
10861	 * data, roll back the size to the last currently allocated block.
	 * We know that this last allocated block is full-sized, as
10863	 * we already checked for fragments in the loop above.
10864	 */
10865	if (lastadp != NULL &&
10866	    dp->di_extsize <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10867		for (i = lastadp->ad_offset; i >= 0; i--)
10868			if (dp->di_extb[i] != 0)
10869				break;
10870		dp->di_extsize = (i + 1) * fs->fs_bsize;
10871	}
10872	/*
10873	 * Set the file data dependencies to busy.
10874	 */
10875	for (deplist = 0, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10876	     adp = TAILQ_NEXT(adp, ad_next)) {
10877#ifdef INVARIANTS
10878		if (deplist != 0 && prevlbn >= adp->ad_offset)
10879			panic("softdep_write_inodeblock: lbn order");
10880		if ((adp->ad_state & ATTACHED) == 0)
10881			panic("inodedep %p and adp %p not attached", inodedep, adp);
10882		prevlbn = adp->ad_offset;
10883		if (!ffs_fsfail_cleanup(ump, 0) &&
10884		    adp->ad_offset < UFS_NDADDR &&
10885		    dp->di_db[adp->ad_offset] != adp->ad_newblkno)
10886			panic("initiate_write_inodeblock_ufs2: "
10887			    "direct pointer #%jd mismatch %jd != %jd",
10888			    (intmax_t)adp->ad_offset,
10889			    (intmax_t)dp->di_db[adp->ad_offset],
10890			    (intmax_t)adp->ad_newblkno);
10891		if (!ffs_fsfail_cleanup(ump, 0) &&
10892		    adp->ad_offset >= UFS_NDADDR &&
10893		    dp->di_ib[adp->ad_offset - UFS_NDADDR] != adp->ad_newblkno)
10894			panic("initiate_write_inodeblock_ufs2: "
10895			    "indirect pointer #%jd mismatch %jd != %jd",
10896			    (intmax_t)adp->ad_offset - UFS_NDADDR,
10897			    (intmax_t)dp->di_ib[adp->ad_offset - UFS_NDADDR],
10898			    (intmax_t)adp->ad_newblkno);
10899		deplist |= 1 << adp->ad_offset;
10900		if ((adp->ad_state & ATTACHED) == 0)
10901			panic("initiate_write_inodeblock_ufs2: Unknown "
10902			     "state 0x%x", adp->ad_state);
10903#endif /* INVARIANTS */
10904		adp->ad_state &= ~ATTACHED;
10905		adp->ad_state |= UNDONE;
10906	}
10907	/*
10908	 * The on-disk inode cannot claim to be any larger than the last
10909	 * fragment that has been written. Otherwise, the on-disk inode
10910	 * might have fragments that were not the last block in the file
10911	 * which would corrupt the filesystem.
10912	 */
10913	for (lastadp = NULL, adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp;
10914	     lastadp = adp, adp = TAILQ_NEXT(adp, ad_next)) {
10915		if (adp->ad_offset >= UFS_NDADDR)
10916			break;
10917		dp->di_db[adp->ad_offset] = adp->ad_oldblkno;
10918		/* keep going until hitting a rollback to a frag */
10919		if (adp->ad_oldsize == 0 || adp->ad_oldsize == fs->fs_bsize)
10920			continue;
10921		dp->di_size = fs->fs_bsize * adp->ad_offset + adp->ad_oldsize;
10922		for (i = adp->ad_offset + 1; i < UFS_NDADDR; i++) {
10923#ifdef INVARIANTS
10924			if (dp->di_db[i] != 0 && (deplist & (1 << i)) == 0)
10925				panic("initiate_write_inodeblock_ufs2: "
10926				    "lost dep2");
10927#endif /* INVARIANTS */
10928			dp->di_db[i] = 0;
10929		}
10930		for (i = 0; i < UFS_NIADDR; i++) {
10931#ifdef INVARIANTS
10932			if (dp->di_ib[i] != 0 &&
10933			    (deplist & ((1 << UFS_NDADDR) << i)) == 0)
10934				panic("initiate_write_inodeblock_ufs2: "
10935				    "lost dep3");
10936#endif /* INVARIANTS */
10937			dp->di_ib[i] = 0;
10938		}
10939		ffs_update_dinode_ckhash(fs, dp);
10940		return;
10941	}
10942	/*
10943	 * If we have zero'ed out the last allocated block of the file,
10944	 * roll back the size to the last currently allocated block.
	 * We know that this last allocated block is full-sized, as
10946	 * we already checked for fragments in the loop above.
10947	 */
10948	if (lastadp != NULL &&
10949	    dp->di_size <= (lastadp->ad_offset + 1) * fs->fs_bsize) {
10950		for (i = lastadp->ad_offset; i >= 0; i--)
10951			if (dp->di_db[i] != 0)
10952				break;
10953		dp->di_size = (i + 1) * fs->fs_bsize;
10954	}
10955	/*
10956	 * The only dependencies are for indirect blocks.
10957	 *
10958	 * The file size for indirect block additions is not guaranteed.
10959	 * Such a guarantee would be non-trivial to achieve. The conventional
10960	 * synchronous write implementation also does not make this guarantee.
10961	 * Fsck should catch and fix discrepancies. Arguably, the file size
10962	 * can be over-estimated without destroying integrity when the file
10963	 * moves into the indirect blocks (i.e., is large). If we want to
10964	 * postpone fsck, we are stuck with this argument.
10965	 */
10966	for (; adp; adp = TAILQ_NEXT(adp, ad_next))
10967		dp->di_ib[adp->ad_offset - UFS_NDADDR] = 0;
10968	ffs_update_dinode_ckhash(fs, dp);
10969}
10970
10971/*
10972 * Cancel an indirdep as a result of truncation.  Release all of the
10973 * children allocindirs and place their journal work on the appropriate
10974 * list.
10975 */
10976static void
10977cancel_indirdep(indirdep, bp, freeblks)
10978	struct indirdep *indirdep;
10979	struct buf *bp;
10980	struct freeblks *freeblks;
10981{
10982	struct allocindir *aip;
10983
10984	/*
10985	 * None of the indirect pointers will ever be visible,
10986	 * so they can simply be tossed. GOINGAWAY ensures
10987	 * that allocated pointers will be saved in the buffer
10988	 * cache until they are freed. Note that they will
10989	 * only be able to be found by their physical address
10990	 * since the inode mapping the logical address will
10991	 * be gone. The save buffer used for the safe copy
10992	 * was allocated in setup_allocindir_phase2 using
10993	 * the physical address so it could be used for this
10994	 * purpose. Hence we swap the safe copy with the real
10995	 * copy, allowing the safe copy to be freed and holding
10996	 * on to the real copy for later use in indir_trunc.
10997	 */
10998	if (indirdep->ir_state & GOINGAWAY)
10999		panic("cancel_indirdep: already gone");
11000	if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
11001		indirdep->ir_state |= DEPCOMPLETE;
11002		LIST_REMOVE(indirdep, ir_next);
11003	}
11004	indirdep->ir_state |= GOINGAWAY;
11005	/*
	 * Pass in bp for blocks that still have journal writes
	 * pending so we can cancel them on their own.
11008	 */
11009	while ((aip = LIST_FIRST(&indirdep->ir_deplisthd)) != NULL)
11010		cancel_allocindir(aip, bp, freeblks, 0);
11011	while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL)
11012		cancel_allocindir(aip, NULL, freeblks, 0);
11013	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL)
11014		cancel_allocindir(aip, NULL, freeblks, 0);
11015	while ((aip = LIST_FIRST(&indirdep->ir_completehd)) != NULL)
11016		cancel_allocindir(aip, NULL, freeblks, 0);
11017	/*
11018	 * If there are pending partial truncations we need to keep the
11019	 * old block copy around until they complete.  This is because
11020	 * the current b_data is not a perfect superset of the available
11021	 * blocks.
11022	 */
11023	if (TAILQ_EMPTY(&indirdep->ir_trunc))
11024		bcopy(bp->b_data, indirdep->ir_savebp->b_data, bp->b_bcount);
11025	else
11026		bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
11027	WORKLIST_REMOVE(&indirdep->ir_list);
11028	WORKLIST_INSERT(&indirdep->ir_savebp->b_dep, &indirdep->ir_list);
11029	indirdep->ir_bp = NULL;
11030	indirdep->ir_freeblks = freeblks;
11031}
11032
11033/*
11034 * Free an indirdep once it no longer has new pointers to track.
11035 */
11036static void
11037free_indirdep(indirdep)
11038	struct indirdep *indirdep;
11039{
11040
11041	KASSERT(TAILQ_EMPTY(&indirdep->ir_trunc),
11042	    ("free_indirdep: Indir trunc list not empty."));
11043	KASSERT(LIST_EMPTY(&indirdep->ir_completehd),
11044	    ("free_indirdep: Complete head not empty."));
11045	KASSERT(LIST_EMPTY(&indirdep->ir_writehd),
11046	    ("free_indirdep: write head not empty."));
11047	KASSERT(LIST_EMPTY(&indirdep->ir_donehd),
11048	    ("free_indirdep: done head not empty."));
11049	KASSERT(LIST_EMPTY(&indirdep->ir_deplisthd),
11050	    ("free_indirdep: deplist head not empty."));
11051	KASSERT((indirdep->ir_state & DEPCOMPLETE),
11052	    ("free_indirdep: %p still on newblk list.", indirdep));
11053	KASSERT(indirdep->ir_saveddata == NULL,
11054	    ("free_indirdep: %p still has saved data.", indirdep));
11055	KASSERT(indirdep->ir_savebp == NULL,
11056	    ("free_indirdep: %p still has savebp buffer.", indirdep));
11057	if (indirdep->ir_state & ONWORKLIST)
11058		WORKLIST_REMOVE(&indirdep->ir_list);
11059	WORKITEM_FREE(indirdep, D_INDIRDEP);
11060}
11061
11062/*
11063 * Called before a write to an indirdep.  This routine is responsible for
11064 * rolling back pointers to a safe state which includes only those
11065 * allocindirs which have been completed.
11066 */
11067static void
11068initiate_write_indirdep(indirdep, bp)
11069	struct indirdep *indirdep;
11070	struct buf *bp;
11071{
11072	struct ufsmount *ump;
11073
11074	indirdep->ir_state |= IOSTARTED;
11075	if (indirdep->ir_state & GOINGAWAY)
11076		panic("disk_io_initiation: indirdep gone");
11077	/*
11078	 * If there are no remaining dependencies, this will be writing
11079	 * the real pointers.
11080	 */
11081	if (LIST_EMPTY(&indirdep->ir_deplisthd) &&
11082	    TAILQ_EMPTY(&indirdep->ir_trunc))
11083		return;
11084	/*
11085	 * Replace up-to-date version with safe version.
11086	 */
11087	if (indirdep->ir_saveddata == NULL) {
11088		ump = VFSTOUFS(indirdep->ir_list.wk_mp);
11089		LOCK_OWNED(ump);
11090		FREE_LOCK(ump);
11091		indirdep->ir_saveddata = malloc(bp->b_bcount, M_INDIRDEP,
11092		    M_SOFTDEP_FLAGS);
11093		ACQUIRE_LOCK(ump);
11094	}
11095	indirdep->ir_state &= ~ATTACHED;
11096	indirdep->ir_state |= UNDONE;
11097	bcopy(bp->b_data, indirdep->ir_saveddata, bp->b_bcount);
11098	bcopy(indirdep->ir_savebp->b_data, bp->b_data,
11099	    bp->b_bcount);
11100}
11101
11102/*
11103 * Called when an inode has been cleared in a cg bitmap.  This finally
11104 * eliminates any canceled jaddrefs
11105 */
11106void
11107softdep_setup_inofree(mp, bp, ino, wkhd)
11108	struct mount *mp;
11109	struct buf *bp;
11110	ino_t ino;
11111	struct workhead *wkhd;
11112{
11113	struct worklist *wk, *wkn;
11114	struct inodedep *inodedep;
11115	struct ufsmount *ump;
11116	uint8_t *inosused;
11117	struct cg *cgp;
11118	struct fs *fs;
11119
11120	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
11121	    ("softdep_setup_inofree called on non-softdep filesystem"));
11122	ump = VFSTOUFS(mp);
11123	ACQUIRE_LOCK(ump);
11124	if (!ffs_fsfail_cleanup(ump, 0)) {
11125		fs = ump->um_fs;
11126		cgp = (struct cg *)bp->b_data;
11127		inosused = cg_inosused(cgp);
11128		if (isset(inosused, ino % fs->fs_ipg))
11129			panic("softdep_setup_inofree: inode %ju not freed.",
11130			    (uintmax_t)ino);
11131	}
11132	if (inodedep_lookup(mp, ino, 0, &inodedep))
11133		panic("softdep_setup_inofree: ino %ju has existing inodedep %p",
11134		    (uintmax_t)ino, inodedep);
11135	if (wkhd) {
11136		LIST_FOREACH_SAFE(wk, wkhd, wk_list, wkn) {
11137			if (wk->wk_type != D_JADDREF)
11138				continue;
11139			WORKLIST_REMOVE(wk);
11140			/*
			 * We can free the jaddref immediately even if it
			 * isn't attached in a background write, as the
			 * bitmaps are now reconciled.
11144			 */
11145			wk->wk_state |= COMPLETE | ATTACHED;
11146			free_jaddref(WK_JADDREF(wk));
11147		}
11148		jwork_move(&bp->b_dep, wkhd);
11149	}
11150	FREE_LOCK(ump);
11151}
11152
11153/*
11154 * Called via ffs_blkfree() after a set of frags has been cleared from a cg
11155 * map.  Any dependencies waiting for the write to clear are added to the
11156 * buf's list and any jnewblks that are being canceled are discarded
11157 * immediately.
11158 */
11159void
11160softdep_setup_blkfree(mp, bp, blkno, frags, wkhd)
11161	struct mount *mp;
11162	struct buf *bp;
11163	ufs2_daddr_t blkno;
11164	int frags;
11165	struct workhead *wkhd;
11166{
11167	struct bmsafemap *bmsafemap;
11168	struct jnewblk *jnewblk;
11169	struct ufsmount *ump;
11170	struct worklist *wk;
11171	struct fs *fs;
11172#ifdef INVARIANTS
11173	uint8_t *blksfree;
11174	struct cg *cgp;
11175	ufs2_daddr_t jstart;
11176	ufs2_daddr_t jend;
11177	ufs2_daddr_t end;
11178	long bno;
11179	int i;
11180#endif
11181
11182	CTR3(KTR_SUJ,
11183	    "softdep_setup_blkfree: blkno %jd frags %d wk head %p",
11184	    blkno, frags, wkhd);
11185
11186	ump = VFSTOUFS(mp);
11187	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
11188	    ("softdep_setup_blkfree called on non-softdep filesystem"));
11189	ACQUIRE_LOCK(ump);
11190	/* Lookup the bmsafemap so we track when it is dirty. */
11191	fs = ump->um_fs;
11192	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
11193	/*
11194	 * Detach any jnewblks which have been canceled.  They must linger
11195	 * until the bitmap is cleared again by ffs_blkfree() to prevent
11196	 * an unjournaled allocation from hitting the disk.
11197	 */
11198	if (wkhd) {
11199		while ((wk = LIST_FIRST(wkhd)) != NULL) {
11200			CTR2(KTR_SUJ,
11201			    "softdep_setup_blkfree: blkno %jd wk type %d",
11202			    blkno, wk->wk_type);
11203			WORKLIST_REMOVE(wk);
11204			if (wk->wk_type != D_JNEWBLK) {
11205				WORKLIST_INSERT(&bmsafemap->sm_freehd, wk);
11206				continue;
11207			}
11208			jnewblk = WK_JNEWBLK(wk);
11209			KASSERT(jnewblk->jn_state & GOINGAWAY,
11210			    ("softdep_setup_blkfree: jnewblk not canceled."));
11211#ifdef INVARIANTS
11212			/*
11213			 * Assert that this block is free in the bitmap
11214			 * before we discard the jnewblk.
11215			 */
11216			cgp = (struct cg *)bp->b_data;
11217			blksfree = cg_blksfree(cgp);
11218			bno = dtogd(fs, jnewblk->jn_blkno);
11219			for (i = jnewblk->jn_oldfrags;
11220			    i < jnewblk->jn_frags; i++) {
11221				if (isset(blksfree, bno + i))
11222					continue;
11223				panic("softdep_setup_blkfree: not free");
11224			}
11225#endif
11226			/*
			 * Even if it's not attached, we can free it
			 * immediately as the new bitmap is correct.
11229			 */
11230			wk->wk_state |= COMPLETE | ATTACHED;
11231			free_jnewblk(jnewblk);
11232		}
11233	}
11234
11235#ifdef INVARIANTS
11236	/*
11237	 * Assert that we are not freeing a block which has an outstanding
11238	 * allocation dependency.
11239	 */
11240	fs = VFSTOUFS(mp)->um_fs;
11241	bmsafemap = bmsafemap_lookup(mp, bp, dtog(fs, blkno), NULL);
11242	end = blkno + frags;
11243	LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
11244		/*
11245		 * Don't match against blocks that will be freed when the
11246		 * background write is done.
11247		 */
11248		if ((jnewblk->jn_state & (ATTACHED | COMPLETE | DEPCOMPLETE)) ==
11249		    (COMPLETE | DEPCOMPLETE))
11250			continue;
11251		jstart = jnewblk->jn_blkno + jnewblk->jn_oldfrags;
11252		jend = jnewblk->jn_blkno + jnewblk->jn_frags;
11253		if ((blkno >= jstart && blkno < jend) ||
11254		    (end > jstart && end <= jend)) {
11255			printf("state 0x%X %jd - %d %d dep %p\n",
11256			    jnewblk->jn_state, jnewblk->jn_blkno,
11257			    jnewblk->jn_oldfrags, jnewblk->jn_frags,
11258			    jnewblk->jn_dep);
11259			panic("softdep_setup_blkfree: "
11260			    "%jd-%jd(%d) overlaps with %jd-%jd",
11261			    blkno, end, frags, jstart, jend);
11262		}
11263	}
11264#endif
11265	FREE_LOCK(ump);
11266}
11267
11268/*
11269 * Revert a block allocation when the journal record that describes it
11270 * is not yet written.
11271 */
11272static int
11273jnewblk_rollback(jnewblk, fs, cgp, blksfree)
11274	struct jnewblk *jnewblk;
11275	struct fs *fs;
11276	struct cg *cgp;
11277	uint8_t *blksfree;
11278{
11279	ufs1_daddr_t fragno;
11280	long cgbno, bbase;
11281	int frags, blk;
11282	int i;
11283
11284	frags = 0;
11285	cgbno = dtogd(fs, jnewblk->jn_blkno);
11286	/*
11287	 * We have to test which frags need to be rolled back.  We may
11288	 * be operating on a stale copy when doing background writes.
11289	 */
11290	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++)
11291		if (isclr(blksfree, cgbno + i))
11292			frags++;
11293	if (frags == 0)
11294		return (0);
11295	/*
11296	 * This is mostly ffs_blkfree() sans some validation and
11297	 * superblock updates.
11298	 */
11299	if (frags == fs->fs_frag) {
11300		fragno = fragstoblks(fs, cgbno);
11301		ffs_setblock(fs, blksfree, fragno);
11302		ffs_clusteracct(fs, cgp, fragno, 1);
11303		cgp->cg_cs.cs_nbfree++;
11304	} else {
11305		cgbno += jnewblk->jn_oldfrags;
11306		bbase = cgbno - fragnum(fs, cgbno);
11307		/* Decrement the old frags.  */
11308		blk = blkmap(fs, blksfree, bbase);
11309		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
11310		/* Deallocate the fragment */
11311		for (i = 0; i < frags; i++)
11312			setbit(blksfree, cgbno + i);
11313		cgp->cg_cs.cs_nffree += frags;
11314		/* Add back in counts associated with the new frags */
11315		blk = blkmap(fs, blksfree, bbase);
11316		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
11317		/* If a complete block has been reassembled, account for it. */
11318		fragno = fragstoblks(fs, bbase);
11319		if (ffs_isblock(fs, blksfree, fragno)) {
11320			cgp->cg_cs.cs_nffree -= fs->fs_frag;
11321			ffs_clusteracct(fs, cgp, fragno, 1);
11322			cgp->cg_cs.cs_nbfree++;
11323		}
11324	}
11325	stat_jnewblk++;
11326	jnewblk->jn_state &= ~ATTACHED;
11327	jnewblk->jn_state |= UNDONE;
11328
11329	return (frags);
11330}
11331
11332static void
11333initiate_write_bmsafemap(bmsafemap, bp)
11334	struct bmsafemap *bmsafemap;
11335	struct buf *bp;			/* The cg block. */
11336{
11337	struct jaddref *jaddref;
11338	struct jnewblk *jnewblk;
11339	uint8_t *inosused;
11340	uint8_t *blksfree;
11341	struct cg *cgp;
11342	struct fs *fs;
11343	ino_t ino;
11344
11345	/*
11346	 * If this is a background write, we did this at the time that
11347	 * the copy was made, so do not need to do it again.
11348	 */
11349	if (bmsafemap->sm_state & IOSTARTED)
11350		return;
11351	bmsafemap->sm_state |= IOSTARTED;
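	/*
	 * The cg bitmap must never reach the disk showing an allocation
	 * whose journal record has not yet been written, so any such
	 * pending allocations are temporarily rolled back here.
	 */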
11352	/*
11353	 * Clear any inode allocations which are pending journal writes.
11354	 */
11355	if (LIST_FIRST(&bmsafemap->sm_jaddrefhd) != NULL) {
11356		cgp = (struct cg *)bp->b_data;
11357		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11358		inosused = cg_inosused(cgp);
11359		LIST_FOREACH(jaddref, &bmsafemap->sm_jaddrefhd, ja_bmdeps) {
11360			ino = jaddref->ja_ino % fs->fs_ipg;
11361			if (isset(inosused, ino)) {
11362				if ((jaddref->ja_mode & IFMT) == IFDIR)
11363					cgp->cg_cs.cs_ndir--;
11364				cgp->cg_cs.cs_nifree++;
11365				clrbit(inosused, ino);
11366				jaddref->ja_state &= ~ATTACHED;
11367				jaddref->ja_state |= UNDONE;
11368				stat_jaddref++;
11369			} else
11370				panic("initiate_write_bmsafemap: inode %ju "
11371				    "marked free", (uintmax_t)jaddref->ja_ino);
11372		}
11373	}
11374	/*
11375	 * Clear any block allocations which are pending journal writes.
11376	 */
11377	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
11378		cgp = (struct cg *)bp->b_data;
11379		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
11380		blksfree = cg_blksfree(cgp);
11381		LIST_FOREACH(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps) {
11382			if (jnewblk_rollback(jnewblk, fs, cgp, blksfree))
11383				continue;
11384			panic("initiate_write_bmsafemap: block %jd "
11385			    "marked free", jnewblk->jn_blkno);
11386		}
11387	}
11388	/*
11389	 * Move allocation lists to the written lists so they can be
11390	 * cleared once the block write is complete.
11391	 */
11392	LIST_SWAP(&bmsafemap->sm_inodedephd, &bmsafemap->sm_inodedepwr,
11393	    inodedep, id_deps);
11394	LIST_SWAP(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
11395	    newblk, nb_deps);
11396	LIST_SWAP(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr, worklist,
11397	    wk_list);
11398}
11399
11400void
11401softdep_handle_error(struct buf *bp)
11402{
11403	struct ufsmount *ump;
11404
11405	ump = softdep_bp_to_mp(bp);
11406	if (ump == NULL)
11407		return;
11408
11409	if (ffs_fsfail_cleanup(ump, bp->b_error)) {
11410		/*
11411		 * No future writes will succeed, so the on-disk image is safe.
11412		 * Pretend that this write succeeded so that the softdep state
11413		 * will be cleaned up naturally.
11414		 */
11415		bp->b_ioflags &= ~BIO_ERROR;
11416		bp->b_error = 0;
11417	}
11418}
11419
11420/*
11421 * This routine is called during the completion interrupt
11422 * service routine for a disk write (from the procedure called
11423 * by the device driver to inform the filesystem caches of
11424 * a request completion).  It should be called early in this
11425 * procedure, before the block is made available to other
11426 * processes or other routines are called.
11427 *
11428 */
11429static void
11430softdep_disk_write_complete(bp)
11431	struct buf *bp;		/* describes the completed disk write */
11432{
11433	struct worklist *wk;
11434	struct worklist *owk;
11435	struct ufsmount *ump;
11436	struct workhead reattach;
11437	struct freeblks *freeblks;
11438	struct buf *sbp;
11439
11440	ump = softdep_bp_to_mp(bp);
11441	KASSERT(LIST_EMPTY(&bp->b_dep) || ump != NULL,
11442	    ("softdep_disk_write_complete: softdep_bp_to_mp returned NULL "
11443	     "with outstanding dependencies for buffer %p", bp));
11444	if (ump == NULL)
11445		return;
11446	if ((bp->b_ioflags & BIO_ERROR) != 0)
11447		softdep_handle_error(bp);
11448	/*
11449	 * If an error occurred while doing the write, then the data
11450	 * has not hit the disk and the dependencies cannot be processed.
11451	 * But we do have to go through and roll forward any dependencies
11452	 * that were rolled back before the disk write.
11453	 */
11454	sbp = NULL;
11455	ACQUIRE_LOCK(ump);
11456	if ((bp->b_ioflags & BIO_ERROR) != 0 && (bp->b_flags & B_INVAL) == 0) {
11457		LIST_FOREACH(wk, &bp->b_dep, wk_list) {
11458			switch (wk->wk_type) {
11459			case D_PAGEDEP:
11460				handle_written_filepage(WK_PAGEDEP(wk), bp, 0);
11461				continue;
11462
11463			case D_INODEDEP:
11464				handle_written_inodeblock(WK_INODEDEP(wk),
11465				    bp, 0);
11466				continue;
11467
11468			case D_BMSAFEMAP:
11469				handle_written_bmsafemap(WK_BMSAFEMAP(wk),
11470				    bp, 0);
11471				continue;
11472
11473			case D_INDIRDEP:
11474				handle_written_indirdep(WK_INDIRDEP(wk),
11475				    bp, &sbp, 0);
11476				continue;
11477			default:
11478				/* nothing to roll forward */
11479				continue;
11480			}
11481		}
11482		FREE_LOCK(ump);
11483		if (sbp)
11484			brelse(sbp);
11485		return;
11486	}
11487	LIST_INIT(&reattach);
11488
11489	/*
11490	 * Ump SU lock must not be released anywhere in this code segment.
11491	 */
11492	owk = NULL;
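	/*
	 * owk remembers the workitem just processed; seeing it again at
	 * the head of b_dep would mean it was re-inserted without making
	 * progress, so panic rather than loop forever.
	 */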
11493	while ((wk = LIST_FIRST(&bp->b_dep)) != NULL) {
11494		WORKLIST_REMOVE(wk);
11495		atomic_add_long(&dep_write[wk->wk_type], 1);
11496		if (wk == owk)
11497			panic("duplicate worklist: %p\n", wk);
11498		owk = wk;
11499		switch (wk->wk_type) {
11500		case D_PAGEDEP:
11501			if (handle_written_filepage(WK_PAGEDEP(wk), bp,
11502			    WRITESUCCEEDED))
11503				WORKLIST_INSERT(&reattach, wk);
11504			continue;
11505
11506		case D_INODEDEP:
11507			if (handle_written_inodeblock(WK_INODEDEP(wk), bp,
11508			    WRITESUCCEEDED))
11509				WORKLIST_INSERT(&reattach, wk);
11510			continue;
11511
11512		case D_BMSAFEMAP:
11513			if (handle_written_bmsafemap(WK_BMSAFEMAP(wk), bp,
11514			    WRITESUCCEEDED))
11515				WORKLIST_INSERT(&reattach, wk);
11516			continue;
11517
11518		case D_MKDIR:
11519			handle_written_mkdir(WK_MKDIR(wk), MKDIR_BODY);
11520			continue;
11521
11522		case D_ALLOCDIRECT:
11523			wk->wk_state |= COMPLETE;
11524			handle_allocdirect_partdone(WK_ALLOCDIRECT(wk), NULL);
11525			continue;
11526
11527		case D_ALLOCINDIR:
11528			wk->wk_state |= COMPLETE;
11529			handle_allocindir_partdone(WK_ALLOCINDIR(wk));
11530			continue;
11531
11532		case D_INDIRDEP:
11533			if (handle_written_indirdep(WK_INDIRDEP(wk), bp, &sbp,
11534			    WRITESUCCEEDED))
11535				WORKLIST_INSERT(&reattach, wk);
11536			continue;
11537
11538		case D_FREEBLKS:
11539			wk->wk_state |= COMPLETE;
11540			freeblks = WK_FREEBLKS(wk);
11541			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE &&
11542			    LIST_EMPTY(&freeblks->fb_jblkdephd))
11543				add_to_worklist(wk, WK_NODELAY);
11544			continue;
11545
11546		case D_FREEWORK:
11547			handle_written_freework(WK_FREEWORK(wk));
11548			break;
11549
11550		case D_JSEGDEP:
11551			free_jsegdep(WK_JSEGDEP(wk));
11552			continue;
11553
11554		case D_JSEG:
11555			handle_written_jseg(WK_JSEG(wk), bp);
11556			continue;
11557
11558		case D_SBDEP:
11559			if (handle_written_sbdep(WK_SBDEP(wk), bp))
11560				WORKLIST_INSERT(&reattach, wk);
11561			continue;
11562
11563		case D_FREEDEP:
11564			free_freedep(WK_FREEDEP(wk));
11565			continue;
11566
11567		default:
11568			panic("handle_disk_write_complete: Unknown type %s",
11569			    TYPENAME(wk->wk_type));
11570			/* NOTREACHED */
11571		}
11572	}
11573	/*
11574	 * Reattach any requests that must be redone.
11575	 */
11576	while ((wk = LIST_FIRST(&reattach)) != NULL) {
11577		WORKLIST_REMOVE(wk);
11578		WORKLIST_INSERT(&bp->b_dep, wk);
11579	}
11580	FREE_LOCK(ump);
11581	if (sbp)
11582		brelse(sbp);
11583}
11584
11585/*
11586 * Called from within softdep_disk_write_complete above.
11587 */
11588static void
11589handle_allocdirect_partdone(adp, wkhd)
11590	struct allocdirect *adp;	/* the completed allocdirect */
	struct workhead *wkhd;		/* Work to do when inode is written. */
11592{
11593	struct allocdirectlst *listhead;
11594	struct allocdirect *listadp;
11595	struct inodedep *inodedep;
11596	long bsize;
11597
11598	LOCK_OWNED(VFSTOUFS(adp->ad_block.nb_list.wk_mp));
11599	if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11600		return;
11601	/*
11602	 * The on-disk inode cannot claim to be any larger than the last
11603	 * fragment that has been written. Otherwise, the on-disk inode
11604	 * might have fragments that were not the last block in the file
11605	 * which would corrupt the filesystem. Thus, we cannot free any
11606	 * allocdirects after one whose ad_oldblkno claims a fragment as
11607	 * these blocks must be rolled back to zero before writing the inode.
11608	 * We check the currently active set of allocdirects in id_inoupdt
11609	 * or id_extupdt as appropriate.
11610	 */
11611	inodedep = adp->ad_inodedep;
11612	bsize = inodedep->id_fs->fs_bsize;
11613	if (adp->ad_state & EXTDATA)
11614		listhead = &inodedep->id_extupdt;
11615	else
11616		listhead = &inodedep->id_inoupdt;
11617	TAILQ_FOREACH(listadp, listhead, ad_next) {
11618		/* found our block */
11619		if (listadp == adp)
11620			break;
		/* continue if the old block is not a fragment */
11622		if (listadp->ad_oldsize == 0 ||
11623		    listadp->ad_oldsize == bsize)
11624			continue;
11625		/* hit a fragment */
11626		return;
11627	}
11628	/*
11629	 * If we have reached the end of the current list without
11630	 * finding the just finished dependency, then it must be
11631	 * on the future dependency list. Future dependencies cannot
11632	 * be freed until they are moved to the current list.
11633	 */
11634	if (listadp == NULL) {
11635#ifdef INVARIANTS
11636		if (adp->ad_state & EXTDATA)
11637			listhead = &inodedep->id_newextupdt;
11638		else
11639			listhead = &inodedep->id_newinoupdt;
11640		TAILQ_FOREACH(listadp, listhead, ad_next)
11641			/* found our block */
11642			if (listadp == adp)
11643				break;
11644		if (listadp == NULL)
11645			panic("handle_allocdirect_partdone: lost dep");
11646#endif /* INVARIANTS */
11647		return;
11648	}
11649	/*
11650	 * If we have found the just finished dependency, then queue
11651	 * it along with anything that follows it that is complete.
11652	 * Since the pointer has not yet been written in the inode
11653	 * as the dependency prevents it, place the allocdirect on the
11654	 * bufwait list where it will be freed once the pointer is
11655	 * valid.
11656	 */
11657	if (wkhd == NULL)
11658		wkhd = &inodedep->id_bufwait;
11659	for (; adp; adp = listadp) {
11660		listadp = TAILQ_NEXT(adp, ad_next);
11661		if ((adp->ad_state & ALLCOMPLETE) != ALLCOMPLETE)
11662			return;
11663		TAILQ_REMOVE(listhead, adp, ad_next);
11664		WORKLIST_INSERT(wkhd, &adp->ad_block.nb_list);
11665	}
11666}
11667
11668/*
11669 * Called from within softdep_disk_write_complete above.  This routine
11670 * completes successfully written allocindirs.
11671 */
11672static void
11673handle_allocindir_partdone(aip)
11674	struct allocindir *aip;		/* the completed allocindir */
11675{
11676	struct indirdep *indirdep;
11677
11678	if ((aip->ai_state & ALLCOMPLETE) != ALLCOMPLETE)
11679		return;
11680	indirdep = aip->ai_indirdep;
11681	LIST_REMOVE(aip, ai_next);
11682	/*
11683	 * Don't set a pointer while the buffer is undergoing IO or while
11684	 * we have active truncations.
11685	 */
11686	if (indirdep->ir_state & UNDONE || !TAILQ_EMPTY(&indirdep->ir_trunc)) {
11687		LIST_INSERT_HEAD(&indirdep->ir_donehd, aip, ai_next);
11688		return;
11689	}
11690	if (indirdep->ir_state & UFS1FMT)
11691		((ufs1_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11692		    aip->ai_newblkno;
11693	else
11694		((ufs2_daddr_t *)indirdep->ir_savebp->b_data)[aip->ai_offset] =
11695		    aip->ai_newblkno;
11696	/*
11697	 * Await the pointer write before freeing the allocindir.
11698	 */
11699	LIST_INSERT_HEAD(&indirdep->ir_writehd, aip, ai_next);
11700}
11701
11702/*
11703 * Release segments held on a jwork list.
11704 */
11705static void
11706handle_jwork(wkhd)
11707	struct workhead *wkhd;
11708{
11709	struct worklist *wk;
11710
11711	while ((wk = LIST_FIRST(wkhd)) != NULL) {
11712		WORKLIST_REMOVE(wk);
11713		switch (wk->wk_type) {
11714		case D_JSEGDEP:
11715			free_jsegdep(WK_JSEGDEP(wk));
11716			continue;
11717		case D_FREEDEP:
11718			free_freedep(WK_FREEDEP(wk));
11719			continue;
11720		case D_FREEFRAG:
11721			rele_jseg(WK_JSEG(WK_FREEFRAG(wk)->ff_jdep));
11722			WORKITEM_FREE(wk, D_FREEFRAG);
11723			continue;
11724		case D_FREEWORK:
11725			handle_written_freework(WK_FREEWORK(wk));
11726			continue;
11727		default:
11728			panic("handle_jwork: Unknown type %s\n",
11729			    TYPENAME(wk->wk_type));
11730		}
11731	}
11732}
11733
11734/*
11735 * Handle the bufwait list on an inode when it is safe to release items
11736 * held there.  This normally happens after an inode block is written but
11737 * may be delayed and handled later if there are pending journal items that
11738 * are not yet safe to be released.
11739 */
11740static struct freefile *
11741handle_bufwait(inodedep, refhd)
11742	struct inodedep *inodedep;
11743	struct workhead *refhd;
11744{
11745	struct jaddref *jaddref;
11746	struct freefile *freefile;
11747	struct worklist *wk;
11748
11749	freefile = NULL;
11750	while ((wk = LIST_FIRST(&inodedep->id_bufwait)) != NULL) {
11751		WORKLIST_REMOVE(wk);
11752		switch (wk->wk_type) {
11753		case D_FREEFILE:
11754			/*
11755			 * We defer adding freefile to the worklist
11756			 * until all other additions have been made to
11757			 * ensure that it will be done after all the
11758			 * old blocks have been freed.
11759			 */
11760			if (freefile != NULL)
11761				panic("handle_bufwait: freefile");
11762			freefile = WK_FREEFILE(wk);
11763			continue;
11764
11765		case D_MKDIR:
11766			handle_written_mkdir(WK_MKDIR(wk), MKDIR_PARENT);
11767			continue;
11768
11769		case D_DIRADD:
11770			diradd_inode_written(WK_DIRADD(wk), inodedep);
11771			continue;
11772
11773		case D_FREEFRAG:
11774			wk->wk_state |= COMPLETE;
11775			if ((wk->wk_state & ALLCOMPLETE) == ALLCOMPLETE)
11776				add_to_worklist(wk, 0);
11777			continue;
11778
11779		case D_DIRREM:
11780			wk->wk_state |= COMPLETE;
11781			add_to_worklist(wk, 0);
11782			continue;
11783
11784		case D_ALLOCDIRECT:
11785		case D_ALLOCINDIR:
11786			free_newblk(WK_NEWBLK(wk));
11787			continue;
11788
11789		case D_JNEWBLK:
11790			wk->wk_state |= COMPLETE;
11791			free_jnewblk(WK_JNEWBLK(wk));
11792			continue;
11793
11794		/*
11795		 * Save freed journal segments and add references on
11796		 * the supplied list which will delay their release
11797		 * until the cg bitmap is cleared on disk.
11798		 */
11799		case D_JSEGDEP:
11800			if (refhd == NULL)
11801				free_jsegdep(WK_JSEGDEP(wk));
11802			else
11803				WORKLIST_INSERT(refhd, wk);
11804			continue;
11805
11806		case D_JADDREF:
11807			jaddref = WK_JADDREF(wk);
11808			TAILQ_REMOVE(&inodedep->id_inoreflst, &jaddref->ja_ref,
11809			    if_deps);
11810			/*
11811			 * Transfer any jaddrefs to the list to be freed with
11812			 * the bitmap if we're handling a removed file.
11813			 */
11814			if (refhd == NULL) {
11815				wk->wk_state |= COMPLETE;
11816				free_jaddref(jaddref);
11817			} else
11818				WORKLIST_INSERT(refhd, wk);
11819			continue;
11820
11821		default:
11822			panic("handle_bufwait: Unknown type %p(%s)",
11823			    wk, TYPENAME(wk->wk_type));
11824			/* NOTREACHED */
11825		}
11826	}
11827	return (freefile);
11828}
11829/*
11830 * Called from within softdep_disk_write_complete above to restore
11831 * in-memory inode block contents to their most up-to-date state. Note
11832 * that this routine is always called from interrupt level with further
11833 * interrupts from this device blocked.
11834 *
11835 * If the write did not succeed, we will do all the roll-forward
11836 * operations, but we will not take the actions that will allow its
11837 * dependencies to be processed.
11838 */
11839static int
11840handle_written_inodeblock(inodedep, bp, flags)
11841	struct inodedep *inodedep;
11842	struct buf *bp;		/* buffer containing the inode block */
11843	int flags;
11844{
11845	struct freefile *freefile;
11846	struct allocdirect *adp, *nextadp;
11847	struct ufs1_dinode *dp1 = NULL;
11848	struct ufs2_dinode *dp2 = NULL;
11849	struct workhead wkhd;
11850	int hadchanges, fstype;
11851	ino_t freelink;
11852
11853	LIST_INIT(&wkhd);
11854	hadchanges = 0;
11855	freefile = NULL;
11856	if ((inodedep->id_state & IOSTARTED) == 0)
11857		panic("handle_written_inodeblock: not started");
11858	inodedep->id_state &= ~IOSTARTED;
11859	if (inodedep->id_fs->fs_magic == FS_UFS1_MAGIC) {
11860		fstype = UFS1;
11861		dp1 = (struct ufs1_dinode *)bp->b_data +
11862		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11863		freelink = dp1->di_freelink;
11864	} else {
11865		fstype = UFS2;
11866		dp2 = (struct ufs2_dinode *)bp->b_data +
11867		    ino_to_fsbo(inodedep->id_fs, inodedep->id_ino);
11868		freelink = dp2->di_freelink;
11869	}
11870	/*
11871	 * Leave this inode block dirty until it is on the on-disk unlinked list.
11872	 */
11873	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) == UNLINKED &&
11874	    (flags & WRITESUCCEEDED)) {
11875		struct inodedep *inon;
11876
11877		inon = TAILQ_NEXT(inodedep, id_unlinked);
11878		if ((inon == NULL && freelink == 0) ||
11879		    (inon && inon->id_ino == freelink)) {
11880			if (inon)
11881				inon->id_state |= UNLINKPREV;
11882			inodedep->id_state |= UNLINKNEXT;
11883		}
11884		hadchanges = 1;
11885	}
11886	/*
11887	 * If we had to roll back the inode allocation because of
11888	 * bitmaps being incomplete, then simply restore it.
11889	 * Keep the block dirty so that it will not be reclaimed until
11890	 * all associated dependencies have been cleared and the
11891	 * corresponding updates written to disk.
11892	 */
11893	if (inodedep->id_savedino1 != NULL) {
11894		hadchanges = 1;
11895		if (fstype == UFS1)
11896			*dp1 = *inodedep->id_savedino1;
11897		else
11898			*dp2 = *inodedep->id_savedino2;
11899		free(inodedep->id_savedino1, M_SAVEDINO);
11900		inodedep->id_savedino1 = NULL;
11901		if ((bp->b_flags & B_DELWRI) == 0)
11902			stat_inode_bitmap++;
11903		bdirty(bp);
11904		/*
11905		 * If the inode is clear here and GOINGAWAY it will never
11906		 * be written.  Process the bufwait and clear any pending
11907		 * work which may include the freefile.
11908		 */
11909		if (inodedep->id_state & GOINGAWAY)
11910			goto bufwait;
11911		return (1);
11912	}
11913	if (flags & WRITESUCCEEDED)
11914		inodedep->id_state |= COMPLETE;
11915	/*
11916	 * Roll forward anything that had to be rolled back before
11917	 * the inode could be updated.
11918	 */
11919	for (adp = TAILQ_FIRST(&inodedep->id_inoupdt); adp; adp = nextadp) {
11920		nextadp = TAILQ_NEXT(adp, ad_next);
11921		if (adp->ad_state & ATTACHED)
11922			panic("handle_written_inodeblock: new entry");
11923		if (fstype == UFS1) {
11924			if (adp->ad_offset < UFS_NDADDR) {
11925				if (dp1->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11926					panic("%s %s #%jd mismatch %d != %jd",
11927					    "handle_written_inodeblock:",
11928					    "direct pointer",
11929					    (intmax_t)adp->ad_offset,
11930					    dp1->di_db[adp->ad_offset],
11931					    (intmax_t)adp->ad_oldblkno);
11932				dp1->di_db[adp->ad_offset] = adp->ad_newblkno;
11933			} else {
11934				if (dp1->di_ib[adp->ad_offset - UFS_NDADDR] !=
11935				    0)
11936					panic("%s: %s #%jd allocated as %d",
11937					    "handle_written_inodeblock",
11938					    "indirect pointer",
11939					    (intmax_t)adp->ad_offset -
11940					    UFS_NDADDR,
11941					    dp1->di_ib[adp->ad_offset -
11942					    UFS_NDADDR]);
11943				dp1->di_ib[adp->ad_offset - UFS_NDADDR] =
11944				    adp->ad_newblkno;
11945			}
11946		} else {
11947			if (adp->ad_offset < UFS_NDADDR) {
11948				if (dp2->di_db[adp->ad_offset]!=adp->ad_oldblkno)
11949					panic("%s: %s #%jd %s %jd != %jd",
11950					    "handle_written_inodeblock",
11951					    "direct pointer",
11952					    (intmax_t)adp->ad_offset, "mismatch",
11953					    (intmax_t)dp2->di_db[adp->ad_offset],
11954					    (intmax_t)adp->ad_oldblkno);
11955				dp2->di_db[adp->ad_offset] = adp->ad_newblkno;
11956			} else {
11957				if (dp2->di_ib[adp->ad_offset - UFS_NDADDR] !=
11958				    0)
11959					panic("%s: %s #%jd allocated as %jd",
11960					    "handle_written_inodeblock",
11961					    "indirect pointer",
11962					    (intmax_t)adp->ad_offset -
11963					    UFS_NDADDR,
11964					    (intmax_t)
11965					    dp2->di_ib[adp->ad_offset -
11966					    UFS_NDADDR]);
11967				dp2->di_ib[adp->ad_offset - UFS_NDADDR] =
11968				    adp->ad_newblkno;
11969			}
11970		}
11971		adp->ad_state &= ~UNDONE;
11972		adp->ad_state |= ATTACHED;
11973		hadchanges = 1;
11974	}
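	/*
	 * Roll forward the extended attribute block pointers in the same
	 * way.  Only UFS2 inodes carry extended attribute blocks, so this
	 * list is expected to be empty for UFS1 and dp2 is valid here.
	 */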
11975	for (adp = TAILQ_FIRST(&inodedep->id_extupdt); adp; adp = nextadp) {
11976		nextadp = TAILQ_NEXT(adp, ad_next);
11977		if (adp->ad_state & ATTACHED)
11978			panic("handle_written_inodeblock: new entry");
11979		if (dp2->di_extb[adp->ad_offset] != adp->ad_oldblkno)
11980			panic("%s: direct pointers #%jd %s %jd != %jd",
11981			    "handle_written_inodeblock",
11982			    (intmax_t)adp->ad_offset, "mismatch",
11983			    (intmax_t)dp2->di_extb[adp->ad_offset],
11984			    (intmax_t)adp->ad_oldblkno);
11985		dp2->di_extb[adp->ad_offset] = adp->ad_newblkno;
11986		adp->ad_state &= ~UNDONE;
11987		adp->ad_state |= ATTACHED;
11988		hadchanges = 1;
11989	}
11990	if (hadchanges && (bp->b_flags & B_DELWRI) == 0)
11991		stat_direct_blk_ptrs++;
11992	/*
11993	 * Reset the file size to its most up-to-date value.
11994	 */
11995	if (inodedep->id_savedsize == -1 || inodedep->id_savedextsize == -1)
11996		panic("handle_written_inodeblock: bad size");
11997	if (inodedep->id_savednlink > UFS_LINK_MAX)
11998		panic("handle_written_inodeblock: Invalid link count "
11999		    "%jd for inodedep %p", (uintmax_t)inodedep->id_savednlink,
12000		    inodedep);
12001	if (fstype == UFS1) {
12002		if (dp1->di_nlink != inodedep->id_savednlink) {
12003			dp1->di_nlink = inodedep->id_savednlink;
12004			hadchanges = 1;
12005		}
12006		if (dp1->di_size != inodedep->id_savedsize) {
12007			dp1->di_size = inodedep->id_savedsize;
12008			hadchanges = 1;
12009		}
12010	} else {
12011		if (dp2->di_nlink != inodedep->id_savednlink) {
12012			dp2->di_nlink = inodedep->id_savednlink;
12013			hadchanges = 1;
12014		}
12015		if (dp2->di_size != inodedep->id_savedsize) {
12016			dp2->di_size = inodedep->id_savedsize;
12017			hadchanges = 1;
12018		}
12019		if (dp2->di_extsize != inodedep->id_savedextsize) {
12020			dp2->di_extsize = inodedep->id_savedextsize;
12021			hadchanges = 1;
12022		}
12023	}
12024	inodedep->id_savedsize = -1;
12025	inodedep->id_savedextsize = -1;
12026	inodedep->id_savednlink = -1;
12027	/*
12028	 * If there were any rollbacks in the inode block, then it must be
12029	 * marked dirty so that it will eventually get written back in
12030	 * its correct form.
12031	 */
12032	if (hadchanges) {
12033		if (fstype == UFS2)
12034			ffs_update_dinode_ckhash(inodedep->id_fs, dp2);
12035		bdirty(bp);
12036	}
12037bufwait:
12038	/*
12039	 * If the write did not succeed, we have done all the roll-forward
12040	 * operations, but we cannot take the actions that will allow its
12041	 * dependencies to be processed.
12042	 */
12043	if ((flags & WRITESUCCEEDED) == 0)
12044		return (hadchanges);
12045	/*
12046	 * Process any allocdirects that completed during the update.
12047	 */
12048	if ((adp = TAILQ_FIRST(&inodedep->id_inoupdt)) != NULL)
12049		handle_allocdirect_partdone(adp, &wkhd);
12050	if ((adp = TAILQ_FIRST(&inodedep->id_extupdt)) != NULL)
12051		handle_allocdirect_partdone(adp, &wkhd);
12052	/*
12053	 * Process deallocations that were held pending until the
12054	 * inode had been written to disk. Freeing of the inode
12055	 * is delayed until after all blocks have been freed to
12056	 * avoid creation of new <vfsid, inum, lbn> triples
12057	 * before the old ones have been deleted.  Completely
12058	 * unlinked inodes are not processed until the unlinked
12059	 * inode list is written or the last reference is removed.
12060	 */
12061	if ((inodedep->id_state & (UNLINKED | UNLINKONLIST)) != UNLINKED) {
12062		freefile = handle_bufwait(inodedep, NULL);
12063		if (freefile && !LIST_EMPTY(&wkhd)) {
12064			WORKLIST_INSERT(&wkhd, &freefile->fx_list);
12065			freefile = NULL;
12066		}
12067	}
12068	/*
12069	 * Move rolled forward dependency completions to the bufwait list
12070	 * now that those that were already written have been processed.
12071	 */
12072	if (!LIST_EMPTY(&wkhd) && hadchanges == 0)
12073		panic("handle_written_inodeblock: bufwait but no changes");
12074	jwork_move(&inodedep->id_bufwait, &wkhd);
12075
12076	if (freefile != NULL) {
12077		/*
12078		 * If the inode is goingaway it was never written.  Fake up
12079		 * the state here so free_inodedep() can succeed.
12080		 */
12081		if (inodedep->id_state & GOINGAWAY)
12082			inodedep->id_state |= COMPLETE | DEPCOMPLETE;
12083		if (free_inodedep(inodedep) == 0)
12084			panic("handle_written_inodeblock: live inodedep %p",
12085			    inodedep);
12086		add_to_worklist(&freefile->fx_list, 0);
12087		return (0);
12088	}
12089
12090	/*
12091	 * If no outstanding dependencies, free it.
12092	 */
12093	if (free_inodedep(inodedep) ||
12094	    (TAILQ_FIRST(&inodedep->id_inoreflst) == 0 &&
12095	     TAILQ_FIRST(&inodedep->id_inoupdt) == 0 &&
12096	     TAILQ_FIRST(&inodedep->id_extupdt) == 0 &&
12097	     LIST_FIRST(&inodedep->id_bufwait) == 0))
12098		return (0);
12099	return (hadchanges);
12100}
12101
12102/*
12103 * Perform needed roll-forwards and kick off any dependencies that
12104 * can now be processed.
12105 *
12106 * If the write did not succeed, we will do all the roll-forward
12107 * operations, but we will not take the actions that will allow its
12108 * dependencies to be processed.
12109 */
12110static int
12111handle_written_indirdep(indirdep, bp, bpp, flags)
12112	struct indirdep *indirdep;
12113	struct buf *bp;
12114	struct buf **bpp;
12115	int flags;
12116{
12117	struct allocindir *aip;
12118	struct buf *sbp;
12119	int chgs;
12120
12121	if (indirdep->ir_state & GOINGAWAY)
12122		panic("handle_written_indirdep: indirdep gone");
12123	if ((indirdep->ir_state & IOSTARTED) == 0)
12124		panic("handle_written_indirdep: IO not started");
12125	chgs = 0;
12126	/*
12127	 * If there were rollbacks, revert them here.
12128	 */
12129	if (indirdep->ir_saveddata) {
12130		bcopy(indirdep->ir_saveddata, bp->b_data, bp->b_bcount);
12131		if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
12132			free(indirdep->ir_saveddata, M_INDIRDEP);
12133			indirdep->ir_saveddata = NULL;
12134		}
12135		chgs = 1;
12136	}
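	/*
	 * The write has completed, so the pointers in this indirect block
	 * are once again attached to the buffer contents.
	 */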
12137	indirdep->ir_state &= ~(UNDONE | IOSTARTED);
12138	indirdep->ir_state |= ATTACHED;
12139	/*
12140	 * If the write did not succeed, we have done all the roll-forward
12141	 * operations, but we cannot take the actions that will allow its
12142	 * dependencies to be processed.
12143	 */
12144	if ((flags & WRITESUCCEEDED) == 0) {
12145		stat_indir_blk_ptrs++;
12146		bdirty(bp);
12147		return (1);
12148	}
12149	/*
12150	 * Move allocindirs with written pointers to the completehd if
12151	 * the indirdep's pointer is not yet written.  Otherwise
12152	 * free them here.
12153	 */
12154	while ((aip = LIST_FIRST(&indirdep->ir_writehd)) != NULL) {
12155		LIST_REMOVE(aip, ai_next);
12156		if ((indirdep->ir_state & DEPCOMPLETE) == 0) {
12157			LIST_INSERT_HEAD(&indirdep->ir_completehd, aip,
12158			    ai_next);
12159			newblk_freefrag(&aip->ai_block);
12160			continue;
12161		}
12162		free_newblk(&aip->ai_block);
12163	}
12164	/*
12165	 * Move allocindirs that have finished dependency processing from
12166	 * the done list to the write list after updating the pointers.
12167	 */
12168	if (TAILQ_EMPTY(&indirdep->ir_trunc)) {
12169		while ((aip = LIST_FIRST(&indirdep->ir_donehd)) != NULL) {
12170			handle_allocindir_partdone(aip);
12171			if (aip == LIST_FIRST(&indirdep->ir_donehd))
12172				panic("disk_write_complete: not gone");
12173			chgs = 1;
12174		}
12175	}
12176	/*
12177	 * Preserve the indirdep if there were any changes or if it is not
12178	 * yet valid on disk.
12179	 */
12180	if (chgs) {
12181		stat_indir_blk_ptrs++;
12182		bdirty(bp);
12183		return (1);
12184	}
12185	/*
12186	 * If there were no changes we can discard the savedbp and detach
12187	 * ourselves from the buf.  We are only carrying completed pointers
12188	 * in this case.
12189	 */
12190	sbp = indirdep->ir_savebp;
12191	sbp->b_flags |= B_INVAL | B_NOCACHE;
12192	indirdep->ir_savebp = NULL;
12193	indirdep->ir_bp = NULL;
12194	if (*bpp != NULL)
12195		panic("handle_written_indirdep: bp already exists.");
12196	*bpp = sbp;
12197	/*
12198	 * The indirdep may not be freed until its parent points at it.
12199	 */
12200	if (indirdep->ir_state & DEPCOMPLETE)
12201		free_indirdep(indirdep);
12202
12203	return (0);
12204}
12205
12206/*
12207 * Process a diradd entry after its dependent inode has been written.
12208 */
12209static void
12210diradd_inode_written(dap, inodedep)
12211	struct diradd *dap;
12212	struct inodedep *inodedep;
12213{
12214
12215	LOCK_OWNED(VFSTOUFS(dap->da_list.wk_mp));
12216	dap->da_state |= COMPLETE;
12217	complete_diradd(dap);
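	/*
	 * Keep the diradd on the inode's pending list so that a later
	 * fsync of the file can find and flush the directory entry.
	 */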
12218	WORKLIST_INSERT(&inodedep->id_pendinghd, &dap->da_list);
12219}
12220
12221/*
12222 * Returns true if the bmsafemap will have rollbacks when written.  Must only
12223 * be called with the per-filesystem lock and the buf lock on the cg held.
12224 */
12225static int
12226bmsafemap_backgroundwrite(bmsafemap, bp)
12227	struct bmsafemap *bmsafemap;
12228	struct buf *bp;
12229{
12230	int dirty;
12231
12232	LOCK_OWNED(VFSTOUFS(bmsafemap->sm_list.wk_mp));
12233	dirty = !LIST_EMPTY(&bmsafemap->sm_jaddrefhd) |
12234	    !LIST_EMPTY(&bmsafemap->sm_jnewblkhd);
12235	/*
12236	 * If we're initiating a background write we need to process the
12237	 * rollbacks as they exist now, not as they exist when IO starts.
12238	 * No other consumers will look at the contents of the shadowed
12239	 * buf so this is safe to do here.
12240	 */
12241	if (bp->b_xflags & BX_BKGRDMARKER)
12242		initiate_write_bmsafemap(bmsafemap, bp);
12243
12244	return (dirty);
12245}
12246
12247/*
12248 * Re-apply an allocation when a cg write is complete.
12249 */
12250static int
12251jnewblk_rollforward(jnewblk, fs, cgp, blksfree)
12252	struct jnewblk *jnewblk;
12253	struct fs *fs;
12254	struct cg *cgp;
12255	uint8_t *blksfree;
12256{
12257	ufs1_daddr_t fragno;
12258	ufs2_daddr_t blkno;
12259	long cgbno, bbase;
12260	int frags, blk;
12261	int i;
12262
12263	frags = 0;
12264	cgbno = dtogd(fs, jnewblk->jn_blkno);
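	/*
	 * Count the fragments being rolled forward, verifying that each
	 * one is still marked free in the rolled-back bitmap.
	 */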
12265	for (i = jnewblk->jn_oldfrags; i < jnewblk->jn_frags; i++) {
12266		if (isclr(blksfree, cgbno + i))
12267			panic("jnewblk_rollforward: re-allocated fragment");
12268		frags++;
12269	}
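	/*
	 * A full block is re-allocated as a unit; otherwise the individual
	 * fragments are cleared and the per-cg fragment and cluster counts
	 * are adjusted to match.
	 */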
12270	if (frags == fs->fs_frag) {
12271		blkno = fragstoblks(fs, cgbno);
12272		ffs_clrblock(fs, blksfree, (long)blkno);
12273		ffs_clusteracct(fs, cgp, blkno, -1);
12274		cgp->cg_cs.cs_nbfree--;
12275	} else {
12276		bbase = cgbno - fragnum(fs, cgbno);
12277		cgbno += jnewblk->jn_oldfrags;
12278		/* If a complete block had been reassembled, account for it. */
12279		fragno = fragstoblks(fs, bbase);
12280		if (ffs_isblock(fs, blksfree, fragno)) {
12281			cgp->cg_cs.cs_nffree += fs->fs_frag;
12282			ffs_clusteracct(fs, cgp, fragno, -1);
12283			cgp->cg_cs.cs_nbfree--;
12284		}
12285		/* Decrement the old frags.  */
12286		blk = blkmap(fs, blksfree, bbase);
12287		ffs_fragacct(fs, blk, cgp->cg_frsum, -1);
12288		/* Allocate the fragment */
12289		for (i = 0; i < frags; i++)
12290			clrbit(blksfree, cgbno + i);
12291		cgp->cg_cs.cs_nffree -= frags;
12292		/* Add back in counts associated with the new frags */
12293		blk = blkmap(fs, blksfree, bbase);
12294		ffs_fragacct(fs, blk, cgp->cg_frsum, 1);
12295	}
12296	return (frags);
12297}
12298
12299/*
12300 * Complete a write to a bmsafemap structure.  Roll forward any bitmap
12301 * changes if it's not a background write.  Set all written dependencies
12302 * to DEPCOMPLETE and free the structure if possible.
12303 *
12304 * If the write did not succeed, we will do all the roll-forward
12305 * operations, but we will not take the actions that will allow its
12306 * dependencies to be processed.
12307 */
12308static int
12309handle_written_bmsafemap(bmsafemap, bp, flags)
12310	struct bmsafemap *bmsafemap;
12311	struct buf *bp;
12312	int flags;
12313{
12314	struct newblk *newblk;
12315	struct inodedep *inodedep;
12316	struct jaddref *jaddref, *jatmp;
12317	struct jnewblk *jnewblk, *jntmp;
12318	struct ufsmount *ump;
12319	uint8_t *inosused;
12320	uint8_t *blksfree;
12321	struct cg *cgp;
12322	struct fs *fs;
12323	ino_t ino;
12324	int foreground;
12325	int chgs;
12326
12327	if ((bmsafemap->sm_state & IOSTARTED) == 0)
12328		panic("handle_written_bmsafemap: Not started\n");
12329	ump = VFSTOUFS(bmsafemap->sm_list.wk_mp);
12330	chgs = 0;
12331	bmsafemap->sm_state &= ~IOSTARTED;
12332	foreground = (bp->b_xflags & BX_BKGRDMARKER) == 0;
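	/*
	 * A background write (BX_BKGRDMARKER) operates on a shadow copy of
	 * the cg buffer, so the bitmap roll-forwards below are applied only
	 * when this is the real (foreground) copy.
	 */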
12333	/*
12334	 * If the write was successful, release journal work that was waiting
12335	 * on the write. Otherwise move the work back.
12336	 */
12337	if (flags & WRITESUCCEEDED)
12338		handle_jwork(&bmsafemap->sm_freewr);
12339	else
12340		LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
12341		    worklist, wk_list);
12342
12343	/*
12344	 * Restore unwritten inode allocation pending jaddref writes.
12345	 */
12346	if (!LIST_EMPTY(&bmsafemap->sm_jaddrefhd)) {
12347		cgp = (struct cg *)bp->b_data;
12348		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
12349		inosused = cg_inosused(cgp);
12350		LIST_FOREACH_SAFE(jaddref, &bmsafemap->sm_jaddrefhd,
12351		    ja_bmdeps, jatmp) {
12352			if ((jaddref->ja_state & UNDONE) == 0)
12353				continue;
12354			ino = jaddref->ja_ino % fs->fs_ipg;
12355			if (isset(inosused, ino))
12356				panic("handle_written_bmsafemap: "
12357				    "re-allocated inode");
12358			/* Do the roll-forward only if it's a real copy. */
12359			if (foreground) {
12360				if ((jaddref->ja_mode & IFMT) == IFDIR)
12361					cgp->cg_cs.cs_ndir++;
12362				cgp->cg_cs.cs_nifree--;
12363				setbit(inosused, ino);
12364				chgs = 1;
12365			}
12366			jaddref->ja_state &= ~UNDONE;
12367			jaddref->ja_state |= ATTACHED;
12368			free_jaddref(jaddref);
12369		}
12370	}
12371	/*
12372	 * Restore any block allocations which are pending journal writes.
12373	 */
12374	if (LIST_FIRST(&bmsafemap->sm_jnewblkhd) != NULL) {
12375		cgp = (struct cg *)bp->b_data;
12376		fs = VFSTOUFS(bmsafemap->sm_list.wk_mp)->um_fs;
12377		blksfree = cg_blksfree(cgp);
12378		LIST_FOREACH_SAFE(jnewblk, &bmsafemap->sm_jnewblkhd, jn_deps,
12379		    jntmp) {
12380			if ((jnewblk->jn_state & UNDONE) == 0)
12381				continue;
12382			/* Do the roll-forward only if it's a real copy. */
12383			if (foreground &&
12384			    jnewblk_rollforward(jnewblk, fs, cgp, blksfree))
12385				chgs = 1;
12386			jnewblk->jn_state &= ~(UNDONE | NEWBLOCK);
12387			jnewblk->jn_state |= ATTACHED;
12388			free_jnewblk(jnewblk);
12389		}
12390	}
12391	/*
12392	 * If the write did not succeed, we have done all the roll-forward
12393	 * operations, but we cannot take the actions that will allow its
12394	 * dependencies to be processed.
12395	 */
12396	if ((flags & WRITESUCCEEDED) == 0) {
12397		LIST_CONCAT(&bmsafemap->sm_newblkhd, &bmsafemap->sm_newblkwr,
12398		    newblk, nb_deps);
12399		LIST_CONCAT(&bmsafemap->sm_freehd, &bmsafemap->sm_freewr,
12400		    worklist, wk_list);
12401		if (foreground)
12402			bdirty(bp);
12403		return (1);
12404	}
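	/*
	 * The bitmap is now safely on disk.  Mark each new block that was
	 * waiting on it as DEPCOMPLETE and process any allocdirect or
	 * allocindir work that this completes.
	 */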
12405	while ((newblk = LIST_FIRST(&bmsafemap->sm_newblkwr))) {
12406		newblk->nb_state |= DEPCOMPLETE;
12407		newblk->nb_state &= ~ONDEPLIST;
12408		newblk->nb_bmsafemap = NULL;
12409		LIST_REMOVE(newblk, nb_deps);
12410		if (newblk->nb_list.wk_type == D_ALLOCDIRECT)
12411			handle_allocdirect_partdone(
12412			    WK_ALLOCDIRECT(&newblk->nb_list), NULL);
12413		else if (newblk->nb_list.wk_type == D_ALLOCINDIR)
12414			handle_allocindir_partdone(
12415			    WK_ALLOCINDIR(&newblk->nb_list));
12416		else if (newblk->nb_list.wk_type != D_NEWBLK)
12417			panic("handle_written_bmsafemap: Unexpected type: %s",
12418			    TYPENAME(newblk->nb_list.wk_type));
12419	}
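	/*
	 * Likewise, newly allocated inodes no longer need to wait on this
	 * bitmap write; detach them from the bmsafemap.
	 */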
12420	while ((inodedep = LIST_FIRST(&bmsafemap->sm_inodedepwr)) != NULL) {
12421		inodedep->id_state |= DEPCOMPLETE;
12422		inodedep->id_state &= ~ONDEPLIST;
12423		LIST_REMOVE(inodedep, id_deps);
12424		inodedep->id_bmsafemap = NULL;
12425	}
12426	LIST_REMOVE(bmsafemap, sm_next);
12427	if (chgs == 0 && LIST_EMPTY(&bmsafemap->sm_jaddrefhd) &&
12428	    LIST_EMPTY(&bmsafemap->sm_jnewblkhd) &&
12429	    LIST_EMPTY(&bmsafemap->sm_newblkhd) &&
12430	    LIST_EMPTY(&bmsafemap->sm_inodedephd) &&
12431	    LIST_EMPTY(&bmsafemap->sm_freehd)) {
12432		LIST_REMOVE(bmsafemap, sm_hash);
12433		WORKITEM_FREE(bmsafemap, D_BMSAFEMAP);
12434		return (0);
12435	}
12436	LIST_INSERT_HEAD(&ump->softdep_dirtycg, bmsafemap, sm_next);
12437	if (foreground)
12438		bdirty(bp);
12439	return (1);
12440}
12441
12442/*
12443 * Try to free a mkdir dependency.
12444 */
12445static void
12446complete_mkdir(mkdir)
12447	struct mkdir *mkdir;
12448{
12449	struct diradd *dap;
12450
12451	if ((mkdir->md_state & ALLCOMPLETE) != ALLCOMPLETE)
12452		return;
12453	LIST_REMOVE(mkdir, md_mkdirs);
12454	dap = mkdir->md_diradd;
12455	dap->da_state &= ~(mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY));
12456	if ((dap->da_state & (MKDIR_PARENT | MKDIR_BODY)) == 0) {
12457		dap->da_state |= DEPCOMPLETE;
12458		complete_diradd(dap);
12459	}
12460	WORKITEM_FREE(mkdir, D_MKDIR);
12461}
12462
12463/*
12464 * Handle the completion of a mkdir dependency.
12465 */
12466static void
12467handle_written_mkdir(mkdir, type)
12468	struct mkdir *mkdir;
12469	int type;
12470{
12471
12472	if ((mkdir->md_state & (MKDIR_PARENT | MKDIR_BODY)) != type)
12473		panic("handle_written_mkdir: bad type");
12474	mkdir->md_state |= COMPLETE;
12475	complete_mkdir(mkdir);
12476}
12477
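/*
 * Free a pagedep structure if it no longer tracks any dependencies.
 * Returns non-zero if the pagedep was freed.
 */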
12478static int
12479free_pagedep(pagedep)
12480	struct pagedep *pagedep;
12481{
12482	int i;
12483
12484	if (pagedep->pd_state & NEWBLOCK)
12485		return (0);
12486	if (!LIST_EMPTY(&pagedep->pd_dirremhd))
12487		return (0);
12488	for (i = 0; i < DAHASHSZ; i++)
12489		if (!LIST_EMPTY(&pagedep->pd_diraddhd[i]))
12490			return (0);
12491	if (!LIST_EMPTY(&pagedep->pd_pendinghd))
12492		return (0);
12493	if (!LIST_EMPTY(&pagedep->pd_jmvrefhd))
12494		return (0);
12495	if (pagedep->pd_state & ONWORKLIST)
12496		WORKLIST_REMOVE(&pagedep->pd_list);
12497	LIST_REMOVE(pagedep, pd_hash);
12498	WORKITEM_FREE(pagedep, D_PAGEDEP);
12499
12500	return (1);
12501}
12502
12503/*
12504 * Called from within softdep_disk_write_complete above.
12505 * A write operation was just completed. Removed inodes can
12506 * now be freed and associated block pointers may be committed.
12507 * Note that this routine is always called from interrupt level
12508 * with further interrupts from this device blocked.
12509 *
12510 * If the write did not succeed, we will do all the roll-forward
12511 * operations, but we will not take the actions that will allow its
12512 * dependencies to be processed.
12513 */
12514static int
12515handle_written_filepage(pagedep, bp, flags)
12516	struct pagedep *pagedep;
12517	struct buf *bp;		/* buffer containing the written page */
12518	int flags;
12519{
12520	struct dirrem *dirrem;
12521	struct diradd *dap, *nextdap;
12522	struct direct *ep;
12523	int i, chgs;
12524
12525	if ((pagedep->pd_state & IOSTARTED) == 0)
12526		panic("handle_written_filepage: not started");
12527	pagedep->pd_state &= ~IOSTARTED;
12528	if ((flags & WRITESUCCEEDED) == 0)
12529		goto rollforward;
12530	/*
12531	 * Process any directory removals that have been committed.
12532	 */
12533	while ((dirrem = LIST_FIRST(&pagedep->pd_dirremhd)) != NULL) {
12534		LIST_REMOVE(dirrem, dm_next);
12535		dirrem->dm_state |= COMPLETE;
12536		dirrem->dm_dirinum = pagedep->pd_ino;
12537		KASSERT(LIST_EMPTY(&dirrem->dm_jremrefhd),
12538		    ("handle_written_filepage: Journal entries not written."));
12539		add_to_worklist(&dirrem->dm_list, 0);
12540	}
12541	/*
12542	 * Free any directory additions that have been committed.
12543	 * If it is a newly allocated block, we have to wait until
12544	 * the on-disk directory inode claims the new block.
12545	 */
12546	if ((pagedep->pd_state & NEWBLOCK) == 0)
12547		while ((dap = LIST_FIRST(&pagedep->pd_pendinghd)) != NULL)
12548			free_diradd(dap, NULL);
12549rollforward:
12550	/*
12551	 * Uncommitted directory entries must be restored.
12552	 */
12553	for (chgs = 0, i = 0; i < DAHASHSZ; i++) {
12554		for (dap = LIST_FIRST(&pagedep->pd_diraddhd[i]); dap;
12555		     dap = nextdap) {
12556			nextdap = LIST_NEXT(dap, da_pdlist);
12557			if (dap->da_state & ATTACHED)
12558				panic("handle_written_filepage: attached");
12559			ep = (struct direct *)
12560			    ((char *)bp->b_data + dap->da_offset);
12561			ep->d_ino = dap->da_newinum;
12562			dap->da_state &= ~UNDONE;
12563			dap->da_state |= ATTACHED;
12564			chgs = 1;
12565			/*
12566			 * If the inode referenced by the directory has
12567			 * been written out, then the dependency can be
12568			 * moved to the pending list.
12569			 */
12570			if ((dap->da_state & ALLCOMPLETE) == ALLCOMPLETE) {
12571				LIST_REMOVE(dap, da_pdlist);
12572				LIST_INSERT_HEAD(&pagedep->pd_pendinghd, dap,
12573				    da_pdlist);
12574			}
12575		}
12576	}
12577	/*
12578	 * If there were any rollbacks in the directory, then it must be
12579	 * marked dirty so that it will eventually get written back in
12580	 * its correct form.
12581	 */
12582	if (chgs || (flags & WRITESUCCEEDED) == 0) {
12583		if ((bp->b_flags & B_DELWRI) == 0)
12584			stat_dir_entry++;
12585		bdirty(bp);
12586		return (1);
12587	}
12588	/*
12589	 * If we are not waiting for a new directory block to be
12590	 * claimed by its inode, then the pagedep will be freed.
12591	 * Otherwise it will remain to track any new entries on
12592	 * the page in case they are fsync'ed.
12593	 */
12594	free_pagedep(pagedep);
12595	return (0);
12596}
12597
12598/*
12599 * Writing back in-core inode structures.
12600 *
12601 * The filesystem only accesses an inode's contents when it occupies an
12602 * "in-core" inode structure.  These "in-core" structures are separate from
12603 * the page frames used to cache inode blocks.  Only the latter are
12604 * transferred to/from the disk.  So, when the updated contents of the
12605 * "in-core" inode structure are copied to the corresponding in-memory inode
12606 * block, the dependencies are also transferred.  The following procedure is
12607 * called when copying a dirty "in-core" inode to a cached inode block.
12608 */
12609
12610/*
12611 * Called when an inode is loaded from disk. If the effective link count
12612 * differed from the actual link count when it was last flushed, then we
12613 * need to ensure that the correct effective link count is put back.
12614 */
12615void
12616softdep_load_inodeblock(ip)
12617	struct inode *ip;	/* the "in_core" copy of the inode */
12618{
12619	struct inodedep *inodedep;
12620	struct ufsmount *ump;
12621
12622	ump = ITOUMP(ip);
12623	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
12624	    ("softdep_load_inodeblock called on non-softdep filesystem"));
12625	/*
12626	 * Check for alternate nlink count.
12627	 */
12628	ip->i_effnlink = ip->i_nlink;
12629	ACQUIRE_LOCK(ump);
12630	if (inodedep_lookup(UFSTOVFS(ump), ip->i_number, 0, &inodedep) == 0) {
12631		FREE_LOCK(ump);
12632		return;
12633	}
12634	if (ip->i_nlink != inodedep->id_nlinkwrote &&
12635	    inodedep->id_nlinkwrote != -1) {
12636		KASSERT(ip->i_nlink == 0 &&
12637		    (ump->um_flags & UM_FSFAIL_CLEANUP) != 0,
12638		    ("read bad i_nlink value"));
12639		ip->i_effnlink = ip->i_nlink = inodedep->id_nlinkwrote;
12640	}
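	/*
	 * Re-apply the link count adjustment for removals that have not
	 * yet been committed to disk.
	 */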
12641	ip->i_effnlink -= inodedep->id_nlinkdelta;
12642	KASSERT(ip->i_effnlink >= 0,
12643	    ("softdep_load_inodeblock: negative i_effnlink"));
12644	FREE_LOCK(ump);
12645}
12646
12647/*
12648 * This routine is called just before the "in-core" inode
12649 * information is to be copied to the in-memory inode block.
12650 * Recall that an inode block contains several inodes. If
12651 * the force flag is set, then the dependencies will be
12652 * cleared so that the update can always be made. Note that
12653 * the buffer is locked when this routine is called, so we
12654 * will never be in the middle of writing the inode block
12655 * to disk.
12656 */
12657void
12658softdep_update_inodeblock(ip, bp, waitfor)
12659	struct inode *ip;	/* the "in_core" copy of the inode */
12660	struct buf *bp;		/* the buffer containing the inode block */
12661	int waitfor;		/* nonzero => update must be allowed */
12662{
12663	struct inodedep *inodedep;
12664	struct inoref *inoref;
12665	struct ufsmount *ump;
12666	struct worklist *wk;
12667	struct mount *mp;
12668	struct buf *ibp;
12669	struct fs *fs;
12670	int error;
12671
12672	ump = ITOUMP(ip);
12673	mp = UFSTOVFS(ump);
12674	KASSERT(MOUNTEDSOFTDEP(mp) != 0,
12675	    ("softdep_update_inodeblock called on non-softdep filesystem"));
12676	fs = ump->um_fs;
12677	/*
12678	 * Preserve the freelink that is on disk.  clear_unlinked_inodedep()
12679	 * does not have access to the in-core ip so must write directly into
12680	 * the inode block buffer when setting freelink.
12681	 */
12682	if (fs->fs_magic == FS_UFS1_MAGIC)
12683		DIP_SET(ip, i_freelink, ((struct ufs1_dinode *)bp->b_data +
12684		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12685	else
12686		DIP_SET(ip, i_freelink, ((struct ufs2_dinode *)bp->b_data +
12687		    ino_to_fsbo(fs, ip->i_number))->di_freelink);
12688	/*
12689	 * If the effective link count is not equal to the actual link
12690	 * count, then we must track the difference in an inodedep while
12691	 * the inode is (potentially) tossed out of the cache. Otherwise,
12692	 * if there is no existing inodedep, then there are no dependencies
12693	 * to track.
12694	 */
12695	ACQUIRE_LOCK(ump);
12696again:
12697	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12698		FREE_LOCK(ump);
12699		if (ip->i_effnlink != ip->i_nlink)
12700			panic("softdep_update_inodeblock: bad link count");
12701		return;
12702	}
12703	KASSERT(ip->i_nlink >= inodedep->id_nlinkdelta,
12704	    ("softdep_update_inodeblock inconsistent ip %p i_nlink %d "
12705	    "inodedep %p id_nlinkdelta %jd",
12706	    ip, ip->i_nlink, inodedep, (intmax_t)inodedep->id_nlinkdelta));
12707	inodedep->id_nlinkwrote = ip->i_nlink;
12708	if (inodedep->id_nlinkdelta != ip->i_nlink - ip->i_effnlink)
12709		panic("softdep_update_inodeblock: bad delta");
12710	/*
12711	 * If we're flushing all dependencies we must also move anything
12712	 * waiting on journal writes onto the bufwait list prior to I/O.
12713	 */
12714	if (waitfor) {
12715		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12716			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12717			    == DEPCOMPLETE) {
12718				jwait(&inoref->if_list, MNT_WAIT);
12719				goto again;
12720			}
12721		}
12722	}
12723	/*
12724	 * Changes have been initiated. Anything depending on these
12725	 * changes cannot occur until this inode has been written.
12726	 */
12727	inodedep->id_state &= ~COMPLETE;
12728	if ((inodedep->id_state & ONWORKLIST) == 0)
12729		WORKLIST_INSERT(&bp->b_dep, &inodedep->id_list);
12730	/*
12731	 * Any new dependencies associated with the incore inode must
12732	 * now be moved to the list associated with the buffer holding
12733	 * the in-memory copy of the inode. Once merged process any
12734	 * allocdirects that are completed by the merger.
12735	 */
12736	merge_inode_lists(&inodedep->id_newinoupdt, &inodedep->id_inoupdt);
12737	if (!TAILQ_EMPTY(&inodedep->id_inoupdt))
12738		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_inoupdt),
12739		    NULL);
12740	merge_inode_lists(&inodedep->id_newextupdt, &inodedep->id_extupdt);
12741	if (!TAILQ_EMPTY(&inodedep->id_extupdt))
12742		handle_allocdirect_partdone(TAILQ_FIRST(&inodedep->id_extupdt),
12743		    NULL);
12744	/*
12745	 * Now that the inode has been pushed into the buffer, the
12746	 * operations dependent on the inode being written to disk
12747	 * can be moved to the id_bufwait so that they will be
12748	 * processed when the buffer I/O completes.
12749	 */
12750	while ((wk = LIST_FIRST(&inodedep->id_inowait)) != NULL) {
12751		WORKLIST_REMOVE(wk);
12752		WORKLIST_INSERT(&inodedep->id_bufwait, wk);
12753	}
12754	/*
12755	 * Newly allocated inodes cannot be written until the bitmap
12756	 * that allocates them has been written (indicated by
12757	 * DEPCOMPLETE being set in id_state). If we are doing a
12758	 * forced sync (e.g., an fsync on a file), we force the bitmap
12759	 * to be written so that the update can be done.
12760	 */
12761	if (waitfor == 0) {
12762		FREE_LOCK(ump);
12763		return;
12764	}
12765retry:
12766	if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) != 0) {
12767		FREE_LOCK(ump);
12768		return;
12769	}
12770	ibp = inodedep->id_bmsafemap->sm_buf;
12771	ibp = getdirtybuf(ibp, LOCK_PTR(ump), MNT_WAIT);
12772	if (ibp == NULL) {
12773		/*
12774		 * If ibp came back as NULL, the dependency could have been
12775		 * freed while we slept.  Look it up again, and check to see
12776		 * that it has completed.
12777		 */
12778		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0)
12779			goto retry;
12780		FREE_LOCK(ump);
12781		return;
12782	}
12783	FREE_LOCK(ump);
12784	if ((error = bwrite(ibp)) != 0)
12785		softdep_error("softdep_update_inodeblock: bwrite", error);
12786}
12787
12788/*
12789	 * Merge a new inode dependency list (such as id_newinoupdt) into an
12790 * old inode dependency list (such as id_inoupdt).
12791 */
12792static void
12793merge_inode_lists(newlisthead, oldlisthead)
12794	struct allocdirectlst *newlisthead;
12795	struct allocdirectlst *oldlisthead;
12796{
12797	struct allocdirect *listadp, *newadp;
12798
12799	newadp = TAILQ_FIRST(newlisthead);
12800	if (newadp != NULL)
12801		LOCK_OWNED(VFSTOUFS(newadp->ad_block.nb_list.wk_mp));
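	/*
	 * Both lists are kept sorted by ad_offset.  Insert each new entry
	 * before the first old entry at or beyond its offset, merging the
	 * two allocdirects when the offsets match.
	 */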
12802	for (listadp = TAILQ_FIRST(oldlisthead); listadp && newadp;) {
12803		if (listadp->ad_offset < newadp->ad_offset) {
12804			listadp = TAILQ_NEXT(listadp, ad_next);
12805			continue;
12806		}
12807		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12808		TAILQ_INSERT_BEFORE(listadp, newadp, ad_next);
12809		if (listadp->ad_offset == newadp->ad_offset) {
12810			allocdirect_merge(oldlisthead, newadp,
12811			    listadp);
12812			listadp = newadp;
12813		}
12814		newadp = TAILQ_FIRST(newlisthead);
12815	}
12816	while ((newadp = TAILQ_FIRST(newlisthead)) != NULL) {
12817		TAILQ_REMOVE(newlisthead, newadp, ad_next);
12818		TAILQ_INSERT_TAIL(oldlisthead, newadp, ad_next);
12819	}
12820}
12821
12822/*
12823 * If we are doing an fsync, then we must ensure that any directory
12824 * entries for the inode have been written after the inode gets to disk.
12825 */
12826int
12827softdep_fsync(vp)
12828	struct vnode *vp;	/* the "in_core" copy of the inode */
12829{
12830	struct inodedep *inodedep;
12831	struct pagedep *pagedep;
12832	struct inoref *inoref;
12833	struct ufsmount *ump;
12834	struct worklist *wk;
12835	struct diradd *dap;
12836	struct mount *mp;
12837	struct vnode *pvp;
12838	struct inode *ip;
12839	struct buf *bp;
12840	struct fs *fs;
12841	struct thread *td = curthread;
12842	int error, flushparent, pagedep_new_block;
12843	ino_t parentino;
12844	ufs_lbn_t lbn;
12845
12846	ip = VTOI(vp);
12847	mp = vp->v_mount;
12848	ump = VFSTOUFS(mp);
12849	fs = ump->um_fs;
12850	if (MOUNTEDSOFTDEP(mp) == 0)
12851		return (0);
12852	ACQUIRE_LOCK(ump);
12853restart:
12854	if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0) {
12855		FREE_LOCK(ump);
12856		return (0);
12857	}
12858	TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
12859		if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
12860		    == DEPCOMPLETE) {
12861			jwait(&inoref->if_list, MNT_WAIT);
12862			goto restart;
12863		}
12864	}
12865	if (!LIST_EMPTY(&inodedep->id_inowait) ||
12866	    !TAILQ_EMPTY(&inodedep->id_extupdt) ||
12867	    !TAILQ_EMPTY(&inodedep->id_newextupdt) ||
12868	    !TAILQ_EMPTY(&inodedep->id_inoupdt) ||
12869	    !TAILQ_EMPTY(&inodedep->id_newinoupdt))
12870		panic("softdep_fsync: pending ops %p", inodedep);
12871	for (error = 0, flushparent = 0; ; ) {
12872		if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) == NULL)
12873			break;
12874		if (wk->wk_type != D_DIRADD)
12875			panic("softdep_fsync: Unexpected type %s",
12876			    TYPENAME(wk->wk_type));
12877		dap = WK_DIRADD(wk);
12878		/*
12879		 * Flush our parent if this directory entry has a MKDIR_PARENT
12880		 * dependency or is contained in a newly allocated block.
12881		 */
12882		if (dap->da_state & DIRCHG)
12883			pagedep = dap->da_previous->dm_pagedep;
12884		else
12885			pagedep = dap->da_pagedep;
12886		parentino = pagedep->pd_ino;
12887		lbn = pagedep->pd_lbn;
12888		if ((dap->da_state & (MKDIR_BODY | COMPLETE)) != COMPLETE)
12889			panic("softdep_fsync: dirty");
12890		if ((dap->da_state & MKDIR_PARENT) ||
12891		    (pagedep->pd_state & NEWBLOCK))
12892			flushparent = 1;
12893		else
12894			flushparent = 0;
12895		/*
12896		 * If we are being fsync'ed as part of vgone'ing this vnode,
12897		 * then we will not be able to release and recover the
12898		 * vnode below, so we just have to give up on writing its
12899		 * directory entry out. It will eventually be written, just
12900		 * not now, but then the user was not asking to have it
12901		 * written, so we are not breaking any promises.
12902		 */
12903		if (VN_IS_DOOMED(vp))
12904			break;
12905		/*
12906		 * We prevent deadlock by always fetching inodes from the
12907		 * root, moving down the directory tree. Thus, when fetching
12908		 * our parent directory, we first try to get the lock. If
12909		 * that fails, we must unlock ourselves before requesting
12910		 * the lock on our parent. See the comment in ufs_lookup
12911		 * for details on possible races.
12912		 */
12913		FREE_LOCK(ump);
12914		error = get_parent_vp(vp, mp, parentino, NULL, NULL, NULL,
12915		    &pvp);
12916		if (error == ERELOOKUP)
12917			error = 0;
12918		if (error != 0)
12919			return (error);
12920		/*
12921		 * All MKDIR_PARENT dependencies and all the NEWBLOCK pagedeps
12922		 * that are contained in direct blocks will be resolved by
12923		 * doing a ffs_update. Pagedeps contained in indirect blocks
12924		 * may require a complete sync'ing of the directory. So, we
12925		 * try the cheap and fast ffs_update first, and if that fails,
12926		 * then we do the slower ffs_syncvnode of the directory.
12927		 */
12928		if (flushparent) {
12929			int locked;
12930
12931			if ((error = ffs_update(pvp, 1)) != 0) {
12932				vput(pvp);
12933				return (error);
12934			}
12935			ACQUIRE_LOCK(ump);
12936			locked = 1;
12937			if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) != 0) {
12938				if ((wk = LIST_FIRST(&inodedep->id_pendinghd)) != NULL) {
12939					if (wk->wk_type != D_DIRADD)
12940						panic("softdep_fsync: Unexpected type %s",
12941						      TYPENAME(wk->wk_type));
12942					dap = WK_DIRADD(wk);
12943					if (dap->da_state & DIRCHG)
12944						pagedep = dap->da_previous->dm_pagedep;
12945					else
12946						pagedep = dap->da_pagedep;
12947					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
12948					FREE_LOCK(ump);
12949					locked = 0;
12950					if (pagedep_new_block && (error =
12951					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
12952						vput(pvp);
12953						return (error);
12954					}
12955				}
12956			}
12957			if (locked)
12958				FREE_LOCK(ump);
12959		}
12960		/*
12961		 * Flush directory page containing the inode's name.
12962		 */
12963		error = bread(pvp, lbn, blksize(fs, VTOI(pvp), lbn), td->td_ucred,
12964		    &bp);
12965		if (error == 0)
12966			error = bwrite(bp);
12967		else
12968			brelse(bp);
12969		vput(pvp);
12970		if (!ffs_fsfail_cleanup(ump, error))
12971			return (error);
12972		ACQUIRE_LOCK(ump);
12973		if (inodedep_lookup(mp, ip->i_number, 0, &inodedep) == 0)
12974			break;
12975	}
12976	FREE_LOCK(ump);
12977	return (0);
12978}
12979
12980/*
12981 * Flush all the dirty bitmaps associated with the block device
12982 * before flushing the rest of the dirty blocks so as to reduce
12983 * the number of dependencies that will have to be rolled back.
12984 *
12985 * XXX Unused?
12986 */
12987void
12988softdep_fsync_mountdev(vp)
12989	struct vnode *vp;
12990{
12991	struct buf *bp, *nbp;
12992	struct worklist *wk;
12993	struct bufobj *bo;
12994
12995	if (!vn_isdisk(vp))
12996		panic("softdep_fsync_mountdev: vnode not a disk");
12997	bo = &vp->v_bufobj;
12998restart:
12999	BO_LOCK(bo);
13000	TAILQ_FOREACH_SAFE(bp, &bo->bo_dirty.bv_hd, b_bobufs, nbp) {
13001		/*
13002		 * If it is already scheduled, skip to the next buffer.
13003		 */
13004		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL))
13005			continue;
13006
13007		if ((bp->b_flags & B_DELWRI) == 0)
13008			panic("softdep_fsync_mountdev: not dirty");
13009		/*
13010		 * We are only interested in bitmaps with outstanding
13011		 * dependencies.
13012		 */
13013		if ((wk = LIST_FIRST(&bp->b_dep)) == NULL ||
13014		    wk->wk_type != D_BMSAFEMAP ||
13015		    (bp->b_vflags & BV_BKGRDINPROG)) {
13016			BUF_UNLOCK(bp);
13017			continue;
13018		}
13019		BO_UNLOCK(bo);
13020		bremfree(bp);
13021		(void) bawrite(bp);
13022		goto restart;
13023	}
13024	drain_output(vp);
13025	BO_UNLOCK(bo);
13026}
13027
13028/*
13029	 * Sync all cylinder groups that were dirty at the time this function was
13030	 * called.  Newly dirtied cgs will be inserted before the sentinel.  This
13031	 * is used to flush freedep activity that may be holding up writes to an
13032	 * indirect block.
13033 */
13034static int
13035sync_cgs(mp, waitfor)
13036	struct mount *mp;
13037	int waitfor;
13038{
13039	struct bmsafemap *bmsafemap;
13040	struct bmsafemap *sentinel;
13041	struct ufsmount *ump;
13042	struct buf *bp;
13043	int error;
13044
13045	sentinel = malloc(sizeof(*sentinel), M_BMSAFEMAP, M_ZERO | M_WAITOK);
13046	sentinel->sm_cg = -1;
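	/*
	 * The sentinel marks our place in softdep_dirtycg so that the
	 * per-filesystem lock can be dropped while a cg buffer is being
	 * written without losing our position in the list.
	 */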
13047	ump = VFSTOUFS(mp);
13048	error = 0;
13049	ACQUIRE_LOCK(ump);
13050	LIST_INSERT_HEAD(&ump->softdep_dirtycg, sentinel, sm_next);
13051	for (bmsafemap = LIST_NEXT(sentinel, sm_next); bmsafemap != NULL;
13052	    bmsafemap = LIST_NEXT(sentinel, sm_next)) {
13053		/* Skip sentinels and cgs with no work to release. */
13054		if (bmsafemap->sm_cg == -1 ||
13055		    (LIST_EMPTY(&bmsafemap->sm_freehd) &&
13056		    LIST_EMPTY(&bmsafemap->sm_freewr))) {
13057			LIST_REMOVE(sentinel, sm_next);
13058			LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
13059			continue;
13060		}
13061		/*
13062		 * If we don't get the lock and we're waiting, try again; if
13063		 * not, move on to the next buf and try to sync it.
13064		 */
13065		bp = getdirtybuf(bmsafemap->sm_buf, LOCK_PTR(ump), waitfor);
13066		if (bp == NULL && waitfor == MNT_WAIT)
13067			continue;
13068		LIST_REMOVE(sentinel, sm_next);
13069		LIST_INSERT_AFTER(bmsafemap, sentinel, sm_next);
13070		if (bp == NULL)
13071			continue;
13072		FREE_LOCK(ump);
13073		if (waitfor == MNT_NOWAIT)
13074			bawrite(bp);
13075		else
13076			error = bwrite(bp);
13077		ACQUIRE_LOCK(ump);
13078		if (error)
13079			break;
13080	}
13081	LIST_REMOVE(sentinel, sm_next);
13082	FREE_LOCK(ump);
13083	free(sentinel, M_BMSAFEMAP);
13084	return (error);
13085}
13086
13087/*
13088 * This routine is called when we are trying to synchronously flush a
13089 * file. This routine must eliminate any filesystem metadata dependencies
13090 * so that the syncing routine can succeed.
13091 */
13092int
13093softdep_sync_metadata(struct vnode *vp)
13094{
13095	struct inode *ip;
13096	int error;
13097
13098	ip = VTOI(vp);
13099	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13100	    ("softdep_sync_metadata called on non-softdep filesystem"));
13101	/*
13102	 * Ensure that any direct block dependencies have been cleared,
13103	 * truncations are started, and inode references are journaled.
13104	 */
13105	ACQUIRE_LOCK(VFSTOUFS(vp->v_mount));
13106	/*
13107	 * Write all journal records to prevent rollbacks on devvp.
13108	 */
13109	if (vp->v_type == VCHR)
13110		softdep_flushjournal(vp->v_mount);
13111	error = flush_inodedep_deps(vp, vp->v_mount, ip->i_number);
13112	/*
13113	 * Ensure that all truncates are written so we won't find deps on
13114	 * indirect blocks.
13115	 */
13116	process_truncates(vp);
13117	FREE_LOCK(VFSTOUFS(vp->v_mount));
13118
13119	return (error);
13120}
13121
13122/*
13123 * This routine is called when we are attempting to sync a buf with
13124 * dependencies.  If waitfor is MNT_NOWAIT it attempts to schedule any
13125 * other IO it can but returns EBUSY if the buffer is not yet able to
13126	 * be written.  Dependencies which will not cause rollbacks always
13127	 * result in a return value of 0.
13128 */
13129int
13130softdep_sync_buf(struct vnode *vp, struct buf *bp, int waitfor)
13131{
13132	struct indirdep *indirdep;
13133	struct pagedep *pagedep;
13134	struct allocindir *aip;
13135	struct newblk *newblk;
13136	struct ufsmount *ump;
13137	struct buf *nbp;
13138	struct worklist *wk;
13139	int i, error;
13140
13141	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13142	    ("softdep_sync_buf called on non-softdep filesystem"));
13143	/*
13144	 * For VCHR we just don't want to force flush any dependencies that
13145	 * will cause rollbacks.
13146	 */
13147	if (vp->v_type == VCHR) {
13148		if (waitfor == MNT_NOWAIT && softdep_count_dependencies(bp, 0))
13149			return (EBUSY);
13150		return (0);
13151	}
13152	ump = VFSTOUFS(vp->v_mount);
13153	ACQUIRE_LOCK(ump);
13154	/*
13155	 * As we hold the buffer locked, none of its dependencies
13156	 * will disappear.
13157	 */
13158	error = 0;
13159top:
13160	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
13161		switch (wk->wk_type) {
13162		case D_ALLOCDIRECT:
13163		case D_ALLOCINDIR:
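			/*
			 * A newly allocated block must have its journal
			 * record and its cg bitmap written before this
			 * buffer can be written without a rollback.
			 */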
13164			newblk = WK_NEWBLK(wk);
13165			if (newblk->nb_jnewblk != NULL) {
13166				if (waitfor == MNT_NOWAIT) {
13167					error = EBUSY;
13168					goto out_unlock;
13169				}
13170				jwait(&newblk->nb_jnewblk->jn_list, waitfor);
13171				goto top;
13172			}
13173			if (newblk->nb_state & DEPCOMPLETE ||
13174			    waitfor == MNT_NOWAIT)
13175				continue;
13176			nbp = newblk->nb_bmsafemap->sm_buf;
13177			nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
13178			if (nbp == NULL)
13179				goto top;
13180			FREE_LOCK(ump);
13181			if ((error = bwrite(nbp)) != 0)
13182				goto out;
13183			ACQUIRE_LOCK(ump);
13184			continue;
13185
13186		case D_INDIRDEP:
13187			indirdep = WK_INDIRDEP(wk);
13188			if (waitfor == MNT_NOWAIT) {
13189				if (!TAILQ_EMPTY(&indirdep->ir_trunc) ||
13190				    !LIST_EMPTY(&indirdep->ir_deplisthd)) {
13191					error = EBUSY;
13192					goto out_unlock;
13193				}
13194			}
13195			if (!TAILQ_EMPTY(&indirdep->ir_trunc))
13196				panic("softdep_sync_buf: truncation pending.");
13197		restart:
13198			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
13199				newblk = (struct newblk *)aip;
13200				if (newblk->nb_jnewblk != NULL) {
13201					jwait(&newblk->nb_jnewblk->jn_list,
13202					    waitfor);
13203					goto restart;
13204				}
13205				if (newblk->nb_state & DEPCOMPLETE)
13206					continue;
13207				nbp = newblk->nb_bmsafemap->sm_buf;
13208				nbp = getdirtybuf(nbp, LOCK_PTR(ump), waitfor);
13209				if (nbp == NULL)
13210					goto restart;
13211				FREE_LOCK(ump);
13212				if ((error = bwrite(nbp)) != 0)
13213					goto out;
13214				ACQUIRE_LOCK(ump);
13215				goto restart;
13216			}
13217			continue;
13218
13219		case D_PAGEDEP:
13220			/*
13221			 * Only flush directory entries in synchronous passes.
13222			 */
13223			if (waitfor != MNT_WAIT) {
13224				error = EBUSY;
13225				goto out_unlock;
13226			}
13227			/*
13228			 * While syncing snapshots, we must allow recursive
13229			 * lookups.
13230			 */
13231			BUF_AREC(bp);
13232			/*
13233			 * We are trying to sync a directory that may
13234			 * have dependencies on both its own metadata
13235			 * and/or dependencies on the inodes of any
13236			 * recently allocated files. We walk its diradd
13237			 * lists pushing out the associated inode.
13238			 */
13239			pagedep = WK_PAGEDEP(wk);
13240			for (i = 0; i < DAHASHSZ; i++) {
13241				if (LIST_FIRST(&pagedep->pd_diraddhd[i]) == 0)
13242					continue;
13243				error = flush_pagedep_deps(vp, wk->wk_mp,
13244				    &pagedep->pd_diraddhd[i], bp);
13245				if (error != 0) {
13246					if (error != ERELOOKUP)
13247						BUF_NOREC(bp);
13248					goto out_unlock;
13249				}
13250			}
13251			BUF_NOREC(bp);
13252			continue;
13253
13254		case D_FREEWORK:
13255		case D_FREEDEP:
13256		case D_JSEGDEP:
13257		case D_JNEWBLK:
13258			continue;
13259
13260		default:
13261			panic("softdep_sync_buf: Unknown type %s",
13262			    TYPENAME(wk->wk_type));
13263			/* NOTREACHED */
13264		}
13265	}
13266out_unlock:
13267	FREE_LOCK(ump);
13268out:
13269	return (error);
13270}
13271
13272/*
13273 * Flush the dependencies associated with an inodedep.
13274 */
13275static int
13276flush_inodedep_deps(vp, mp, ino)
13277	struct vnode *vp;
13278	struct mount *mp;
13279	ino_t ino;
13280{
13281	struct inodedep *inodedep;
13282	struct inoref *inoref;
13283	struct ufsmount *ump;
13284	int error, waitfor;
13285
13286	/*
13287	 * This work is done in two passes. The first pass grabs most
13288	 * of the buffers and begins asynchronously writing them. The
13289	 * only way to wait for these asynchronous writes is to sleep
13290	 * on the filesystem vnode which may stay busy for a long time
13291	 * if the filesystem is active. So, instead, we make a second
13292	 * pass over the dependencies blocking on each write. In the
13293	 * usual case we will be blocking against a write that we
13294	 * initiated, so when it is done the dependency will have been
13295	 * resolved. Thus the second pass is expected to end quickly.
13296	 * We give a brief window at the top of the loop to allow
13297	 * any pending I/O to complete.
13298	 */
13299	ump = VFSTOUFS(mp);
13300	LOCK_OWNED(ump);
13301	for (error = 0, waitfor = MNT_NOWAIT; ; ) {
13302		if (error)
13303			return (error);
13304		FREE_LOCK(ump);
13305		ACQUIRE_LOCK(ump);
13306restart:
13307		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
13308			return (0);
13309		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
13310			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
13311			    == DEPCOMPLETE) {
13312				jwait(&inoref->if_list, MNT_WAIT);
13313				goto restart;
13314			}
13315		}
13316		if (flush_deplist(&inodedep->id_inoupdt, waitfor, &error) ||
13317		    flush_deplist(&inodedep->id_newinoupdt, waitfor, &error) ||
13318		    flush_deplist(&inodedep->id_extupdt, waitfor, &error) ||
13319		    flush_deplist(&inodedep->id_newextupdt, waitfor, &error))
13320			continue;
13321		/*
13322		 * If this was pass 2, we are done; otherwise do pass 2.
13323		 */
13324		if (waitfor == MNT_WAIT)
13325			break;
13326		waitfor = MNT_WAIT;
13327	}
13328	/*
13329	 * Try freeing inodedep in case all dependencies have been removed.
13330	 */
13331	if (inodedep_lookup(mp, ino, 0, &inodedep) != 0)
13332		(void) free_inodedep(inodedep);
13333	return (0);
13334}
13335
13336/*
13337 * Flush an inode dependency list.
13338 */
13339static int
13340flush_deplist(listhead, waitfor, errorp)
13341	struct allocdirectlst *listhead;
13342	int waitfor;
13343	int *errorp;
13344{
13345	struct allocdirect *adp;
13346	struct newblk *newblk;
13347	struct ufsmount *ump;
13348	struct buf *bp;
13349
13350	if ((adp = TAILQ_FIRST(listhead)) == NULL)
13351		return (0);
13352	ump = VFSTOUFS(adp->ad_list.wk_mp);
13353	LOCK_OWNED(ump);
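	/*
	 * A non-zero return means we waited or initiated a write, so the
	 * caller re-examines the inodedep from the top.
	 */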
13354	TAILQ_FOREACH(adp, listhead, ad_next) {
13355		newblk = (struct newblk *)adp;
13356		if (newblk->nb_jnewblk != NULL) {
13357			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
13358			return (1);
13359		}
13360		if (newblk->nb_state & DEPCOMPLETE)
13361			continue;
13362		bp = newblk->nb_bmsafemap->sm_buf;
13363		bp = getdirtybuf(bp, LOCK_PTR(ump), waitfor);
13364		if (bp == NULL) {
13365			if (waitfor == MNT_NOWAIT)
13366				continue;
13367			return (1);
13368		}
13369		FREE_LOCK(ump);
13370		if (waitfor == MNT_NOWAIT)
13371			bawrite(bp);
13372		else
13373			*errorp = bwrite(bp);
13374		ACQUIRE_LOCK(ump);
13375		return (1);
13376	}
13377	return (0);
13378}
13379
13380/*
13381 * Flush dependencies associated with an allocdirect block.
13382 */
13383static int
13384flush_newblk_dep(vp, mp, lbn)
13385	struct vnode *vp;
13386	struct mount *mp;
13387	ufs_lbn_t lbn;
13388{
13389	struct newblk *newblk;
13390	struct ufsmount *ump;
13391	struct bufobj *bo;
13392	struct inode *ip;
13393	struct buf *bp;
13394	ufs2_daddr_t blkno;
13395	int error;
13396
13397	error = 0;
13398	bo = &vp->v_bufobj;
13399	ip = VTOI(vp);
13400	blkno = DIP(ip, i_db[lbn]);
13401	if (blkno == 0)
13402		panic("flush_newblk_dep: Missing block");
13403	ump = VFSTOUFS(mp);
13404	ACQUIRE_LOCK(ump);
13405	/*
13406	 * Loop until all dependencies related to this block are satisfied.
13407	 * We must be careful to restart after each sleep in case a write
13408	 * completes some part of this process for us.
13409	 */
13410	for (;;) {
13411		if (newblk_lookup(mp, blkno, 0, &newblk) == 0) {
13412			FREE_LOCK(ump);
13413			break;
13414		}
13415		if (newblk->nb_list.wk_type != D_ALLOCDIRECT)
13416			panic("flush_newblk_dep: Bad newblk %p", newblk);
13417		/*
13418		 * Flush the journal.
13419		 */
13420		if (newblk->nb_jnewblk != NULL) {
13421			jwait(&newblk->nb_jnewblk->jn_list, MNT_WAIT);
13422			continue;
13423		}
13424		/*
13425		 * Write the bitmap dependency.
13426		 */
13427		if ((newblk->nb_state & DEPCOMPLETE) == 0) {
13428			bp = newblk->nb_bmsafemap->sm_buf;
13429			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13430			if (bp == NULL)
13431				continue;
13432			FREE_LOCK(ump);
13433			error = bwrite(bp);
13434			if (error)
13435				break;
13436			ACQUIRE_LOCK(ump);
13437			continue;
13438		}
13439		/*
13440		 * Write the buffer.
13441		 */
13442		FREE_LOCK(ump);
13443		BO_LOCK(bo);
13444		bp = gbincore(bo, lbn);
13445		if (bp != NULL) {
13446			error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
13447			    LK_INTERLOCK, BO_LOCKPTR(bo));
13448			if (error == ENOLCK) {
13449				ACQUIRE_LOCK(ump);
13450				error = 0;
13451				continue; /* Slept, retry */
13452			}
13453			if (error != 0)
13454				break;	/* Failed */
13455			if (bp->b_flags & B_DELWRI) {
13456				bremfree(bp);
13457				error = bwrite(bp);
13458				if (error)
13459					break;
13460			} else
13461				BUF_UNLOCK(bp);
13462		} else
13463			BO_UNLOCK(bo);
13464		/*
13465		 * We have to wait for the direct pointers to
13466		 * point at the newdirblk before the dependency
13467		 * will go away.
13468		 */
13469		error = ffs_update(vp, 1);
13470		if (error)
13471			break;
13472		ACQUIRE_LOCK(ump);
13473	}
13474	return (error);
13475}
13476
13477/*
13478 * Eliminate a pagedep dependency by flushing out all its diradd dependencies.
13479 */
13480static int
13481flush_pagedep_deps(pvp, mp, diraddhdp, locked_bp)
13482	struct vnode *pvp;
13483	struct mount *mp;
13484	struct diraddhd *diraddhdp;
13485	struct buf *locked_bp;
13486{
13487	struct inodedep *inodedep;
13488	struct inoref *inoref;
13489	struct ufsmount *ump;
13490	struct diradd *dap;
13491	struct vnode *vp;
13492	int error = 0;
13493	struct buf *bp;
13494	ino_t inum;
13495	struct diraddhd unfinished;
13496
13497	LIST_INIT(&unfinished);
13498	ump = VFSTOUFS(mp);
13499	LOCK_OWNED(ump);
13500restart:
13501	while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
13502		/*
13503		 * Flush ourselves if this directory entry
13504		 * has a MKDIR_PARENT dependency.
13505		 */
13506		if (dap->da_state & MKDIR_PARENT) {
13507			FREE_LOCK(ump);
13508			if ((error = ffs_update(pvp, 1)) != 0)
13509				break;
13510			ACQUIRE_LOCK(ump);
13511			/*
13512			 * If that cleared dependencies, go on to next.
13513			 */
13514			if (dap != LIST_FIRST(diraddhdp))
13515				continue;
13516			/*
13517			 * All MKDIR_PARENT dependencies and all the
13518			 * NEWBLOCK pagedeps that are contained in direct
13519			 * blocks were resolved by the ffs_update above.
13520			 * Pagedeps contained in indirect blocks may
13521			 * require a complete sync'ing of the directory.
13522			 * We are in the midst of doing a complete sync,
13523			 * so if they are not resolved in this pass we
13524			 * defer them for now as they will be sync'ed by
13525			 * our caller shortly.
13526			 */
13527			LIST_REMOVE(dap, da_pdlist);
13528			LIST_INSERT_HEAD(&unfinished, dap, da_pdlist);
13529			continue;
13530		}
13531		/*
13532		 * A newly allocated directory must have its "." and
13533		 * ".." entries written out before its name can be
13534		 * committed in its parent.
13535		 */
13536		inum = dap->da_newinum;
13537		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13538			panic("flush_pagedep_deps: lost inode1");
13539		/*
13540		 * Wait for any pending journal adds to complete so we don't
13541		 * cause rollbacks while syncing.
13542		 */
13543		TAILQ_FOREACH(inoref, &inodedep->id_inoreflst, if_deps) {
13544			if ((inoref->if_state & (DEPCOMPLETE | GOINGAWAY))
13545			    == DEPCOMPLETE) {
13546				jwait(&inoref->if_list, MNT_WAIT);
13547				goto restart;
13548			}
13549		}
13550		if (dap->da_state & MKDIR_BODY) {
13551			FREE_LOCK(ump);
13552			error = get_parent_vp(pvp, mp, inum, locked_bp,
13553			    diraddhdp, &unfinished, &vp);
13554			if (error != 0)
13555				break;
13556			error = flush_newblk_dep(vp, mp, 0);
13557			/*
13558			 * If we still have the dependency we might need to
13559			 * update the vnode to sync the new link count to
13560			 * disk.
13561			 */
13562			if (error == 0 && dap == LIST_FIRST(diraddhdp))
13563				error = ffs_update(vp, 1);
13564			vput(vp);
13565			if (error != 0)
13566				break;
13567			ACQUIRE_LOCK(ump);
13568			/*
13569			 * If that cleared dependencies, go on to next.
13570			 */
13571			if (dap != LIST_FIRST(diraddhdp))
13572				continue;
13573			if (dap->da_state & MKDIR_BODY) {
13574				inodedep_lookup(UFSTOVFS(ump), inum, 0,
13575				    &inodedep);
13576				panic("flush_pagedep_deps: MKDIR_BODY "
13577				    "inodedep %p dap %p vp %p",
13578				    inodedep, dap, vp);
13579			}
13580		}
13581		/*
13582		 * Flush the inode on which the directory entry depends.
13583		 * Having accounted for MKDIR_PARENT and MKDIR_BODY above,
13584		 * the only remaining dependency is that the updated inode
13585		 * count must get pushed to disk. The inode has already
13586		 * been pushed into its inode buffer (via VOP_UPDATE) at
13587		 * the time of the reference count change. So we need only
13588		 * locate that buffer, ensure that there will be no rollback
13589		 * caused by a bitmap dependency, then write the inode buffer.
13590		 */
13591retry:
13592		if (inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep) == 0)
13593			panic("flush_pagedep_deps: lost inode");
13594		/*
13595		 * If the inode still has bitmap dependencies,
13596		 * push them to disk.
13597		 */
13598		if ((inodedep->id_state & (DEPCOMPLETE | GOINGAWAY)) == 0) {
13599			bp = inodedep->id_bmsafemap->sm_buf;
13600			bp = getdirtybuf(bp, LOCK_PTR(ump), MNT_WAIT);
13601			if (bp == NULL)
13602				goto retry;
13603			FREE_LOCK(ump);
13604			if ((error = bwrite(bp)) != 0)
13605				break;
13606			ACQUIRE_LOCK(ump);
13607			if (dap != LIST_FIRST(diraddhdp))
13608				continue;
13609		}
13610		/*
13611		 * If the inode is still sitting in a buffer waiting
13612		 * to be written or waiting for the link count to be
13613		 * adjusted, update it here to flush it to disk.
13614		 */
13615		if (dap == LIST_FIRST(diraddhdp)) {
13616			FREE_LOCK(ump);
13617			error = get_parent_vp(pvp, mp, inum, locked_bp,
13618			    diraddhdp, &unfinished, &vp);
13619			if (error != 0)
13620				break;
13621			error = ffs_update(vp, 1);
13622			vput(vp);
13623			if (error)
13624				break;
13625			ACQUIRE_LOCK(ump);
13626		}
13627		/*
13628		 * If we have failed to get rid of all the dependencies
13629		 * then something is seriously wrong.
13630		 */
13631		if (dap == LIST_FIRST(diraddhdp)) {
13632			inodedep_lookup(UFSTOVFS(ump), inum, 0, &inodedep);
13633			panic("flush_pagedep_deps: failed to flush "
13634			    "inodedep %p ino %ju dap %p",
13635			    inodedep, (uintmax_t)inum, dap);
13636		}
13637	}
13638	if (error)
13639		ACQUIRE_LOCK(ump);
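	/*
	 * Return any deferred MKDIR_PARENT entries to the caller's list
	 * so that they are handled by the complete directory sync that
	 * our caller is performing.
	 */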
13640	while ((dap = LIST_FIRST(&unfinished)) != NULL) {
13641		LIST_REMOVE(dap, da_pdlist);
13642		LIST_INSERT_HEAD(diraddhdp, dap, da_pdlist);
13643	}
13644	return (error);
13645}
13646
13647/*
13648 * A large burst of file addition or deletion activity can drive the
13649 * memory load excessively high. First attempt to slow things down
13650 * using the techniques below. If that fails, this routine requests
13651 * the offending operations to fall back to running synchronously
13652 * until the memory load returns to a reasonable level.
13653 */
13654int
13655softdep_slowdown(vp)
13656	struct vnode *vp;
13657{
13658	struct ufsmount *ump;
13659	int jlow;
13660	int max_softdeps_hard;
13661
13662	KASSERT(MOUNTEDSOFTDEP(vp->v_mount) != 0,
13663	    ("softdep_slowdown called on non-softdep filesystem"));
13664	ump = VFSTOUFS(vp->v_mount);
13665	ACQUIRE_LOCK(ump);
13666	jlow = 0;
13667	/*
13668	 * Check for journal space if needed.
13669	 */
13670	if (DOINGSUJ(vp)) {
13671		if (journal_space(ump, 0) == 0)
13672			jlow = 1;
13673	}
13674	/*
13675	 * If the system is under its limits and our filesystem is
13676	 * not responsible for more than our share of the usage and
13677	 * we are not low on journal space, then no need to slow down.
13678	 */
13679	max_softdeps_hard = max_softdeps * 11 / 10;
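	/*
	 * The hard limit allows 10% of slop over max_softdeps.  Both the
	 * system-wide counts and this filesystem's per-flush-thread share
	 * must stay under their limits for each dependency type checked
	 * below.
	 */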
13680	if (dep_current[D_DIRREM] < max_softdeps_hard / 2 &&
13681	    dep_current[D_INODEDEP] < max_softdeps_hard &&
13682	    dep_current[D_INDIRDEP] < max_softdeps_hard / 1000 &&
13683	    dep_current[D_FREEBLKS] < max_softdeps_hard && jlow == 0 &&
13684	    ump->softdep_curdeps[D_DIRREM] <
13685	    (max_softdeps_hard / 2) / stat_flush_threads &&
13686	    ump->softdep_curdeps[D_INODEDEP] <
13687	    max_softdeps_hard / stat_flush_threads &&
13688	    ump->softdep_curdeps[D_INDIRDEP] <
13689	    (max_softdeps_hard / 1000) / stat_flush_threads &&
13690	    ump->softdep_curdeps[D_FREEBLKS] <
13691	    max_softdeps_hard / stat_flush_threads) {
13692		FREE_LOCK(ump);
13693		return (0);
13694	}
13695	/*
13696	 * If the journal is low or our filesystem is over its limit,
13697	 * then speed up the cleanup.
13698	 */
13699	if (ump->softdep_curdeps[D_INDIRDEP] <
13700	    (max_softdeps_hard / 1000) / stat_flush_threads || jlow)
13701		softdep_speedup(ump);
13702	stat_sync_limit_hit += 1;
13703	FREE_LOCK(ump);
13704	/*
13705	 * We only slow down the rate at which new dependencies are
13706	 * generated if we are not using journaling. With journaling,
13707	 * the cleanup should always be sufficient to keep things
13708	 * under control.
13709	 */
13710	if (DOINGSUJ(vp))
13711		return (0);
13712	return (1);
13713}
13714
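/*
 * Filter for softdep_request_cleanup_inactivate(): select vnodes that
 * owe an inactivation call, have no active users, and are either
 * marked VV_NOSYNC or have an effective link count of zero.
 */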
13715static int
13716softdep_request_cleanup_filter(struct vnode *vp, void *arg __unused)
13717{
13718	return ((vp->v_iflag & VI_OWEINACT) != 0 && vp->v_usecount == 0 &&
13719	    ((vp->v_vflag & VV_NOSYNC) != 0 || VTOI(vp)->i_effnlink == 0));
13720}
13721
13722static void
13723softdep_request_cleanup_inactivate(struct mount *mp)
13724{
13725	struct vnode *vp, *mvp;
13726	int error;
13727
13728	MNT_VNODE_FOREACH_LAZY(vp, mp, mvp, softdep_request_cleanup_filter,
13729	    NULL) {
13730		vholdl(vp);
13731		vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
13732		VI_LOCK(vp);
13733		if (vp->v_data != NULL && vp->v_usecount == 0) {
13734			while ((vp->v_iflag & VI_OWEINACT) != 0) {
13735				error = vinactive(vp);
13736				if (error != 0 && error != ERELOOKUP)
13737					break;
13738			}
13739			atomic_add_int(&stat_delayed_inact, 1);
13740		}
13741		VOP_UNLOCK(vp);
13742		vdropl(vp);
13743	}
13744}
13745
13746/*
13747 * Called by the allocation routines when they are about to fail
13748 * in the hope that we can free up the requested resource (inodes
13749 * or disk space).
13750 *
13751 * First check to see if the work list has anything on it. If it has,
13752 * clean up entries until we successfully free the requested resource.
13753 * Because this process holds inodes locked, we cannot handle any remove
13754 * requests that might block on a locked inode as that could lead to
13755 * deadlock. If the worklist yields none of the requested resource,
13756 * start syncing out vnodes to free up the needed space.
13757 */
13758int
13759softdep_request_cleanup(fs, vp, cred, resource)
13760	struct fs *fs;
13761	struct vnode *vp;
13762	struct ucred *cred;
13763	int resource;
13764{
13765	struct ufsmount *ump;
13766	struct mount *mp;
13767	long starttime;
13768	ufs2_daddr_t needed;
13769	int error, failed_vnode;
13770
13771	/*
13772	 * If we are being called because of a process doing a
13773	 * copy-on-write, then it is not safe to process any
13774	 * worklist items as we will recurse into the copyonwrite
13775	 * routine.  This will result in an incoherent snapshot.
13776	 * If the vnode that we hold is a snapshot, we must avoid
13777	 * handling other resources that could cause deadlock.
13778	 */
13779	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
13780		return (0);
13781
13782	if (resource == FLUSH_BLOCKS_WAIT)
13783		stat_cleanup_blkrequests += 1;
13784	else
13785		stat_cleanup_inorequests += 1;
13786
13787	mp = vp->v_mount;
13788	ump = VFSTOUFS(mp);
13789	mtx_assert(UFS_MTX(ump), MA_OWNED);
13790	UFS_UNLOCK(ump);
13791	error = ffs_update(vp, 1);
13792	if (error != 0 || MOUNTEDSOFTDEP(mp) == 0) {
13793		UFS_LOCK(ump);
13794		return (0);
13795	}
13796	/*
13797	 * If we are in need of resources, start by cleaning up
13798	 * any block removals associated with our inode.
13799	 */
13800	ACQUIRE_LOCK(ump);
13801	process_removes(vp);
13802	process_truncates(vp);
13803	FREE_LOCK(ump);
13804	/*
13805	 * Now clean up at least as many resources as we will need.
13806	 *
13807	 * When requested to clean up inodes, the number that are needed
13808	 * is set by the number of simultaneous writers (mnt_writeopcount)
13809	 * plus a bit of slop (2) in case some more writers show up while
13810	 * we are cleaning.
13811	 *
13812	 * When requested to free up space, the amount of space that
13813	 * we need is enough blocks to allocate a full-sized segment
13814	 * (fs_contigsumsize). The number of such segments that will
13815	 * be needed is set by the number of simultaneous writers
13816	 * (mnt_writeopcount) plus a bit of slop (2) in case some more
13817	 * writers show up while we are cleaning.
13818	 *
13819	 * Additionally, if we are unprivileged and allocating space,
13820	 * we need to ensure that we clean up enough blocks to get the
13821	 * needed number of blocks over the threshold of the minimum
13822	 * number of blocks required to be kept free by the filesystem
13823	 * (fs_minfree).
13824	 */
13825	if (resource == FLUSH_INODES_WAIT) {
13826		needed = vfs_mount_fetch_counter(vp->v_mount,
13827		    MNT_COUNT_WRITEOPCOUNT) + 2;
13828	} else if (resource == FLUSH_BLOCKS_WAIT) {
13829		needed = (vfs_mount_fetch_counter(vp->v_mount,
13830		    MNT_COUNT_WRITEOPCOUNT) + 2) * fs->fs_contigsumsize;
13831		if (priv_check_cred(cred, PRIV_VFS_BLOCKRESERVE))
13832			needed += fragstoblks(fs,
13833			    roundup((fs->fs_dsize * fs->fs_minfree / 100) -
13834			    fs->fs_cstotal.cs_nffree, fs->fs_frag));
13835	} else {
13836		printf("softdep_request_cleanup: Unknown resource type %d\n",
13837		    resource);
13838		UFS_LOCK(ump);
13839		return (0);
13840	}
13841	starttime = time_second;
13842retry:
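	/*
	 * If we are short of blocks, ask the lower layers to expedite
	 * any delayed TRIM requests so that freed blocks become
	 * available for allocation sooner.
	 */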
13843	if (resource == FLUSH_BLOCKS_WAIT &&
13844	    fs->fs_cstotal.cs_nbfree <= needed)
13845		softdep_send_speedup(ump, needed * fs->fs_bsize,
13846		    BIO_SPEEDUP_TRIM);
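	/*
	 * If the shortage persists, try to process pending worklist
	 * items without sleeping; completing them may release the
	 * blocks or inodes that we need.
	 */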
13847	if ((resource == FLUSH_BLOCKS_WAIT && ump->softdep_on_worklist > 0 &&
13848	    fs->fs_cstotal.cs_nbfree <= needed) ||
13849	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13850	    fs->fs_cstotal.cs_nifree <= needed)) {
13851		ACQUIRE_LOCK(ump);
13852		if (ump->softdep_on_worklist > 0 &&
13853		    process_worklist_item(UFSTOVFS(ump),
13854		    ump->softdep_on_worklist, LK_NOWAIT) != 0)
13855			stat_worklist_push += 1;
13856		FREE_LOCK(ump);
13857	}
13858
13859	/*
13860	 * Check whether there are vnodes pending inactivation.  As they
13861	 * have been unlinked, inactivating them will free up their
13862	 * inodes.
13863	 */
13864	ACQUIRE_LOCK(ump);
13865	if (resource == FLUSH_INODES_WAIT &&
13866	    fs->fs_cstotal.cs_nifree <= needed &&
13867	    fs->fs_pendinginodes <= needed) {
13868		if ((ump->um_softdep->sd_flags & FLUSH_DI_ACTIVE) == 0) {
13869			ump->um_softdep->sd_flags |= FLUSH_DI_ACTIVE;
13870			FREE_LOCK(ump);
13871			softdep_request_cleanup_inactivate(mp);
13872			ACQUIRE_LOCK(ump);
13873			ump->um_softdep->sd_flags &= ~FLUSH_DI_ACTIVE;
13874			wakeup(&ump->um_softdep->sd_flags);
13875		} else {
13876			while ((ump->um_softdep->sd_flags &
13877			    FLUSH_DI_ACTIVE) != 0) {
13878				msleep(&ump->um_softdep->sd_flags,
13879				    LOCK_PTR(ump), PVM, "ffsvina", hz);
13880			}
13881		}
13882	}
13883	FREE_LOCK(ump);
13884
13885	/*
13886	 * If we still need resources and there are no more worklist
13887	 * entries to process to obtain them, we have to start flushing
13888	 * the dirty vnodes to force the release of additional requests
13889	 * to the worklist that we can then process to reap additional
13890	 * resources. We walk the vnodes associated with the mount point
13891	 * until we get the needed worklist requests that we can reap.
13892	 *
13893	 * If there are several threads all needing to clean the same
13894	 * mount point, only one is allowed to walk the mount list.
13895	 * When several threads all try to walk the same mount list,
13896	 * they end up competing with each other and often end up in
13897	 * livelock. This approach ensures that forward progress is
13898	 * made at the cost of occasional ENOSPC errors being returned
13899	 * that might otherwise have been avoided.
13900	 */
13901	error = 1;
13902	if ((resource == FLUSH_BLOCKS_WAIT &&
13903	     fs->fs_cstotal.cs_nbfree <= needed) ||
13904	    (resource == FLUSH_INODES_WAIT && fs->fs_pendinginodes > 0 &&
13905	     fs->fs_cstotal.cs_nifree <= needed)) {
13906		ACQUIRE_LOCK(ump);
13907		if ((ump->um_softdep->sd_flags & FLUSH_RC_ACTIVE) == 0) {
13908			ump->um_softdep->sd_flags |= FLUSH_RC_ACTIVE;
13909			FREE_LOCK(ump);
13910			failed_vnode = softdep_request_cleanup_flush(mp, ump);
13911			ACQUIRE_LOCK(ump);
13912			ump->um_softdep->sd_flags &= ~FLUSH_RC_ACTIVE;
13913			wakeup(&ump->um_softdep->sd_flags);
13914			FREE_LOCK(ump);
13915			if (ump->softdep_on_worklist > 0) {
13916				stat_cleanup_retries += 1;
13917				if (!failed_vnode)
13918					goto retry;
13919			}
13920		} else {
13921			while ((ump->um_softdep->sd_flags &
13922			    FLUSH_RC_ACTIVE) != 0) {
13923				msleep(&ump->um_softdep->sd_flags,
13924				    LOCK_PTR(ump), PVM, "ffsrca", hz);
13925			}
13926			FREE_LOCK(ump);
13927			error = 0;
13928		}
13929		stat_cleanup_failures += 1;
13930	}
13931	if (time_second - starttime > stat_cleanup_high_delay)
13932		stat_cleanup_high_delay = time_second - starttime;
13933	UFS_LOCK(ump);
13934	return (error);
13935}
13936
13937/*
13938 * Scan the vnodes for the specified mount point flushing out any
13939 * vnodes that can be locked without waiting. Finally, try to flush
13940 * the device associated with the mount point if it can be locked
13941 * without waiting.
13942 *
13943 * We return 0 if we were able to lock every vnode in our scan.
13944 * If we had to skip one or more vnodes, we return 1.
13945 */
13946static int
13947softdep_request_cleanup_flush(mp, ump)
13948	struct mount *mp;
13949	struct ufsmount *ump;
13950{
13951	struct thread *td;
13952	struct vnode *lvp, *mvp;
13953	int failed_vnode;
13954
13955	failed_vnode = 0;
13956	td = curthread;
13957	MNT_VNODE_FOREACH_ALL(lvp, mp, mvp) {
13958		if (TAILQ_FIRST(&lvp->v_bufobj.bo_dirty.bv_hd) == 0) {
13959			VI_UNLOCK(lvp);
13960			continue;
13961		}
13962		if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT) != 0) {
13963			failed_vnode = 1;
13964			continue;
13965		}
13966		if (lvp->v_vflag & VV_NOSYNC) {	/* unlinked */
13967			vput(lvp);
13968			continue;
13969		}
13970		(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
13971		vput(lvp);
13972	}
13973	lvp = ump->um_devvp;
13974	if (vn_lock(lvp, LK_EXCLUSIVE | LK_NOWAIT) == 0) {
13975		VOP_FSYNC(lvp, MNT_NOWAIT, td);
13976		VOP_UNLOCK(lvp);
13977	}
13978	return (failed_vnode);
13979}
13980
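/*
 * A dependency type is considered excessive only when both the
 * system-wide count and this filesystem's per-flush-thread share
 * exceed max_softdeps.
 */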
13981static bool
13982softdep_excess_items(struct ufsmount *ump, int item)
13983{
13984
13985	KASSERT(item >= 0 && item < D_LAST, ("item %d", item));
13986	return (dep_current[item] > max_softdeps &&
13987	    ump->softdep_curdeps[item] > max_softdeps /
13988	    stat_flush_threads);
13989}
13990
13991static void
13992schedule_cleanup(struct mount *mp)
13993{
13994	struct ufsmount *ump;
13995	struct thread *td;
13996
13997	ump = VFSTOUFS(mp);
13998	LOCK_OWNED(ump);
13999	FREE_LOCK(ump);
14000	td = curthread;
14001	if ((td->td_pflags & TDP_KTHREAD) != 0 &&
14002	    (td->td_proc->p_flag2 & P2_AST_SU) == 0) {
14003		/*
14004		 * No AST is delivered to kernel threads, so nobody
14005		 * would dereference the mp.  Some kernel threads
14006		 * explicitly check for ASTs, e.g. the NFS daemon does
14007		 * this in its serving loop.
14008		 */
14009		return;
14010	}
14011	if (td->td_su != NULL)
14012		vfs_rel(td->td_su);
14013	vfs_ref(mp);
14014	td->td_su = mp;
14015	thread_lock(td);
14016	td->td_flags |= TDF_ASTPENDING;
14017	thread_unlock(td);
14018}
14019
14020static void
14021softdep_ast_cleanup_proc(struct thread *td)
14022{
14023	struct mount *mp;
14024	struct ufsmount *ump;
14025	int error;
14026	bool req;
14027
14028	while ((mp = td->td_su) != NULL) {
14029		td->td_su = NULL;
14030		error = vfs_busy(mp, MBF_NOWAIT);
14031		vfs_rel(mp);
14032		if (error != 0)
14033			return;
14034		if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) {
14035			ump = VFSTOUFS(mp);
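			/*
			 * Keep requesting cleanup until the dependency
			 * counts fall below their thresholds; kernel
			 * threads make only a single pass.
			 */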
14036			for (;;) {
14037				req = false;
14038				ACQUIRE_LOCK(ump);
14039				if (softdep_excess_items(ump, D_INODEDEP)) {
14040					req = true;
14041					request_cleanup(mp, FLUSH_INODES);
14042				}
14043				if (softdep_excess_items(ump, D_DIRREM)) {
14044					req = true;
14045					request_cleanup(mp, FLUSH_BLOCKS);
14046				}
14047				FREE_LOCK(ump);
14048				if (softdep_excess_items(ump, D_NEWBLK) ||
14049				    softdep_excess_items(ump, D_ALLOCDIRECT) ||
14050				    softdep_excess_items(ump, D_ALLOCINDIR)) {
14051					error = vn_start_write(NULL, &mp,
14052					    V_WAIT);
14053					if (error == 0) {
14054						req = true;
14055						VFS_SYNC(mp, MNT_WAIT);
14056						vn_finished_write(mp);
14057					}
14058				}
14059				if ((td->td_pflags & TDP_KTHREAD) != 0 || !req)
14060					break;
14061			}
14062		}
14063		vfs_unbusy(mp);
14064	}
14065	if ((mp = td->td_su) != NULL) {
14066		td->td_su = NULL;
14067		vfs_rel(mp);
14068	}
14069}
14070
14071/*
14072 * If memory utilization has gotten too high, deliberately slow things
14073 * down and speed up the I/O processing.
14074 */
14075static int
14076request_cleanup(mp, resource)
14077	struct mount *mp;
14078	int resource;
14079{
14080	struct thread *td = curthread;
14081	struct ufsmount *ump;
14082
14083	ump = VFSTOUFS(mp);
14084	LOCK_OWNED(ump);
14085	/*
14086	 * We never hold up the filesystem syncer or buf daemon.
14087	 */
14088	if (td->td_pflags & (TDP_SOFTDEP|TDP_NORUNNINGBUF))
14089		return (0);
14090	/*
14091	 * First check to see if the work list has gotten backlogged.
14092	 * If it has, co-opt this process to help clean up two entries.
14093	 * Because this process may hold inodes locked, we cannot
14094	 * handle any remove requests that might block on a locked
14095	 * inode as that could lead to deadlock.  We set TDP_SOFTDEP
14096	 * to avoid recursively processing the worklist.
14097	 */
14098	if (ump->softdep_on_worklist > max_softdeps / 10) {
14099		td->td_pflags |= TDP_SOFTDEP;
14100		process_worklist_item(mp, 2, LK_NOWAIT);
14101		td->td_pflags &= ~TDP_SOFTDEP;
14102		stat_worklist_push += 2;
14103		return (1);
14104	}
14105	/*
14106	 * Next, we attempt to speed up the syncer process. If that
14107	 * is successful, then we allow the process to continue.
14108	 */
14109	if (softdep_speedup(ump) &&
14110	    resource != FLUSH_BLOCKS_WAIT &&
14111	    resource != FLUSH_INODES_WAIT)
14112		return (0);
14113	/*
14114	 * If we are resource constrained on inode dependencies, try
14115	 * flushing some dirty inodes. Otherwise, we are constrained
14116	 * by file deletions, so try accelerating flushes of directories
14117	 * with removal dependencies. We would like to do the cleanup
14118	 * here, but we probably hold an inode locked at this point and
14119	 * that might deadlock against one that we try to clean. So,
14120	 * the best that we can do is request the syncer daemon to do
14121	 * the cleanup for us.
14122	 */
14123	switch (resource) {
14124	case FLUSH_INODES:
14125	case FLUSH_INODES_WAIT:
14126		ACQUIRE_GBLLOCK(&lk);
14127		stat_ino_limit_push += 1;
14128		req_clear_inodedeps += 1;
14129		FREE_GBLLOCK(&lk);
14130		stat_countp = &stat_ino_limit_hit;
14131		break;
14132
14133	case FLUSH_BLOCKS:
14134	case FLUSH_BLOCKS_WAIT:
14135		ACQUIRE_GBLLOCK(&lk);
14136		stat_blk_limit_push += 1;
14137		req_clear_remove += 1;
14138		FREE_GBLLOCK(&lk);
14139		stat_countp = &stat_blk_limit_hit;
14140		break;
14141
14142	default:
14143		panic("request_cleanup: unknown type");
14144	}
14145	/*
14146	 * Hopefully the syncer daemon will catch up and awaken us.
14147	 * We wait at most tickdelay before proceeding in any case.
14148	 */
14149	ACQUIRE_GBLLOCK(&lk);
14150	FREE_LOCK(ump);
14151	proc_waiting += 1;
14152	if (callout_pending(&softdep_callout) == FALSE)
14153		callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2,
14154		    pause_timer, 0);
14155
14156	if ((td->td_pflags & TDP_KTHREAD) == 0)
14157		msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0);
14158	proc_waiting -= 1;
14159	FREE_GBLLOCK(&lk);
14160	ACQUIRE_LOCK(ump);
14161	return (1);
14162}
14163
14164/*
14165 * Awaken processes pausing in request_cleanup and clear proc_waiting
14166 * to indicate that there is no longer a timer running. Pause_timer
14167 * will be called with the global softdep mutex (&lk) locked.
14168 */
14169static void
14170pause_timer(arg)
14171	void *arg;
14172{
14173
14174	GBLLOCK_OWNED(&lk);
14175	/*
14176	 * The callout_*() API has acquired the mutex and will hold it
14177	 * around this function call.
14178	 */
14179	*stat_countp += proc_waiting;
14180	wakeup(&proc_waiting);
14181}
14182
14183/*
14184 * If requested, try removing inode or removal dependencies.
14185 */
14186static void
14187check_clear_deps(mp)
14188	struct mount *mp;
14189{
14190	struct ufsmount *ump;
14191	bool suj_susp;
14192
14193	/*
14194	 * Tell the lower layers that any TRIM or WRITE transactions that have
14195	 * been delayed for performance reasons should proceed to help alleviate
14196	 * the shortage faster. The race between checking req_* and the softdep
14197	 * mutex (lk) is fine since this is an advisory operation that at most
14198	 * causes deferred work to be done sooner.
14199	 */
14200	ump = VFSTOUFS(mp);
14201	suj_susp = MOUNTEDSUJ(mp) && ump->softdep_jblocks->jb_suspended;
14202	if (req_clear_remove || req_clear_inodedeps || suj_susp) {
14203		FREE_LOCK(ump);
14204		softdep_send_speedup(ump, 0, BIO_SPEEDUP_TRIM | BIO_SPEEDUP_WRITE);
14205		ACQUIRE_LOCK(ump);
14206	}
14207
14208	/*
14209	 * If we are suspended, it may be because of our using
14210	 * too many inodedeps, so help clear them out.
14211	 */
14212	if (suj_susp)
14213		clear_inodedeps(mp);
14214
14215	/*
14216	 * General requests for cleanup of backed up dependencies
14217	 */
14218	ACQUIRE_GBLLOCK(&lk);
14219	if (req_clear_inodedeps) {
14220		req_clear_inodedeps -= 1;
14221		FREE_GBLLOCK(&lk);
14222		clear_inodedeps(mp);
14223		ACQUIRE_GBLLOCK(&lk);
14224		wakeup(&proc_waiting);
14225	}
14226	if (req_clear_remove) {
14227		req_clear_remove -= 1;
14228		FREE_GBLLOCK(&lk);
14229		clear_remove(mp);
14230		ACQUIRE_GBLLOCK(&lk);
14231		wakeup(&proc_waiting);
14232	}
14233	FREE_GBLLOCK(&lk);
14234}
14235
14236/*
14237 * Flush out a directory with at least one removal dependency in an effort to
14238 * reduce the number of dirrem, freefile, and freeblks dependency structures.
14239 */
14240static void
14241clear_remove(mp)
14242	struct mount *mp;
14243{
14244	struct pagedep_hashhead *pagedephd;
14245	struct pagedep *pagedep;
14246	struct ufsmount *ump;
14247	struct vnode *vp;
14248	struct bufobj *bo;
14249	int error, cnt;
14250	ino_t ino;
14251
14252	ump = VFSTOUFS(mp);
14253	LOCK_OWNED(ump);
14254
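	/*
	 * Resume scanning the pagedep hash table where the previous
	 * call left off so that successive calls spread the flushing
	 * work across all of the hash buckets.
	 */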
14255	for (cnt = 0; cnt <= ump->pagedep_hash_size; cnt++) {
14256		pagedephd = &ump->pagedep_hashtbl[ump->pagedep_nextclean++];
14257		if (ump->pagedep_nextclean > ump->pagedep_hash_size)
14258			ump->pagedep_nextclean = 0;
14259		LIST_FOREACH(pagedep, pagedephd, pd_hash) {
14260			if (LIST_EMPTY(&pagedep->pd_dirremhd))
14261				continue;
14262			ino = pagedep->pd_ino;
14263			if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
14264				continue;
14265			FREE_LOCK(ump);
14266
14267			/*
14268			 * Let unmount clear deps
14269			 */
14270			error = vfs_busy(mp, MBF_NOWAIT);
14271			if (error != 0)
14272				goto finish_write;
14273			error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
14274			     FFSV_FORCEINSMQ);
14275			vfs_unbusy(mp);
14276			if (error != 0) {
14277				softdep_error("clear_remove: vget", error);
14278				goto finish_write;
14279			}
14280			MPASS(VTOI(vp)->i_mode != 0);
14281			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
14282				softdep_error("clear_remove: fsync", error);
14283			bo = &vp->v_bufobj;
14284			BO_LOCK(bo);
14285			drain_output(vp);
14286			BO_UNLOCK(bo);
14287			vput(vp);
14288		finish_write:
14289			vn_finished_write(mp);
14290			ACQUIRE_LOCK(ump);
14291			return;
14292		}
14293	}
14294}
14295
14296/*
14297 * Clear out a block of dirty inodes in an effort to reduce
14298 * the number of inodedep dependency structures.
14299 */
14300static void
14301clear_inodedeps(mp)
14302	struct mount *mp;
14303{
14304	struct inodedep_hashhead *inodedephd;
14305	struct inodedep *inodedep;
14306	struct ufsmount *ump;
14307	struct vnode *vp;
14308	struct fs *fs;
14309	int error, cnt;
14310	ino_t firstino, lastino, ino;
14311
14312	ump = VFSTOUFS(mp);
14313	fs = ump->um_fs;
14314	LOCK_OWNED(ump);
14315	/*
14316	 * Pick a random inode dependency to be cleared.
14317	 * We will then gather up all the inodes in its block
14318	 * that have dependencies and flush them out.
14319	 */
14320	for (cnt = 0; cnt <= ump->inodedep_hash_size; cnt++) {
14321		inodedephd = &ump->inodedep_hashtbl[ump->inodedep_nextclean++];
14322		if (ump->inodedep_nextclean > ump->inodedep_hash_size)
14323			ump->inodedep_nextclean = 0;
14324		if ((inodedep = LIST_FIRST(inodedephd)) != NULL)
14325			break;
14326	}
14327	if (inodedep == NULL)
14328		return;
14329	/*
14330	 * Find the last inode in the block with dependencies.
14331	 */
14332	firstino = rounddown2(inodedep->id_ino, INOPB(fs));
14333	for (lastino = firstino + INOPB(fs) - 1; lastino > firstino; lastino--)
14334		if (inodedep_lookup(mp, lastino, 0, &inodedep) != 0)
14335			break;
14336	/*
14337	 * Asynchronously push all but the last inode with dependencies.
14338	 * Synchronously push the last inode with dependencies to ensure
14339	 * that the inode block gets written to free up the inodedeps.
14340	 */
14341	for (ino = firstino; ino <= lastino; ino++) {
14342		if (inodedep_lookup(mp, ino, 0, &inodedep) == 0)
14343			continue;
14344		if (vn_start_write(NULL, &mp, V_NOWAIT) != 0)
14345			continue;
14346		FREE_LOCK(ump);
14347		error = vfs_busy(mp, MBF_NOWAIT); /* Let unmount clear deps */
14348		if (error != 0) {
14349			vn_finished_write(mp);
14350			ACQUIRE_LOCK(ump);
14351			return;
14352		}
14353		if ((error = ffs_vgetf(mp, ino, LK_EXCLUSIVE, &vp,
14354		    FFSV_FORCEINSMQ)) != 0) {
14355			softdep_error("clear_inodedeps: vget", error);
14356			vfs_unbusy(mp);
14357			vn_finished_write(mp);
14358			ACQUIRE_LOCK(ump);
14359			return;
14360		}
14361		vfs_unbusy(mp);
14362		if (VTOI(vp)->i_mode == 0) {
14363			vgone(vp);
14364		} else if (ino == lastino) {
14365			do {
14366				error = ffs_syncvnode(vp, MNT_WAIT, 0);
14367			} while (error == ERELOOKUP);
14368			if (error != 0)
14369				softdep_error("clear_inodedeps: fsync1", error);
14370		} else {
14371			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
14372				softdep_error("clear_inodedeps: fsync2", error);
14373			BO_LOCK(&vp->v_bufobj);
14374			drain_output(vp);
14375			BO_UNLOCK(&vp->v_bufobj);
14376		}
14377		vput(vp);
14378		vn_finished_write(mp);
14379		ACQUIRE_LOCK(ump);
14380	}
14381}
14382
14383void
14384softdep_buf_append(bp, wkhd)
14385	struct buf *bp;
14386	struct workhead *wkhd;
14387{
14388	struct worklist *wk;
14389	struct ufsmount *ump;
14390
14391	if ((wk = LIST_FIRST(wkhd)) == NULL)
14392		return;
14393	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
14394	    ("softdep_buf_append called on non-softdep filesystem"));
14395	ump = VFSTOUFS(wk->wk_mp);
14396	ACQUIRE_LOCK(ump);
14397	while ((wk = LIST_FIRST(wkhd)) != NULL) {
14398		WORKLIST_REMOVE(wk);
14399		WORKLIST_INSERT(&bp->b_dep, wk);
14400	}
14401	FREE_LOCK(ump);
14403}
14404
14405void
14406softdep_inode_append(ip, cred, wkhd)
14407	struct inode *ip;
14408	struct ucred *cred;
14409	struct workhead *wkhd;
14410{
14411	struct buf *bp;
14412	struct fs *fs;
14413	struct ufsmount *ump;
14414	int error;
14415
14416	ump = ITOUMP(ip);
14417	KASSERT(MOUNTEDSOFTDEP(UFSTOVFS(ump)) != 0,
14418	    ("softdep_inode_append called on non-softdep filesystem"));
14419	fs = ump->um_fs;
14420	error = bread(ump->um_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
14421	    (int)fs->fs_bsize, cred, &bp);
14422	if (error) {
14423		bqrelse(bp);
14424		softdep_freework(wkhd);
14425		return;
14426	}
14427	softdep_buf_append(bp, wkhd);
14428	bqrelse(bp);
14429}
14430
14431void
14432softdep_freework(wkhd)
14433	struct workhead *wkhd;
14434{
14435	struct worklist *wk;
14436	struct ufsmount *ump;
14437
14438	if ((wk = LIST_FIRST(wkhd)) == NULL)
14439		return;
14440	KASSERT(MOUNTEDSOFTDEP(wk->wk_mp) != 0,
14441	    ("softdep_freework called on non-softdep filesystem"));
14442	ump = VFSTOUFS(wk->wk_mp);
14443	ACQUIRE_LOCK(ump);
14444	handle_jwork(wkhd);
14445	FREE_LOCK(ump);
14446}
14447
14448static struct ufsmount *
14449softdep_bp_to_mp(bp)
14450	struct buf *bp;
14451{
14452	struct mount *mp;
14453	struct vnode *vp;
14454
14455	if (LIST_EMPTY(&bp->b_dep))
14456		return (NULL);
14457	vp = bp->b_vp;
14458	KASSERT(vp != NULL,
14459	    ("%s, buffer with dependencies lacks vnode", __func__));
14460
14461	/*
14462	 * The ump mount point is stable after we get a correct
14463	 * pointer, since bp is locked and this prevents unmount from
14464	 * proceeding.  But to get to it, we cannot dereference the wk_mp
14465	 * of the bp->b_dep list head: we do not yet own the SU ump lock
14466	 * and the workitem might be freed while being dereferenced.
14467	 */
14468retry:
14469	switch (vp->v_type) {
14470	case VCHR:
14471		VI_LOCK(vp);
14472		mp = vp->v_type == VCHR ? vp->v_rdev->si_mountpt : NULL;
14473		VI_UNLOCK(vp);
14474		if (mp == NULL)
14475			goto retry;
14476		break;
14477	case VREG:
14478	case VDIR:
14479	case VLNK:
14480	case VFIFO:
14481	case VSOCK:
14482		mp = vp->v_mount;
14483		break;
14484	case VBLK:
14485		vn_printf(vp, "softdep_bp_to_mp: unexpected block device\n");
14486		/* FALLTHROUGH */
14487	case VNON:
14488	case VBAD:
14489	case VMARKER:
14490		mp = NULL;
14491		break;
14492	default:
14493		vn_printf(vp, "unknown vnode type");
14494		mp = NULL;
14495		break;
14496	}
14497	return (mp != NULL ? VFSTOUFS(mp) : NULL);
14498}
14499
14500/*
14501 * Function to determine if the buffer has outstanding dependencies
14502 * that will cause a roll-back if the buffer is written. If wantcount
14503 * is set, return number of dependencies, otherwise just yes or no.
14504 */
14505static int
14506softdep_count_dependencies(bp, wantcount)
14507	struct buf *bp;
14508	int wantcount;
14509{
14510	struct worklist *wk;
14511	struct ufsmount *ump;
14512	struct bmsafemap *bmsafemap;
14513	struct freework *freework;
14514	struct inodedep *inodedep;
14515	struct indirdep *indirdep;
14516	struct freeblks *freeblks;
14517	struct allocindir *aip;
14518	struct pagedep *pagedep;
14519	struct dirrem *dirrem;
14520	struct newblk *newblk;
14521	struct mkdir *mkdir;
14522	struct diradd *dap;
14523	int i, retval;
14524
14525	ump = softdep_bp_to_mp(bp);
14526	if (ump == NULL)
14527		return (0);
14528	retval = 0;
14529	ACQUIRE_LOCK(ump);
14530	LIST_FOREACH(wk, &bp->b_dep, wk_list) {
14531		switch (wk->wk_type) {
14532		case D_INODEDEP:
14533			inodedep = WK_INODEDEP(wk);
14534			if ((inodedep->id_state & DEPCOMPLETE) == 0) {
14535				/* bitmap allocation dependency */
14536				retval += 1;
14537				if (!wantcount)
14538					goto out;
14539			}
14540			if (TAILQ_FIRST(&inodedep->id_inoupdt)) {
14541				/* direct block pointer dependency */
14542				retval += 1;
14543				if (!wantcount)
14544					goto out;
14545			}
14546			if (TAILQ_FIRST(&inodedep->id_extupdt)) {
14547				/* direct block pointer dependency */
14548				retval += 1;
14549				if (!wantcount)
14550					goto out;
14551			}
14552			if (TAILQ_FIRST(&inodedep->id_inoreflst)) {
14553				/* Add reference dependency. */
14554				retval += 1;
14555				if (!wantcount)
14556					goto out;
14557			}
14558			continue;
14559
14560		case D_INDIRDEP:
14561			indirdep = WK_INDIRDEP(wk);
14562
14563			TAILQ_FOREACH(freework, &indirdep->ir_trunc, fw_next) {
14564				/* indirect truncation dependency */
14565				retval += 1;
14566				if (!wantcount)
14567					goto out;
14568			}
14569
14570			LIST_FOREACH(aip, &indirdep->ir_deplisthd, ai_next) {
14571				/* indirect block pointer dependency */
14572				retval += 1;
14573				if (!wantcount)
14574					goto out;
14575			}
14576			continue;
14577
14578		case D_PAGEDEP:
14579			pagedep = WK_PAGEDEP(wk);
14580			LIST_FOREACH(dirrem, &pagedep->pd_dirremhd, dm_next) {
14581				if (LIST_FIRST(&dirrem->dm_jremrefhd)) {
14582					/* Journal remove ref dependency. */
14583					retval += 1;
14584					if (!wantcount)
14585						goto out;
14586				}
14587			}
14588			for (i = 0; i < DAHASHSZ; i++) {
14589				LIST_FOREACH(dap, &pagedep->pd_diraddhd[i], da_pdlist) {
14590					/* directory entry dependency */
14591					retval += 1;
14592					if (!wantcount)
14593						goto out;
14594				}
14595			}
14596			continue;
14597
14598		case D_BMSAFEMAP:
14599			bmsafemap = WK_BMSAFEMAP(wk);
14600			if (LIST_FIRST(&bmsafemap->sm_jaddrefhd)) {
14601				/* Add reference dependency. */
14602				retval += 1;
14603				if (!wantcount)
14604					goto out;
14605			}
14606			if (LIST_FIRST(&bmsafemap->sm_jnewblkhd)) {
14607				/* Allocate block dependency. */
14608				retval += 1;
14609				if (!wantcount)
14610					goto out;
14611			}
14612			continue;
14613
14614		case D_FREEBLKS:
14615			freeblks = WK_FREEBLKS(wk);
14616			if (LIST_FIRST(&freeblks->fb_jblkdephd)) {
14617				/* Freeblk journal dependency. */
14618				retval += 1;
14619				if (!wantcount)
14620					goto out;
14621			}
14622			continue;
14623
14624		case D_ALLOCDIRECT:
14625		case D_ALLOCINDIR:
14626			newblk = WK_NEWBLK(wk);
14627			if (newblk->nb_jnewblk) {
14628				/* Journal allocate dependency. */
14629				retval += 1;
14630				if (!wantcount)
14631					goto out;
14632			}
14633			continue;
14634
14635		case D_MKDIR:
14636			mkdir = WK_MKDIR(wk);
14637			if (mkdir->md_jaddref) {
14638				/* Journal reference dependency. */
14639				retval += 1;
14640				if (!wantcount)
14641					goto out;
14642			}
14643			continue;
14644
14645		case D_FREEWORK:
14646		case D_FREEDEP:
14647		case D_JSEGDEP:
14648		case D_JSEG:
14649		case D_SBDEP:
14650			/* never a dependency on these blocks */
14651			continue;
14652
14653		default:
14654			panic("softdep_count_dependencies: Unexpected type %s",
14655			    TYPENAME(wk->wk_type));
14656			/* NOTREACHED */
14657		}
14658	}
14659out:
14660	FREE_LOCK(ump);
14661	return (retval);
14662}
14663
14664/*
14665 * Acquire exclusive access to a buffer.
14666 * Must be called with a locked mtx parameter.
14667 * Return acquired buffer or NULL on failure.
14668 */
14669static struct buf *
14670getdirtybuf(bp, lock, waitfor)
14671	struct buf *bp;
14672	struct rwlock *lock;
14673	int waitfor;
14674{
14675	int error;
14676
14677	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0) {
14678		if (waitfor != MNT_WAIT)
14679			return (NULL);
14680		error = BUF_LOCK(bp,
14681		    LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, lock);
14682		/*
14683		 * Even if we successfully acquire bp here, we have dropped
14684		 * the lock, which may violate our guarantee.
14685		 */
14686		if (error == 0)
14687			BUF_UNLOCK(bp);
14688		else if (error != ENOLCK)
14689			panic("getdirtybuf: inconsistent lock: %d", error);
14690		rw_wlock(lock);
14691		return (NULL);
14692	}
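	/*
	 * If a background write of the buffer is in progress, release
	 * the buffer and, when the caller is willing to wait, sleep
	 * until the background write completes before returning NULL so
	 * that the caller can retry.
	 */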
14693	if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
14694		if (lock != BO_LOCKPTR(bp->b_bufobj) && waitfor == MNT_WAIT) {
14695			rw_wunlock(lock);
14696			BO_LOCK(bp->b_bufobj);
14697			BUF_UNLOCK(bp);
14698			if ((bp->b_vflags & BV_BKGRDINPROG) != 0) {
14699				bp->b_vflags |= BV_BKGRDWAIT;
14700				msleep(&bp->b_xflags, BO_LOCKPTR(bp->b_bufobj),
14701				       PRIBIO | PDROP, "getbuf", 0);
14702			} else
14703				BO_UNLOCK(bp->b_bufobj);
14704			rw_wlock(lock);
14705			return (NULL);
14706		}
14707		BUF_UNLOCK(bp);
14708		if (waitfor != MNT_WAIT)
14709			return (NULL);
14710#ifdef DEBUG_VFS_LOCKS
14711		if (bp->b_vp->v_type != VCHR)
14712			ASSERT_BO_WLOCKED(bp->b_bufobj);
14713#endif
14714		bp->b_vflags |= BV_BKGRDWAIT;
14715		rw_sleep(&bp->b_xflags, lock, PRIBIO, "getbuf", 0);
14716		return (NULL);
14717	}
14718	if ((bp->b_flags & B_DELWRI) == 0) {
14719		BUF_UNLOCK(bp);
14720		return (NULL);
14721	}
14722	bremfree(bp);
14723	return (bp);
14724}
14725
14726/*
14727 * Check if it is safe to suspend the file system now.  On entry,
14728 * the vnode interlock for devvp should be held.  Return 0 with
14729 * the mount interlock held if the file system can be suspended now,
14730 * otherwise return EAGAIN with the mount interlock held.
14731 */
14732int
14733softdep_check_suspend(struct mount *mp,
14734		      struct vnode *devvp,
14735		      int softdep_depcnt,
14736		      int softdep_accdepcnt,
14737		      int secondary_writes,
14738		      int secondary_accwrites)
14739{
14740	struct bufobj *bo;
14741	struct ufsmount *ump;
14742	struct inodedep *inodedep;
14743	int error, unlinked;
14744
14745	bo = &devvp->v_bufobj;
14746	ASSERT_BO_WLOCKED(bo);
14747
14748	/*
14749	 * If we are not running with soft updates, then we need only
14750	 * deal with secondary writes as we try to suspend.
14751	 */
14752	if (MOUNTEDSOFTDEP(mp) == 0) {
14753		MNT_ILOCK(mp);
14754		while (mp->mnt_secondary_writes != 0) {
14755			BO_UNLOCK(bo);
14756			msleep(&mp->mnt_secondary_writes, MNT_MTX(mp),
14757			    (PUSER - 1) | PDROP, "secwr", 0);
14758			BO_LOCK(bo);
14759			MNT_ILOCK(mp);
14760		}
14761
14762		/*
14763		 * Reasons for needing more work before suspend:
14764		 * - Dirty buffers on devvp.
14765		 * - Secondary writes occurred after start of vnode sync loop
14766		 */
14767		error = 0;
14768		if (bo->bo_numoutput > 0 ||
14769		    bo->bo_dirty.bv_cnt > 0 ||
14770		    secondary_writes != 0 ||
14771		    mp->mnt_secondary_writes != 0 ||
14772		    secondary_accwrites != mp->mnt_secondary_accwrites)
14773			error = EAGAIN;
14774		BO_UNLOCK(bo);
14775		return (error);
14776	}
14777
14778	/*
14779	 * If we are running with soft updates, then we need to coordinate
14780	 * with them as we try to suspend.
14781	 */
14782	ump = VFSTOUFS(mp);
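	/*
	 * Acquire the softdep lock and the mount interlock without
	 * sleeping while the bufobj lock is held, and wait for any
	 * in-progress secondary writes to drain.
	 */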
14783	for (;;) {
14784		if (!TRY_ACQUIRE_LOCK(ump)) {
14785			BO_UNLOCK(bo);
14786			ACQUIRE_LOCK(ump);
14787			FREE_LOCK(ump);
14788			BO_LOCK(bo);
14789			continue;
14790		}
14791		MNT_ILOCK(mp);
14792		if (mp->mnt_secondary_writes != 0) {
14793			FREE_LOCK(ump);
14794			BO_UNLOCK(bo);
14795			msleep(&mp->mnt_secondary_writes,
14796			       MNT_MTX(mp),
14797			       (PUSER - 1) | PDROP, "secwr", 0);
14798			BO_LOCK(bo);
14799			continue;
14800		}
14801		break;
14802	}
14803
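	/*
	 * Count inodedeps that remain only because they sit on the
	 * journal's unlinked-inode list; they are discounted in the
	 * dependency comparison below.
	 */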
14804	unlinked = 0;
14805	if (MOUNTEDSUJ(mp)) {
14806		for (inodedep = TAILQ_FIRST(&ump->softdep_unlinked);
14807		    inodedep != NULL;
14808		    inodedep = TAILQ_NEXT(inodedep, id_unlinked)) {
14809			if ((inodedep->id_state & (UNLINKED | UNLINKLINKS |
14810			    UNLINKONLIST)) != (UNLINKED | UNLINKLINKS |
14811			    UNLINKONLIST) ||
14812			    !check_inodedep_free(inodedep))
14813				continue;
14814			unlinked++;
14815		}
14816	}
14817
14818	/*
14819	 * Reasons for needing more work before suspend:
14820	 * - Dirty buffers on devvp.
14821	 * - Softdep activity occurred after start of vnode sync loop
14822	 * - Secondary writes occurred after start of vnode sync loop
14823	 */
14824	error = 0;
14825	if (bo->bo_numoutput > 0 ||
14826	    bo->bo_dirty.bv_cnt > 0 ||
14827	    softdep_depcnt != unlinked ||
14828	    ump->softdep_deps != unlinked ||
14829	    softdep_accdepcnt != ump->softdep_accdeps ||
14830	    secondary_writes != 0 ||
14831	    mp->mnt_secondary_writes != 0 ||
14832	    secondary_accwrites != mp->mnt_secondary_accwrites)
14833		error = EAGAIN;
14834	FREE_LOCK(ump);
14835	BO_UNLOCK(bo);
14836	return (error);
14837}
14838
14839/*
14840 * Get the number of dependency structures for the file system, both
14841 * the current number and the total number allocated.  These will
14842 * later be used to detect that softdep processing has occurred.
14843 */
14844void
14845softdep_get_depcounts(struct mount *mp,
14846		      int *softdep_depsp,
14847		      int *softdep_accdepsp)
14848{
14849	struct ufsmount *ump;
14850
14851	if (MOUNTEDSOFTDEP(mp) == 0) {
14852		*softdep_depsp = 0;
14853		*softdep_accdepsp = 0;
14854		return;
14855	}
14856	ump = VFSTOUFS(mp);
14857	ACQUIRE_LOCK(ump);
14858	*softdep_depsp = ump->softdep_deps;
14859	*softdep_accdepsp = ump->softdep_accdeps;
14860	FREE_LOCK(ump);
14861}
14862
14863/*
14864 * Wait for pending output on a vnode to complete.
14865 */
14866static void
14867drain_output(vp)
14868	struct vnode *vp;
14869{
14870
14871	ASSERT_VOP_LOCKED(vp, "drain_output");
14872	(void)bufobj_wwait(&vp->v_bufobj, 0, 0);
14873}
14874
14875/*
14876 * Called whenever a buffer that is being invalidated or reallocated
14877 * contains dependencies. This should only happen if an I/O error has
14878 * occurred. The routine is called with the buffer locked.
14879 */
14880static void
14881softdep_deallocate_dependencies(bp)
14882	struct buf *bp;
14883{
14884
14885	if ((bp->b_ioflags & BIO_ERROR) == 0)
14886		panic("softdep_deallocate_dependencies: dangling deps");
14887	if (bp->b_vp != NULL && bp->b_vp->v_mount != NULL)
14888		softdep_error(bp->b_vp->v_mount->mnt_stat.f_mntonname, bp->b_error);
14889	else
14890		printf("softdep_deallocate_dependencies: "
14891		    "got error %d while accessing filesystem\n", bp->b_error);
14892	if (bp->b_error != ENXIO)
14893		panic("softdep_deallocate_dependencies: unrecovered I/O error");
14894}
14895
14896/*
14897 * Function to handle asynchronous write errors in the filesystem.
14898 */
14899static void
14900softdep_error(func, error)
14901	char *func;
14902	int error;
14903{
14904
14905	/* XXX should do something better! */
14906	printf("%s: got error %d while accessing filesystem\n", func, error);
14907}
14908
14909#ifdef DDB
14910
14911/* exported to ffs_vfsops.c */
14912extern void db_print_ffs(struct ufsmount *ump);
14913void
14914db_print_ffs(struct ufsmount *ump)
14915{
14916	db_printf("mp %p (%s) devvp %p\n", ump->um_mountp,
14917	    ump->um_mountp->mnt_stat.f_mntonname, ump->um_devvp);
14918	db_printf("    fs %p su_wl %d su_deps %d su_req %d\n",
14919	    ump->um_fs, ump->softdep_on_worklist,
14920	    ump->softdep_deps, ump->softdep_req);
14921}
14922
14923static void
14924worklist_print(struct worklist *wk, int verbose)
14925{
14926
14927	if (!verbose) {
14928		db_printf("%s: %p state 0x%b\n", TYPENAME(wk->wk_type), wk,
14929		    (u_int)wk->wk_state, PRINT_SOFTDEP_FLAGS);
14930		return;
14931	}
14932	db_printf("worklist: %p type %s state 0x%b next %p\n    ", wk,
14933	    TYPENAME(wk->wk_type), (u_int)wk->wk_state, PRINT_SOFTDEP_FLAGS,
14934	    LIST_NEXT(wk, wk_list));
14935	db_print_ffs(VFSTOUFS(wk->wk_mp));
14936}
14937
14938static void
14939inodedep_print(struct inodedep *inodedep, int verbose)
14940{
14941
14942	worklist_print(&inodedep->id_list, 0);
14943	db_printf("    fs %p ino %jd inoblk %jd delta %jd nlink %jd\n",
14944	    inodedep->id_fs,
14945	    (intmax_t)inodedep->id_ino,
14946	    (intmax_t)fsbtodb(inodedep->id_fs,
14947	        ino_to_fsba(inodedep->id_fs, inodedep->id_ino)),
14948	    (intmax_t)inodedep->id_nlinkdelta,
14949	    (intmax_t)inodedep->id_savednlink);
14950
14951	if (verbose == 0)
14952		return;
14953
14954	db_printf("    bmsafemap %p, mkdiradd %p, inoreflst %p\n",
14955	    inodedep->id_bmsafemap,
14956	    inodedep->id_mkdiradd,
14957	    TAILQ_FIRST(&inodedep->id_inoreflst));
14958	db_printf("    dirremhd %p, pendinghd %p, bufwait %p\n",
14959	    LIST_FIRST(&inodedep->id_dirremhd),
14960	    LIST_FIRST(&inodedep->id_pendinghd),
14961	    LIST_FIRST(&inodedep->id_bufwait));
14962	db_printf("    inowait %p, inoupdt %p, newinoupdt %p\n",
14963	    LIST_FIRST(&inodedep->id_inowait),
14964	    TAILQ_FIRST(&inodedep->id_inoupdt),
14965	    TAILQ_FIRST(&inodedep->id_newinoupdt));
14966	db_printf("    extupdt %p, newextupdt %p, freeblklst %p\n",
14967	    TAILQ_FIRST(&inodedep->id_extupdt),
14968	    TAILQ_FIRST(&inodedep->id_newextupdt),
14969	    TAILQ_FIRST(&inodedep->id_freeblklst));
14970	db_printf("    saveino %p, savedsize %jd, savedextsize %jd\n",
14971	    inodedep->id_savedino1,
14972	    (intmax_t)inodedep->id_savedsize,
14973	    (intmax_t)inodedep->id_savedextsize);
14974}
14975
14976static void
14977newblk_print(struct newblk *nbp)
14978{
14979
14980	worklist_print(&nbp->nb_list, 0);
14981	db_printf("    newblkno %jd\n", (intmax_t)nbp->nb_newblkno);
14982	db_printf("    jnewblk %p, bmsafemap %p, freefrag %p\n",
14983	    &nbp->nb_jnewblk,
14984	    &nbp->nb_bmsafemap,
14985	    &nbp->nb_freefrag);
14986	db_printf("    indirdeps %p, newdirblk %p, jwork %p\n",
14987	    LIST_FIRST(&nbp->nb_indirdeps),
14988	    LIST_FIRST(&nbp->nb_newdirblk),
14989	    LIST_FIRST(&nbp->nb_jwork));
14990}
14991
14992static void
14993allocdirect_print(struct allocdirect *adp)
14994{
14995
14996	newblk_print(&adp->ad_block);
14997	db_printf("    oldblkno %jd, oldsize %ld, newsize %ld\n",
14998	    adp->ad_oldblkno, adp->ad_oldsize, adp->ad_newsize);
14999	db_printf("    offset %d, inodedep %p\n",
15000	    adp->ad_offset, adp->ad_inodedep);
15001}
15002
15003static void
15004allocindir_print(struct allocindir *aip)
15005{
15006
15007	newblk_print(&aip->ai_block);
15008	db_printf("    oldblkno %jd, lbn %jd\n",
15009	    (intmax_t)aip->ai_oldblkno, (intmax_t)aip->ai_lbn);
15010	db_printf("    offset %d, indirdep %p\n",
15011	    aip->ai_offset, aip->ai_indirdep);
15012}
15013
15014static void
15015mkdir_print(struct mkdir *mkdir)
15016{
15017
15018	worklist_print(&mkdir->md_list, 0);
15019	db_printf("    diradd %p, jaddref %p, buf %p\n",
15020		mkdir->md_diradd, mkdir->md_jaddref, mkdir->md_buf);
15021}
15022
15023DB_SHOW_COMMAND(sd_inodedep, db_show_sd_inodedep)
15024{
15025
15026	if (have_addr == 0) {
15027		db_printf("inodedep address required\n");
15028		return;
15029	}
15030	inodedep_print((struct inodedep*)addr, 1);
15031}
15032
15033DB_SHOW_COMMAND(sd_allinodedeps, db_show_sd_allinodedeps)
15034{
15035	struct inodedep_hashhead *inodedephd;
15036	struct inodedep *inodedep;
15037	struct ufsmount *ump;
15038	int cnt;
15039
15040	if (have_addr == 0) {
15041		db_printf("ufsmount address required\n");
15042		return;
15043	}
15044	ump = (struct ufsmount *)addr;
15045	for (cnt = 0; cnt < ump->inodedep_hash_size; cnt++) {
15046		inodedephd = &ump->inodedep_hashtbl[cnt];
15047		LIST_FOREACH(inodedep, inodedephd, id_hash) {
15048			inodedep_print(inodedep, 0);
15049		}
15050	}
15051}
15052
15053DB_SHOW_COMMAND(sd_worklist, db_show_sd_worklist)
15054{
15055
15056	if (have_addr == 0) {
15057		db_printf("worklist address required\n");
15058		return;
15059	}
15060	worklist_print((struct worklist *)addr, 1);
15061}
15062
15063DB_SHOW_COMMAND(sd_workhead, db_show_sd_workhead)
15064{
15065	struct worklist *wk;
15066	struct workhead *wkhd;
15067
15068	if (have_addr == 0) {
15069		db_printf("worklist address required "
15070		    "(for example value in bp->b_dep)\n");
15071		return;
15072	}
15073	/*
15074	 * We often do not have the address of the worklist head but
15075	 * instead a pointer to its first entry (e.g., we have the
15076	 * contents of bp->b_dep rather than &bp->b_dep). But the back
15077	 * pointer of bp->b_dep will point at the head of the list, so
15078	 * we cheat and use that instead. If we are in the middle of
15079	 * a list we will still get the same result, so nothing
15080	 * unexpected will result.
15081	 */
15082	wk = (struct worklist *)addr;
15083	if (wk == NULL)
15084		return;
15085	wkhd = (struct workhead *)wk->wk_list.le_prev;
15086	LIST_FOREACH(wk, wkhd, wk_list) {
15087		switch(wk->wk_type) {
15088		case D_INODEDEP:
15089			inodedep_print(WK_INODEDEP(wk), 0);
15090			continue;
15091		case D_ALLOCDIRECT:
15092			allocdirect_print(WK_ALLOCDIRECT(wk));
15093			continue;
15094		case D_ALLOCINDIR:
15095			allocindir_print(WK_ALLOCINDIR(wk));
15096			continue;
15097		case D_MKDIR:
15098			mkdir_print(WK_MKDIR(wk));
15099			continue;
15100		default:
15101			worklist_print(wk, 0);
15102			continue;
15103		}
15104	}
15105}
15106
15107DB_SHOW_COMMAND(sd_mkdir, db_show_sd_mkdir)
15108{
15109	if (have_addr == 0) {
15110		db_printf("mkdir address required\n");
15111		return;
15112	}
15113	mkdir_print((struct mkdir *)addr);
15114}
15115
15116DB_SHOW_COMMAND(sd_mkdir_list, db_show_sd_mkdir_list)
15117{
15118	struct mkdirlist *mkdirlisthd;
15119	struct mkdir *mkdir;
15120
15121	if (have_addr == 0) {
15122		db_printf("mkdir listhead address required\n");
15123		return;
15124	}
15125	mkdirlisthd = (struct mkdirlist *)addr;
15126	LIST_FOREACH(mkdir, mkdirlisthd, md_mkdirs) {
15127		mkdir_print(mkdir);
15128		if (mkdir->md_diradd != NULL) {
15129			db_printf("    ");
15130			worklist_print(&mkdir->md_diradd->da_list, 0);
15131		}
15132		if (mkdir->md_jaddref != NULL) {
15133			db_printf("    ");
15134			worklist_print(&mkdir->md_jaddref->ja_list, 0);
15135		}
15136	}
15137}
15138
15139DB_SHOW_COMMAND(sd_allocdirect, db_show_sd_allocdirect)
15140{
15141	if (have_addr == 0) {
15142		db_printf("allocdirect address required\n");
15143		return;
15144	}
15145	allocdirect_print((struct allocdirect *)addr);
15146}
15147
15148DB_SHOW_COMMAND(sd_allocindir, db_show_sd_allocindir)
15149{
15150	if (have_addr == 0) {
15151		db_printf("allocindir address required\n");
15152		return;
15153	}
15154	allocindir_print((struct allocindir *)addr);
15155}
15156
15157#endif /* DDB */
15158
15159#endif /* SOFTUPDATES */
15160