1163837Spjd/*-
2163837Spjd * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org>
3163837Spjd * All rights reserved.
4163837Spjd *
5163837Spjd * Redistribution and use in source and binary forms, with or without
6163837Spjd * modification, are permitted provided that the following conditions
7163837Spjd * are met:
8163837Spjd * 1. Redistributions of source code must retain the above copyright
9163837Spjd *    notice, this list of conditions and the following disclaimer.
10163837Spjd * 2. Redistributions in binary form must reproduce the above copyright
11163837Spjd *    notice, this list of conditions and the following disclaimer in the
12163837Spjd *    documentation and/or other materials provided with the distribution.
13163837Spjd *
14163837Spjd * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15163837Spjd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16163837Spjd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17163837Spjd * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18163837Spjd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19163837Spjd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20163837Spjd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21163837Spjd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22163837Spjd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23163837Spjd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24163837Spjd * SUCH DAMAGE.
25163837Spjd */
26163837Spjd
27163837Spjd#include <sys/cdefs.h>
28163837Spjd__FBSDID("$FreeBSD$");
29163837Spjd
30163837Spjd#include <sys/param.h>
31163837Spjd#include <sys/systm.h>
32163837Spjd#include <sys/kernel.h>
33163837Spjd#include <sys/module.h>
34163837Spjd#include <sys/limits.h>
35163837Spjd#include <sys/lock.h>
36163837Spjd#include <sys/mutex.h>
37163837Spjd#include <sys/bio.h>
38163837Spjd#include <sys/sysctl.h>
39163837Spjd#include <sys/malloc.h>
40163837Spjd#include <sys/mount.h>
41163837Spjd#include <sys/eventhandler.h>
42163837Spjd#include <sys/proc.h>
43163837Spjd#include <sys/kthread.h>
44163837Spjd#include <sys/sched.h>
45163837Spjd#include <sys/taskqueue.h>
46163837Spjd#include <sys/vnode.h>
47163837Spjd#include <sys/sbuf.h>
48163837Spjd#ifdef GJ_MEMDEBUG
49163837Spjd#include <sys/stack.h>
50163837Spjd#include <sys/kdb.h>
51163837Spjd#endif
52163837Spjd#include <vm/vm.h>
53163837Spjd#include <vm/vm_kern.h>
54163837Spjd#include <geom/geom.h>
55163837Spjd
56163837Spjd#include <geom/journal/g_journal.h>
57163837Spjd
58219029SnetchildFEATURE(geom_journal, "GEOM journaling support");
59163837Spjd
60163837Spjd/*
61163837Spjd * On-disk journal format:
62163837Spjd *
63163837Spjd * JH - Journal header
64163837Spjd * RH - Record header
65163837Spjd *
66163837Spjd * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
67163837Spjd * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ...
68163837Spjd * %%%%%% ****** +------+ +------+     ****** +------+     %%%%%%
69163837Spjd *
70163837Spjd */
71163837Spjd
72163837SpjdCTASSERT(sizeof(struct g_journal_header) <= 512);
73163837SpjdCTASSERT(sizeof(struct g_journal_record_header) <= 512);
74163837Spjd
75163837Spjdstatic MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data");
76163837Spjdstatic struct mtx g_journal_cache_mtx;
77163837SpjdMTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF);
78163837Spjd
79163837Spjdconst struct g_journal_desc *g_journal_filesystems[] = {
80163837Spjd	&g_journal_ufs,
81163837Spjd	NULL
82163837Spjd};
83163837Spjd
84163837SpjdSYSCTL_DECL(_kern_geom);
85163837Spjd
86163837Spjdint g_journal_debug = 0;
87163837SpjdTUNABLE_INT("kern.geom.journal.debug", &g_journal_debug);
88163837Spjdstatic u_int g_journal_switch_time = 10;
89163837Spjdstatic u_int g_journal_force_switch = 70;
90163837Spjdstatic u_int g_journal_parallel_flushes = 16;
91163837Spjdstatic u_int g_journal_parallel_copies = 16;
92163837Spjdstatic u_int g_journal_accept_immediately = 64;
93163837Spjdstatic u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES;
94163837Spjdstatic u_int g_journal_do_optimize = 1;
95163837Spjd
96227309Sedstatic SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0,
97227309Sed    "GEOM_JOURNAL stuff");
98163837SpjdSYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RW, &g_journal_debug, 0,
99163837Spjd    "Debug level");
100163837SpjdSYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW,
101163837Spjd    &g_journal_switch_time, 0, "Switch journals every N seconds");
102163837SpjdSYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW,
103168426Spjd    &g_journal_force_switch, 0, "Force switch when journal is N% full");
104163837SpjdSYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW,
105163837Spjd    &g_journal_parallel_flushes, 0,
106179897Slulf    "Number of flush I/O requests to send in parallel");
107163837SpjdSYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW,
108163837Spjd    &g_journal_accept_immediately, 0,
109179897Slulf    "Number of I/O requests accepted immediately");
110163837SpjdSYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW,
111163837Spjd    &g_journal_parallel_copies, 0,
112179897Slulf    "Number of copy I/O requests to send in parallel");
113163837Spjdstatic int
114163837Spjdg_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS)
115163837Spjd{
116163837Spjd	u_int entries;
117163837Spjd	int error;
118163837Spjd
119163837Spjd	entries = g_journal_record_entries;
120170289Sdwmalone	error = sysctl_handle_int(oidp, &entries, 0, req);
121163837Spjd	if (error != 0 || req->newptr == NULL)
122163837Spjd		return (error);
123163837Spjd	if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES)
124163837Spjd		return (EINVAL);
125163837Spjd	g_journal_record_entries = entries;
126163837Spjd	return (0);
127163837Spjd}
128163837SpjdSYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries,
129163837Spjd    CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I",
130163837Spjd    "Maximum number of entires in one journal record");
131163837SpjdSYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW,
132163837Spjd    &g_journal_do_optimize, 0, "Try to combine bios on flush and copy");
133163837Spjd
134163837Spjdstatic u_int g_journal_cache_used = 0;
135163837Spjdstatic u_int g_journal_cache_limit = 64 * 1024 * 1024;
136163837SpjdTUNABLE_INT("kern.geom.journal.cache.limit", &g_journal_cache_limit);
137163837Spjdstatic u_int g_journal_cache_divisor = 2;
138163837SpjdTUNABLE_INT("kern.geom.journal.cache.divisor", &g_journal_cache_divisor);
139163837Spjdstatic u_int g_journal_cache_switch = 90;
140163837Spjdstatic u_int g_journal_cache_misses = 0;
141163837Spjdstatic u_int g_journal_cache_alloc_failures = 0;
142163837Spjdstatic u_int g_journal_cache_low = 0;
143163837Spjd
144227309Sedstatic SYSCTL_NODE(_kern_geom_journal, OID_AUTO, cache, CTLFLAG_RW, 0,
145163837Spjd    "GEOM_JOURNAL cache");
146163837SpjdSYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD,
147163837Spjd    &g_journal_cache_used, 0, "Number of allocated bytes");
148163837Spjdstatic int
149163837Spjdg_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS)
150163837Spjd{
151163837Spjd	u_int limit;
152163837Spjd	int error;
153163837Spjd
154163837Spjd	limit = g_journal_cache_limit;
155170289Sdwmalone	error = sysctl_handle_int(oidp, &limit, 0, req);
156163837Spjd	if (error != 0 || req->newptr == NULL)
157163837Spjd		return (error);
158163837Spjd	g_journal_cache_limit = limit;
159163837Spjd	g_journal_cache_low = (limit / 100) * g_journal_cache_switch;
160163837Spjd	return (0);
161163837Spjd}
162163837SpjdSYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit,
163163837Spjd    CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_limit_sysctl, "I",
164163837Spjd    "Maximum number of allocated bytes");
165163837SpjdSYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN,
166163837Spjd    &g_journal_cache_divisor, 0,
167163837Spjd    "(kmem_size / kern.geom.journal.cache.divisor) == cache size");
168163837Spjdstatic int
169163837Spjdg_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS)
170163837Spjd{
171163837Spjd	u_int cswitch;
172163837Spjd	int error;
173163837Spjd
174163837Spjd	cswitch = g_journal_cache_switch;
175170289Sdwmalone	error = sysctl_handle_int(oidp, &cswitch, 0, req);
176163837Spjd	if (error != 0 || req->newptr == NULL)
177163837Spjd		return (error);
178163837Spjd	if (cswitch < 0 || cswitch > 100)
179163837Spjd		return (EINVAL);
180163837Spjd	g_journal_cache_switch = cswitch;
181163837Spjd	g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch;
182163837Spjd	return (0);
183163837Spjd}
184163837SpjdSYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch,
185163837Spjd    CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I",
186163837Spjd    "Force switch when we hit this percent of cache use");
187163837SpjdSYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW,
188163837Spjd    &g_journal_cache_misses, 0, "Number of cache misses");
189163837SpjdSYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW,
190163837Spjd    &g_journal_cache_alloc_failures, 0, "Memory allocation failures");
191163837Spjd
192163837Spjdstatic u_long g_journal_stats_bytes_skipped = 0;
193163837Spjdstatic u_long g_journal_stats_combined_ios = 0;
194163837Spjdstatic u_long g_journal_stats_switches = 0;
195163837Spjdstatic u_long g_journal_stats_wait_for_copy = 0;
196163837Spjdstatic u_long g_journal_stats_journal_full = 0;
197163837Spjdstatic u_long g_journal_stats_low_mem = 0;
198163837Spjd
199227309Sedstatic SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0,
200163837Spjd    "GEOM_JOURNAL statistics");
201163837SpjdSYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW,
202163837Spjd    &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes");
203163837SpjdSYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW,
204163837Spjd    &g_journal_stats_combined_ios, 0, "Number of combined I/O requests");
205163837SpjdSYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW,
206163837Spjd    &g_journal_stats_switches, 0, "Number of journal switches");
207163837SpjdSYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW,
208163837Spjd    &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch");
209163837SpjdSYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW,
210163837Spjd    &g_journal_stats_journal_full, 0,
211163837Spjd    "Number of times journal was almost full.");
212163837SpjdSYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW,
213163837Spjd    &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called.");
214163837Spjd
215163837Spjdstatic g_taste_t g_journal_taste;
216163837Spjdstatic g_ctl_req_t g_journal_config;
217163837Spjdstatic g_dumpconf_t g_journal_dumpconf;
218163837Spjdstatic g_init_t g_journal_init;
219163837Spjdstatic g_fini_t g_journal_fini;
220163837Spjd
/* GEOM class descriptor; these methods are invoked by the GEOM framework. */
struct g_class g_journal_class = {
	.name = G_JOURNAL_CLASS_NAME,
	.version = G_VERSION,
	.taste = g_journal_taste,
	.ctlreq = g_journal_config,
	.dumpconf = g_journal_dumpconf,
	.init = g_journal_init,
	.fini = g_journal_fini
};
230163837Spjd
231163837Spjdstatic int g_journal_destroy(struct g_journal_softc *sc);
232163837Spjdstatic void g_journal_metadata_update(struct g_journal_softc *sc);
233163837Spjdstatic void g_journal_switch_wait(struct g_journal_softc *sc);
234163837Spjd
#define	GJ_SWITCHER_WORKING	0	/* Switcher thread runs normally. */
#define	GJ_SWITCHER_DIE		1	/* Ask the switcher thread to exit. */
#define	GJ_SWITCHER_DIED	2	/* Switcher thread has exited. */
static int g_journal_switcher_state = GJ_SWITCHER_WORKING;
static int g_journal_switcher_wokenup = 0;
static int g_journal_sync_requested = 0;

#ifdef GJ_MEMDEBUG
/* Debug header prepended to every gj_malloc() allocation. */
struct meminfo {
	size_t		mi_size;	/* Requested allocation size. */
	struct stack	mi_stack;	/* Backtrace at allocation time. */
};
#endif
248163837Spjd
/*
 * We use our own malloc/realloc/free functions, so we can collect statistics
 * and force a journal switch when we're running out of cache.
 */
/*
 * Allocate "size" bytes, charging them to the journal cache accounting.
 *
 * Crossing the low-water mark (g_journal_cache_low) wakes the switcher
 * thread (at most once per switch cycle) so it can flush journals and
 * release memory.  With M_NOWAIT the allocation fails (returns NULL)
 * once the hard limit would be exceeded; otherwise M_NOWAIT is stripped
 * and malloc(9) sleeps until memory is available.
 */
static void *
gj_malloc(size_t size, int flags)
{
	void *p;
#ifdef GJ_MEMDEBUG
	struct meminfo *mi;
#endif

	mtx_lock(&g_journal_cache_mtx);
	if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup &&
	    g_journal_cache_used + size > g_journal_cache_low) {
		GJ_DEBUG(1, "No cache, waking up the switcher.");
		g_journal_switcher_wokenup = 1;
		wakeup(&g_journal_switcher_state);
	}
	if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 &&
	    g_journal_cache_used + size > g_journal_cache_limit) {
		mtx_unlock(&g_journal_cache_mtx);
		g_journal_cache_alloc_failures++;
		return (NULL);
	}
	g_journal_cache_used += size;
	mtx_unlock(&g_journal_cache_mtx);
	/* Past this point we always sleep for memory instead of failing. */
	flags &= ~M_NOWAIT;
#ifndef GJ_MEMDEBUG
	p = malloc(size, M_JOURNAL, flags | M_WAITOK);
#else
	/*
	 * Prepend a meminfo header recording the requested size and the
	 * allocation backtrace; the caller gets a pointer just past it.
	 */
	mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK);
	p = (u_char *)mi + sizeof(*mi);
	mi->mi_size = size;
	stack_save(&mi->mi_stack);
#endif
	return (p);
}
287163837Spjd
/*
 * Free memory obtained from gj_malloc()/gj_realloc() and return the bytes
 * to the cache accounting.  "size" must match the size used at allocation
 * time; with GJ_MEMDEBUG a mismatch is reported together with the
 * allocation and free backtraces.
 */
static void
gj_free(void *p, size_t size)
{
#ifdef GJ_MEMDEBUG
	struct meminfo *mi;
#endif

	KASSERT(p != NULL, ("p=NULL"));
	KASSERT(size > 0, ("size=0"));
	mtx_lock(&g_journal_cache_mtx);
	KASSERT(g_journal_cache_used >= size, ("Freeing too much?"));
	g_journal_cache_used -= size;
	mtx_unlock(&g_journal_cache_mtx);
#ifdef GJ_MEMDEBUG
	/* Step back to the meminfo header placed in front of the data. */
	mi = p = (void *)((u_char *)p - sizeof(*mi));
	if (mi->mi_size != size) {
		printf("GJOURNAL: Size mismatch! %zu != %zu\n", size,
		    mi->mi_size);
		printf("GJOURNAL: Alloc backtrace:\n");
		stack_print(&mi->mi_stack);
		printf("GJOURNAL: Free backtrace:\n");
		kdb_backtrace();
	}
#endif
	free(p, M_JOURNAL);
}
314163837Spjd
315163837Spjdstatic void *
316163837Spjdgj_realloc(void *p, size_t size, size_t oldsize)
317163837Spjd{
318163837Spjd	void *np;
319163837Spjd
320163837Spjd#ifndef GJ_MEMDEBUG
321163837Spjd	mtx_lock(&g_journal_cache_mtx);
322163837Spjd	g_journal_cache_used -= oldsize;
323163837Spjd	g_journal_cache_used += size;
324163837Spjd	mtx_unlock(&g_journal_cache_mtx);
325163837Spjd	np = realloc(p, size, M_JOURNAL, M_WAITOK);
326163837Spjd#else
327163837Spjd	np = gj_malloc(size, M_WAITOK);
328163837Spjd	bcopy(p, np, MIN(oldsize, size));
329163837Spjd	gj_free(p, oldsize);
330163837Spjd#endif
331163837Spjd	return (np);
332163837Spjd}
333163837Spjd
/*
 * Sanity-check journal offsets and force a journal switch when the active
 * journal occupies too much of the free journal space.
 *
 * Panics when sc_journal_offset has run into the inactive journal's range,
 * i.e. new records would overwrite data that has not been committed yet.
 */
static void
g_journal_check_overflow(struct g_journal_softc *sc)
{
	off_t length, used;

	if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset &&
	     sc->sc_journal_offset >= sc->sc_inactive.jj_offset) ||
	    (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset &&
	     sc->sc_journal_offset >= sc->sc_inactive.jj_offset &&
	     sc->sc_journal_offset < sc->sc_active.jj_offset)) {
		panic("Journal overflow "
		    "(id = %u joffset=%jd active=%jd inactive=%jd)",
		    (unsigned)sc->sc_id,
		    (intmax_t)sc->sc_journal_offset,
		    (intmax_t)sc->sc_active.jj_offset,
		    (intmax_t)sc->sc_inactive.jj_offset);
	}
	/*
	 * Compute the free journal space (length) and how much of it the
	 * active journal uses (used), handling wrap-around of the journal
	 * area [sc_jstart, sc_jend).
	 */
	if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) {
		length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset;
		used = sc->sc_journal_offset - sc->sc_active.jj_offset;
	} else {
		length = sc->sc_jend - sc->sc_active.jj_offset;
		length += sc->sc_inactive.jj_offset - sc->sc_jstart;
		if (sc->sc_journal_offset >= sc->sc_active.jj_offset)
			used = sc->sc_journal_offset - sc->sc_active.jj_offset;
		else {
			used = sc->sc_jend - sc->sc_active.jj_offset;
			used += sc->sc_journal_offset - sc->sc_jstart;
		}
	}
	/* Already woken up? */
	if (g_journal_switcher_wokenup)
		return;
	/*
	 * If the active journal takes more than g_journal_force_switch percent
	 * of free journal space, we force journal switch.
	 */
	KASSERT(length > 0,
	    ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd",
	    (intmax_t)length, (intmax_t)used,
	    (intmax_t)sc->sc_active.jj_offset,
	    (intmax_t)sc->sc_inactive.jj_offset,
	    (intmax_t)sc->sc_journal_offset));
	if ((used * 100) / length > g_journal_force_switch) {
		g_journal_stats_journal_full++;
		GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.",
		    sc->sc_name, (used * 100) / length);
		mtx_lock(&g_journal_cache_mtx);
		g_journal_switcher_wokenup = 1;
		wakeup(&g_journal_switcher_state);
		mtx_unlock(&g_journal_cache_mtx);
	}
}
387163837Spjd
388163837Spjdstatic void
389163837Spjdg_journal_orphan(struct g_consumer *cp)
390163837Spjd{
391163837Spjd	struct g_journal_softc *sc;
392163837Spjd	char name[256];
393163837Spjd	int error;
394163837Spjd
395163837Spjd	g_topology_assert();
396163837Spjd	sc = cp->geom->softc;
397164821Spjd	strlcpy(name, cp->provider->name, sizeof(name));
398164821Spjd	GJ_DEBUG(0, "Lost provider %s.", name);
399164821Spjd	if (sc == NULL)
400164821Spjd		return;
401163837Spjd	error = g_journal_destroy(sc);
402163837Spjd	if (error == 0)
403163837Spjd		GJ_DEBUG(0, "Journal %s destroyed.", name);
404163837Spjd	else {
405163837Spjd		GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). "
406163837Spjd		    "Destroy it manually after last close.", sc->sc_name,
407163837Spjd		    error);
408163837Spjd	}
409163837Spjd}
410163837Spjd
/*
 * GEOM access method.  Marks the device dirty (clears GJF_DEVICE_CLEAN and
 * rewrites metadata) on the first writer open.  While the softc is gone or
 * the device is being destroyed, only access drops are allowed.
 */
static int
g_journal_access(struct g_provider *pp, int acr, int acw, int ace)
{
	struct g_journal_softc *sc;
	int dcr, dcw, dce;

	g_topology_assert();
	GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name,
	    acr, acw, ace);

	/* Resulting counts after applying the deltas. */
	dcr = pp->acr + acr;
	dcw = pp->acw + acw;
	dce = pp->ace + ace;

	sc = pp->geom->softc;
	if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		else
			return (ENXIO);
	}
	/* First writer: the device is no longer clean. */
	if (pp->acw == 0 && dcw > 0) {
		GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name);
		sc->sc_flags &= ~GJF_DEVICE_CLEAN;
		g_topology_unlock();
		g_journal_metadata_update(sc);
		g_topology_lock();
	} /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) {
		GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
		sc->sc_flags |= GJF_DEVICE_CLEAN;
		g_topology_unlock();
		g_journal_metadata_update(sc);
		g_topology_lock();
	} */
	return (0);
}
447163837Spjd
448163837Spjdstatic void
449163837Spjdg_journal_header_encode(struct g_journal_header *hdr, u_char *data)
450163837Spjd{
451163837Spjd
452163837Spjd	bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC));
453163837Spjd	data += sizeof(GJ_HEADER_MAGIC);
454163837Spjd	le32enc(data, hdr->jh_journal_id);
455163837Spjd	data += 4;
456163837Spjd	le32enc(data, hdr->jh_journal_next_id);
457163837Spjd}
458163837Spjd
459163837Spjdstatic int
460163837Spjdg_journal_header_decode(const u_char *data, struct g_journal_header *hdr)
461163837Spjd{
462163837Spjd
463163837Spjd	bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic));
464163837Spjd	data += sizeof(hdr->jh_magic);
465163837Spjd	if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0)
466163837Spjd		return (EINVAL);
467163837Spjd	hdr->jh_journal_id = le32dec(data);
468163837Spjd	data += 4;
469163837Spjd	hdr->jh_journal_next_id = le32dec(data);
470163837Spjd	return (0);
471163837Spjd}
472163837Spjd
473163837Spjdstatic void
474163837Spjdg_journal_flush_cache(struct g_journal_softc *sc)
475163837Spjd{
476163837Spjd	struct bintime bt;
477163837Spjd	int error;
478163837Spjd
479163837Spjd	if (sc->sc_bio_flush == 0)
480163837Spjd		return;
481163837Spjd	GJ_TIMER_START(1, &bt);
482163837Spjd	if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) {
483163837Spjd		error = g_io_flush(sc->sc_jconsumer);
484163837Spjd		GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
485163837Spjd		    sc->sc_jconsumer->provider->name, error);
486163837Spjd	}
487163837Spjd	if (sc->sc_bio_flush & GJ_FLUSH_DATA) {
488163837Spjd		/*
489163837Spjd		 * TODO: This could be called in parallel with the
490163837Spjd		 *       previous call.
491163837Spjd		 */
492163837Spjd		error = g_io_flush(sc->sc_dconsumer);
493163837Spjd		GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
494163837Spjd		    sc->sc_dconsumer->provider->name, error);
495163837Spjd	}
496163837Spjd	GJ_TIMER_STOP(1, &bt, "Cache flush time");
497163837Spjd}
498163837Spjd
/*
 * Write a journal header (current and next journal IDs) at the current
 * journal offset on the journal consumer.  Returns the g_write_data()
 * error, if any.
 *
 * Note: sc_journal_offset is advanced past the header sector even when
 * the write failed (see the commented-out error check below).
 */
static int
g_journal_write_header(struct g_journal_softc *sc)
{
	struct g_journal_header hdr;
	struct g_consumer *cp;
	u_char *buf;
	int error;

	cp = sc->sc_jconsumer;
	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);

	strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic));
	hdr.jh_journal_id = sc->sc_journal_id;
	hdr.jh_journal_next_id = sc->sc_journal_next_id;
	g_journal_header_encode(&hdr, buf);
	error = g_write_data(cp, sc->sc_journal_offset, buf,
	    cp->provider->sectorsize);
	/* if (error == 0) */
	sc->sc_journal_offset += cp->provider->sectorsize;

	gj_free(buf, cp->provider->sectorsize);
	return (error);
}
522163837Spjd
/*
 * Every journal record has a header and data following it.
 * Functions below are used to encode the header into little endian
 * before storing it on disk and to decode it back to system endianness
 * after reading it.
 */
/*
 * Serialize a record header into its little-endian on-disk form; only
 * the first jrh_nentries entries are written.
 */
static void
g_journal_record_header_encode(struct g_journal_record_header *hdr,
    u_char *data)
{
	struct g_journal_entry *ent;
	u_int i;

	bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
	data += sizeof(GJ_RECORD_HEADER_MAGIC);
	le32enc(data, hdr->jrh_journal_id);
	/*
	 * Only 4 bytes were written but 8 are skipped; the remaining 4
	 * bytes are left untouched (the decoder skips them as well).
	 */
	data += 8;
	le16enc(data, hdr->jrh_nentries);
	data += 2;
	bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
	data += 8;
	/* Each entry: journal offset, data offset, length (8 bytes each). */
	for (i = 0; i < hdr->jrh_nentries; i++) {
		ent = &hdr->jrh_entries[i];
		le64enc(data, ent->je_joffset);
		data += 8;
		le64enc(data, ent->je_offset);
		data += 8;
		le64enc(data, ent->je_length);
		data += 8;
	}
}
553163837Spjd
/*
 * Parse an on-disk (little-endian) record header into *hdr.  Returns
 * EINVAL on a magic mismatch or when the entry count exceeds
 * GJ_RECORD_HEADER_NENTRIES.
 */
static int
g_journal_record_header_decode(const u_char *data,
    struct g_journal_record_header *hdr)
{
	struct g_journal_entry *ent;
	u_int i;

	bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
	data += sizeof(hdr->jrh_magic);
	if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
		return (EINVAL);
	hdr->jrh_journal_id = le32dec(data);
	/* 4 bytes of id plus 4 reserved bytes (mirrors the encoder). */
	data += 8;
	hdr->jrh_nentries = le16dec(data);
	data += 2;
	/* Reject a corrupted count before indexing jrh_entries[]. */
	if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
		return (EINVAL);
	bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
	data += 8;
	for (i = 0; i < hdr->jrh_nentries; i++) {
		ent = &hdr->jrh_entries[i];
		ent->je_joffset = le64dec(data);
		data += 8;
		ent->je_offset = le64dec(data);
		data += 8;
		ent->je_length = le64dec(data);
		data += 8;
	}
	return (0);
}
584163837Spjd
/*
 * Function reads metadata from a provider (via the given consumer), decodes
 * it to system endianness and verifies its correctness.
 */
static int
g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	/* Metadata is stored in last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (buf == NULL) {
		GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
		    cp->provider->name, error);
		return (error);
	}

	/* Decode metadata. */
	error = journal_metadata_decode(buf, md);
	g_free(buf);
	/* Is this a gjournal provider at all? */
	if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
		return (EINVAL);
	/*
	 * Are we able to handle this version of metadata?
	 * We only maintain backward compatibility.
	 */
	if (md->md_version > G_JOURNAL_VERSION) {
		GJ_DEBUG(0,
		    "Kernel module is too old to handle metadata from %s.",
		    cp->provider->name);
		return (EINVAL);
	}
	/*
	 * Is checksum correct?  The decode error is deliberately examined
	 * only after the magic and version checks above.
	 */
	if (error != 0) {
		GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
		    cp->provider->name);
		return (error);
	}
	return (0);
}
638163837Spjd
639163837Spjd/*
640163837Spjd * Two functions below are responsible for updating metadata.
641163837Spjd * Only metadata on the data provider is updated (we need to update
642163837Spjd * information about active journal in there).
643163837Spjd */
644163837Spjdstatic void
645163837Spjdg_journal_metadata_done(struct bio *bp)
646163837Spjd{
647163837Spjd
648163837Spjd	/*
649163837Spjd	 * There is not much we can do on error except informing about it.
650163837Spjd	 */
651163837Spjd	if (bp->bio_error != 0) {
652163837Spjd		GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
653163837Spjd		    bp->bio_error);
654163837Spjd	} else {
655163837Spjd		GJ_LOGREQ(2, bp, "Metadata updated.");
656163837Spjd	}
657163837Spjd	gj_free(bp->bio_data, bp->bio_length);
658163837Spjd	g_destroy_bio(bp);
659163837Spjd}
660163837Spjd
/*
 * Build the current metadata from the softc and write it to the last
 * sector of the data provider.  The caches are flushed before the write
 * (so the on-disk state the metadata describes is real) and after it
 * (so the metadata itself reaches the disk).
 */
static void
g_journal_metadata_update(struct g_journal_softc *sc)
{
	struct g_journal_metadata md;
	struct g_consumer *cp;
	struct bio *bp;
	u_char *sector;

	cp = sc->sc_dconsumer;
	sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
	strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
	md.md_version = G_JOURNAL_VERSION;
	md.md_id = sc->sc_id;
	md.md_type = sc->sc_orig_type;
	md.md_jstart = sc->sc_jstart;
	md.md_jend = sc->sc_jend;
	md.md_joffset = sc->sc_inactive.jj_offset;
	md.md_jid = sc->sc_journal_previous_id;
	md.md_flags = 0;
	if (sc->sc_flags & GJF_DEVICE_CLEAN)
		md.md_flags |= GJ_FLAG_CLEAN;

	if (sc->sc_flags & GJF_DEVICE_HARDCODED)
		strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
	else
		bzero(md.md_provider, sizeof(md.md_provider));
	md.md_provsize = cp->provider->mediasize;
	journal_metadata_encode(&md, sector);

	/*
	 * Flush the cache, so we know all data is on disk.
	 * We write information like "journal is consistent" here, so we need
	 * to be sure it is. Without BIO_FLUSH here, we could end up with
	 * metadata stored on disk, but not all of the data.
	 */
	g_journal_flush_cache(sc);

	bp = g_alloc_bio();
	bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
	bp->bio_length = cp->provider->sectorsize;
	bp->bio_data = sector;
	bp->bio_cmd = BIO_WRITE;
	if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
		bp->bio_done = g_journal_metadata_done;
		g_io_request(bp, cp);
	} else {
		/* On destroy, wait for the write synchronously. */
		bp->bio_done = NULL;
		g_io_request(bp, cp);
		biowait(bp, "gjmdu");
		g_journal_metadata_done(bp);
	}

	/*
	 * Be sure metadata reached the disk.
	 */
	g_journal_flush_cache(sc);
}
718163837Spjd
719163837Spjd/*
720163837Spjd * This is where the I/O request comes from the GEOM.
721163837Spjd */
/*
 * GEOM start method: entry point for I/O requests to the journaled
 * provider.  Reads and writes are queued for the worker thread; the
 * "GJOURNAL::provider" attribute is answered inline; everything else
 * (including BIO_DELETE) is rejected with EOPNOTSUPP.
 */
static void
g_journal_start(struct bio *bp)
{
	struct g_journal_softc *sc;

	sc = bp->bio_to->geom->softc;
	GJ_LOGREQ(3, bp, "Request received.");

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
		mtx_lock(&sc->sc_mtx);
		bioq_insert_tail(&sc->sc_regular_queue, bp);
		wakeup(sc);
		mtx_unlock(&sc->sc_mtx);
		return;
	case BIO_GETATTR:
		if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) {
			strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length);
			bp->bio_completed = strlen(bp->bio_to->name) + 1;
			g_io_deliver(bp, 0);
			return;
		}
		/* FALLTHROUGH */
	case BIO_DELETE:
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
}
752163837Spjd
753163837Spjdstatic void
754163837Spjdg_journal_std_done(struct bio *bp)
755163837Spjd{
756163837Spjd	struct g_journal_softc *sc;
757163837Spjd
758163837Spjd	sc = bp->bio_from->geom->softc;
759163837Spjd	mtx_lock(&sc->sc_mtx);
760163837Spjd	bioq_insert_tail(&sc->sc_back_queue, bp);
761163837Spjd	wakeup(sc);
762163837Spjd	mtx_unlock(&sc->sc_mtx);
763163837Spjd}
764163837Spjd
765163837Spjdstatic struct bio *
766163837Spjdg_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data,
767163837Spjd    int flags)
768163837Spjd{
769163837Spjd	struct bio *bp;
770163837Spjd
771163837Spjd	bp = g_alloc_bio();
772163837Spjd	bp->bio_offset = start;
773163837Spjd	bp->bio_joffset = joffset;
774163837Spjd	bp->bio_length = end - start;
775163837Spjd	bp->bio_cmd = BIO_WRITE;
776163837Spjd	bp->bio_done = g_journal_std_done;
777163837Spjd	if (data == NULL)
778163837Spjd		bp->bio_data = NULL;
779163837Spjd	else {
780163837Spjd		bp->bio_data = gj_malloc(bp->bio_length, flags);
781163837Spjd		if (bp->bio_data != NULL)
782163837Spjd			bcopy(data, bp->bio_data, bp->bio_length);
783163837Spjd	}
784163837Spjd	return (bp);
785163837Spjd}
786163837Spjd
787163837Spjd#define	g_journal_insert_bio(head, bp, flags)				\
788163837Spjd	g_journal_insert((head), (bp)->bio_offset,			\
789163837Spjd		(bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset,	\
790163837Spjd		(bp)->bio_data, flags)
791163837Spjd/*
792163837Spjd * The function below does a lot more than just inserting bio to the queue.
793163837Spjd * It keeps the queue sorted by offset and ensures that there are no doubled
794163837Spjd * data (it combines bios where ranges overlap).
795163837Spjd *
796163837Spjd * The function returns the number of bios inserted (as bio can be splitted).
797163837Spjd */
798163837Spjdstatic int
799163837Spjdg_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset,
800163837Spjd    u_char *data, int flags)
801163837Spjd{
802163837Spjd	struct bio *nbp, *cbp, *pbp;
803163837Spjd	off_t cstart, cend;
804163837Spjd	u_char *tmpdata;
805163837Spjd	int n;
806163837Spjd
807163837Spjd	GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend,
808163837Spjd	    joffset);
809163837Spjd	n = 0;
810163837Spjd	pbp = NULL;
811163837Spjd	GJQ_FOREACH(*head, cbp) {
812163837Spjd		cstart = cbp->bio_offset;
813163837Spjd		cend = cbp->bio_offset + cbp->bio_length;
814163837Spjd
815163837Spjd		if (nstart >= cend) {
816163837Spjd			/*
817163837Spjd			 *  +-------------+
818163837Spjd			 *  |             |
819163837Spjd			 *  |   current   |  +-------------+
820163837Spjd			 *  |     bio     |  |             |
821163837Spjd			 *  |             |  |     new     |
822163837Spjd			 *  +-------------+  |     bio     |
823163837Spjd			 *                   |             |
824163837Spjd			 *                   +-------------+
825163837Spjd			 */
826163837Spjd			GJ_DEBUG(3, "INSERT(%p): 1", *head);
827163837Spjd		} else if (nend <= cstart) {
828163837Spjd			/*
829163837Spjd			 *                   +-------------+
830163837Spjd			 *                   |             |
831163837Spjd			 *  +-------------+  |   current   |
832163837Spjd			 *  |             |  |     bio     |
833163837Spjd			 *  |     new     |  |             |
834163837Spjd			 *  |     bio     |  +-------------+
835163837Spjd			 *  |             |
836163837Spjd			 *  +-------------+
837163837Spjd			 */
838163837Spjd			nbp = g_journal_new_bio(nstart, nend, joffset, data,
839163837Spjd			    flags);
840163837Spjd			if (pbp == NULL)
841163837Spjd				*head = nbp;
842163837Spjd			else
843163837Spjd				pbp->bio_next = nbp;
844163837Spjd			nbp->bio_next = cbp;
845163837Spjd			n++;
846163837Spjd			GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp,
847163837Spjd			    pbp);
848163837Spjd			goto end;
849163837Spjd		} else if (nstart <= cstart && nend >= cend) {
850163837Spjd			/*
851163837Spjd			 *      +-------------+      +-------------+
852163837Spjd			 *      | current bio |      | current bio |
853163837Spjd			 *  +---+-------------+---+  +-------------+---+
854163837Spjd			 *  |   |             |   |  |             |   |
855163837Spjd			 *  |   |             |   |  |             |   |
856163837Spjd			 *  |   +-------------+   |  +-------------+   |
857163837Spjd			 *  |       new bio       |  |     new bio     |
858163837Spjd			 *  +---------------------+  +-----------------+
859163837Spjd			 *
860163837Spjd			 *      +-------------+  +-------------+
861163837Spjd			 *      | current bio |  | current bio |
862163837Spjd			 *  +---+-------------+  +-------------+
863163837Spjd			 *  |   |             |  |             |
864163837Spjd			 *  |   |             |  |             |
865163837Spjd			 *  |   +-------------+  +-------------+
866163837Spjd			 *  |     new bio     |  |   new bio   |
867163837Spjd			 *  +-----------------+  +-------------+
868163837Spjd			 */
869163837Spjd			g_journal_stats_bytes_skipped += cbp->bio_length;
870163837Spjd			cbp->bio_offset = nstart;
871163837Spjd			cbp->bio_joffset = joffset;
872163837Spjd			cbp->bio_length = cend - nstart;
873163837Spjd			if (cbp->bio_data != NULL) {
874163837Spjd				gj_free(cbp->bio_data, cend - cstart);
875163837Spjd				cbp->bio_data = NULL;
876163837Spjd			}
877163837Spjd			if (data != NULL) {
878163837Spjd				cbp->bio_data = gj_malloc(cbp->bio_length,
879163837Spjd				    flags);
880163837Spjd				if (cbp->bio_data != NULL) {
881163837Spjd					bcopy(data, cbp->bio_data,
882163837Spjd					    cbp->bio_length);
883163837Spjd				}
884163837Spjd				data += cend - nstart;
885163837Spjd			}
886163837Spjd			joffset += cend - nstart;
887163837Spjd			nstart = cend;
888163837Spjd			GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp);
889163837Spjd		} else if (nstart > cstart && nend >= cend) {
890163837Spjd			/*
891163837Spjd			 *  +-----------------+  +-------------+
892163837Spjd			 *  |   current bio   |  | current bio |
893163837Spjd			 *  |   +-------------+  |   +---------+---+
894163837Spjd			 *  |   |             |  |   |         |   |
895163837Spjd			 *  |   |             |  |   |         |   |
896163837Spjd			 *  +---+-------------+  +---+---------+   |
897163837Spjd			 *      |   new bio   |      |   new bio   |
898163837Spjd			 *      +-------------+      +-------------+
899163837Spjd			 */
900163837Spjd			g_journal_stats_bytes_skipped += cend - nstart;
901163837Spjd			nbp = g_journal_new_bio(nstart, cend, joffset, data,
902163837Spjd			    flags);
903163837Spjd			nbp->bio_next = cbp->bio_next;
904163837Spjd			cbp->bio_next = nbp;
905163837Spjd			cbp->bio_length = nstart - cstart;
906163837Spjd			if (cbp->bio_data != NULL) {
907163837Spjd				cbp->bio_data = gj_realloc(cbp->bio_data,
908163837Spjd				    cbp->bio_length, cend - cstart);
909163837Spjd			}
910163837Spjd			if (data != NULL)
911163837Spjd				data += cend - nstart;
912163837Spjd			joffset += cend - nstart;
913163837Spjd			nstart = cend;
914163837Spjd			n++;
915163837Spjd			GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp);
916163837Spjd		} else if (nstart > cstart && nend < cend) {
917163837Spjd			/*
918163837Spjd			 *  +---------------------+
919163837Spjd			 *  |     current bio     |
920163837Spjd			 *  |   +-------------+   |
921163837Spjd			 *  |   |             |   |
922163837Spjd			 *  |   |             |   |
923163837Spjd			 *  +---+-------------+---+
924163837Spjd			 *      |   new bio   |
925163837Spjd			 *      +-------------+
926163837Spjd			 */
927163837Spjd			g_journal_stats_bytes_skipped += nend - nstart;
928163837Spjd			nbp = g_journal_new_bio(nstart, nend, joffset, data,
929163837Spjd			    flags);
930163837Spjd			nbp->bio_next = cbp->bio_next;
931163837Spjd			cbp->bio_next = nbp;
932163837Spjd			if (cbp->bio_data == NULL)
933163837Spjd				tmpdata = NULL;
934163837Spjd			else
935163837Spjd				tmpdata = cbp->bio_data + nend - cstart;
936163837Spjd			nbp = g_journal_new_bio(nend, cend,
937163837Spjd			    cbp->bio_joffset + nend - cstart, tmpdata, flags);
938163837Spjd			nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next;
939163837Spjd			((struct bio *)cbp->bio_next)->bio_next = nbp;
940163837Spjd			cbp->bio_length = nstart - cstart;
941163837Spjd			if (cbp->bio_data != NULL) {
942163837Spjd				cbp->bio_data = gj_realloc(cbp->bio_data,
943163837Spjd				    cbp->bio_length, cend - cstart);
944163837Spjd			}
945163837Spjd			n += 2;
946163837Spjd			GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp);
947163837Spjd			goto end;
948163837Spjd		} else if (nstart <= cstart && nend < cend) {
949163837Spjd			/*
950163837Spjd			 *  +-----------------+      +-------------+
951163837Spjd			 *  |   current bio   |      | current bio |
952163837Spjd			 *  +-------------+   |  +---+---------+   |
953163837Spjd			 *  |             |   |  |   |         |   |
954163837Spjd			 *  |             |   |  |   |         |   |
955163837Spjd			 *  +-------------+---+  |   +---------+---+
956163837Spjd			 *  |   new bio   |      |   new bio   |
957163837Spjd			 *  +-------------+      +-------------+
958163837Spjd			 */
959163837Spjd			g_journal_stats_bytes_skipped += nend - nstart;
960163837Spjd			nbp = g_journal_new_bio(nstart, nend, joffset, data,
961163837Spjd			    flags);
962163837Spjd			if (pbp == NULL)
963163837Spjd				*head = nbp;
964163837Spjd			else
965163837Spjd				pbp->bio_next = nbp;
966163837Spjd			nbp->bio_next = cbp;
967163837Spjd			cbp->bio_offset = nend;
968163837Spjd			cbp->bio_length = cend - nend;
969163837Spjd			cbp->bio_joffset += nend - cstart;
970163837Spjd			tmpdata = cbp->bio_data;
971163837Spjd			if (tmpdata != NULL) {
972163837Spjd				cbp->bio_data = gj_malloc(cbp->bio_length,
973163837Spjd				    flags);
974163837Spjd				if (cbp->bio_data != NULL) {
975163837Spjd					bcopy(tmpdata + nend - cstart,
976163837Spjd					    cbp->bio_data, cbp->bio_length);
977163837Spjd				}
978163837Spjd				gj_free(tmpdata, cend - cstart);
979163837Spjd			}
980163837Spjd			n++;
981163837Spjd			GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp);
982163837Spjd			goto end;
983163837Spjd		}
984163837Spjd		if (nstart == nend)
985163837Spjd			goto end;
986163837Spjd		pbp = cbp;
987163837Spjd	}
988163837Spjd	nbp = g_journal_new_bio(nstart, nend, joffset, data, flags);
989163837Spjd	if (pbp == NULL)
990163837Spjd		*head = nbp;
991163837Spjd	else
992163837Spjd		pbp->bio_next = nbp;
993163837Spjd	nbp->bio_next = NULL;
994163837Spjd	n++;
995163837Spjd	GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp);
996163837Spjdend:
997163837Spjd	if (g_journal_debug >= 3) {
998163837Spjd		GJQ_FOREACH(*head, cbp) {
999163837Spjd			GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp,
1000163837Spjd			    (intmax_t)cbp->bio_offset,
1001163837Spjd			    (intmax_t)cbp->bio_length,
1002163837Spjd			    (intmax_t)cbp->bio_joffset, cbp->bio_data);
1003163837Spjd		}
1004163837Spjd		GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n);
1005163837Spjd	}
1006163837Spjd	return (n);
1007163837Spjd}
1008163837Spjd
1009163837Spjd/*
1010163837Spjd * The function combines neighbour bios trying to squeeze as much data as
1011163837Spjd * possible into one bio.
1012163837Spjd *
1013163837Spjd * The function returns the number of bios combined (negative value).
1014163837Spjd */
1015163837Spjdstatic int
1016163837Spjdg_journal_optimize(struct bio *head)
1017163837Spjd{
1018163837Spjd	struct bio *cbp, *pbp;
1019163837Spjd	int n;
1020163837Spjd
1021163837Spjd	n = 0;
1022163837Spjd	pbp = NULL;
1023163837Spjd	GJQ_FOREACH(head, cbp) {
1024163837Spjd		/* Skip bios which has to be read first. */
1025163837Spjd		if (cbp->bio_data == NULL) {
1026163837Spjd			pbp = NULL;
1027163837Spjd			continue;
1028163837Spjd		}
1029163837Spjd		/* There is no previous bio yet. */
1030163837Spjd		if (pbp == NULL) {
1031163837Spjd			pbp = cbp;
1032163837Spjd			continue;
1033163837Spjd		}
1034163837Spjd		/* Is this a neighbour bio? */
1035163837Spjd		if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) {
1036163837Spjd			/* Be sure that bios queue is sorted. */
1037163837Spjd			KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset,
1038163837Spjd			    ("poffset=%jd plength=%jd coffset=%jd",
1039163837Spjd			    (intmax_t)pbp->bio_offset,
1040163837Spjd			    (intmax_t)pbp->bio_length,
1041163837Spjd			    (intmax_t)cbp->bio_offset));
1042163837Spjd			pbp = cbp;
1043163837Spjd			continue;
1044163837Spjd		}
1045163837Spjd		/* Be sure we don't end up with too big bio. */
1046163837Spjd		if (pbp->bio_length + cbp->bio_length > MAXPHYS) {
1047163837Spjd			pbp = cbp;
1048163837Spjd			continue;
1049163837Spjd		}
1050163837Spjd		/* Ok, we can join bios. */
1051163837Spjd		GJ_LOGREQ(4, pbp, "Join: ");
1052163837Spjd		GJ_LOGREQ(4, cbp, "and: ");
1053163837Spjd		pbp->bio_data = gj_realloc(pbp->bio_data,
1054163837Spjd		    pbp->bio_length + cbp->bio_length, pbp->bio_length);
1055163837Spjd		bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length,
1056163837Spjd		    cbp->bio_length);
1057163837Spjd		gj_free(cbp->bio_data, cbp->bio_length);
1058163837Spjd		pbp->bio_length += cbp->bio_length;
1059163837Spjd		pbp->bio_next = cbp->bio_next;
1060163837Spjd		g_destroy_bio(cbp);
1061163837Spjd		cbp = pbp;
1062163837Spjd		g_journal_stats_combined_ios++;
1063163837Spjd		n--;
1064163837Spjd		GJ_LOGREQ(4, pbp, "Got: ");
1065163837Spjd	}
1066163837Spjd	return (n);
1067163837Spjd}
1068163837Spjd
1069163837Spjd/*
1070163837Spjd * TODO: Update comment.
1071163837Spjd * These are functions responsible for copying one portion of data from journal
1072163837Spjd * to the destination provider.
1073163837Spjd * The order goes like this:
1074163837Spjd * 1. Read the header, which contains informations about data blocks
1075163837Spjd *    following it.
1076163837Spjd * 2. Read the data blocks from the journal.
1077163837Spjd * 3. Write the data blocks on the data provider.
1078163837Spjd *
1079163837Spjd * g_journal_copy_start()
1080163837Spjd * g_journal_copy_done() - got finished write request, logs potential errors.
1081163837Spjd */
1082163837Spjd
1083163837Spjd/*
1084163837Spjd * When there is no data in cache, this function is used to read it.
1085163837Spjd */
1086163837Spjdstatic void
1087163837Spjdg_journal_read_first(struct g_journal_softc *sc, struct bio *bp)
1088163837Spjd{
1089163837Spjd	struct bio *cbp;
1090163837Spjd
1091163837Spjd	/*
1092163837Spjd	 * We were short in memory, so data was freed.
1093163837Spjd	 * In that case we need to read it back from journal.
1094163837Spjd	 */
1095163837Spjd	cbp = g_alloc_bio();
1096163837Spjd	cbp->bio_cflags = bp->bio_cflags;
1097163837Spjd	cbp->bio_parent = bp;
1098163837Spjd	cbp->bio_offset = bp->bio_joffset;
1099163837Spjd	cbp->bio_length = bp->bio_length;
1100163837Spjd	cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK);
1101163837Spjd	cbp->bio_cmd = BIO_READ;
1102163837Spjd	cbp->bio_done = g_journal_std_done;
1103163837Spjd	GJ_LOGREQ(4, cbp, "READ FIRST");
1104163837Spjd	g_io_request(cbp, sc->sc_jconsumer);
1105163837Spjd	g_journal_cache_misses++;
1106163837Spjd}
1107163837Spjd
1108163837Spjdstatic void
1109163837Spjdg_journal_copy_send(struct g_journal_softc *sc)
1110163837Spjd{
1111163837Spjd	struct bio *bioq, *bp, *lbp;
1112163837Spjd
1113163837Spjd	bioq = lbp = NULL;
1114163837Spjd	mtx_lock(&sc->sc_mtx);
1115163837Spjd	for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) {
1116163837Spjd		bp = GJQ_FIRST(sc->sc_inactive.jj_queue);
1117163837Spjd		if (bp == NULL)
1118163837Spjd			break;
1119163837Spjd		GJQ_REMOVE(sc->sc_inactive.jj_queue, bp);
1120163837Spjd		sc->sc_copy_in_progress++;
1121163837Spjd		GJQ_INSERT_AFTER(bioq, bp, lbp);
1122163837Spjd		lbp = bp;
1123163837Spjd	}
1124163837Spjd	mtx_unlock(&sc->sc_mtx);
1125163837Spjd	if (g_journal_do_optimize)
1126163837Spjd		sc->sc_copy_in_progress += g_journal_optimize(bioq);
1127163837Spjd	while ((bp = GJQ_FIRST(bioq)) != NULL) {
1128163837Spjd		GJQ_REMOVE(bioq, bp);
1129163837Spjd		GJQ_INSERT_HEAD(sc->sc_copy_queue, bp);
1130163837Spjd		bp->bio_cflags = GJ_BIO_COPY;
1131163837Spjd		if (bp->bio_data == NULL)
1132163837Spjd			g_journal_read_first(sc, bp);
1133163837Spjd		else {
1134163837Spjd			bp->bio_joffset = 0;
1135163837Spjd			GJ_LOGREQ(4, bp, "SEND");
1136163837Spjd			g_io_request(bp, sc->sc_dconsumer);
1137163837Spjd		}
1138163837Spjd	}
1139163837Spjd}
1140163837Spjd
1141163837Spjdstatic void
1142163837Spjdg_journal_copy_start(struct g_journal_softc *sc)
1143163837Spjd{
1144163837Spjd
1145163837Spjd	/*
1146163837Spjd	 * Remember in metadata that we're starting to copy journaled data
1147163837Spjd	 * to the data provider.
1148163837Spjd	 * In case of power failure, we will copy these data once again on boot.
1149163837Spjd	 */
1150163837Spjd	if (!sc->sc_journal_copying) {
1151163837Spjd		sc->sc_journal_copying = 1;
1152163837Spjd		GJ_DEBUG(1, "Starting copy of journal.");
1153163837Spjd		g_journal_metadata_update(sc);
1154163837Spjd	}
1155163837Spjd	g_journal_copy_send(sc);
1156163837Spjd}
1157163837Spjd
1158163837Spjd/*
1159163837Spjd * Data block has been read from the journal provider.
1160163837Spjd */
1161163837Spjdstatic int
1162163837Spjdg_journal_copy_read_done(struct bio *bp)
1163163837Spjd{
1164163837Spjd	struct g_journal_softc *sc;
1165163837Spjd	struct g_consumer *cp;
1166163837Spjd	struct bio *pbp;
1167163837Spjd
1168163837Spjd	KASSERT(bp->bio_cflags == GJ_BIO_COPY,
1169163837Spjd	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
1170163837Spjd
1171163837Spjd	sc = bp->bio_from->geom->softc;
1172163837Spjd	pbp = bp->bio_parent;
1173163837Spjd
1174163837Spjd	if (bp->bio_error != 0) {
1175163837Spjd		GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
1176163837Spjd		    bp->bio_to->name, bp->bio_error);
1177163837Spjd		/*
1178163837Spjd		 * We will not be able to deliver WRITE request as well.
1179163837Spjd		 */
1180163837Spjd		gj_free(bp->bio_data, bp->bio_length);
1181163837Spjd		g_destroy_bio(pbp);
1182163837Spjd		g_destroy_bio(bp);
1183163837Spjd		sc->sc_copy_in_progress--;
1184163837Spjd		return (1);
1185163837Spjd	}
1186163837Spjd	pbp->bio_data = bp->bio_data;
1187163837Spjd	cp = sc->sc_dconsumer;
1188163837Spjd	g_io_request(pbp, cp);
1189163837Spjd	GJ_LOGREQ(4, bp, "READ DONE");
1190163837Spjd	g_destroy_bio(bp);
1191163837Spjd	return (0);
1192163837Spjd}
1193163837Spjd
1194163837Spjd/*
1195163837Spjd * Data block has been written to the data provider.
1196163837Spjd */
1197163837Spjdstatic void
1198163837Spjdg_journal_copy_write_done(struct bio *bp)
1199163837Spjd{
1200163837Spjd	struct g_journal_softc *sc;
1201163837Spjd
1202163837Spjd	KASSERT(bp->bio_cflags == GJ_BIO_COPY,
1203163837Spjd	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY));
1204163837Spjd
1205163837Spjd	sc = bp->bio_from->geom->softc;
1206163837Spjd	sc->sc_copy_in_progress--;
1207163837Spjd
1208163837Spjd	if (bp->bio_error != 0) {
1209179897Slulf		GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)",
1210163837Spjd		    bp->bio_error);
1211163837Spjd	}
1212163837Spjd	GJQ_REMOVE(sc->sc_copy_queue, bp);
1213163837Spjd	gj_free(bp->bio_data, bp->bio_length);
1214163837Spjd	GJ_LOGREQ(4, bp, "DONE");
1215163837Spjd	g_destroy_bio(bp);
1216163837Spjd
1217163837Spjd	if (sc->sc_copy_in_progress == 0) {
1218163837Spjd		/*
1219163837Spjd		 * This was the last write request for this journal.
1220163837Spjd		 */
1221163837Spjd		GJ_DEBUG(1, "Data has been copied.");
1222163837Spjd		sc->sc_journal_copying = 0;
1223163837Spjd	}
1224163837Spjd}
1225163837Spjd
1226163837Spjdstatic void g_journal_flush_done(struct bio *bp);
1227163837Spjd
1228163837Spjd/*
1229163837Spjd * Flush one record onto active journal provider.
1230163837Spjd */
1231163837Spjdstatic void
1232163837Spjdg_journal_flush(struct g_journal_softc *sc)
1233163837Spjd{
1234163837Spjd	struct g_journal_record_header hdr;
1235163837Spjd	struct g_journal_entry *ent;
1236163837Spjd	struct g_provider *pp;
1237163837Spjd	struct bio **bioq;
1238163837Spjd	struct bio *bp, *fbp, *pbp;
1239163837Spjd	off_t joffset, size;
1240163837Spjd	u_char *data, hash[16];
1241163837Spjd	MD5_CTX ctx;
1242163837Spjd	u_int i;
1243163837Spjd
1244163837Spjd	if (sc->sc_current_count == 0)
1245163837Spjd		return;
1246163837Spjd
1247163837Spjd	size = 0;
1248163837Spjd	pp = sc->sc_jprovider;
1249163837Spjd	GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
1250163837Spjd	joffset = sc->sc_journal_offset;
1251163837Spjd
1252163837Spjd	GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.",
1253163837Spjd	    sc->sc_current_count, pp->name, (intmax_t)joffset);
1254163837Spjd
1255163837Spjd	/*
1256163837Spjd	 * Store 'journal id', so we know to which journal this record belongs.
1257163837Spjd	 */
1258163837Spjd	hdr.jrh_journal_id = sc->sc_journal_id;
1259163837Spjd	/* Could be less than g_journal_record_entries if called due timeout. */
1260163837Spjd	hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries);
1261163837Spjd	strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic));
1262163837Spjd
1263163837Spjd	bioq = &sc->sc_active.jj_queue;
1264163837Spjd	pbp = sc->sc_flush_queue;
1265163837Spjd
1266163837Spjd	fbp = g_alloc_bio();
1267163837Spjd	fbp->bio_parent = NULL;
1268163837Spjd	fbp->bio_cflags = GJ_BIO_JOURNAL;
1269163837Spjd	fbp->bio_offset = -1;
1270163837Spjd	fbp->bio_joffset = joffset;
1271163837Spjd	fbp->bio_length = pp->sectorsize;
1272163837Spjd	fbp->bio_cmd = BIO_WRITE;
1273163837Spjd	fbp->bio_done = g_journal_std_done;
1274163837Spjd	GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp);
1275163837Spjd	pbp = fbp;
1276163837Spjd	fbp->bio_to = pp;
1277163837Spjd	GJ_LOGREQ(4, fbp, "FLUSH_OUT");
1278163837Spjd	joffset += pp->sectorsize;
1279163837Spjd	sc->sc_flush_count++;
1280163837Spjd	if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
1281163837Spjd		MD5Init(&ctx);
1282163837Spjd
1283163837Spjd	for (i = 0; i < hdr.jrh_nentries; i++) {
1284163837Spjd		bp = sc->sc_current_queue;
1285163837Spjd		KASSERT(bp != NULL, ("NULL bp"));
1286163837Spjd		bp->bio_to = pp;
1287163837Spjd		GJ_LOGREQ(4, bp, "FLUSHED");
1288163837Spjd		sc->sc_current_queue = bp->bio_next;
1289163837Spjd		bp->bio_next = NULL;
1290163837Spjd		sc->sc_current_count--;
1291163837Spjd
1292163837Spjd		/* Add to the header. */
1293163837Spjd		ent = &hdr.jrh_entries[i];
1294163837Spjd		ent->je_offset = bp->bio_offset;
1295163837Spjd		ent->je_joffset = joffset;
1296163837Spjd		ent->je_length = bp->bio_length;
1297163837Spjd		size += ent->je_length;
1298163837Spjd
1299163837Spjd		data = bp->bio_data;
1300163837Spjd		if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
1301163837Spjd			MD5Update(&ctx, data, ent->je_length);
1302163837Spjd		bzero(bp, sizeof(*bp));
1303163837Spjd		bp->bio_cflags = GJ_BIO_JOURNAL;
1304163837Spjd		bp->bio_offset = ent->je_offset;
1305163837Spjd		bp->bio_joffset = ent->je_joffset;
1306163837Spjd		bp->bio_length = ent->je_length;
1307163837Spjd		bp->bio_data = data;
1308163837Spjd		bp->bio_cmd = BIO_WRITE;
1309163837Spjd		bp->bio_done = g_journal_std_done;
1310163837Spjd		GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp);
1311163837Spjd		pbp = bp;
1312163837Spjd		bp->bio_to = pp;
1313163837Spjd		GJ_LOGREQ(4, bp, "FLUSH_OUT");
1314163837Spjd		joffset += bp->bio_length;
1315163837Spjd		sc->sc_flush_count++;
1316163837Spjd
1317163837Spjd		/*
1318163837Spjd		 * Add request to the active sc_journal_queue queue.
1319163837Spjd		 * This is our cache. After journal switch we don't have to
1320163837Spjd		 * read the data from the inactive journal, because we keep
1321163837Spjd		 * it in memory.
1322163837Spjd		 */
1323163837Spjd		g_journal_insert(bioq, ent->je_offset,
1324163837Spjd		    ent->je_offset + ent->je_length, ent->je_joffset, data,
1325163837Spjd		    M_NOWAIT);
1326163837Spjd	}
1327163837Spjd
1328163837Spjd	/*
1329163837Spjd	 * After all requests, store valid header.
1330163837Spjd	 */
1331163837Spjd	data = gj_malloc(pp->sectorsize, M_WAITOK);
1332163837Spjd	if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
1333163837Spjd		MD5Final(hash, &ctx);
1334163837Spjd		bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum));
1335163837Spjd	}
1336163837Spjd	g_journal_record_header_encode(&hdr, data);
1337163837Spjd	fbp->bio_data = data;
1338163837Spjd
1339163837Spjd	sc->sc_journal_offset = joffset;
1340163837Spjd
1341163837Spjd	g_journal_check_overflow(sc);
1342163837Spjd}
1343163837Spjd
1344163837Spjd/*
1345163837Spjd * Flush request finished.
1346163837Spjd */
1347163837Spjdstatic void
1348163837Spjdg_journal_flush_done(struct bio *bp)
1349163837Spjd{
1350163837Spjd	struct g_journal_softc *sc;
1351163837Spjd	struct g_consumer *cp;
1352163837Spjd
1353163837Spjd	KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL,
1354163837Spjd	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL));
1355163837Spjd
1356163837Spjd	cp = bp->bio_from;
1357163837Spjd	sc = cp->geom->softc;
1358163837Spjd	sc->sc_flush_in_progress--;
1359163837Spjd
1360163837Spjd	if (bp->bio_error != 0) {
1361179897Slulf		GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)",
1362163837Spjd		    bp->bio_error);
1363163837Spjd	}
1364163837Spjd	gj_free(bp->bio_data, bp->bio_length);
1365163837Spjd	GJ_LOGREQ(4, bp, "DONE");
1366163837Spjd	g_destroy_bio(bp);
1367163837Spjd}
1368163837Spjd
1369163837Spjdstatic void g_journal_release_delayed(struct g_journal_softc *sc);
1370163837Spjd
1371163837Spjdstatic void
1372163837Spjdg_journal_flush_send(struct g_journal_softc *sc)
1373163837Spjd{
1374163837Spjd	struct g_consumer *cp;
1375163837Spjd	struct bio *bioq, *bp, *lbp;
1376163837Spjd
1377163837Spjd	cp = sc->sc_jconsumer;
1378163837Spjd	bioq = lbp = NULL;
1379163837Spjd	while (sc->sc_flush_in_progress < g_journal_parallel_flushes) {
1380163837Spjd		/* Send one flush requests to the active journal. */
1381163837Spjd		bp = GJQ_FIRST(sc->sc_flush_queue);
1382163837Spjd		if (bp != NULL) {
1383163837Spjd			GJQ_REMOVE(sc->sc_flush_queue, bp);
1384163837Spjd			sc->sc_flush_count--;
1385163837Spjd			bp->bio_offset = bp->bio_joffset;
1386163837Spjd			bp->bio_joffset = 0;
1387163837Spjd			sc->sc_flush_in_progress++;
1388163837Spjd			GJQ_INSERT_AFTER(bioq, bp, lbp);
1389163837Spjd			lbp = bp;
1390163837Spjd		}
1391163837Spjd		/* Try to release delayed requests. */
1392163837Spjd		g_journal_release_delayed(sc);
1393163837Spjd		/* If there are no requests to flush, leave. */
1394163837Spjd		if (GJQ_FIRST(sc->sc_flush_queue) == NULL)
1395163837Spjd			break;
1396163837Spjd	}
1397163837Spjd	if (g_journal_do_optimize)
1398163837Spjd		sc->sc_flush_in_progress += g_journal_optimize(bioq);
1399163837Spjd	while ((bp = GJQ_FIRST(bioq)) != NULL) {
1400163837Spjd		GJQ_REMOVE(bioq, bp);
1401163837Spjd		GJ_LOGREQ(3, bp, "Flush request send");
1402163837Spjd		g_io_request(bp, cp);
1403163837Spjd	}
1404163837Spjd}
1405163837Spjd
1406163837Spjdstatic void
1407163837Spjdg_journal_add_current(struct g_journal_softc *sc, struct bio *bp)
1408163837Spjd{
1409163837Spjd	int n;
1410163837Spjd
1411163837Spjd	GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count);
1412163837Spjd	n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK);
1413163837Spjd	sc->sc_current_count += n;
1414163837Spjd	n = g_journal_optimize(sc->sc_current_queue);
1415163837Spjd	sc->sc_current_count += n;
1416163837Spjd	/*
1417163837Spjd	 * For requests which are added to the current queue we deliver
1418163837Spjd	 * response immediately.
1419163837Spjd	 */
1420163837Spjd	bp->bio_completed = bp->bio_length;
1421163837Spjd	g_io_deliver(bp, 0);
1422163837Spjd	if (sc->sc_current_count >= g_journal_record_entries) {
1423163837Spjd		/*
1424163837Spjd		 * Let's flush one record onto active journal provider.
1425163837Spjd		 */
1426163837Spjd		g_journal_flush(sc);
1427163837Spjd	}
1428163837Spjd}
1429163837Spjd
1430163837Spjdstatic void
1431163837Spjdg_journal_release_delayed(struct g_journal_softc *sc)
1432163837Spjd{
1433163837Spjd	struct bio *bp;
1434163837Spjd
1435163837Spjd	for (;;) {
1436163837Spjd		/* The flush queue is full, exit. */
1437163837Spjd		if (sc->sc_flush_count >= g_journal_accept_immediately)
1438163837Spjd			return;
1439163837Spjd		bp = bioq_takefirst(&sc->sc_delayed_queue);
1440163837Spjd		if (bp == NULL)
1441163837Spjd			return;
1442163837Spjd		sc->sc_delayed_count--;
1443163837Spjd		g_journal_add_current(sc, bp);
1444163837Spjd	}
1445163837Spjd}
1446163837Spjd
1447163837Spjd/*
1448163837Spjd * Add I/O request to the current queue. If we have enough requests for one
1449163837Spjd * journal record we flush them onto active journal provider.
1450163837Spjd */
1451163837Spjdstatic void
1452163837Spjdg_journal_add_request(struct g_journal_softc *sc, struct bio *bp)
1453163837Spjd{
1454163837Spjd
1455163837Spjd	/*
1456163837Spjd	 * The flush queue is full, we need to delay the request.
1457163837Spjd	 */
1458163837Spjd	if (sc->sc_delayed_count > 0 ||
1459163837Spjd	    sc->sc_flush_count >= g_journal_accept_immediately) {
1460163837Spjd		GJ_LOGREQ(4, bp, "DELAYED");
1461163837Spjd		bioq_insert_tail(&sc->sc_delayed_queue, bp);
1462163837Spjd		sc->sc_delayed_count++;
1463163837Spjd		return;
1464163837Spjd	}
1465163837Spjd
1466163837Spjd	KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue),
1467163837Spjd	    ("DELAYED queue not empty."));
1468163837Spjd	g_journal_add_current(sc, bp);
1469163837Spjd}
1470163837Spjd
1471163837Spjdstatic void g_journal_read_done(struct bio *bp);
1472163837Spjd
1473163837Spjd/*
1474163837Spjd * Try to find requested data in cache.
1475163837Spjd */
1476163837Spjdstatic struct bio *
1477163837Spjdg_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart,
1478163837Spjd    off_t oend)
1479163837Spjd{
1480163837Spjd	off_t cstart, cend;
1481163837Spjd	struct bio *bp;
1482163837Spjd
1483163837Spjd	GJQ_FOREACH(head, bp) {
1484163837Spjd		if (bp->bio_offset == -1)
1485163837Spjd			continue;
1486163837Spjd		cstart = MAX(ostart, bp->bio_offset);
1487163837Spjd		cend = MIN(oend, bp->bio_offset + bp->bio_length);
1488163837Spjd		if (cend <= ostart)
1489163837Spjd			continue;
1490163837Spjd		else if (cstart >= oend) {
1491163837Spjd			if (!sorted)
1492163837Spjd				continue;
1493163837Spjd			else {
1494163837Spjd				bp = NULL;
1495163837Spjd				break;
1496163837Spjd			}
1497163837Spjd		}
1498163837Spjd		if (bp->bio_data == NULL)
1499163837Spjd			break;
1500163837Spjd		GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
1501163837Spjd		    bp);
1502163837Spjd		bcopy(bp->bio_data + cstart - bp->bio_offset,
1503163837Spjd		    pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
1504163837Spjd		pbp->bio_completed += cend - cstart;
1505163837Spjd		if (pbp->bio_completed == pbp->bio_length) {
1506163837Spjd			/*
1507163837Spjd			 * Cool, the whole request was in cache, deliver happy
1508163837Spjd			 * message.
1509163837Spjd			 */
1510163837Spjd			g_io_deliver(pbp, 0);
1511163837Spjd			return (pbp);
1512163837Spjd		}
1513163837Spjd		break;
1514163837Spjd	}
1515163837Spjd	return (bp);
1516163837Spjd}
1517163837Spjd
1518163837Spjd/*
1519163837Spjd * Try to find requested data in cache.
1520163837Spjd */
1521163837Spjdstatic struct bio *
1522163837Spjdg_journal_read_queue_find(struct bio_queue *head, struct bio *pbp, off_t ostart,
1523163837Spjd    off_t oend)
1524163837Spjd{
1525163837Spjd	off_t cstart, cend;
1526163837Spjd	struct bio *bp;
1527163837Spjd
1528163837Spjd	TAILQ_FOREACH(bp, head, bio_queue) {
1529163837Spjd		cstart = MAX(ostart, bp->bio_offset);
1530163837Spjd		cend = MIN(oend, bp->bio_offset + bp->bio_length);
1531163837Spjd		if (cend <= ostart)
1532163837Spjd			continue;
1533163837Spjd		else if (cstart >= oend)
1534163837Spjd			continue;
1535163837Spjd		KASSERT(bp->bio_data != NULL,
1536163837Spjd		    ("%s: bio_data == NULL", __func__));
1537163837Spjd		GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend,
1538163837Spjd		    bp);
1539163837Spjd		bcopy(bp->bio_data + cstart - bp->bio_offset,
1540163837Spjd		    pbp->bio_data + cstart - pbp->bio_offset, cend - cstart);
1541163837Spjd		pbp->bio_completed += cend - cstart;
1542163837Spjd		if (pbp->bio_completed == pbp->bio_length) {
1543163837Spjd			/*
1544163837Spjd			 * Cool, the whole request was in cache, deliver happy
1545163837Spjd			 * message.
1546163837Spjd			 */
1547163837Spjd			g_io_deliver(pbp, 0);
1548163837Spjd			return (pbp);
1549163837Spjd		}
1550163837Spjd		break;
1551163837Spjd	}
1552163837Spjd	return (bp);
1553163837Spjd}
1554163837Spjd
1555163837Spjd/*
1556163837Spjd * This function is used for colecting data on read.
1557163837Spjd * The complexity is because parts of the data can be stored in four different
1558163837Spjd * places:
1559163837Spjd * - in delayed requests
1560163837Spjd * - in memory - the data not yet send to the active journal provider
1561163837Spjd * - in requests which are going to be sent to the active journal
1562163837Spjd * - in the active journal
1563163837Spjd * - in the inactive journal
1564163837Spjd * - in the data provider
1565163837Spjd */
1566163837Spjdstatic void
1567163837Spjdg_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart,
1568163837Spjd    off_t oend)
1569163837Spjd{
1570163837Spjd	struct bio *bp, *nbp, *head;
1571163837Spjd	off_t cstart, cend;
1572163837Spjd	u_int i, sorted = 0;
1573163837Spjd
1574163837Spjd	GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend);
1575163837Spjd
1576163837Spjd	cstart = cend = -1;
1577163837Spjd	bp = NULL;
1578163837Spjd	head = NULL;
1579163837Spjd	for (i = 0; i <= 5; i++) {
1580163837Spjd		switch (i) {
1581163837Spjd		case 0:	/* Delayed requests. */
1582163837Spjd			head = NULL;
1583163837Spjd			sorted = 0;
1584163837Spjd			break;
1585163837Spjd		case 1:	/* Not-yet-send data. */
1586163837Spjd			head = sc->sc_current_queue;
1587163837Spjd			sorted = 1;
1588163837Spjd			break;
1589163837Spjd		case 2:	/* In-flight to the active journal. */
1590163837Spjd			head = sc->sc_flush_queue;
1591163837Spjd			sorted = 0;
1592163837Spjd			break;
1593163837Spjd		case 3:	/* Active journal. */
1594163837Spjd			head = sc->sc_active.jj_queue;
1595163837Spjd			sorted = 1;
1596163837Spjd			break;
1597163837Spjd		case 4:	/* Inactive journal. */
1598163837Spjd			/*
1599163837Spjd			 * XXX: Here could be a race with g_journal_lowmem().
1600163837Spjd			 */
1601163837Spjd			head = sc->sc_inactive.jj_queue;
1602163837Spjd			sorted = 1;
1603163837Spjd			break;
1604163837Spjd		case 5:	/* In-flight to the data provider. */
1605163837Spjd			head = sc->sc_copy_queue;
1606163837Spjd			sorted = 0;
1607163837Spjd			break;
1608163837Spjd		default:
1609163837Spjd			panic("gjournal %s: i=%d", __func__, i);
1610163837Spjd		}
1611163837Spjd		if (i == 0)
1612163837Spjd			bp = g_journal_read_queue_find(&sc->sc_delayed_queue.queue, pbp, ostart, oend);
1613163837Spjd		else
1614163837Spjd			bp = g_journal_read_find(head, sorted, pbp, ostart, oend);
1615163837Spjd		if (bp == pbp) { /* Got the whole request. */
1616163837Spjd			GJ_DEBUG(2, "Got the whole request from %u.", i);
1617163837Spjd			return;
1618163837Spjd		} else if (bp != NULL) {
1619163837Spjd			cstart = MAX(ostart, bp->bio_offset);
1620163837Spjd			cend = MIN(oend, bp->bio_offset + bp->bio_length);
1621163837Spjd			GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).",
1622163837Spjd			    i, (intmax_t)cstart, (intmax_t)cend);
1623163837Spjd			break;
1624163837Spjd		}
1625163837Spjd	}
1626163837Spjd	if (bp != NULL) {
1627163837Spjd		if (bp->bio_data == NULL) {
1628163906Spjd			nbp = g_duplicate_bio(pbp);
1629163837Spjd			nbp->bio_cflags = GJ_BIO_READ;
1630163837Spjd			nbp->bio_data =
1631163837Spjd			    pbp->bio_data + cstart - pbp->bio_offset;
1632163837Spjd			nbp->bio_offset =
1633163837Spjd			    bp->bio_joffset + cstart - bp->bio_offset;
1634163837Spjd			nbp->bio_length = cend - cstart;
1635163837Spjd			nbp->bio_done = g_journal_read_done;
1636163837Spjd			g_io_request(nbp, sc->sc_jconsumer);
1637163837Spjd		}
1638163837Spjd		/*
1639163837Spjd		 * If we don't have the whole request yet, call g_journal_read()
1640163837Spjd		 * recursively.
1641163837Spjd		 */
1642163837Spjd		if (ostart < cstart)
1643163837Spjd			g_journal_read(sc, pbp, ostart, cstart);
1644163837Spjd		if (oend > cend)
1645163837Spjd			g_journal_read(sc, pbp, cend, oend);
1646163837Spjd	} else {
1647163837Spjd		/*
1648163837Spjd		 * No data in memory, no data in journal.
1649163837Spjd		 * Its time for asking data provider.
1650163837Spjd		 */
1651163837Spjd		GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend);
1652163906Spjd		nbp = g_duplicate_bio(pbp);
1653163837Spjd		nbp->bio_cflags = GJ_BIO_READ;
1654163837Spjd		nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset;
1655163837Spjd		nbp->bio_offset = ostart;
1656163837Spjd		nbp->bio_length = oend - ostart;
1657163837Spjd		nbp->bio_done = g_journal_read_done;
1658163837Spjd		g_io_request(nbp, sc->sc_dconsumer);
1659163837Spjd		/* We have the whole request, return here. */
1660163837Spjd		return;
1661163837Spjd	}
1662163837Spjd}
1663163837Spjd
1664163837Spjd/*
1665163837Spjd * Function responsible for handling finished READ requests.
1666163837Spjd * Actually, g_std_done() could be used here, the only difference is that we
1667163837Spjd * log error.
1668163837Spjd */
1669163837Spjdstatic void
1670163837Spjdg_journal_read_done(struct bio *bp)
1671163837Spjd{
1672163837Spjd	struct bio *pbp;
1673163837Spjd
1674163837Spjd	KASSERT(bp->bio_cflags == GJ_BIO_READ,
1675163837Spjd	    ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ));
1676163837Spjd
1677163837Spjd	pbp = bp->bio_parent;
1678163837Spjd	pbp->bio_inbed++;
1679163837Spjd	pbp->bio_completed += bp->bio_length;
1680163837Spjd
1681163837Spjd	if (bp->bio_error != 0) {
1682163837Spjd		if (pbp->bio_error == 0)
1683163837Spjd			pbp->bio_error = bp->bio_error;
1684163837Spjd		GJ_DEBUG(0, "Error while reading data from %s (error=%d).",
1685163837Spjd		    bp->bio_to->name, bp->bio_error);
1686163837Spjd	}
1687163837Spjd	g_destroy_bio(bp);
1688163837Spjd	if (pbp->bio_children == pbp->bio_inbed &&
1689163837Spjd	    pbp->bio_completed == pbp->bio_length) {
1690163837Spjd		/* We're done. */
1691163837Spjd		g_io_deliver(pbp, 0);
1692163837Spjd	}
1693163837Spjd}
1694163837Spjd
1695163837Spjd/*
1696163837Spjd * Deactive current journal and active next one.
1697163837Spjd */
1698163837Spjdstatic void
1699163837Spjdg_journal_switch(struct g_journal_softc *sc)
1700163837Spjd{
1701163837Spjd	struct g_provider *pp;
1702163837Spjd
1703163837Spjd	if (JEMPTY(sc)) {
1704163837Spjd		GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
1705163837Spjd		pp = LIST_FIRST(&sc->sc_geom->provider);
1706163837Spjd		if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) {
1707163837Spjd			sc->sc_flags |= GJF_DEVICE_CLEAN;
1708163837Spjd			GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
1709163837Spjd			g_journal_metadata_update(sc);
1710163837Spjd		}
1711163837Spjd	} else {
1712163837Spjd		GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name);
1713163837Spjd
1714163837Spjd		pp = sc->sc_jprovider;
1715163837Spjd
1716163837Spjd		sc->sc_journal_previous_id = sc->sc_journal_id;
1717163837Spjd
1718163837Spjd		sc->sc_journal_id = sc->sc_journal_next_id;
1719163837Spjd		sc->sc_journal_next_id = arc4random();
1720163837Spjd
1721163837Spjd		GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc);
1722163837Spjd
1723163837Spjd		g_journal_write_header(sc);
1724163837Spjd
1725163837Spjd		sc->sc_inactive.jj_offset = sc->sc_active.jj_offset;
1726163837Spjd		sc->sc_inactive.jj_queue = sc->sc_active.jj_queue;
1727163837Spjd
1728163837Spjd		sc->sc_active.jj_offset =
1729163837Spjd		    sc->sc_journal_offset - pp->sectorsize;
1730163837Spjd		sc->sc_active.jj_queue = NULL;
1731163837Spjd
1732163837Spjd		/*
1733163837Spjd		 * Switch is done, start copying data from the (now) inactive
1734163837Spjd		 * journal to the data provider.
1735163837Spjd		 */
1736163837Spjd		g_journal_copy_start(sc);
1737163837Spjd	}
1738163837Spjd	mtx_lock(&sc->sc_mtx);
1739163837Spjd	sc->sc_flags &= ~GJF_DEVICE_SWITCH;
1740163837Spjd	mtx_unlock(&sc->sc_mtx);
1741163837Spjd}
1742163837Spjd
1743163837Spjdstatic void
1744163837Spjdg_journal_initialize(struct g_journal_softc *sc)
1745163837Spjd{
1746163837Spjd
1747163837Spjd	sc->sc_journal_id = arc4random();
1748163837Spjd	sc->sc_journal_next_id = arc4random();
1749163837Spjd	sc->sc_journal_previous_id = sc->sc_journal_id;
1750163837Spjd	sc->sc_journal_offset = sc->sc_jstart;
1751163837Spjd	sc->sc_inactive.jj_offset = sc->sc_jstart;
1752163837Spjd	g_journal_write_header(sc);
1753163837Spjd	sc->sc_active.jj_offset = sc->sc_jstart;
1754163837Spjd}
1755163837Spjd
1756163837Spjdstatic void
1757163837Spjdg_journal_mark_as_dirty(struct g_journal_softc *sc)
1758163837Spjd{
1759163837Spjd	const struct g_journal_desc *desc;
1760163837Spjd	int i;
1761163837Spjd
1762163837Spjd	GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name);
1763163837Spjd	for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++)
1764163837Spjd		desc->jd_dirty(sc->sc_dconsumer);
1765163837Spjd}
1766163837Spjd
1767163837Spjd/*
1768163837Spjd * Function read record header from the given journal.
1769163837Spjd * It is very simlar to g_read_data(9), but it doesn't allocate memory for bio
1770163837Spjd * and data on every call.
1771163837Spjd */
1772163837Spjdstatic int
1773163837Spjdg_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset,
1774163837Spjd    void *data)
1775163837Spjd{
1776163837Spjd	int error;
1777163837Spjd
1778163837Spjd	bzero(bp, sizeof(*bp));
1779163837Spjd	bp->bio_cmd = BIO_READ;
1780163837Spjd	bp->bio_done = NULL;
1781163837Spjd	bp->bio_offset = offset;
1782163837Spjd	bp->bio_length = cp->provider->sectorsize;
1783163837Spjd	bp->bio_data = data;
1784163837Spjd	g_io_request(bp, cp);
1785163837Spjd	error = biowait(bp, "gjs_read");
1786163837Spjd	return (error);
1787163837Spjd}
1788163837Spjd
1789163837Spjd#if 0
1790163837Spjd/*
1791163837Spjd * Function is called when we start the journal device and we detect that
1792163837Spjd * one of the journals was not fully copied.
1793163837Spjd * The purpose of this function is to read all records headers from journal
1794163837Spjd * and placed them in the inactive queue, so we can start journal
1795163837Spjd * synchronization process and the journal provider itself.
1796163837Spjd * Design decision was taken to not synchronize the whole journal here as it
1797163837Spjd * can take too much time. Reading headers only and delaying synchronization
1798163837Spjd * process until after journal provider is started should be the best choice.
1799163837Spjd */
1800163837Spjd#endif
1801163837Spjd
1802163837Spjdstatic void
1803163837Spjdg_journal_sync(struct g_journal_softc *sc)
1804163837Spjd{
1805163837Spjd	struct g_journal_record_header rhdr;
1806163837Spjd	struct g_journal_entry *ent;
1807163837Spjd	struct g_journal_header jhdr;
1808163837Spjd	struct g_consumer *cp;
1809163837Spjd	struct bio *bp, *fbp, *tbp;
1810163837Spjd	off_t joffset, offset;
1811163837Spjd	u_char *buf, sum[16];
1812163837Spjd	uint64_t id;
1813163837Spjd	MD5_CTX ctx;
1814163837Spjd	int error, found, i;
1815163837Spjd
1816163837Spjd	found = 0;
1817163837Spjd	fbp = NULL;
1818163837Spjd	cp = sc->sc_jconsumer;
1819163837Spjd	bp = g_alloc_bio();
1820163837Spjd	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);
1821163837Spjd	offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset;
1822163837Spjd
1823163837Spjd	GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset);
1824163837Spjd
1825163837Spjd	/*
1826163837Spjd	 * Read and decode first journal header.
1827163837Spjd	 */
1828163837Spjd	error = g_journal_sync_read(cp, bp, offset, buf);
1829163837Spjd	if (error != 0) {
1830163837Spjd		GJ_DEBUG(0, "Error while reading journal header from %s.",
1831163837Spjd		    cp->provider->name);
1832163837Spjd		goto end;
1833163837Spjd	}
1834163837Spjd	error = g_journal_header_decode(buf, &jhdr);
1835163837Spjd	if (error != 0) {
1836163837Spjd		GJ_DEBUG(0, "Cannot decode journal header from %s.",
1837163837Spjd		    cp->provider->name);
1838163837Spjd		goto end;
1839163837Spjd	}
1840163837Spjd	id = sc->sc_journal_id;
1841163837Spjd	if (jhdr.jh_journal_id != sc->sc_journal_id) {
1842163837Spjd		GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).",
1843163837Spjd		    (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id);
1844163837Spjd		goto end;
1845163837Spjd	}
1846163837Spjd	offset += cp->provider->sectorsize;
1847163837Spjd	id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
1848163837Spjd
1849163837Spjd	for (;;) {
1850163837Spjd		/*
1851163837Spjd		 * If the biggest record won't fit, look for a record header or
1852163837Spjd		 * journal header from the begining.
1853163837Spjd		 */
1854163837Spjd		GJ_VALIDATE_OFFSET(offset, sc);
1855163837Spjd		error = g_journal_sync_read(cp, bp, offset, buf);
1856163837Spjd		if (error != 0) {
1857163837Spjd			/*
1858163837Spjd			 * Not good. Having an error while reading header
1859163837Spjd			 * means, that we cannot read next headers and in
1860163837Spjd			 * consequence we cannot find termination.
1861163837Spjd			 */
1862163837Spjd			GJ_DEBUG(0,
1863163837Spjd			    "Error while reading record header from %s.",
1864163837Spjd			    cp->provider->name);
1865163837Spjd			break;
1866163837Spjd		}
1867163837Spjd
1868163837Spjd		error = g_journal_record_header_decode(buf, &rhdr);
1869163837Spjd		if (error != 0) {
1870163837Spjd			GJ_DEBUG(2, "Not a record header at %jd (error=%d).",
1871163837Spjd			    (intmax_t)offset, error);
1872163837Spjd			/*
1873163837Spjd			 * This is not a record header.
1874163837Spjd			 * If we are lucky, this is next journal header.
1875163837Spjd			 */
1876163837Spjd			error = g_journal_header_decode(buf, &jhdr);
1877163837Spjd			if (error != 0) {
1878163837Spjd				GJ_DEBUG(1, "Not a journal header at %jd (error=%d).",
1879163837Spjd				    (intmax_t)offset, error);
1880163837Spjd				/*
1881163837Spjd				 * Nope, this is not journal header, which
1882163837Spjd				 * bascially means that journal is not
1883163837Spjd				 * terminated properly.
1884163837Spjd				 */
1885163837Spjd				error = ENOENT;
1886163837Spjd				break;
1887163837Spjd			}
1888163837Spjd			/*
1889163837Spjd			 * Ok. This is header of _some_ journal. Now we need to
1890163837Spjd			 * verify if this is header of the _next_ journal.
1891163837Spjd			 */
1892163837Spjd			if (jhdr.jh_journal_id != id) {
1893163837Spjd				GJ_DEBUG(1, "Journal ID mismatch at %jd "
1894163837Spjd				    "(0x%08x != 0x%08x).", (intmax_t)offset,
1895163837Spjd				    (u_int)jhdr.jh_journal_id, (u_int)id);
1896163837Spjd				error = ENOENT;
1897163837Spjd				break;
1898163837Spjd			}
1899163837Spjd
1900163837Spjd			/* Found termination. */
1901163837Spjd			found++;
1902163837Spjd			GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).",
1903163837Spjd			    (intmax_t)offset, (u_int)id);
1904163837Spjd			sc->sc_active.jj_offset = offset;
1905163837Spjd			sc->sc_journal_offset =
1906163837Spjd			    offset + cp->provider->sectorsize;
1907163837Spjd			sc->sc_journal_id = id;
1908163837Spjd			id = sc->sc_journal_next_id = jhdr.jh_journal_next_id;
1909163837Spjd
1910163837Spjd			while ((tbp = fbp) != NULL) {
1911163837Spjd				fbp = tbp->bio_next;
1912163837Spjd				GJ_LOGREQ(3, tbp, "Adding request.");
1913163837Spjd				g_journal_insert_bio(&sc->sc_inactive.jj_queue,
1914163837Spjd				    tbp, M_WAITOK);
1915163837Spjd			}
1916163837Spjd
1917163837Spjd			/* Skip journal's header. */
1918163837Spjd			offset += cp->provider->sectorsize;
1919163837Spjd			continue;
1920163837Spjd		}
1921163837Spjd
1922163837Spjd		/* Skip record's header. */
1923163837Spjd		offset += cp->provider->sectorsize;
1924163837Spjd
1925163837Spjd		/*
1926163837Spjd		 * Add information about every record entry to the inactive
1927163837Spjd		 * queue.
1928163837Spjd		 */
1929163837Spjd		if (sc->sc_flags & GJF_DEVICE_CHECKSUM)
1930163837Spjd			MD5Init(&ctx);
1931163837Spjd		for (i = 0; i < rhdr.jrh_nentries; i++) {
1932163837Spjd			ent = &rhdr.jrh_entries[i];
1933163837Spjd			GJ_DEBUG(3, "Insert entry: %jd %jd.",
1934163837Spjd			    (intmax_t)ent->je_offset, (intmax_t)ent->je_length);
1935163837Spjd			g_journal_insert(&fbp, ent->je_offset,
1936163837Spjd			    ent->je_offset + ent->je_length, ent->je_joffset,
1937163837Spjd			    NULL, M_WAITOK);
1938163837Spjd			if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
1939163837Spjd				u_char *buf2;
1940163837Spjd
1941163837Spjd				/*
1942163837Spjd				 * TODO: Should use faster function (like
1943163837Spjd				 *       g_journal_sync_read()).
1944163837Spjd				 */
1945163837Spjd				buf2 = g_read_data(cp, offset, ent->je_length,
1946163837Spjd				    NULL);
1947163837Spjd				if (buf2 == NULL)
1948163837Spjd					GJ_DEBUG(0, "Cannot read data at %jd.",
1949163837Spjd					    (intmax_t)offset);
1950163837Spjd				else {
1951163837Spjd					MD5Update(&ctx, buf2, ent->je_length);
1952163837Spjd					g_free(buf2);
1953163837Spjd				}
1954163837Spjd			}
1955163837Spjd			/* Skip entry's data. */
1956163837Spjd			offset += ent->je_length;
1957163837Spjd		}
1958163837Spjd		if (sc->sc_flags & GJF_DEVICE_CHECKSUM) {
1959163837Spjd			MD5Final(sum, &ctx);
1960163837Spjd			if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) {
1961163837Spjd				GJ_DEBUG(0, "MD5 hash mismatch at %jd!",
1962163837Spjd				    (intmax_t)offset);
1963163837Spjd			}
1964163837Spjd		}
1965163837Spjd	}
1966163837Spjdend:
1967163837Spjd	gj_free(bp->bio_data, cp->provider->sectorsize);
1968163837Spjd	g_destroy_bio(bp);
1969163837Spjd
1970163837Spjd	/* Remove bios from unterminated journal. */
1971163837Spjd	while ((tbp = fbp) != NULL) {
1972163837Spjd		fbp = tbp->bio_next;
1973163837Spjd		g_destroy_bio(tbp);
1974163837Spjd	}
1975163837Spjd
1976163837Spjd	if (found < 1 && joffset > 0) {
1977163837Spjd		GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.",
1978163837Spjd		    sc->sc_name);
1979163837Spjd		while ((tbp = sc->sc_inactive.jj_queue) != NULL) {
1980163837Spjd			sc->sc_inactive.jj_queue = tbp->bio_next;
1981163837Spjd			g_destroy_bio(tbp);
1982163837Spjd		}
1983163837Spjd		g_journal_initialize(sc);
1984163837Spjd		g_journal_mark_as_dirty(sc);
1985163837Spjd	} else {
1986163837Spjd		GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name);
1987163837Spjd		g_journal_copy_start(sc);
1988163837Spjd	}
1989163837Spjd}
1990163837Spjd
1991163837Spjd/*
1992163837Spjd * Wait for requests.
1993163837Spjd * If we have requests in the current queue, flush them after 3 seconds from the
1994163837Spjd * last flush. In this way we don't wait forever (or for journal switch) with
1995163837Spjd * storing not full records on journal.
1996163837Spjd */
1997163837Spjdstatic void
1998163837Spjdg_journal_wait(struct g_journal_softc *sc, time_t last_write)
1999163837Spjd{
2000163837Spjd	int error, timeout;
2001163837Spjd
2002163837Spjd	GJ_DEBUG(3, "%s: enter", __func__);
2003163837Spjd	if (sc->sc_current_count == 0) {
2004163837Spjd		if (g_journal_debug < 2)
2005163837Spjd			msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0);
2006163837Spjd		else {
2007163837Spjd			/*
2008163837Spjd			 * If we have debug turned on, show number of elements
2009163837Spjd			 * in various queues.
2010163837Spjd			 */
2011163837Spjd			for (;;) {
2012163837Spjd				error = msleep(sc, &sc->sc_mtx, PRIBIO,
2013163837Spjd				    "gj:work", hz * 3);
2014163837Spjd				if (error == 0) {
2015163837Spjd					mtx_unlock(&sc->sc_mtx);
2016163837Spjd					break;
2017163837Spjd				}
2018163837Spjd				GJ_DEBUG(3, "Report: current count=%d",
2019163837Spjd				    sc->sc_current_count);
2020163837Spjd				GJ_DEBUG(3, "Report: flush count=%d",
2021163837Spjd				    sc->sc_flush_count);
2022163837Spjd				GJ_DEBUG(3, "Report: flush in progress=%d",
2023163837Spjd				    sc->sc_flush_in_progress);
2024163837Spjd				GJ_DEBUG(3, "Report: copy in progress=%d",
2025163837Spjd				    sc->sc_copy_in_progress);
2026163837Spjd				GJ_DEBUG(3, "Report: delayed=%d",
2027163837Spjd				    sc->sc_delayed_count);
2028163837Spjd			}
2029163837Spjd		}
2030163837Spjd		GJ_DEBUG(3, "%s: exit 1", __func__);
2031163837Spjd		return;
2032163837Spjd	}
2033163837Spjd
2034163837Spjd	/*
2035163837Spjd	 * Flush even not full records every 3 seconds.
2036163837Spjd	 */
2037163837Spjd	timeout = (last_write + 3 - time_second) * hz;
2038163837Spjd	if (timeout <= 0) {
2039163837Spjd		mtx_unlock(&sc->sc_mtx);
2040163837Spjd		g_journal_flush(sc);
2041163837Spjd		g_journal_flush_send(sc);
2042163837Spjd		GJ_DEBUG(3, "%s: exit 2", __func__);
2043163837Spjd		return;
2044163837Spjd	}
2045163837Spjd	error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout);
2046163837Spjd	if (error == EWOULDBLOCK)
2047163837Spjd		g_journal_flush_send(sc);
2048163837Spjd	GJ_DEBUG(3, "%s: exit 3", __func__);
2049163837Spjd}
2050163837Spjd
2051163837Spjd/*
2052163837Spjd * Worker thread.
2053163837Spjd */
2054163837Spjdstatic void
2055163837Spjdg_journal_worker(void *arg)
2056163837Spjd{
2057163837Spjd	struct g_journal_softc *sc;
2058163837Spjd	struct g_geom *gp;
2059163837Spjd	struct g_provider *pp;
2060163837Spjd	struct bio *bp;
2061163837Spjd	time_t last_write;
2062163837Spjd	int type;
2063163837Spjd
2064170307Sjeff	thread_lock(curthread);
2065163837Spjd	sched_prio(curthread, PRIBIO);
2066170307Sjeff	thread_unlock(curthread);
2067163837Spjd
2068163837Spjd	sc = arg;
2069163894Spjd	type = 0;	/* gcc */
2070163837Spjd
2071163837Spjd	if (sc->sc_flags & GJF_DEVICE_CLEAN) {
2072163837Spjd		GJ_DEBUG(0, "Journal %s clean.", sc->sc_name);
2073163837Spjd		g_journal_initialize(sc);
2074163837Spjd	} else {
2075163837Spjd		g_journal_sync(sc);
2076163837Spjd	}
2077163837Spjd	/*
2078163837Spjd	 * Check if we can use BIO_FLUSH.
2079163837Spjd	 */
2080163837Spjd	sc->sc_bio_flush = 0;
2081163837Spjd	if (g_io_flush(sc->sc_jconsumer) == 0) {
2082163837Spjd		sc->sc_bio_flush |= GJ_FLUSH_JOURNAL;
2083163837Spjd		GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
2084163837Spjd		    sc->sc_jconsumer->provider->name);
2085163837Spjd	} else {
2086163837Spjd		GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
2087163837Spjd		    sc->sc_jconsumer->provider->name);
2088163837Spjd	}
2089163837Spjd	if (sc->sc_jconsumer != sc->sc_dconsumer) {
2090163837Spjd		if (g_io_flush(sc->sc_dconsumer) == 0) {
2091163837Spjd			sc->sc_bio_flush |= GJ_FLUSH_DATA;
2092163837Spjd			GJ_DEBUG(1, "BIO_FLUSH supported by %s.",
2093163837Spjd			    sc->sc_dconsumer->provider->name);
2094163837Spjd		} else {
2095163837Spjd			GJ_DEBUG(0, "BIO_FLUSH not supported by %s.",
2096163837Spjd			    sc->sc_dconsumer->provider->name);
2097163837Spjd		}
2098163837Spjd	}
2099163837Spjd
2100163837Spjd	gp = sc->sc_geom;
2101163837Spjd	g_topology_lock();
2102163837Spjd	pp = g_new_providerf(gp, "%s.journal", sc->sc_name);
2103163837Spjd	pp->mediasize = sc->sc_mediasize;
2104163837Spjd	/*
2105163837Spjd	 * There could be a problem when data provider and journal providers
2106163837Spjd	 * have different sectorsize, but such scenario is prevented on journal
2107163837Spjd	 * creation.
2108163837Spjd	 */
2109163837Spjd	pp->sectorsize = sc->sc_sectorsize;
2110163837Spjd	g_error_provider(pp, 0);
2111163837Spjd	g_topology_unlock();
2112163837Spjd	last_write = time_second;
2113163837Spjd
2114185693Strasz	if (sc->sc_rootmount != NULL) {
2115185693Strasz		GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
2116185693Strasz		root_mount_rel(sc->sc_rootmount);
2117185693Strasz		sc->sc_rootmount = NULL;
2118185693Strasz	}
2119185693Strasz
2120163837Spjd	for (;;) {
2121163837Spjd		/* Get first request from the queue. */
2122163837Spjd		mtx_lock(&sc->sc_mtx);
2123163837Spjd		bp = bioq_first(&sc->sc_back_queue);
2124163837Spjd		if (bp != NULL)
2125163837Spjd			type = (bp->bio_cflags & GJ_BIO_MASK);
2126163837Spjd		if (bp == NULL) {
2127163837Spjd			bp = bioq_first(&sc->sc_regular_queue);
2128163837Spjd			if (bp != NULL)
2129163837Spjd				type = GJ_BIO_REGULAR;
2130163837Spjd		}
2131163837Spjd		if (bp == NULL) {
2132163837Spjdtry_switch:
2133163837Spjd			if ((sc->sc_flags & GJF_DEVICE_SWITCH) ||
2134163837Spjd			    (sc->sc_flags & GJF_DEVICE_DESTROY)) {
2135163837Spjd				if (sc->sc_current_count > 0) {
2136163837Spjd					mtx_unlock(&sc->sc_mtx);
2137163837Spjd					g_journal_flush(sc);
2138163837Spjd					g_journal_flush_send(sc);
2139163837Spjd					continue;
2140163837Spjd				}
2141163837Spjd				if (sc->sc_flush_in_progress > 0)
2142163837Spjd					goto sleep;
2143163837Spjd				if (sc->sc_copy_in_progress > 0)
2144163837Spjd					goto sleep;
2145163837Spjd			}
2146163837Spjd			if (sc->sc_flags & GJF_DEVICE_SWITCH) {
2147163837Spjd				mtx_unlock(&sc->sc_mtx);
2148163837Spjd				g_journal_switch(sc);
2149163837Spjd				wakeup(&sc->sc_journal_copying);
2150163837Spjd				continue;
2151163837Spjd			}
2152163837Spjd			if (sc->sc_flags & GJF_DEVICE_DESTROY) {
2153163837Spjd				GJ_DEBUG(1, "Shutting down worker "
2154163837Spjd				    "thread for %s.", gp->name);
2155163837Spjd				sc->sc_worker = NULL;
2156163837Spjd				wakeup(&sc->sc_worker);
2157163837Spjd				mtx_unlock(&sc->sc_mtx);
2158172836Sjulian				kproc_exit(0);
2159163837Spjd			}
2160163837Spjdsleep:
2161163837Spjd			g_journal_wait(sc, last_write);
2162163837Spjd			continue;
2163163837Spjd		}
2164163837Spjd		/*
2165163837Spjd		 * If we're in switch process, we need to delay all new
2166163837Spjd		 * write requests until its done.
2167163837Spjd		 */
2168163837Spjd		if ((sc->sc_flags & GJF_DEVICE_SWITCH) &&
2169163837Spjd		    type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) {
2170163837Spjd			GJ_LOGREQ(2, bp, "WRITE on SWITCH");
2171163837Spjd			goto try_switch;
2172163837Spjd		}
2173163837Spjd		if (type == GJ_BIO_REGULAR)
2174163837Spjd			bioq_remove(&sc->sc_regular_queue, bp);
2175163837Spjd		else
2176163837Spjd			bioq_remove(&sc->sc_back_queue, bp);
2177163837Spjd		mtx_unlock(&sc->sc_mtx);
2178163837Spjd		switch (type) {
2179163837Spjd		case GJ_BIO_REGULAR:
2180163837Spjd			/* Regular request. */
2181163837Spjd			switch (bp->bio_cmd) {
2182163837Spjd			case BIO_READ:
2183163837Spjd				g_journal_read(sc, bp, bp->bio_offset,
2184163837Spjd				    bp->bio_offset + bp->bio_length);
2185163837Spjd				break;
2186163837Spjd			case BIO_WRITE:
2187163837Spjd				last_write = time_second;
2188163837Spjd				g_journal_add_request(sc, bp);
2189163837Spjd				g_journal_flush_send(sc);
2190163837Spjd				break;
2191163837Spjd			default:
2192163837Spjd				panic("Invalid bio_cmd (%d).", bp->bio_cmd);
2193163837Spjd			}
2194163837Spjd			break;
2195163837Spjd		case GJ_BIO_COPY:
2196163837Spjd			switch (bp->bio_cmd) {
2197163837Spjd			case BIO_READ:
2198163837Spjd				if (g_journal_copy_read_done(bp))
2199163837Spjd					g_journal_copy_send(sc);
2200163837Spjd				break;
2201163837Spjd			case BIO_WRITE:
2202163837Spjd				g_journal_copy_write_done(bp);
2203163837Spjd				g_journal_copy_send(sc);
2204163837Spjd				break;
2205163837Spjd			default:
2206163837Spjd				panic("Invalid bio_cmd (%d).", bp->bio_cmd);
2207163837Spjd			}
2208163837Spjd			break;
2209163837Spjd		case GJ_BIO_JOURNAL:
2210163837Spjd			g_journal_flush_done(bp);
2211163837Spjd			g_journal_flush_send(sc);
2212163837Spjd			break;
2213163837Spjd		case GJ_BIO_READ:
2214163837Spjd		default:
2215163837Spjd			panic("Invalid bio (%d).", type);
2216163837Spjd		}
2217163837Spjd	}
2218163837Spjd}
2219163837Spjd
2220163837Spjdstatic void
2221163837Spjdg_journal_destroy_event(void *arg, int flags __unused)
2222163837Spjd{
2223163837Spjd	struct g_journal_softc *sc;
2224163837Spjd
2225163837Spjd	g_topology_assert();
2226163837Spjd	sc = arg;
2227163837Spjd	g_journal_destroy(sc);
2228163837Spjd}
2229163837Spjd
2230163837Spjdstatic void
2231163837Spjdg_journal_timeout(void *arg)
2232163837Spjd{
2233163837Spjd	struct g_journal_softc *sc;
2234163837Spjd
2235163837Spjd	sc = arg;
2236163837Spjd	GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.",
2237163837Spjd	    sc->sc_geom->name);
2238163837Spjd	g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL);
2239163837Spjd}
2240163837Spjd
2241163837Spjdstatic struct g_geom *
2242163837Spjdg_journal_create(struct g_class *mp, struct g_provider *pp,
2243163837Spjd    const struct g_journal_metadata *md)
2244163837Spjd{
2245163837Spjd	struct g_journal_softc *sc;
2246163837Spjd	struct g_geom *gp;
2247163837Spjd	struct g_consumer *cp;
2248163837Spjd	int error;
2249163837Spjd
2250163912Spjd	sc = NULL;	/* gcc */
2251163912Spjd
2252163837Spjd	g_topology_assert();
2253163837Spjd	/*
2254163837Spjd	 * There are two possibilities:
2255163837Spjd	 * 1. Data and both journals are on the same provider.
2256163837Spjd	 * 2. Data and journals are all on separated providers.
2257163837Spjd	 */
2258163837Spjd	/* Look for journal device with the same ID. */
2259163837Spjd	LIST_FOREACH(gp, &mp->geom, geom) {
2260163837Spjd		sc = gp->softc;
2261163837Spjd		if (sc == NULL)
2262163837Spjd			continue;
2263163837Spjd		if (sc->sc_id == md->md_id)
2264163837Spjd			break;
2265163837Spjd	}
2266163837Spjd	if (gp == NULL)
2267163837Spjd		sc = NULL;
2268163837Spjd	else if (sc != NULL && (sc->sc_type & md->md_type) != 0) {
2269163837Spjd		GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id);
2270163837Spjd		return (NULL);
2271163837Spjd	}
2272163837Spjd	if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) {
2273163837Spjd		GJ_DEBUG(0, "Invalid type on %s.", pp->name);
2274163837Spjd		return (NULL);
2275163837Spjd	}
2276163837Spjd	if (md->md_type & GJ_TYPE_DATA) {
2277163837Spjd		GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id,
2278163837Spjd		    pp->name);
2279163837Spjd	}
2280163837Spjd	if (md->md_type & GJ_TYPE_JOURNAL) {
2281163837Spjd		GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id,
2282163837Spjd		    pp->name);
2283163837Spjd	}
2284163837Spjd
2285163837Spjd	if (sc == NULL) {
2286163837Spjd		/* Action geom. */
2287163837Spjd		sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO);
2288163837Spjd		sc->sc_id = md->md_id;
2289163837Spjd		sc->sc_type = 0;
2290163837Spjd		sc->sc_flags = 0;
2291163837Spjd		sc->sc_worker = NULL;
2292163837Spjd
2293163837Spjd		gp = g_new_geomf(mp, "gjournal %u", sc->sc_id);
2294163837Spjd		gp->start = g_journal_start;
2295163837Spjd		gp->orphan = g_journal_orphan;
2296163837Spjd		gp->access = g_journal_access;
2297163837Spjd		gp->softc = sc;
2298195195Strasz		gp->flags |= G_GEOM_VOLATILE_BIO;
2299163837Spjd		sc->sc_geom = gp;
2300163837Spjd
2301163837Spjd		mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF);
2302163837Spjd
2303163837Spjd		bioq_init(&sc->sc_back_queue);
2304163837Spjd		bioq_init(&sc->sc_regular_queue);
2305163837Spjd		bioq_init(&sc->sc_delayed_queue);
2306163837Spjd		sc->sc_delayed_count = 0;
2307163837Spjd		sc->sc_current_queue = NULL;
2308163837Spjd		sc->sc_current_count = 0;
2309163837Spjd		sc->sc_flush_queue = NULL;
2310163837Spjd		sc->sc_flush_count = 0;
2311163837Spjd		sc->sc_flush_in_progress = 0;
2312163837Spjd		sc->sc_copy_queue = NULL;
2313163837Spjd		sc->sc_copy_in_progress = 0;
2314163837Spjd		sc->sc_inactive.jj_queue = NULL;
2315163837Spjd		sc->sc_active.jj_queue = NULL;
2316163837Spjd
2317190878Sthompsa		sc->sc_rootmount = root_mount_hold("GJOURNAL");
2318185693Strasz		GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount);
2319185693Strasz
2320163837Spjd		callout_init(&sc->sc_callout, CALLOUT_MPSAFE);
2321163837Spjd		if (md->md_type != GJ_TYPE_COMPLETE) {
2322163837Spjd			/*
2323163837Spjd			 * Journal and data are on separate providers.
2324163837Spjd			 * At this point we have only one of them.
2325163837Spjd			 * We setup a timeout in case the other part will not
2326163837Spjd			 * appear, so we won't wait forever.
2327163837Spjd			 */
2328163837Spjd			callout_reset(&sc->sc_callout, 5 * hz,
2329163837Spjd			    g_journal_timeout, sc);
2330163837Spjd		}
2331163837Spjd	}
2332163837Spjd
2333163837Spjd	/* Remember type of the data provider. */
2334163837Spjd	if (md->md_type & GJ_TYPE_DATA)
2335163837Spjd		sc->sc_orig_type = md->md_type;
2336163837Spjd	sc->sc_type |= md->md_type;
2337163837Spjd	cp = NULL;
2338163837Spjd
2339163837Spjd	if (md->md_type & GJ_TYPE_DATA) {
2340163837Spjd		if (md->md_flags & GJ_FLAG_CLEAN)
2341163837Spjd			sc->sc_flags |= GJF_DEVICE_CLEAN;
2342163837Spjd		if (md->md_flags & GJ_FLAG_CHECKSUM)
2343163837Spjd			sc->sc_flags |= GJF_DEVICE_CHECKSUM;
2344163837Spjd		cp = g_new_consumer(gp);
2345163837Spjd		error = g_attach(cp, pp);
2346163837Spjd		KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
2347163837Spjd		    pp->name, error));
2348163837Spjd		error = g_access(cp, 1, 1, 1);
2349163837Spjd		if (error != 0) {
2350163837Spjd			GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name,
2351163837Spjd			    error);
2352163837Spjd			g_journal_destroy(sc);
2353163837Spjd			return (NULL);
2354163837Spjd		}
2355163837Spjd		sc->sc_dconsumer = cp;
2356163837Spjd		sc->sc_mediasize = pp->mediasize - pp->sectorsize;
2357163837Spjd		sc->sc_sectorsize = pp->sectorsize;
2358163837Spjd		sc->sc_jstart = md->md_jstart;
2359163837Spjd		sc->sc_jend = md->md_jend;
2360163837Spjd		if (md->md_provider[0] != '\0')
2361163837Spjd			sc->sc_flags |= GJF_DEVICE_HARDCODED;
2362163837Spjd		sc->sc_journal_offset = md->md_joffset;
2363163837Spjd		sc->sc_journal_id = md->md_jid;
2364163837Spjd		sc->sc_journal_previous_id = md->md_jid;
2365163837Spjd	}
2366163837Spjd	if (md->md_type & GJ_TYPE_JOURNAL) {
2367163837Spjd		if (cp == NULL) {
2368163837Spjd			cp = g_new_consumer(gp);
2369163837Spjd			error = g_attach(cp, pp);
2370163837Spjd			KASSERT(error == 0, ("Cannot attach to %s (error=%d).",
2371163837Spjd			    pp->name, error));
2372163837Spjd			error = g_access(cp, 1, 1, 1);
2373163837Spjd			if (error != 0) {
2374163837Spjd				GJ_DEBUG(0, "Cannot access %s (error=%d).",
2375163837Spjd				    pp->name, error);
2376163837Spjd				g_journal_destroy(sc);
2377163837Spjd				return (NULL);
2378163837Spjd			}
2379163837Spjd		} else {
2380163837Spjd			/*
2381163837Spjd			 * Journal is on the same provider as data, which means
2382163837Spjd			 * that data provider ends where journal starts.
2383163837Spjd			 */
2384163837Spjd			sc->sc_mediasize = md->md_jstart;
2385163837Spjd		}
2386163837Spjd		sc->sc_jconsumer = cp;
2387163837Spjd	}
2388163837Spjd
2389163837Spjd	if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) {
2390163837Spjd		/* Journal is not complete yet. */
2391163837Spjd		return (gp);
2392163837Spjd	} else {
2393163837Spjd		/* Journal complete, cancel timeout. */
2394163837Spjd		callout_drain(&sc->sc_callout);
2395163837Spjd	}
2396163837Spjd
2397172836Sjulian	error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0,
2398163837Spjd	    "g_journal %s", sc->sc_name);
2399163837Spjd	if (error != 0) {
2400163837Spjd		GJ_DEBUG(0, "Cannot create worker thread for %s.journal.",
2401163837Spjd		    sc->sc_name);
2402163837Spjd		g_journal_destroy(sc);
2403163837Spjd		return (NULL);
2404163837Spjd	}
2405163837Spjd
2406163837Spjd	return (gp);
2407163837Spjd}
2408163837Spjd
2409163837Spjdstatic void
2410163837Spjdg_journal_destroy_consumer(void *arg, int flags __unused)
2411163837Spjd{
2412163837Spjd	struct g_consumer *cp;
2413163837Spjd
2414163837Spjd	g_topology_assert();
2415163837Spjd	cp = arg;
2416163837Spjd	g_detach(cp);
2417163837Spjd	g_destroy_consumer(cp);
2418163837Spjd}
2419163837Spjd
2420163837Spjdstatic int
2421163837Spjdg_journal_destroy(struct g_journal_softc *sc)
2422163837Spjd{
2423163837Spjd	struct g_geom *gp;
2424163837Spjd	struct g_provider *pp;
2425163837Spjd	struct g_consumer *cp;
2426163837Spjd
2427163837Spjd	g_topology_assert();
2428163837Spjd
2429163837Spjd	if (sc == NULL)
2430163837Spjd		return (ENXIO);
2431163837Spjd
2432163837Spjd	gp = sc->sc_geom;
2433163837Spjd	pp = LIST_FIRST(&gp->provider);
2434163837Spjd	if (pp != NULL) {
2435163837Spjd		if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
2436163837Spjd			GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).",
2437163837Spjd			    pp->name, pp->acr, pp->acw, pp->ace);
2438163837Spjd			return (EBUSY);
2439163837Spjd		}
2440163837Spjd		g_error_provider(pp, ENXIO);
2441163837Spjd
2442163837Spjd		g_journal_flush(sc);
2443163837Spjd		g_journal_flush_send(sc);
2444163837Spjd		g_journal_switch(sc);
2445163837Spjd	}
2446163837Spjd
2447163837Spjd	sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN);
2448163837Spjd
2449163837Spjd	g_topology_unlock();
2450185693Strasz
2451185693Strasz	if (sc->sc_rootmount != NULL) {
2452185693Strasz		GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
2453185693Strasz		root_mount_rel(sc->sc_rootmount);
2454185693Strasz		sc->sc_rootmount = NULL;
2455185693Strasz	}
2456185693Strasz
2457163837Spjd	callout_drain(&sc->sc_callout);
2458163837Spjd	mtx_lock(&sc->sc_mtx);
2459163837Spjd	wakeup(sc);
2460163837Spjd	while (sc->sc_worker != NULL)
2461163837Spjd		msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0);
2462163837Spjd	mtx_unlock(&sc->sc_mtx);
2463163837Spjd
2464163837Spjd	if (pp != NULL) {
2465163837Spjd		GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
2466163837Spjd		g_journal_metadata_update(sc);
2467163837Spjd		g_topology_lock();
2468163837Spjd		pp->flags |= G_PF_WITHER;
2469163837Spjd		g_orphan_provider(pp, ENXIO);
2470163837Spjd	} else {
2471163837Spjd		g_topology_lock();
2472163837Spjd	}
2473163837Spjd	mtx_destroy(&sc->sc_mtx);
2474163837Spjd
2475163837Spjd	if (sc->sc_current_count != 0) {
2476163837Spjd		GJ_DEBUG(0, "Warning! Number of current requests %d.",
2477163837Spjd		    sc->sc_current_count);
2478163837Spjd	}
2479163837Spjd
2480163837Spjd	LIST_FOREACH(cp, &gp->consumer, consumer) {
2481163837Spjd		if (cp->acr + cp->acw + cp->ace > 0)
2482163837Spjd			g_access(cp, -1, -1, -1);
2483163837Spjd		/*
2484163837Spjd		 * We keep all consumers open for writting, so if I'll detach
2485163837Spjd		 * and destroy consumer here, I'll get providers for taste, so
2486163837Spjd		 * journal will be started again.
2487163837Spjd		 * Sending an event here, prevents this from happening.
2488163837Spjd		 */
2489163837Spjd		g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL);
2490163837Spjd	}
2491163837Spjd	gp->softc = NULL;
2492163837Spjd	g_wither_geom(gp, ENXIO);
2493163837Spjd	free(sc, M_JOURNAL);
2494163837Spjd	return (0);
2495163837Spjd}
2496163837Spjd
2497163837Spjdstatic void
2498163837Spjdg_journal_taste_orphan(struct g_consumer *cp)
2499163837Spjd{
2500163837Spjd
2501163837Spjd	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
2502163837Spjd	    cp->provider->name));
2503163837Spjd}
2504163837Spjd
2505163837Spjdstatic struct g_geom *
2506163837Spjdg_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
2507163837Spjd{
2508163837Spjd	struct g_journal_metadata md;
2509163837Spjd	struct g_consumer *cp;
2510163837Spjd	struct g_geom *gp;
2511163837Spjd	int error;
2512163837Spjd
2513163837Spjd	g_topology_assert();
2514163837Spjd	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
2515163837Spjd	GJ_DEBUG(2, "Tasting %s.", pp->name);
2516163837Spjd	if (pp->geom->class == mp)
2517163837Spjd		return (NULL);
2518163837Spjd
2519163837Spjd	gp = g_new_geomf(mp, "journal:taste");
2520163837Spjd	/* This orphan function should be never called. */
2521163837Spjd	gp->orphan = g_journal_taste_orphan;
2522163837Spjd	cp = g_new_consumer(gp);
2523163837Spjd	g_attach(cp, pp);
2524163837Spjd	error = g_journal_metadata_read(cp, &md);
2525163837Spjd	g_detach(cp);
2526163837Spjd	g_destroy_consumer(cp);
2527163837Spjd	g_destroy_geom(gp);
2528163837Spjd	if (error != 0)
2529163837Spjd		return (NULL);
2530163837Spjd	gp = NULL;
2531163837Spjd
2532221101Smav	if (md.md_provider[0] != '\0' &&
2533221101Smav	    !g_compare_names(md.md_provider, pp->name))
2534163837Spjd		return (NULL);
2535163837Spjd	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
2536163837Spjd		return (NULL);
2537163837Spjd	if (g_journal_debug >= 2)
2538163837Spjd		journal_metadata_dump(&md);
2539163837Spjd
2540163837Spjd	gp = g_journal_create(mp, pp, &md);
2541163837Spjd	return (gp);
2542163837Spjd}
2543163837Spjd
2544163837Spjdstatic struct g_journal_softc *
2545163837Spjdg_journal_find_device(struct g_class *mp, const char *name)
2546163837Spjd{
2547163837Spjd	struct g_journal_softc *sc;
2548163837Spjd	struct g_geom *gp;
2549163837Spjd	struct g_provider *pp;
2550163837Spjd
2551163837Spjd	if (strncmp(name, "/dev/", 5) == 0)
2552163837Spjd		name += 5;
2553163837Spjd	LIST_FOREACH(gp, &mp->geom, geom) {
2554163837Spjd		sc = gp->softc;
2555163837Spjd		if (sc == NULL)
2556163837Spjd			continue;
2557163837Spjd		if (sc->sc_flags & GJF_DEVICE_DESTROY)
2558163837Spjd			continue;
2559163837Spjd		if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
2560163837Spjd			continue;
2561163837Spjd		pp = LIST_FIRST(&gp->provider);
2562163837Spjd		if (strcmp(sc->sc_name, name) == 0)
2563163837Spjd			return (sc);
2564163837Spjd		if (pp != NULL && strcmp(pp->name, name) == 0)
2565163837Spjd			return (sc);
2566163837Spjd	}
2567163837Spjd	return (NULL);
2568163837Spjd}
2569163837Spjd
2570163837Spjdstatic void
2571163837Spjdg_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp)
2572163837Spjd{
2573163837Spjd	struct g_journal_softc *sc;
2574163837Spjd	const char *name;
2575163837Spjd	char param[16];
2576163837Spjd	int *nargs;
2577163837Spjd	int error, i;
2578163837Spjd
2579163837Spjd	g_topology_assert();
2580163837Spjd
2581163837Spjd	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
2582163837Spjd	if (nargs == NULL) {
2583163837Spjd		gctl_error(req, "No '%s' argument.", "nargs");
2584163837Spjd		return;
2585163837Spjd	}
2586163837Spjd	if (*nargs <= 0) {
2587163837Spjd		gctl_error(req, "Missing device(s).");
2588163837Spjd		return;
2589163837Spjd	}
2590163837Spjd
2591163837Spjd	for (i = 0; i < *nargs; i++) {
2592163837Spjd		snprintf(param, sizeof(param), "arg%d", i);
2593163837Spjd		name = gctl_get_asciiparam(req, param);
2594163837Spjd		if (name == NULL) {
2595163837Spjd			gctl_error(req, "No 'arg%d' argument.", i);
2596163837Spjd			return;
2597163837Spjd		}
2598163837Spjd		sc = g_journal_find_device(mp, name);
2599163837Spjd		if (sc == NULL) {
2600163837Spjd			gctl_error(req, "No such device: %s.", name);
2601163837Spjd			return;
2602163837Spjd		}
2603163837Spjd		error = g_journal_destroy(sc);
2604163837Spjd		if (error != 0) {
2605163837Spjd			gctl_error(req, "Cannot destroy device %s (error=%d).",
2606163837Spjd			    LIST_FIRST(&sc->sc_geom->provider)->name, error);
2607163837Spjd			return;
2608163837Spjd		}
2609163837Spjd	}
2610163837Spjd}
2611163837Spjd
2612163837Spjdstatic void
2613163837Spjdg_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused)
2614163837Spjd{
2615163837Spjd
2616163837Spjd	g_topology_assert();
2617163837Spjd	g_topology_unlock();
2618163837Spjd	g_journal_sync_requested++;
2619163837Spjd	wakeup(&g_journal_switcher_state);
2620163837Spjd	while (g_journal_sync_requested > 0)
2621163837Spjd		tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2);
2622163837Spjd	g_topology_lock();
2623163837Spjd}
2624163837Spjd
2625163837Spjdstatic void
2626163837Spjdg_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb)
2627163837Spjd{
2628163837Spjd	uint32_t *version;
2629163837Spjd
2630163837Spjd	g_topology_assert();
2631163837Spjd
2632163837Spjd	version = gctl_get_paraml(req, "version", sizeof(*version));
2633163837Spjd	if (version == NULL) {
2634163837Spjd		gctl_error(req, "No '%s' argument.", "version");
2635163837Spjd		return;
2636163837Spjd	}
2637163837Spjd	if (*version != G_JOURNAL_VERSION) {
2638163837Spjd		gctl_error(req, "Userland and kernel parts are out of sync.");
2639163837Spjd		return;
2640163837Spjd	}
2641163837Spjd
2642163837Spjd	if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) {
2643163837Spjd		g_journal_ctl_destroy(req, mp);
2644163837Spjd		return;
2645163837Spjd	} else if (strcmp(verb, "sync") == 0) {
2646163837Spjd		g_journal_ctl_sync(req, mp);
2647163837Spjd		return;
2648163837Spjd	}
2649163837Spjd
2650163837Spjd	gctl_error(req, "Unknown verb.");
2651163837Spjd}
2652163837Spjd
2653163837Spjdstatic void
2654163837Spjdg_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2655163837Spjd    struct g_consumer *cp, struct g_provider *pp)
2656163837Spjd{
2657163837Spjd	struct g_journal_softc *sc;
2658163837Spjd
2659163837Spjd	g_topology_assert();
2660163837Spjd
2661163837Spjd	sc = gp->softc;
2662163837Spjd	if (sc == NULL)
2663163837Spjd		return;
2664163837Spjd	if (pp != NULL) {
2665163837Spjd		/* Nothing here. */
2666163837Spjd	} else if (cp != NULL) {
2667163837Spjd		int first = 1;
2668163837Spjd
2669163837Spjd		sbuf_printf(sb, "%s<Role>", indent);
2670163837Spjd		if (cp == sc->sc_dconsumer) {
2671163837Spjd			sbuf_printf(sb, "Data");
2672163837Spjd			first = 0;
2673163837Spjd		}
2674163837Spjd		if (cp == sc->sc_jconsumer) {
2675163837Spjd			if (!first)
2676163837Spjd				sbuf_printf(sb, ",");
2677163837Spjd			sbuf_printf(sb, "Journal");
2678163837Spjd		}
2679163837Spjd		sbuf_printf(sb, "</Role>\n");
2680163837Spjd		if (cp == sc->sc_jconsumer) {
2681167800Spjd			sbuf_printf(sb, "<Jstart>%jd</Jstart>\n",
2682163837Spjd			    (intmax_t)sc->sc_jstart);
2683167800Spjd			sbuf_printf(sb, "<Jend>%jd</Jend>\n",
2684163837Spjd			    (intmax_t)sc->sc_jend);
2685163837Spjd		}
2686163837Spjd	} else {
2687163837Spjd		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
2688163837Spjd	}
2689163837Spjd}
2690163837Spjd
2691163837Spjdstatic eventhandler_tag g_journal_event_shutdown = NULL;
2692163837Spjdstatic eventhandler_tag g_journal_event_lowmem = NULL;
2693163837Spjd
2694163837Spjdstatic void
2695163837Spjdg_journal_shutdown(void *arg, int howto __unused)
2696163837Spjd{
2697163837Spjd	struct g_class *mp;
2698163837Spjd	struct g_geom *gp, *gp2;
2699163837Spjd
2700163837Spjd	if (panicstr != NULL)
2701163837Spjd		return;
2702163837Spjd	mp = arg;
2703163837Spjd	DROP_GIANT();
2704163837Spjd	g_topology_lock();
2705163837Spjd	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
2706163837Spjd		if (gp->softc == NULL)
2707163837Spjd			continue;
2708163837Spjd		GJ_DEBUG(0, "Shutting down geom %s.", gp->name);
2709163837Spjd		g_journal_destroy(gp->softc);
2710163837Spjd	}
2711163837Spjd	g_topology_unlock();
2712163837Spjd	PICKUP_GIANT();
2713163837Spjd}
2714163837Spjd
2715163837Spjd/*
2716163837Spjd * Free cached requests from inactive queue in case of low memory.
2717163837Spjd * We free GJ_FREE_AT_ONCE elements at once.
2718163837Spjd */
2719163837Spjd#define	GJ_FREE_AT_ONCE	4
2720163837Spjdstatic void
2721163837Spjdg_journal_lowmem(void *arg, int howto __unused)
2722163837Spjd{
2723163837Spjd	struct g_journal_softc *sc;
2724163837Spjd	struct g_class *mp;
2725163837Spjd	struct g_geom *gp;
2726163837Spjd	struct bio *bp;
2727163837Spjd	u_int nfree = GJ_FREE_AT_ONCE;
2728163837Spjd
2729163837Spjd	g_journal_stats_low_mem++;
2730163837Spjd	mp = arg;
2731163837Spjd	DROP_GIANT();
2732163837Spjd	g_topology_lock();
2733163837Spjd	LIST_FOREACH(gp, &mp->geom, geom) {
2734163837Spjd		sc = gp->softc;
2735163837Spjd		if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY))
2736163837Spjd			continue;
2737163837Spjd		mtx_lock(&sc->sc_mtx);
2738163837Spjd		for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL;
2739163837Spjd		    nfree--, bp = bp->bio_next) {
2740163837Spjd			/*
2741163837Spjd			 * This is safe to free the bio_data, because:
2742163837Spjd			 * 1. If bio_data is NULL it will be read from the
2743163837Spjd			 *    inactive journal.
2744163837Spjd			 * 2. If bp is sent down, it is first removed from the
2745163837Spjd			 *    inactive queue, so it's impossible to free the
2746163837Spjd			 *    data from under in-flight bio.
2747163837Spjd			 * On the other hand, freeing elements from the active
2748163837Spjd			 * queue, is not safe.
2749163837Spjd			 */
2750163837Spjd			if (bp->bio_data != NULL) {
2751163837Spjd				GJ_DEBUG(2, "Freeing data from %s.",
2752163837Spjd				    sc->sc_name);
2753163837Spjd				gj_free(bp->bio_data, bp->bio_length);
2754163837Spjd				bp->bio_data = NULL;
2755163837Spjd			}
2756163837Spjd		}
2757163837Spjd		mtx_unlock(&sc->sc_mtx);
2758163837Spjd		if (nfree == 0)
2759163837Spjd			break;
2760163837Spjd	}
2761163837Spjd	g_topology_unlock();
2762163837Spjd	PICKUP_GIANT();
2763163837Spjd}
2764163837Spjd
2765163837Spjdstatic void g_journal_switcher(void *arg);
2766163837Spjd
2767163837Spjdstatic void
2768163837Spjdg_journal_init(struct g_class *mp)
2769163837Spjd{
2770163837Spjd	int error;
2771163837Spjd
2772163837Spjd	/* Pick a conservative value if provided value sucks. */
2773163837Spjd	if (g_journal_cache_divisor <= 0 ||
2774163837Spjd	    (vm_kmem_size / g_journal_cache_divisor == 0)) {
2775163837Spjd		g_journal_cache_divisor = 5;
2776163837Spjd	}
2777163837Spjd	if (g_journal_cache_limit > 0) {
2778163837Spjd		g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor;
2779163837Spjd		g_journal_cache_low =
2780163837Spjd		    (g_journal_cache_limit / 100) * g_journal_cache_switch;
2781163837Spjd	}
2782163837Spjd	g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync,
2783163837Spjd	    g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST);
2784163837Spjd	if (g_journal_event_shutdown == NULL)
2785163837Spjd		GJ_DEBUG(0, "Warning! Cannot register shutdown event.");
2786163837Spjd	g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
2787163837Spjd	    g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST);
2788163837Spjd	if (g_journal_event_lowmem == NULL)
2789163837Spjd		GJ_DEBUG(0, "Warning! Cannot register lowmem event.");
2790172836Sjulian	error = kproc_create(g_journal_switcher, mp, NULL, 0, 0,
2791163837Spjd	    "g_journal switcher");
2792163837Spjd	KASSERT(error == 0, ("Cannot create switcher thread."));
2793163837Spjd}
2794163837Spjd
2795163837Spjdstatic void
2796163837Spjdg_journal_fini(struct g_class *mp)
2797163837Spjd{
2798163837Spjd
2799163837Spjd	if (g_journal_event_shutdown != NULL) {
2800163837Spjd		EVENTHANDLER_DEREGISTER(shutdown_post_sync,
2801163837Spjd		    g_journal_event_shutdown);
2802163837Spjd	}
2803163837Spjd	if (g_journal_event_lowmem != NULL)
2804163837Spjd		EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem);
2805163837Spjd	g_journal_switcher_state = GJ_SWITCHER_DIE;
2806163837Spjd	wakeup(&g_journal_switcher_state);
2807163837Spjd	while (g_journal_switcher_state != GJ_SWITCHER_DIED)
2808163837Spjd		tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5);
2809163837Spjd	GJ_DEBUG(1, "Switcher died.");
2810163837Spjd}
2811163837Spjd
2812163837SpjdDECLARE_GEOM_CLASS(g_journal_class, g_journal);
2813163837Spjd
2814163837Spjdstatic const struct g_journal_desc *
2815163837Spjdg_journal_find_desc(const char *fstype)
2816163837Spjd{
2817163837Spjd	const struct g_journal_desc *desc;
2818163837Spjd	int i;
2819163837Spjd
2820163837Spjd	for (desc = g_journal_filesystems[i = 0]; desc != NULL;
2821163837Spjd	     desc = g_journal_filesystems[++i]) {
2822163837Spjd		if (strcmp(desc->jd_fstype, fstype) == 0)
2823163837Spjd			break;
2824163837Spjd	}
2825163837Spjd	return (desc);
2826163837Spjd}
2827163837Spjd
2828163837Spjdstatic void
2829163837Spjdg_journal_switch_wait(struct g_journal_softc *sc)
2830163837Spjd{
2831163837Spjd	struct bintime bt;
2832163837Spjd
2833163837Spjd	mtx_assert(&sc->sc_mtx, MA_OWNED);
2834163837Spjd	if (g_journal_debug >= 2) {
2835163837Spjd		if (sc->sc_flush_in_progress > 0) {
2836163837Spjd			GJ_DEBUG(2, "%d requests flushing.",
2837163837Spjd			    sc->sc_flush_in_progress);
2838163837Spjd		}
2839163837Spjd		if (sc->sc_copy_in_progress > 0) {
2840163837Spjd			GJ_DEBUG(2, "%d requests copying.",
2841163837Spjd			    sc->sc_copy_in_progress);
2842163837Spjd		}
2843163837Spjd		if (sc->sc_flush_count > 0) {
2844163837Spjd			GJ_DEBUG(2, "%d requests to flush.",
2845163837Spjd			    sc->sc_flush_count);
2846163837Spjd		}
2847163837Spjd		if (sc->sc_delayed_count > 0) {
2848163837Spjd			GJ_DEBUG(2, "%d requests delayed.",
2849163837Spjd			    sc->sc_delayed_count);
2850163837Spjd		}
2851163837Spjd	}
2852163837Spjd	g_journal_stats_switches++;
2853163837Spjd	if (sc->sc_copy_in_progress > 0)
2854163837Spjd		g_journal_stats_wait_for_copy++;
2855163837Spjd	GJ_TIMER_START(1, &bt);
2856163837Spjd	sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
2857163837Spjd	sc->sc_flags |= GJF_DEVICE_SWITCH;
2858163837Spjd	wakeup(sc);
2859163837Spjd	while (sc->sc_flags & GJF_DEVICE_SWITCH) {
2860163837Spjd		msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO,
2861163837Spjd		    "gj:switch", 0);
2862163837Spjd	}
2863163837Spjd	GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name);
2864163837Spjd}
2865163837Spjd
2866163837Spjdstatic void
2867182542Sattiliog_journal_do_switch(struct g_class *classp)
2868163837Spjd{
2869163837Spjd	struct g_journal_softc *sc;
2870163837Spjd	const struct g_journal_desc *desc;
2871163837Spjd	struct g_geom *gp;
2872163837Spjd	struct mount *mp;
2873163837Spjd	struct bintime bt;
2874163837Spjd	char *mountpoint;
2875241896Skib	int error, save;
2876163837Spjd
2877163837Spjd	DROP_GIANT();
2878163837Spjd	g_topology_lock();
2879163837Spjd	LIST_FOREACH(gp, &classp->geom, geom) {
2880163837Spjd		sc = gp->softc;
2881163837Spjd		if (sc == NULL)
2882163837Spjd			continue;
2883163837Spjd		if (sc->sc_flags & GJF_DEVICE_DESTROY)
2884163837Spjd			continue;
2885163837Spjd		if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
2886163837Spjd			continue;
2887163837Spjd		mtx_lock(&sc->sc_mtx);
2888163837Spjd		sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH;
2889163837Spjd		mtx_unlock(&sc->sc_mtx);
2890163837Spjd	}
2891163837Spjd	g_topology_unlock();
2892163837Spjd	PICKUP_GIANT();
2893163837Spjd
2894163837Spjd	mtx_lock(&mountlist_mtx);
2895163837Spjd	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
2896163837Spjd		if (mp->mnt_gjprovider == NULL)
2897163837Spjd			continue;
2898163837Spjd		if (mp->mnt_flag & MNT_RDONLY)
2899163837Spjd			continue;
2900163837Spjd		desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
2901163837Spjd		if (desc == NULL)
2902163837Spjd			continue;
2903184554Sattilio		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
2904163837Spjd			continue;
2905163837Spjd		/* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */
2906163837Spjd
2907163837Spjd		DROP_GIANT();
2908163837Spjd		g_topology_lock();
2909163837Spjd		sc = g_journal_find_device(classp, mp->mnt_gjprovider);
2910163837Spjd		g_topology_unlock();
2911163837Spjd		PICKUP_GIANT();
2912163837Spjd
2913163837Spjd		if (sc == NULL) {
2914163837Spjd			GJ_DEBUG(0, "Cannot find journal geom for %s.",
2915163837Spjd			    mp->mnt_gjprovider);
2916163837Spjd			goto next;
2917163837Spjd		} else if (JEMPTY(sc)) {
2918163837Spjd			mtx_lock(&sc->sc_mtx);
2919163837Spjd			sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
2920163837Spjd			mtx_unlock(&sc->sc_mtx);
2921163837Spjd			GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
2922163837Spjd			goto next;
2923163837Spjd		}
2924163837Spjd
2925163837Spjd		mountpoint = mp->mnt_stat.f_mntonname;
2926163837Spjd
2927163837Spjd		error = vn_start_write(NULL, &mp, V_WAIT);
2928163837Spjd		if (error != 0) {
2929163837Spjd			GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).",
2930163837Spjd			    mountpoint, error);
2931163837Spjd			goto next;
2932163837Spjd		}
2933163837Spjd
2934231075Skib		save = curthread_pflags_set(TDP_SYNCIO);
2935163865Spjd
2936163837Spjd		GJ_TIMER_START(1, &bt);
2937163837Spjd		vfs_msync(mp, MNT_NOWAIT);
2938163837Spjd		GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);
2939163837Spjd
2940163837Spjd		GJ_TIMER_START(1, &bt);
2941191990Sattilio		error = VFS_SYNC(mp, MNT_NOWAIT);
2942163837Spjd		if (error == 0)
2943163837Spjd			GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint);
2944163837Spjd		else {
2945163837Spjd			GJ_DEBUG(0, "Cannot sync file system %s (error=%d).",
2946163837Spjd			    mountpoint, error);
2947163837Spjd		}
2948163837Spjd
2949231075Skib		curthread_pflags_restore(save);
2950163865Spjd
2951163837Spjd		vn_finished_write(mp);
2952163837Spjd
2953241896Skib		if (error != 0)
2954163837Spjd			goto next;
2955163837Spjd
2956163837Spjd		/*
2957163837Spjd		 * Send BIO_FLUSH before freezing the file system, so it can be
2958163837Spjd		 * faster after the freeze.
2959163837Spjd		 */
2960163837Spjd		GJ_TIMER_START(1, &bt);
2961163837Spjd		g_journal_flush_cache(sc);
2962163837Spjd		GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name);
2963163837Spjd
2964163837Spjd		GJ_TIMER_START(1, &bt);
2965253106Skib		error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT);
2966163837Spjd		GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint);
2967163837Spjd		if (error != 0) {
2968163837Spjd			GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).",
2969163837Spjd			    mountpoint, error);
2970163837Spjd			goto next;
2971163837Spjd		}
2972163837Spjd
2973163837Spjd		error = desc->jd_clean(mp);
2974163837Spjd		if (error != 0)
2975163837Spjd			goto next;
2976163837Spjd
2977163837Spjd		mtx_lock(&sc->sc_mtx);
2978163837Spjd		g_journal_switch_wait(sc);
2979163837Spjd		mtx_unlock(&sc->sc_mtx);
2980163837Spjd
2981245286Skib		vfs_write_resume(mp, 0);
2982163837Spjdnext:
2983163837Spjd		mtx_lock(&mountlist_mtx);
2984182542Sattilio		vfs_unbusy(mp);
2985163837Spjd	}
2986163837Spjd	mtx_unlock(&mountlist_mtx);
2987163837Spjd
2988163837Spjd	sc = NULL;
2989163837Spjd	for (;;) {
2990163837Spjd		DROP_GIANT();
2991163837Spjd		g_topology_lock();
2992163837Spjd		LIST_FOREACH(gp, &g_journal_class.geom, geom) {
2993163837Spjd			sc = gp->softc;
2994163837Spjd			if (sc == NULL)
2995163837Spjd				continue;
2996163837Spjd			mtx_lock(&sc->sc_mtx);
2997163837Spjd			if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE &&
2998163837Spjd			    !(sc->sc_flags & GJF_DEVICE_DESTROY) &&
2999163837Spjd			    (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) {
3000163837Spjd				break;
3001163837Spjd			}
3002163837Spjd			mtx_unlock(&sc->sc_mtx);
3003163837Spjd			sc = NULL;
3004163837Spjd		}
3005163837Spjd		g_topology_unlock();
3006163837Spjd		PICKUP_GIANT();
3007163837Spjd		if (sc == NULL)
3008163837Spjd			break;
3009163837Spjd		mtx_assert(&sc->sc_mtx, MA_OWNED);
3010163837Spjd		g_journal_switch_wait(sc);
3011163837Spjd		mtx_unlock(&sc->sc_mtx);
3012163837Spjd	}
3013163837Spjd}
3014163837Spjd
3015163837Spjd/*
3016163837Spjd * TODO: Switcher thread should be started on first geom creation and killed on
3017163837Spjd * last geom destruction.
3018163837Spjd */
3019163837Spjdstatic void
3020163837Spjdg_journal_switcher(void *arg)
3021163837Spjd{
3022163837Spjd	struct g_class *mp;
3023163837Spjd	struct bintime bt;
3024163837Spjd	int error;
3025163837Spjd
3026163837Spjd	mp = arg;
3027217880Skib	curthread->td_pflags |= TDP_NORUNNINGBUF;
3028163837Spjd	for (;;) {
3029163837Spjd		g_journal_switcher_wokenup = 0;
3030163837Spjd		error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait",
3031163837Spjd		    g_journal_switch_time * hz);
3032163837Spjd		if (g_journal_switcher_state == GJ_SWITCHER_DIE) {
3033163837Spjd			g_journal_switcher_state = GJ_SWITCHER_DIED;
3034163837Spjd			GJ_DEBUG(1, "Switcher exiting.");
3035163837Spjd			wakeup(&g_journal_switcher_state);
3036172836Sjulian			kproc_exit(0);
3037163837Spjd		}
3038163837Spjd		if (error == 0 && g_journal_sync_requested == 0) {
3039163837Spjd			GJ_DEBUG(1, "Out of cache, force switch (used=%u "
3040163837Spjd			    "limit=%u).", g_journal_cache_used,
3041163837Spjd			    g_journal_cache_limit);
3042163837Spjd		}
3043163837Spjd		GJ_TIMER_START(1, &bt);
3044182542Sattilio		g_journal_do_switch(mp);
3045163837Spjd		GJ_TIMER_STOP(1, &bt, "Entire switch time");
3046163837Spjd		if (g_journal_sync_requested > 0) {
3047163837Spjd			g_journal_sync_requested = 0;
3048163837Spjd			wakeup(&g_journal_sync_requested);
3049163837Spjd		}
3050163837Spjd	}
3051163837Spjd}
3052