1163837Spjd/*- 2163837Spjd * Copyright (c) 2005-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org> 3163837Spjd * All rights reserved. 4163837Spjd * 5163837Spjd * Redistribution and use in source and binary forms, with or without 6163837Spjd * modification, are permitted provided that the following conditions 7163837Spjd * are met: 8163837Spjd * 1. Redistributions of source code must retain the above copyright 9163837Spjd * notice, this list of conditions and the following disclaimer. 10163837Spjd * 2. Redistributions in binary form must reproduce the above copyright 11163837Spjd * notice, this list of conditions and the following disclaimer in the 12163837Spjd * documentation and/or other materials provided with the distribution. 13163837Spjd * 14163837Spjd * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 15163837Spjd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16163837Spjd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17163837Spjd * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 18163837Spjd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19163837Spjd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20163837Spjd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21163837Spjd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22163837Spjd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23163837Spjd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24163837Spjd * SUCH DAMAGE. 
25163837Spjd */ 26163837Spjd 27163837Spjd#include <sys/cdefs.h> 28163837Spjd__FBSDID("$FreeBSD$"); 29163837Spjd 30163837Spjd#include <sys/param.h> 31163837Spjd#include <sys/systm.h> 32163837Spjd#include <sys/kernel.h> 33163837Spjd#include <sys/module.h> 34163837Spjd#include <sys/limits.h> 35163837Spjd#include <sys/lock.h> 36163837Spjd#include <sys/mutex.h> 37163837Spjd#include <sys/bio.h> 38163837Spjd#include <sys/sysctl.h> 39163837Spjd#include <sys/malloc.h> 40163837Spjd#include <sys/mount.h> 41163837Spjd#include <sys/eventhandler.h> 42163837Spjd#include <sys/proc.h> 43163837Spjd#include <sys/kthread.h> 44163837Spjd#include <sys/sched.h> 45163837Spjd#include <sys/taskqueue.h> 46163837Spjd#include <sys/vnode.h> 47163837Spjd#include <sys/sbuf.h> 48163837Spjd#ifdef GJ_MEMDEBUG 49163837Spjd#include <sys/stack.h> 50163837Spjd#include <sys/kdb.h> 51163837Spjd#endif 52163837Spjd#include <vm/vm.h> 53163837Spjd#include <vm/vm_kern.h> 54163837Spjd#include <geom/geom.h> 55163837Spjd 56163837Spjd#include <geom/journal/g_journal.h> 57163837Spjd 58219029SnetchildFEATURE(geom_journal, "GEOM journaling support"); 59163837Spjd 60163837Spjd/* 61163837Spjd * On-disk journal format: 62163837Spjd * 63163837Spjd * JH - Journal header 64163837Spjd * RH - Record header 65163837Spjd * 66163837Spjd * %%%%%% ****** +------+ +------+ ****** +------+ %%%%%% 67163837Spjd * % JH % * RH * | Data | | Data | ... * RH * | Data | ... % JH % ... 
68163837Spjd * %%%%%% ****** +------+ +------+ ****** +------+ %%%%%% 69163837Spjd * 70163837Spjd */ 71163837Spjd 72163837SpjdCTASSERT(sizeof(struct g_journal_header) <= 512); 73163837SpjdCTASSERT(sizeof(struct g_journal_record_header) <= 512); 74163837Spjd 75163837Spjdstatic MALLOC_DEFINE(M_JOURNAL, "journal_data", "GEOM_JOURNAL Data"); 76163837Spjdstatic struct mtx g_journal_cache_mtx; 77163837SpjdMTX_SYSINIT(g_journal_cache, &g_journal_cache_mtx, "cache usage", MTX_DEF); 78163837Spjd 79163837Spjdconst struct g_journal_desc *g_journal_filesystems[] = { 80163837Spjd &g_journal_ufs, 81163837Spjd NULL 82163837Spjd}; 83163837Spjd 84163837SpjdSYSCTL_DECL(_kern_geom); 85163837Spjd 86163837Spjdint g_journal_debug = 0; 87163837SpjdTUNABLE_INT("kern.geom.journal.debug", &g_journal_debug); 88163837Spjdstatic u_int g_journal_switch_time = 10; 89163837Spjdstatic u_int g_journal_force_switch = 70; 90163837Spjdstatic u_int g_journal_parallel_flushes = 16; 91163837Spjdstatic u_int g_journal_parallel_copies = 16; 92163837Spjdstatic u_int g_journal_accept_immediately = 64; 93163837Spjdstatic u_int g_journal_record_entries = GJ_RECORD_HEADER_NENTRIES; 94163837Spjdstatic u_int g_journal_do_optimize = 1; 95163837Spjd 96227309Sedstatic SYSCTL_NODE(_kern_geom, OID_AUTO, journal, CTLFLAG_RW, 0, 97227309Sed "GEOM_JOURNAL stuff"); 98163837SpjdSYSCTL_INT(_kern_geom_journal, OID_AUTO, debug, CTLFLAG_RW, &g_journal_debug, 0, 99163837Spjd "Debug level"); 100163837SpjdSYSCTL_UINT(_kern_geom_journal, OID_AUTO, switch_time, CTLFLAG_RW, 101163837Spjd &g_journal_switch_time, 0, "Switch journals every N seconds"); 102163837SpjdSYSCTL_UINT(_kern_geom_journal, OID_AUTO, force_switch, CTLFLAG_RW, 103168426Spjd &g_journal_force_switch, 0, "Force switch when journal is N% full"); 104163837SpjdSYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_flushes, CTLFLAG_RW, 105163837Spjd &g_journal_parallel_flushes, 0, 106179897Slulf "Number of flush I/O requests to send in parallel"); 
107163837SpjdSYSCTL_UINT(_kern_geom_journal, OID_AUTO, accept_immediately, CTLFLAG_RW, 108163837Spjd &g_journal_accept_immediately, 0, 109179897Slulf "Number of I/O requests accepted immediately"); 110163837SpjdSYSCTL_UINT(_kern_geom_journal, OID_AUTO, parallel_copies, CTLFLAG_RW, 111163837Spjd &g_journal_parallel_copies, 0, 112179897Slulf "Number of copy I/O requests to send in parallel"); 113163837Spjdstatic int 114163837Spjdg_journal_record_entries_sysctl(SYSCTL_HANDLER_ARGS) 115163837Spjd{ 116163837Spjd u_int entries; 117163837Spjd int error; 118163837Spjd 119163837Spjd entries = g_journal_record_entries; 120170289Sdwmalone error = sysctl_handle_int(oidp, &entries, 0, req); 121163837Spjd if (error != 0 || req->newptr == NULL) 122163837Spjd return (error); 123163837Spjd if (entries < 1 || entries > GJ_RECORD_HEADER_NENTRIES) 124163837Spjd return (EINVAL); 125163837Spjd g_journal_record_entries = entries; 126163837Spjd return (0); 127163837Spjd} 128163837SpjdSYSCTL_PROC(_kern_geom_journal, OID_AUTO, record_entries, 129163837Spjd CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_record_entries_sysctl, "I", 130163837Spjd "Maximum number of entires in one journal record"); 131163837SpjdSYSCTL_UINT(_kern_geom_journal, OID_AUTO, optimize, CTLFLAG_RW, 132163837Spjd &g_journal_do_optimize, 0, "Try to combine bios on flush and copy"); 133163837Spjd 134163837Spjdstatic u_int g_journal_cache_used = 0; 135163837Spjdstatic u_int g_journal_cache_limit = 64 * 1024 * 1024; 136163837SpjdTUNABLE_INT("kern.geom.journal.cache.limit", &g_journal_cache_limit); 137163837Spjdstatic u_int g_journal_cache_divisor = 2; 138163837SpjdTUNABLE_INT("kern.geom.journal.cache.divisor", &g_journal_cache_divisor); 139163837Spjdstatic u_int g_journal_cache_switch = 90; 140163837Spjdstatic u_int g_journal_cache_misses = 0; 141163837Spjdstatic u_int g_journal_cache_alloc_failures = 0; 142163837Spjdstatic u_int g_journal_cache_low = 0; 143163837Spjd 144227309Sedstatic SYSCTL_NODE(_kern_geom_journal, 
OID_AUTO, cache, CTLFLAG_RW, 0, 145163837Spjd "GEOM_JOURNAL cache"); 146163837SpjdSYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, used, CTLFLAG_RD, 147163837Spjd &g_journal_cache_used, 0, "Number of allocated bytes"); 148163837Spjdstatic int 149163837Spjdg_journal_cache_limit_sysctl(SYSCTL_HANDLER_ARGS) 150163837Spjd{ 151163837Spjd u_int limit; 152163837Spjd int error; 153163837Spjd 154163837Spjd limit = g_journal_cache_limit; 155170289Sdwmalone error = sysctl_handle_int(oidp, &limit, 0, req); 156163837Spjd if (error != 0 || req->newptr == NULL) 157163837Spjd return (error); 158163837Spjd g_journal_cache_limit = limit; 159163837Spjd g_journal_cache_low = (limit / 100) * g_journal_cache_switch; 160163837Spjd return (0); 161163837Spjd} 162163837SpjdSYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, limit, 163163837Spjd CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_limit_sysctl, "I", 164163837Spjd "Maximum number of allocated bytes"); 165163837SpjdSYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, divisor, CTLFLAG_RDTUN, 166163837Spjd &g_journal_cache_divisor, 0, 167163837Spjd "(kmem_size / kern.geom.journal.cache.divisor) == cache size"); 168163837Spjdstatic int 169163837Spjdg_journal_cache_switch_sysctl(SYSCTL_HANDLER_ARGS) 170163837Spjd{ 171163837Spjd u_int cswitch; 172163837Spjd int error; 173163837Spjd 174163837Spjd cswitch = g_journal_cache_switch; 175170289Sdwmalone error = sysctl_handle_int(oidp, &cswitch, 0, req); 176163837Spjd if (error != 0 || req->newptr == NULL) 177163837Spjd return (error); 178163837Spjd if (cswitch < 0 || cswitch > 100) 179163837Spjd return (EINVAL); 180163837Spjd g_journal_cache_switch = cswitch; 181163837Spjd g_journal_cache_low = (g_journal_cache_limit / 100) * cswitch; 182163837Spjd return (0); 183163837Spjd} 184163837SpjdSYSCTL_PROC(_kern_geom_journal_cache, OID_AUTO, switch, 185163837Spjd CTLTYPE_UINT | CTLFLAG_RW, NULL, 0, g_journal_cache_switch_sysctl, "I", 186163837Spjd "Force switch when we hit this percent of cache use"); 
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, misses, CTLFLAG_RW,
    &g_journal_cache_misses, 0, "Number of cache misses");
SYSCTL_UINT(_kern_geom_journal_cache, OID_AUTO, alloc_failures, CTLFLAG_RW,
    &g_journal_cache_alloc_failures, 0, "Memory allocation failures");

/* Counters exported read-write under kern.geom.journal.stats. */
static u_long g_journal_stats_bytes_skipped = 0;
static u_long g_journal_stats_combined_ios = 0;
static u_long g_journal_stats_switches = 0;
static u_long g_journal_stats_wait_for_copy = 0;
static u_long g_journal_stats_journal_full = 0;
static u_long g_journal_stats_low_mem = 0;

static SYSCTL_NODE(_kern_geom_journal, OID_AUTO, stats, CTLFLAG_RW, 0,
    "GEOM_JOURNAL statistics");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, skipped_bytes, CTLFLAG_RW,
    &g_journal_stats_bytes_skipped, 0, "Number of skipped bytes");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, combined_ios, CTLFLAG_RW,
    &g_journal_stats_combined_ios, 0, "Number of combined I/O requests");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, switches, CTLFLAG_RW,
    &g_journal_stats_switches, 0, "Number of journal switches");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, wait_for_copy, CTLFLAG_RW,
    &g_journal_stats_wait_for_copy, 0, "Wait for journal copy on switch");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, journal_full, CTLFLAG_RW,
    &g_journal_stats_journal_full, 0,
    "Number of times journal was almost full.");
SYSCTL_ULONG(_kern_geom_journal_stats, OID_AUTO, low_mem, CTLFLAG_RW,
    &g_journal_stats_low_mem, 0, "Number of times low_mem hook was called.");

/* GEOM class method implementations, defined later in this file. */
static g_taste_t g_journal_taste;
static g_ctl_req_t g_journal_config;
static g_dumpconf_t g_journal_dumpconf;
static g_init_t g_journal_init;
static g_fini_t g_journal_fini;

/* The gjournal GEOM class registration. */
struct g_class g_journal_class = {
	.name = G_JOURNAL_CLASS_NAME,
	.version = G_VERSION,
	.taste = g_journal_taste,
	.ctlreq = g_journal_config,
	.dumpconf = g_journal_dumpconf,
	.init = g_journal_init,
	.fini = g_journal_fini
};

static int g_journal_destroy(struct g_journal_softc *sc);
static void g_journal_metadata_update(struct g_journal_softc *sc);
static void g_journal_switch_wait(struct g_journal_softc *sc);

/* States of the journal-switcher kernel thread. */
#define	GJ_SWITCHER_WORKING	0
#define	GJ_SWITCHER_DIE		1
#define	GJ_SWITCHER_DIED	2
static int g_journal_switcher_state = GJ_SWITCHER_WORKING;
static int g_journal_switcher_wokenup = 0;
static int g_journal_sync_requested = 0;

#ifdef GJ_MEMDEBUG
/* Per-allocation debug record placed in front of each gj_malloc() block. */
struct meminfo {
	size_t		mi_size;	/* requested allocation size */
	struct stack	mi_stack;	/* stack trace taken at allocation */
};
#endif

/*
 * We use our own malloc/realloc/free functions, so we can collect statistics
 * and force journal switch when we're running out of cache.
 */
static void *
gj_malloc(size_t size, int flags)
{
	void *p;
#ifdef GJ_MEMDEBUG
	struct meminfo *mi;
#endif

	mtx_lock(&g_journal_cache_mtx);
	/*
	 * Crossing the low-water mark wakes the switcher thread so it can
	 * start freeing journal space before we actually run out of cache.
	 */
	if (g_journal_cache_limit > 0 && !g_journal_switcher_wokenup &&
	    g_journal_cache_used + size > g_journal_cache_low) {
		GJ_DEBUG(1, "No cache, waking up the switcher.");
		g_journal_switcher_wokenup = 1;
		wakeup(&g_journal_switcher_state);
	}
	/* M_NOWAIT callers are refused once the hard cache limit is hit. */
	if ((flags & M_NOWAIT) && g_journal_cache_limit > 0 &&
	    g_journal_cache_used + size > g_journal_cache_limit) {
		mtx_unlock(&g_journal_cache_mtx);
		g_journal_cache_alloc_failures++;
		return (NULL);
	}
	g_journal_cache_used += size;
	mtx_unlock(&g_journal_cache_mtx);
	/*
	 * Below the limit we always allocate with M_WAITOK, even for
	 * M_NOWAIT callers: the accounting above is the real admission
	 * control, so the allocation itself may sleep.
	 */
	flags &= ~M_NOWAIT;
#ifndef GJ_MEMDEBUG
	p = malloc(size, M_JOURNAL, flags | M_WAITOK);
#else
	/* Prepend a meminfo header recording size and allocation stack. */
	mi = malloc(sizeof(*mi) + size, M_JOURNAL, flags | M_WAITOK);
	p = (u_char *)mi + sizeof(*mi);
	mi->mi_size = size;
	stack_save(&mi->mi_stack);
#endif
	return (p);
}

/*
 * Free a gj_malloc()ed block and return its size to the cache accounting.
 * The caller must pass the same size that was requested at allocation time.
 */
static void
gj_free(void *p, size_t size)
{
#ifdef GJ_MEMDEBUG
	struct meminfo *mi;
#endif

	KASSERT(p != NULL, ("p=NULL"));
	KASSERT(size > 0, ("size=0"));
	mtx_lock(&g_journal_cache_mtx);
	KASSERT(g_journal_cache_used >= size, ("Freeing too much?"));
	g_journal_cache_used -= size;
	mtx_unlock(&g_journal_cache_mtx);
#ifdef GJ_MEMDEBUG
	/* Step back to the meminfo header and cross-check the size. */
	mi = p = (void *)((u_char *)p - sizeof(*mi));
	if (mi->mi_size != size) {
		printf("GJOURNAL: Size mismatch! %zu != %zu\n", size,
		    mi->mi_size);
		printf("GJOURNAL: Alloc backtrace:\n");
		stack_print(&mi->mi_stack);
		printf("GJOURNAL: Free backtrace:\n");
		kdb_backtrace();
	}
#endif
	free(p, M_JOURNAL);
}

/*
 * Resize a gj_malloc()ed block, keeping the cache accounting in sync.
 * In the debug case we cannot use realloc(9) because of the meminfo
 * header, so allocate-copy-free instead.
 */
static void *
gj_realloc(void *p, size_t size, size_t oldsize)
{
	void *np;

#ifndef GJ_MEMDEBUG
	mtx_lock(&g_journal_cache_mtx);
	g_journal_cache_used -= oldsize;
	g_journal_cache_used += size;
	mtx_unlock(&g_journal_cache_mtx);
	np = realloc(p, size, M_JOURNAL, M_WAITOK);
#else
	np = gj_malloc(size, M_WAITOK);
	bcopy(p, np, MIN(oldsize, size));
	gj_free(p, oldsize);
#endif
	return (np);
}

/*
 * Panic if the journal write offset has run into the inactive journal,
 * and wake the switcher thread early when the active journal consumes
 * more than g_journal_force_switch percent of the free journal space.
 */
static void
g_journal_check_overflow(struct g_journal_softc *sc)
{
	off_t length, used;

	if ((sc->sc_active.jj_offset < sc->sc_inactive.jj_offset &&
	    sc->sc_journal_offset >= sc->sc_inactive.jj_offset) ||
	    (sc->sc_active.jj_offset > sc->sc_inactive.jj_offset &&
	    sc->sc_journal_offset >= sc->sc_inactive.jj_offset &&
	    sc->sc_journal_offset < sc->sc_active.jj_offset)) {
		panic("Journal overflow "
		    "(id = %u joffset=%jd active=%jd inactive=%jd)",
		    (unsigned)sc->sc_id,
		    (intmax_t)sc->sc_journal_offset,
		    (intmax_t)sc->sc_active.jj_offset,
		    (intmax_t)sc->sc_inactive.jj_offset);
	}
	if (sc->sc_active.jj_offset < sc->sc_inactive.jj_offset) {
		length = sc->sc_inactive.jj_offset - sc->sc_active.jj_offset;
		used = sc->sc_journal_offset - sc->sc_active.jj_offset;
	} else {
		/* The journal area wraps around; account in two pieces. */
		length = sc->sc_jend - sc->sc_active.jj_offset;
		length += sc->sc_inactive.jj_offset - sc->sc_jstart;
		if (sc->sc_journal_offset >= sc->sc_active.jj_offset)
			used = sc->sc_journal_offset - sc->sc_active.jj_offset;
		else {
			used = sc->sc_jend - sc->sc_active.jj_offset;
			used += sc->sc_journal_offset - sc->sc_jstart;
		}
	}
	/* Already woken up? */
	if (g_journal_switcher_wokenup)
		return;
	/*
	 * If the active journal takes more than g_journal_force_switch percent
	 * of free journal space, we force journal switch.
	 */
	KASSERT(length > 0,
	    ("length=%jd used=%jd active=%jd inactive=%jd joffset=%jd",
	    (intmax_t)length, (intmax_t)used,
	    (intmax_t)sc->sc_active.jj_offset,
	    (intmax_t)sc->sc_inactive.jj_offset,
	    (intmax_t)sc->sc_journal_offset));
	if ((used * 100) / length > g_journal_force_switch) {
		g_journal_stats_journal_full++;
		GJ_DEBUG(1, "Journal %s %jd%% full, forcing journal switch.",
		    sc->sc_name, (used * 100) / length);
		mtx_lock(&g_journal_cache_mtx);
		g_journal_switcher_wokenup = 1;
		wakeup(&g_journal_switcher_state);
		mtx_unlock(&g_journal_cache_mtx);
	}
}

/*
 * Orphan method: one of our consumers' providers went away, so tear the
 * journal down.  The provider name is copied out first because sc (and
 * with it cp->provider) may be freed by g_journal_destroy().
 */
static void
g_journal_orphan(struct g_consumer *cp)
{
	struct g_journal_softc *sc;
	char name[256];
	int error;

	g_topology_assert();
	sc = cp->geom->softc;
	strlcpy(name, cp->provider->name, sizeof(name));
	GJ_DEBUG(0, "Lost provider %s.", name);
	if (sc == NULL)
		return;
	error = g_journal_destroy(sc);
	if (error == 0)
		GJ_DEBUG(0, "Journal %s destroyed.", name);
	else {
		GJ_DEBUG(0, "Cannot destroy journal %s (error=%d). "
		    "Destroy it manually after last close.", sc->sc_name,
		    error);
	}
}

/*
 * Access method.  On the first writer we mark the device dirty and push
 * updated metadata to disk; a destroyed/destroying device only accepts
 * access release.
 */
static int
g_journal_access(struct g_provider *pp, int acr, int acw, int ace)
{
	struct g_journal_softc *sc;
	int dcr, dcw, dce;

	g_topology_assert();
	GJ_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name,
	    acr, acw, ace);

	dcr = pp->acr + acr;
	dcw = pp->acw + acw;
	dce = pp->ace + ace;

	sc = pp->geom->softc;
	if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY)) {
		if (acr <= 0 && acw <= 0 && ace <= 0)
			return (0);
		else
			return (ENXIO);
	}
	if (pp->acw == 0 && dcw > 0) {
		GJ_DEBUG(1, "Marking %s as dirty.", sc->sc_name);
		sc->sc_flags &= ~GJF_DEVICE_CLEAN;
		g_topology_unlock();
		g_journal_metadata_update(sc);
		g_topology_lock();
	} /* else if (pp->acw == 0 && dcw > 0 && JEMPTY(sc)) {
		GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
		sc->sc_flags |= GJF_DEVICE_CLEAN;
		g_topology_unlock();
		g_journal_metadata_update(sc);
		g_topology_lock();
	} */
	return (0);
}

/*
 * Serialize a journal header into its little-endian on-disk form:
 * magic, 32-bit journal id, 32-bit next-journal id.
 */
static void
g_journal_header_encode(struct g_journal_header *hdr, u_char *data)
{

	bcopy(GJ_HEADER_MAGIC, data, sizeof(GJ_HEADER_MAGIC));
	data += sizeof(GJ_HEADER_MAGIC);
	le32enc(data, hdr->jh_journal_id);
	data += 4;
	le32enc(data, hdr->jh_journal_next_id);
}

/*
 * Deserialize a journal header from its on-disk form.
 * Returns EINVAL if the magic does not match.
 */
static int
g_journal_header_decode(const u_char *data, struct g_journal_header *hdr)
{

	bcopy(data, hdr->jh_magic, sizeof(hdr->jh_magic));
	data += sizeof(hdr->jh_magic);
	if (bcmp(hdr->jh_magic, GJ_HEADER_MAGIC, sizeof(GJ_HEADER_MAGIC)) != 0)
		return (EINVAL);
	hdr->jh_journal_id = le32dec(data);
	data += 4;
	hdr->jh_journal_next_id = le32dec(data);
	return (0);
}

/*
 * Issue BIO_FLUSH to the journal and/or data providers, depending on
 * which flush modes are enabled in sc_bio_flush.  Errors are logged only.
 */
static void
g_journal_flush_cache(struct g_journal_softc *sc)
{
	struct bintime bt;
	int error;

	if (sc->sc_bio_flush == 0)
		return;
	GJ_TIMER_START(1, &bt);
	if (sc->sc_bio_flush & GJ_FLUSH_JOURNAL) {
		error = g_io_flush(sc->sc_jconsumer);
		GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
		    sc->sc_jconsumer->provider->name, error);
	}
	if (sc->sc_bio_flush & GJ_FLUSH_DATA) {
		/*
		 * TODO: This could be called in parallel with the
		 * previous call.
		 */
		error = g_io_flush(sc->sc_dconsumer);
		GJ_DEBUG(error == 0 ? 2 : 0, "Flush cache of %s: error=%d.",
		    sc->sc_dconsumer->provider->name, error);
	}
	GJ_TIMER_STOP(1, &bt, "Cache flush time");
}

/*
 * Write one journal header sector at the current journal offset and
 * advance the offset.  Note the offset is advanced even on write error
 * (see the commented-out condition below) so the on-disk layout stays
 * in step with our bookkeeping.
 */
static int
g_journal_write_header(struct g_journal_softc *sc)
{
	struct g_journal_header hdr;
	struct g_consumer *cp;
	u_char *buf;
	int error;

	cp = sc->sc_jconsumer;
	buf = gj_malloc(cp->provider->sectorsize, M_WAITOK);

	strlcpy(hdr.jh_magic, GJ_HEADER_MAGIC, sizeof(hdr.jh_magic));
	hdr.jh_journal_id = sc->sc_journal_id;
	hdr.jh_journal_next_id = sc->sc_journal_next_id;
	g_journal_header_encode(&hdr, buf);
	error = g_write_data(cp, sc->sc_journal_offset, buf,
	    cp->provider->sectorsize);
	/* if (error == 0) */
	sc->sc_journal_offset += cp->provider->sectorsize;

	gj_free(buf, cp->provider->sectorsize);
	return (error);
}

/*
 * Every journal record has a header and data following it.
 * Functions below are used to encode the header to little endian before
 * storing it on disk and to decode it back to system endianness after
 * reading.
 */
/*
 * Serialize a record header into its little-endian on-disk form.
 * Layout: magic, 32-bit journal id (in an 8-byte slot — the remaining
 * 4 bytes appear to be reserved/padding, matching the decoder below;
 * TODO confirm against the on-disk format definition), 16-bit entry
 * count, 8-byte checksum, then one 24-byte triple per entry.
 */
static void
g_journal_record_header_encode(struct g_journal_record_header *hdr,
    u_char *data)
{
	struct g_journal_entry *ent;
	u_int i;

	bcopy(GJ_RECORD_HEADER_MAGIC, data, sizeof(GJ_RECORD_HEADER_MAGIC));
	data += sizeof(GJ_RECORD_HEADER_MAGIC);
	le32enc(data, hdr->jrh_journal_id);
	data += 8;
	le16enc(data, hdr->jrh_nentries);
	data += 2;
	bcopy(hdr->jrh_sum, data, sizeof(hdr->jrh_sum));
	data += 8;
	for (i = 0; i < hdr->jrh_nentries; i++) {
		ent = &hdr->jrh_entries[i];
		le64enc(data, ent->je_joffset);
		data += 8;
		le64enc(data, ent->je_offset);
		data += 8;
		le64enc(data, ent->je_length);
		data += 8;
	}
}

/*
 * Deserialize a record header from its on-disk form.
 * Returns EINVAL on bad magic or an entry count exceeding
 * GJ_RECORD_HEADER_NENTRIES.
 */
static int
g_journal_record_header_decode(const u_char *data,
    struct g_journal_record_header *hdr)
{
	struct g_journal_entry *ent;
	u_int i;

	bcopy(data, hdr->jrh_magic, sizeof(hdr->jrh_magic));
	data += sizeof(hdr->jrh_magic);
	if (strcmp(hdr->jrh_magic, GJ_RECORD_HEADER_MAGIC) != 0)
		return (EINVAL);
	hdr->jrh_journal_id = le32dec(data);
	data += 8;
	hdr->jrh_nentries = le16dec(data);
	data += 2;
	if (hdr->jrh_nentries > GJ_RECORD_HEADER_NENTRIES)
		return (EINVAL);
	bcopy(data, hdr->jrh_sum, sizeof(hdr->jrh_sum));
	data += 8;
	for (i = 0; i < hdr->jrh_nentries; i++) {
		ent = &hdr->jrh_entries[i];
		ent->je_joffset = le64dec(data);
		data += 8;
		ent->je_offset = le64dec(data);
		data += 8;
		ent->je_length = le64dec(data);
		data += 8;
	}
	return (0);
}

/*
 * Function reads metadata from a provider (via the given consumer), decodes
 * it to system endianness and verifies its correctness.
 */
static int
g_journal_metadata_read(struct g_consumer *cp, struct g_journal_metadata *md)
{
	struct g_provider *pp;
	u_char *buf;
	int error;

	g_topology_assert();

	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		return (error);
	pp = cp->provider;
	g_topology_unlock();
	/* Metadata is stored in last sector. */
	buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize,
	    &error);
	g_topology_lock();
	g_access(cp, -1, 0, 0);
	if (buf == NULL) {
		GJ_DEBUG(1, "Cannot read metadata from %s (error=%d).",
		    cp->provider->name, error);
		return (error);
	}

	/*
	 * Decode metadata.  Note the decode error (checksum mismatch) is
	 * deliberately checked last, after the magic and version tests:
	 * a wrong magic just means "not ours" and should not be reported
	 * as corruption.
	 */
	error = journal_metadata_decode(buf, md);
	g_free(buf);
	/* Is this a gjournal provider at all? */
	if (strcmp(md->md_magic, G_JOURNAL_MAGIC) != 0)
		return (EINVAL);
	/*
	 * Are we able to handle this version of metadata?
	 * We only maintain backward compatibility.
	 */
	if (md->md_version > G_JOURNAL_VERSION) {
		GJ_DEBUG(0,
		    "Kernel module is too old to handle metadata from %s.",
		    cp->provider->name);
		return (EINVAL);
	}
	/* Is checksum correct? */
	if (error != 0) {
		GJ_DEBUG(0, "MD5 metadata hash mismatch for provider %s.",
		    cp->provider->name);
		return (error);
	}
	return (0);
}

/*
 * Two functions below are responsible for updating metadata.
 * Only metadata on the data provider is updated (we need to update
 * information about active journal in there).
 */
static void
g_journal_metadata_done(struct bio *bp)
{

	/*
	 * There is not much we can do on error except informing about it.
	 */
	if (bp->bio_error != 0) {
		GJ_LOGREQ(0, bp, "Cannot update metadata (error=%d).",
		    bp->bio_error);
	} else {
		GJ_LOGREQ(2, bp, "Metadata updated.");
	}
	gj_free(bp->bio_data, bp->bio_length);
	g_destroy_bio(bp);
}

/*
 * Build current metadata from the softc and write it to the last sector
 * of the data provider, flushing device caches before and after so the
 * metadata never claims more than what is actually on disk.
 */
static void
g_journal_metadata_update(struct g_journal_softc *sc)
{
	struct g_journal_metadata md;
	struct g_consumer *cp;
	struct bio *bp;
	u_char *sector;

	cp = sc->sc_dconsumer;
	sector = gj_malloc(cp->provider->sectorsize, M_WAITOK);
	strlcpy(md.md_magic, G_JOURNAL_MAGIC, sizeof(md.md_magic));
	md.md_version = G_JOURNAL_VERSION;
	md.md_id = sc->sc_id;
	md.md_type = sc->sc_orig_type;
	md.md_jstart = sc->sc_jstart;
	md.md_jend = sc->sc_jend;
	md.md_joffset = sc->sc_inactive.jj_offset;
	md.md_jid = sc->sc_journal_previous_id;
	md.md_flags = 0;
	if (sc->sc_flags & GJF_DEVICE_CLEAN)
		md.md_flags |= GJ_FLAG_CLEAN;

	if (sc->sc_flags & GJF_DEVICE_HARDCODED)
		strlcpy(md.md_provider, sc->sc_name, sizeof(md.md_provider));
	else
		bzero(md.md_provider, sizeof(md.md_provider));
	md.md_provsize = cp->provider->mediasize;
	journal_metadata_encode(&md, sector);

	/*
	 * Flush the cache, so we know all data are on disk.
	 * We write here information like "journal is consistent", so we need
	 * to be sure it is. Without BIO_FLUSH here, we can end up in situation
	 * where metadata is stored on disk, but not all data.
	 */
	g_journal_flush_cache(sc);

	bp = g_alloc_bio();
	bp->bio_offset = cp->provider->mediasize - cp->provider->sectorsize;
	bp->bio_length = cp->provider->sectorsize;
	bp->bio_data = sector;
	bp->bio_cmd = BIO_WRITE;
	if (!(sc->sc_flags & GJF_DEVICE_DESTROY)) {
		bp->bio_done = g_journal_metadata_done;
		g_io_request(bp, cp);
	} else {
		/* During destroy, wait synchronously for the write. */
		bp->bio_done = NULL;
		g_io_request(bp, cp);
		biowait(bp, "gjmdu");
		g_journal_metadata_done(bp);
	}

	/*
	 * Be sure metadata reached the disk.
	 */
	g_journal_flush_cache(sc);
}

/*
 * This is where the I/O request comes from the GEOM.
721163837Spjd */ 722163837Spjdstatic void 723163837Spjdg_journal_start(struct bio *bp) 724163837Spjd{ 725163837Spjd struct g_journal_softc *sc; 726163837Spjd 727163837Spjd sc = bp->bio_to->geom->softc; 728163837Spjd GJ_LOGREQ(3, bp, "Request received."); 729163837Spjd 730163837Spjd switch (bp->bio_cmd) { 731163837Spjd case BIO_READ: 732163837Spjd case BIO_WRITE: 733163837Spjd mtx_lock(&sc->sc_mtx); 734163837Spjd bioq_insert_tail(&sc->sc_regular_queue, bp); 735163837Spjd wakeup(sc); 736163837Spjd mtx_unlock(&sc->sc_mtx); 737163837Spjd return; 738163837Spjd case BIO_GETATTR: 739163837Spjd if (strcmp(bp->bio_attribute, "GJOURNAL::provider") == 0) { 740163837Spjd strlcpy(bp->bio_data, bp->bio_to->name, bp->bio_length); 741163837Spjd bp->bio_completed = strlen(bp->bio_to->name) + 1; 742163837Spjd g_io_deliver(bp, 0); 743163837Spjd return; 744163837Spjd } 745163837Spjd /* FALLTHROUGH */ 746163837Spjd case BIO_DELETE: 747163837Spjd default: 748163837Spjd g_io_deliver(bp, EOPNOTSUPP); 749163837Spjd return; 750163837Spjd } 751163837Spjd} 752163837Spjd 753163837Spjdstatic void 754163837Spjdg_journal_std_done(struct bio *bp) 755163837Spjd{ 756163837Spjd struct g_journal_softc *sc; 757163837Spjd 758163837Spjd sc = bp->bio_from->geom->softc; 759163837Spjd mtx_lock(&sc->sc_mtx); 760163837Spjd bioq_insert_tail(&sc->sc_back_queue, bp); 761163837Spjd wakeup(sc); 762163837Spjd mtx_unlock(&sc->sc_mtx); 763163837Spjd} 764163837Spjd 765163837Spjdstatic struct bio * 766163837Spjdg_journal_new_bio(off_t start, off_t end, off_t joffset, u_char *data, 767163837Spjd int flags) 768163837Spjd{ 769163837Spjd struct bio *bp; 770163837Spjd 771163837Spjd bp = g_alloc_bio(); 772163837Spjd bp->bio_offset = start; 773163837Spjd bp->bio_joffset = joffset; 774163837Spjd bp->bio_length = end - start; 775163837Spjd bp->bio_cmd = BIO_WRITE; 776163837Spjd bp->bio_done = g_journal_std_done; 777163837Spjd if (data == NULL) 778163837Spjd bp->bio_data = NULL; 779163837Spjd else { 780163837Spjd bp->bio_data = 
gj_malloc(bp->bio_length, flags); 781163837Spjd if (bp->bio_data != NULL) 782163837Spjd bcopy(data, bp->bio_data, bp->bio_length); 783163837Spjd } 784163837Spjd return (bp); 785163837Spjd} 786163837Spjd 787163837Spjd#define g_journal_insert_bio(head, bp, flags) \ 788163837Spjd g_journal_insert((head), (bp)->bio_offset, \ 789163837Spjd (bp)->bio_offset + (bp)->bio_length, (bp)->bio_joffset, \ 790163837Spjd (bp)->bio_data, flags) 791163837Spjd/* 792163837Spjd * The function below does a lot more than just inserting bio to the queue. 793163837Spjd * It keeps the queue sorted by offset and ensures that there are no doubled 794163837Spjd * data (it combines bios where ranges overlap). 795163837Spjd * 796163837Spjd * The function returns the number of bios inserted (as bio can be splitted). 797163837Spjd */ 798163837Spjdstatic int 799163837Spjdg_journal_insert(struct bio **head, off_t nstart, off_t nend, off_t joffset, 800163837Spjd u_char *data, int flags) 801163837Spjd{ 802163837Spjd struct bio *nbp, *cbp, *pbp; 803163837Spjd off_t cstart, cend; 804163837Spjd u_char *tmpdata; 805163837Spjd int n; 806163837Spjd 807163837Spjd GJ_DEBUG(3, "INSERT(%p): (%jd, %jd, %jd)", *head, nstart, nend, 808163837Spjd joffset); 809163837Spjd n = 0; 810163837Spjd pbp = NULL; 811163837Spjd GJQ_FOREACH(*head, cbp) { 812163837Spjd cstart = cbp->bio_offset; 813163837Spjd cend = cbp->bio_offset + cbp->bio_length; 814163837Spjd 815163837Spjd if (nstart >= cend) { 816163837Spjd /* 817163837Spjd * +-------------+ 818163837Spjd * | | 819163837Spjd * | current | +-------------+ 820163837Spjd * | bio | | | 821163837Spjd * | | | new | 822163837Spjd * +-------------+ | bio | 823163837Spjd * | | 824163837Spjd * +-------------+ 825163837Spjd */ 826163837Spjd GJ_DEBUG(3, "INSERT(%p): 1", *head); 827163837Spjd } else if (nend <= cstart) { 828163837Spjd /* 829163837Spjd * +-------------+ 830163837Spjd * | | 831163837Spjd * +-------------+ | current | 832163837Spjd * | | | bio | 833163837Spjd * | new | | | 
834163837Spjd * | bio | +-------------+ 835163837Spjd * | | 836163837Spjd * +-------------+ 837163837Spjd */ 838163837Spjd nbp = g_journal_new_bio(nstart, nend, joffset, data, 839163837Spjd flags); 840163837Spjd if (pbp == NULL) 841163837Spjd *head = nbp; 842163837Spjd else 843163837Spjd pbp->bio_next = nbp; 844163837Spjd nbp->bio_next = cbp; 845163837Spjd n++; 846163837Spjd GJ_DEBUG(3, "INSERT(%p): 2 (nbp=%p pbp=%p)", *head, nbp, 847163837Spjd pbp); 848163837Spjd goto end; 849163837Spjd } else if (nstart <= cstart && nend >= cend) { 850163837Spjd /* 851163837Spjd * +-------------+ +-------------+ 852163837Spjd * | current bio | | current bio | 853163837Spjd * +---+-------------+---+ +-------------+---+ 854163837Spjd * | | | | | | | 855163837Spjd * | | | | | | | 856163837Spjd * | +-------------+ | +-------------+ | 857163837Spjd * | new bio | | new bio | 858163837Spjd * +---------------------+ +-----------------+ 859163837Spjd * 860163837Spjd * +-------------+ +-------------+ 861163837Spjd * | current bio | | current bio | 862163837Spjd * +---+-------------+ +-------------+ 863163837Spjd * | | | | | 864163837Spjd * | | | | | 865163837Spjd * | +-------------+ +-------------+ 866163837Spjd * | new bio | | new bio | 867163837Spjd * +-----------------+ +-------------+ 868163837Spjd */ 869163837Spjd g_journal_stats_bytes_skipped += cbp->bio_length; 870163837Spjd cbp->bio_offset = nstart; 871163837Spjd cbp->bio_joffset = joffset; 872163837Spjd cbp->bio_length = cend - nstart; 873163837Spjd if (cbp->bio_data != NULL) { 874163837Spjd gj_free(cbp->bio_data, cend - cstart); 875163837Spjd cbp->bio_data = NULL; 876163837Spjd } 877163837Spjd if (data != NULL) { 878163837Spjd cbp->bio_data = gj_malloc(cbp->bio_length, 879163837Spjd flags); 880163837Spjd if (cbp->bio_data != NULL) { 881163837Spjd bcopy(data, cbp->bio_data, 882163837Spjd cbp->bio_length); 883163837Spjd } 884163837Spjd data += cend - nstart; 885163837Spjd } 886163837Spjd joffset += cend - nstart; 887163837Spjd 
nstart = cend; 888163837Spjd GJ_DEBUG(3, "INSERT(%p): 3 (cbp=%p)", *head, cbp); 889163837Spjd } else if (nstart > cstart && nend >= cend) { 890163837Spjd /* 891163837Spjd * +-----------------+ +-------------+ 892163837Spjd * | current bio | | current bio | 893163837Spjd * | +-------------+ | +---------+---+ 894163837Spjd * | | | | | | | 895163837Spjd * | | | | | | | 896163837Spjd * +---+-------------+ +---+---------+ | 897163837Spjd * | new bio | | new bio | 898163837Spjd * +-------------+ +-------------+ 899163837Spjd */ 900163837Spjd g_journal_stats_bytes_skipped += cend - nstart; 901163837Spjd nbp = g_journal_new_bio(nstart, cend, joffset, data, 902163837Spjd flags); 903163837Spjd nbp->bio_next = cbp->bio_next; 904163837Spjd cbp->bio_next = nbp; 905163837Spjd cbp->bio_length = nstart - cstart; 906163837Spjd if (cbp->bio_data != NULL) { 907163837Spjd cbp->bio_data = gj_realloc(cbp->bio_data, 908163837Spjd cbp->bio_length, cend - cstart); 909163837Spjd } 910163837Spjd if (data != NULL) 911163837Spjd data += cend - nstart; 912163837Spjd joffset += cend - nstart; 913163837Spjd nstart = cend; 914163837Spjd n++; 915163837Spjd GJ_DEBUG(3, "INSERT(%p): 4 (cbp=%p)", *head, cbp); 916163837Spjd } else if (nstart > cstart && nend < cend) { 917163837Spjd /* 918163837Spjd * +---------------------+ 919163837Spjd * | current bio | 920163837Spjd * | +-------------+ | 921163837Spjd * | | | | 922163837Spjd * | | | | 923163837Spjd * +---+-------------+---+ 924163837Spjd * | new bio | 925163837Spjd * +-------------+ 926163837Spjd */ 927163837Spjd g_journal_stats_bytes_skipped += nend - nstart; 928163837Spjd nbp = g_journal_new_bio(nstart, nend, joffset, data, 929163837Spjd flags); 930163837Spjd nbp->bio_next = cbp->bio_next; 931163837Spjd cbp->bio_next = nbp; 932163837Spjd if (cbp->bio_data == NULL) 933163837Spjd tmpdata = NULL; 934163837Spjd else 935163837Spjd tmpdata = cbp->bio_data + nend - cstart; 936163837Spjd nbp = g_journal_new_bio(nend, cend, 937163837Spjd cbp->bio_joffset + 
nend - cstart, tmpdata, flags); 938163837Spjd nbp->bio_next = ((struct bio *)cbp->bio_next)->bio_next; 939163837Spjd ((struct bio *)cbp->bio_next)->bio_next = nbp; 940163837Spjd cbp->bio_length = nstart - cstart; 941163837Spjd if (cbp->bio_data != NULL) { 942163837Spjd cbp->bio_data = gj_realloc(cbp->bio_data, 943163837Spjd cbp->bio_length, cend - cstart); 944163837Spjd } 945163837Spjd n += 2; 946163837Spjd GJ_DEBUG(3, "INSERT(%p): 5 (cbp=%p)", *head, cbp); 947163837Spjd goto end; 948163837Spjd } else if (nstart <= cstart && nend < cend) { 949163837Spjd /* 950163837Spjd * +-----------------+ +-------------+ 951163837Spjd * | current bio | | current bio | 952163837Spjd * +-------------+ | +---+---------+ | 953163837Spjd * | | | | | | | 954163837Spjd * | | | | | | | 955163837Spjd * +-------------+---+ | +---------+---+ 956163837Spjd * | new bio | | new bio | 957163837Spjd * +-------------+ +-------------+ 958163837Spjd */ 959163837Spjd g_journal_stats_bytes_skipped += nend - nstart; 960163837Spjd nbp = g_journal_new_bio(nstart, nend, joffset, data, 961163837Spjd flags); 962163837Spjd if (pbp == NULL) 963163837Spjd *head = nbp; 964163837Spjd else 965163837Spjd pbp->bio_next = nbp; 966163837Spjd nbp->bio_next = cbp; 967163837Spjd cbp->bio_offset = nend; 968163837Spjd cbp->bio_length = cend - nend; 969163837Spjd cbp->bio_joffset += nend - cstart; 970163837Spjd tmpdata = cbp->bio_data; 971163837Spjd if (tmpdata != NULL) { 972163837Spjd cbp->bio_data = gj_malloc(cbp->bio_length, 973163837Spjd flags); 974163837Spjd if (cbp->bio_data != NULL) { 975163837Spjd bcopy(tmpdata + nend - cstart, 976163837Spjd cbp->bio_data, cbp->bio_length); 977163837Spjd } 978163837Spjd gj_free(tmpdata, cend - cstart); 979163837Spjd } 980163837Spjd n++; 981163837Spjd GJ_DEBUG(3, "INSERT(%p): 6 (cbp=%p)", *head, cbp); 982163837Spjd goto end; 983163837Spjd } 984163837Spjd if (nstart == nend) 985163837Spjd goto end; 986163837Spjd pbp = cbp; 987163837Spjd } 988163837Spjd nbp = 
g_journal_new_bio(nstart, nend, joffset, data, flags); 989163837Spjd if (pbp == NULL) 990163837Spjd *head = nbp; 991163837Spjd else 992163837Spjd pbp->bio_next = nbp; 993163837Spjd nbp->bio_next = NULL; 994163837Spjd n++; 995163837Spjd GJ_DEBUG(3, "INSERT(%p): 8 (nbp=%p pbp=%p)", *head, nbp, pbp); 996163837Spjdend: 997163837Spjd if (g_journal_debug >= 3) { 998163837Spjd GJQ_FOREACH(*head, cbp) { 999163837Spjd GJ_DEBUG(3, "ELEMENT: %p (%jd, %jd, %jd, %p)", cbp, 1000163837Spjd (intmax_t)cbp->bio_offset, 1001163837Spjd (intmax_t)cbp->bio_length, 1002163837Spjd (intmax_t)cbp->bio_joffset, cbp->bio_data); 1003163837Spjd } 1004163837Spjd GJ_DEBUG(3, "INSERT(%p): DONE %d", *head, n); 1005163837Spjd } 1006163837Spjd return (n); 1007163837Spjd} 1008163837Spjd 1009163837Spjd/* 1010163837Spjd * The function combines neighbour bios trying to squeeze as much data as 1011163837Spjd * possible into one bio. 1012163837Spjd * 1013163837Spjd * The function returns the number of bios combined (negative value). 1014163837Spjd */ 1015163837Spjdstatic int 1016163837Spjdg_journal_optimize(struct bio *head) 1017163837Spjd{ 1018163837Spjd struct bio *cbp, *pbp; 1019163837Spjd int n; 1020163837Spjd 1021163837Spjd n = 0; 1022163837Spjd pbp = NULL; 1023163837Spjd GJQ_FOREACH(head, cbp) { 1024163837Spjd /* Skip bios which has to be read first. */ 1025163837Spjd if (cbp->bio_data == NULL) { 1026163837Spjd pbp = NULL; 1027163837Spjd continue; 1028163837Spjd } 1029163837Spjd /* There is no previous bio yet. */ 1030163837Spjd if (pbp == NULL) { 1031163837Spjd pbp = cbp; 1032163837Spjd continue; 1033163837Spjd } 1034163837Spjd /* Is this a neighbour bio? */ 1035163837Spjd if (pbp->bio_offset + pbp->bio_length != cbp->bio_offset) { 1036163837Spjd /* Be sure that bios queue is sorted. 
*/ 1037163837Spjd KASSERT(pbp->bio_offset + pbp->bio_length < cbp->bio_offset, 1038163837Spjd ("poffset=%jd plength=%jd coffset=%jd", 1039163837Spjd (intmax_t)pbp->bio_offset, 1040163837Spjd (intmax_t)pbp->bio_length, 1041163837Spjd (intmax_t)cbp->bio_offset)); 1042163837Spjd pbp = cbp; 1043163837Spjd continue; 1044163837Spjd } 1045163837Spjd /* Be sure we don't end up with too big bio. */ 1046163837Spjd if (pbp->bio_length + cbp->bio_length > MAXPHYS) { 1047163837Spjd pbp = cbp; 1048163837Spjd continue; 1049163837Spjd } 1050163837Spjd /* Ok, we can join bios. */ 1051163837Spjd GJ_LOGREQ(4, pbp, "Join: "); 1052163837Spjd GJ_LOGREQ(4, cbp, "and: "); 1053163837Spjd pbp->bio_data = gj_realloc(pbp->bio_data, 1054163837Spjd pbp->bio_length + cbp->bio_length, pbp->bio_length); 1055163837Spjd bcopy(cbp->bio_data, pbp->bio_data + pbp->bio_length, 1056163837Spjd cbp->bio_length); 1057163837Spjd gj_free(cbp->bio_data, cbp->bio_length); 1058163837Spjd pbp->bio_length += cbp->bio_length; 1059163837Spjd pbp->bio_next = cbp->bio_next; 1060163837Spjd g_destroy_bio(cbp); 1061163837Spjd cbp = pbp; 1062163837Spjd g_journal_stats_combined_ios++; 1063163837Spjd n--; 1064163837Spjd GJ_LOGREQ(4, pbp, "Got: "); 1065163837Spjd } 1066163837Spjd return (n); 1067163837Spjd} 1068163837Spjd 1069163837Spjd/* 1070163837Spjd * TODO: Update comment. 1071163837Spjd * These are functions responsible for copying one portion of data from journal 1072163837Spjd * to the destination provider. 1073163837Spjd * The order goes like this: 1074163837Spjd * 1. Read the header, which contains informations about data blocks 1075163837Spjd * following it. 1076163837Spjd * 2. Read the data blocks from the journal. 1077163837Spjd * 3. Write the data blocks on the data provider. 1078163837Spjd * 1079163837Spjd * g_journal_copy_start() 1080163837Spjd * g_journal_copy_done() - got finished write request, logs potential errors. 
1081163837Spjd */ 1082163837Spjd 1083163837Spjd/* 1084163837Spjd * When there is no data in cache, this function is used to read it. 1085163837Spjd */ 1086163837Spjdstatic void 1087163837Spjdg_journal_read_first(struct g_journal_softc *sc, struct bio *bp) 1088163837Spjd{ 1089163837Spjd struct bio *cbp; 1090163837Spjd 1091163837Spjd /* 1092163837Spjd * We were short in memory, so data was freed. 1093163837Spjd * In that case we need to read it back from journal. 1094163837Spjd */ 1095163837Spjd cbp = g_alloc_bio(); 1096163837Spjd cbp->bio_cflags = bp->bio_cflags; 1097163837Spjd cbp->bio_parent = bp; 1098163837Spjd cbp->bio_offset = bp->bio_joffset; 1099163837Spjd cbp->bio_length = bp->bio_length; 1100163837Spjd cbp->bio_data = gj_malloc(bp->bio_length, M_WAITOK); 1101163837Spjd cbp->bio_cmd = BIO_READ; 1102163837Spjd cbp->bio_done = g_journal_std_done; 1103163837Spjd GJ_LOGREQ(4, cbp, "READ FIRST"); 1104163837Spjd g_io_request(cbp, sc->sc_jconsumer); 1105163837Spjd g_journal_cache_misses++; 1106163837Spjd} 1107163837Spjd 1108163837Spjdstatic void 1109163837Spjdg_journal_copy_send(struct g_journal_softc *sc) 1110163837Spjd{ 1111163837Spjd struct bio *bioq, *bp, *lbp; 1112163837Spjd 1113163837Spjd bioq = lbp = NULL; 1114163837Spjd mtx_lock(&sc->sc_mtx); 1115163837Spjd for (; sc->sc_copy_in_progress < g_journal_parallel_copies;) { 1116163837Spjd bp = GJQ_FIRST(sc->sc_inactive.jj_queue); 1117163837Spjd if (bp == NULL) 1118163837Spjd break; 1119163837Spjd GJQ_REMOVE(sc->sc_inactive.jj_queue, bp); 1120163837Spjd sc->sc_copy_in_progress++; 1121163837Spjd GJQ_INSERT_AFTER(bioq, bp, lbp); 1122163837Spjd lbp = bp; 1123163837Spjd } 1124163837Spjd mtx_unlock(&sc->sc_mtx); 1125163837Spjd if (g_journal_do_optimize) 1126163837Spjd sc->sc_copy_in_progress += g_journal_optimize(bioq); 1127163837Spjd while ((bp = GJQ_FIRST(bioq)) != NULL) { 1128163837Spjd GJQ_REMOVE(bioq, bp); 1129163837Spjd GJQ_INSERT_HEAD(sc->sc_copy_queue, bp); 1130163837Spjd bp->bio_cflags = GJ_BIO_COPY; 
1131163837Spjd if (bp->bio_data == NULL) 1132163837Spjd g_journal_read_first(sc, bp); 1133163837Spjd else { 1134163837Spjd bp->bio_joffset = 0; 1135163837Spjd GJ_LOGREQ(4, bp, "SEND"); 1136163837Spjd g_io_request(bp, sc->sc_dconsumer); 1137163837Spjd } 1138163837Spjd } 1139163837Spjd} 1140163837Spjd 1141163837Spjdstatic void 1142163837Spjdg_journal_copy_start(struct g_journal_softc *sc) 1143163837Spjd{ 1144163837Spjd 1145163837Spjd /* 1146163837Spjd * Remember in metadata that we're starting to copy journaled data 1147163837Spjd * to the data provider. 1148163837Spjd * In case of power failure, we will copy these data once again on boot. 1149163837Spjd */ 1150163837Spjd if (!sc->sc_journal_copying) { 1151163837Spjd sc->sc_journal_copying = 1; 1152163837Spjd GJ_DEBUG(1, "Starting copy of journal."); 1153163837Spjd g_journal_metadata_update(sc); 1154163837Spjd } 1155163837Spjd g_journal_copy_send(sc); 1156163837Spjd} 1157163837Spjd 1158163837Spjd/* 1159163837Spjd * Data block has been read from the journal provider. 1160163837Spjd */ 1161163837Spjdstatic int 1162163837Spjdg_journal_copy_read_done(struct bio *bp) 1163163837Spjd{ 1164163837Spjd struct g_journal_softc *sc; 1165163837Spjd struct g_consumer *cp; 1166163837Spjd struct bio *pbp; 1167163837Spjd 1168163837Spjd KASSERT(bp->bio_cflags == GJ_BIO_COPY, 1169163837Spjd ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY)); 1170163837Spjd 1171163837Spjd sc = bp->bio_from->geom->softc; 1172163837Spjd pbp = bp->bio_parent; 1173163837Spjd 1174163837Spjd if (bp->bio_error != 0) { 1175163837Spjd GJ_DEBUG(0, "Error while reading data from %s (error=%d).", 1176163837Spjd bp->bio_to->name, bp->bio_error); 1177163837Spjd /* 1178163837Spjd * We will not be able to deliver WRITE request as well. 
1179163837Spjd */ 1180163837Spjd gj_free(bp->bio_data, bp->bio_length); 1181163837Spjd g_destroy_bio(pbp); 1182163837Spjd g_destroy_bio(bp); 1183163837Spjd sc->sc_copy_in_progress--; 1184163837Spjd return (1); 1185163837Spjd } 1186163837Spjd pbp->bio_data = bp->bio_data; 1187163837Spjd cp = sc->sc_dconsumer; 1188163837Spjd g_io_request(pbp, cp); 1189163837Spjd GJ_LOGREQ(4, bp, "READ DONE"); 1190163837Spjd g_destroy_bio(bp); 1191163837Spjd return (0); 1192163837Spjd} 1193163837Spjd 1194163837Spjd/* 1195163837Spjd * Data block has been written to the data provider. 1196163837Spjd */ 1197163837Spjdstatic void 1198163837Spjdg_journal_copy_write_done(struct bio *bp) 1199163837Spjd{ 1200163837Spjd struct g_journal_softc *sc; 1201163837Spjd 1202163837Spjd KASSERT(bp->bio_cflags == GJ_BIO_COPY, 1203163837Spjd ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_COPY)); 1204163837Spjd 1205163837Spjd sc = bp->bio_from->geom->softc; 1206163837Spjd sc->sc_copy_in_progress--; 1207163837Spjd 1208163837Spjd if (bp->bio_error != 0) { 1209179897Slulf GJ_LOGREQ(0, bp, "[copy] Error while writing data (error=%d)", 1210163837Spjd bp->bio_error); 1211163837Spjd } 1212163837Spjd GJQ_REMOVE(sc->sc_copy_queue, bp); 1213163837Spjd gj_free(bp->bio_data, bp->bio_length); 1214163837Spjd GJ_LOGREQ(4, bp, "DONE"); 1215163837Spjd g_destroy_bio(bp); 1216163837Spjd 1217163837Spjd if (sc->sc_copy_in_progress == 0) { 1218163837Spjd /* 1219163837Spjd * This was the last write request for this journal. 1220163837Spjd */ 1221163837Spjd GJ_DEBUG(1, "Data has been copied."); 1222163837Spjd sc->sc_journal_copying = 0; 1223163837Spjd } 1224163837Spjd} 1225163837Spjd 1226163837Spjdstatic void g_journal_flush_done(struct bio *bp); 1227163837Spjd 1228163837Spjd/* 1229163837Spjd * Flush one record onto active journal provider. 
1230163837Spjd */ 1231163837Spjdstatic void 1232163837Spjdg_journal_flush(struct g_journal_softc *sc) 1233163837Spjd{ 1234163837Spjd struct g_journal_record_header hdr; 1235163837Spjd struct g_journal_entry *ent; 1236163837Spjd struct g_provider *pp; 1237163837Spjd struct bio **bioq; 1238163837Spjd struct bio *bp, *fbp, *pbp; 1239163837Spjd off_t joffset, size; 1240163837Spjd u_char *data, hash[16]; 1241163837Spjd MD5_CTX ctx; 1242163837Spjd u_int i; 1243163837Spjd 1244163837Spjd if (sc->sc_current_count == 0) 1245163837Spjd return; 1246163837Spjd 1247163837Spjd size = 0; 1248163837Spjd pp = sc->sc_jprovider; 1249163837Spjd GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc); 1250163837Spjd joffset = sc->sc_journal_offset; 1251163837Spjd 1252163837Spjd GJ_DEBUG(2, "Storing %d journal entries on %s at %jd.", 1253163837Spjd sc->sc_current_count, pp->name, (intmax_t)joffset); 1254163837Spjd 1255163837Spjd /* 1256163837Spjd * Store 'journal id', so we know to which journal this record belongs. 1257163837Spjd */ 1258163837Spjd hdr.jrh_journal_id = sc->sc_journal_id; 1259163837Spjd /* Could be less than g_journal_record_entries if called due timeout. 
*/ 1260163837Spjd hdr.jrh_nentries = MIN(sc->sc_current_count, g_journal_record_entries); 1261163837Spjd strlcpy(hdr.jrh_magic, GJ_RECORD_HEADER_MAGIC, sizeof(hdr.jrh_magic)); 1262163837Spjd 1263163837Spjd bioq = &sc->sc_active.jj_queue; 1264163837Spjd pbp = sc->sc_flush_queue; 1265163837Spjd 1266163837Spjd fbp = g_alloc_bio(); 1267163837Spjd fbp->bio_parent = NULL; 1268163837Spjd fbp->bio_cflags = GJ_BIO_JOURNAL; 1269163837Spjd fbp->bio_offset = -1; 1270163837Spjd fbp->bio_joffset = joffset; 1271163837Spjd fbp->bio_length = pp->sectorsize; 1272163837Spjd fbp->bio_cmd = BIO_WRITE; 1273163837Spjd fbp->bio_done = g_journal_std_done; 1274163837Spjd GJQ_INSERT_AFTER(sc->sc_flush_queue, fbp, pbp); 1275163837Spjd pbp = fbp; 1276163837Spjd fbp->bio_to = pp; 1277163837Spjd GJ_LOGREQ(4, fbp, "FLUSH_OUT"); 1278163837Spjd joffset += pp->sectorsize; 1279163837Spjd sc->sc_flush_count++; 1280163837Spjd if (sc->sc_flags & GJF_DEVICE_CHECKSUM) 1281163837Spjd MD5Init(&ctx); 1282163837Spjd 1283163837Spjd for (i = 0; i < hdr.jrh_nentries; i++) { 1284163837Spjd bp = sc->sc_current_queue; 1285163837Spjd KASSERT(bp != NULL, ("NULL bp")); 1286163837Spjd bp->bio_to = pp; 1287163837Spjd GJ_LOGREQ(4, bp, "FLUSHED"); 1288163837Spjd sc->sc_current_queue = bp->bio_next; 1289163837Spjd bp->bio_next = NULL; 1290163837Spjd sc->sc_current_count--; 1291163837Spjd 1292163837Spjd /* Add to the header. 
*/ 1293163837Spjd ent = &hdr.jrh_entries[i]; 1294163837Spjd ent->je_offset = bp->bio_offset; 1295163837Spjd ent->je_joffset = joffset; 1296163837Spjd ent->je_length = bp->bio_length; 1297163837Spjd size += ent->je_length; 1298163837Spjd 1299163837Spjd data = bp->bio_data; 1300163837Spjd if (sc->sc_flags & GJF_DEVICE_CHECKSUM) 1301163837Spjd MD5Update(&ctx, data, ent->je_length); 1302163837Spjd bzero(bp, sizeof(*bp)); 1303163837Spjd bp->bio_cflags = GJ_BIO_JOURNAL; 1304163837Spjd bp->bio_offset = ent->je_offset; 1305163837Spjd bp->bio_joffset = ent->je_joffset; 1306163837Spjd bp->bio_length = ent->je_length; 1307163837Spjd bp->bio_data = data; 1308163837Spjd bp->bio_cmd = BIO_WRITE; 1309163837Spjd bp->bio_done = g_journal_std_done; 1310163837Spjd GJQ_INSERT_AFTER(sc->sc_flush_queue, bp, pbp); 1311163837Spjd pbp = bp; 1312163837Spjd bp->bio_to = pp; 1313163837Spjd GJ_LOGREQ(4, bp, "FLUSH_OUT"); 1314163837Spjd joffset += bp->bio_length; 1315163837Spjd sc->sc_flush_count++; 1316163837Spjd 1317163837Spjd /* 1318163837Spjd * Add request to the active sc_journal_queue queue. 1319163837Spjd * This is our cache. After journal switch we don't have to 1320163837Spjd * read the data from the inactive journal, because we keep 1321163837Spjd * it in memory. 1322163837Spjd */ 1323163837Spjd g_journal_insert(bioq, ent->je_offset, 1324163837Spjd ent->je_offset + ent->je_length, ent->je_joffset, data, 1325163837Spjd M_NOWAIT); 1326163837Spjd } 1327163837Spjd 1328163837Spjd /* 1329163837Spjd * After all requests, store valid header. 
1330163837Spjd */ 1331163837Spjd data = gj_malloc(pp->sectorsize, M_WAITOK); 1332163837Spjd if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { 1333163837Spjd MD5Final(hash, &ctx); 1334163837Spjd bcopy(hash, hdr.jrh_sum, sizeof(hdr.jrh_sum)); 1335163837Spjd } 1336163837Spjd g_journal_record_header_encode(&hdr, data); 1337163837Spjd fbp->bio_data = data; 1338163837Spjd 1339163837Spjd sc->sc_journal_offset = joffset; 1340163837Spjd 1341163837Spjd g_journal_check_overflow(sc); 1342163837Spjd} 1343163837Spjd 1344163837Spjd/* 1345163837Spjd * Flush request finished. 1346163837Spjd */ 1347163837Spjdstatic void 1348163837Spjdg_journal_flush_done(struct bio *bp) 1349163837Spjd{ 1350163837Spjd struct g_journal_softc *sc; 1351163837Spjd struct g_consumer *cp; 1352163837Spjd 1353163837Spjd KASSERT((bp->bio_cflags & GJ_BIO_MASK) == GJ_BIO_JOURNAL, 1354163837Spjd ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_JOURNAL)); 1355163837Spjd 1356163837Spjd cp = bp->bio_from; 1357163837Spjd sc = cp->geom->softc; 1358163837Spjd sc->sc_flush_in_progress--; 1359163837Spjd 1360163837Spjd if (bp->bio_error != 0) { 1361179897Slulf GJ_LOGREQ(0, bp, "[flush] Error while writing data (error=%d)", 1362163837Spjd bp->bio_error); 1363163837Spjd } 1364163837Spjd gj_free(bp->bio_data, bp->bio_length); 1365163837Spjd GJ_LOGREQ(4, bp, "DONE"); 1366163837Spjd g_destroy_bio(bp); 1367163837Spjd} 1368163837Spjd 1369163837Spjdstatic void g_journal_release_delayed(struct g_journal_softc *sc); 1370163837Spjd 1371163837Spjdstatic void 1372163837Spjdg_journal_flush_send(struct g_journal_softc *sc) 1373163837Spjd{ 1374163837Spjd struct g_consumer *cp; 1375163837Spjd struct bio *bioq, *bp, *lbp; 1376163837Spjd 1377163837Spjd cp = sc->sc_jconsumer; 1378163837Spjd bioq = lbp = NULL; 1379163837Spjd while (sc->sc_flush_in_progress < g_journal_parallel_flushes) { 1380163837Spjd /* Send one flush requests to the active journal. 
*/ 1381163837Spjd bp = GJQ_FIRST(sc->sc_flush_queue); 1382163837Spjd if (bp != NULL) { 1383163837Spjd GJQ_REMOVE(sc->sc_flush_queue, bp); 1384163837Spjd sc->sc_flush_count--; 1385163837Spjd bp->bio_offset = bp->bio_joffset; 1386163837Spjd bp->bio_joffset = 0; 1387163837Spjd sc->sc_flush_in_progress++; 1388163837Spjd GJQ_INSERT_AFTER(bioq, bp, lbp); 1389163837Spjd lbp = bp; 1390163837Spjd } 1391163837Spjd /* Try to release delayed requests. */ 1392163837Spjd g_journal_release_delayed(sc); 1393163837Spjd /* If there are no requests to flush, leave. */ 1394163837Spjd if (GJQ_FIRST(sc->sc_flush_queue) == NULL) 1395163837Spjd break; 1396163837Spjd } 1397163837Spjd if (g_journal_do_optimize) 1398163837Spjd sc->sc_flush_in_progress += g_journal_optimize(bioq); 1399163837Spjd while ((bp = GJQ_FIRST(bioq)) != NULL) { 1400163837Spjd GJQ_REMOVE(bioq, bp); 1401163837Spjd GJ_LOGREQ(3, bp, "Flush request send"); 1402163837Spjd g_io_request(bp, cp); 1403163837Spjd } 1404163837Spjd} 1405163837Spjd 1406163837Spjdstatic void 1407163837Spjdg_journal_add_current(struct g_journal_softc *sc, struct bio *bp) 1408163837Spjd{ 1409163837Spjd int n; 1410163837Spjd 1411163837Spjd GJ_LOGREQ(4, bp, "CURRENT %d", sc->sc_current_count); 1412163837Spjd n = g_journal_insert_bio(&sc->sc_current_queue, bp, M_WAITOK); 1413163837Spjd sc->sc_current_count += n; 1414163837Spjd n = g_journal_optimize(sc->sc_current_queue); 1415163837Spjd sc->sc_current_count += n; 1416163837Spjd /* 1417163837Spjd * For requests which are added to the current queue we deliver 1418163837Spjd * response immediately. 1419163837Spjd */ 1420163837Spjd bp->bio_completed = bp->bio_length; 1421163837Spjd g_io_deliver(bp, 0); 1422163837Spjd if (sc->sc_current_count >= g_journal_record_entries) { 1423163837Spjd /* 1424163837Spjd * Let's flush one record onto active journal provider. 
1425163837Spjd */ 1426163837Spjd g_journal_flush(sc); 1427163837Spjd } 1428163837Spjd} 1429163837Spjd 1430163837Spjdstatic void 1431163837Spjdg_journal_release_delayed(struct g_journal_softc *sc) 1432163837Spjd{ 1433163837Spjd struct bio *bp; 1434163837Spjd 1435163837Spjd for (;;) { 1436163837Spjd /* The flush queue is full, exit. */ 1437163837Spjd if (sc->sc_flush_count >= g_journal_accept_immediately) 1438163837Spjd return; 1439163837Spjd bp = bioq_takefirst(&sc->sc_delayed_queue); 1440163837Spjd if (bp == NULL) 1441163837Spjd return; 1442163837Spjd sc->sc_delayed_count--; 1443163837Spjd g_journal_add_current(sc, bp); 1444163837Spjd } 1445163837Spjd} 1446163837Spjd 1447163837Spjd/* 1448163837Spjd * Add I/O request to the current queue. If we have enough requests for one 1449163837Spjd * journal record we flush them onto active journal provider. 1450163837Spjd */ 1451163837Spjdstatic void 1452163837Spjdg_journal_add_request(struct g_journal_softc *sc, struct bio *bp) 1453163837Spjd{ 1454163837Spjd 1455163837Spjd /* 1456163837Spjd * The flush queue is full, we need to delay the request. 1457163837Spjd */ 1458163837Spjd if (sc->sc_delayed_count > 0 || 1459163837Spjd sc->sc_flush_count >= g_journal_accept_immediately) { 1460163837Spjd GJ_LOGREQ(4, bp, "DELAYED"); 1461163837Spjd bioq_insert_tail(&sc->sc_delayed_queue, bp); 1462163837Spjd sc->sc_delayed_count++; 1463163837Spjd return; 1464163837Spjd } 1465163837Spjd 1466163837Spjd KASSERT(TAILQ_EMPTY(&sc->sc_delayed_queue.queue), 1467163837Spjd ("DELAYED queue not empty.")); 1468163837Spjd g_journal_add_current(sc, bp); 1469163837Spjd} 1470163837Spjd 1471163837Spjdstatic void g_journal_read_done(struct bio *bp); 1472163837Spjd 1473163837Spjd/* 1474163837Spjd * Try to find requested data in cache. 
1475163837Spjd */ 1476163837Spjdstatic struct bio * 1477163837Spjdg_journal_read_find(struct bio *head, int sorted, struct bio *pbp, off_t ostart, 1478163837Spjd off_t oend) 1479163837Spjd{ 1480163837Spjd off_t cstart, cend; 1481163837Spjd struct bio *bp; 1482163837Spjd 1483163837Spjd GJQ_FOREACH(head, bp) { 1484163837Spjd if (bp->bio_offset == -1) 1485163837Spjd continue; 1486163837Spjd cstart = MAX(ostart, bp->bio_offset); 1487163837Spjd cend = MIN(oend, bp->bio_offset + bp->bio_length); 1488163837Spjd if (cend <= ostart) 1489163837Spjd continue; 1490163837Spjd else if (cstart >= oend) { 1491163837Spjd if (!sorted) 1492163837Spjd continue; 1493163837Spjd else { 1494163837Spjd bp = NULL; 1495163837Spjd break; 1496163837Spjd } 1497163837Spjd } 1498163837Spjd if (bp->bio_data == NULL) 1499163837Spjd break; 1500163837Spjd GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend, 1501163837Spjd bp); 1502163837Spjd bcopy(bp->bio_data + cstart - bp->bio_offset, 1503163837Spjd pbp->bio_data + cstart - pbp->bio_offset, cend - cstart); 1504163837Spjd pbp->bio_completed += cend - cstart; 1505163837Spjd if (pbp->bio_completed == pbp->bio_length) { 1506163837Spjd /* 1507163837Spjd * Cool, the whole request was in cache, deliver happy 1508163837Spjd * message. 1509163837Spjd */ 1510163837Spjd g_io_deliver(pbp, 0); 1511163837Spjd return (pbp); 1512163837Spjd } 1513163837Spjd break; 1514163837Spjd } 1515163837Spjd return (bp); 1516163837Spjd} 1517163837Spjd 1518163837Spjd/* 1519163837Spjd * Try to find requested data in cache. 
1520163837Spjd */ 1521163837Spjdstatic struct bio * 1522163837Spjdg_journal_read_queue_find(struct bio_queue *head, struct bio *pbp, off_t ostart, 1523163837Spjd off_t oend) 1524163837Spjd{ 1525163837Spjd off_t cstart, cend; 1526163837Spjd struct bio *bp; 1527163837Spjd 1528163837Spjd TAILQ_FOREACH(bp, head, bio_queue) { 1529163837Spjd cstart = MAX(ostart, bp->bio_offset); 1530163837Spjd cend = MIN(oend, bp->bio_offset + bp->bio_length); 1531163837Spjd if (cend <= ostart) 1532163837Spjd continue; 1533163837Spjd else if (cstart >= oend) 1534163837Spjd continue; 1535163837Spjd KASSERT(bp->bio_data != NULL, 1536163837Spjd ("%s: bio_data == NULL", __func__)); 1537163837Spjd GJ_DEBUG(3, "READ(%p): (%jd, %jd) (bp=%p)", head, cstart, cend, 1538163837Spjd bp); 1539163837Spjd bcopy(bp->bio_data + cstart - bp->bio_offset, 1540163837Spjd pbp->bio_data + cstart - pbp->bio_offset, cend - cstart); 1541163837Spjd pbp->bio_completed += cend - cstart; 1542163837Spjd if (pbp->bio_completed == pbp->bio_length) { 1543163837Spjd /* 1544163837Spjd * Cool, the whole request was in cache, deliver happy 1545163837Spjd * message. 1546163837Spjd */ 1547163837Spjd g_io_deliver(pbp, 0); 1548163837Spjd return (pbp); 1549163837Spjd } 1550163837Spjd break; 1551163837Spjd } 1552163837Spjd return (bp); 1553163837Spjd} 1554163837Spjd 1555163837Spjd/* 1556163837Spjd * This function is used for colecting data on read. 
1557163837Spjd * The complexity is because parts of the data can be stored in four different 1558163837Spjd * places: 1559163837Spjd * - in delayed requests 1560163837Spjd * - in memory - the data not yet send to the active journal provider 1561163837Spjd * - in requests which are going to be sent to the active journal 1562163837Spjd * - in the active journal 1563163837Spjd * - in the inactive journal 1564163837Spjd * - in the data provider 1565163837Spjd */ 1566163837Spjdstatic void 1567163837Spjdg_journal_read(struct g_journal_softc *sc, struct bio *pbp, off_t ostart, 1568163837Spjd off_t oend) 1569163837Spjd{ 1570163837Spjd struct bio *bp, *nbp, *head; 1571163837Spjd off_t cstart, cend; 1572163837Spjd u_int i, sorted = 0; 1573163837Spjd 1574163837Spjd GJ_DEBUG(3, "READ: (%jd, %jd)", ostart, oend); 1575163837Spjd 1576163837Spjd cstart = cend = -1; 1577163837Spjd bp = NULL; 1578163837Spjd head = NULL; 1579163837Spjd for (i = 0; i <= 5; i++) { 1580163837Spjd switch (i) { 1581163837Spjd case 0: /* Delayed requests. */ 1582163837Spjd head = NULL; 1583163837Spjd sorted = 0; 1584163837Spjd break; 1585163837Spjd case 1: /* Not-yet-send data. */ 1586163837Spjd head = sc->sc_current_queue; 1587163837Spjd sorted = 1; 1588163837Spjd break; 1589163837Spjd case 2: /* In-flight to the active journal. */ 1590163837Spjd head = sc->sc_flush_queue; 1591163837Spjd sorted = 0; 1592163837Spjd break; 1593163837Spjd case 3: /* Active journal. */ 1594163837Spjd head = sc->sc_active.jj_queue; 1595163837Spjd sorted = 1; 1596163837Spjd break; 1597163837Spjd case 4: /* Inactive journal. */ 1598163837Spjd /* 1599163837Spjd * XXX: Here could be a race with g_journal_lowmem(). 1600163837Spjd */ 1601163837Spjd head = sc->sc_inactive.jj_queue; 1602163837Spjd sorted = 1; 1603163837Spjd break; 1604163837Spjd case 5: /* In-flight to the data provider. 
*/ 1605163837Spjd head = sc->sc_copy_queue; 1606163837Spjd sorted = 0; 1607163837Spjd break; 1608163837Spjd default: 1609163837Spjd panic("gjournal %s: i=%d", __func__, i); 1610163837Spjd } 1611163837Spjd if (i == 0) 1612163837Spjd bp = g_journal_read_queue_find(&sc->sc_delayed_queue.queue, pbp, ostart, oend); 1613163837Spjd else 1614163837Spjd bp = g_journal_read_find(head, sorted, pbp, ostart, oend); 1615163837Spjd if (bp == pbp) { /* Got the whole request. */ 1616163837Spjd GJ_DEBUG(2, "Got the whole request from %u.", i); 1617163837Spjd return; 1618163837Spjd } else if (bp != NULL) { 1619163837Spjd cstart = MAX(ostart, bp->bio_offset); 1620163837Spjd cend = MIN(oend, bp->bio_offset + bp->bio_length); 1621163837Spjd GJ_DEBUG(2, "Got part of the request from %u (%jd-%jd).", 1622163837Spjd i, (intmax_t)cstart, (intmax_t)cend); 1623163837Spjd break; 1624163837Spjd } 1625163837Spjd } 1626163837Spjd if (bp != NULL) { 1627163837Spjd if (bp->bio_data == NULL) { 1628163906Spjd nbp = g_duplicate_bio(pbp); 1629163837Spjd nbp->bio_cflags = GJ_BIO_READ; 1630163837Spjd nbp->bio_data = 1631163837Spjd pbp->bio_data + cstart - pbp->bio_offset; 1632163837Spjd nbp->bio_offset = 1633163837Spjd bp->bio_joffset + cstart - bp->bio_offset; 1634163837Spjd nbp->bio_length = cend - cstart; 1635163837Spjd nbp->bio_done = g_journal_read_done; 1636163837Spjd g_io_request(nbp, sc->sc_jconsumer); 1637163837Spjd } 1638163837Spjd /* 1639163837Spjd * If we don't have the whole request yet, call g_journal_read() 1640163837Spjd * recursively. 1641163837Spjd */ 1642163837Spjd if (ostart < cstart) 1643163837Spjd g_journal_read(sc, pbp, ostart, cstart); 1644163837Spjd if (oend > cend) 1645163837Spjd g_journal_read(sc, pbp, cend, oend); 1646163837Spjd } else { 1647163837Spjd /* 1648163837Spjd * No data in memory, no data in journal. 1649163837Spjd * Its time for asking data provider. 
1650163837Spjd */ 1651163837Spjd GJ_DEBUG(3, "READ(data): (%jd, %jd)", ostart, oend); 1652163906Spjd nbp = g_duplicate_bio(pbp); 1653163837Spjd nbp->bio_cflags = GJ_BIO_READ; 1654163837Spjd nbp->bio_data = pbp->bio_data + ostart - pbp->bio_offset; 1655163837Spjd nbp->bio_offset = ostart; 1656163837Spjd nbp->bio_length = oend - ostart; 1657163837Spjd nbp->bio_done = g_journal_read_done; 1658163837Spjd g_io_request(nbp, sc->sc_dconsumer); 1659163837Spjd /* We have the whole request, return here. */ 1660163837Spjd return; 1661163837Spjd } 1662163837Spjd} 1663163837Spjd 1664163837Spjd/* 1665163837Spjd * Function responsible for handling finished READ requests. 1666163837Spjd * Actually, g_std_done() could be used here, the only difference is that we 1667163837Spjd * log error. 1668163837Spjd */ 1669163837Spjdstatic void 1670163837Spjdg_journal_read_done(struct bio *bp) 1671163837Spjd{ 1672163837Spjd struct bio *pbp; 1673163837Spjd 1674163837Spjd KASSERT(bp->bio_cflags == GJ_BIO_READ, 1675163837Spjd ("Invalid bio (%d != %d).", bp->bio_cflags, GJ_BIO_READ)); 1676163837Spjd 1677163837Spjd pbp = bp->bio_parent; 1678163837Spjd pbp->bio_inbed++; 1679163837Spjd pbp->bio_completed += bp->bio_length; 1680163837Spjd 1681163837Spjd if (bp->bio_error != 0) { 1682163837Spjd if (pbp->bio_error == 0) 1683163837Spjd pbp->bio_error = bp->bio_error; 1684163837Spjd GJ_DEBUG(0, "Error while reading data from %s (error=%d).", 1685163837Spjd bp->bio_to->name, bp->bio_error); 1686163837Spjd } 1687163837Spjd g_destroy_bio(bp); 1688163837Spjd if (pbp->bio_children == pbp->bio_inbed && 1689163837Spjd pbp->bio_completed == pbp->bio_length) { 1690163837Spjd /* We're done. */ 1691163837Spjd g_io_deliver(pbp, 0); 1692163837Spjd } 1693163837Spjd} 1694163837Spjd 1695163837Spjd/* 1696163837Spjd * Deactive current journal and active next one. 
1697163837Spjd */ 1698163837Spjdstatic void 1699163837Spjdg_journal_switch(struct g_journal_softc *sc) 1700163837Spjd{ 1701163837Spjd struct g_provider *pp; 1702163837Spjd 1703163837Spjd if (JEMPTY(sc)) { 1704163837Spjd GJ_DEBUG(3, "No need for %s switch.", sc->sc_name); 1705163837Spjd pp = LIST_FIRST(&sc->sc_geom->provider); 1706163837Spjd if (!(sc->sc_flags & GJF_DEVICE_CLEAN) && pp->acw == 0) { 1707163837Spjd sc->sc_flags |= GJF_DEVICE_CLEAN; 1708163837Spjd GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name); 1709163837Spjd g_journal_metadata_update(sc); 1710163837Spjd } 1711163837Spjd } else { 1712163837Spjd GJ_DEBUG(3, "Switching journal %s.", sc->sc_geom->name); 1713163837Spjd 1714163837Spjd pp = sc->sc_jprovider; 1715163837Spjd 1716163837Spjd sc->sc_journal_previous_id = sc->sc_journal_id; 1717163837Spjd 1718163837Spjd sc->sc_journal_id = sc->sc_journal_next_id; 1719163837Spjd sc->sc_journal_next_id = arc4random(); 1720163837Spjd 1721163837Spjd GJ_VALIDATE_OFFSET(sc->sc_journal_offset, sc); 1722163837Spjd 1723163837Spjd g_journal_write_header(sc); 1724163837Spjd 1725163837Spjd sc->sc_inactive.jj_offset = sc->sc_active.jj_offset; 1726163837Spjd sc->sc_inactive.jj_queue = sc->sc_active.jj_queue; 1727163837Spjd 1728163837Spjd sc->sc_active.jj_offset = 1729163837Spjd sc->sc_journal_offset - pp->sectorsize; 1730163837Spjd sc->sc_active.jj_queue = NULL; 1731163837Spjd 1732163837Spjd /* 1733163837Spjd * Switch is done, start copying data from the (now) inactive 1734163837Spjd * journal to the data provider. 
1735163837Spjd */ 1736163837Spjd g_journal_copy_start(sc); 1737163837Spjd } 1738163837Spjd mtx_lock(&sc->sc_mtx); 1739163837Spjd sc->sc_flags &= ~GJF_DEVICE_SWITCH; 1740163837Spjd mtx_unlock(&sc->sc_mtx); 1741163837Spjd} 1742163837Spjd 1743163837Spjdstatic void 1744163837Spjdg_journal_initialize(struct g_journal_softc *sc) 1745163837Spjd{ 1746163837Spjd 1747163837Spjd sc->sc_journal_id = arc4random(); 1748163837Spjd sc->sc_journal_next_id = arc4random(); 1749163837Spjd sc->sc_journal_previous_id = sc->sc_journal_id; 1750163837Spjd sc->sc_journal_offset = sc->sc_jstart; 1751163837Spjd sc->sc_inactive.jj_offset = sc->sc_jstart; 1752163837Spjd g_journal_write_header(sc); 1753163837Spjd sc->sc_active.jj_offset = sc->sc_jstart; 1754163837Spjd} 1755163837Spjd 1756163837Spjdstatic void 1757163837Spjdg_journal_mark_as_dirty(struct g_journal_softc *sc) 1758163837Spjd{ 1759163837Spjd const struct g_journal_desc *desc; 1760163837Spjd int i; 1761163837Spjd 1762163837Spjd GJ_DEBUG(1, "Marking file system %s as dirty.", sc->sc_name); 1763163837Spjd for (i = 0; (desc = g_journal_filesystems[i]) != NULL; i++) 1764163837Spjd desc->jd_dirty(sc->sc_dconsumer); 1765163837Spjd} 1766163837Spjd 1767163837Spjd/* 1768163837Spjd * Function read record header from the given journal. 1769163837Spjd * It is very simlar to g_read_data(9), but it doesn't allocate memory for bio 1770163837Spjd * and data on every call. 
1771163837Spjd */ 1772163837Spjdstatic int 1773163837Spjdg_journal_sync_read(struct g_consumer *cp, struct bio *bp, off_t offset, 1774163837Spjd void *data) 1775163837Spjd{ 1776163837Spjd int error; 1777163837Spjd 1778163837Spjd bzero(bp, sizeof(*bp)); 1779163837Spjd bp->bio_cmd = BIO_READ; 1780163837Spjd bp->bio_done = NULL; 1781163837Spjd bp->bio_offset = offset; 1782163837Spjd bp->bio_length = cp->provider->sectorsize; 1783163837Spjd bp->bio_data = data; 1784163837Spjd g_io_request(bp, cp); 1785163837Spjd error = biowait(bp, "gjs_read"); 1786163837Spjd return (error); 1787163837Spjd} 1788163837Spjd 1789163837Spjd#if 0 1790163837Spjd/* 1791163837Spjd * Function is called when we start the journal device and we detect that 1792163837Spjd * one of the journals was not fully copied. 1793163837Spjd * The purpose of this function is to read all records headers from journal 1794163837Spjd * and placed them in the inactive queue, so we can start journal 1795163837Spjd * synchronization process and the journal provider itself. 1796163837Spjd * Design decision was taken to not synchronize the whole journal here as it 1797163837Spjd * can take too much time. Reading headers only and delaying synchronization 1798163837Spjd * process until after journal provider is started should be the best choice. 
1799163837Spjd */ 1800163837Spjd#endif 1801163837Spjd 1802163837Spjdstatic void 1803163837Spjdg_journal_sync(struct g_journal_softc *sc) 1804163837Spjd{ 1805163837Spjd struct g_journal_record_header rhdr; 1806163837Spjd struct g_journal_entry *ent; 1807163837Spjd struct g_journal_header jhdr; 1808163837Spjd struct g_consumer *cp; 1809163837Spjd struct bio *bp, *fbp, *tbp; 1810163837Spjd off_t joffset, offset; 1811163837Spjd u_char *buf, sum[16]; 1812163837Spjd uint64_t id; 1813163837Spjd MD5_CTX ctx; 1814163837Spjd int error, found, i; 1815163837Spjd 1816163837Spjd found = 0; 1817163837Spjd fbp = NULL; 1818163837Spjd cp = sc->sc_jconsumer; 1819163837Spjd bp = g_alloc_bio(); 1820163837Spjd buf = gj_malloc(cp->provider->sectorsize, M_WAITOK); 1821163837Spjd offset = joffset = sc->sc_inactive.jj_offset = sc->sc_journal_offset; 1822163837Spjd 1823163837Spjd GJ_DEBUG(2, "Looking for termination at %jd.", (intmax_t)joffset); 1824163837Spjd 1825163837Spjd /* 1826163837Spjd * Read and decode first journal header. 
1827163837Spjd */ 1828163837Spjd error = g_journal_sync_read(cp, bp, offset, buf); 1829163837Spjd if (error != 0) { 1830163837Spjd GJ_DEBUG(0, "Error while reading journal header from %s.", 1831163837Spjd cp->provider->name); 1832163837Spjd goto end; 1833163837Spjd } 1834163837Spjd error = g_journal_header_decode(buf, &jhdr); 1835163837Spjd if (error != 0) { 1836163837Spjd GJ_DEBUG(0, "Cannot decode journal header from %s.", 1837163837Spjd cp->provider->name); 1838163837Spjd goto end; 1839163837Spjd } 1840163837Spjd id = sc->sc_journal_id; 1841163837Spjd if (jhdr.jh_journal_id != sc->sc_journal_id) { 1842163837Spjd GJ_DEBUG(1, "Journal ID mismatch at %jd (0x%08x != 0x%08x).", 1843163837Spjd (intmax_t)offset, (u_int)jhdr.jh_journal_id, (u_int)id); 1844163837Spjd goto end; 1845163837Spjd } 1846163837Spjd offset += cp->provider->sectorsize; 1847163837Spjd id = sc->sc_journal_next_id = jhdr.jh_journal_next_id; 1848163837Spjd 1849163837Spjd for (;;) { 1850163837Spjd /* 1851163837Spjd * If the biggest record won't fit, look for a record header or 1852163837Spjd * journal header from the begining. 1853163837Spjd */ 1854163837Spjd GJ_VALIDATE_OFFSET(offset, sc); 1855163837Spjd error = g_journal_sync_read(cp, bp, offset, buf); 1856163837Spjd if (error != 0) { 1857163837Spjd /* 1858163837Spjd * Not good. Having an error while reading header 1859163837Spjd * means, that we cannot read next headers and in 1860163837Spjd * consequence we cannot find termination. 1861163837Spjd */ 1862163837Spjd GJ_DEBUG(0, 1863163837Spjd "Error while reading record header from %s.", 1864163837Spjd cp->provider->name); 1865163837Spjd break; 1866163837Spjd } 1867163837Spjd 1868163837Spjd error = g_journal_record_header_decode(buf, &rhdr); 1869163837Spjd if (error != 0) { 1870163837Spjd GJ_DEBUG(2, "Not a record header at %jd (error=%d).", 1871163837Spjd (intmax_t)offset, error); 1872163837Spjd /* 1873163837Spjd * This is not a record header. 
1874163837Spjd * If we are lucky, this is next journal header. 1875163837Spjd */ 1876163837Spjd error = g_journal_header_decode(buf, &jhdr); 1877163837Spjd if (error != 0) { 1878163837Spjd GJ_DEBUG(1, "Not a journal header at %jd (error=%d).", 1879163837Spjd (intmax_t)offset, error); 1880163837Spjd /* 1881163837Spjd * Nope, this is not journal header, which 1882163837Spjd * bascially means that journal is not 1883163837Spjd * terminated properly. 1884163837Spjd */ 1885163837Spjd error = ENOENT; 1886163837Spjd break; 1887163837Spjd } 1888163837Spjd /* 1889163837Spjd * Ok. This is header of _some_ journal. Now we need to 1890163837Spjd * verify if this is header of the _next_ journal. 1891163837Spjd */ 1892163837Spjd if (jhdr.jh_journal_id != id) { 1893163837Spjd GJ_DEBUG(1, "Journal ID mismatch at %jd " 1894163837Spjd "(0x%08x != 0x%08x).", (intmax_t)offset, 1895163837Spjd (u_int)jhdr.jh_journal_id, (u_int)id); 1896163837Spjd error = ENOENT; 1897163837Spjd break; 1898163837Spjd } 1899163837Spjd 1900163837Spjd /* Found termination. */ 1901163837Spjd found++; 1902163837Spjd GJ_DEBUG(1, "Found termination at %jd (id=0x%08x).", 1903163837Spjd (intmax_t)offset, (u_int)id); 1904163837Spjd sc->sc_active.jj_offset = offset; 1905163837Spjd sc->sc_journal_offset = 1906163837Spjd offset + cp->provider->sectorsize; 1907163837Spjd sc->sc_journal_id = id; 1908163837Spjd id = sc->sc_journal_next_id = jhdr.jh_journal_next_id; 1909163837Spjd 1910163837Spjd while ((tbp = fbp) != NULL) { 1911163837Spjd fbp = tbp->bio_next; 1912163837Spjd GJ_LOGREQ(3, tbp, "Adding request."); 1913163837Spjd g_journal_insert_bio(&sc->sc_inactive.jj_queue, 1914163837Spjd tbp, M_WAITOK); 1915163837Spjd } 1916163837Spjd 1917163837Spjd /* Skip journal's header. */ 1918163837Spjd offset += cp->provider->sectorsize; 1919163837Spjd continue; 1920163837Spjd } 1921163837Spjd 1922163837Spjd /* Skip record's header. 
*/ 1923163837Spjd offset += cp->provider->sectorsize; 1924163837Spjd 1925163837Spjd /* 1926163837Spjd * Add information about every record entry to the inactive 1927163837Spjd * queue. 1928163837Spjd */ 1929163837Spjd if (sc->sc_flags & GJF_DEVICE_CHECKSUM) 1930163837Spjd MD5Init(&ctx); 1931163837Spjd for (i = 0; i < rhdr.jrh_nentries; i++) { 1932163837Spjd ent = &rhdr.jrh_entries[i]; 1933163837Spjd GJ_DEBUG(3, "Insert entry: %jd %jd.", 1934163837Spjd (intmax_t)ent->je_offset, (intmax_t)ent->je_length); 1935163837Spjd g_journal_insert(&fbp, ent->je_offset, 1936163837Spjd ent->je_offset + ent->je_length, ent->je_joffset, 1937163837Spjd NULL, M_WAITOK); 1938163837Spjd if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { 1939163837Spjd u_char *buf2; 1940163837Spjd 1941163837Spjd /* 1942163837Spjd * TODO: Should use faster function (like 1943163837Spjd * g_journal_sync_read()). 1944163837Spjd */ 1945163837Spjd buf2 = g_read_data(cp, offset, ent->je_length, 1946163837Spjd NULL); 1947163837Spjd if (buf2 == NULL) 1948163837Spjd GJ_DEBUG(0, "Cannot read data at %jd.", 1949163837Spjd (intmax_t)offset); 1950163837Spjd else { 1951163837Spjd MD5Update(&ctx, buf2, ent->je_length); 1952163837Spjd g_free(buf2); 1953163837Spjd } 1954163837Spjd } 1955163837Spjd /* Skip entry's data. */ 1956163837Spjd offset += ent->je_length; 1957163837Spjd } 1958163837Spjd if (sc->sc_flags & GJF_DEVICE_CHECKSUM) { 1959163837Spjd MD5Final(sum, &ctx); 1960163837Spjd if (bcmp(sum, rhdr.jrh_sum, sizeof(rhdr.jrh_sum)) != 0) { 1961163837Spjd GJ_DEBUG(0, "MD5 hash mismatch at %jd!", 1962163837Spjd (intmax_t)offset); 1963163837Spjd } 1964163837Spjd } 1965163837Spjd } 1966163837Spjdend: 1967163837Spjd gj_free(bp->bio_data, cp->provider->sectorsize); 1968163837Spjd g_destroy_bio(bp); 1969163837Spjd 1970163837Spjd /* Remove bios from unterminated journal. 
*/ 1971163837Spjd while ((tbp = fbp) != NULL) { 1972163837Spjd fbp = tbp->bio_next; 1973163837Spjd g_destroy_bio(tbp); 1974163837Spjd } 1975163837Spjd 1976163837Spjd if (found < 1 && joffset > 0) { 1977163837Spjd GJ_DEBUG(0, "Journal on %s is broken/corrupted. Initializing.", 1978163837Spjd sc->sc_name); 1979163837Spjd while ((tbp = sc->sc_inactive.jj_queue) != NULL) { 1980163837Spjd sc->sc_inactive.jj_queue = tbp->bio_next; 1981163837Spjd g_destroy_bio(tbp); 1982163837Spjd } 1983163837Spjd g_journal_initialize(sc); 1984163837Spjd g_journal_mark_as_dirty(sc); 1985163837Spjd } else { 1986163837Spjd GJ_DEBUG(0, "Journal %s consistent.", sc->sc_name); 1987163837Spjd g_journal_copy_start(sc); 1988163837Spjd } 1989163837Spjd} 1990163837Spjd 1991163837Spjd/* 1992163837Spjd * Wait for requests. 1993163837Spjd * If we have requests in the current queue, flush them after 3 seconds from the 1994163837Spjd * last flush. In this way we don't wait forever (or for journal switch) with 1995163837Spjd * storing not full records on journal. 1996163837Spjd */ 1997163837Spjdstatic void 1998163837Spjdg_journal_wait(struct g_journal_softc *sc, time_t last_write) 1999163837Spjd{ 2000163837Spjd int error, timeout; 2001163837Spjd 2002163837Spjd GJ_DEBUG(3, "%s: enter", __func__); 2003163837Spjd if (sc->sc_current_count == 0) { 2004163837Spjd if (g_journal_debug < 2) 2005163837Spjd msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", 0); 2006163837Spjd else { 2007163837Spjd /* 2008163837Spjd * If we have debug turned on, show number of elements 2009163837Spjd * in various queues. 
2010163837Spjd */ 2011163837Spjd for (;;) { 2012163837Spjd error = msleep(sc, &sc->sc_mtx, PRIBIO, 2013163837Spjd "gj:work", hz * 3); 2014163837Spjd if (error == 0) { 2015163837Spjd mtx_unlock(&sc->sc_mtx); 2016163837Spjd break; 2017163837Spjd } 2018163837Spjd GJ_DEBUG(3, "Report: current count=%d", 2019163837Spjd sc->sc_current_count); 2020163837Spjd GJ_DEBUG(3, "Report: flush count=%d", 2021163837Spjd sc->sc_flush_count); 2022163837Spjd GJ_DEBUG(3, "Report: flush in progress=%d", 2023163837Spjd sc->sc_flush_in_progress); 2024163837Spjd GJ_DEBUG(3, "Report: copy in progress=%d", 2025163837Spjd sc->sc_copy_in_progress); 2026163837Spjd GJ_DEBUG(3, "Report: delayed=%d", 2027163837Spjd sc->sc_delayed_count); 2028163837Spjd } 2029163837Spjd } 2030163837Spjd GJ_DEBUG(3, "%s: exit 1", __func__); 2031163837Spjd return; 2032163837Spjd } 2033163837Spjd 2034163837Spjd /* 2035163837Spjd * Flush even not full records every 3 seconds. 2036163837Spjd */ 2037163837Spjd timeout = (last_write + 3 - time_second) * hz; 2038163837Spjd if (timeout <= 0) { 2039163837Spjd mtx_unlock(&sc->sc_mtx); 2040163837Spjd g_journal_flush(sc); 2041163837Spjd g_journal_flush_send(sc); 2042163837Spjd GJ_DEBUG(3, "%s: exit 2", __func__); 2043163837Spjd return; 2044163837Spjd } 2045163837Spjd error = msleep(sc, &sc->sc_mtx, PRIBIO | PDROP, "gj:work", timeout); 2046163837Spjd if (error == EWOULDBLOCK) 2047163837Spjd g_journal_flush_send(sc); 2048163837Spjd GJ_DEBUG(3, "%s: exit 3", __func__); 2049163837Spjd} 2050163837Spjd 2051163837Spjd/* 2052163837Spjd * Worker thread. 
2053163837Spjd */ 2054163837Spjdstatic void 2055163837Spjdg_journal_worker(void *arg) 2056163837Spjd{ 2057163837Spjd struct g_journal_softc *sc; 2058163837Spjd struct g_geom *gp; 2059163837Spjd struct g_provider *pp; 2060163837Spjd struct bio *bp; 2061163837Spjd time_t last_write; 2062163837Spjd int type; 2063163837Spjd 2064170307Sjeff thread_lock(curthread); 2065163837Spjd sched_prio(curthread, PRIBIO); 2066170307Sjeff thread_unlock(curthread); 2067163837Spjd 2068163837Spjd sc = arg; 2069163894Spjd type = 0; /* gcc */ 2070163837Spjd 2071163837Spjd if (sc->sc_flags & GJF_DEVICE_CLEAN) { 2072163837Spjd GJ_DEBUG(0, "Journal %s clean.", sc->sc_name); 2073163837Spjd g_journal_initialize(sc); 2074163837Spjd } else { 2075163837Spjd g_journal_sync(sc); 2076163837Spjd } 2077163837Spjd /* 2078163837Spjd * Check if we can use BIO_FLUSH. 2079163837Spjd */ 2080163837Spjd sc->sc_bio_flush = 0; 2081163837Spjd if (g_io_flush(sc->sc_jconsumer) == 0) { 2082163837Spjd sc->sc_bio_flush |= GJ_FLUSH_JOURNAL; 2083163837Spjd GJ_DEBUG(1, "BIO_FLUSH supported by %s.", 2084163837Spjd sc->sc_jconsumer->provider->name); 2085163837Spjd } else { 2086163837Spjd GJ_DEBUG(0, "BIO_FLUSH not supported by %s.", 2087163837Spjd sc->sc_jconsumer->provider->name); 2088163837Spjd } 2089163837Spjd if (sc->sc_jconsumer != sc->sc_dconsumer) { 2090163837Spjd if (g_io_flush(sc->sc_dconsumer) == 0) { 2091163837Spjd sc->sc_bio_flush |= GJ_FLUSH_DATA; 2092163837Spjd GJ_DEBUG(1, "BIO_FLUSH supported by %s.", 2093163837Spjd sc->sc_dconsumer->provider->name); 2094163837Spjd } else { 2095163837Spjd GJ_DEBUG(0, "BIO_FLUSH not supported by %s.", 2096163837Spjd sc->sc_dconsumer->provider->name); 2097163837Spjd } 2098163837Spjd } 2099163837Spjd 2100163837Spjd gp = sc->sc_geom; 2101163837Spjd g_topology_lock(); 2102163837Spjd pp = g_new_providerf(gp, "%s.journal", sc->sc_name); 2103163837Spjd pp->mediasize = sc->sc_mediasize; 2104163837Spjd /* 2105163837Spjd * There could be a problem when data provider and journal 
providers 2106163837Spjd * have different sectorsize, but such scenario is prevented on journal 2107163837Spjd * creation. 2108163837Spjd */ 2109163837Spjd pp->sectorsize = sc->sc_sectorsize; 2110163837Spjd g_error_provider(pp, 0); 2111163837Spjd g_topology_unlock(); 2112163837Spjd last_write = time_second; 2113163837Spjd 2114185693Strasz if (sc->sc_rootmount != NULL) { 2115185693Strasz GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount); 2116185693Strasz root_mount_rel(sc->sc_rootmount); 2117185693Strasz sc->sc_rootmount = NULL; 2118185693Strasz } 2119185693Strasz 2120163837Spjd for (;;) { 2121163837Spjd /* Get first request from the queue. */ 2122163837Spjd mtx_lock(&sc->sc_mtx); 2123163837Spjd bp = bioq_first(&sc->sc_back_queue); 2124163837Spjd if (bp != NULL) 2125163837Spjd type = (bp->bio_cflags & GJ_BIO_MASK); 2126163837Spjd if (bp == NULL) { 2127163837Spjd bp = bioq_first(&sc->sc_regular_queue); 2128163837Spjd if (bp != NULL) 2129163837Spjd type = GJ_BIO_REGULAR; 2130163837Spjd } 2131163837Spjd if (bp == NULL) { 2132163837Spjdtry_switch: 2133163837Spjd if ((sc->sc_flags & GJF_DEVICE_SWITCH) || 2134163837Spjd (sc->sc_flags & GJF_DEVICE_DESTROY)) { 2135163837Spjd if (sc->sc_current_count > 0) { 2136163837Spjd mtx_unlock(&sc->sc_mtx); 2137163837Spjd g_journal_flush(sc); 2138163837Spjd g_journal_flush_send(sc); 2139163837Spjd continue; 2140163837Spjd } 2141163837Spjd if (sc->sc_flush_in_progress > 0) 2142163837Spjd goto sleep; 2143163837Spjd if (sc->sc_copy_in_progress > 0) 2144163837Spjd goto sleep; 2145163837Spjd } 2146163837Spjd if (sc->sc_flags & GJF_DEVICE_SWITCH) { 2147163837Spjd mtx_unlock(&sc->sc_mtx); 2148163837Spjd g_journal_switch(sc); 2149163837Spjd wakeup(&sc->sc_journal_copying); 2150163837Spjd continue; 2151163837Spjd } 2152163837Spjd if (sc->sc_flags & GJF_DEVICE_DESTROY) { 2153163837Spjd GJ_DEBUG(1, "Shutting down worker " 2154163837Spjd "thread for %s.", gp->name); 2155163837Spjd sc->sc_worker = NULL; 2156163837Spjd wakeup(&sc->sc_worker); 
2157163837Spjd mtx_unlock(&sc->sc_mtx); 2158172836Sjulian kproc_exit(0); 2159163837Spjd } 2160163837Spjdsleep: 2161163837Spjd g_journal_wait(sc, last_write); 2162163837Spjd continue; 2163163837Spjd } 2164163837Spjd /* 2165163837Spjd * If we're in switch process, we need to delay all new 2166163837Spjd * write requests until its done. 2167163837Spjd */ 2168163837Spjd if ((sc->sc_flags & GJF_DEVICE_SWITCH) && 2169163837Spjd type == GJ_BIO_REGULAR && bp->bio_cmd == BIO_WRITE) { 2170163837Spjd GJ_LOGREQ(2, bp, "WRITE on SWITCH"); 2171163837Spjd goto try_switch; 2172163837Spjd } 2173163837Spjd if (type == GJ_BIO_REGULAR) 2174163837Spjd bioq_remove(&sc->sc_regular_queue, bp); 2175163837Spjd else 2176163837Spjd bioq_remove(&sc->sc_back_queue, bp); 2177163837Spjd mtx_unlock(&sc->sc_mtx); 2178163837Spjd switch (type) { 2179163837Spjd case GJ_BIO_REGULAR: 2180163837Spjd /* Regular request. */ 2181163837Spjd switch (bp->bio_cmd) { 2182163837Spjd case BIO_READ: 2183163837Spjd g_journal_read(sc, bp, bp->bio_offset, 2184163837Spjd bp->bio_offset + bp->bio_length); 2185163837Spjd break; 2186163837Spjd case BIO_WRITE: 2187163837Spjd last_write = time_second; 2188163837Spjd g_journal_add_request(sc, bp); 2189163837Spjd g_journal_flush_send(sc); 2190163837Spjd break; 2191163837Spjd default: 2192163837Spjd panic("Invalid bio_cmd (%d).", bp->bio_cmd); 2193163837Spjd } 2194163837Spjd break; 2195163837Spjd case GJ_BIO_COPY: 2196163837Spjd switch (bp->bio_cmd) { 2197163837Spjd case BIO_READ: 2198163837Spjd if (g_journal_copy_read_done(bp)) 2199163837Spjd g_journal_copy_send(sc); 2200163837Spjd break; 2201163837Spjd case BIO_WRITE: 2202163837Spjd g_journal_copy_write_done(bp); 2203163837Spjd g_journal_copy_send(sc); 2204163837Spjd break; 2205163837Spjd default: 2206163837Spjd panic("Invalid bio_cmd (%d).", bp->bio_cmd); 2207163837Spjd } 2208163837Spjd break; 2209163837Spjd case GJ_BIO_JOURNAL: 2210163837Spjd g_journal_flush_done(bp); 2211163837Spjd g_journal_flush_send(sc); 2212163837Spjd 
break; 2213163837Spjd case GJ_BIO_READ: 2214163837Spjd default: 2215163837Spjd panic("Invalid bio (%d).", type); 2216163837Spjd } 2217163837Spjd } 2218163837Spjd} 2219163837Spjd 2220163837Spjdstatic void 2221163837Spjdg_journal_destroy_event(void *arg, int flags __unused) 2222163837Spjd{ 2223163837Spjd struct g_journal_softc *sc; 2224163837Spjd 2225163837Spjd g_topology_assert(); 2226163837Spjd sc = arg; 2227163837Spjd g_journal_destroy(sc); 2228163837Spjd} 2229163837Spjd 2230163837Spjdstatic void 2231163837Spjdg_journal_timeout(void *arg) 2232163837Spjd{ 2233163837Spjd struct g_journal_softc *sc; 2234163837Spjd 2235163837Spjd sc = arg; 2236163837Spjd GJ_DEBUG(0, "Timeout. Journal %s cannot be completed.", 2237163837Spjd sc->sc_geom->name); 2238163837Spjd g_post_event(g_journal_destroy_event, sc, M_NOWAIT, NULL); 2239163837Spjd} 2240163837Spjd 2241163837Spjdstatic struct g_geom * 2242163837Spjdg_journal_create(struct g_class *mp, struct g_provider *pp, 2243163837Spjd const struct g_journal_metadata *md) 2244163837Spjd{ 2245163837Spjd struct g_journal_softc *sc; 2246163837Spjd struct g_geom *gp; 2247163837Spjd struct g_consumer *cp; 2248163837Spjd int error; 2249163837Spjd 2250163912Spjd sc = NULL; /* gcc */ 2251163912Spjd 2252163837Spjd g_topology_assert(); 2253163837Spjd /* 2254163837Spjd * There are two possibilities: 2255163837Spjd * 1. Data and both journals are on the same provider. 2256163837Spjd * 2. Data and journals are all on separated providers. 2257163837Spjd */ 2258163837Spjd /* Look for journal device with the same ID. 
*/ 2259163837Spjd LIST_FOREACH(gp, &mp->geom, geom) { 2260163837Spjd sc = gp->softc; 2261163837Spjd if (sc == NULL) 2262163837Spjd continue; 2263163837Spjd if (sc->sc_id == md->md_id) 2264163837Spjd break; 2265163837Spjd } 2266163837Spjd if (gp == NULL) 2267163837Spjd sc = NULL; 2268163837Spjd else if (sc != NULL && (sc->sc_type & md->md_type) != 0) { 2269163837Spjd GJ_DEBUG(1, "Journal device %u already configured.", sc->sc_id); 2270163837Spjd return (NULL); 2271163837Spjd } 2272163837Spjd if (md->md_type == 0 || (md->md_type & ~GJ_TYPE_COMPLETE) != 0) { 2273163837Spjd GJ_DEBUG(0, "Invalid type on %s.", pp->name); 2274163837Spjd return (NULL); 2275163837Spjd } 2276163837Spjd if (md->md_type & GJ_TYPE_DATA) { 2277163837Spjd GJ_DEBUG(0, "Journal %u: %s contains data.", md->md_id, 2278163837Spjd pp->name); 2279163837Spjd } 2280163837Spjd if (md->md_type & GJ_TYPE_JOURNAL) { 2281163837Spjd GJ_DEBUG(0, "Journal %u: %s contains journal.", md->md_id, 2282163837Spjd pp->name); 2283163837Spjd } 2284163837Spjd 2285163837Spjd if (sc == NULL) { 2286163837Spjd /* Action geom. 
*/ 2287163837Spjd sc = malloc(sizeof(*sc), M_JOURNAL, M_WAITOK | M_ZERO); 2288163837Spjd sc->sc_id = md->md_id; 2289163837Spjd sc->sc_type = 0; 2290163837Spjd sc->sc_flags = 0; 2291163837Spjd sc->sc_worker = NULL; 2292163837Spjd 2293163837Spjd gp = g_new_geomf(mp, "gjournal %u", sc->sc_id); 2294163837Spjd gp->start = g_journal_start; 2295163837Spjd gp->orphan = g_journal_orphan; 2296163837Spjd gp->access = g_journal_access; 2297163837Spjd gp->softc = sc; 2298195195Strasz gp->flags |= G_GEOM_VOLATILE_BIO; 2299163837Spjd sc->sc_geom = gp; 2300163837Spjd 2301163837Spjd mtx_init(&sc->sc_mtx, "gjournal", NULL, MTX_DEF); 2302163837Spjd 2303163837Spjd bioq_init(&sc->sc_back_queue); 2304163837Spjd bioq_init(&sc->sc_regular_queue); 2305163837Spjd bioq_init(&sc->sc_delayed_queue); 2306163837Spjd sc->sc_delayed_count = 0; 2307163837Spjd sc->sc_current_queue = NULL; 2308163837Spjd sc->sc_current_count = 0; 2309163837Spjd sc->sc_flush_queue = NULL; 2310163837Spjd sc->sc_flush_count = 0; 2311163837Spjd sc->sc_flush_in_progress = 0; 2312163837Spjd sc->sc_copy_queue = NULL; 2313163837Spjd sc->sc_copy_in_progress = 0; 2314163837Spjd sc->sc_inactive.jj_queue = NULL; 2315163837Spjd sc->sc_active.jj_queue = NULL; 2316163837Spjd 2317190878Sthompsa sc->sc_rootmount = root_mount_hold("GJOURNAL"); 2318185693Strasz GJ_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); 2319185693Strasz 2320163837Spjd callout_init(&sc->sc_callout, CALLOUT_MPSAFE); 2321163837Spjd if (md->md_type != GJ_TYPE_COMPLETE) { 2322163837Spjd /* 2323163837Spjd * Journal and data are on separate providers. 2324163837Spjd * At this point we have only one of them. 2325163837Spjd * We setup a timeout in case the other part will not 2326163837Spjd * appear, so we won't wait forever. 2327163837Spjd */ 2328163837Spjd callout_reset(&sc->sc_callout, 5 * hz, 2329163837Spjd g_journal_timeout, sc); 2330163837Spjd } 2331163837Spjd } 2332163837Spjd 2333163837Spjd /* Remember type of the data provider. 
*/ 2334163837Spjd if (md->md_type & GJ_TYPE_DATA) 2335163837Spjd sc->sc_orig_type = md->md_type; 2336163837Spjd sc->sc_type |= md->md_type; 2337163837Spjd cp = NULL; 2338163837Spjd 2339163837Spjd if (md->md_type & GJ_TYPE_DATA) { 2340163837Spjd if (md->md_flags & GJ_FLAG_CLEAN) 2341163837Spjd sc->sc_flags |= GJF_DEVICE_CLEAN; 2342163837Spjd if (md->md_flags & GJ_FLAG_CHECKSUM) 2343163837Spjd sc->sc_flags |= GJF_DEVICE_CHECKSUM; 2344163837Spjd cp = g_new_consumer(gp); 2345163837Spjd error = g_attach(cp, pp); 2346163837Spjd KASSERT(error == 0, ("Cannot attach to %s (error=%d).", 2347163837Spjd pp->name, error)); 2348163837Spjd error = g_access(cp, 1, 1, 1); 2349163837Spjd if (error != 0) { 2350163837Spjd GJ_DEBUG(0, "Cannot access %s (error=%d).", pp->name, 2351163837Spjd error); 2352163837Spjd g_journal_destroy(sc); 2353163837Spjd return (NULL); 2354163837Spjd } 2355163837Spjd sc->sc_dconsumer = cp; 2356163837Spjd sc->sc_mediasize = pp->mediasize - pp->sectorsize; 2357163837Spjd sc->sc_sectorsize = pp->sectorsize; 2358163837Spjd sc->sc_jstart = md->md_jstart; 2359163837Spjd sc->sc_jend = md->md_jend; 2360163837Spjd if (md->md_provider[0] != '\0') 2361163837Spjd sc->sc_flags |= GJF_DEVICE_HARDCODED; 2362163837Spjd sc->sc_journal_offset = md->md_joffset; 2363163837Spjd sc->sc_journal_id = md->md_jid; 2364163837Spjd sc->sc_journal_previous_id = md->md_jid; 2365163837Spjd } 2366163837Spjd if (md->md_type & GJ_TYPE_JOURNAL) { 2367163837Spjd if (cp == NULL) { 2368163837Spjd cp = g_new_consumer(gp); 2369163837Spjd error = g_attach(cp, pp); 2370163837Spjd KASSERT(error == 0, ("Cannot attach to %s (error=%d).", 2371163837Spjd pp->name, error)); 2372163837Spjd error = g_access(cp, 1, 1, 1); 2373163837Spjd if (error != 0) { 2374163837Spjd GJ_DEBUG(0, "Cannot access %s (error=%d).", 2375163837Spjd pp->name, error); 2376163837Spjd g_journal_destroy(sc); 2377163837Spjd return (NULL); 2378163837Spjd } 2379163837Spjd } else { 2380163837Spjd /* 2381163837Spjd * Journal is on the 
 * same provider as data, which means
			 * that data provider ends where journal starts.
			 */
			sc->sc_mediasize = md->md_jstart;
		}
		sc->sc_jconsumer = cp;
	}

	if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE) {
		/* Journal is not complete yet. */
		return (gp);
	} else {
		/* Journal complete, cancel timeout. */
		callout_drain(&sc->sc_callout);
	}

	/*
	 * Both data and journal parts are present, so start the worker
	 * thread that services this device.
	 */
	error = kproc_create(g_journal_worker, sc, &sc->sc_worker, 0, 0,
	    "g_journal %s", sc->sc_name);
	if (error != 0) {
		GJ_DEBUG(0, "Cannot create worker thread for %s.journal.",
		    sc->sc_name);
		g_journal_destroy(sc);
		return (NULL);
	}

	return (gp);
}

/*
 * Event-queue callback: detach and destroy one consumer.
 * Run from the GEOM event queue (see the comment in g_journal_destroy())
 * so that destroying the consumer does not re-trigger tasting.
 */
static void
g_journal_destroy_consumer(void *arg, int flags __unused)
{
	struct g_consumer *cp;

	g_topology_assert();
	cp = arg;
	g_detach(cp);
	g_destroy_consumer(cp);
}

/*
 * Tear down a journal device: flush and switch the journal one last time,
 * wait for the worker thread to exit, mark the on-disk metadata clean and
 * wither the geom.
 *
 * Returns 0 on success, ENXIO if sc is NULL, or EBUSY if the provider is
 * still open.
 */
static int
g_journal_destroy(struct g_journal_softc *sc)
{
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *cp;

	g_topology_assert();

	if (sc == NULL)
		return (ENXIO);

	gp = sc->sc_geom;
	pp = LIST_FIRST(&gp->provider);
	if (pp != NULL) {
		/* Refuse to destroy a device somebody still has open. */
		if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0) {
			GJ_DEBUG(1, "Device %s is still open (r%dw%de%d).",
			    pp->name, pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		}
		g_error_provider(pp, ENXIO);

		/* Push everything still cached out to the journal and switch. */
		g_journal_flush(sc);
		g_journal_flush_send(sc);
		g_journal_switch(sc);
	}

	sc->sc_flags |= (GJF_DEVICE_DESTROY | GJF_DEVICE_CLEAN);

	g_topology_unlock();

	if (sc->sc_rootmount != NULL) {
		GJ_DEBUG(1, "root_mount_rel %p", sc->sc_rootmount);
		root_mount_rel(sc->sc_rootmount);
		sc->sc_rootmount = NULL;
	}

	/* Wake the worker thread and wait for it to signal its exit. */
	callout_drain(&sc->sc_callout);
	mtx_lock(&sc->sc_mtx);
	wakeup(sc);
	while (sc->sc_worker != NULL)
		msleep(&sc->sc_worker, &sc->sc_mtx, PRIBIO, "gj:destroy", 0);
	mtx_unlock(&sc->sc_mtx);

	if (pp != NULL) {
		GJ_DEBUG(1, "Marking %s as clean.", sc->sc_name);
		g_journal_metadata_update(sc);
		g_topology_lock();
		pp->flags |= G_PF_WITHER;
		g_orphan_provider(pp, ENXIO);
	} else {
		g_topology_lock();
	}
	mtx_destroy(&sc->sc_mtx);

	if (sc->sc_current_count != 0) {
		GJ_DEBUG(0, "Warning! Number of current requests %d.",
		    sc->sc_current_count);
	}

	LIST_FOREACH(cp, &gp->consumer, consumer) {
		if (cp->acr + cp->acw + cp->ace > 0)
			g_access(cp, -1, -1, -1);
		/*
		 * We keep all consumers open for writing, so if we detached
		 * and destroyed the consumer here, the underlying providers
		 * would be offered for tasting again and the journal would
		 * be restarted.  Destroying the consumer from an event
		 * prevents this from happening.
		 */
		g_post_event(g_journal_destroy_consumer, cp, M_WAITOK, NULL);
	}
	gp->softc = NULL;
	g_wither_geom(gp, ENXIO);
	free(sc, M_JOURNAL);
	return (0);
}

/*
 * Orphan method for the short-lived geom used while tasting only; it
 * should never be called, hence the unconditional assertion.
 */
static void
g_journal_taste_orphan(struct g_consumer *cp)
{

	KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
	    cp->provider->name));
}

/*
 * Taste method: read journal metadata from the provider and, when it is
 * valid and matches this provider, create (or complete) a journal geom.
 * Returns the geom on success, NULL otherwise.
 */
static struct g_geom *
g_journal_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_journal_metadata md;
	struct g_consumer *cp;
	struct g_geom *gp;
	int error;

	g_topology_assert();
	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	GJ_DEBUG(2, "Tasting %s.", pp->name);
	/* Skip providers of our own class. */
	if (pp->geom->class == mp)
		return (NULL);

	/* Temporary geom/consumer, used only to read the metadata. */
	gp = g_new_geomf(mp, "journal:taste");
	/* This orphan function should never be called. */
	gp->orphan = g_journal_taste_orphan;
	cp = g_new_consumer(gp);
	g_attach(cp, pp);
	error = g_journal_metadata_read(cp, &md);
	g_detach(cp);
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	if (error != 0)
		return (NULL);
	gp = NULL;

	/* Reject metadata written for a differently-named provider. */
	if (md.md_provider[0] != '\0' &&
	    !g_compare_names(md.md_provider, pp->name))
		return (NULL);
	/* Reject metadata if the provider size changed. */
	if (md.md_provsize != 0 && md.md_provsize != pp->mediasize)
		return (NULL);
	if (g_journal_debug >= 2)
		journal_metadata_dump(&md);

	gp = g_journal_create(mp, pp, &md);
	return (gp);
}

/*
 * Look up a complete, not-being-destroyed journal device by name.
 * A "/dev/" prefix is accepted and stripped; both the journal name and
 * the provider name are matched.  Returns NULL when nothing matches.
 */
static struct g_journal_softc *
g_journal_find_device(struct g_class *mp, const char *name)
{
	struct g_journal_softc *sc;
	struct g_geom *gp;
	struct g_provider *pp;

	if (strncmp(name, "/dev/", 5) == 0)
		name += 5;
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		if (sc->sc_flags & GJF_DEVICE_DESTROY)
			continue;
		if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
			continue;
		pp = LIST_FIRST(&gp->provider);
		if (strcmp(sc->sc_name, name) == 0)
			return (sc);
		if (pp != NULL && strcmp(pp->name, name) == 0)
			return (sc);
	}
	return (NULL);
}

/*
 * Handle the "destroy"/"stop" control verbs: destroy every device named
 * in the request arguments, reporting errors through the gctl request.
 */
static void
g_journal_ctl_destroy(struct gctl_req *req, struct g_class *mp)
{
	struct g_journal_softc *sc;
	const char *name;
	char param[16];
	int *nargs;
	int error, i;

	g_topology_assert();

	nargs = gctl_get_paraml(req, "nargs", sizeof(*nargs));
	if (nargs == NULL) {
		gctl_error(req, "No '%s' argument.", "nargs");
		return;
	}
	if (*nargs <= 0) {
		gctl_error(req, "Missing device(s).");
		return;
	}

	/* Arguments are passed as "arg0", "arg1", ... */
	for (i = 0; i < *nargs; i++) {
		snprintf(param, sizeof(param), "arg%d", i);
		name = gctl_get_asciiparam(req, param);
		if (name == NULL) {
			gctl_error(req, "No 'arg%d' argument.", i);
			return;
		}
		sc = g_journal_find_device(mp, name);
		if (sc == NULL) {
			gctl_error(req, "No such device: %s.", name);
			return;
		}
		error = g_journal_destroy(sc);
		if (error != 0) {
			gctl_error(req, "Cannot destroy device %s (error=%d).",
			    LIST_FIRST(&sc->sc_geom->provider)->name, error);
			return;
		}
	}
}

/*
 * Handle the "sync" control verb: request a journal switch from the
 * switcher thread and wait (polling every hz/2 ticks) until it reports
 * completion by zeroing g_journal_sync_requested.
 */
static void
g_journal_ctl_sync(struct gctl_req *req __unused, struct g_class *mp __unused)
{

	g_topology_assert();
	/* Drop the topology lock while sleeping on the switcher. */
	g_topology_unlock();
	g_journal_sync_requested++;
	wakeup(&g_journal_switcher_state);
	while (g_journal_sync_requested > 0)
		tsleep(&g_journal_sync_requested, PRIBIO, "j:sreq", hz / 2);
	g_topology_lock();
}

/*
 * Control-request entry point: validate the userland/kernel interface
 * version and dispatch on the verb ("destroy"/"stop" or "sync").
 */
static void
g_journal_config(struct gctl_req *req, struct g_class *mp, const char *verb)
{
	uint32_t *version;

	g_topology_assert();

	version = gctl_get_paraml(req, "version", sizeof(*version));
	if (version == NULL) {
		gctl_error(req, "No '%s' argument.", "version");
		return;
	}
	if (*version != G_JOURNAL_VERSION) {
		gctl_error(req, "Userland and kernel parts are out of sync.");
		return;
	}

	if (strcmp(verb, "destroy") == 0 || strcmp(verb, "stop") == 0) {
		g_journal_ctl_destroy(req, mp);
		return;
	} else if (strcmp(verb, "sync") == 0) {
		g_journal_ctl_sync(req, mp);
		return;
	}

	gctl_error(req, "Unknown verb.");
}

/*
 * Dumpconf method: emit per-consumer role (Data/Journal) and journal
 * extents, or the per-geom ID, into the XML confdump sbuf.
 */
static void
g_journal_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_journal_softc *sc;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	if (pp != NULL) {
		/* Nothing here. */
	} else if (cp != NULL) {
		int first = 1;

		/* A consumer may serve as both data and journal provider. */
		sbuf_printf(sb, "%s<Role>", indent);
		if (cp == sc->sc_dconsumer) {
			sbuf_printf(sb, "Data");
			first = 0;
		}
		if (cp == sc->sc_jconsumer) {
			if (!first)
				sbuf_printf(sb, ",");
			sbuf_printf(sb, "Journal");
		}
		sbuf_printf(sb, "</Role>\n");
		if (cp == sc->sc_jconsumer) {
			sbuf_printf(sb, "<Jstart>%jd</Jstart>\n",
			    (intmax_t)sc->sc_jstart);
			sbuf_printf(sb, "<Jend>%jd</Jend>\n",
			    (intmax_t)sc->sc_jend);
		}
	} else {
		sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id);
	}
}

/* Tags for the event handlers registered in g_journal_init(). */
static eventhandler_tag g_journal_event_shutdown = NULL;
static eventhandler_tag g_journal_event_lowmem = NULL;

/*
 * shutdown_post_sync event handler: destroy every journal geom of this
 * class so the journals are marked clean before the system goes down.
 * Does nothing when the system is panicking.
 */
static void
g_journal_shutdown(void *arg, int howto __unused)
{
	struct g_class *mp;
	struct g_geom *gp, *gp2;

	if (panicstr != NULL)
		return;
	mp = arg;
	DROP_GIANT();
	g_topology_lock();
	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
		if (gp->softc == NULL)
			continue;
		GJ_DEBUG(0, "Shutting down geom %s.", gp->name);
		g_journal_destroy(gp->softc);
	}
	g_topology_unlock();
	PICKUP_GIANT();
}

/*
 * Free cached requests from inactive queue in case of low memory.
 * We free GJ_FREE_AT_ONCE elements at once.
 */
#define	GJ_FREE_AT_ONCE	4
static void
g_journal_lowmem(void *arg, int howto __unused)
{
	struct g_journal_softc *sc;
	struct g_class *mp;
	struct g_geom *gp;
	struct bio *bp;
	u_int nfree = GJ_FREE_AT_ONCE;

	g_journal_stats_low_mem++;
	mp = arg;
	DROP_GIANT();
	g_topology_lock();
	LIST_FOREACH(gp, &mp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL || (sc->sc_flags & GJF_DEVICE_DESTROY))
			continue;
		mtx_lock(&sc->sc_mtx);
		for (bp = sc->sc_inactive.jj_queue; nfree > 0 && bp != NULL;
		    nfree--, bp = bp->bio_next) {
			/*
			 * It is safe to free the bio_data, because:
			 * 1. If bio_data is NULL it will be read from the
			 *    inactive journal.
			 * 2. If bp is sent down, it is first removed from the
			 *    inactive queue, so it's impossible to free the
			 *    data from under an in-flight bio.
			 * On the other hand, freeing elements from the active
			 * queue is not safe.
			 */
			if (bp->bio_data != NULL) {
				GJ_DEBUG(2, "Freeing data from %s.",
				    sc->sc_name);
				gj_free(bp->bio_data, bp->bio_length);
				bp->bio_data = NULL;
			}
		}
		mtx_unlock(&sc->sc_mtx);
		if (nfree == 0)
			break;
	}
	g_topology_unlock();
	PICKUP_GIANT();
}

static void g_journal_switcher(void *arg);

/*
 * Class init method: size the journal cache from vm_kmem_size, register
 * the shutdown and low-memory event handlers, and start the switcher
 * kernel process.
 */
static void
g_journal_init(struct g_class *mp)
{
	int error;

	/* Pick a conservative value if provided value sucks. */
	if (g_journal_cache_divisor <= 0 ||
	    (vm_kmem_size / g_journal_cache_divisor == 0)) {
		g_journal_cache_divisor = 5;
	}
	/*
	 * NOTE(review): a non-zero tunable acts only as an on/off switch
	 * here; the limit itself is always recomputed from the divisor —
	 * confirm this is the intended semantics.
	 */
	if (g_journal_cache_limit > 0) {
		g_journal_cache_limit = vm_kmem_size / g_journal_cache_divisor;
		g_journal_cache_low =
		    (g_journal_cache_limit / 100) * g_journal_cache_switch;
	}
	g_journal_event_shutdown = EVENTHANDLER_REGISTER(shutdown_post_sync,
	    g_journal_shutdown, mp, EVENTHANDLER_PRI_FIRST);
	if (g_journal_event_shutdown == NULL)
		GJ_DEBUG(0, "Warning! Cannot register shutdown event.");
	g_journal_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem,
	    g_journal_lowmem, mp, EVENTHANDLER_PRI_FIRST);
	if (g_journal_event_lowmem == NULL)
		GJ_DEBUG(0, "Warning! Cannot register lowmem event.");
	error = kproc_create(g_journal_switcher, mp, NULL, 0, 0,
	    "g_journal switcher");
	KASSERT(error == 0, ("Cannot create switcher thread."));
}

/*
 * Class fini method: deregister the event handlers and ask the switcher
 * thread to die, polling until it acknowledges.
 */
static void
g_journal_fini(struct g_class *mp)
{

	if (g_journal_event_shutdown != NULL) {
		EVENTHANDLER_DEREGISTER(shutdown_post_sync,
		    g_journal_event_shutdown);
	}
	if (g_journal_event_lowmem != NULL)
		EVENTHANDLER_DEREGISTER(vm_lowmem, g_journal_event_lowmem);
	g_journal_switcher_state = GJ_SWITCHER_DIE;
	wakeup(&g_journal_switcher_state);
	while (g_journal_switcher_state != GJ_SWITCHER_DIED)
		tsleep(&g_journal_switcher_state, PRIBIO, "jfini:wait", hz / 5);
	GJ_DEBUG(1, "Switcher died.");
}

DECLARE_GEOM_CLASS(g_journal_class, g_journal);

/*
 * Find the journalled-file-system descriptor for the given fstype name
 * in the NULL-terminated g_journal_filesystems table; NULL if unknown.
 */
static const struct g_journal_desc *
g_journal_find_desc(const char *fstype)
{
	const struct g_journal_desc *desc;
	int i;

	for (desc = g_journal_filesystems[i = 0]; desc != NULL;
	    desc = g_journal_filesystems[++i]) {
		if (strcmp(desc->jd_fstype, fstype) == 0)
			break;
	}
	return (desc);
}

/*
 * Ask the worker thread to perform a journal switch and sleep until it
 * clears GJF_DEVICE_SWITCH.  Called (and returns) with sc_mtx held.
 */
static void
g_journal_switch_wait(struct g_journal_softc *sc)
{
	struct bintime bt;

	mtx_assert(&sc->sc_mtx, MA_OWNED);
	if (g_journal_debug >= 2) {
		if (sc->sc_flush_in_progress > 0) {
			GJ_DEBUG(2, "%d requests flushing.",
			    sc->sc_flush_in_progress);
		}
		if (sc->sc_copy_in_progress > 0) {
			GJ_DEBUG(2, "%d requests copying.",
			    sc->sc_copy_in_progress);
		}
		if (sc->sc_flush_count > 0) {
			GJ_DEBUG(2, "%d requests to flush.",
			    sc->sc_flush_count);
		}
		if (sc->sc_delayed_count > 0) {
			GJ_DEBUG(2, "%d requests delayed.",
			    sc->sc_delayed_count);
		}
	}
	g_journal_stats_switches++;
	if (sc->sc_copy_in_progress > 0)
		g_journal_stats_wait_for_copy++;
	GJ_TIMER_START(1, &bt);
	sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
	sc->sc_flags |= GJF_DEVICE_SWITCH;
	wakeup(sc);
	while (sc->sc_flags & GJF_DEVICE_SWITCH) {
		msleep(&sc->sc_journal_copying, &sc->sc_mtx, PRIBIO,
		    "gj:switch", 0);
	}
	GJ_TIMER_STOP(1, &bt, "Switch time of %s", sc->sc_name);
}

/*
 * Perform a journal switch on every complete journal device of the
 * class, syncing and suspending each journalled file system around its
 * switch, then sweep up any devices still marked BEFORE_SWITCH.
 */
static void
g_journal_do_switch(struct g_class *classp)
{
	struct g_journal_softc *sc;
	const struct g_journal_desc *desc;
	struct g_geom *gp;
	struct mount *mp;
	struct bintime bt;
	char *mountpoint;
	int error, save;

	/* Mark every eligible device as awaiting a switch. */
	DROP_GIANT();
	g_topology_lock();
	LIST_FOREACH(gp, &classp->geom, geom) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		if (sc->sc_flags & GJF_DEVICE_DESTROY)
			continue;
		if ((sc->sc_type & GJ_TYPE_COMPLETE) != GJ_TYPE_COMPLETE)
			continue;
		mtx_lock(&sc->sc_mtx);
		sc->sc_flags |= GJF_DEVICE_BEFORE_SWITCH;
		mtx_unlock(&sc->sc_mtx);
	}
	g_topology_unlock();
	PICKUP_GIANT();

	/*
	 * Walk the mount list and switch the journal of every read-write,
	 * journalled file system backed by one of our providers.
	 */
	mtx_lock(&mountlist_mtx);
	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
		if (mp->mnt_gjprovider == NULL)
			continue;
		if (mp->mnt_flag & MNT_RDONLY)
			continue;
		desc = g_journal_find_desc(mp->mnt_stat.f_fstypename);
		if (desc == NULL)
			continue;
		if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK))
			continue;
		/* mtx_unlock(&mountlist_mtx) was done inside vfs_busy() */

		DROP_GIANT();
		g_topology_lock();
		sc = g_journal_find_device(classp, mp->mnt_gjprovider);
		g_topology_unlock();
		PICKUP_GIANT();

		if (sc == NULL) {
			GJ_DEBUG(0, "Cannot find journal geom for %s.",
			    mp->mnt_gjprovider);
			goto next;
		} else if (JEMPTY(sc)) {
			/* Nothing journalled since the last switch. */
			mtx_lock(&sc->sc_mtx);
			sc->sc_flags &= ~GJF_DEVICE_BEFORE_SWITCH;
			mtx_unlock(&sc->sc_mtx);
			GJ_DEBUG(3, "No need for %s switch.", sc->sc_name);
			goto next;
		}

		mountpoint = mp->mnt_stat.f_mntonname;

		error = vn_start_write(NULL, &mp, V_WAIT);
		if (error != 0) {
			GJ_DEBUG(0, "vn_start_write(%s) failed (error=%d).",
			    mountpoint, error);
			goto next;
		}

		save = curthread_pflags_set(TDP_SYNCIO);

		GJ_TIMER_START(1, &bt);
		vfs_msync(mp, MNT_NOWAIT);
		GJ_TIMER_STOP(1, &bt, "Msync time of %s", mountpoint);

		GJ_TIMER_START(1, &bt);
		error = VFS_SYNC(mp, MNT_NOWAIT);
		if (error == 0)
			GJ_TIMER_STOP(1, &bt, "Sync time of %s", mountpoint);
		else {
			GJ_DEBUG(0, "Cannot sync file system %s (error=%d).",
			    mountpoint, error);
		}

		curthread_pflags_restore(save);

		vn_finished_write(mp);

		if (error != 0)
			goto next;

		/*
		 * Send BIO_FLUSH before freezing the file system, so it can be
		 * faster after the freeze.
		 */
		GJ_TIMER_START(1, &bt);
		g_journal_flush_cache(sc);
		GJ_TIMER_STOP(1, &bt, "BIO_FLUSH time of %s", sc->sc_name);

		/* Freeze the file system while the journal switches. */
		GJ_TIMER_START(1, &bt);
		error = vfs_write_suspend(mp, VS_SKIP_UNMOUNT);
		GJ_TIMER_STOP(1, &bt, "Suspend time of %s", mountpoint);
		if (error != 0) {
			GJ_DEBUG(0, "Cannot suspend file system %s (error=%d).",
			    mountpoint, error);
			goto next;
		}

		error = desc->jd_clean(mp);
		if (error != 0)
			goto next;

		mtx_lock(&sc->sc_mtx);
		g_journal_switch_wait(sc);
		mtx_unlock(&sc->sc_mtx);

		vfs_write_resume(mp, 0);
next:
		mtx_lock(&mountlist_mtx);
		vfs_unbusy(mp);
	}
	mtx_unlock(&mountlist_mtx);

	/*
	 * Sweep: switch any remaining devices still marked BEFORE_SWITCH
	 * (e.g. journals not backing a mounted file system), one at a
	 * time, until none are left.
	 */
	sc = NULL;
	for (;;) {
		DROP_GIANT();
		g_topology_lock();
		LIST_FOREACH(gp, &g_journal_class.geom, geom) {
			sc = gp->softc;
			if (sc == NULL)
				continue;
			mtx_lock(&sc->sc_mtx);
			if ((sc->sc_type & GJ_TYPE_COMPLETE) == GJ_TYPE_COMPLETE &&
			    !(sc->sc_flags & GJF_DEVICE_DESTROY) &&
			    (sc->sc_flags & GJF_DEVICE_BEFORE_SWITCH)) {
				/* Leave the loop with sc_mtx held. */
				break;
			}
			mtx_unlock(&sc->sc_mtx);
			sc = NULL;
		}
		g_topology_unlock();
		PICKUP_GIANT();
		if (sc == NULL)
			break;
		mtx_assert(&sc->sc_mtx, MA_OWNED);
		g_journal_switch_wait(sc);
		mtx_unlock(&sc->sc_mtx);
	}
}

/*
 * Main loop of the switcher kernel process: sleep until the switch
 * timeout expires or somebody wakes us (sync request, cache pressure,
 * shutdown), then perform a switch pass over all devices.
 *
 * TODO: Switcher thread should be started on first geom creation and killed on
 * last geom destruction.
 */
static void
g_journal_switcher(void *arg)
{
	struct g_class *mp;
	struct bintime bt;
	int error;

	mp = arg;
	curthread->td_pflags |= TDP_NORUNNINGBUF;
	for (;;) {
		g_journal_switcher_wokenup = 0;
		error = tsleep(&g_journal_switcher_state, PRIBIO, "jsw:wait",
		    g_journal_switch_time * hz);
		if (g_journal_switcher_state == GJ_SWITCHER_DIE) {
			/* g_journal_fini() asked us to exit. */
			g_journal_switcher_state = GJ_SWITCHER_DIED;
			GJ_DEBUG(1, "Switcher exiting.");
			wakeup(&g_journal_switcher_state);
			kproc_exit(0);
		}
		/* error == 0 means an explicit wakeup, not a timeout. */
		if (error == 0 && g_journal_sync_requested == 0) {
			GJ_DEBUG(1, "Out of cache, force switch (used=%u "
			    "limit=%u).", g_journal_cache_used,
			    g_journal_cache_limit);
		}
		GJ_TIMER_START(1, &bt);
		g_journal_do_switch(mp);
		GJ_TIMER_STOP(1, &bt, "Entire switch time");
		if (g_journal_sync_requested > 0) {
			g_journal_sync_requested = 0;
			wakeup(&g_journal_sync_requested);
		}
	}
}