/*-
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"

SYSCTL_DECL(_kern_geom_raid_raid1);

/* Tunables controlling the pace and granularity of rebuild/resync. */
#define RAID1_REBUILD_SLAB	(1 << 20)	/* One transaction in a rebuild */
static int g_raid1_rebuild_slab = RAID1_REBUILD_SLAB;
TUNABLE_INT("kern.geom.raid.raid1.rebuild_slab_size",
    &g_raid1_rebuild_slab);
SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
    &g_raid1_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1_REBUILD_FAIR_IO	20	/* use 1/x of the available I/O */
static int g_raid1_rebuild_fair_io = RAID1_REBUILD_FAIR_IO;
TUNABLE_INT("kern.geom.raid.raid1.rebuild_fair_io",
    &g_raid1_rebuild_fair_io);
SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
    &g_raid1_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");

#define RAID1_REBUILD_CLUSTER_IDLE	100
static int g_raid1_rebuild_cluster_idle = RAID1_REBUILD_CLUSTER_IDLE;
TUNABLE_INT("kern.geom.raid.raid1.rebuild_cluster_idle",
    &g_raid1_rebuild_cluster_idle);
SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
    &g_raid1_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle");

#define RAID1_REBUILD_META_UPDATE	1024	/* update meta data every 1GB or so */
static int g_raid1_rebuild_meta_update = RAID1_REBUILD_META_UPDATE;
TUNABLE_INT("kern.geom.raid.raid1.rebuild_meta_update",
    &g_raid1_rebuild_meta_update);
SYSCTL_UINT(_kern_geom_raid_raid1, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
    &g_raid1_rebuild_meta_update, 0,
    "When to update the meta data.");

static MALLOC_DEFINE(M_TR_RAID1, "tr_raid1_data", "GEOM_RAID RAID1 data");

/* Values for trso_type: which recovery activity is in progress. */
#define TR_RAID1_NONE		0
#define TR_RAID1_REBUILD	1
#define TR_RAID1_RESYNC		2

/* Bits for trso_flags. */
#define TR_RAID1_F_DOING_SOME	0x1	/* Rebuild transaction in flight. */
#define TR_RAID1_F_LOCKED	0x2	/* Rebuild range lock is held. */
#define TR_RAID1_F_ABORT	0x4	/* Abort of the rebuild requested. */

/* Per-volume state of the RAID1 transform module. */
struct g_raid_tr_raid1_object {
	struct g_raid_tr_object	 trso_base;	/* Common transform state. */
	int			 trso_starting;	/* Volume is still starting. */
	int			 trso_stopping;	/* Volume is being stopped. */
	int			 trso_type;	/* TR_RAID1_{NONE,REBUILD,RESYNC}. */
	int			 trso_recover_slabs; /* slabs before rest */
	int			 trso_fair_io;	/* Countdown to forced rebuild I/O. */
	int			 trso_meta_update; /* Countdown to metadata write. */
	int			 trso_flags;	/* TR_RAID1_F_* bits. */
	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
	void			*trso_buffer;	 /* Buffer space */
	struct bio		 trso_bio;	/* Bio reused for rebuild I/O. */
};

static g_raid_tr_taste_t g_raid_tr_taste_raid1;
static g_raid_tr_event_t g_raid_tr_event_raid1;
static g_raid_tr_start_t g_raid_tr_start_raid1;
static g_raid_tr_stop_t g_raid_tr_stop_raid1;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1;
static g_raid_tr_locked_t
g_raid_tr_locked_raid1;
static g_raid_tr_idle_t g_raid_tr_idle_raid1;
static g_raid_tr_free_t g_raid_tr_free_raid1;

/* kobj dispatch table binding the transform interface to this module. */
static kobj_method_t g_raid_tr_raid1_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1),
	{ 0, 0 }
};

static struct g_raid_tr_class g_raid_tr_raid1_class = {
	"RAID1",
	g_raid_tr_raid1_methods,
	sizeof(struct g_raid_tr_raid1_object),
	.trc_enable = 1,
	.trc_priority = 100,
	.trc_accept_unmapped = 1
};

static void g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);

/*
 * Taste the volume: accept it only if it is RAID1 with the R1SM or R1MM
 * level qualifier, and mark the transform as starting.
 */
static int
g_raid_tr_taste_raid1(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
	struct g_raid_tr_raid1_object *trs;

	trs = (struct g_raid_tr_raid1_object *)tr;
	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1 ||
	    (tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1SM &&
	     tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1MM))
		return (G_RAID_TR_TASTE_FAIL);
	trs->trso_starting = 1;
	return (G_RAID_TR_TASTE_SUCCEED);
}

/*
 * Recompute the volume state from its subdisk states.  If no subdisk is
 * ACTIVE, promote the best candidate to ACTIVE.  Triggers a rebuild when
 * appropriate and, on state change, sends an UP/DOWN event and rewrites
 * metadata (unless starting or stopping).
 */
static int
g_raid_tr_update_state_raid1(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_subdisk *tsd, *bestsd;
	u_int s;
	int i, na, ns;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid1_object *)vol->v_tr;
	if (trs->trso_stopping &&
	    (trs->trso_flags & TR_RAID1_F_DOING_SOME) == 0)
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		/* Make sure we have at least one ACTIVE disk. */
		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
		if (na == 0) {
			/*
			 * Critical situation! We have no any active disk!
			 * Choose the best disk we have to make it active.
			 */
			bestsd = &vol->v_subdisks[0];
			for (i = 1; i < vol->v_disks_count; i++) {
				tsd = &vol->v_subdisks[i];
				/*
				 * Prefer the highest state; among equal
				 * REBUILD/RESYNC states prefer the most
				 * advanced rebuild position.
				 */
				if (tsd->sd_state > bestsd->sd_state)
					bestsd = tsd;
				else if (tsd->sd_state == bestsd->sd_state &&
				    (tsd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
				     tsd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
				    tsd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
					bestsd = tsd;
			}
			if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED) {
				/* We found reasonable candidate.
				 */
				G_RAID_DEBUG1(1, sc,
				    "Promote subdisk %s:%d from %s to ACTIVE.",
				    vol->v_name, bestsd->sd_pos,
				    g_raid_subdisk_state2str(bestsd->sd_state));
				g_raid_change_subdisk_state(bestsd,
				    G_RAID_SUBDISK_S_ACTIVE);
				g_raid_write_metadata(sc,
				    vol, bestsd, bestsd->sd_disk);
			}
		}
		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
		ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
		    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
		if (na == vol->v_disks_count)
			s = G_RAID_VOLUME_S_OPTIMAL;
		else if (na + ns == vol->v_disks_count)
			s = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (na > 0)
			s = G_RAID_VOLUME_S_DEGRADED;
		else
			s = G_RAID_VOLUME_S_BROKEN;
		g_raid_tr_raid1_maybe_rebuild(vol->v_tr, sd);
	}
	if (s != vol->v_state) {
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		if (!trs->trso_starting && !trs->trso_stopping)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	return (0);
}

/* Fail a subdisk's disk, unless it is the last ACTIVE one in the volume. */
static void
g_raid_tr_raid1_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
	/*
	 * We don't fail the last disk in the pack, since it still has decent
	 * data on it and that's better than failing the disk if it is the root
	 * file system.
	 *
	 * XXX should this be controlled via a tunable?  It makes sense for
	 * the volume that has / on it.  I can't think of a case where we'd
	 * want the volume to go away on this kind of event.
	 */
	if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 &&
	    g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd)
		return;
	g_raid_fail_disk(sc, sd, disk);
}

/*
 * Issue one rebuild transaction: read up to g_raid1_rebuild_slab bytes at
 * the current rebuild position from an ACTIVE subdisk into the bounce
 * buffer.  The range is locked first; the lock callback starts the read.
 * No-op if a transaction is already in flight; aborts the rebuild if no
 * ACTIVE subdisk remains.
 */
static void
g_raid_tr_raid1_rebuild_some(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_subdisk *sd, *good_sd;
	struct bio *bp;

	trs = (struct g_raid_tr_raid1_object *)tr;
	if (trs->trso_flags & TR_RAID1_F_DOING_SOME)
		return;
	sd = trs->trso_failed_sd;
	good_sd = g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE);
	if (good_sd == NULL) {
		g_raid_tr_raid1_rebuild_abort(tr);
		return;
	}
	bp = &trs->trso_bio;
	memset(bp, 0, sizeof(*bp));
	bp->bio_offset = sd->sd_rebuild_pos;
	bp->bio_length = MIN(g_raid1_rebuild_slab,
	    sd->sd_size - sd->sd_rebuild_pos);
	bp->bio_data = trs->trso_buffer;
	bp->bio_cmd = BIO_READ;
	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
	bp->bio_caller1 = good_sd;
	trs->trso_flags |= TR_RAID1_F_DOING_SOME;
	trs->trso_flags |= TR_RAID1_F_LOCKED;
	g_raid_lock_range(sd->sd_volume,	/* Lock callback starts I/O */
	   bp->bio_offset, bp->bio_length, NULL, bp);
}

/*
 * Tear down rebuild state: write metadata for the failed subdisk, free
 * the bounce buffer and return the transform to the idle (NONE) type.
 */
static void
g_raid_tr_raid1_rebuild_done(struct g_raid_tr_raid1_object *trs)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;

	vol = trs->trso_base.tro_volume;
	sd =
trs->trso_failed_sd;
	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
	free(trs->trso_buffer, M_TR_RAID1);
	trs->trso_buffer = NULL;
	trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
	trs->trso_type = TR_RAID1_NONE;
	trs->trso_recover_slabs = 0;
	trs->trso_failed_sd = NULL;
	g_raid_tr_update_state_raid1(vol, NULL);
}

/*
 * Rebuild of the failed subdisk completed: mark it ACTIVE, reset its
 * rebuild position and release the rebuild state.
 */
static void
g_raid_tr_raid1_rebuild_finish(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_subdisk *sd;

	trs = (struct g_raid_tr_raid1_object *)tr;
	sd = trs->trso_failed_sd;
	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
	    "Subdisk %s:%d-%s rebuild completed.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	sd->sd_rebuild_pos = 0;
	g_raid_tr_raid1_rebuild_done(trs);
}

/*
 * Abort the rebuild.  If a transaction is still in flight, only set the
 * ABORT flag and let the completion path finish the job; otherwise drop
 * the range lock (if held) and tear down the rebuild state immediately.
 */
static void
g_raid_tr_raid1_rebuild_abort(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_subdisk *sd;
	struct g_raid_volume *vol;
	off_t len;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1_object *)tr;
	sd = trs->trso_failed_sd;
	if (trs->trso_flags & TR_RAID1_F_DOING_SOME) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild is aborting.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags |= TR_RAID1_F_ABORT;
	} else {
		G_RAID_DEBUG1(0, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild aborted.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags &= ~TR_RAID1_F_ABORT;
		if (trs->trso_flags & TR_RAID1_F_LOCKED) {
			trs->trso_flags &= ~TR_RAID1_F_LOCKED;
			len = MIN(g_raid1_rebuild_slab,
			    sd->sd_size - sd->sd_rebuild_pos);
			g_raid_unlock_range(tr->tro_volume,
			    sd->sd_rebuild_pos, len);
		}
		g_raid_tr_raid1_rebuild_done(trs);
	}
}

/*
 * Pick the subdisk to rebuild/resync and start the first transaction.
 * Candidates, in order: RESYNC, REBUILD, STALE (restarted as RESYNC from
 * position 0), then UNINITIALIZED/NEW (started as REBUILD from 0).
 */
static void
g_raid_tr_raid1_rebuild_start(struct g_raid_tr_object *tr)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_subdisk *sd, *fsd;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1_object *)tr;
	if (trs->trso_failed_sd) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Already rebuild in start rebuild. pos %jd\n",
		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
		return;
	}
	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_ACTIVE);
	if (sd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No active disk to rebuild."
	    " night night.");
		return;
	}
	fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
	if (fsd == NULL)
		fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
	if (fsd == NULL) {
		fsd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
		if (fsd != NULL) {
			fsd->sd_rebuild_pos = 0;
			g_raid_change_subdisk_state(fsd,
			    G_RAID_SUBDISK_S_RESYNC);
			g_raid_write_metadata(vol->v_softc, vol, fsd, NULL);
		} else {
			fsd = g_raid_get_subdisk(vol,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (fsd == NULL)
				fsd = g_raid_get_subdisk(vol,
				    G_RAID_SUBDISK_S_NEW);
			if (fsd != NULL) {
				fsd->sd_rebuild_pos = 0;
				g_raid_change_subdisk_state(fsd,
				    G_RAID_SUBDISK_S_REBUILD);
				g_raid_write_metadata(vol->v_softc,
				    vol, fsd, NULL);
			}
		}
	}
	if (fsd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No failed disk to rebuild. night night.");
		return;
	}
	trs->trso_failed_sd = fsd;
	G_RAID_DEBUG1(0, vol->v_softc,
	    "Subdisk %s:%d-%s rebuild start at %jd.",
	    fsd->sd_volume->v_name, fsd->sd_pos,
	    fsd->sd_disk ? g_raid_get_diskname(fsd->sd_disk) : "[none]",
	    trs->trso_failed_sd->sd_rebuild_pos);
	trs->trso_type = TR_RAID1_REBUILD;
	/* Bounce buffer for one rebuild slab. */
	trs->trso_buffer = malloc(g_raid1_rebuild_slab, M_TR_RAID1, M_WAITOK);
	trs->trso_meta_update = g_raid1_rebuild_meta_update;
	g_raid_tr_raid1_rebuild_some(tr);
}


/* Decide whether a rebuild should be started or aborted. */
static void
g_raid_tr_raid1_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1_object *trs;
	int na, nr;

	/*
	 * If we're stopping, don't do anything.  If we don't have at least one
	 * good disk and one bad disk, we don't do anything.  And if there's a
	 * 'good disk' stored in the trs, then we're in progress and we punt.
	 * If we make it past all these checks, we need to rebuild.
	 */
	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1_object *)tr;
	if (trs->trso_stopping)
		return;
	na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
	switch(trs->trso_type) {
	case TR_RAID1_NONE:
		if (na == 0)
			return;
		if (nr == 0) {
			/* No rebuild running; count other candidates. */
			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
			if (nr == 0)
				return;
		}
		g_raid_tr_raid1_rebuild_start(tr);
		break;
	case TR_RAID1_REBUILD:
		/* Abort if no source, no target, or the target changed. */
		if (na == 0 || nr == 0 || trs->trso_failed_sd == sd)
			g_raid_tr_raid1_rebuild_abort(tr);
		break;
	case TR_RAID1_RESYNC:
		break;
	}
}

/* Subdisk event: just recompute the volume state. */
static int
g_raid_tr_event_raid1(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

	g_raid_tr_update_state_raid1(tr->tro_volume, sd);
	return (0);
}

/* Volume start: leave the STARTING phase and recompute the state. */
static int
g_raid_tr_start_raid1(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	g_raid_tr_update_state_raid1(vol, NULL);
	return (0);
}

/* Volume stop: flag the transform as stopping and recompute the state. */
static int
g_raid_tr_stop_raid1(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	trs->trso_stopping = 1;
	g_raid_tr_update_state_raid1(vol, NULL);
	return (0);
}

/*
 * Select the disk to read from.  Take into account: subdisk state, running
 * error recovery, average disk load, head position and possible cache hits.
 */
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
static struct g_raid_subdisk *
g_raid_tr_raid1_select_read_disk(struct g_raid_volume *vol, struct bio *bp,
    u_int mask)
{
	struct g_raid_subdisk *sd, *best;
	int i, prio, bestprio;

	best = NULL;
	bestprio = INT_MAX;
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		/*
		 * Skip subdisks that are not ACTIVE, unless they are
		 * REBUILD/RESYNC and the whole request lies below the
		 * already-rebuilt region.
		 */
		if (sd->sd_state != G_RAID_SUBDISK_S_ACTIVE &&
		    ((sd->sd_state != G_RAID_SUBDISK_S_REBUILD &&
		      sd->sd_state != G_RAID_SUBDISK_S_RESYNC) ||
		     bp->bio_offset + bp->bio_length > sd->sd_rebuild_pos))
			continue;
		/* Skip subdisks excluded by the caller's mask. */
		if ((mask & (1 << i)) != 0)
			continue;
		/* Lower prio wins: load, then recovery count, then state. */
		prio = G_RAID_SUBDISK_LOAD(sd);
		prio += min(sd->sd_recovery, 255) << 22;
		prio += (G_RAID_SUBDISK_S_ACTIVE - sd->sd_state) << 16;
		/* If disk head is precisely in position - highly prefer it. */
		if (G_RAID_SUBDISK_POS(sd) == bp->bio_offset)
			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
		else
		/* If disk head is close to position - prefer it.
		 */
		if (ABS(G_RAID_SUBDISK_POS(sd) - bp->bio_offset) <
		    G_RAID_SUBDISK_TRACK_SIZE)
			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		if (prio < bestprio) {
			best = sd;
			bestprio = prio;
		}
	}
	return (best);
}

/* Dispatch a read to the best subdisk chosen by select_read_disk. */
static void
g_raid_tr_iostart_raid1_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_subdisk *sd;
	struct bio *cbp;

	sd = g_raid_tr_raid1_select_read_disk(tr->tro_volume, bp, 0);
	KASSERT(sd != NULL, ("No active disks in volume %s.",
	    tr->tro_volume->v_name));

	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		g_raid_iodone(bp, ENOMEM);
		return;
	}

	g_raid_subdisk_iostart(sd, cbp);
}

/* Mirror a write to every subdisk that should currently receive it. */
static void
g_raid_tr_iostart_raid1_write(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	int i;

	vol = tr->tro_volume;

	/*
	 * Allocate all bios before sending any request, so we can return
	 * ENOMEM in nice and clean way.
	 */
	bioq_init(&queue);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		switch (sd->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_REBUILD:
			/*
			 * When rebuilding, only part of this subdisk is
			 * writable, the rest will be written as part of the
			 * that process.
			 */
			if (bp->bio_offset >= sd->sd_rebuild_pos)
				continue;
			break;
		case G_RAID_SUBDISK_S_STALE:
		case G_RAID_SUBDISK_S_RESYNC:
			/*
			 * Resyncing still writes on the theory that the
			 * resync'd disk is very close and writing it will
			 * keep it that way better if we keep up while
			 * resyncing.
			 */
			break;
		default:
			continue;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_caller1 = sd;
		bioq_insert_tail(&queue, cbp);
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	/* Undo the partially built clone list and fail the parent. */
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

/* Top-level I/O entry point for the RAID1 transform. */
static void
g_raid_tr_iostart_raid1(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1_object *trs;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1_object *)tr;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
		g_raid_iodone(bp, EIO);
		return;
	}
	/*
	 * If we're rebuilding, squeeze in rebuild activity every so often,
	 * even when the disk is busy.
	 * Be sure to only count real I/O
	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
	 * by this module.
	 */
	if (trs->trso_failed_sd != NULL &&
	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
		/* Make this new or running now round short. */
		trs->trso_recover_slabs = 0;
		if (--trs->trso_fair_io <= 0) {
			trs->trso_fair_io = g_raid1_rebuild_fair_io;
			g_raid_tr_raid1_rebuild_some(tr);
		}
	}
	switch (bp->bio_cmd) {
	case BIO_READ:
		g_raid_tr_iostart_raid1_read(tr, bp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		g_raid_tr_iostart_raid1_write(tr, bp);
		break;
	case BIO_FLUSH:
		g_raid_tr_flush_common(tr, bp);
		break;
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
		    bp->bio_cmd, vol->v_name));
		break;
	}
}

/* Completion handler for both regular and rebuild/resync I/O. */
static void
g_raid_tr_iodone_raid1(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
	struct bio *cbp;
	struct g_raid_subdisk *nsd;
	struct g_raid_volume *vol;
	struct bio *pbp;
	struct g_raid_tr_raid1_object *trs;
	uintptr_t *mask;
	int error, do_write;

	trs = (struct g_raid_tr_raid1_object *)tr;
	vol = tr->tro_volume;
	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
		/*
		 * This operation is part of a rebuild or resync operation.
		 * See what work just got done, then schedule the next bit of
		 * work, if any.  Rebuild/resync is done a little bit at a
		 * time.
		 * Either when a timeout happens, or after we get a
		 * bunch of I/Os to the disk (to make sure an active system
		 * will complete in a sane amount of time).
		 *
		 * We are set up to do differing amounts of work for each of
		 * these cases.  So long as the slabs is smallish (less than
		 * 50 or so, I'd guess, but that's just a WAG), we shouldn't
		 * have any bio starvation issues.  For active disks, we do
		 * 5MB of data, for inactive ones, we do 50MB.
		 */
		if (trs->trso_type == TR_RAID1_REBUILD) {
			if (bp->bio_cmd == BIO_READ) {

				/* Immediately abort rebuild, if requested. */
				if (trs->trso_flags & TR_RAID1_F_ABORT) {
					trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
					g_raid_tr_raid1_rebuild_abort(tr);
					return;
				}

				/*
				 * On read error, skip and cross fingers:
				 * the write for this slab is skipped and the
				 * rebuild position still advances below.
				 */
				if (bp->bio_error != 0) {
					G_RAID_LOGREQ(0, bp,
					    "Read error during rebuild (%d), "
					    "possible data loss!",
					    bp->bio_error);
					goto rebuild_round_done;
				}

				/*
				 * The read operation finished, queue the
				 * write and get out.  The same bio (and its
				 * data buffer) is reused for the write leg.
				 */
				G_RAID_LOGREQ(4, bp, "rebuild read done. %d",
				    bp->bio_error);
				bp->bio_cmd = BIO_WRITE;
				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
				G_RAID_LOGREQ(4, bp, "Queueing rebuild write.");
				g_raid_subdisk_iostart(trs->trso_failed_sd, bp);
			} else {
				/*
				 * The write operation just finished.  Do
				 * another.  We keep cloning the master bio
				 * since it has the right buffers allocated to
				 * it.
				 */
				G_RAID_LOGREQ(4, bp,
				    "rebuild write done. Error %d",
				    bp->bio_error);
				nsd = trs->trso_failed_sd;
				/*
				 * On write error (unless we are aborting
				 * anyway), fail the rebuilding subdisk and
				 * stop the rebuild.
				 */
				if (bp->bio_error != 0 ||
				    trs->trso_flags & TR_RAID1_F_ABORT) {
					if ((trs->trso_flags &
					    TR_RAID1_F_ABORT) == 0) {
						g_raid_tr_raid1_fail_disk(sd->sd_softc,
						    nsd, nsd->sd_disk);
					}
					trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
					g_raid_tr_raid1_rebuild_abort(tr);
					return;
				}
/*
 * One rebuild round (read + write of one slab) is complete: release the
 * range lock, advance the rebuild position, and decide whether to finish,
 * stop, checkpoint metadata, or start the next round.
 */
rebuild_round_done:
				nsd = trs->trso_failed_sd;
				trs->trso_flags &= ~TR_RAID1_F_LOCKED;
				g_raid_unlock_range(sd->sd_volume,
				    bp->bio_offset, bp->bio_length);
				nsd->sd_rebuild_pos += bp->bio_length;
				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
					g_raid_tr_raid1_rebuild_finish(tr);
					return;
				}

				/* Abort rebuild if we are stopping */
				if (trs->trso_stopping) {
					trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
					g_raid_tr_raid1_rebuild_abort(tr);
					return;
				}

				/*
				 * Periodically checkpoint the rebuild
				 * position into the on-disk metadata so an
				 * interrupted rebuild can resume nearby.
				 */
				if (--trs->trso_meta_update <= 0) {
					g_raid_write_metadata(vol->v_softc,
					    vol, nsd, nsd->sd_disk);
					trs->trso_meta_update =
					    g_raid1_rebuild_meta_update;
				}
				trs->trso_flags &= ~TR_RAID1_F_DOING_SOME;
				/*
				 * Out of I/O budget for this burst; the idle
				 * callback will replenish it and continue.
				 */
				if (--trs->trso_recover_slabs <= 0)
					return;
				g_raid_tr_raid1_rebuild_some(tr);
			}
		} else if (trs->trso_type == TR_RAID1_RESYNC) {
			/*
			 * read good sd, read bad sd in parallel.  when both
			 * done, compare the buffers.  write good to the bad
			 * if different.  do the next bit of work.
			 */
			panic("Somehow, we think we're doing a resync");
		}
		return;
	}
	/* Regular (non-rebuild/resync) request: account it on the parent. */
	pbp = bp->bio_parent;
	pbp->bio_inbed++;
	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
		/*
		 * Read failed on first drive.  Retry the read error on
		 * another disk drive, if available, before erroring out the
		 * read.
		 */
		sd->sd_disk->d_read_errs++;
		G_RAID_LOGREQ(0, bp,
		    "Read error (%d), %d read errors total",
		    bp->bio_error, sd->sd_disk->d_read_errs);

		/*
		 * If there are too many read errors, we move to degraded.
		 * XXX Do we want to FAIL the drive (eg, make the user redo
		 * everything to get it back in sync), or just degrade the
		 * drive, which kicks off a resync?
		 */
		do_write = 1;
		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh) {
			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
			/*
			 * Don't attempt a repair write back to a disk we
			 * just failed (first retry only).
			 */
			if (pbp->bio_children == 1)
				do_write = 0;
		}

		/*
		 * Find the other disk, and try to do the I/O to it.
		 * bio_driver2 of the parent is used as a bitmask of
		 * subdisks already tried.
		 */
		mask = (uintptr_t *)(&pbp->bio_driver2);
		if (pbp->bio_children == 1) {
			/* Save original subdisk. */
			pbp->bio_driver1 = do_write ? sd : NULL;
			*mask = 0;
		}
		*mask |= 1 << sd->sd_pos;
		nsd = g_raid_tr_raid1_select_read_disk(vol, pbp, *mask);
		if (nsd != NULL && (cbp = g_clone_bio(pbp)) != NULL) {
			g_destroy_bio(bp);
			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
			    nsd->sd_pos);
			if (pbp->bio_children == 2 && do_write) {
				/*
				 * First retry with a repair write planned:
				 * take a recovery reference and lock the
				 * range so the repair is not raced by other
				 * I/O.
				 */
				sd->sd_recovery++;
				cbp->bio_caller1 = nsd;
				pbp->bio_pflags = G_RAID_BIO_FLAG_LOCKED;
				/* Lock callback starts I/O */
				g_raid_lock_range(sd->sd_volume,
				    cbp->bio_offset, cbp->bio_length, pbp, cbp);
			} else {
				g_raid_subdisk_iostart(nsd, cbp);
			}
			return;
		}
		/*
		 * We can't retry.  Return the original error by falling
		 * through.  This will happen when there's only one good disk.
		 * We don't need to fail the raid, since its actual state is
		 * based on the state of the subdisks.
		 */
		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
	}
	if (bp->bio_cmd == BIO_READ &&
	    bp->bio_error == 0 &&
	    pbp->bio_children > 1 &&
	    pbp->bio_driver1 != NULL) {
		/*
		 * If it was a read, and bio_children is >1, then we just
		 * recovered the data from the second drive.  We should try to
		 * write that data to the first drive if sector remapping is
		 * enabled.  A write should put the data in a new place on the
		 * disk, remapping the bad sector.  Do we need to do that by
		 * queueing a request to the main worker thread?  It doesn't
		 * affect the return code of this current read, and can be
		 * done at our leisure.  However, to make the code simpler, it
		 * is done synchronously.
		 */
		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
		cbp = g_clone_bio(pbp);
		if (cbp != NULL) {
			g_destroy_bio(bp);
			cbp->bio_cmd = BIO_WRITE;
			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
			G_RAID_LOGREQ(2, cbp,
			    "Attempting bad sector remap on failing drive.");
			g_raid_subdisk_iostart(pbp->bio_driver1, cbp);
			return;
		}
	}
	if (pbp->bio_pflags & G_RAID_BIO_FLAG_LOCKED) {
		/*
		 * We're done with a recovery, mark the range as unlocked.
		 * For any write errors, we aggressively fail the disk since
		 * there was both a READ and a WRITE error at this location.
		 * Both types of errors generally indicates the drive is on
		 * the verge of total failure anyway.  Better to stop trusting
		 * it now.  However, we need to reset error to 0 in that case
		 * because we're not failing the original I/O which succeeded.
		 */
		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
			G_RAID_LOGREQ(0, bp, "Remap write failed: "
			    "failing subdisk.");
			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
			/* The original read succeeded; don't report an error. */
			bp->bio_error = 0;
		}
		/* Drop the recovery reference taken when the retry started. */
		if (pbp->bio_driver1 != NULL) {
			((struct g_raid_subdisk *)pbp->bio_driver1)
			    ->sd_recovery--;
		}
		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
		g_raid_unlock_range(sd->sd_volume, bp->bio_offset,
		    bp->bio_length);
	}
	if (pbp->bio_cmd != BIO_READ) {
		/*
		 * Record the first child's error; afterwards the stored
		 * error is only overwritten while it is still non-zero, so
		 * a later successful child clears it: the parent request
		 * fails only if every subdisk request failed.
		 */
		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
			pbp->bio_error = bp->bio_error;
		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
			g_raid_tr_raid1_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		}
		error = pbp->bio_error;
	} else
		error = bp->bio_error;
	g_destroy_bio(bp);
	/* When every cloned request has come back, complete the parent. */
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_raid_iodone(pbp, error);
	}
}

/*
 * Kernel crash dump: write the dump block to every subdisk that is
 * currently writable at this offset.  Returns 0 if at least one subdisk
 * accepted the data, otherwise the last error seen.
 */
static int
g_raid_tr_kerneldump_raid1(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t offset, size_t length)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	int error, i, ok;

	vol = tr->tro_volume;
	error = 0;
	ok = 0;
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		switch (sd->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_REBUILD:
			/*
			 * When rebuilding, only part of this subdisk is
			 * writable, the rest will be written as part of
			 * that process.
			 */
			if (offset >= sd->sd_rebuild_pos)
				continue;
			break;
		case G_RAID_SUBDISK_S_STALE:
		case G_RAID_SUBDISK_S_RESYNC:
			/*
			 * Resyncing still writes on the theory that the
			 * resync'd disk is very close and writing it will
			 * keep it that way better if we keep up while
			 * resyncing.
			 */
			break;
		default:
			continue;
		}
		error = g_raid_subdisk_kerneldump(sd,
		    virtual, physical, offset, length);
		if (error == 0)
			ok++;
	}
	return (ok > 0 ? 0 : error);
}

/*
 * Range-lock grant callback: the requested range is now locked, so start
 * the deferred I/O.  bio_caller1 holds the target subdisk saved by the
 * lock requester.
 */
static int
g_raid_tr_locked_raid1(struct g_raid_tr_object *tr, void *argp)
{
	struct bio *bp;
	struct g_raid_subdisk *sd;

	bp = (struct bio *)argp;
	sd = (struct g_raid_subdisk *)bp->bio_caller1;
	g_raid_subdisk_iostart(sd, bp);

	return (0);
}

/*
 * Idle callback: replenish the rebuild I/O budget (fair-share counter and
 * slab quota) and, if a rebuild is in progress, kick off the next chunk
 * of rebuild work.
 */
static int
g_raid_tr_idle_raid1(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;

	trs = (struct g_raid_tr_raid1_object *)tr;
	trs->trso_fair_io = g_raid1_rebuild_fair_io;
	trs->trso_recover_slabs = g_raid1_rebuild_cluster_idle;
	if (trs->trso_type == TR_RAID1_REBUILD)
		g_raid_tr_raid1_rebuild_some(tr);
	return (0);
}

/*
 * Destructor: release the rebuild buffer, if one was allocated.
 */
static int
g_raid_tr_free_raid1(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1_object *trs;

	trs = (struct g_raid_tr_raid1_object *)tr;

	if (trs->trso_buffer != NULL) {
		free(trs->trso_buffer, M_TR_RAID1);
		trs->trso_buffer = NULL;
	}
	return (0);
}

G_RAID_TR_DECLARE(raid1, "RAID1");