1234458Smav/*- 2234458Smav * Copyright (c) 2012 Alexander Motin <mav@FreeBSD.org> 3234458Smav * All rights reserved. 4234458Smav * 5234458Smav * Redistribution and use in source and binary forms, with or without 6234458Smav * modification, are permitted provided that the following conditions 7234458Smav * are met: 8234458Smav * 1. Redistributions of source code must retain the above copyright 9234458Smav * notice, this list of conditions and the following disclaimer. 10234458Smav * 2. Redistributions in binary form must reproduce the above copyright 11234458Smav * notice, this list of conditions and the following disclaimer in the 12234458Smav * documentation and/or other materials provided with the distribution. 13234458Smav * 14234458Smav * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 15234458Smav * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16234458Smav * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17234458Smav * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 18234458Smav * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19234458Smav * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20234458Smav * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21234458Smav * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22234458Smav * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23234458Smav * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24234458Smav * SUCH DAMAGE. 25234458Smav */ 26234458Smav 27234458Smav#include <sys/cdefs.h> 28234458Smav__FBSDID("$FreeBSD$"); 29234458Smav 30234458Smav#include <sys/param.h> 31234458Smav#include <sys/bio.h> 32234458Smav#include <sys/endian.h> 33234458Smav#include <sys/kernel.h> 34234458Smav#include <sys/kobj.h> 35234458Smav#include <sys/limits.h> 36234458Smav#include <sys/lock.h> 37234458Smav#include <sys/malloc.h> 38234458Smav#include <sys/mutex.h> 39234458Smav#include <sys/sysctl.h> 40234458Smav#include <sys/systm.h> 41234458Smav#include <geom/geom.h> 42234458Smav#include "geom/raid/g_raid.h" 43234458Smav#include "g_raid_tr_if.h" 44234458Smav 45234458Smavstatic MALLOC_DEFINE(M_TR_RAID5, "tr_raid5_data", "GEOM_RAID RAID5 data"); 46234458Smav 47234458Smav#define TR_RAID5_NONE 0 48234458Smav#define TR_RAID5_REBUILD 1 49234458Smav#define TR_RAID5_RESYNC 2 50234458Smav 51234458Smav#define TR_RAID5_F_DOING_SOME 0x1 52234458Smav#define TR_RAID5_F_LOCKED 0x2 53234458Smav#define TR_RAID5_F_ABORT 0x4 54234458Smav 55234458Smavstruct g_raid_tr_raid5_object { 56234458Smav struct g_raid_tr_object trso_base; 57234458Smav int trso_starting; 58234458Smav int trso_stopping; 59234458Smav int trso_type; 60234458Smav int trso_recover_slabs; /* slabs before rest */ 61234458Smav int trso_fair_io; 62234458Smav int trso_meta_update; 63234458Smav int trso_flags; 64234458Smav struct g_raid_subdisk *trso_failed_sd; /* like per volume */ 65234458Smav void *trso_buffer; /* Buffer space */ 66234458Smav struct bio trso_bio; 67234458Smav}; 68234458Smav 69234458Smavstatic g_raid_tr_taste_t g_raid_tr_taste_raid5; 70234458Smavstatic g_raid_tr_event_t g_raid_tr_event_raid5; 71234458Smavstatic g_raid_tr_start_t g_raid_tr_start_raid5; 72234458Smavstatic g_raid_tr_stop_t g_raid_tr_stop_raid5; 73234458Smavstatic g_raid_tr_iostart_t g_raid_tr_iostart_raid5; 74234458Smavstatic g_raid_tr_iodone_t g_raid_tr_iodone_raid5; 75234458Smavstatic g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid5; 76234458Smavstatic g_raid_tr_locked_t g_raid_tr_locked_raid5; 77234458Smavstatic g_raid_tr_free_t g_raid_tr_free_raid5; 78234458Smav 79234458Smavstatic kobj_method_t g_raid_tr_raid5_methods[] = { 80234458Smav KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid5), 81234458Smav KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid5), 82234458Smav KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid5), 83234458Smav KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid5), 84234458Smav KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid5), 85234458Smav KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid5), 86234458Smav KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid5), 87234458Smav KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid5), 88234458Smav KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid5), 89234458Smav { 0, 0 } 90234458Smav}; 91234458Smav 92234458Smavstatic struct g_raid_tr_class g_raid_tr_raid5_class = { 93234458Smav "RAID5", 94234458Smav g_raid_tr_raid5_methods, 95234458Smav sizeof(struct g_raid_tr_raid5_object), 96240465Smav .trc_enable = 1, 97234458Smav .trc_priority = 100 98234458Smav}; 99234458Smav 100234458Smavstatic int 101234458Smavg_raid_tr_taste_raid5(struct g_raid_tr_object *tr, struct g_raid_volume *vol) 102234458Smav{ 103234458Smav struct g_raid_tr_raid5_object *trs; 104234458Smav u_int qual; 105234458Smav 106234458Smav trs = (struct g_raid_tr_raid5_object *)tr; 107234458Smav qual = tr->tro_volume->v_raid_level_qualifier; 108234993Smav if (tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID4 && 109254269Smav (qual == G_RAID_VOLUME_RLQ_R4P0 || 110254271Smav qual == G_RAID_VOLUME_RLQ_R4PN)) { 111234993Smav /* RAID4 */ 112234993Smav } else if ((tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5 || 113234993Smav tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5E || 114234993Smav tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5EE || 115235076Smav tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5R || 116234993Smav tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID6 || 117234993Smav tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAIDMDF) && 118254269Smav (qual == G_RAID_VOLUME_RLQ_R5RA || 119254269Smav qual == G_RAID_VOLUME_RLQ_R5RS || 120254269Smav qual == G_RAID_VOLUME_RLQ_R5LA || 121254269Smav qual == G_RAID_VOLUME_RLQ_R5LS)) { 122235076Smav /* RAID5/5E/5EE/5R/6/MDF */ 123234458Smav } else 124234458Smav return (G_RAID_TR_TASTE_FAIL); 125234458Smav trs->trso_starting = 1; 126234458Smav return (G_RAID_TR_TASTE_SUCCEED); 127234458Smav} 128234458Smav 129234458Smavstatic int 130234458Smavg_raid_tr_update_state_raid5(struct g_raid_volume *vol, 131234458Smav struct g_raid_subdisk *sd) 132234458Smav{ 133234458Smav struct g_raid_tr_raid5_object *trs; 134234458Smav struct g_raid_softc *sc; 135234458Smav u_int s; 136234458Smav int na, ns, nu; 137234458Smav 138234458Smav sc = vol->v_softc; 139234458Smav trs = (struct g_raid_tr_raid5_object *)vol->v_tr; 140234458Smav if (trs->trso_stopping && 141234458Smav (trs->trso_flags & TR_RAID5_F_DOING_SOME) == 0) 142234458Smav s = G_RAID_VOLUME_S_STOPPED; 143234458Smav else if (trs->trso_starting) 144234458Smav s = G_RAID_VOLUME_S_STARTING; 145234458Smav else { 146234458Smav na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE); 147234458Smav ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) + 148234458Smav g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC); 149234458Smav nu = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED); 150234458Smav if (na == vol->v_disks_count) 151234458Smav s = G_RAID_VOLUME_S_OPTIMAL; 152234458Smav else if (na + ns == vol->v_disks_count || 153234458Smav na + ns + nu == vol->v_disks_count /* XXX: Temporary. */) 154234458Smav s = G_RAID_VOLUME_S_SUBOPTIMAL; 155234458Smav else if (na == vol->v_disks_count - 1 || 156234458Smav na + ns + nu == vol->v_disks_count) 157234458Smav s = G_RAID_VOLUME_S_DEGRADED; 158234458Smav else 159234458Smav s = G_RAID_VOLUME_S_BROKEN; 160234458Smav } 161234458Smav if (s != vol->v_state) { 162234458Smav g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ? 163234458Smav G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN, 164234458Smav G_RAID_EVENT_VOLUME); 165234458Smav g_raid_change_volume_state(vol, s); 166234458Smav if (!trs->trso_starting && !trs->trso_stopping) 167234458Smav g_raid_write_metadata(sc, vol, NULL, NULL); 168234458Smav } 169234458Smav return (0); 170234458Smav} 171234458Smav 172234458Smavstatic int 173234458Smavg_raid_tr_event_raid5(struct g_raid_tr_object *tr, 174234458Smav struct g_raid_subdisk *sd, u_int event) 175234458Smav{ 176234458Smav 177234458Smav g_raid_tr_update_state_raid5(tr->tro_volume, sd); 178234458Smav return (0); 179234458Smav} 180234458Smav 181234458Smavstatic int 182234458Smavg_raid_tr_start_raid5(struct g_raid_tr_object *tr) 183234458Smav{ 184234458Smav struct g_raid_tr_raid5_object *trs; 185234458Smav struct g_raid_volume *vol; 186234458Smav 187234458Smav trs = (struct g_raid_tr_raid5_object *)tr; 188254275Smav trs->trso_starting = 0; 189234458Smav vol = tr->tro_volume; 190254275Smav vol->v_read_only = 1; 191234458Smav g_raid_tr_update_state_raid5(vol, NULL); 192234458Smav return (0); 193234458Smav} 194234458Smav 195234458Smavstatic int 196234458Smavg_raid_tr_stop_raid5(struct g_raid_tr_object *tr) 197234458Smav{ 198234458Smav struct g_raid_tr_raid5_object *trs; 199234458Smav struct g_raid_volume *vol; 200234458Smav 201234458Smav trs = (struct g_raid_tr_raid5_object *)tr; 202234458Smav vol = tr->tro_volume; 203234458Smav trs->trso_starting = 0; 204234458Smav trs->trso_stopping = 1; 205234458Smav g_raid_tr_update_state_raid5(vol, NULL); 206234458Smav return (0); 207234458Smav} 208234458Smav 209234458Smavstatic void 210234458Smavg_raid_tr_iostart_raid5_read(struct g_raid_tr_object *tr, struct bio *bp) 211234458Smav{ 212234458Smav struct g_raid_volume *vol; 213234458Smav struct g_raid_subdisk *sd; 214234458Smav struct bio_queue_head queue; 215234458Smav struct bio *cbp; 216234458Smav char *addr; 217234458Smav off_t offset, start, length, nstripe, remain; 218235076Smav int no, pno, ddisks, pdisks, protate, pleft; 219234993Smav u_int strip_size, lvl, qual; 220234458Smav 221234458Smav vol = tr->tro_volume; 222234458Smav addr = bp->bio_data; 223234458Smav strip_size = vol->v_strip_size; 224234993Smav lvl = tr->tro_volume->v_raid_level; 225234458Smav qual = tr->tro_volume->v_raid_level_qualifier; 226235076Smav protate = tr->tro_volume->v_rotate_parity; 227234458Smav 228234458Smav /* Stripe number. */ 229234458Smav nstripe = bp->bio_offset / strip_size; 230234458Smav /* Start position in stripe. */ 231234458Smav start = bp->bio_offset % strip_size; 232234993Smav /* Number of data and parity disks. */ 233234993Smav if (lvl == G_RAID_VOLUME_RL_RAIDMDF) 234235076Smav pdisks = tr->tro_volume->v_mdf_pdisks; 235234993Smav else if (lvl == G_RAID_VOLUME_RL_RAID5EE || 236234993Smav lvl == G_RAID_VOLUME_RL_RAID6) 237234993Smav pdisks = 2; 238234993Smav else 239234993Smav pdisks = 1; 240234993Smav ddisks = vol->v_disks_count - pdisks; 241234458Smav /* Parity disk number. */ 242234993Smav if (lvl == G_RAID_VOLUME_RL_RAID4) { 243234993Smav if (qual == 0) /* P0 */ 244234993Smav pno = 0; 245234993Smav else /* PN */ 246234993Smav pno = ddisks; 247235076Smav pleft = -1; 248234993Smav } else { 249235076Smav pno = (nstripe / (ddisks * protate)) % vol->v_disks_count; 250235076Smav pleft = protate - (nstripe / ddisks) % protate; 251234993Smav if (qual >= 2) { /* PN/Left */ 252234993Smav pno = ddisks - pno; 253234993Smav if (pno < 0) 254234993Smav pno += vol->v_disks_count; 255234993Smav } 256234993Smav } 257234993Smav /* Data disk number. */ 258234993Smav no = nstripe % ddisks; 259234993Smav if (lvl == G_RAID_VOLUME_RL_RAID4) { 260234993Smav if (qual == 0) 261234993Smav no += pdisks; 262234993Smav } else if (qual & 1) { /* Continuation/Symmetric */ 263234993Smav no = (pno + pdisks + no) % vol->v_disks_count; 264234993Smav } else if (no >= pno) /* Restart/Asymmetric */ 265234993Smav no += pdisks; 266234993Smav else 267234993Smav no += imax(0, pno + pdisks - vol->v_disks_count); 268234458Smav /* Stripe start position in disk. */ 269234993Smav offset = (nstripe / ddisks) * strip_size; 270234458Smav /* Length of data to operate. */ 271234458Smav remain = bp->bio_length; 272234458Smav 273234458Smav bioq_init(&queue); 274234458Smav do { 275234458Smav length = MIN(strip_size - start, remain); 276234458Smav cbp = g_clone_bio(bp); 277234458Smav if (cbp == NULL) 278234458Smav goto failure; 279234458Smav cbp->bio_offset = offset + start; 280234458Smav cbp->bio_data = addr; 281234458Smav cbp->bio_length = length; 282234458Smav cbp->bio_caller1 = &vol->v_subdisks[no]; 283234458Smav bioq_insert_tail(&queue, cbp); 284234458Smav no++; 285234993Smav if (lvl == G_RAID_VOLUME_RL_RAID4) { 286234458Smav no %= vol->v_disks_count; 287234993Smav if (no == pno) 288234993Smav no = (no + pdisks) % vol->v_disks_count; 289234993Smav } else if (qual & 1) { /* Continuation/Symmetric */ 290234993Smav no %= vol->v_disks_count; 291234458Smav if (no == pno) { 292235076Smav if ((--pleft) <= 0) { 293235076Smav pleft += protate; 294235076Smav if (qual < 2) /* P0/Right */ 295235076Smav pno++; 296235076Smav else /* PN/Left */ 297235076Smav pno += vol->v_disks_count - 1; 298235076Smav pno %= vol->v_disks_count; 299235076Smav } 300234993Smav no = (pno + pdisks) % vol->v_disks_count; 301234458Smav offset += strip_size; 302234458Smav } 303234993Smav } else { /* Restart/Asymmetric */ 304234458Smav if (no == pno) 305234993Smav no += pdisks; 306234458Smav if (no >= vol->v_disks_count) { 307234993Smav no -= vol->v_disks_count; 308235076Smav if ((--pleft) <= 0) { 309235076Smav pleft += protate; 310235076Smav if (qual < 2) /* P0/Right */ 311235076Smav pno++; 312235076Smav else /* PN/Left */ 313235076Smav pno += vol->v_disks_count - 1; 314235076Smav pno %= vol->v_disks_count; 315235076Smav } 316234993Smav if (no == pno) 317234993Smav no += pdisks; 318234458Smav else 319234993Smav no += imax(0, pno + pdisks - vol->v_disks_count); 320234458Smav offset += strip_size; 321234458Smav } 322234458Smav } 323234458Smav remain -= length; 324234458Smav addr += length; 325234458Smav start = 0; 326234458Smav } while (remain > 0); 327260385Sscottl while ((cbp = bioq_takefirst(&queue)) != NULL) { 328234458Smav sd = cbp->bio_caller1; 329234458Smav cbp->bio_caller1 = NULL; 330234458Smav g_raid_subdisk_iostart(sd, cbp); 331234458Smav } 332234458Smav return; 333234458Smavfailure: 334260385Sscottl while ((cbp = bioq_takefirst(&queue)) != NULL) 335234458Smav g_destroy_bio(cbp); 336234458Smav if (bp->bio_error == 0) 337234458Smav bp->bio_error = ENOMEM; 338234458Smav g_raid_iodone(bp, bp->bio_error); 339234458Smav} 340234458Smav 341234458Smavstatic void 342234458Smavg_raid_tr_iostart_raid5(struct g_raid_tr_object *tr, struct bio *bp) 343234458Smav{ 344234458Smav struct g_raid_volume *vol; 345234458Smav struct g_raid_tr_raid5_object *trs; 346234458Smav 347234458Smav vol = tr->tro_volume; 348234458Smav trs = (struct g_raid_tr_raid5_object *)tr; 349234458Smav if (vol->v_state < G_RAID_VOLUME_S_SUBOPTIMAL) { 350234458Smav g_raid_iodone(bp, EIO); 351234458Smav return; 352234458Smav } 353234458Smav switch (bp->bio_cmd) { 354234458Smav case BIO_READ: 355234458Smav g_raid_tr_iostart_raid5_read(tr, bp); 356234458Smav break; 357234458Smav case BIO_WRITE: 358234458Smav case BIO_DELETE: 359234458Smav case BIO_FLUSH: 360234458Smav g_raid_iodone(bp, ENODEV); 361234458Smav break; 362234458Smav default: 363234458Smav KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)", 364234458Smav bp->bio_cmd, vol->v_name)); 365234458Smav break; 366234458Smav } 367234458Smav} 368234458Smav 369234458Smavstatic void 370234458Smavg_raid_tr_iodone_raid5(struct g_raid_tr_object *tr, 371234458Smav struct g_raid_subdisk *sd, struct bio *bp) 372234458Smav{ 373234458Smav struct bio *pbp; 374234458Smav int error; 375234458Smav 376234458Smav pbp = bp->bio_parent; 377234458Smav pbp->bio_inbed++; 378234458Smav error = bp->bio_error; 379234458Smav g_destroy_bio(bp); 380234458Smav if (pbp->bio_children == pbp->bio_inbed) { 381234458Smav pbp->bio_completed = pbp->bio_length; 382234458Smav g_raid_iodone(pbp, error); 383234458Smav } 384234458Smav} 385234458Smav 386234458Smavstatic int 387234458Smavg_raid_tr_kerneldump_raid5(struct g_raid_tr_object *tr, 388234458Smav void *virtual, vm_offset_t physical, off_t offset, size_t length) 389234458Smav{ 390234458Smav 391234458Smav return (ENODEV); 392234458Smav} 393234458Smav 394234458Smavstatic int 395234458Smavg_raid_tr_locked_raid5(struct g_raid_tr_object *tr, void *argp) 396234458Smav{ 397234458Smav struct bio *bp; 398234458Smav struct g_raid_subdisk *sd; 399234458Smav 400234458Smav bp = (struct bio *)argp; 401234458Smav sd = (struct g_raid_subdisk *)bp->bio_caller1; 402234458Smav g_raid_subdisk_iostart(sd, bp); 403234458Smav 404234458Smav return (0); 405234458Smav} 406234458Smav 407234458Smavstatic int 408234458Smavg_raid_tr_free_raid5(struct g_raid_tr_object *tr) 409234458Smav{ 410234458Smav struct g_raid_tr_raid5_object *trs; 411234458Smav 412234458Smav trs = (struct g_raid_tr_raid5_object *)tr; 413234458Smav 414234458Smav if (trs->trso_buffer != NULL) { 415234458Smav free(trs->trso_buffer, M_TR_RAID5); 416234458Smav trs->trso_buffer = NULL; 417234458Smav } 418234458Smav return (0); 419234458Smav} 420234458Smav 421240465SmavG_RAID_TR_DECLARE(raid5, "RAID5"); 422