1234458Smav/*-
2234458Smav * Copyright (c) 2012 Alexander Motin <mav@FreeBSD.org>
3234458Smav * All rights reserved.
4234458Smav *
5234458Smav * Redistribution and use in source and binary forms, with or without
6234458Smav * modification, are permitted provided that the following conditions
7234458Smav * are met:
8234458Smav * 1. Redistributions of source code must retain the above copyright
9234458Smav *    notice, this list of conditions and the following disclaimer.
10234458Smav * 2. Redistributions in binary form must reproduce the above copyright
11234458Smav *    notice, this list of conditions and the following disclaimer in the
12234458Smav *    documentation and/or other materials provided with the distribution.
13234458Smav *
14234458Smav * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15234458Smav * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16234458Smav * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17234458Smav * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18234458Smav * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19234458Smav * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20234458Smav * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21234458Smav * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22234458Smav * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23234458Smav * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24234458Smav * SUCH DAMAGE.
25234458Smav */
26234458Smav
27234458Smav#include <sys/cdefs.h>
28234458Smav__FBSDID("$FreeBSD$");
29234458Smav
30234458Smav#include <sys/param.h>
31234458Smav#include <sys/bio.h>
32234458Smav#include <sys/endian.h>
33234458Smav#include <sys/kernel.h>
34234458Smav#include <sys/kobj.h>
35234458Smav#include <sys/limits.h>
36234458Smav#include <sys/lock.h>
37234458Smav#include <sys/malloc.h>
38234458Smav#include <sys/mutex.h>
39234458Smav#include <sys/sysctl.h>
40234458Smav#include <sys/systm.h>
41234458Smav#include <geom/geom.h>
42234458Smav#include "geom/raid/g_raid.h"
43234458Smav#include "g_raid_tr_if.h"
44234458Smav
45234458Smavstatic MALLOC_DEFINE(M_TR_RAID5, "tr_raid5_data", "GEOM_RAID RAID5 data");
46234458Smav
47234458Smav#define TR_RAID5_NONE 0
48234458Smav#define TR_RAID5_REBUILD 1
49234458Smav#define TR_RAID5_RESYNC 2
50234458Smav
51234458Smav#define TR_RAID5_F_DOING_SOME	0x1
52234458Smav#define TR_RAID5_F_LOCKED	0x2
53234458Smav#define TR_RAID5_F_ABORT	0x4
54234458Smav
55234458Smavstruct g_raid_tr_raid5_object {
56234458Smav	struct g_raid_tr_object	 trso_base;
57234458Smav	int			 trso_starting;
58234458Smav	int			 trso_stopping;
59234458Smav	int			 trso_type;
60234458Smav	int			 trso_recover_slabs; /* slabs before rest */
61234458Smav	int			 trso_fair_io;
62234458Smav	int			 trso_meta_update;
63234458Smav	int			 trso_flags;
64234458Smav	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
65234458Smav	void			*trso_buffer;	 /* Buffer space */
66234458Smav	struct bio		 trso_bio;
67234458Smav};
68234458Smav
69234458Smavstatic g_raid_tr_taste_t g_raid_tr_taste_raid5;
70234458Smavstatic g_raid_tr_event_t g_raid_tr_event_raid5;
71234458Smavstatic g_raid_tr_start_t g_raid_tr_start_raid5;
72234458Smavstatic g_raid_tr_stop_t g_raid_tr_stop_raid5;
73234458Smavstatic g_raid_tr_iostart_t g_raid_tr_iostart_raid5;
74234458Smavstatic g_raid_tr_iodone_t g_raid_tr_iodone_raid5;
75234458Smavstatic g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid5;
76234458Smavstatic g_raid_tr_locked_t g_raid_tr_locked_raid5;
77234458Smavstatic g_raid_tr_free_t g_raid_tr_free_raid5;
78234458Smav
79234458Smavstatic kobj_method_t g_raid_tr_raid5_methods[] = {
80234458Smav	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid5),
81234458Smav	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid5),
82234458Smav	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid5),
83234458Smav	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid5),
84234458Smav	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid5),
85234458Smav	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid5),
86234458Smav	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid5),
87234458Smav	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid5),
88234458Smav	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid5),
89234458Smav	{ 0, 0 }
90234458Smav};
91234458Smav
92234458Smavstatic struct g_raid_tr_class g_raid_tr_raid5_class = {
93234458Smav	"RAID5",
94234458Smav	g_raid_tr_raid5_methods,
95234458Smav	sizeof(struct g_raid_tr_raid5_object),
96240465Smav	.trc_enable = 1,
97234458Smav	.trc_priority = 100
98234458Smav};
99234458Smav
100234458Smavstatic int
101234458Smavg_raid_tr_taste_raid5(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
102234458Smav{
103234458Smav	struct g_raid_tr_raid5_object *trs;
104234458Smav	u_int qual;
105234458Smav
106234458Smav	trs = (struct g_raid_tr_raid5_object *)tr;
107234458Smav	qual = tr->tro_volume->v_raid_level_qualifier;
108234993Smav	if (tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID4 &&
109254269Smav	    (qual == G_RAID_VOLUME_RLQ_R4P0 ||
110254271Smav	     qual == G_RAID_VOLUME_RLQ_R4PN)) {
111234993Smav		/* RAID4 */
112234993Smav	} else if ((tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5 ||
113234993Smav	     tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5E ||
114234993Smav	     tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5EE ||
115235076Smav	     tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID5R ||
116234993Smav	     tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAID6 ||
117234993Smav	     tr->tro_volume->v_raid_level == G_RAID_VOLUME_RL_RAIDMDF) &&
118254269Smav	    (qual == G_RAID_VOLUME_RLQ_R5RA ||
119254269Smav	     qual == G_RAID_VOLUME_RLQ_R5RS ||
120254269Smav	     qual == G_RAID_VOLUME_RLQ_R5LA ||
121254269Smav	     qual == G_RAID_VOLUME_RLQ_R5LS)) {
122235076Smav		/* RAID5/5E/5EE/5R/6/MDF */
123234458Smav	} else
124234458Smav		return (G_RAID_TR_TASTE_FAIL);
125234458Smav	trs->trso_starting = 1;
126234458Smav	return (G_RAID_TR_TASTE_SUCCEED);
127234458Smav}
128234458Smav
129234458Smavstatic int
130234458Smavg_raid_tr_update_state_raid5(struct g_raid_volume *vol,
131234458Smav    struct g_raid_subdisk *sd)
132234458Smav{
133234458Smav	struct g_raid_tr_raid5_object *trs;
134234458Smav	struct g_raid_softc *sc;
135234458Smav	u_int s;
136234458Smav	int na, ns, nu;
137234458Smav
138234458Smav	sc = vol->v_softc;
139234458Smav	trs = (struct g_raid_tr_raid5_object *)vol->v_tr;
140234458Smav	if (trs->trso_stopping &&
141234458Smav	    (trs->trso_flags & TR_RAID5_F_DOING_SOME) == 0)
142234458Smav		s = G_RAID_VOLUME_S_STOPPED;
143234458Smav	else if (trs->trso_starting)
144234458Smav		s = G_RAID_VOLUME_S_STARTING;
145234458Smav	else {
146234458Smav		na = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE);
147234458Smav		ns = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
148234458Smav		     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
149234458Smav		nu = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
150234458Smav		if (na == vol->v_disks_count)
151234458Smav			s = G_RAID_VOLUME_S_OPTIMAL;
152234458Smav		else if (na + ns == vol->v_disks_count ||
153234458Smav		    na + ns + nu == vol->v_disks_count /* XXX: Temporary. */)
154234458Smav			s = G_RAID_VOLUME_S_SUBOPTIMAL;
155234458Smav		else if (na == vol->v_disks_count - 1 ||
156234458Smav		    na + ns + nu == vol->v_disks_count)
157234458Smav			s = G_RAID_VOLUME_S_DEGRADED;
158234458Smav		else
159234458Smav			s = G_RAID_VOLUME_S_BROKEN;
160234458Smav	}
161234458Smav	if (s != vol->v_state) {
162234458Smav		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
163234458Smav		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
164234458Smav		    G_RAID_EVENT_VOLUME);
165234458Smav		g_raid_change_volume_state(vol, s);
166234458Smav		if (!trs->trso_starting && !trs->trso_stopping)
167234458Smav			g_raid_write_metadata(sc, vol, NULL, NULL);
168234458Smav	}
169234458Smav	return (0);
170234458Smav}
171234458Smav
172234458Smavstatic int
173234458Smavg_raid_tr_event_raid5(struct g_raid_tr_object *tr,
174234458Smav    struct g_raid_subdisk *sd, u_int event)
175234458Smav{
176234458Smav
177234458Smav	g_raid_tr_update_state_raid5(tr->tro_volume, sd);
178234458Smav	return (0);
179234458Smav}
180234458Smav
181234458Smavstatic int
182234458Smavg_raid_tr_start_raid5(struct g_raid_tr_object *tr)
183234458Smav{
184234458Smav	struct g_raid_tr_raid5_object *trs;
185234458Smav	struct g_raid_volume *vol;
186234458Smav
187234458Smav	trs = (struct g_raid_tr_raid5_object *)tr;
188254275Smav	trs->trso_starting = 0;
189234458Smav	vol = tr->tro_volume;
190254275Smav	vol->v_read_only = 1;
191234458Smav	g_raid_tr_update_state_raid5(vol, NULL);
192234458Smav	return (0);
193234458Smav}
194234458Smav
195234458Smavstatic int
196234458Smavg_raid_tr_stop_raid5(struct g_raid_tr_object *tr)
197234458Smav{
198234458Smav	struct g_raid_tr_raid5_object *trs;
199234458Smav	struct g_raid_volume *vol;
200234458Smav
201234458Smav	trs = (struct g_raid_tr_raid5_object *)tr;
202234458Smav	vol = tr->tro_volume;
203234458Smav	trs->trso_starting = 0;
204234458Smav	trs->trso_stopping = 1;
205234458Smav	g_raid_tr_update_state_raid5(vol, NULL);
206234458Smav	return (0);
207234458Smav}
208234458Smav
209234458Smavstatic void
210234458Smavg_raid_tr_iostart_raid5_read(struct g_raid_tr_object *tr, struct bio *bp)
211234458Smav{
212234458Smav	struct g_raid_volume *vol;
213234458Smav	struct g_raid_subdisk *sd;
214234458Smav	struct bio_queue_head queue;
215234458Smav	struct bio *cbp;
216234458Smav	char *addr;
217234458Smav	off_t offset, start, length, nstripe, remain;
218235076Smav	int no, pno, ddisks, pdisks, protate, pleft;
219234993Smav	u_int strip_size, lvl, qual;
220234458Smav
221234458Smav	vol = tr->tro_volume;
222234458Smav	addr = bp->bio_data;
223234458Smav	strip_size = vol->v_strip_size;
224234993Smav	lvl = tr->tro_volume->v_raid_level;
225234458Smav	qual = tr->tro_volume->v_raid_level_qualifier;
226235076Smav	protate = tr->tro_volume->v_rotate_parity;
227234458Smav
228234458Smav	/* Stripe number. */
229234458Smav	nstripe = bp->bio_offset / strip_size;
230234458Smav	/* Start position in stripe. */
231234458Smav	start = bp->bio_offset % strip_size;
232234993Smav	/* Number of data and parity disks. */
233234993Smav	if (lvl == G_RAID_VOLUME_RL_RAIDMDF)
234235076Smav		pdisks = tr->tro_volume->v_mdf_pdisks;
235234993Smav	else if (lvl == G_RAID_VOLUME_RL_RAID5EE ||
236234993Smav	    lvl == G_RAID_VOLUME_RL_RAID6)
237234993Smav		pdisks = 2;
238234993Smav	else
239234993Smav		pdisks = 1;
240234993Smav	ddisks = vol->v_disks_count - pdisks;
241234458Smav	/* Parity disk number. */
242234993Smav	if (lvl == G_RAID_VOLUME_RL_RAID4) {
243234993Smav		if (qual == 0)		/* P0 */
244234993Smav			pno = 0;
245234993Smav		else			/* PN */
246234993Smav			pno = ddisks;
247235076Smav		pleft = -1;
248234993Smav	} else {
249235076Smav		pno = (nstripe / (ddisks * protate)) % vol->v_disks_count;
250235076Smav		pleft = protate - (nstripe / ddisks) % protate;
251234993Smav		if (qual >= 2) {	/* PN/Left */
252234993Smav			pno = ddisks - pno;
253234993Smav			if (pno < 0)
254234993Smav				pno += vol->v_disks_count;
255234993Smav		}
256234993Smav	}
257234993Smav	/* Data disk number. */
258234993Smav	no = nstripe % ddisks;
259234993Smav	if (lvl == G_RAID_VOLUME_RL_RAID4) {
260234993Smav		if (qual == 0)
261234993Smav			no += pdisks;
262234993Smav	} else if (qual & 1) {	/* Continuation/Symmetric */
263234993Smav		no = (pno + pdisks + no) % vol->v_disks_count;
264234993Smav	} else if (no >= pno)	/* Restart/Asymmetric */
265234993Smav		no += pdisks;
266234993Smav	else
267234993Smav		no += imax(0, pno + pdisks - vol->v_disks_count);
268234458Smav	/* Stripe start position in disk. */
269234993Smav	offset = (nstripe / ddisks) * strip_size;
270234458Smav	/* Length of data to operate. */
271234458Smav	remain = bp->bio_length;
272234458Smav
273234458Smav	bioq_init(&queue);
274234458Smav	do {
275234458Smav		length = MIN(strip_size - start, remain);
276234458Smav		cbp = g_clone_bio(bp);
277234458Smav		if (cbp == NULL)
278234458Smav			goto failure;
279234458Smav		cbp->bio_offset = offset + start;
280234458Smav		cbp->bio_data = addr;
281234458Smav		cbp->bio_length = length;
282234458Smav		cbp->bio_caller1 = &vol->v_subdisks[no];
283234458Smav		bioq_insert_tail(&queue, cbp);
284234458Smav		no++;
285234993Smav		if (lvl == G_RAID_VOLUME_RL_RAID4) {
286234458Smav			no %= vol->v_disks_count;
287234993Smav			if (no == pno)
288234993Smav				no = (no + pdisks) % vol->v_disks_count;
289234993Smav		} else if (qual & 1) {	/* Continuation/Symmetric */
290234993Smav			no %= vol->v_disks_count;
291234458Smav			if (no == pno) {
292235076Smav				if ((--pleft) <= 0) {
293235076Smav					pleft += protate;
294235076Smav					if (qual < 2)	/* P0/Right */
295235076Smav						pno++;
296235076Smav					else		/* PN/Left */
297235076Smav						pno += vol->v_disks_count - 1;
298235076Smav					pno %= vol->v_disks_count;
299235076Smav				}
300234993Smav				no = (pno + pdisks) % vol->v_disks_count;
301234458Smav				offset += strip_size;
302234458Smav			}
303234993Smav		} else {		/* Restart/Asymmetric */
304234458Smav			if (no == pno)
305234993Smav				no += pdisks;
306234458Smav			if (no >= vol->v_disks_count) {
307234993Smav				no -= vol->v_disks_count;
308235076Smav				if ((--pleft) <= 0) {
309235076Smav					pleft += protate;
310235076Smav					if (qual < 2)	/* P0/Right */
311235076Smav						pno++;
312235076Smav					else		/* PN/Left */
313235076Smav						pno += vol->v_disks_count - 1;
314235076Smav					pno %= vol->v_disks_count;
315235076Smav				}
316234993Smav				if (no == pno)
317234993Smav					no += pdisks;
318234458Smav				else
319234993Smav					no += imax(0, pno + pdisks - vol->v_disks_count);
320234458Smav				offset += strip_size;
321234458Smav			}
322234458Smav		}
323234458Smav		remain -= length;
324234458Smav		addr += length;
325234458Smav		start = 0;
326234458Smav	} while (remain > 0);
327260385Sscottl	while ((cbp = bioq_takefirst(&queue)) != NULL) {
328234458Smav		sd = cbp->bio_caller1;
329234458Smav		cbp->bio_caller1 = NULL;
330234458Smav		g_raid_subdisk_iostart(sd, cbp);
331234458Smav	}
332234458Smav	return;
333234458Smavfailure:
334260385Sscottl	while ((cbp = bioq_takefirst(&queue)) != NULL)
335234458Smav		g_destroy_bio(cbp);
336234458Smav	if (bp->bio_error == 0)
337234458Smav		bp->bio_error = ENOMEM;
338234458Smav	g_raid_iodone(bp, bp->bio_error);
339234458Smav}
340234458Smav
341234458Smavstatic void
342234458Smavg_raid_tr_iostart_raid5(struct g_raid_tr_object *tr, struct bio *bp)
343234458Smav{
344234458Smav	struct g_raid_volume *vol;
345234458Smav	struct g_raid_tr_raid5_object *trs;
346234458Smav
347234458Smav	vol = tr->tro_volume;
348234458Smav	trs = (struct g_raid_tr_raid5_object *)tr;
349234458Smav	if (vol->v_state < G_RAID_VOLUME_S_SUBOPTIMAL) {
350234458Smav		g_raid_iodone(bp, EIO);
351234458Smav		return;
352234458Smav	}
353234458Smav	switch (bp->bio_cmd) {
354234458Smav	case BIO_READ:
355234458Smav		g_raid_tr_iostart_raid5_read(tr, bp);
356234458Smav		break;
357234458Smav	case BIO_WRITE:
358234458Smav	case BIO_DELETE:
359234458Smav	case BIO_FLUSH:
360234458Smav		g_raid_iodone(bp, ENODEV);
361234458Smav		break;
362234458Smav	default:
363234458Smav		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
364234458Smav		    bp->bio_cmd, vol->v_name));
365234458Smav		break;
366234458Smav	}
367234458Smav}
368234458Smav
369234458Smavstatic void
370234458Smavg_raid_tr_iodone_raid5(struct g_raid_tr_object *tr,
371234458Smav    struct g_raid_subdisk *sd, struct bio *bp)
372234458Smav{
373234458Smav	struct bio *pbp;
374234458Smav	int error;
375234458Smav
376234458Smav	pbp = bp->bio_parent;
377234458Smav	pbp->bio_inbed++;
378234458Smav	error = bp->bio_error;
379234458Smav	g_destroy_bio(bp);
380234458Smav	if (pbp->bio_children == pbp->bio_inbed) {
381234458Smav		pbp->bio_completed = pbp->bio_length;
382234458Smav		g_raid_iodone(pbp, error);
383234458Smav	}
384234458Smav}
385234458Smav
386234458Smavstatic int
387234458Smavg_raid_tr_kerneldump_raid5(struct g_raid_tr_object *tr,
388234458Smav    void *virtual, vm_offset_t physical, off_t offset, size_t length)
389234458Smav{
390234458Smav
391234458Smav	return (ENODEV);
392234458Smav}
393234458Smav
394234458Smavstatic int
395234458Smavg_raid_tr_locked_raid5(struct g_raid_tr_object *tr, void *argp)
396234458Smav{
397234458Smav	struct bio *bp;
398234458Smav	struct g_raid_subdisk *sd;
399234458Smav
400234458Smav	bp = (struct bio *)argp;
401234458Smav	sd = (struct g_raid_subdisk *)bp->bio_caller1;
402234458Smav	g_raid_subdisk_iostart(sd, bp);
403234458Smav
404234458Smav	return (0);
405234458Smav}
406234458Smav
407234458Smavstatic int
408234458Smavg_raid_tr_free_raid5(struct g_raid_tr_object *tr)
409234458Smav{
410234458Smav	struct g_raid_tr_raid5_object *trs;
411234458Smav
412234458Smav	trs = (struct g_raid_tr_raid5_object *)tr;
413234458Smav
414234458Smav	if (trs->trso_buffer != NULL) {
415234458Smav		free(trs->trso_buffer, M_TR_RAID5);
416234458Smav		trs->trso_buffer = NULL;
417234458Smav	}
418234458Smav	return (0);
419234458Smav}
420234458Smav
421240465SmavG_RAID_TR_DECLARE(raid5, "RAID5");
422