block_if.c revision 280746
1/*-
2 * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 280746 2015-03-27 08:55:54Z mav $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 280746 2015-03-27 08:55:54Z mav $");
31
32#include <sys/param.h>
33#include <sys/queue.h>
34#include <sys/errno.h>
35#include <sys/stat.h>
36#include <sys/ioctl.h>
37#include <sys/disk.h>
38
39#include <assert.h>
40#include <fcntl.h>
41#include <stdio.h>
42#include <stdlib.h>
43#include <string.h>
44#include <pthread.h>
45#include <pthread_np.h>
46#include <signal.h>
47#include <unistd.h>
48
49#include <machine/atomic.h>
50
51#include "bhyverun.h"
52#include "mevent.h"
53#include "block_if.h"
54
/* Magic value stored in bc_magic; the public entry points assert on it. */
#define BLOCKIF_SIG	0xb109b109

/* Number of request elements (bc_reqs[]) preallocated per context. */
#define BLOCKIF_MAXREQ	64
/* Number of worker threads (bc_btid[]) started per context. */
#define BLOCKIF_NUMTHR	8
59
/*
 * Operation carried by a queued request element; blockif_proc()
 * switches on this value to pick the I/O primitive.
 */
enum blockop {
	BOP_READ = 0,		/* preadv() from the backing fd */
	BOP_WRITE = 1,		/* pwritev() to the backing fd */
	BOP_FLUSH = 2,		/* DIOCGFLUSH ioctl or fsync() */
	BOP_DELETE = 3		/* DIOCGDELETE ioctl */
};
66
/*
 * Life-cycle state of a request element.
 */
enum blockstat {
	BST_FREE = 0,		/* on the free queue, not in use */
	BST_BLOCK = 1,		/* on the pending queue, held back by enqueue */
	BST_PEND = 2,		/* on the pending queue, ready for a worker */
	BST_BUSY = 3,		/* on the busy queue, owned by a worker */
	BST_DONE = 4		/* I/O finished, awaiting blockif_complete() */
};
74
/*
 * One slot of the per-context request table.  A slot is always linked on
 * exactly one of the free, pending or busy queues of its context.
 */
struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;	/* queue linkage */
	struct blockif_req  *be_req;	/* caller's request; NULL when free */
	enum blockop	     be_op;	/* operation to perform */
	enum blockstat	     be_status;	/* current life-cycle state */
	pthread_t            be_tid;	/* worker that dequeued the element */
	off_t		     be_block;	/* end offset of the request (OFF_MAX for flush) */
};
83
/*
 * Per-backing-store state created by blockif_open() and released by
 * blockif_close().  The queues and bc_closing are accessed with bc_mtx held.
 */
struct blockif_ctxt {
	int			bc_magic;	/* BLOCKIF_SIG while the context is valid */
	int			bc_fd;		/* descriptor of the backing file/device */
	int			bc_ischr;	/* non-zero if backing object is a character device */
	int			bc_candelete;	/* value of the GEOM::candelete attribute */
	int			bc_rdonly;	/* non-zero if opened read-only */
	off_t			bc_size;	/* st_size, or DIOCGMEDIASIZE for devices */
	int			bc_sectsz;	/* DEV_BSIZE, or DIOCGSECTORSIZE for devices */
	int			bc_psectsz;	/* st_blksize, or DIOCGSTRIPESIZE for devices */
	int			bc_psectoff;	/* DIOCGSTRIPEOFFSET for devices, else 0 */
	int			bc_closing;	/* set by blockif_close() to stop the workers */
	pthread_t		bc_btid[BLOCKIF_NUMTHR];	/* worker thread ids */
        pthread_mutex_t		bc_mtx;		/* protects the queues below */
        pthread_cond_t		bc_cond;	/* signalled when work is queued or on close */

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};
105
/* Guards the one-time call to blockif_init() made from blockif_open(). */
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

/*
 * Wait record used by blockif_cancel().  The canceller pushes one of these
 * onto blockif_bse_head and sleeps on bse_cond until the SIGCONT handler
 * pops it and clears bse_pending.
 */
struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;	/* protects bse_pending */
	pthread_cond_t			bse_cond;	/* signalled by the SIGCONT handler */
	int				bse_pending;	/* 1 until the handler has run */
	struct blockif_sig_elem		*bse_next;	/* next record on the list */
};

/* Head of the singly-linked list of wait records; updated with atomic_cmpset_ptr(). */
static struct blockif_sig_elem *blockif_bse_head;
116
117static int
118blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
119		enum blockop op)
120{
121	struct blockif_elem *be, *tbe;
122	off_t off;
123	int i;
124
125	be = TAILQ_FIRST(&bc->bc_freeq);
126	assert(be != NULL);
127	assert(be->be_status == BST_FREE);
128	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
129	be->be_req = breq;
130	be->be_op = op;
131	switch (op) {
132	case BOP_READ:
133	case BOP_WRITE:
134	case BOP_DELETE:
135		off = breq->br_offset;
136		for (i = 0; i < breq->br_iovcnt; i++)
137			off += breq->br_iov[i].iov_len;
138		break;
139	default:
140		off = OFF_MAX;
141	}
142	be->be_block = off;
143	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
144		if (tbe->be_block == breq->br_offset)
145			break;
146	}
147	if (tbe == NULL) {
148		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
149			if (tbe->be_block == breq->br_offset)
150				break;
151		}
152	}
153	if (tbe == NULL)
154		be->be_status = BST_PEND;
155	else
156		be->be_status = BST_BLOCK;
157	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
158	return (be->be_status == BST_PEND);
159}
160
161static int
162blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
163{
164	struct blockif_elem *be;
165
166	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
167		if (be->be_status == BST_PEND)
168			break;
169		assert(be->be_status == BST_BLOCK);
170	}
171	if (be == NULL)
172		return (0);
173	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
174	be->be_status = BST_BUSY;
175	be->be_tid = t;
176	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
177	*bep = be;
178	return (1);
179}
180
181static void
182blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
183{
184	struct blockif_elem *tbe;
185
186	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
187		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
188	else
189		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
190	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
191		if (tbe->be_req->br_offset == be->be_block)
192			tbe->be_status = BST_PEND;
193	}
194	be->be_tid = 0;
195	be->be_status = BST_FREE;
196	be->be_req = NULL;
197	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
198}
199
200static void
201blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
202{
203	struct blockif_req *br;
204	off_t arg[2];
205	int err;
206
207	br = be->be_req;
208	err = 0;
209
210	switch (be->be_op) {
211	case BOP_READ:
212		if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
213			   br->br_offset) < 0)
214			err = errno;
215		break;
216	case BOP_WRITE:
217		if (bc->bc_rdonly)
218			err = EROFS;
219		else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
220			     br->br_offset) < 0)
221			err = errno;
222		break;
223	case BOP_FLUSH:
224		if (bc->bc_ischr) {
225			if (ioctl(bc->bc_fd, DIOCGFLUSH))
226				err = errno;
227		} else if (fsync(bc->bc_fd))
228			err = errno;
229		break;
230	case BOP_DELETE:
231		if (!bc->bc_candelete)
232			err = EOPNOTSUPP;
233		else if (bc->bc_rdonly)
234			err = EROFS;
235		else if (bc->bc_ischr) {
236			arg[0] = br->br_offset;
237			arg[1] = br->br_iov[0].iov_len;
238			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
239				err = errno;
240		} else
241			err = EOPNOTSUPP;
242		break;
243	default:
244		err = EINVAL;
245		break;
246	}
247
248	be->be_status = BST_DONE;
249
250	(*br->br_callback)(br, err);
251}
252
253static void *
254blockif_thr(void *arg)
255{
256	struct blockif_ctxt *bc;
257	struct blockif_elem *be;
258	pthread_t t;
259
260	bc = arg;
261	t = pthread_self();
262
263	pthread_mutex_lock(&bc->bc_mtx);
264	for (;;) {
265		while (blockif_dequeue(bc, t, &be)) {
266			pthread_mutex_unlock(&bc->bc_mtx);
267			blockif_proc(bc, be);
268			pthread_mutex_lock(&bc->bc_mtx);
269			blockif_complete(bc, be);
270		}
271		/* Check ctxt status here to see if exit requested */
272		if (bc->bc_closing)
273			break;
274		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
275	}
276	pthread_mutex_unlock(&bc->bc_mtx);
277
278	pthread_exit(NULL);
279	return (NULL);
280}
281
282static void
283blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
284{
285	struct blockif_sig_elem *bse;
286
287	for (;;) {
288		/*
289		 * Process the entire list even if not intended for
290		 * this thread.
291		 */
292		do {
293			bse = blockif_bse_head;
294			if (bse == NULL)
295				return;
296		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
297					    (uintptr_t)bse,
298					    (uintptr_t)bse->bse_next));
299
300		pthread_mutex_lock(&bse->bse_mtx);
301		bse->bse_pending = 0;
302		pthread_cond_signal(&bse->bse_cond);
303		pthread_mutex_unlock(&bse->bse_mtx);
304	}
305}
306
/*
 * One-time module initialisation, run through pthread_once() from
 * blockif_open().
 */
static void
blockif_init(void)
{
	/* Route SIGCONT events to blockif_sigcont_handler() via mevent. */
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	/* Set the ordinary SIGCONT disposition to "ignore". */
	(void) signal(SIGCONT, SIG_IGN);
}
313
314struct blockif_ctxt *
315blockif_open(const char *optstr, const char *ident)
316{
317	char tname[MAXCOMLEN + 1];
318	char *nopt, *xopts;
319	struct blockif_ctxt *bc;
320	struct stat sbuf;
321	struct diocgattr_arg arg;
322	off_t size, psectsz, psectoff;
323	int extra, fd, i, sectsz;
324	int nocache, sync, ro, candelete;
325
326	pthread_once(&blockif_once, blockif_init);
327
328	nocache = 0;
329	sync = 0;
330	ro = 0;
331
332	/*
333	 * The first element in the optstring is always a pathname.
334	 * Optional elements follow
335	 */
336	nopt = strdup(optstr);
337	for (xopts = strtok(nopt, ",");
338	     xopts != NULL;
339	     xopts = strtok(NULL, ",")) {
340		if (!strcmp(xopts, "nocache"))
341			nocache = 1;
342		else if (!strcmp(xopts, "sync"))
343			sync = 1;
344		else if (!strcmp(xopts, "ro"))
345			ro = 1;
346	}
347
348	extra = 0;
349	if (nocache)
350		extra |= O_DIRECT;
351	if (sync)
352		extra |= O_SYNC;
353
354	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
355	if (fd < 0 && !ro) {
356		/* Attempt a r/w fail with a r/o open */
357		fd = open(nopt, O_RDONLY | extra);
358		ro = 1;
359	}
360
361	if (fd < 0) {
362		perror("Could not open backing file");
363		return (NULL);
364	}
365
366        if (fstat(fd, &sbuf) < 0) {
367                perror("Could not stat backing file");
368                close(fd);
369                return (NULL);
370        }
371
372        /*
373	 * Deal with raw devices
374	 */
375        size = sbuf.st_size;
376	sectsz = DEV_BSIZE;
377	psectsz = psectoff = 0;
378	candelete = 0;
379	if (S_ISCHR(sbuf.st_mode)) {
380		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
381		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
382			perror("Could not fetch dev blk/sector size");
383			close(fd);
384			return (NULL);
385		}
386		assert(size != 0);
387		assert(sectsz != 0);
388		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
389			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
390		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
391		arg.len = sizeof(arg.value.i);
392		if (ioctl(fd, DIOCGATTR, &arg) == 0)
393			candelete = arg.value.i;
394	} else
395		psectsz = sbuf.st_blksize;
396
397	bc = calloc(1, sizeof(struct blockif_ctxt));
398	if (bc == NULL) {
399		close(fd);
400		return (NULL);
401	}
402
403	bc->bc_magic = BLOCKIF_SIG;
404	bc->bc_fd = fd;
405	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
406	bc->bc_candelete = candelete;
407	bc->bc_rdonly = ro;
408	bc->bc_size = size;
409	bc->bc_sectsz = sectsz;
410	bc->bc_psectsz = psectsz;
411	bc->bc_psectoff = psectoff;
412	pthread_mutex_init(&bc->bc_mtx, NULL);
413	pthread_cond_init(&bc->bc_cond, NULL);
414	TAILQ_INIT(&bc->bc_freeq);
415	TAILQ_INIT(&bc->bc_pendq);
416	TAILQ_INIT(&bc->bc_busyq);
417	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
418		bc->bc_reqs[i].be_status = BST_FREE;
419		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
420	}
421
422	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
423		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
424		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
425		pthread_set_name_np(bc->bc_btid[i], tname);
426	}
427
428	return (bc);
429}
430
431static int
432blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
433		enum blockop op)
434{
435	int err;
436
437	err = 0;
438
439	pthread_mutex_lock(&bc->bc_mtx);
440	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
441		/*
442		 * Enqueue and inform the block i/o thread
443		 * that there is work available
444		 */
445		if (blockif_enqueue(bc, breq, op))
446			pthread_cond_signal(&bc->bc_cond);
447	} else {
448		/*
449		 * Callers are not allowed to enqueue more than
450		 * the specified blockif queue limit. Return an
451		 * error to indicate that the queue length has been
452		 * exceeded.
453		 */
454		err = E2BIG;
455	}
456	pthread_mutex_unlock(&bc->bc_mtx);
457
458	return (err);
459}
460
461int
462blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
463{
464
465	assert(bc->bc_magic == BLOCKIF_SIG);
466	return (blockif_request(bc, breq, BOP_READ));
467}
468
469int
470blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
471{
472
473	assert(bc->bc_magic == BLOCKIF_SIG);
474	return (blockif_request(bc, breq, BOP_WRITE));
475}
476
477int
478blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
479{
480
481	assert(bc->bc_magic == BLOCKIF_SIG);
482	return (blockif_request(bc, breq, BOP_FLUSH));
483}
484
485int
486blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
487{
488
489	assert(bc->bc_magic == BLOCKIF_SIG);
490	return (blockif_request(bc, breq, BOP_DELETE));
491}
492
493int
494blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
495{
496	struct blockif_elem *be;
497
498	assert(bc->bc_magic == BLOCKIF_SIG);
499
500	pthread_mutex_lock(&bc->bc_mtx);
501	/*
502	 * Check pending requests.
503	 */
504	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
505		if (be->be_req == breq)
506			break;
507	}
508	if (be != NULL) {
509		/*
510		 * Found it.
511		 */
512		blockif_complete(bc, be);
513		pthread_mutex_unlock(&bc->bc_mtx);
514
515		return (0);
516	}
517
518	/*
519	 * Check in-flight requests.
520	 */
521	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
522		if (be->be_req == breq)
523			break;
524	}
525	if (be == NULL) {
526		/*
527		 * Didn't find it.
528		 */
529		pthread_mutex_unlock(&bc->bc_mtx);
530		return (EINVAL);
531	}
532
533	/*
534	 * Interrupt the processing thread to force it return
535	 * prematurely via it's normal callback path.
536	 */
537	while (be->be_status == BST_BUSY) {
538		struct blockif_sig_elem bse, *old_head;
539
540		pthread_mutex_init(&bse.bse_mtx, NULL);
541		pthread_cond_init(&bse.bse_cond, NULL);
542
543		bse.bse_pending = 1;
544
545		do {
546			old_head = blockif_bse_head;
547			bse.bse_next = old_head;
548		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
549					    (uintptr_t)old_head,
550					    (uintptr_t)&bse));
551
552		pthread_kill(be->be_tid, SIGCONT);
553
554		pthread_mutex_lock(&bse.bse_mtx);
555		while (bse.bse_pending)
556			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
557		pthread_mutex_unlock(&bse.bse_mtx);
558	}
559
560	pthread_mutex_unlock(&bc->bc_mtx);
561
562	/*
563	 * The processing thread has been interrupted.  Since it's not
564	 * clear if the callback has been invoked yet, return EBUSY.
565	 */
566	return (EBUSY);
567}
568
569int
570blockif_close(struct blockif_ctxt *bc)
571{
572	void *jval;
573	int err, i;
574
575	err = 0;
576
577	assert(bc->bc_magic == BLOCKIF_SIG);
578
579	/*
580	 * Stop the block i/o thread
581	 */
582	pthread_mutex_lock(&bc->bc_mtx);
583	bc->bc_closing = 1;
584	pthread_mutex_unlock(&bc->bc_mtx);
585	pthread_cond_broadcast(&bc->bc_cond);
586	for (i = 0; i < BLOCKIF_NUMTHR; i++)
587		pthread_join(bc->bc_btid[i], &jval);
588
589	/* XXX Cancel queued i/o's ??? */
590
591	/*
592	 * Release resources
593	 */
594	bc->bc_magic = 0;
595	close(bc->bc_fd);
596	free(bc);
597
598	return (0);
599}
600
601/*
602 * Return virtual C/H/S values for a given block. Use the algorithm
603 * outlined in the VHD specification to calculate values.
604 */
605void
606blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
607{
608	off_t sectors;		/* total sectors of the block dev */
609	off_t hcyl;		/* cylinders times heads */
610	uint16_t secpt;		/* sectors per track */
611	uint8_t heads;
612
613	assert(bc->bc_magic == BLOCKIF_SIG);
614
615	sectors = bc->bc_size / bc->bc_sectsz;
616
617	/* Clamp the size to the largest possible with CHS */
618	if (sectors > 65535UL*16*255)
619		sectors = 65535UL*16*255;
620
621	if (sectors >= 65536UL*16*63) {
622		secpt = 255;
623		heads = 16;
624		hcyl = sectors / secpt;
625	} else {
626		secpt = 17;
627		hcyl = sectors / secpt;
628		heads = (hcyl + 1023) / 1024;
629
630		if (heads < 4)
631			heads = 4;
632
633		if (hcyl >= (heads * 1024) || heads > 16) {
634			secpt = 31;
635			heads = 16;
636			hcyl = sectors / secpt;
637		}
638		if (hcyl >= (heads * 1024)) {
639			secpt = 63;
640			heads = 16;
641			hcyl = sectors / secpt;
642		}
643	}
644
645	*c = hcyl / heads;
646	*h = heads;
647	*s = secpt;
648}
649
650/*
651 * Accessors
652 */
653off_t
654blockif_size(struct blockif_ctxt *bc)
655{
656
657	assert(bc->bc_magic == BLOCKIF_SIG);
658	return (bc->bc_size);
659}
660
661int
662blockif_sectsz(struct blockif_ctxt *bc)
663{
664
665	assert(bc->bc_magic == BLOCKIF_SIG);
666	return (bc->bc_sectsz);
667}
668
/*
 * Report the recorded physical sector size and offset: '*size' receives
 * bc_psectsz and '*off' receives bc_psectoff.
 */
void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}
677
678int
679blockif_queuesz(struct blockif_ctxt *bc)
680{
681
682	assert(bc->bc_magic == BLOCKIF_SIG);
683	return (BLOCKIF_MAXREQ - 1);
684}
685
686int
687blockif_is_ro(struct blockif_ctxt *bc)
688{
689
690	assert(bc->bc_magic == BLOCKIF_SIG);
691	return (bc->bc_rdonly);
692}
693
694int
695blockif_candelete(struct blockif_ctxt *bc)
696{
697
698	assert(bc->bc_magic == BLOCKIF_SIG);
699	return (bc->bc_candelete);
700}
701