block_if.c revision 280370
/*-
 * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 280370 2015-03-23 14:36:53Z mav $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 280370 2015-03-23 14:36:53Z mav $");

#include <sys/param.h>
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>

#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <unistd.h>

#include <machine/atomic.h>

#include "bhyverun.h"
#include "mevent.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

#define BLOCKIF_MAXREQ	33

enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH,
	BOP_DELETE
};

enum blockstat {
	BST_FREE,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

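/*
 * A blockif_elem tracks one guest request as it moves between the
 * per-context free, pending and busy queues.
 */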
struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req  *be_req;
	enum blockop	     be_op;
	enum blockstat	     be_status;
	pthread_t	     be_tid;
};

struct blockif_ctxt {
	int			bc_magic;
	int			bc_fd;
	int			bc_ischr;
	int			bc_candelete;
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	pthread_t		bc_btid;
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	int			bc_closing;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	u_int			bc_req_count;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

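/*
 * A blockif_sig_elem is pushed onto the global lock-free list by
 * blockif_cancel() and completed by blockif_sigcont_handler() once
 * the SIGCONT used to interrupt an in-flight i/o has been processed.
 */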
struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;
	pthread_cond_t			bse_cond;
	int				bse_pending;
	struct blockif_sig_elem		*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

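/*
 * Take an element from the free list, mark it pending and append it to
 * the pending queue.  Called with bc_mtx held, after the caller has
 * verified that a free slot is available.
 */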
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be;

	assert(bc->bc_req_count < BLOCKIF_MAXREQ);

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);

	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_status = BST_PEND;
	be->be_req = breq;
	be->be_op = op;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);

	bc->bc_req_count++;

	return (0);
}

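/*
 * Move the oldest pending request to the busy queue and hand it back to
 * the caller.  Returns ENOENT if nothing is pending.  Called with bc_mtx
 * held by the block i/o thread.
 */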
static int
blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	if (bc->bc_req_count == 0)
		return (ENOENT);

	be = TAILQ_FIRST(&bc->bc_pendq);
	assert(be != NULL);
	assert(be->be_status == BST_PEND);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = bc->bc_btid;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);

	*bep = be;

	return (0);
}

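/*
 * Return a completed element to the free list.  Called with bc_mtx held.
 */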
static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	assert(be->be_status == BST_DONE);

	TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);

	bc->bc_req_count--;
}

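/*
 * Perform the actual i/o for a single request: preadv(2)/pwritev(2) for
 * reads and writes, fsync(2) or DIOCGFLUSH for flushes, and DIOCGDELETE
 * for deletes on character devices that support it.  Runs in the block
 * i/o thread without bc_mtx held and invokes the request's completion
 * callback when finished.
 */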
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_req *br;
	off_t arg[2];
	int err;

	br = be->be_req;
	err = 0;

	switch (be->be_op) {
	case BOP_READ:
		if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			   br->br_offset) < 0)
			err = errno;
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly)
			err = EROFS;
		else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			     br->br_offset) < 0)
			err = errno;
		break;
	case BOP_FLUSH:
		if (bc->bc_ischr) {
			if (ioctl(bc->bc_fd, DIOCGFLUSH))
				err = errno;
		} else if (fsync(bc->bc_fd))
			err = errno;
		break;
	case BOP_DELETE:
		if (!bc->bc_candelete)
			err = EOPNOTSUPP;
		else if (bc->bc_rdonly)
			err = EROFS;
		else if (bc->bc_ischr) {
			arg[0] = br->br_offset;
			arg[1] = br->br_iov[0].iov_len;
			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
				err = errno;
		} else
			err = EOPNOTSUPP;
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

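/*
 * Per-device worker thread: drain the pending queue, processing each
 * request in turn, then sleep on bc_cond until more work arrives or the
 * context is being closed.
 */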
static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;

	bc = arg;

	for (;;) {
		pthread_mutex_lock(&bc->bc_mtx);
		while (!blockif_dequeue(bc, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}
		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
		pthread_mutex_unlock(&bc->bc_mtx);

		/*
		 * Check ctxt status here to see if exit requested
		 */
		if (bc->bc_closing)
			pthread_exit(NULL);
	}

	/* Not reached */
	return (NULL);
}

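/*
 * SIGCONT handler, run from the mevent loop.  blockif_cancel() uses
 * SIGCONT to interrupt a worker thread blocked in i/o; wake every
 * waiter on the global blockif_sig_elem list.
 */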
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)bse,
					    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

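/*
 * One-time initialization (via pthread_once): route SIGCONT through
 * mevent so blockif_sigcont_handler() runs, and ignore its default
 * disposition.
 */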
static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}

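/*
 * Open a backing store described by "path[,option ...]", where the
 * recognized options are "nocache", "sync" and "ro".  Determine the
 * size and sector geometry, set up the request queues and start the
 * per-device block i/o thread.
 */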
struct blockif_ctxt *
blockif_open(const char *optstr, const char *ident)
{
	char tname[MAXCOMLEN + 1];
	char *nopt, *xopts;
	struct blockif_ctxt *bc;
	struct stat sbuf;
	struct diocgattr_arg arg;
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int nocache, sync, ro, candelete;

	pthread_once(&blockif_once, blockif_init);

	nocache = 0;
	sync = 0;
	ro = 0;

	/*
	 * The first element in the optstring is always a pathname.
	 * Optional elements follow
	 */
	nopt = strdup(optstr);
	for (xopts = strtok(nopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {
		if (!strcmp(xopts, "nocache"))
			nocache = 1;
		else if (!strcmp(xopts, "sync"))
			sync = 1;
		else if (!strcmp(xopts, "ro"))
			ro = 1;
	}

	extra = 0;
	if (nocache)
		extra |= O_DIRECT;
	if (sync)
		extra |= O_SYNC;

	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* The read/write open failed; retry read-only */
		fd = open(nopt, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		perror("Could not open backing file");
		return (NULL);
	}

	if (fstat(fd, &sbuf) < 0) {
		perror("Could not stat backing file");
		close(fd);
		return (NULL);
	}

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	candelete = 0;
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			close(fd);
			return (NULL);
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
		arg.len = sizeof(arg.value.i);
		if (ioctl(fd, DIOCGATTR, &arg) == 0)
			candelete = arg.value.i;
	} else
		psectsz = sbuf.st_blksize;

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		close(fd);
		return (NULL);
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
	bc->bc_candelete = candelete;
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	bc->bc_req_count = 0;
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	pthread_create(&bc->bc_btid, NULL, blockif_thr, bc);

	snprintf(tname, sizeof(tname), "blk-%s", ident);
	pthread_set_name_np(bc->bc_btid, tname);

	return (bc);
}

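/*
 * Common entry point for the public request routines below: enqueue the
 * request and wake the block i/o thread, or fail with E2BIG if the
 * queue is already at its limit.
 */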
static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_req_count < BLOCKIF_MAXREQ) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		blockif_enqueue(bc, breq, op);
		pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

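/*
 * Public request entry points.  A device model typically fills in a
 * struct blockif_req and issues the request asynchronously, e.g.
 * (illustrative sketch only; "buf", "len", "lba" and "my_done" are
 * hypothetical caller-side names):
 *
 *	req->br_iov[0].iov_base = buf;
 *	req->br_iov[0].iov_len = len;
 *	req->br_iovcnt = 1;
 *	req->br_offset = lba * blockif_sectsz(bc);
 *	req->br_callback = my_done;
 *	error = blockif_read(bc, req);
 *
 * The callback is invoked as my_done(req, err) from the block i/o
 * thread once the request completes or fails.
 */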
int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

int
blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_DELETE));
}

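/*
 * Attempt to cancel a queued or in-flight request.  A request still on
 * the pending queue is simply released; an in-flight one is interrupted
 * with SIGCONT and EBUSY is returned, since its completion callback may
 * already have run.
 */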
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
		be->be_status = BST_FREE;
		be->be_req = NULL;
		TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
		bc->bc_req_count--;
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)old_head,
					    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

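/*
 * Shut down the block i/o thread and release the context.  Queued i/o
 * is not cancelled (see the XXX below).
 */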
int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int err;

	err = 0;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	bc->bc_closing = 1;
	pthread_cond_signal(&bc->bc_cond);
	pthread_join(bc->bc_btid, &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535UL*16*255)
		sectors = 65535UL*16*255;

	if (sectors >= 65536UL*16*63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}

int
blockif_candelete(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_candelete);
}