1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
5 * All rights reserved.
6 * Copyright 2020 Joyent, Inc.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27 * SUCH DAMAGE.
28 */
29
30#include <sys/param.h>
31#ifndef WITHOUT_CAPSICUM
32#include <sys/capsicum.h>
33#endif
34#include <sys/queue.h>
35#include <sys/errno.h>
36#include <sys/stat.h>
37#include <sys/ioctl.h>
38#include <sys/disk.h>
39
40#include <assert.h>
41#ifndef WITHOUT_CAPSICUM
42#include <capsicum_helpers.h>
43#endif
44#include <err.h>
45#include <fcntl.h>
46#include <stdio.h>
47#include <stdlib.h>
48#include <string.h>
49#include <pthread.h>
50#include <pthread_np.h>
51#include <signal.h>
52#include <sysexits.h>
53#include <unistd.h>
54
55#include <machine/atomic.h>
56#include <machine/vmm_snapshot.h>
57
58#include "bhyverun.h"
59#include "config.h"
60#include "debug.h"
61#include "mevent.h"
62#include "pci_emul.h"
63#include "block_if.h"
64
/* Magic value stored in bc_magic; the public entry points assert on it. */
#define BLOCKIF_SIG	0xb109b109

/* Number of worker threads created for each block interface context. */
#define BLOCKIF_NUMTHR	8
/* Number of request elements preallocated in each context (bc_reqs[]). */
#define BLOCKIF_MAXREQ	(BLOCKIF_RING_MAX + BLOCKIF_NUMTHR)
69
/* Operation carried by a queued request element (see blockif_proc()). */
enum blockop {
	BOP_READ,	/* read from the backing store into the request iovecs */
	BOP_WRITE,	/* write the request iovecs to the backing store */
	BOP_FLUSH,	/* flush the backing store (DIOCGFLUSH or fsync) */
	BOP_DELETE	/* deallocate a range (DIOCGDELETE or fspacectl) */
};
76
/* Life-cycle state of a request element. */
enum blockstat {
	BST_FREE,	/* on the free queue, not associated with a request */
	BST_BLOCK,	/* on the pending queue, held back by blockif_enqueue() */
	BST_PEND,	/* on the pending queue, eligible for blockif_dequeue() */
	BST_BUSY,	/* on the busy queue, claimed by a worker thread */
	BST_DONE	/* blockif_proc() has finished with the request */
};
84
/* Queue element tracking one request through the free/pending/busy queues. */
struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;	/* linkage on one of the queues */
	struct blockif_req  *be_req;	/* caller's request; NULL when free */
	enum blockop	     be_op;	/* operation to perform */
	enum blockstat	     be_status;	/* current life-cycle state */
	pthread_t            be_tid;	/* worker that dequeued the element */
	/*
	 * For read/write/delete: br_offset plus the summed iovec lengths,
	 * i.e. the offset just past the request.  OFF_MAX for other ops.
	 */
	off_t		     be_block;
};
93
/* Per-device state for one open block backend. */
struct blockif_ctxt {
	unsigned int		bc_magic;	/* BLOCKIF_SIG while valid */
	int			bc_fd;		/* backing file/device */
	int			bc_ischr;	/* backing store is a char device */
	int			bc_isgeom;	/* DIOCGPROVIDERNAME succeeded */
	int			bc_candelete;	/* BOP_DELETE is supported */
	int			bc_rdonly;	/* opened read-only */
	off_t			bc_size;	/* media size in bytes */
	int			bc_sectsz;	/* logical sector size */
	int			bc_psectsz;	/* physical sector (stripe) size */
	int			bc_psectoff;	/* physical sector (stripe) offset */
	int			bc_closing;	/* set by blockif_close(); workers exit */
	int			bc_paused;	/* set by blockif_pause() */
	pthread_t		bc_btid[BLOCKIF_NUMTHR];	/* worker threads */
	pthread_mutex_t		bc_mtx;		/* protects the queues and flags */
	pthread_cond_t		bc_cond;	/* signalled when work is queued */
	pthread_cond_t		bc_work_done_cond; /* broadcast when queues drain */
	blockif_resize_cb	*bc_resize_cb;	/* called when bc_size changes */
	void			*bc_resize_cb_arg;
	struct mevent		*bc_resize_event; /* vnode attribute-change event */

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
	int			bc_bootindex;	/* "bootindex" config value, -1 if unset */
};
122
/* Ensures blockif_init() runs once, on the first blockif_open(). */
static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

/*
 * Rendezvous record used by blockif_cancel(): the canceller pushes one of
 * these on the list below, signals the worker with SIGCONT and then waits
 * on bse_cond until blockif_sigcont_handler() clears bse_pending.
 */
struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;	/* protects bse_pending */
	pthread_cond_t			bse_cond;	/* signalled by the handler */
	int				bse_pending;	/* 1 until the handler ran */
	struct blockif_sig_elem		*bse_next;	/* next record on the list */
};

/* Head of the singly-linked list of records; updated with atomic_cmpset. */
static struct blockif_sig_elem *blockif_bse_head;
133
134static int
135blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
136		enum blockop op)
137{
138	struct blockif_elem *be, *tbe;
139	off_t off;
140	int i;
141
142	be = TAILQ_FIRST(&bc->bc_freeq);
143	assert(be != NULL);
144	assert(be->be_status == BST_FREE);
145	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
146	be->be_req = breq;
147	be->be_op = op;
148	switch (op) {
149	case BOP_READ:
150	case BOP_WRITE:
151	case BOP_DELETE:
152		off = breq->br_offset;
153		for (i = 0; i < breq->br_iovcnt; i++)
154			off += breq->br_iov[i].iov_len;
155		break;
156	default:
157		off = OFF_MAX;
158	}
159	be->be_block = off;
160	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
161		if (tbe->be_block == breq->br_offset)
162			break;
163	}
164	if (tbe == NULL) {
165		TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) {
166			if (tbe->be_block == breq->br_offset)
167				break;
168		}
169	}
170	if (tbe == NULL)
171		be->be_status = BST_PEND;
172	else
173		be->be_status = BST_BLOCK;
174	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
175	return (be->be_status == BST_PEND);
176}
177
178static int
179blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep)
180{
181	struct blockif_elem *be;
182
183	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
184		if (be->be_status == BST_PEND)
185			break;
186		assert(be->be_status == BST_BLOCK);
187	}
188	if (be == NULL)
189		return (0);
190	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
191	be->be_status = BST_BUSY;
192	be->be_tid = t;
193	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
194	*bep = be;
195	return (1);
196}
197
198static void
199blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
200{
201	struct blockif_elem *tbe;
202
203	if (be->be_status == BST_DONE || be->be_status == BST_BUSY)
204		TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
205	else
206		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
207	TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) {
208		if (tbe->be_req->br_offset == be->be_block)
209			tbe->be_status = BST_PEND;
210	}
211	be->be_tid = 0;
212	be->be_status = BST_FREE;
213	be->be_req = NULL;
214	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
215}
216
217static int
218blockif_flush_bc(struct blockif_ctxt *bc)
219{
220	if (bc->bc_ischr) {
221		if (ioctl(bc->bc_fd, DIOCGFLUSH))
222			return (errno);
223	} else if (fsync(bc->bc_fd))
224		return (errno);
225
226	return (0);
227}
228
229static void
230blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf)
231{
232	struct spacectl_range range;
233	struct blockif_req *br;
234	off_t arg[2];
235	ssize_t n;
236	size_t clen, len, off, boff, voff;
237	int i, err;
238
239	br = be->be_req;
240	assert(br->br_resid >= 0);
241
242	if (br->br_iovcnt <= 1)
243		buf = NULL;
244	err = 0;
245	switch (be->be_op) {
246	case BOP_READ:
247		if (buf == NULL) {
248			if ((n = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
249			    br->br_offset)) < 0)
250				err = errno;
251			else
252				br->br_resid -= n;
253			break;
254		}
255		i = 0;
256		off = voff = 0;
257		while (br->br_resid > 0) {
258			len = MIN(br->br_resid, MAXPHYS);
259			n = pread(bc->bc_fd, buf, len, br->br_offset + off);
260			if (n < 0) {
261				err = errno;
262				break;
263			}
264			len = (size_t)n;
265			boff = 0;
266			do {
267				clen = MIN(len - boff, br->br_iov[i].iov_len -
268				    voff);
269				memcpy((uint8_t *)br->br_iov[i].iov_base + voff,
270				    buf + boff, clen);
271				if (clen < br->br_iov[i].iov_len - voff)
272					voff += clen;
273				else {
274					i++;
275					voff = 0;
276				}
277				boff += clen;
278			} while (boff < len);
279			off += len;
280			br->br_resid -= len;
281		}
282		break;
283	case BOP_WRITE:
284		if (bc->bc_rdonly) {
285			err = EROFS;
286			break;
287		}
288		if (buf == NULL) {
289			if ((n = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
290			    br->br_offset)) < 0)
291				err = errno;
292			else
293				br->br_resid -= n;
294			break;
295		}
296		i = 0;
297		off = voff = 0;
298		while (br->br_resid > 0) {
299			len = MIN(br->br_resid, MAXPHYS);
300			boff = 0;
301			do {
302				clen = MIN(len - boff, br->br_iov[i].iov_len -
303				    voff);
304				memcpy(buf + boff,
305				    (uint8_t *)br->br_iov[i].iov_base + voff,
306				    clen);
307				if (clen < br->br_iov[i].iov_len - voff)
308					voff += clen;
309				else {
310					i++;
311					voff = 0;
312				}
313				boff += clen;
314			} while (boff < len);
315
316			n = pwrite(bc->bc_fd, buf, len, br->br_offset + off);
317			if (n < 0) {
318				err = errno;
319				break;
320			}
321			off += n;
322			br->br_resid -= n;
323		}
324		break;
325	case BOP_FLUSH:
326		err = blockif_flush_bc(bc);
327		break;
328	case BOP_DELETE:
329		if (!bc->bc_candelete)
330			err = EOPNOTSUPP;
331		else if (bc->bc_rdonly)
332			err = EROFS;
333		else if (bc->bc_ischr) {
334			arg[0] = br->br_offset;
335			arg[1] = br->br_resid;
336			if (ioctl(bc->bc_fd, DIOCGDELETE, arg))
337				err = errno;
338			else
339				br->br_resid = 0;
340		} else {
341			range.r_offset = br->br_offset;
342			range.r_len = br->br_resid;
343
344			while (range.r_len > 0) {
345				if (fspacectl(bc->bc_fd, SPACECTL_DEALLOC,
346				    &range, 0, &range) != 0) {
347					err = errno;
348					break;
349				}
350			}
351			if (err == 0)
352				br->br_resid = 0;
353		}
354		break;
355	default:
356		err = EINVAL;
357		break;
358	}
359
360	be->be_status = BST_DONE;
361
362	(*br->br_callback)(br, err);
363}
364
365static inline bool
366blockif_empty(const struct blockif_ctxt *bc)
367{
368	return (TAILQ_EMPTY(&bc->bc_pendq) && TAILQ_EMPTY(&bc->bc_busyq));
369}
370
371static void *
372blockif_thr(void *arg)
373{
374	struct blockif_ctxt *bc;
375	struct blockif_elem *be;
376	pthread_t t;
377	uint8_t *buf;
378
379	bc = arg;
380	if (bc->bc_isgeom)
381		buf = malloc(MAXPHYS);
382	else
383		buf = NULL;
384	t = pthread_self();
385
386	pthread_mutex_lock(&bc->bc_mtx);
387	for (;;) {
388		while (blockif_dequeue(bc, t, &be)) {
389			pthread_mutex_unlock(&bc->bc_mtx);
390			blockif_proc(bc, be, buf);
391			pthread_mutex_lock(&bc->bc_mtx);
392			blockif_complete(bc, be);
393		}
394
395		/* If none to work, notify the main thread */
396		if (blockif_empty(bc))
397			pthread_cond_broadcast(&bc->bc_work_done_cond);
398
399		/* Check ctxt status here to see if exit requested */
400		if (bc->bc_closing)
401			break;
402
403		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
404	}
405	pthread_mutex_unlock(&bc->bc_mtx);
406
407	if (buf)
408		free(buf);
409	pthread_exit(NULL);
410	return (NULL);
411}
412
/*
 * mevent callback registered for SIGCONT by blockif_init().
 *
 * Drains the global blockif_bse_head list: each record is popped with an
 * atomic compare-and-set, then its bse_pending flag is cleared and its
 * condition variable signalled under bse_mtx, which releases the
 * blockif_cancel() caller waiting on that record.  Returns when the list
 * is empty.  None of the arguments are used.
 */
static void
blockif_sigcont_handler(int signal __unused, enum ev_type type __unused,
    void *arg __unused)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
			/* Retry if another thread changed the head meanwhile. */
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)bse,
					    (uintptr_t)bse->bse_next));

		/*
		 * Wake the waiter.  `bse` is not touched again after the
		 * unlock: blockif_cancel() declares the record as a local
		 * variable, so it may go away once the waiter resumes.
		 */
		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}
438
439static void
440blockif_init(void)
441{
442	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
443	(void) signal(SIGCONT, SIG_IGN);
444}
445
446int
447blockif_legacy_config(nvlist_t *nvl, const char *opts)
448{
449	char *cp, *path;
450
451	if (opts == NULL)
452		return (0);
453
454	cp = strchr(opts, ',');
455	if (cp == NULL) {
456		set_config_value_node(nvl, "path", opts);
457		return (0);
458	}
459	path = strndup(opts, cp - opts);
460	set_config_value_node(nvl, "path", path);
461	free(path);
462	return (pci_parse_legacy_config(nvl, cp + 1));
463}
464
465int
466blockif_add_boot_device(struct pci_devinst *const pi,
467    struct blockif_ctxt *const bc)
468{
469	if (bc->bc_bootindex < 0)
470		return (0);
471
472	return (pci_emul_add_boot_device(pi, bc->bc_bootindex));
473}
474
475struct blockif_ctxt *
476blockif_open(nvlist_t *nvl, const char *ident)
477{
478	char tname[MAXCOMLEN + 1];
479	char name[MAXPATHLEN];
480	const char *path, *pssval, *ssval, *bootindex_val;
481	char *cp;
482	struct blockif_ctxt *bc;
483	struct stat sbuf;
484	struct diocgattr_arg arg;
485	off_t size, psectsz, psectoff;
486	int extra, fd, i, sectsz;
487	int ro, candelete, geom, ssopt, pssopt;
488	int nodelete;
489	int bootindex;
490
491#ifndef WITHOUT_CAPSICUM
492	cap_rights_t rights;
493	cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE, DIOCGMEDIASIZE };
494#endif
495
496	pthread_once(&blockif_once, blockif_init);
497
498	fd = -1;
499	extra = 0;
500	ssopt = 0;
501	ro = 0;
502	nodelete = 0;
503	bootindex = -1;
504
505	if (get_config_bool_node_default(nvl, "nocache", false))
506		extra |= O_DIRECT;
507	if (get_config_bool_node_default(nvl, "nodelete", false))
508		nodelete = 1;
509	if (get_config_bool_node_default(nvl, "sync", false) ||
510	    get_config_bool_node_default(nvl, "direct", false))
511		extra |= O_SYNC;
512	if (get_config_bool_node_default(nvl, "ro", false))
513		ro = 1;
514	ssval = get_config_value_node(nvl, "sectorsize");
515	if (ssval != NULL) {
516		ssopt = strtol(ssval, &cp, 10);
517		if (cp == ssval) {
518			EPRINTLN("Invalid sector size \"%s\"", ssval);
519			goto err;
520		}
521		if (*cp == '\0') {
522			pssopt = ssopt;
523		} else if (*cp == '/') {
524			pssval = cp + 1;
525			pssopt = strtol(pssval, &cp, 10);
526			if (cp == pssval || *cp != '\0') {
527				EPRINTLN("Invalid sector size \"%s\"", ssval);
528				goto err;
529			}
530		} else {
531			EPRINTLN("Invalid sector size \"%s\"", ssval);
532			goto err;
533		}
534	}
535
536	bootindex_val = get_config_value_node(nvl, "bootindex");
537	if (bootindex_val != NULL) {
538		bootindex = atoi(bootindex_val);
539	}
540
541	path = get_config_value_node(nvl, "path");
542	if (path == NULL) {
543		EPRINTLN("Missing \"path\" for block device.");
544		goto err;
545	}
546
547	fd = open(path, (ro ? O_RDONLY : O_RDWR) | extra);
548	if (fd < 0 && !ro) {
549		/* Attempt a r/w fail with a r/o open */
550		fd = open(path, O_RDONLY | extra);
551		ro = 1;
552	}
553
554	if (fd < 0) {
555		warn("Could not open backing file: %s", path);
556		goto err;
557	}
558
559        if (fstat(fd, &sbuf) < 0) {
560		warn("Could not stat backing file %s", path);
561		goto err;
562        }
563
564#ifndef WITHOUT_CAPSICUM
565	cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK,
566	    CAP_WRITE, CAP_FSTAT, CAP_EVENT, CAP_FPATHCONF);
567	if (ro)
568		cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE);
569
570	if (caph_rights_limit(fd, &rights) == -1)
571		errx(EX_OSERR, "Unable to apply rights for sandbox");
572#endif
573
574        /*
575	 * Deal with raw devices
576	 */
577        size = sbuf.st_size;
578	sectsz = DEV_BSIZE;
579	psectsz = psectoff = 0;
580	candelete = geom = 0;
581	if (S_ISCHR(sbuf.st_mode)) {
582		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
583		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
584			perror("Could not fetch dev blk/sector size");
585			goto err;
586		}
587		assert(size != 0);
588		assert(sectsz != 0);
589		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
590			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
591		strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name));
592		arg.len = sizeof(arg.value.i);
593		if (nodelete == 0 && ioctl(fd, DIOCGATTR, &arg) == 0)
594			candelete = arg.value.i;
595		if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0)
596			geom = 1;
597	} else {
598		psectsz = sbuf.st_blksize;
599		/* Avoid fallback implementation */
600		candelete = fpathconf(fd, _PC_DEALLOC_PRESENT) == 1;
601	}
602
603#ifndef WITHOUT_CAPSICUM
604	if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1)
605		errx(EX_OSERR, "Unable to apply rights for sandbox");
606#endif
607
608	if (ssopt != 0) {
609		if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 ||
610		    ssopt > pssopt) {
611			EPRINTLN("Invalid sector size %d/%d",
612			    ssopt, pssopt);
613			goto err;
614		}
615
616		/*
617		 * Some backend drivers (e.g. cd0, ada0) require that the I/O
618		 * size be a multiple of the device's sector size.
619		 *
620		 * Validate that the emulated sector size complies with this
621		 * requirement.
622		 */
623		if (S_ISCHR(sbuf.st_mode)) {
624			if (ssopt < sectsz || (ssopt % sectsz) != 0) {
625				EPRINTLN("Sector size %d incompatible "
626				    "with underlying device sector size %d",
627				    ssopt, sectsz);
628				goto err;
629			}
630		}
631
632		sectsz = ssopt;
633		psectsz = pssopt;
634		psectoff = 0;
635	}
636
637	bc = calloc(1, sizeof(struct blockif_ctxt));
638	if (bc == NULL) {
639		perror("calloc");
640		goto err;
641	}
642
643	bc->bc_magic = BLOCKIF_SIG;
644	bc->bc_fd = fd;
645	bc->bc_ischr = S_ISCHR(sbuf.st_mode);
646	bc->bc_isgeom = geom;
647	bc->bc_candelete = candelete;
648	bc->bc_rdonly = ro;
649	bc->bc_size = size;
650	bc->bc_sectsz = sectsz;
651	bc->bc_psectsz = psectsz;
652	bc->bc_psectoff = psectoff;
653	pthread_mutex_init(&bc->bc_mtx, NULL);
654	pthread_cond_init(&bc->bc_cond, NULL);
655	bc->bc_paused = 0;
656	pthread_cond_init(&bc->bc_work_done_cond, NULL);
657	TAILQ_INIT(&bc->bc_freeq);
658	TAILQ_INIT(&bc->bc_pendq);
659	TAILQ_INIT(&bc->bc_busyq);
660	bc->bc_bootindex = bootindex;
661	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
662		bc->bc_reqs[i].be_status = BST_FREE;
663		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
664	}
665
666	for (i = 0; i < BLOCKIF_NUMTHR; i++) {
667		pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc);
668		snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i);
669		pthread_set_name_np(bc->bc_btid[i], tname);
670	}
671
672	return (bc);
673err:
674	if (fd >= 0)
675		close(fd);
676	return (NULL);
677}
678
679static void
680blockif_resized(int fd, enum ev_type type __unused, void *arg)
681{
682	struct blockif_ctxt *bc;
683	struct stat sb;
684	off_t mediasize;
685
686	if (fstat(fd, &sb) != 0)
687		return;
688
689	if (S_ISCHR(sb.st_mode)) {
690		if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) < 0) {
691			EPRINTLN("blockif_resized: get mediasize failed: %s",
692			    strerror(errno));
693			return;
694		}
695	} else
696		mediasize = sb.st_size;
697
698	bc = arg;
699	pthread_mutex_lock(&bc->bc_mtx);
700	if (mediasize != bc->bc_size) {
701		bc->bc_size = mediasize;
702		bc->bc_resize_cb(bc, bc->bc_resize_cb_arg, bc->bc_size);
703	}
704	pthread_mutex_unlock(&bc->bc_mtx);
705}
706
707int
708blockif_register_resize_callback(struct blockif_ctxt *bc, blockif_resize_cb *cb,
709    void *cb_arg)
710{
711	struct stat sb;
712	int err;
713
714	if (cb == NULL)
715		return (EINVAL);
716
717	err = 0;
718
719	pthread_mutex_lock(&bc->bc_mtx);
720	if (bc->bc_resize_cb != NULL) {
721		err = EBUSY;
722		goto out;
723	}
724
725	assert(bc->bc_closing == 0);
726
727	if (fstat(bc->bc_fd, &sb) != 0) {
728		err = errno;
729		goto out;
730	}
731
732	bc->bc_resize_event = mevent_add_flags(bc->bc_fd, EVF_VNODE,
733	    EVFF_ATTRIB, blockif_resized, bc);
734	if (bc->bc_resize_event == NULL) {
735		err = ENXIO;
736		goto out;
737	}
738
739	bc->bc_resize_cb = cb;
740	bc->bc_resize_cb_arg = cb_arg;
741out:
742	pthread_mutex_unlock(&bc->bc_mtx);
743
744	return (err);
745}
746
747static int
748blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
749		enum blockop op)
750{
751	int err;
752
753	err = 0;
754
755	pthread_mutex_lock(&bc->bc_mtx);
756	assert(!bc->bc_paused);
757	if (!TAILQ_EMPTY(&bc->bc_freeq)) {
758		/*
759		 * Enqueue and inform the block i/o thread
760		 * that there is work available
761		 */
762		if (blockif_enqueue(bc, breq, op))
763			pthread_cond_signal(&bc->bc_cond);
764	} else {
765		/*
766		 * Callers are not allowed to enqueue more than
767		 * the specified blockif queue limit. Return an
768		 * error to indicate that the queue length has been
769		 * exceeded.
770		 */
771		err = E2BIG;
772	}
773	pthread_mutex_unlock(&bc->bc_mtx);
774
775	return (err);
776}
777
778int
779blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
780{
781	assert(bc->bc_magic == BLOCKIF_SIG);
782	return (blockif_request(bc, breq, BOP_READ));
783}
784
785int
786blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
787{
788	assert(bc->bc_magic == BLOCKIF_SIG);
789	return (blockif_request(bc, breq, BOP_WRITE));
790}
791
792int
793blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
794{
795	assert(bc->bc_magic == BLOCKIF_SIG);
796	return (blockif_request(bc, breq, BOP_FLUSH));
797}
798
799int
800blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq)
801{
802	assert(bc->bc_magic == BLOCKIF_SIG);
803	return (blockif_request(bc, breq, BOP_DELETE));
804}
805
806int
807blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
808{
809	struct blockif_elem *be;
810
811	assert(bc->bc_magic == BLOCKIF_SIG);
812
813	pthread_mutex_lock(&bc->bc_mtx);
814	/* XXX: not waiting while paused */
815
816	/*
817	 * Check pending requests.
818	 */
819	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
820		if (be->be_req == breq)
821			break;
822	}
823	if (be != NULL) {
824		/*
825		 * Found it.
826		 */
827		blockif_complete(bc, be);
828		pthread_mutex_unlock(&bc->bc_mtx);
829
830		return (0);
831	}
832
833	/*
834	 * Check in-flight requests.
835	 */
836	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
837		if (be->be_req == breq)
838			break;
839	}
840	if (be == NULL) {
841		/*
842		 * Didn't find it.
843		 */
844		pthread_mutex_unlock(&bc->bc_mtx);
845		return (EINVAL);
846	}
847
848	/*
849	 * Interrupt the processing thread to force it return
850	 * prematurely via it's normal callback path.
851	 */
852	while (be->be_status == BST_BUSY) {
853		struct blockif_sig_elem bse, *old_head;
854
855		pthread_mutex_init(&bse.bse_mtx, NULL);
856		pthread_cond_init(&bse.bse_cond, NULL);
857
858		bse.bse_pending = 1;
859
860		do {
861			old_head = blockif_bse_head;
862			bse.bse_next = old_head;
863		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
864					    (uintptr_t)old_head,
865					    (uintptr_t)&bse));
866
867		pthread_kill(be->be_tid, SIGCONT);
868
869		pthread_mutex_lock(&bse.bse_mtx);
870		while (bse.bse_pending)
871			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
872		pthread_mutex_unlock(&bse.bse_mtx);
873	}
874
875	pthread_mutex_unlock(&bc->bc_mtx);
876
877	/*
878	 * The processing thread has been interrupted.  Since it's not
879	 * clear if the callback has been invoked yet, return EBUSY.
880	 */
881	return (EBUSY);
882}
883
884int
885blockif_close(struct blockif_ctxt *bc)
886{
887	void *jval;
888	int i;
889
890	assert(bc->bc_magic == BLOCKIF_SIG);
891
892	/*
893	 * Stop the block i/o thread
894	 */
895	pthread_mutex_lock(&bc->bc_mtx);
896	bc->bc_closing = 1;
897	if (bc->bc_resize_event != NULL)
898		mevent_disable(bc->bc_resize_event);
899	pthread_mutex_unlock(&bc->bc_mtx);
900	pthread_cond_broadcast(&bc->bc_cond);
901	for (i = 0; i < BLOCKIF_NUMTHR; i++)
902		pthread_join(bc->bc_btid[i], &jval);
903
904	/* XXX Cancel queued i/o's ??? */
905
906	/*
907	 * Release resources
908	 */
909	bc->bc_magic = 0;
910	close(bc->bc_fd);
911	free(bc);
912
913	return (0);
914}
915
916/*
917 * Return virtual C/H/S values for a given block. Use the algorithm
918 * outlined in the VHD specification to calculate values.
919 */
920void
921blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
922{
923	off_t sectors;		/* total sectors of the block dev */
924	off_t hcyl;		/* cylinders times heads */
925	uint16_t secpt;		/* sectors per track */
926	uint8_t heads;
927
928	assert(bc->bc_magic == BLOCKIF_SIG);
929
930	sectors = bc->bc_size / bc->bc_sectsz;
931
932	/* Clamp the size to the largest possible with CHS */
933	if (sectors > 65535L * 16 * 255)
934		sectors = 65535L * 16 * 255;
935
936	if (sectors >= 65536L * 16 * 63) {
937		secpt = 255;
938		heads = 16;
939		hcyl = sectors / secpt;
940	} else {
941		secpt = 17;
942		hcyl = sectors / secpt;
943		heads = (hcyl + 1023) / 1024;
944
945		if (heads < 4)
946			heads = 4;
947
948		if (hcyl >= (heads * 1024) || heads > 16) {
949			secpt = 31;
950			heads = 16;
951			hcyl = sectors / secpt;
952		}
953		if (hcyl >= (heads * 1024)) {
954			secpt = 63;
955			heads = 16;
956			hcyl = sectors / secpt;
957		}
958	}
959
960	*c = hcyl / heads;
961	*h = heads;
962	*s = secpt;
963}
964
965/*
966 * Accessors
967 */
968off_t
969blockif_size(struct blockif_ctxt *bc)
970{
971	assert(bc->bc_magic == BLOCKIF_SIG);
972	return (bc->bc_size);
973}
974
975int
976blockif_sectsz(struct blockif_ctxt *bc)
977{
978	assert(bc->bc_magic == BLOCKIF_SIG);
979	return (bc->bc_sectsz);
980}
981
982void
983blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
984{
985	assert(bc->bc_magic == BLOCKIF_SIG);
986	*size = bc->bc_psectsz;
987	*off = bc->bc_psectoff;
988}
989
990int
991blockif_queuesz(struct blockif_ctxt *bc)
992{
993	assert(bc->bc_magic == BLOCKIF_SIG);
994	return (BLOCKIF_MAXREQ - 1);
995}
996
997int
998blockif_is_ro(struct blockif_ctxt *bc)
999{
1000	assert(bc->bc_magic == BLOCKIF_SIG);
1001	return (bc->bc_rdonly);
1002}
1003
1004int
1005blockif_candelete(struct blockif_ctxt *bc)
1006{
1007	assert(bc->bc_magic == BLOCKIF_SIG);
1008	return (bc->bc_candelete);
1009}
1010
1011#ifdef BHYVE_SNAPSHOT
1012void
1013blockif_pause(struct blockif_ctxt *bc)
1014{
1015	assert(bc != NULL);
1016	assert(bc->bc_magic == BLOCKIF_SIG);
1017
1018	pthread_mutex_lock(&bc->bc_mtx);
1019	bc->bc_paused = 1;
1020
1021	/* The interface is paused. Wait for workers to finish their work */
1022	while (!blockif_empty(bc))
1023		pthread_cond_wait(&bc->bc_work_done_cond, &bc->bc_mtx);
1024	pthread_mutex_unlock(&bc->bc_mtx);
1025
1026	if (!bc->bc_rdonly && blockif_flush_bc(bc))
1027		EPRINTLN("%s: [WARN] failed to flush backing file.",
1028			__func__);
1029}
1030
1031void
1032blockif_resume(struct blockif_ctxt *bc)
1033{
1034	assert(bc != NULL);
1035	assert(bc->bc_magic == BLOCKIF_SIG);
1036
1037	pthread_mutex_lock(&bc->bc_mtx);
1038	bc->bc_paused = 0;
1039	pthread_mutex_unlock(&bc->bc_mtx);
1040}
1041#endif	/* BHYVE_SNAPSHOT */
1042