block_if.c revision 280244
/*-
 * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 280244 2015-03-19 09:54:48Z mav $
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 280244 2015-03-19 09:54:48Z mav $");

#include <sys/param.h>
#include <sys/queue.h>
#include <sys/errno.h>
#include <sys/stat.h>
#include <sys/ioctl.h>
#include <sys/disk.h>
#include <sys/uio.h>	/* preadv(2)/pwritev(2) prototypes */

#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <pthread_np.h>
#include <signal.h>
#include <unistd.h>

#include <machine/atomic.h>

#include "bhyverun.h"
#include "mevent.h"
#include "block_if.h"

#define BLOCKIF_SIG	0xb109b109

#define BLOCKIF_MAXREQ	33

enum blockop {
	BOP_READ,
	BOP_WRITE,
	BOP_FLUSH
};

enum blockstat {
	BST_FREE,
	BST_PEND,
	BST_BUSY,
	BST_DONE
};

struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;
	struct blockif_req  *be_req;
	enum blockop	     be_op;
	enum blockstat	     be_status;
	pthread_t            be_tid;
};

struct blockif_ctxt {
	int			bc_magic;
	int			bc_fd;
	int			bc_rdonly;
	off_t			bc_size;
	int			bc_sectsz;
	int			bc_psectsz;
	int			bc_psectoff;
	pthread_t		bc_btid;
	pthread_mutex_t		bc_mtx;
	pthread_cond_t		bc_cond;
	int			bc_closing;

	/* Request elements and free/pending/busy queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_pendq;
	TAILQ_HEAD(, blockif_elem) bc_busyq;
	u_int			bc_req_count;
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};

static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;

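/*
 * Per-waiter signalling element used by blockif_cancel() while waiting for
 * an in-flight request to finish.  Waiters push themselves onto the
 * lock-free list headed by blockif_bse_head; the SIGCONT handler pops the
 * list and wakes each one.
 */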
struct blockif_sig_elem {
	pthread_mutex_t			bse_mtx;
	pthread_cond_t			bse_cond;
	int				bse_pending;
	struct blockif_sig_elem		*bse_next;
};

static struct blockif_sig_elem *blockif_bse_head;

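/*
 * Move a request element from the free queue to the tail of the pending
 * queue.  Called with bc_mtx held.
 */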
static int
blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	struct blockif_elem *be;

	assert(bc->bc_req_count < BLOCKIF_MAXREQ);

	be = TAILQ_FIRST(&bc->bc_freeq);
	assert(be != NULL);
	assert(be->be_status == BST_FREE);

	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
	be->be_status = BST_PEND;
	be->be_req = breq;
	be->be_op = op;
	TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);

	bc->bc_req_count++;

	return (0);
}

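/*
 * Pull the oldest pending request, mark it busy and move it to the busy
 * queue, returning it via 'bep'.  Returns ENOENT if no requests are
 * queued.  Called with bc_mtx held.
 */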
static int
blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep)
{
	struct blockif_elem *be;

	if (bc->bc_req_count == 0)
		return (ENOENT);

	be = TAILQ_FIRST(&bc->bc_pendq);
	assert(be != NULL);
	assert(be->be_status == BST_PEND);
	TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
	be->be_status = BST_BUSY;
	be->be_tid = bc->bc_btid;
	TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);

	*bep = be;

	return (0);
}

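/*
 * Return a completed request element to the free queue and drop the
 * outstanding request count.  Called with bc_mtx held.
 */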
static void
blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	assert(be->be_status == BST_DONE);

	TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
	be->be_tid = 0;
	be->be_status = BST_FREE;
	be->be_req = NULL;
	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);

	bc->bc_req_count--;
}

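/*
 * Carry out a single request: preadv(2)/pwritev(2) against the backing
 * store for reads and writes (writes to a read-only backend fail with
 * EROFS), and a no-op for flushes.  Marks the element done and invokes
 * the requester's callback with the result.
 */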
static void
blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
{
	struct blockif_req *br;
	int err;

	br = be->be_req;
	err = 0;

	switch (be->be_op) {
	case BOP_READ:
		if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
			   br->br_offset) < 0)
			err = errno;
		break;
	case BOP_WRITE:
		if (bc->bc_rdonly)
			err = EROFS;
		else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
			     br->br_offset) < 0)
			err = errno;
		break;
	case BOP_FLUSH:
		break;
	default:
		err = EINVAL;
		break;
	}

	be->be_status = BST_DONE;

	(*br->br_callback)(br, err);
}

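/*
 * Per-context service thread: drain the pending queue, processing each
 * request with the mutex dropped, then sleep on bc_cond until more work
 * arrives or blockif_close() requests shutdown.
 */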
static void *
blockif_thr(void *arg)
{
	struct blockif_ctxt *bc;
	struct blockif_elem *be;

	bc = arg;

	for (;;) {
		pthread_mutex_lock(&bc->bc_mtx);
		while (!blockif_dequeue(bc, &be)) {
			pthread_mutex_unlock(&bc->bc_mtx);
			blockif_proc(bc, be);
			pthread_mutex_lock(&bc->bc_mtx);
			blockif_complete(bc, be);
		}
		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
		pthread_mutex_unlock(&bc->bc_mtx);

		/*
		 * Check ctxt status here to see if exit requested
		 */
		if (bc->bc_closing)
			pthread_exit(NULL);
	}

	/* Not reached */
	return (NULL);
}

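/*
 * SIGCONT handler, run from the mevent loop.  Pop every waiter off the
 * global blockif_sig_elem list and wake it, regardless of which thread
 * the signal was aimed at.
 */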
static void
blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
{
	struct blockif_sig_elem *bse;

	for (;;) {
		/*
		 * Process the entire list even if not intended for
		 * this thread.
		 */
		do {
			bse = blockif_bse_head;
			if (bse == NULL)
				return;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)bse,
					    (uintptr_t)bse->bse_next));

		pthread_mutex_lock(&bse->bse_mtx);
		bse->bse_pending = 0;
		pthread_cond_signal(&bse->bse_cond);
		pthread_mutex_unlock(&bse->bse_mtx);
	}
}

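/*
 * One-time initialization: route SIGCONT through the mevent loop for the
 * request-cancellation path and ignore its default action.
 */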
static void
blockif_init(void)
{
	mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
	(void) signal(SIGCONT, SIG_IGN);
}

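/*
 * Open the backing file or device named by the first element of 'optstr',
 * honoring the optional "nocache", "sync" and "ro" flags, determine its
 * size and sector geometry, and start the service thread (named after
 * 'ident').  Returns NULL on failure.
 */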
struct blockif_ctxt *
blockif_open(const char *optstr, const char *ident)
{
	char tname[MAXCOMLEN + 1];
	char *nopt, *xopts;
	struct blockif_ctxt *bc;
	struct stat sbuf;
	off_t size, psectsz, psectoff;
	int extra, fd, i, sectsz;
	int nocache, sync, ro;

	pthread_once(&blockif_once, blockif_init);

	nocache = 0;
	sync = 0;
	ro = 0;

	/*
	 * The first element in the optstring is always a pathname.
	 * Optional elements follow.
	 */
	nopt = strdup(optstr);
	for (xopts = strtok(nopt, ",");
	     xopts != NULL;
	     xopts = strtok(NULL, ",")) {
		if (!strcmp(xopts, "nocache"))
			nocache = 1;
		else if (!strcmp(xopts, "sync"))
			sync = 1;
		else if (!strcmp(xopts, "ro"))
			ro = 1;
	}

	extra = 0;
	if (nocache)
		extra |= O_DIRECT;
	if (sync)
		extra |= O_SYNC;

	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
	if (fd < 0 && !ro) {
		/* The read/write open failed; retry read-only */
		fd = open(nopt, O_RDONLY | extra);
		ro = 1;
	}

	if (fd < 0) {
		perror("Could not open backing file");
		return (NULL);
	}

	if (fstat(fd, &sbuf) < 0) {
		perror("Could not stat backing file");
		close(fd);
		return (NULL);
	}

	/*
	 * Deal with raw devices
	 */
	size = sbuf.st_size;
	sectsz = DEV_BSIZE;
	psectsz = psectoff = 0;
	if (S_ISCHR(sbuf.st_mode)) {
		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
			perror("Could not fetch dev blk/sector size");
			close(fd);
			return (NULL);
		}
		assert(size != 0);
		assert(sectsz != 0);
		if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0)
			ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff);
	} else
		psectsz = sbuf.st_blksize;

	bc = calloc(1, sizeof(struct blockif_ctxt));
	if (bc == NULL) {
		close(fd);
		return (NULL);
	}

	bc->bc_magic = BLOCKIF_SIG;
	bc->bc_fd = fd;
	bc->bc_rdonly = ro;
	bc->bc_size = size;
	bc->bc_sectsz = sectsz;
	bc->bc_psectsz = psectsz;
	bc->bc_psectoff = psectoff;
	pthread_mutex_init(&bc->bc_mtx, NULL);
	pthread_cond_init(&bc->bc_cond, NULL);
	TAILQ_INIT(&bc->bc_freeq);
	TAILQ_INIT(&bc->bc_pendq);
	TAILQ_INIT(&bc->bc_busyq);
	bc->bc_req_count = 0;
	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
		bc->bc_reqs[i].be_status = BST_FREE;
		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
	}

	pthread_create(&bc->bc_btid, NULL, blockif_thr, bc);

	snprintf(tname, sizeof(tname), "blk-%s", ident);
	pthread_set_name_np(bc->bc_btid, tname);

	return (bc);
}

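/*
 * Common entry point for the public I/O routines: queue the request and
 * kick the service thread, or fail with E2BIG if the queue is already at
 * its limit.
 */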
static int
blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
		enum blockop op)
{
	int err;

	err = 0;

	pthread_mutex_lock(&bc->bc_mtx);
	if (bc->bc_req_count < BLOCKIF_MAXREQ) {
		/*
		 * Enqueue and inform the block i/o thread
		 * that there is work available
		 */
		blockif_enqueue(bc, breq, op);
		pthread_cond_signal(&bc->bc_cond);
	} else {
		/*
		 * Callers are not allowed to enqueue more than
		 * the specified blockif queue limit. Return an
		 * error to indicate that the queue length has been
		 * exceeded.
		 */
		err = E2BIG;
	}
	pthread_mutex_unlock(&bc->bc_mtx);

	return (err);
}

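/*
 * Public request interface: reads, writes and flushes complete
 * asynchronously via the br_callback supplied in the request.
 */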
int
blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_READ));
}

int
blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_WRITE));
}

int
blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (blockif_request(bc, breq, BOP_FLUSH));
}

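/*
 * Attempt to cancel a previously submitted request.  A still-pending
 * request is removed from the queue and 0 is returned; a request that is
 * already being processed is interrupted with SIGCONT and EBUSY is
 * returned once it is no longer busy; EINVAL is returned if the request
 * cannot be found.
 */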
int
blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
{
	struct blockif_elem *be;

	assert(bc->bc_magic == BLOCKIF_SIG);

	pthread_mutex_lock(&bc->bc_mtx);
	/*
	 * Check pending requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be != NULL) {
		/*
		 * Found it.
		 */
		TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
		be->be_status = BST_FREE;
		be->be_req = NULL;
		TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
		bc->bc_req_count--;
		pthread_mutex_unlock(&bc->bc_mtx);

		return (0);
	}

	/*
	 * Check in-flight requests.
	 */
	TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
		if (be->be_req == breq)
			break;
	}
	if (be == NULL) {
		/*
		 * Didn't find it.
		 */
		pthread_mutex_unlock(&bc->bc_mtx);
		return (EINVAL);
	}

	/*
	 * Interrupt the processing thread to force it to return
	 * prematurely via its normal callback path.
	 */
	while (be->be_status == BST_BUSY) {
		struct blockif_sig_elem bse, *old_head;

		pthread_mutex_init(&bse.bse_mtx, NULL);
		pthread_cond_init(&bse.bse_cond, NULL);

		bse.bse_pending = 1;

		do {
			old_head = blockif_bse_head;
			bse.bse_next = old_head;
		} while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
					    (uintptr_t)old_head,
					    (uintptr_t)&bse));

		pthread_kill(be->be_tid, SIGCONT);

		pthread_mutex_lock(&bse.bse_mtx);
		while (bse.bse_pending)
			pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
		pthread_mutex_unlock(&bse.bse_mtx);
	}

	pthread_mutex_unlock(&bc->bc_mtx);

	/*
	 * The processing thread has been interrupted.  Since it's not
	 * clear if the callback has been invoked yet, return EBUSY.
	 */
	return (EBUSY);
}

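/*
 * Shut down the service thread, close the backing store and free the
 * context.  The thread is stopped without draining any still-queued
 * requests (see the XXX note below).
 */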
int
blockif_close(struct blockif_ctxt *bc)
{
	void *jval;
	int err;

	err = 0;

	assert(bc->bc_magic == BLOCKIF_SIG);

	/*
	 * Stop the block i/o thread
	 */
	bc->bc_closing = 1;
	pthread_cond_signal(&bc->bc_cond);
	pthread_join(bc->bc_btid, &jval);

	/* XXX Cancel queued i/o's ??? */

	/*
	 * Release resources
	 */
	bc->bc_magic = 0;
	close(bc->bc_fd);
	free(bc);

	return (0);
}

/*
 * Return virtual C/H/S values for a given block. Use the algorithm
 * outlined in the VHD specification to calculate values.
 */
void
blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
{
	off_t sectors;		/* total sectors of the block dev */
	off_t hcyl;		/* cylinders times heads */
	uint16_t secpt;		/* sectors per track */
	uint8_t heads;

	assert(bc->bc_magic == BLOCKIF_SIG);

	sectors = bc->bc_size / bc->bc_sectsz;

	/* Clamp the size to the largest possible with CHS */
	if (sectors > 65535UL*16*255)
		sectors = 65535UL*16*255;

	if (sectors >= 65536UL*16*63) {
		secpt = 255;
		heads = 16;
		hcyl = sectors / secpt;
	} else {
		secpt = 17;
		hcyl = sectors / secpt;
		heads = (hcyl + 1023) / 1024;

		if (heads < 4)
			heads = 4;

		if (hcyl >= (heads * 1024) || heads > 16) {
			secpt = 31;
			heads = 16;
			hcyl = sectors / secpt;
		}
		if (hcyl >= (heads * 1024)) {
			secpt = 63;
			heads = 16;
			hcyl = sectors / secpt;
		}
	}

	*c = hcyl / heads;
	*h = heads;
	*s = secpt;
}

/*
 * Accessors
 */
off_t
blockif_size(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_size);
}

int
blockif_sectsz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_sectsz);
}

void
blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	*size = bc->bc_psectsz;
	*off = bc->bc_psectoff;
}

int
blockif_queuesz(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (BLOCKIF_MAXREQ - 1);
}

int
blockif_is_ro(struct blockif_ctxt *bc)
{

	assert(bc->bc_magic == BLOCKIF_SIG);
	return (bc->bc_rdonly);
}