block_if.c revision 276349
1/*-
2 * Copyright (c) 2013  Peter Grehan <grehan@freebsd.org>
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 276349 2014-12-28 21:27:13Z neel $
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 276349 2014-12-28 21:27:13Z neel $");
31
32#include <sys/param.h>
33#include <sys/queue.h>
34#include <sys/errno.h>
35#include <sys/stat.h>
36#include <sys/ioctl.h>
37#include <sys/disk.h>
38
39#include <assert.h>
40#include <fcntl.h>
41#include <stdio.h>
42#include <stdlib.h>
43#include <string.h>
44#include <pthread.h>
45#include <pthread_np.h>
46#include <unistd.h>
47
48#include "bhyverun.h"
49#include "block_if.h"
50
/*
 * Magic value stored in bc_magic by blockif_open() and checked with
 * assert() by the public entry points; cleared again in blockif_close().
 */
#define BLOCKIF_SIG	0xb109b109

/*
 * Number of request elements in each context (size of bc_reqs[]), and the
 * value reported by blockif_queuesz().
 */
#define BLOCKIF_MAXREQ	32
54
/* Operation requested for a queued element (stored in be_op). */
enum blockop {
	BOP_READ,	/* read request */
	BOP_WRITE,	/* write request */
	BOP_FLUSH	/* flush request */
};
60
/* Which queue a request element currently belongs to (stored in be_status). */
enum blockstat {
	BST_FREE,	/* element is on the free queue */
	BST_INUSE	/* element is on the in-use queue */
};
65
/*
 * One slot of the per-context request table. A slot is always linked on
 * exactly one of the context's two queues (free or in-use).
 */
struct blockif_elem {
	TAILQ_ENTRY(blockif_elem) be_link;	/* free/in-use queue linkage */
	struct blockif_req  *be_req;		/* caller's request; NULL while free */
	enum blockop	     be_op;		/* operation to perform */
	enum blockstat	     be_status;		/* BST_FREE or BST_INUSE */
};
72
/*
 * Per-backing-store state created by blockif_open() and released by
 * blockif_close().
 */
struct blockif_ctxt {
	int			bc_magic;	/* BLOCKIF_SIG while the context is open */
	int			bc_fd;		/* descriptor of the backing file/device */
	int			bc_rdonly;	/* non-zero: writes fail with EROFS */
	off_t			bc_size;	/* st_size, or DIOCGMEDIASIZE for char devices */
	int			bc_sectsz;	/* DEV_BSIZE, or DIOCGSECTORSIZE for char devices */
	pthread_t		bc_btid;	/* block i/o thread */
        pthread_mutex_t		bc_mtx;		/* held while touching the queues and count */
        pthread_cond_t		bc_cond;	/* signalled on enqueue and on close */
	int			bc_closing;	/* set by blockif_close() to stop the thread */

	/* Request elements and free/inuse queues */
	TAILQ_HEAD(, blockif_elem) bc_freeq;
	TAILQ_HEAD(, blockif_elem) bc_inuseq;
	u_int			bc_req_count;	/* number of elements on bc_inuseq */
	struct blockif_elem	bc_reqs[BLOCKIF_MAXREQ];
};
90
91static int
92blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
93		enum blockop op)
94{
95	struct blockif_elem *be;
96
97	assert(bc->bc_req_count < BLOCKIF_MAXREQ);
98
99	be = TAILQ_FIRST(&bc->bc_freeq);
100	assert(be != NULL);
101	assert(be->be_status == BST_FREE);
102
103	TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
104	be->be_status = BST_INUSE;
105	be->be_req = breq;
106	be->be_op = op;
107	TAILQ_INSERT_TAIL(&bc->bc_inuseq, be, be_link);
108
109	bc->bc_req_count++;
110
111	return (0);
112}
113
114static int
115blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem *el)
116{
117	struct blockif_elem *be;
118
119	if (bc->bc_req_count == 0)
120		return (ENOENT);
121
122	be = TAILQ_FIRST(&bc->bc_inuseq);
123	assert(be != NULL);
124	assert(be->be_status == BST_INUSE);
125	*el = *be;
126
127	TAILQ_REMOVE(&bc->bc_inuseq, be, be_link);
128	be->be_status = BST_FREE;
129	be->be_req = NULL;
130	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
131
132	bc->bc_req_count--;
133
134	return (0);
135}
136
137static void
138blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
139{
140	struct blockif_req *br;
141	int err;
142
143	br = be->be_req;
144	err = 0;
145
146	switch (be->be_op) {
147	case BOP_READ:
148		if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
149			   br->br_offset) < 0)
150			err = errno;
151		break;
152	case BOP_WRITE:
153		if (bc->bc_rdonly)
154			err = EROFS;
155		else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
156			     br->br_offset) < 0)
157			err = errno;
158		break;
159	case BOP_FLUSH:
160		break;
161	default:
162		err = EINVAL;
163		break;
164	}
165
166	(*br->br_callback)(br, err);
167}
168
169static void *
170blockif_thr(void *arg)
171{
172	struct blockif_ctxt *bc;
173	struct blockif_elem req;
174
175	bc = arg;
176
177	for (;;) {
178		pthread_mutex_lock(&bc->bc_mtx);
179		while (!blockif_dequeue(bc, &req)) {
180			pthread_mutex_unlock(&bc->bc_mtx);
181			blockif_proc(bc, &req);
182			pthread_mutex_lock(&bc->bc_mtx);
183		}
184		pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
185		pthread_mutex_unlock(&bc->bc_mtx);
186
187		/*
188		 * Check ctxt status here to see if exit requested
189		 */
190		if (bc->bc_closing)
191			pthread_exit(NULL);
192	}
193
194	/* Not reached */
195	return (NULL);
196}
197
198struct blockif_ctxt *
199blockif_open(const char *optstr, const char *ident)
200{
201	char tname[MAXCOMLEN + 1];
202	char *nopt, *xopts;
203	struct blockif_ctxt *bc;
204	struct stat sbuf;
205	off_t size;
206	int extra, fd, i, sectsz;
207	int nocache, sync, ro;
208
209	nocache = 0;
210	sync = 0;
211	ro = 0;
212
213	/*
214	 * The first element in the optstring is always a pathname.
215	 * Optional elements follow
216	 */
217	nopt = strdup(optstr);
218	for (xopts = strtok(nopt, ",");
219	     xopts != NULL;
220	     xopts = strtok(NULL, ",")) {
221		if (!strcmp(xopts, "nocache"))
222			nocache = 1;
223		else if (!strcmp(xopts, "sync"))
224			sync = 1;
225		else if (!strcmp(xopts, "ro"))
226			ro = 1;
227	}
228
229	extra = 0;
230	if (nocache)
231		extra |= O_DIRECT;
232	if (sync)
233		extra |= O_SYNC;
234
235	fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
236	if (fd < 0 && !ro) {
237		/* Attempt a r/w fail with a r/o open */
238		fd = open(nopt, O_RDONLY | extra);
239		ro = 1;
240	}
241
242	if (fd < 0) {
243		perror("Could not open backing file");
244		return (NULL);
245	}
246
247        if (fstat(fd, &sbuf) < 0) {
248                perror("Could not stat backing file");
249                close(fd);
250                return (NULL);
251        }
252
253        /*
254	 * Deal with raw devices
255	 */
256        size = sbuf.st_size;
257	sectsz = DEV_BSIZE;
258	if (S_ISCHR(sbuf.st_mode)) {
259		if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
260		    ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
261			perror("Could not fetch dev blk/sector size");
262			close(fd);
263			return (NULL);
264		}
265		assert(size != 0);
266		assert(sectsz != 0);
267	}
268
269	bc = calloc(1, sizeof(struct blockif_ctxt));
270	if (bc == NULL) {
271		close(fd);
272		return (NULL);
273	}
274
275	bc->bc_magic = BLOCKIF_SIG;
276	bc->bc_fd = fd;
277	bc->bc_rdonly = ro;
278	bc->bc_size = size;
279	bc->bc_sectsz = sectsz;
280	pthread_mutex_init(&bc->bc_mtx, NULL);
281	pthread_cond_init(&bc->bc_cond, NULL);
282	TAILQ_INIT(&bc->bc_freeq);
283	TAILQ_INIT(&bc->bc_inuseq);
284	bc->bc_req_count = 0;
285	for (i = 0; i < BLOCKIF_MAXREQ; i++) {
286		bc->bc_reqs[i].be_status = BST_FREE;
287		TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
288	}
289
290	pthread_create(&bc->bc_btid, NULL, blockif_thr, bc);
291
292	snprintf(tname, sizeof(tname), "blk-%s", ident);
293	pthread_set_name_np(bc->bc_btid, tname);
294
295	return (bc);
296}
297
298static int
299blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
300		enum blockop op)
301{
302	int err;
303
304	err = 0;
305
306	pthread_mutex_lock(&bc->bc_mtx);
307	if (bc->bc_req_count < BLOCKIF_MAXREQ) {
308		/*
309		 * Enqueue and inform the block i/o thread
310		 * that there is work available
311		 */
312		blockif_enqueue(bc, breq, op);
313		pthread_cond_signal(&bc->bc_cond);
314	} else {
315		/*
316		 * Callers are not allowed to enqueue more than
317		 * the specified blockif queue limit. Return an
318		 * error to indicate that the queue length has been
319		 * exceeded.
320		 */
321		err = E2BIG;
322	}
323	pthread_mutex_unlock(&bc->bc_mtx);
324
325	return (err);
326}
327
328int
329blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
330{
331
332	assert(bc->bc_magic == BLOCKIF_SIG);
333	return (blockif_request(bc, breq, BOP_READ));
334}
335
336int
337blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
338{
339
340	assert(bc->bc_magic == BLOCKIF_SIG);
341	return (blockif_request(bc, breq, BOP_WRITE));
342}
343
344int
345blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
346{
347
348	assert(bc->bc_magic == BLOCKIF_SIG);
349	return (blockif_request(bc, breq, BOP_FLUSH));
350}
351
352int
353blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
354{
355	struct blockif_elem *be;
356
357	assert(bc->bc_magic == BLOCKIF_SIG);
358
359	pthread_mutex_lock(&bc->bc_mtx);
360	TAILQ_FOREACH(be, &bc->bc_inuseq, be_link) {
361		if (be->be_req == breq)
362			break;
363	}
364	if (be == NULL) {
365		pthread_mutex_unlock(&bc->bc_mtx);
366		return (EINVAL);
367	}
368
369	TAILQ_REMOVE(&bc->bc_inuseq, be, be_link);
370	be->be_status = BST_FREE;
371	be->be_req = NULL;
372	TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
373	bc->bc_req_count--;
374	pthread_mutex_unlock(&bc->bc_mtx);
375
376	return (0);
377}
378
379int
380blockif_close(struct blockif_ctxt *bc)
381{
382	void *jval;
383	int err;
384
385	err = 0;
386
387	assert(bc->bc_magic == BLOCKIF_SIG);
388
389	/*
390	 * Stop the block i/o thread
391	 */
392	bc->bc_closing = 1;
393	pthread_cond_signal(&bc->bc_cond);
394	pthread_join(bc->bc_btid, &jval);
395
396	/* XXX Cancel queued i/o's ??? */
397
398	/*
399	 * Release resources
400	 */
401	bc->bc_magic = 0;
402	close(bc->bc_fd);
403	free(bc);
404
405	return (0);
406}
407
408/*
409 * Return virtual C/H/S values for a given block. Use the algorithm
410 * outlined in the VHD specification to calculate values.
411 */
412void
413blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
414{
415	off_t sectors;		/* total sectors of the block dev */
416	off_t hcyl;		/* cylinders times heads */
417	uint16_t secpt;		/* sectors per track */
418	uint8_t heads;
419
420	assert(bc->bc_magic == BLOCKIF_SIG);
421
422	sectors = bc->bc_size / bc->bc_sectsz;
423
424	/* Clamp the size to the largest possible with CHS */
425	if (sectors > 65535UL*16*255)
426		sectors = 65535UL*16*255;
427
428	if (sectors >= 65536UL*16*63) {
429		secpt = 255;
430		heads = 16;
431		hcyl = sectors / secpt;
432	} else {
433		secpt = 17;
434		hcyl = sectors / secpt;
435		heads = (hcyl + 1023) / 1024;
436
437		if (heads < 4)
438			heads = 4;
439
440		if (hcyl >= (heads * 1024) || heads > 16) {
441			secpt = 31;
442			heads = 16;
443			hcyl = sectors / secpt;
444		}
445		if (hcyl >= (heads * 1024)) {
446			secpt = 63;
447			heads = 16;
448			hcyl = sectors / secpt;
449		}
450	}
451
452	*c = hcyl / heads;
453	*h = heads;
454	*s = secpt;
455}
456
457/*
458 * Accessors
459 */
460off_t
461blockif_size(struct blockif_ctxt *bc)
462{
463
464	assert(bc->bc_magic == BLOCKIF_SIG);
465	return (bc->bc_size);
466}
467
468int
469blockif_sectsz(struct blockif_ctxt *bc)
470{
471
472	assert(bc->bc_magic == BLOCKIF_SIG);
473	return (bc->bc_sectsz);
474}
475
476int
477blockif_queuesz(struct blockif_ctxt *bc)
478{
479
480	assert(bc->bc_magic == BLOCKIF_SIG);
481	return (BLOCKIF_MAXREQ);
482}
483
484int
485blockif_is_ro(struct blockif_ctxt *bc)
486{
487
488	assert(bc->bc_magic == BLOCKIF_SIG);
489	return (bc->bc_rdonly);
490}
491