/* block_if.c revision 280244 */
/*-
 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
25 * 26 * $FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 280244 2015-03-19 09:54:48Z mav $ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 280244 2015-03-19 09:54:48Z mav $"); 31 32#include <sys/param.h> 33#include <sys/queue.h> 34#include <sys/errno.h> 35#include <sys/stat.h> 36#include <sys/ioctl.h> 37#include <sys/disk.h> 38 39#include <assert.h> 40#include <fcntl.h> 41#include <stdio.h> 42#include <stdlib.h> 43#include <string.h> 44#include <pthread.h> 45#include <pthread_np.h> 46#include <signal.h> 47#include <unistd.h> 48 49#include <machine/atomic.h> 50 51#include "bhyverun.h" 52#include "mevent.h" 53#include "block_if.h" 54 55#define BLOCKIF_SIG 0xb109b109 56 57#define BLOCKIF_MAXREQ 33 58 59enum blockop { 60 BOP_READ, 61 BOP_WRITE, 62 BOP_FLUSH 63}; 64 65enum blockstat { 66 BST_FREE, 67 BST_PEND, 68 BST_BUSY, 69 BST_DONE 70}; 71 72struct blockif_elem { 73 TAILQ_ENTRY(blockif_elem) be_link; 74 struct blockif_req *be_req; 75 enum blockop be_op; 76 enum blockstat be_status; 77 pthread_t be_tid; 78}; 79 80struct blockif_ctxt { 81 int bc_magic; 82 int bc_fd; 83 int bc_rdonly; 84 off_t bc_size; 85 int bc_sectsz; 86 int bc_psectsz; 87 int bc_psectoff; 88 pthread_t bc_btid; 89 pthread_mutex_t bc_mtx; 90 pthread_cond_t bc_cond; 91 int bc_closing; 92 93 /* Request elements and free/pending/busy queues */ 94 TAILQ_HEAD(, blockif_elem) bc_freeq; 95 TAILQ_HEAD(, blockif_elem) bc_pendq; 96 TAILQ_HEAD(, blockif_elem) bc_busyq; 97 u_int bc_req_count; 98 struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; 99}; 100 101static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; 102 103struct blockif_sig_elem { 104 pthread_mutex_t bse_mtx; 105 pthread_cond_t bse_cond; 106 int bse_pending; 107 struct blockif_sig_elem *bse_next; 108}; 109 110static struct blockif_sig_elem *blockif_bse_head; 111 112static int 113blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, 114 enum blockop op) 115{ 116 struct blockif_elem *be; 117 118 
assert(bc->bc_req_count < BLOCKIF_MAXREQ); 119 120 be = TAILQ_FIRST(&bc->bc_freeq); 121 assert(be != NULL); 122 assert(be->be_status == BST_FREE); 123 124 TAILQ_REMOVE(&bc->bc_freeq, be, be_link); 125 be->be_status = BST_PEND; 126 be->be_req = breq; 127 be->be_op = op; 128 TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); 129 130 bc->bc_req_count++; 131 132 return (0); 133} 134 135static int 136blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep) 137{ 138 struct blockif_elem *be; 139 140 if (bc->bc_req_count == 0) 141 return (ENOENT); 142 143 be = TAILQ_FIRST(&bc->bc_pendq); 144 assert(be != NULL); 145 assert(be->be_status == BST_PEND); 146 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 147 be->be_status = BST_BUSY; 148 be->be_tid = bc->bc_btid; 149 TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); 150 151 *bep = be; 152 153 return (0); 154} 155 156static void 157blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) 158{ 159 assert(be->be_status == BST_DONE); 160 161 TAILQ_REMOVE(&bc->bc_busyq, be, be_link); 162 be->be_tid = 0; 163 be->be_status = BST_FREE; 164 be->be_req = NULL; 165 TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); 166 167 bc->bc_req_count--; 168} 169 170static void 171blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be) 172{ 173 struct blockif_req *br; 174 int err; 175 176 br = be->be_req; 177 err = 0; 178 179 switch (be->be_op) { 180 case BOP_READ: 181 if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, 182 br->br_offset) < 0) 183 err = errno; 184 break; 185 case BOP_WRITE: 186 if (bc->bc_rdonly) 187 err = EROFS; 188 else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, 189 br->br_offset) < 0) 190 err = errno; 191 break; 192 case BOP_FLUSH: 193 break; 194 default: 195 err = EINVAL; 196 break; 197 } 198 199 be->be_status = BST_DONE; 200 201 (*br->br_callback)(br, err); 202} 203 204static void * 205blockif_thr(void *arg) 206{ 207 struct blockif_ctxt *bc; 208 struct blockif_elem *be; 209 210 bc = arg; 211 212 for (;;) { 213 
pthread_mutex_lock(&bc->bc_mtx); 214 while (!blockif_dequeue(bc, &be)) { 215 pthread_mutex_unlock(&bc->bc_mtx); 216 blockif_proc(bc, be); 217 pthread_mutex_lock(&bc->bc_mtx); 218 blockif_complete(bc, be); 219 } 220 pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); 221 pthread_mutex_unlock(&bc->bc_mtx); 222 223 /* 224 * Check ctxt status here to see if exit requested 225 */ 226 if (bc->bc_closing) 227 pthread_exit(NULL); 228 } 229 230 /* Not reached */ 231 return (NULL); 232} 233 234static void 235blockif_sigcont_handler(int signal, enum ev_type type, void *arg) 236{ 237 struct blockif_sig_elem *bse; 238 239 for (;;) { 240 /* 241 * Process the entire list even if not intended for 242 * this thread. 243 */ 244 do { 245 bse = blockif_bse_head; 246 if (bse == NULL) 247 return; 248 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 249 (uintptr_t)bse, 250 (uintptr_t)bse->bse_next)); 251 252 pthread_mutex_lock(&bse->bse_mtx); 253 bse->bse_pending = 0; 254 pthread_cond_signal(&bse->bse_cond); 255 pthread_mutex_unlock(&bse->bse_mtx); 256 } 257} 258 259static void 260blockif_init(void) 261{ 262 mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); 263 (void) signal(SIGCONT, SIG_IGN); 264} 265 266struct blockif_ctxt * 267blockif_open(const char *optstr, const char *ident) 268{ 269 char tname[MAXCOMLEN + 1]; 270 char *nopt, *xopts; 271 struct blockif_ctxt *bc; 272 struct stat sbuf; 273 off_t size, psectsz, psectoff; 274 int extra, fd, i, sectsz; 275 int nocache, sync, ro; 276 277 pthread_once(&blockif_once, blockif_init); 278 279 nocache = 0; 280 sync = 0; 281 ro = 0; 282 283 /* 284 * The first element in the optstring is always a pathname. 
285 * Optional elements follow 286 */ 287 nopt = strdup(optstr); 288 for (xopts = strtok(nopt, ","); 289 xopts != NULL; 290 xopts = strtok(NULL, ",")) { 291 if (!strcmp(xopts, "nocache")) 292 nocache = 1; 293 else if (!strcmp(xopts, "sync")) 294 sync = 1; 295 else if (!strcmp(xopts, "ro")) 296 ro = 1; 297 } 298 299 extra = 0; 300 if (nocache) 301 extra |= O_DIRECT; 302 if (sync) 303 extra |= O_SYNC; 304 305 fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra); 306 if (fd < 0 && !ro) { 307 /* Attempt a r/w fail with a r/o open */ 308 fd = open(nopt, O_RDONLY | extra); 309 ro = 1; 310 } 311 312 if (fd < 0) { 313 perror("Could not open backing file"); 314 return (NULL); 315 } 316 317 if (fstat(fd, &sbuf) < 0) { 318 perror("Could not stat backing file"); 319 close(fd); 320 return (NULL); 321 } 322 323 /* 324 * Deal with raw devices 325 */ 326 size = sbuf.st_size; 327 sectsz = DEV_BSIZE; 328 psectsz = psectoff = 0; 329 if (S_ISCHR(sbuf.st_mode)) { 330 if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || 331 ioctl(fd, DIOCGSECTORSIZE, §sz)) { 332 perror("Could not fetch dev blk/sector size"); 333 close(fd); 334 return (NULL); 335 } 336 assert(size != 0); 337 assert(sectsz != 0); 338 if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) 339 ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); 340 } else 341 psectsz = sbuf.st_blksize; 342 343 bc = calloc(1, sizeof(struct blockif_ctxt)); 344 if (bc == NULL) { 345 close(fd); 346 return (NULL); 347 } 348 349 bc->bc_magic = BLOCKIF_SIG; 350 bc->bc_fd = fd; 351 bc->bc_rdonly = ro; 352 bc->bc_size = size; 353 bc->bc_sectsz = sectsz; 354 bc->bc_psectsz = psectsz; 355 bc->bc_psectoff = psectoff; 356 pthread_mutex_init(&bc->bc_mtx, NULL); 357 pthread_cond_init(&bc->bc_cond, NULL); 358 TAILQ_INIT(&bc->bc_freeq); 359 TAILQ_INIT(&bc->bc_pendq); 360 TAILQ_INIT(&bc->bc_busyq); 361 bc->bc_req_count = 0; 362 for (i = 0; i < BLOCKIF_MAXREQ; i++) { 363 bc->bc_reqs[i].be_status = BST_FREE; 364 TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); 365 
} 366 367 pthread_create(&bc->bc_btid, NULL, blockif_thr, bc); 368 369 snprintf(tname, sizeof(tname), "blk-%s", ident); 370 pthread_set_name_np(bc->bc_btid, tname); 371 372 return (bc); 373} 374 375static int 376blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, 377 enum blockop op) 378{ 379 int err; 380 381 err = 0; 382 383 pthread_mutex_lock(&bc->bc_mtx); 384 if (bc->bc_req_count < BLOCKIF_MAXREQ) { 385 /* 386 * Enqueue and inform the block i/o thread 387 * that there is work available 388 */ 389 blockif_enqueue(bc, breq, op); 390 pthread_cond_signal(&bc->bc_cond); 391 } else { 392 /* 393 * Callers are not allowed to enqueue more than 394 * the specified blockif queue limit. Return an 395 * error to indicate that the queue length has been 396 * exceeded. 397 */ 398 err = E2BIG; 399 } 400 pthread_mutex_unlock(&bc->bc_mtx); 401 402 return (err); 403} 404 405int 406blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) 407{ 408 409 assert(bc->bc_magic == BLOCKIF_SIG); 410 return (blockif_request(bc, breq, BOP_READ)); 411} 412 413int 414blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) 415{ 416 417 assert(bc->bc_magic == BLOCKIF_SIG); 418 return (blockif_request(bc, breq, BOP_WRITE)); 419} 420 421int 422blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) 423{ 424 425 assert(bc->bc_magic == BLOCKIF_SIG); 426 return (blockif_request(bc, breq, BOP_FLUSH)); 427} 428 429int 430blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) 431{ 432 struct blockif_elem *be; 433 434 assert(bc->bc_magic == BLOCKIF_SIG); 435 436 pthread_mutex_lock(&bc->bc_mtx); 437 /* 438 * Check pending requests. 439 */ 440 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 441 if (be->be_req == breq) 442 break; 443 } 444 if (be != NULL) { 445 /* 446 * Found it. 
447 */ 448 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 449 be->be_status = BST_FREE; 450 be->be_req = NULL; 451 TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); 452 bc->bc_req_count--; 453 pthread_mutex_unlock(&bc->bc_mtx); 454 455 return (0); 456 } 457 458 /* 459 * Check in-flight requests. 460 */ 461 TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { 462 if (be->be_req == breq) 463 break; 464 } 465 if (be == NULL) { 466 /* 467 * Didn't find it. 468 */ 469 pthread_mutex_unlock(&bc->bc_mtx); 470 return (EINVAL); 471 } 472 473 /* 474 * Interrupt the processing thread to force it return 475 * prematurely via it's normal callback path. 476 */ 477 while (be->be_status == BST_BUSY) { 478 struct blockif_sig_elem bse, *old_head; 479 480 pthread_mutex_init(&bse.bse_mtx, NULL); 481 pthread_cond_init(&bse.bse_cond, NULL); 482 483 bse.bse_pending = 1; 484 485 do { 486 old_head = blockif_bse_head; 487 bse.bse_next = old_head; 488 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 489 (uintptr_t)old_head, 490 (uintptr_t)&bse)); 491 492 pthread_kill(be->be_tid, SIGCONT); 493 494 pthread_mutex_lock(&bse.bse_mtx); 495 while (bse.bse_pending) 496 pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); 497 pthread_mutex_unlock(&bse.bse_mtx); 498 } 499 500 pthread_mutex_unlock(&bc->bc_mtx); 501 502 /* 503 * The processing thread has been interrupted. Since it's not 504 * clear if the callback has been invoked yet, return EBUSY. 505 */ 506 return (EBUSY); 507} 508 509int 510blockif_close(struct blockif_ctxt *bc) 511{ 512 void *jval; 513 int err; 514 515 err = 0; 516 517 assert(bc->bc_magic == BLOCKIF_SIG); 518 519 /* 520 * Stop the block i/o thread 521 */ 522 bc->bc_closing = 1; 523 pthread_cond_signal(&bc->bc_cond); 524 pthread_join(bc->bc_btid, &jval); 525 526 /* XXX Cancel queued i/o's ??? */ 527 528 /* 529 * Release resources 530 */ 531 bc->bc_magic = 0; 532 close(bc->bc_fd); 533 free(bc); 534 535 return (0); 536} 537 538/* 539 * Return virtual C/H/S values for a given block. 
Use the algorithm 540 * outlined in the VHD specification to calculate values. 541 */ 542void 543blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) 544{ 545 off_t sectors; /* total sectors of the block dev */ 546 off_t hcyl; /* cylinders times heads */ 547 uint16_t secpt; /* sectors per track */ 548 uint8_t heads; 549 550 assert(bc->bc_magic == BLOCKIF_SIG); 551 552 sectors = bc->bc_size / bc->bc_sectsz; 553 554 /* Clamp the size to the largest possible with CHS */ 555 if (sectors > 65535UL*16*255) 556 sectors = 65535UL*16*255; 557 558 if (sectors >= 65536UL*16*63) { 559 secpt = 255; 560 heads = 16; 561 hcyl = sectors / secpt; 562 } else { 563 secpt = 17; 564 hcyl = sectors / secpt; 565 heads = (hcyl + 1023) / 1024; 566 567 if (heads < 4) 568 heads = 4; 569 570 if (hcyl >= (heads * 1024) || heads > 16) { 571 secpt = 31; 572 heads = 16; 573 hcyl = sectors / secpt; 574 } 575 if (hcyl >= (heads * 1024)) { 576 secpt = 63; 577 heads = 16; 578 hcyl = sectors / secpt; 579 } 580 } 581 582 *c = hcyl / heads; 583 *h = heads; 584 *s = secpt; 585} 586 587/* 588 * Accessors 589 */ 590off_t 591blockif_size(struct blockif_ctxt *bc) 592{ 593 594 assert(bc->bc_magic == BLOCKIF_SIG); 595 return (bc->bc_size); 596} 597 598int 599blockif_sectsz(struct blockif_ctxt *bc) 600{ 601 602 assert(bc->bc_magic == BLOCKIF_SIG); 603 return (bc->bc_sectsz); 604} 605 606void 607blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) 608{ 609 610 assert(bc->bc_magic == BLOCKIF_SIG); 611 *size = bc->bc_psectsz; 612 *off = bc->bc_psectoff; 613} 614 615int 616blockif_queuesz(struct blockif_ctxt *bc) 617{ 618 619 assert(bc->bc_magic == BLOCKIF_SIG); 620 return (BLOCKIF_MAXREQ - 1); 621} 622 623int 624blockif_is_ro(struct blockif_ctxt *bc) 625{ 626 627 assert(bc->bc_magic == BLOCKIF_SIG); 628 return (bc->bc_rdonly); 629} 630