/* block_if.c revision 280370 */
1/*- 2 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 * 26 * $FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 280370 2015-03-23 14:36:53Z mav $ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 280370 2015-03-23 14:36:53Z mav $"); 31 32#include <sys/param.h> 33#include <sys/queue.h> 34#include <sys/errno.h> 35#include <sys/stat.h> 36#include <sys/ioctl.h> 37#include <sys/disk.h> 38 39#include <assert.h> 40#include <fcntl.h> 41#include <stdio.h> 42#include <stdlib.h> 43#include <string.h> 44#include <pthread.h> 45#include <pthread_np.h> 46#include <signal.h> 47#include <unistd.h> 48 49#include <machine/atomic.h> 50 51#include "bhyverun.h" 52#include "mevent.h" 53#include "block_if.h" 54 55#define BLOCKIF_SIG 0xb109b109 56 57#define BLOCKIF_MAXREQ 33 58 59enum blockop { 60 BOP_READ, 61 BOP_WRITE, 62 BOP_FLUSH, 63 BOP_DELETE 64}; 65 66enum blockstat { 67 BST_FREE, 68 BST_PEND, 69 BST_BUSY, 70 BST_DONE 71}; 72 73struct blockif_elem { 74 TAILQ_ENTRY(blockif_elem) be_link; 75 struct blockif_req *be_req; 76 enum blockop be_op; 77 enum blockstat be_status; 78 pthread_t be_tid; 79}; 80 81struct blockif_ctxt { 82 int bc_magic; 83 int bc_fd; 84 int bc_ischr; 85 int bc_candelete; 86 int bc_rdonly; 87 off_t bc_size; 88 int bc_sectsz; 89 int bc_psectsz; 90 int bc_psectoff; 91 pthread_t bc_btid; 92 pthread_mutex_t bc_mtx; 93 pthread_cond_t bc_cond; 94 int bc_closing; 95 96 /* Request elements and free/pending/busy queues */ 97 TAILQ_HEAD(, blockif_elem) bc_freeq; 98 TAILQ_HEAD(, blockif_elem) bc_pendq; 99 TAILQ_HEAD(, blockif_elem) bc_busyq; 100 u_int bc_req_count; 101 struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; 102}; 103 104static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; 105 106struct blockif_sig_elem { 107 pthread_mutex_t bse_mtx; 108 pthread_cond_t bse_cond; 109 int bse_pending; 110 struct blockif_sig_elem *bse_next; 111}; 112 113static struct blockif_sig_elem *blockif_bse_head; 114 115static int 116blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, 117 enum 
blockop op) 118{ 119 struct blockif_elem *be; 120 121 assert(bc->bc_req_count < BLOCKIF_MAXREQ); 122 123 be = TAILQ_FIRST(&bc->bc_freeq); 124 assert(be != NULL); 125 assert(be->be_status == BST_FREE); 126 127 TAILQ_REMOVE(&bc->bc_freeq, be, be_link); 128 be->be_status = BST_PEND; 129 be->be_req = breq; 130 be->be_op = op; 131 TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); 132 133 bc->bc_req_count++; 134 135 return (0); 136} 137 138static int 139blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep) 140{ 141 struct blockif_elem *be; 142 143 if (bc->bc_req_count == 0) 144 return (ENOENT); 145 146 be = TAILQ_FIRST(&bc->bc_pendq); 147 assert(be != NULL); 148 assert(be->be_status == BST_PEND); 149 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 150 be->be_status = BST_BUSY; 151 be->be_tid = bc->bc_btid; 152 TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); 153 154 *bep = be; 155 156 return (0); 157} 158 159static void 160blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) 161{ 162 assert(be->be_status == BST_DONE); 163 164 TAILQ_REMOVE(&bc->bc_busyq, be, be_link); 165 be->be_tid = 0; 166 be->be_status = BST_FREE; 167 be->be_req = NULL; 168 TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); 169 170 bc->bc_req_count--; 171} 172 173static void 174blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be) 175{ 176 struct blockif_req *br; 177 off_t arg[2]; 178 int err; 179 180 br = be->be_req; 181 err = 0; 182 183 switch (be->be_op) { 184 case BOP_READ: 185 if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, 186 br->br_offset) < 0) 187 err = errno; 188 break; 189 case BOP_WRITE: 190 if (bc->bc_rdonly) 191 err = EROFS; 192 else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, 193 br->br_offset) < 0) 194 err = errno; 195 break; 196 case BOP_FLUSH: 197 if (bc->bc_ischr) { 198 if (ioctl(bc->bc_fd, DIOCGFLUSH)) 199 err = errno; 200 } else if (fsync(bc->bc_fd)) 201 err = errno; 202 break; 203 case BOP_DELETE: 204 if (!bc->bc_candelete) 205 err = EOPNOTSUPP; 206 
else if (bc->bc_rdonly) 207 err = EROFS; 208 else if (bc->bc_ischr) { 209 arg[0] = br->br_offset; 210 arg[1] = br->br_iov[0].iov_len; 211 if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) 212 err = errno; 213 } else 214 err = EOPNOTSUPP; 215 break; 216 default: 217 err = EINVAL; 218 break; 219 } 220 221 be->be_status = BST_DONE; 222 223 (*br->br_callback)(br, err); 224} 225 226static void * 227blockif_thr(void *arg) 228{ 229 struct blockif_ctxt *bc; 230 struct blockif_elem *be; 231 232 bc = arg; 233 234 for (;;) { 235 pthread_mutex_lock(&bc->bc_mtx); 236 while (!blockif_dequeue(bc, &be)) { 237 pthread_mutex_unlock(&bc->bc_mtx); 238 blockif_proc(bc, be); 239 pthread_mutex_lock(&bc->bc_mtx); 240 blockif_complete(bc, be); 241 } 242 pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); 243 pthread_mutex_unlock(&bc->bc_mtx); 244 245 /* 246 * Check ctxt status here to see if exit requested 247 */ 248 if (bc->bc_closing) 249 pthread_exit(NULL); 250 } 251 252 /* Not reached */ 253 return (NULL); 254} 255 256static void 257blockif_sigcont_handler(int signal, enum ev_type type, void *arg) 258{ 259 struct blockif_sig_elem *bse; 260 261 for (;;) { 262 /* 263 * Process the entire list even if not intended for 264 * this thread. 
265 */ 266 do { 267 bse = blockif_bse_head; 268 if (bse == NULL) 269 return; 270 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 271 (uintptr_t)bse, 272 (uintptr_t)bse->bse_next)); 273 274 pthread_mutex_lock(&bse->bse_mtx); 275 bse->bse_pending = 0; 276 pthread_cond_signal(&bse->bse_cond); 277 pthread_mutex_unlock(&bse->bse_mtx); 278 } 279} 280 281static void 282blockif_init(void) 283{ 284 mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); 285 (void) signal(SIGCONT, SIG_IGN); 286} 287 288struct blockif_ctxt * 289blockif_open(const char *optstr, const char *ident) 290{ 291 char tname[MAXCOMLEN + 1]; 292 char *nopt, *xopts; 293 struct blockif_ctxt *bc; 294 struct stat sbuf; 295 struct diocgattr_arg arg; 296 off_t size, psectsz, psectoff; 297 int extra, fd, i, sectsz; 298 int nocache, sync, ro, candelete; 299 300 pthread_once(&blockif_once, blockif_init); 301 302 nocache = 0; 303 sync = 0; 304 ro = 0; 305 306 /* 307 * The first element in the optstring is always a pathname. 308 * Optional elements follow 309 */ 310 nopt = strdup(optstr); 311 for (xopts = strtok(nopt, ","); 312 xopts != NULL; 313 xopts = strtok(NULL, ",")) { 314 if (!strcmp(xopts, "nocache")) 315 nocache = 1; 316 else if (!strcmp(xopts, "sync")) 317 sync = 1; 318 else if (!strcmp(xopts, "ro")) 319 ro = 1; 320 } 321 322 extra = 0; 323 if (nocache) 324 extra |= O_DIRECT; 325 if (sync) 326 extra |= O_SYNC; 327 328 fd = open(nopt, (ro ? 
O_RDONLY : O_RDWR) | extra); 329 if (fd < 0 && !ro) { 330 /* Attempt a r/w fail with a r/o open */ 331 fd = open(nopt, O_RDONLY | extra); 332 ro = 1; 333 } 334 335 if (fd < 0) { 336 perror("Could not open backing file"); 337 return (NULL); 338 } 339 340 if (fstat(fd, &sbuf) < 0) { 341 perror("Could not stat backing file"); 342 close(fd); 343 return (NULL); 344 } 345 346 /* 347 * Deal with raw devices 348 */ 349 size = sbuf.st_size; 350 sectsz = DEV_BSIZE; 351 psectsz = psectoff = 0; 352 candelete = 0; 353 if (S_ISCHR(sbuf.st_mode)) { 354 if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || 355 ioctl(fd, DIOCGSECTORSIZE, §sz)) { 356 perror("Could not fetch dev blk/sector size"); 357 close(fd); 358 return (NULL); 359 } 360 assert(size != 0); 361 assert(sectsz != 0); 362 if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) 363 ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); 364 strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); 365 arg.len = sizeof(arg.value.i); 366 if (ioctl(fd, DIOCGATTR, &arg) == 0) 367 candelete = arg.value.i; 368 } else 369 psectsz = sbuf.st_blksize; 370 371 bc = calloc(1, sizeof(struct blockif_ctxt)); 372 if (bc == NULL) { 373 close(fd); 374 return (NULL); 375 } 376 377 bc->bc_magic = BLOCKIF_SIG; 378 bc->bc_fd = fd; 379 bc->bc_ischr = S_ISCHR(sbuf.st_mode); 380 bc->bc_candelete = candelete; 381 bc->bc_rdonly = ro; 382 bc->bc_size = size; 383 bc->bc_sectsz = sectsz; 384 bc->bc_psectsz = psectsz; 385 bc->bc_psectoff = psectoff; 386 pthread_mutex_init(&bc->bc_mtx, NULL); 387 pthread_cond_init(&bc->bc_cond, NULL); 388 TAILQ_INIT(&bc->bc_freeq); 389 TAILQ_INIT(&bc->bc_pendq); 390 TAILQ_INIT(&bc->bc_busyq); 391 bc->bc_req_count = 0; 392 for (i = 0; i < BLOCKIF_MAXREQ; i++) { 393 bc->bc_reqs[i].be_status = BST_FREE; 394 TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); 395 } 396 397 pthread_create(&bc->bc_btid, NULL, blockif_thr, bc); 398 399 snprintf(tname, sizeof(tname), "blk-%s", ident); 400 pthread_set_name_np(bc->bc_btid, tname); 401 402 
return (bc); 403} 404 405static int 406blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, 407 enum blockop op) 408{ 409 int err; 410 411 err = 0; 412 413 pthread_mutex_lock(&bc->bc_mtx); 414 if (bc->bc_req_count < BLOCKIF_MAXREQ) { 415 /* 416 * Enqueue and inform the block i/o thread 417 * that there is work available 418 */ 419 blockif_enqueue(bc, breq, op); 420 pthread_cond_signal(&bc->bc_cond); 421 } else { 422 /* 423 * Callers are not allowed to enqueue more than 424 * the specified blockif queue limit. Return an 425 * error to indicate that the queue length has been 426 * exceeded. 427 */ 428 err = E2BIG; 429 } 430 pthread_mutex_unlock(&bc->bc_mtx); 431 432 return (err); 433} 434 435int 436blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) 437{ 438 439 assert(bc->bc_magic == BLOCKIF_SIG); 440 return (blockif_request(bc, breq, BOP_READ)); 441} 442 443int 444blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) 445{ 446 447 assert(bc->bc_magic == BLOCKIF_SIG); 448 return (blockif_request(bc, breq, BOP_WRITE)); 449} 450 451int 452blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) 453{ 454 455 assert(bc->bc_magic == BLOCKIF_SIG); 456 return (blockif_request(bc, breq, BOP_FLUSH)); 457} 458 459int 460blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) 461{ 462 463 assert(bc->bc_magic == BLOCKIF_SIG); 464 return (blockif_request(bc, breq, BOP_DELETE)); 465} 466 467int 468blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) 469{ 470 struct blockif_elem *be; 471 472 assert(bc->bc_magic == BLOCKIF_SIG); 473 474 pthread_mutex_lock(&bc->bc_mtx); 475 /* 476 * Check pending requests. 477 */ 478 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 479 if (be->be_req == breq) 480 break; 481 } 482 if (be != NULL) { 483 /* 484 * Found it. 
485 */ 486 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 487 be->be_status = BST_FREE; 488 be->be_req = NULL; 489 TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); 490 bc->bc_req_count--; 491 pthread_mutex_unlock(&bc->bc_mtx); 492 493 return (0); 494 } 495 496 /* 497 * Check in-flight requests. 498 */ 499 TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { 500 if (be->be_req == breq) 501 break; 502 } 503 if (be == NULL) { 504 /* 505 * Didn't find it. 506 */ 507 pthread_mutex_unlock(&bc->bc_mtx); 508 return (EINVAL); 509 } 510 511 /* 512 * Interrupt the processing thread to force it return 513 * prematurely via it's normal callback path. 514 */ 515 while (be->be_status == BST_BUSY) { 516 struct blockif_sig_elem bse, *old_head; 517 518 pthread_mutex_init(&bse.bse_mtx, NULL); 519 pthread_cond_init(&bse.bse_cond, NULL); 520 521 bse.bse_pending = 1; 522 523 do { 524 old_head = blockif_bse_head; 525 bse.bse_next = old_head; 526 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 527 (uintptr_t)old_head, 528 (uintptr_t)&bse)); 529 530 pthread_kill(be->be_tid, SIGCONT); 531 532 pthread_mutex_lock(&bse.bse_mtx); 533 while (bse.bse_pending) 534 pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); 535 pthread_mutex_unlock(&bse.bse_mtx); 536 } 537 538 pthread_mutex_unlock(&bc->bc_mtx); 539 540 /* 541 * The processing thread has been interrupted. Since it's not 542 * clear if the callback has been invoked yet, return EBUSY. 543 */ 544 return (EBUSY); 545} 546 547int 548blockif_close(struct blockif_ctxt *bc) 549{ 550 void *jval; 551 int err; 552 553 err = 0; 554 555 assert(bc->bc_magic == BLOCKIF_SIG); 556 557 /* 558 * Stop the block i/o thread 559 */ 560 bc->bc_closing = 1; 561 pthread_cond_signal(&bc->bc_cond); 562 pthread_join(bc->bc_btid, &jval); 563 564 /* XXX Cancel queued i/o's ??? */ 565 566 /* 567 * Release resources 568 */ 569 bc->bc_magic = 0; 570 close(bc->bc_fd); 571 free(bc); 572 573 return (0); 574} 575 576/* 577 * Return virtual C/H/S values for a given block. 
Use the algorithm 578 * outlined in the VHD specification to calculate values. 579 */ 580void 581blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) 582{ 583 off_t sectors; /* total sectors of the block dev */ 584 off_t hcyl; /* cylinders times heads */ 585 uint16_t secpt; /* sectors per track */ 586 uint8_t heads; 587 588 assert(bc->bc_magic == BLOCKIF_SIG); 589 590 sectors = bc->bc_size / bc->bc_sectsz; 591 592 /* Clamp the size to the largest possible with CHS */ 593 if (sectors > 65535UL*16*255) 594 sectors = 65535UL*16*255; 595 596 if (sectors >= 65536UL*16*63) { 597 secpt = 255; 598 heads = 16; 599 hcyl = sectors / secpt; 600 } else { 601 secpt = 17; 602 hcyl = sectors / secpt; 603 heads = (hcyl + 1023) / 1024; 604 605 if (heads < 4) 606 heads = 4; 607 608 if (hcyl >= (heads * 1024) || heads > 16) { 609 secpt = 31; 610 heads = 16; 611 hcyl = sectors / secpt; 612 } 613 if (hcyl >= (heads * 1024)) { 614 secpt = 63; 615 heads = 16; 616 hcyl = sectors / secpt; 617 } 618 } 619 620 *c = hcyl / heads; 621 *h = heads; 622 *s = secpt; 623} 624 625/* 626 * Accessors 627 */ 628off_t 629blockif_size(struct blockif_ctxt *bc) 630{ 631 632 assert(bc->bc_magic == BLOCKIF_SIG); 633 return (bc->bc_size); 634} 635 636int 637blockif_sectsz(struct blockif_ctxt *bc) 638{ 639 640 assert(bc->bc_magic == BLOCKIF_SIG); 641 return (bc->bc_sectsz); 642} 643 644void 645blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) 646{ 647 648 assert(bc->bc_magic == BLOCKIF_SIG); 649 *size = bc->bc_psectsz; 650 *off = bc->bc_psectoff; 651} 652 653int 654blockif_queuesz(struct blockif_ctxt *bc) 655{ 656 657 assert(bc->bc_magic == BLOCKIF_SIG); 658 return (BLOCKIF_MAXREQ - 1); 659} 660 661int 662blockif_is_ro(struct blockif_ctxt *bc) 663{ 664 665 assert(bc->bc_magic == BLOCKIF_SIG); 666 return (bc->bc_rdonly); 667} 668 669int 670blockif_candelete(struct blockif_ctxt *bc) 671{ 672 673 assert(bc->bc_magic == BLOCKIF_SIG); 674 return (bc->bc_candelete); 675} 676