/* block_if.c revision 280746 */
1/*- 2 * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 
25 * 26 * $FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 280746 2015-03-27 08:55:54Z mav $ 27 */ 28 29#include <sys/cdefs.h> 30__FBSDID("$FreeBSD: stable/10/usr.sbin/bhyve/block_if.c 280746 2015-03-27 08:55:54Z mav $"); 31 32#include <sys/param.h> 33#include <sys/queue.h> 34#include <sys/errno.h> 35#include <sys/stat.h> 36#include <sys/ioctl.h> 37#include <sys/disk.h> 38 39#include <assert.h> 40#include <fcntl.h> 41#include <stdio.h> 42#include <stdlib.h> 43#include <string.h> 44#include <pthread.h> 45#include <pthread_np.h> 46#include <signal.h> 47#include <unistd.h> 48 49#include <machine/atomic.h> 50 51#include "bhyverun.h" 52#include "mevent.h" 53#include "block_if.h" 54 55#define BLOCKIF_SIG 0xb109b109 56 57#define BLOCKIF_MAXREQ 64 58#define BLOCKIF_NUMTHR 8 59 60enum blockop { 61 BOP_READ, 62 BOP_WRITE, 63 BOP_FLUSH, 64 BOP_DELETE 65}; 66 67enum blockstat { 68 BST_FREE, 69 BST_BLOCK, 70 BST_PEND, 71 BST_BUSY, 72 BST_DONE 73}; 74 75struct blockif_elem { 76 TAILQ_ENTRY(blockif_elem) be_link; 77 struct blockif_req *be_req; 78 enum blockop be_op; 79 enum blockstat be_status; 80 pthread_t be_tid; 81 off_t be_block; 82}; 83 84struct blockif_ctxt { 85 int bc_magic; 86 int bc_fd; 87 int bc_ischr; 88 int bc_candelete; 89 int bc_rdonly; 90 off_t bc_size; 91 int bc_sectsz; 92 int bc_psectsz; 93 int bc_psectoff; 94 int bc_closing; 95 pthread_t bc_btid[BLOCKIF_NUMTHR]; 96 pthread_mutex_t bc_mtx; 97 pthread_cond_t bc_cond; 98 99 /* Request elements and free/pending/busy queues */ 100 TAILQ_HEAD(, blockif_elem) bc_freeq; 101 TAILQ_HEAD(, blockif_elem) bc_pendq; 102 TAILQ_HEAD(, blockif_elem) bc_busyq; 103 struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; 104}; 105 106static pthread_once_t blockif_once = PTHREAD_ONCE_INIT; 107 108struct blockif_sig_elem { 109 pthread_mutex_t bse_mtx; 110 pthread_cond_t bse_cond; 111 int bse_pending; 112 struct blockif_sig_elem *bse_next; 113}; 114 115static struct blockif_sig_elem *blockif_bse_head; 116 117static int 118blockif_enqueue(struct 
blockif_ctxt *bc, struct blockif_req *breq, 119 enum blockop op) 120{ 121 struct blockif_elem *be, *tbe; 122 off_t off; 123 int i; 124 125 be = TAILQ_FIRST(&bc->bc_freeq); 126 assert(be != NULL); 127 assert(be->be_status == BST_FREE); 128 TAILQ_REMOVE(&bc->bc_freeq, be, be_link); 129 be->be_req = breq; 130 be->be_op = op; 131 switch (op) { 132 case BOP_READ: 133 case BOP_WRITE: 134 case BOP_DELETE: 135 off = breq->br_offset; 136 for (i = 0; i < breq->br_iovcnt; i++) 137 off += breq->br_iov[i].iov_len; 138 break; 139 default: 140 off = OFF_MAX; 141 } 142 be->be_block = off; 143 TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { 144 if (tbe->be_block == breq->br_offset) 145 break; 146 } 147 if (tbe == NULL) { 148 TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { 149 if (tbe->be_block == breq->br_offset) 150 break; 151 } 152 } 153 if (tbe == NULL) 154 be->be_status = BST_PEND; 155 else 156 be->be_status = BST_BLOCK; 157 TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); 158 return (be->be_status == BST_PEND); 159} 160 161static int 162blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) 163{ 164 struct blockif_elem *be; 165 166 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 167 if (be->be_status == BST_PEND) 168 break; 169 assert(be->be_status == BST_BLOCK); 170 } 171 if (be == NULL) 172 return (0); 173 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 174 be->be_status = BST_BUSY; 175 be->be_tid = t; 176 TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); 177 *bep = be; 178 return (1); 179} 180 181static void 182blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) 183{ 184 struct blockif_elem *tbe; 185 186 if (be->be_status == BST_DONE || be->be_status == BST_BUSY) 187 TAILQ_REMOVE(&bc->bc_busyq, be, be_link); 188 else 189 TAILQ_REMOVE(&bc->bc_pendq, be, be_link); 190 TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { 191 if (tbe->be_req->br_offset == be->be_block) 192 tbe->be_status = BST_PEND; 193 } 194 be->be_tid = 0; 195 be->be_status = BST_FREE; 196 
be->be_req = NULL; 197 TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); 198} 199 200static void 201blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be) 202{ 203 struct blockif_req *br; 204 off_t arg[2]; 205 int err; 206 207 br = be->be_req; 208 err = 0; 209 210 switch (be->be_op) { 211 case BOP_READ: 212 if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, 213 br->br_offset) < 0) 214 err = errno; 215 break; 216 case BOP_WRITE: 217 if (bc->bc_rdonly) 218 err = EROFS; 219 else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, 220 br->br_offset) < 0) 221 err = errno; 222 break; 223 case BOP_FLUSH: 224 if (bc->bc_ischr) { 225 if (ioctl(bc->bc_fd, DIOCGFLUSH)) 226 err = errno; 227 } else if (fsync(bc->bc_fd)) 228 err = errno; 229 break; 230 case BOP_DELETE: 231 if (!bc->bc_candelete) 232 err = EOPNOTSUPP; 233 else if (bc->bc_rdonly) 234 err = EROFS; 235 else if (bc->bc_ischr) { 236 arg[0] = br->br_offset; 237 arg[1] = br->br_iov[0].iov_len; 238 if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) 239 err = errno; 240 } else 241 err = EOPNOTSUPP; 242 break; 243 default: 244 err = EINVAL; 245 break; 246 } 247 248 be->be_status = BST_DONE; 249 250 (*br->br_callback)(br, err); 251} 252 253static void * 254blockif_thr(void *arg) 255{ 256 struct blockif_ctxt *bc; 257 struct blockif_elem *be; 258 pthread_t t; 259 260 bc = arg; 261 t = pthread_self(); 262 263 pthread_mutex_lock(&bc->bc_mtx); 264 for (;;) { 265 while (blockif_dequeue(bc, t, &be)) { 266 pthread_mutex_unlock(&bc->bc_mtx); 267 blockif_proc(bc, be); 268 pthread_mutex_lock(&bc->bc_mtx); 269 blockif_complete(bc, be); 270 } 271 /* Check ctxt status here to see if exit requested */ 272 if (bc->bc_closing) 273 break; 274 pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); 275 } 276 pthread_mutex_unlock(&bc->bc_mtx); 277 278 pthread_exit(NULL); 279 return (NULL); 280} 281 282static void 283blockif_sigcont_handler(int signal, enum ev_type type, void *arg) 284{ 285 struct blockif_sig_elem *bse; 286 287 for (;;) { 288 /* 289 * Process the 
entire list even if not intended for 290 * this thread. 291 */ 292 do { 293 bse = blockif_bse_head; 294 if (bse == NULL) 295 return; 296 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 297 (uintptr_t)bse, 298 (uintptr_t)bse->bse_next)); 299 300 pthread_mutex_lock(&bse->bse_mtx); 301 bse->bse_pending = 0; 302 pthread_cond_signal(&bse->bse_cond); 303 pthread_mutex_unlock(&bse->bse_mtx); 304 } 305} 306 307static void 308blockif_init(void) 309{ 310 mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL); 311 (void) signal(SIGCONT, SIG_IGN); 312} 313 314struct blockif_ctxt * 315blockif_open(const char *optstr, const char *ident) 316{ 317 char tname[MAXCOMLEN + 1]; 318 char *nopt, *xopts; 319 struct blockif_ctxt *bc; 320 struct stat sbuf; 321 struct diocgattr_arg arg; 322 off_t size, psectsz, psectoff; 323 int extra, fd, i, sectsz; 324 int nocache, sync, ro, candelete; 325 326 pthread_once(&blockif_once, blockif_init); 327 328 nocache = 0; 329 sync = 0; 330 ro = 0; 331 332 /* 333 * The first element in the optstring is always a pathname. 334 * Optional elements follow 335 */ 336 nopt = strdup(optstr); 337 for (xopts = strtok(nopt, ","); 338 xopts != NULL; 339 xopts = strtok(NULL, ",")) { 340 if (!strcmp(xopts, "nocache")) 341 nocache = 1; 342 else if (!strcmp(xopts, "sync")) 343 sync = 1; 344 else if (!strcmp(xopts, "ro")) 345 ro = 1; 346 } 347 348 extra = 0; 349 if (nocache) 350 extra |= O_DIRECT; 351 if (sync) 352 extra |= O_SYNC; 353 354 fd = open(nopt, (ro ? 
O_RDONLY : O_RDWR) | extra); 355 if (fd < 0 && !ro) { 356 /* Attempt a r/w fail with a r/o open */ 357 fd = open(nopt, O_RDONLY | extra); 358 ro = 1; 359 } 360 361 if (fd < 0) { 362 perror("Could not open backing file"); 363 return (NULL); 364 } 365 366 if (fstat(fd, &sbuf) < 0) { 367 perror("Could not stat backing file"); 368 close(fd); 369 return (NULL); 370 } 371 372 /* 373 * Deal with raw devices 374 */ 375 size = sbuf.st_size; 376 sectsz = DEV_BSIZE; 377 psectsz = psectoff = 0; 378 candelete = 0; 379 if (S_ISCHR(sbuf.st_mode)) { 380 if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || 381 ioctl(fd, DIOCGSECTORSIZE, §sz)) { 382 perror("Could not fetch dev blk/sector size"); 383 close(fd); 384 return (NULL); 385 } 386 assert(size != 0); 387 assert(sectsz != 0); 388 if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) 389 ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); 390 strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); 391 arg.len = sizeof(arg.value.i); 392 if (ioctl(fd, DIOCGATTR, &arg) == 0) 393 candelete = arg.value.i; 394 } else 395 psectsz = sbuf.st_blksize; 396 397 bc = calloc(1, sizeof(struct blockif_ctxt)); 398 if (bc == NULL) { 399 close(fd); 400 return (NULL); 401 } 402 403 bc->bc_magic = BLOCKIF_SIG; 404 bc->bc_fd = fd; 405 bc->bc_ischr = S_ISCHR(sbuf.st_mode); 406 bc->bc_candelete = candelete; 407 bc->bc_rdonly = ro; 408 bc->bc_size = size; 409 bc->bc_sectsz = sectsz; 410 bc->bc_psectsz = psectsz; 411 bc->bc_psectoff = psectoff; 412 pthread_mutex_init(&bc->bc_mtx, NULL); 413 pthread_cond_init(&bc->bc_cond, NULL); 414 TAILQ_INIT(&bc->bc_freeq); 415 TAILQ_INIT(&bc->bc_pendq); 416 TAILQ_INIT(&bc->bc_busyq); 417 for (i = 0; i < BLOCKIF_MAXREQ; i++) { 418 bc->bc_reqs[i].be_status = BST_FREE; 419 TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); 420 } 421 422 for (i = 0; i < BLOCKIF_NUMTHR; i++) { 423 pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); 424 snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); 425 
pthread_set_name_np(bc->bc_btid[i], tname); 426 } 427 428 return (bc); 429} 430 431static int 432blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, 433 enum blockop op) 434{ 435 int err; 436 437 err = 0; 438 439 pthread_mutex_lock(&bc->bc_mtx); 440 if (!TAILQ_EMPTY(&bc->bc_freeq)) { 441 /* 442 * Enqueue and inform the block i/o thread 443 * that there is work available 444 */ 445 if (blockif_enqueue(bc, breq, op)) 446 pthread_cond_signal(&bc->bc_cond); 447 } else { 448 /* 449 * Callers are not allowed to enqueue more than 450 * the specified blockif queue limit. Return an 451 * error to indicate that the queue length has been 452 * exceeded. 453 */ 454 err = E2BIG; 455 } 456 pthread_mutex_unlock(&bc->bc_mtx); 457 458 return (err); 459} 460 461int 462blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq) 463{ 464 465 assert(bc->bc_magic == BLOCKIF_SIG); 466 return (blockif_request(bc, breq, BOP_READ)); 467} 468 469int 470blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq) 471{ 472 473 assert(bc->bc_magic == BLOCKIF_SIG); 474 return (blockif_request(bc, breq, BOP_WRITE)); 475} 476 477int 478blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) 479{ 480 481 assert(bc->bc_magic == BLOCKIF_SIG); 482 return (blockif_request(bc, breq, BOP_FLUSH)); 483} 484 485int 486blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) 487{ 488 489 assert(bc->bc_magic == BLOCKIF_SIG); 490 return (blockif_request(bc, breq, BOP_DELETE)); 491} 492 493int 494blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) 495{ 496 struct blockif_elem *be; 497 498 assert(bc->bc_magic == BLOCKIF_SIG); 499 500 pthread_mutex_lock(&bc->bc_mtx); 501 /* 502 * Check pending requests. 503 */ 504 TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { 505 if (be->be_req == breq) 506 break; 507 } 508 if (be != NULL) { 509 /* 510 * Found it. 
511 */ 512 blockif_complete(bc, be); 513 pthread_mutex_unlock(&bc->bc_mtx); 514 515 return (0); 516 } 517 518 /* 519 * Check in-flight requests. 520 */ 521 TAILQ_FOREACH(be, &bc->bc_busyq, be_link) { 522 if (be->be_req == breq) 523 break; 524 } 525 if (be == NULL) { 526 /* 527 * Didn't find it. 528 */ 529 pthread_mutex_unlock(&bc->bc_mtx); 530 return (EINVAL); 531 } 532 533 /* 534 * Interrupt the processing thread to force it return 535 * prematurely via it's normal callback path. 536 */ 537 while (be->be_status == BST_BUSY) { 538 struct blockif_sig_elem bse, *old_head; 539 540 pthread_mutex_init(&bse.bse_mtx, NULL); 541 pthread_cond_init(&bse.bse_cond, NULL); 542 543 bse.bse_pending = 1; 544 545 do { 546 old_head = blockif_bse_head; 547 bse.bse_next = old_head; 548 } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head, 549 (uintptr_t)old_head, 550 (uintptr_t)&bse)); 551 552 pthread_kill(be->be_tid, SIGCONT); 553 554 pthread_mutex_lock(&bse.bse_mtx); 555 while (bse.bse_pending) 556 pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx); 557 pthread_mutex_unlock(&bse.bse_mtx); 558 } 559 560 pthread_mutex_unlock(&bc->bc_mtx); 561 562 /* 563 * The processing thread has been interrupted. Since it's not 564 * clear if the callback has been invoked yet, return EBUSY. 565 */ 566 return (EBUSY); 567} 568 569int 570blockif_close(struct blockif_ctxt *bc) 571{ 572 void *jval; 573 int err, i; 574 575 err = 0; 576 577 assert(bc->bc_magic == BLOCKIF_SIG); 578 579 /* 580 * Stop the block i/o thread 581 */ 582 pthread_mutex_lock(&bc->bc_mtx); 583 bc->bc_closing = 1; 584 pthread_mutex_unlock(&bc->bc_mtx); 585 pthread_cond_broadcast(&bc->bc_cond); 586 for (i = 0; i < BLOCKIF_NUMTHR; i++) 587 pthread_join(bc->bc_btid[i], &jval); 588 589 /* XXX Cancel queued i/o's ??? */ 590 591 /* 592 * Release resources 593 */ 594 bc->bc_magic = 0; 595 close(bc->bc_fd); 596 free(bc); 597 598 return (0); 599} 600 601/* 602 * Return virtual C/H/S values for a given block. 
Use the algorithm 603 * outlined in the VHD specification to calculate values. 604 */ 605void 606blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s) 607{ 608 off_t sectors; /* total sectors of the block dev */ 609 off_t hcyl; /* cylinders times heads */ 610 uint16_t secpt; /* sectors per track */ 611 uint8_t heads; 612 613 assert(bc->bc_magic == BLOCKIF_SIG); 614 615 sectors = bc->bc_size / bc->bc_sectsz; 616 617 /* Clamp the size to the largest possible with CHS */ 618 if (sectors > 65535UL*16*255) 619 sectors = 65535UL*16*255; 620 621 if (sectors >= 65536UL*16*63) { 622 secpt = 255; 623 heads = 16; 624 hcyl = sectors / secpt; 625 } else { 626 secpt = 17; 627 hcyl = sectors / secpt; 628 heads = (hcyl + 1023) / 1024; 629 630 if (heads < 4) 631 heads = 4; 632 633 if (hcyl >= (heads * 1024) || heads > 16) { 634 secpt = 31; 635 heads = 16; 636 hcyl = sectors / secpt; 637 } 638 if (hcyl >= (heads * 1024)) { 639 secpt = 63; 640 heads = 16; 641 hcyl = sectors / secpt; 642 } 643 } 644 645 *c = hcyl / heads; 646 *h = heads; 647 *s = secpt; 648} 649 650/* 651 * Accessors 652 */ 653off_t 654blockif_size(struct blockif_ctxt *bc) 655{ 656 657 assert(bc->bc_magic == BLOCKIF_SIG); 658 return (bc->bc_size); 659} 660 661int 662blockif_sectsz(struct blockif_ctxt *bc) 663{ 664 665 assert(bc->bc_magic == BLOCKIF_SIG); 666 return (bc->bc_sectsz); 667} 668 669void 670blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) 671{ 672 673 assert(bc->bc_magic == BLOCKIF_SIG); 674 *size = bc->bc_psectsz; 675 *off = bc->bc_psectoff; 676} 677 678int 679blockif_queuesz(struct blockif_ctxt *bc) 680{ 681 682 assert(bc->bc_magic == BLOCKIF_SIG); 683 return (BLOCKIF_MAXREQ - 1); 684} 685 686int 687blockif_is_ro(struct blockif_ctxt *bc) 688{ 689 690 assert(bc->bc_magic == BLOCKIF_SIG); 691 return (bc->bc_rdonly); 692} 693 694int 695blockif_candelete(struct blockif_ctxt *bc) 696{ 697 698 assert(bc->bc_magic == BLOCKIF_SIG); 699 return (bc->bc_candelete); 700} 701