subr_devstat.c revision 260385
/*-
 * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/kern/subr_devstat.c 260385 2014-01-07 01:32:23Z scottl $");

#include "opt_kdtrace.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/devicestat.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/conf.h>
#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/atomic.h>

#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>

dtrace_io_start_probe_func_t dtrace_io_start_probe;
dtrace_io_done_probe_func_t dtrace_io_done_probe;
dtrace_io_wait_start_probe_func_t dtrace_io_wait_start_probe;
dtrace_io_wait_done_probe_func_t dtrace_io_wait_done_probe;

uint32_t	dtio_start_id;
uint32_t	dtio_done_id;
uint32_t	dtio_wait_start_id;
uint32_t	dtio_wait_done_id;

#define	DTRACE_DEVSTAT_START() \
	if (dtrace_io_start_probe != NULL) \
		(*dtrace_io_start_probe)(dtio_start_id, NULL, ds);

#define	DTRACE_DEVSTAT_BIO_START() \
	if (dtrace_io_start_probe != NULL) \
		(*dtrace_io_start_probe)(dtio_start_id, bp, ds);

#define	DTRACE_DEVSTAT_DONE() \
	if (dtrace_io_done_probe != NULL) \
		(*dtrace_io_done_probe)(dtio_done_id, NULL, ds);

#define	DTRACE_DEVSTAT_BIO_DONE() \
	if (dtrace_io_done_probe != NULL) \
		(*dtrace_io_done_probe)(dtio_done_id, bp, ds);

#define	DTRACE_DEVSTAT_WAIT_START() \
	if (dtrace_io_wait_start_probe != NULL) \
		(*dtrace_io_wait_start_probe)(dtio_wait_start_id, NULL, ds);

#define	DTRACE_DEVSTAT_WAIT_DONE() \
	if (dtrace_io_wait_done_probe != NULL) \
		(*dtrace_io_wait_done_probe)(dtio_wait_done_id, NULL, ds);

#else /* ! KDTRACE_HOOKS */

#define	DTRACE_DEVSTAT_START()

#define	DTRACE_DEVSTAT_BIO_START()

#define	DTRACE_DEVSTAT_DONE()

#define	DTRACE_DEVSTAT_BIO_DONE()

#define	DTRACE_DEVSTAT_WAIT_START()

#define	DTRACE_DEVSTAT_WAIT_DONE()
#endif /* KDTRACE_HOOKS */

static int devstat_num_devs;
static long devstat_generation = 1;
static int devstat_version = DEVSTAT_VERSION;
static int devstat_current_devnumber;
static struct mtx devstat_mutex;
MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF);

static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq);
static struct devstat *devstat_alloc(void);
static void devstat_free(struct devstat *);
static void devstat_add_entry(struct devstat *ds, const void *dev_name,
		       int unit_number, uint32_t block_size,
		       devstat_support_flags flags,
		       devstat_type_flags device_type,
		       devstat_priority priority);

/*
 * Allocate a devstat and initialize it
 */
struct devstat *
devstat_new_entry(const void *dev_name,
		  int unit_number, uint32_t block_size,
		  devstat_support_flags flags,
		  devstat_type_flags device_type,
		  devstat_priority priority)
{
	struct devstat *ds;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	ds = devstat_alloc();
	mtx_lock(&devstat_mutex);
	if (unit_number == -1) {
		ds->unit_number = unit_number;
		ds->id = dev_name;
		binuptime(&ds->creation_time);
		devstat_generation++;
	} else {
		devstat_add_entry(ds, dev_name, unit_number, block_size,
				  flags, device_type, priority);
	}
	mtx_unlock(&devstat_mutex);
	return (ds);
}
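
/*
 * Illustrative only, not compiled: a minimal sketch of the provider-side
 * life cycle as a disk-like driver might use this API.  The "foo" names,
 * the unit/block-size values and the specific flag choices are
 * hypothetical; the constants are the usual ones from <sys/devicestat.h>.
 */
#if 0
static struct devstat *foo_stats;

/* Attach: allocate one devstat entry per device instance. */
foo_stats = devstat_new_entry("foo", unit, DEV_BSIZE,
    DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_DISK);

/* Per I/O: bracket each bio between a start and an end transaction. */
devstat_start_transaction_bio(foo_stats, bp);
/* ... the hardware completes the transfer, bp->bio_resid is updated ... */
devstat_end_transaction_bio(foo_stats, bp);

/* Detach: remove the entry again. */
devstat_remove_entry(foo_stats);
#endif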

/*
 * Take a malloced and zeroed devstat structure given to us, fill it in
 * and add it to the queue of devices.
 */
static void
devstat_add_entry(struct devstat *ds, const void *dev_name,
		  int unit_number, uint32_t block_size,
		  devstat_support_flags flags,
		  devstat_type_flags device_type,
		  devstat_priority priority)
{
	struct devstatlist *devstat_head;
	struct devstat *ds_tmp;

	mtx_assert(&devstat_mutex, MA_OWNED);
	devstat_num_devs++;

	devstat_head = &device_statq;

	/*
	 * Priority sort.  Each driver passes in its priority when it adds
	 * its devstat entry.  Drivers are sorted first by priority, and
	 * then by probe order.
	 *
	 * For the first device, we just insert it, since the priority
	 * doesn't really matter yet.  Subsequent devices are inserted into
	 * the list using the order outlined above.
	 */
	if (devstat_num_devs == 1)
		STAILQ_INSERT_TAIL(devstat_head, ds, dev_links);
	else {
		STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) {
			struct devstat *ds_next;

			ds_next = STAILQ_NEXT(ds_tmp, dev_links);

			/*
			 * If we find a break between higher and lower
			 * priority items, and if this item fits in the
			 * break, insert it.  This also applies if the
			 * "lower priority item" is the end of the list.
			 */
			if ((priority <= ds_tmp->priority)
			 && ((ds_next == NULL)
			   || (priority > ds_next->priority))) {
				STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds,
						    dev_links);
				break;
			} else if (priority > ds_tmp->priority) {
				/*
				 * If this is the case, we should be able
				 * to insert ourselves at the head of the
				 * list.  If we can't, something is wrong.
				 */
				if (ds_tmp == STAILQ_FIRST(devstat_head)) {
					STAILQ_INSERT_HEAD(devstat_head,
							   ds, dev_links);
					break;
				} else {
					STAILQ_INSERT_TAIL(devstat_head,
							   ds, dev_links);
					printf("devstat_add_entry: HELP! "
					       "sorting problem detected "
					       "for name %p unit %d\n",
					       dev_name, unit_number);
					break;
				}
			}
		}
	}

	ds->device_number = devstat_current_devnumber++;
	ds->unit_number = unit_number;
	strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN);
	ds->block_size = block_size;
	ds->flags = flags;
	ds->device_type = device_type;
	ds->priority = priority;
	binuptime(&ds->creation_time);
	devstat_generation++;
}

/*
 * Remove a devstat structure from the list of devices.
 */
void
devstat_remove_entry(struct devstat *ds)
{
	struct devstatlist *devstat_head;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);
	if (ds == NULL)
		return;

	mtx_lock(&devstat_mutex);

	devstat_head = &device_statq;

	/* Remove this entry from the devstat queue */
	atomic_add_acq_int(&ds->sequence1, 1);
	if (ds->unit_number != -1) {
		devstat_num_devs--;
		STAILQ_REMOVE(devstat_head, ds, devstat, dev_links);
	}
	devstat_free(ds);
	devstat_generation++;
	mtx_unlock(&devstat_mutex);
}

/*
 * Record a transaction start.
 *
 * See comments for devstat_end_transaction().  Ordering is very important
 * here.
 */
void
devstat_start_transaction(struct devstat *ds, struct bintime *now)
{

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/* sanity check */
	if (ds == NULL)
		return;

	atomic_add_acq_int(&ds->sequence1, 1);
	/*
	 * We only want to set the start time when we are going from idle
	 * to busy.  The start time is really the start of the latest busy
	 * period.
	 */
	if (ds->start_count == ds->end_count) {
		if (now != NULL)
			ds->busy_from = *now;
		else
			binuptime(&ds->busy_from);
	}
	ds->start_count++;
	atomic_add_rel_int(&ds->sequence0, 1);
	DTRACE_DEVSTAT_START();
}

void
devstat_start_transaction_bio(struct devstat *ds, struct bio *bp)
{

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/* sanity check */
	if (ds == NULL)
		return;

	binuptime(&bp->bio_t0);
	devstat_start_transaction(ds, &bp->bio_t0);
	DTRACE_DEVSTAT_BIO_START();
}

/*
 * Record the ending of a transaction, and increment the various counters.
 *
 * Ordering in this function, and in devstat_start_transaction(), is VERY
 * important.  The idea here is to run without locks, so we are very
 * careful to only modify some fields on the way "down" (i.e. at
 * transaction start) and some fields on the way "up" (i.e. at transaction
 * completion).  One exception is busy_from, which we only modify in
 * devstat_start_transaction() when there are no outstanding transactions,
 * and thus it can't be modified in devstat_end_transaction()
 * simultaneously.
 *
 * The sequence0 and sequence1 fields are provided to enable an application
 * spying on the structures with mmap(2) to tell when a structure is in a
 * consistent state or not.
 *
 * For this to work 100% reliably, it is important that the two fields
 * are at opposite ends of the structure and that they are incremented
 * in the opposite order of how a memcpy(3) in userland would copy them.
 * We assume that the copying happens front to back, but there is actually
 * no way short of writing your own memcpy(3) replacement to guarantee
 * this will be the case.
 *
 * In addition, since the two fields act as a kind of lock, they must be
 * updated with atomic instructions using appropriate memory barriers.
 */
void
devstat_end_transaction(struct devstat *ds, uint32_t bytes,
			devstat_tag_type tag_type, devstat_trans_flags flags,
			struct bintime *now, struct bintime *then)
{
	struct bintime dt, lnow;

	/* sanity check */
	if (ds == NULL)
		return;

	if (now == NULL) {
		now = &lnow;
		binuptime(now);
	}

	atomic_add_acq_int(&ds->sequence1, 1);
	/* Update byte and operations counts */
	ds->bytes[flags] += bytes;
	ds->operations[flags]++;

	/*
	 * Keep a count of the various tag types sent.
	 */
	if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 &&
	    tag_type != DEVSTAT_TAG_NONE)
		ds->tag_types[tag_type]++;

	if (then != NULL) {
		/* Update duration of operations */
		dt = *now;
		bintime_sub(&dt, then);
		bintime_add(&ds->duration[flags], &dt);
	}

	/* Accumulate busy time */
	dt = *now;
	bintime_sub(&dt, &ds->busy_from);
	bintime_add(&ds->busy_time, &dt);
	ds->busy_from = *now;

	ds->end_count++;
	atomic_add_rel_int(&ds->sequence0, 1);
	DTRACE_DEVSTAT_DONE();
}
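
/*
 * Illustrative only, not compiled: a minimal sketch of the consumer side of
 * the sequence0/sequence1 protocol described above, as a userland program
 * that has mmap(2)'ed the statistics pages might implement it.  The real
 * consumer is the devstat(3) library; devstat_snapshot() is a hypothetical
 * helper name.
 */
#if 0
#include <sys/devicestat.h>
#include <string.h>

/* Copy *src into *dst, retrying until the copy is self-consistent. */
static void
devstat_snapshot(const struct devstat *src, struct devstat *dst)
{
	do {
		/*
		 * A front-to-back memcpy(3) reads sequence0 (the first
		 * field) before sequence1 (the last field), which is the
		 * opposite of the update order used by the kernel above.
		 */
		memcpy(dst, src, sizeof(*dst));
	} while (dst->sequence0 != dst->sequence1);
}
#endif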

void
devstat_end_transaction_bio(struct devstat *ds, struct bio *bp)
{

	devstat_end_transaction_bio_bt(ds, bp, NULL);
}

void
devstat_end_transaction_bio_bt(struct devstat *ds, struct bio *bp,
    struct bintime *now)
{
	devstat_trans_flags flg;

	/* sanity check */
	if (ds == NULL)
		return;

	if (bp->bio_cmd == BIO_DELETE)
		flg = DEVSTAT_FREE;
	else if (bp->bio_cmd == BIO_READ)
		flg = DEVSTAT_READ;
	else if (bp->bio_cmd == BIO_WRITE)
		flg = DEVSTAT_WRITE;
	else
		flg = DEVSTAT_NO_DATA;

	devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
				DEVSTAT_TAG_SIMPLE, flg, now, &bp->bio_t0);
	DTRACE_DEVSTAT_BIO_DONE();
}

/*
 * This is the sysctl handler for the devstat package.  The data pushed out
 * on the kern.devstat.all sysctl variable consists of the current devstat
 * generation number, and then an array of devstat structures, one for each
 * device in the system.
 *
 * This is more cryptic than obvious, but basically we neither can nor
 * want to hold the devstat_mutex for any amount of time, so we grab it
 * only when we need to and keep an eye on devstat_generation all the time.
 */
static int
sysctl_devstat(SYSCTL_HANDLER_ARGS)
{
	int error;
	long mygen;
	struct devstat *nds;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/*
	 * XXX devstat_generation should really be "volatile" but that
	 * XXX freaks out the sysctl macro below.  The places where we
	 * XXX change it and inspect it are bracketed in the mutex which
	 * XXX guarantees us proper write barriers.  I don't believe the
	 * XXX compiler is allowed to optimize mygen away across calls
	 * XXX to other functions, so the following is believed to be safe.
	 */
	mygen = devstat_generation;

	error = SYSCTL_OUT(req, &mygen, sizeof(mygen));

	if (devstat_num_devs == 0)
		return(0);

	if (error != 0)
		return (error);

	mtx_lock(&devstat_mutex);
	nds = STAILQ_FIRST(&device_statq);
	if (mygen != devstat_generation)
		error = EBUSY;
	mtx_unlock(&devstat_mutex);

	if (error != 0)
		return (error);

	for (;nds != NULL;) {
		error = SYSCTL_OUT(req, nds, sizeof(struct devstat));
		if (error != 0)
			return (error);
		mtx_lock(&devstat_mutex);
		if (mygen != devstat_generation)
			error = EBUSY;
		else
			nds = STAILQ_NEXT(nds, dev_links);
		mtx_unlock(&devstat_mutex);
		if (error != 0)
			return (error);
	}
	return(error);
}
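
/*
 * Illustrative only, not compiled: a rough sketch of a userland reader of
 * kern.devstat.all.  The supported interface is the devstat(3) library;
 * this only shows the layout pushed out above, i.e. one generation number
 * (a long in this version) followed by an array of struct devstat.  The
 * function name is hypothetical, and a real reader would retry when the
 * generation changes between the sizing call and the fetch.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/devicestat.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
print_devstat_names(void)
{
	struct devstat *ds;
	size_t len;
	char *buf;
	long gen;
	u_int i, ndevs;

	/* First call sizes the buffer. */
	if (sysctlbyname("kern.devstat.all", NULL, &len, NULL, 0) == -1)
		return;
	if ((buf = malloc(len)) == NULL)
		return;
	if (sysctlbyname("kern.devstat.all", buf, &len, NULL, 0) == -1) {
		free(buf);
		return;
	}
	/* Generation number first, then the devstat array. */
	memcpy(&gen, buf, sizeof(gen));
	ndevs = (len - sizeof(gen)) / sizeof(struct devstat);
	ds = (struct devstat *)(buf + sizeof(gen));
	for (i = 0; i < ndevs; i++)
		printf("%s%d (generation %ld)\n", ds[i].device_name,
		    ds[i].unit_number, gen);
	free(buf);
}
#endif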

/*
 * Sysctl entries for devstat.  The first one is a node that all the rest
 * hang off of.
 */
static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, NULL,
    "Device Statistics");

SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE,
    NULL, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list");
/*
 * Export the number of devices in the system so that userland utilities
 * can determine how much memory to allocate to hold all the devices.
 */
SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD,
    &devstat_num_devs, 0, "Number of devices in the devstat list");
SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD,
    &devstat_generation, 0, "Devstat list generation");
SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD,
    &devstat_version, 0, "Devstat list version number");

/*
 * Allocator for struct devstat structures.  We sub-allocate these from pages
 * which we get from malloc.  These pages are exported for mmap(2)'ing through
 * a miniature device driver
 */

#define	statsperpage (PAGE_SIZE / sizeof(struct devstat))

static d_mmap_t devstat_mmap;

static struct cdevsw devstat_cdevsw = {
	.d_version =	D_VERSION,
	.d_flags =	D_NEEDGIANT,
	.d_mmap =	devstat_mmap,
	.d_name =	"devstat",
};

struct statspage {
	TAILQ_ENTRY(statspage)	list;
	struct devstat		*stat;
	u_int			nfree;
};

static TAILQ_HEAD(, statspage)	pagelist = TAILQ_HEAD_INITIALIZER(pagelist);
static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics");

static int
devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
    int nprot, vm_memattr_t *memattr)
{
	struct statspage *spp;

	if (nprot != VM_PROT_READ)
		return (-1);
	TAILQ_FOREACH(spp, &pagelist, list) {
		if (offset == 0) {
			*paddr = vtophys(spp->stat);
			return (0);
		}
		offset -= PAGE_SIZE;
	}
	return (-1);
}
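
/*
 * Illustrative only, not compiled: a rough sketch of how a userland program
 * can map the pages exported above.  Each PAGE_SIZE offset maps the next
 * statspage, in list order.  The real consumer is the devstat(3) library;
 * the device path assumes DEVSTAT_DEVICE_NAME ("devstat") appears under
 * /dev, and map_devstat_page() is a hypothetical helper.
 */
#if 0
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/devicestat.h>
#include <fcntl.h>
#include <unistd.h>

static struct devstat *
map_devstat_page(u_int pageno)
{
	void *p;
	int fd;

	fd = open("/dev/devstat", O_RDONLY);
	if (fd == -1)
		return (NULL);
	/* Read-only mapping; devstat_mmap() rejects anything else. */
	p = mmap(NULL, getpagesize(), PROT_READ, MAP_SHARED, fd,
	    (off_t)pageno * getpagesize());
	close(fd);
	return (p == MAP_FAILED ? NULL : (struct devstat *)p);
}
#endif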

static struct devstat *
devstat_alloc(void)
{
	struct devstat *dsp;
	struct statspage *spp, *spp2;
	u_int u;
	static int once;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);
	if (!once) {
		make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME,
		    &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
		    DEVSTAT_DEVICE_NAME);
		once = 1;
	}
	spp2 = NULL;
	mtx_lock(&devstat_mutex);
	for (;;) {
		TAILQ_FOREACH(spp, &pagelist, list) {
			if (spp->nfree > 0)
				break;
		}
		if (spp != NULL)
			break;
		mtx_unlock(&devstat_mutex);
		spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK);
		spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK);
		spp2->nfree = statsperpage;

		/*
		 * If free statspages were added while the lock was
		 * released, just reuse them.
		 */
		mtx_lock(&devstat_mutex);
		TAILQ_FOREACH(spp, &pagelist, list)
			if (spp->nfree > 0)
				break;
		if (spp == NULL) {
			spp = spp2;

			/*
			 * It would make more sense to add the new page at
			 * the head, but the order on the list determines
			 * the sequence of the mapping, so we can't do that.
			 */
			TAILQ_INSERT_TAIL(&pagelist, spp, list);
		} else
			break;
	}
	dsp = spp->stat;
	for (u = 0; u < statsperpage; u++) {
		if (dsp->allocated == 0)
			break;
		dsp++;
	}
	spp->nfree--;
	dsp->allocated = 1;
	mtx_unlock(&devstat_mutex);
	if (spp2 != NULL && spp2 != spp) {
		free(spp2->stat, M_DEVSTAT);
		free(spp2, M_DEVSTAT);
	}
	return (dsp);
}

static void
devstat_free(struct devstat *dsp)
{
	struct statspage *spp;

	mtx_assert(&devstat_mutex, MA_OWNED);
	bzero(dsp, sizeof *dsp);
	TAILQ_FOREACH(spp, &pagelist, list) {
		if (dsp >= spp->stat && dsp < (spp->stat + statsperpage)) {
			spp->nfree++;
			return;
		}
	}
}

SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD,
    NULL, sizeof(struct devstat), "sizeof(struct devstat)");