lockstat.c revision 297077
1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26#pragma ident "%Z%%M% %I% %E% SMI" 27 28#include <stdio.h> 29#include <stddef.h> 30#include <stdlib.h> 31#include <stdarg.h> 32#include <string.h> 33#include <strings.h> 34#include <ctype.h> 35#include <fcntl.h> 36#include <unistd.h> 37#include <errno.h> 38#include <limits.h> 39#include <sys/types.h> 40#include <sys/modctl.h> 41#include <sys/stat.h> 42#include <sys/wait.h> 43#include <dtrace.h> 44#include <sys/lockstat.h> 45#include <alloca.h> 46#include <signal.h> 47#include <assert.h> 48 49#ifdef illumos 50#define GETOPT_EOF EOF 51#else 52#include <sys/time.h> 53#include <sys/resource.h> 54 55#define mergesort(a, b, c, d) lsmergesort(a, b, c, d) 56#define GETOPT_EOF (-1) 57 58typedef uintptr_t pc_t; 59#endif 60 61#define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V" 62 63#define LS_MAX_STACK_DEPTH 50 64#define LS_MAX_EVENTS 64 65 66typedef struct lsrec { 67 struct lsrec *ls_next; /* next in hash chain */ 68 uintptr_t ls_lock; /* lock address */ 69 uintptr_t ls_caller; /* caller address */ 70 uint32_t ls_count; /* cumulative event count */ 71 uint32_t ls_event; /* type of event */ 72 uintptr_t ls_refcnt; /* cumulative reference count */ 73 uint64_t ls_time; /* cumulative event duration */ 74 uint32_t ls_hist[64]; /* log2(duration) histogram */ 75 uintptr_t ls_stack[LS_MAX_STACK_DEPTH]; 76} lsrec_t; 77 78typedef struct lsdata { 79 struct lsrec *lsd_next; /* next available */ 80 int lsd_count; /* number of records */ 81} lsdata_t; 82 83/* 84 * Definitions for the types of experiments which can be run. They are 85 * listed in increasing order of memory cost and processing time cost. 86 * The numerical value of each type is the number of bytes needed per record. 87 */ 88#define LS_BASIC offsetof(lsrec_t, ls_time) 89#define LS_TIME offsetof(lsrec_t, ls_hist[0]) 90#define LS_HIST offsetof(lsrec_t, ls_stack[0]) 91#define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth]) 92 93static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t); 94static void report_trace(FILE *, lsrec_t **); 95 96extern int symtab_init(void); 97extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *); 98extern uintptr_t sym_to_addr(char *name); 99extern size_t sym_size(char *name); 100extern char *strtok_r(char *, const char *, char **); 101 102#define DEFAULT_NRECS 10000 103#define DEFAULT_HZ 97 104#define MAX_HZ 1000 105#define MIN_AGGSIZE (16 * 1024) 106#define MAX_AGGSIZE (32 * 1024 * 1024) 107 108static int g_stkdepth; 109static int g_topn = INT_MAX; 110static hrtime_t g_elapsed; 111static int g_rates = 0; 112static int g_pflag = 0; 113static int g_Pflag = 0; 114static int g_wflag = 0; 115static int g_Wflag = 0; 116static int g_cflag = 0; 117static int g_kflag = 0; 118static int g_gflag = 0; 119static int g_Vflag = 0; 120static int g_tracing = 0; 121static size_t g_recsize; 122static size_t g_nrecs; 123static int g_nrecs_used; 124static uchar_t g_enabled[LS_MAX_EVENTS]; 125static hrtime_t g_min_duration[LS_MAX_EVENTS]; 126static dtrace_hdl_t *g_dtp; 127static char *g_predicate; 128static char *g_ipredicate; 129static char *g_prog; 130static int g_proglen; 131static int g_dropped; 132 133typedef struct ls_event_info { 134 char ev_type; 135 char ev_lhdr[20]; 136 char ev_desc[80]; 137 char ev_units[10]; 138 char ev_name[DTRACE_NAMELEN]; 139 char *ev_predicate; 140 char *ev_acquire; 141} ls_event_info_t; 142 143static ls_event_info_t g_event_info[LS_MAX_EVENTS] = { 144 { 'C', "Lock", "Adaptive mutex spin", "nsec", 145 "lockstat:::adaptive-spin" }, 146 { 'C', "Lock", "Adaptive mutex block", "nsec", 147 "lockstat:::adaptive-block" }, 148 { 'C', "Lock", "Spin lock spin", "nsec", 149 "lockstat:::spin-spin" }, 150 { 'C', "Lock", "Thread lock spin", "nsec", 151 "lockstat:::thread-spin" }, 152 { 'C', "Lock", "R/W writer blocked by writer", "nsec", 153 "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" }, 154 { 'C', "Lock", "R/W writer blocked by readers", "nsec", 155 "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" }, 156 { 'C', "Lock", "R/W reader blocked by writer", "nsec", 157 "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" }, 158 { 'C', "Lock", "R/W reader blocked by write wanted", "nsec", 159 "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" }, 160 { 'C', "Lock", "R/W writer spin on writer", "nsec", 161 "lockstat:::rw-spin", "arg2 == 0 && arg3 == 1" }, 162 { 'C', "Lock", "R/W writer spin on readers", "nsec", 163 "lockstat:::rw-spin", "arg2 == 0 && arg3 == 0 && arg4" }, 164 { 'C', "Lock", "R/W reader spin on writer", "nsec", 165 "lockstat:::rw-spin", "arg2 != 0 && arg3 == 1" }, 166 { 'C', "Lock", "R/W reader spin on write wanted", "nsec", 167 "lockstat:::rw-spin", "arg2 != 0 && arg3 == 0 && arg4" }, 168 { 'C', "Lock", "SX exclusive block", "nsec", 169 "lockstat:::sx-block", "arg2 == 0" }, 170 { 'C', "Lock", "SX shared block", "nsec", 171 "lockstat:::sx-block", "arg2 != 0" }, 172 { 'C', "Lock", "SX exclusive spin", "nsec", 173 "lockstat:::sx-spin", "arg2 == 0" }, 174 { 'C', "Lock", "SX shared spin", "nsec", 175 "lockstat:::sx-spin", "arg2 != 0" }, 176 { 'C', "Lock", "Unknown event (type 16)", "units" }, 177 { 'C', "Lock", "Unknown event (type 17)", "units" }, 178 { 'C', "Lock", "Unknown event (type 18)", "units" }, 179 { 'C', "Lock", "Unknown event (type 19)", "units" }, 180 { 'C', "Lock", "Unknown event (type 20)", "units" }, 181 { 'C', "Lock", "Unknown event (type 21)", "units" }, 182 { 'C', "Lock", "Unknown event (type 22)", "units" }, 183 { 'C', "Lock", "Unknown event (type 23)", "units" }, 184 { 'C', "Lock", "Unknown event (type 24)", "units" }, 185 { 'C', "Lock", "Unknown event (type 25)", "units" }, 186 { 'C', "Lock", "Unknown event (type 26)", "units" }, 187 { 'C', "Lock", "Unknown event (type 27)", "units" }, 188 { 'C', "Lock", "Unknown event (type 28)", "units" }, 189 { 'C', "Lock", "Unknown event (type 29)", "units" }, 190 { 'C', "Lock", "Unknown event (type 30)", "units" }, 191 { 'C', "Lock", "Unknown event (type 31)", "units" }, 192 { 'H', "Lock", "Adaptive mutex hold", "nsec", 193 "lockstat:::adaptive-release", NULL, 194 "lockstat:::adaptive-acquire" }, 195 { 'H', "Lock", "Spin lock hold", "nsec", 196 "lockstat:::spin-release", NULL, 197 "lockstat:::spin-acquire" }, 198 { 'H', "Lock", "R/W writer hold", "nsec", 199 "lockstat::rw_wunlock:rw-release", NULL, 200 "lockstat::rw_wlock:rw-acquire" }, 201 { 'H', "Lock", "R/W reader hold", "nsec", 202 "lockstat::rw_runlock:rw-release", NULL, 203 "lockstat::rw_rlock:rw-acquire" }, 204 { 'H', "Lock", "SX shared hold", "nsec", 205 "lockstat::sx_sunlock:sx-release", NULL, 206 "lockstat::sx_slock:sx-acquire" }, 207 { 'H', "Lock", "SX exclusive hold", "nsec", 208 "lockstat::sx_xunlock:sx-release", NULL, 209 "lockstat::sx_xlock:sx-acquire" }, 210 { 'H', "Lock", "Unknown event (type 38)", "units" }, 211 { 'H', "Lock", "Unknown event (type 39)", "units" }, 212 { 'H', "Lock", "Unknown event (type 40)", "units" }, 213 { 'H', "Lock", "Unknown event (type 41)", "units" }, 214 { 'H', "Lock", "Unknown event (type 42)", "units" }, 215 { 'H', "Lock", "Unknown event (type 43)", "units" }, 216 { 'H', "Lock", "Unknown event (type 44)", "units" }, 217 { 'H', "Lock", "Unknown event (type 45)", "units" }, 218 { 'H', "Lock", "Unknown event (type 46)", "units" }, 219 { 'H', "Lock", "Unknown event (type 47)", "units" }, 220 { 'H', "Lock", "Unknown event (type 48)", "units" }, 221 { 'H', "Lock", "Unknown event (type 49)", "units" }, 222 { 'H', "Lock", "Unknown event (type 50)", "units" }, 223 { 'H', "Lock", "Unknown event (type 51)", "units" }, 224 { 'H', "Lock", "Unknown event (type 52)", "units" }, 225 { 'H', "Lock", "Unknown event (type 53)", "units" }, 226 { 'H', "Lock", "Unknown event (type 54)", "units" }, 227 { 'H', "Lock", "Unknown event (type 55)", "units" }, 228#ifdef illumos 229 { 'I', "CPU+PIL", "Profiling interrupt", "nsec", 230#else 231 { 'I', "CPU+Pri_Class", "Profiling interrupt", "nsec", 232#endif 233 "profile:::profile-97", NULL }, 234 { 'I', "Lock", "Unknown event (type 57)", "units" }, 235 { 'I', "Lock", "Unknown event (type 58)", "units" }, 236 { 'I', "Lock", "Unknown event (type 59)", "units" }, 237 { 'E', "Lock", "Recursive lock entry detected", "(N/A)", 238 "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" }, 239 { 'E', "Lock", "Lockstat enter failure", "(N/A)" }, 240 { 'E', "Lock", "Lockstat exit failure", "nsec" }, 241 { 'E', "Lock", "Lockstat record failure", "(N/A)" }, 242}; 243 244#ifndef illumos 245static char *g_pri_class[] = { 246 "", 247 "Intr", 248 "RealT", 249 "TShar", 250 "Idle" 251}; 252#endif 253 254static void 255fail(int do_perror, const char *message, ...) 256{ 257 va_list args; 258 int save_errno = errno; 259 260 va_start(args, message); 261 (void) fprintf(stderr, "lockstat: "); 262 (void) vfprintf(stderr, message, args); 263 va_end(args); 264 if (do_perror) 265 (void) fprintf(stderr, ": %s", strerror(save_errno)); 266 (void) fprintf(stderr, "\n"); 267 exit(2); 268} 269 270static void 271dfail(const char *message, ...) 272{ 273 va_list args; 274 275 va_start(args, message); 276 (void) fprintf(stderr, "lockstat: "); 277 (void) vfprintf(stderr, message, args); 278 va_end(args); 279 (void) fprintf(stderr, ": %s\n", 280 dtrace_errmsg(g_dtp, dtrace_errno(g_dtp))); 281 282 exit(2); 283} 284 285static void 286show_events(char event_type, char *desc) 287{ 288 int i, first = -1, last; 289 290 for (i = 0; i < LS_MAX_EVENTS; i++) { 291 ls_event_info_t *evp = &g_event_info[i]; 292 if (evp->ev_type != event_type || 293 strncmp(evp->ev_desc, "Unknown event", 13) == 0) 294 continue; 295 if (first == -1) 296 first = i; 297 last = i; 298 } 299 300 (void) fprintf(stderr, 301 "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n", 302 desc, event_type, first, last); 303 304 for (i = first; i <= last; i++) 305 (void) fprintf(stderr, 306 "%4d = %s\n", i, g_event_info[i].ev_desc); 307} 308 309static void 310usage(void) 311{ 312 (void) fprintf(stderr, 313 "Usage: lockstat [options] command [args]\n" 314 "\nEvent selection options:\n\n" 315 " -C watch contention events [on by default]\n" 316 " -E watch error events [off by default]\n" 317 " -H watch hold events [off by default]\n" 318 " -I watch interrupt events [off by default]\n" 319 " -A watch all lock events [equivalent to -CH]\n" 320 " -e event_list only watch the specified events (shown below);\n" 321 " <event_list> is a comma-separated list of\n" 322 " events or ranges of events, e.g. 1,4-7,35\n" 323 " -i rate interrupt rate for -I [default: %d Hz]\n" 324 "\nData gathering options:\n\n" 325 " -b basic statistics (lock, caller, event count)\n" 326 " -t timing for all events [default]\n" 327 " -h histograms for event times\n" 328 " -s depth stack traces <depth> deep\n" 329 " -x opt[=val] enable or modify DTrace options\n" 330 "\nData filtering options:\n\n" 331 " -n nrecords maximum number of data records [default: %d]\n" 332 " -l lock[,size] only watch <lock>, which can be specified as a\n" 333 " symbolic name or hex address; <size> defaults\n" 334 " to the ELF symbol size if available, 1 if not\n" 335 " -f func[,size] only watch events generated by <func>\n" 336 " -d duration only watch events longer than <duration>\n" 337 " -T trace (rather than sample) events\n" 338 "\nData reporting options:\n\n" 339 " -c coalesce lock data for arrays like pse_mutex[]\n" 340 " -k coalesce PCs within functions\n" 341 " -g show total events generated by function\n" 342 " -w wherever: don't distinguish events by caller\n" 343 " -W whichever: don't distinguish events by lock\n" 344 " -R display rates rather than counts\n" 345 " -p parsable output format (awk(1)-friendly)\n" 346 " -P sort lock data by (count * avg_time) product\n" 347 " -D n only display top <n> events of each type\n" 348 " -o filename send output to <filename>\n", 349 DEFAULT_HZ, DEFAULT_NRECS); 350 351 show_events('C', "Contention"); 352 show_events('H', "Hold-time"); 353 show_events('I', "Interrupt"); 354 show_events('E', "Error"); 355 (void) fprintf(stderr, "\n"); 356 357 exit(1); 358} 359 360static int 361lockcmp(lsrec_t *a, lsrec_t *b) 362{ 363 int i; 364 365 if (a->ls_event < b->ls_event) 366 return (-1); 367 if (a->ls_event > b->ls_event) 368 return (1); 369 370 for (i = g_stkdepth - 1; i >= 0; i--) { 371 if (a->ls_stack[i] < b->ls_stack[i]) 372 return (-1); 373 if (a->ls_stack[i] > b->ls_stack[i]) 374 return (1); 375 } 376 377 if (a->ls_caller < b->ls_caller) 378 return (-1); 379 if (a->ls_caller > b->ls_caller) 380 return (1); 381 382 if (a->ls_lock < b->ls_lock) 383 return (-1); 384 if (a->ls_lock > b->ls_lock) 385 return (1); 386 387 return (0); 388} 389 390static int 391countcmp(lsrec_t *a, lsrec_t *b) 392{ 393 if (a->ls_event < b->ls_event) 394 return (-1); 395 if (a->ls_event > b->ls_event) 396 return (1); 397 398 return (b->ls_count - a->ls_count); 399} 400 401static int 402timecmp(lsrec_t *a, lsrec_t *b) 403{ 404 if (a->ls_event < b->ls_event) 405 return (-1); 406 if (a->ls_event > b->ls_event) 407 return (1); 408 409 if (a->ls_time < b->ls_time) 410 return (1); 411 if (a->ls_time > b->ls_time) 412 return (-1); 413 414 return (0); 415} 416 417static int 418lockcmp_anywhere(lsrec_t *a, lsrec_t *b) 419{ 420 if (a->ls_event < b->ls_event) 421 return (-1); 422 if (a->ls_event > b->ls_event) 423 return (1); 424 425 if (a->ls_lock < b->ls_lock) 426 return (-1); 427 if (a->ls_lock > b->ls_lock) 428 return (1); 429 430 return (0); 431} 432 433static int 434lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b) 435{ 436 if (a->ls_event < b->ls_event) 437 return (-1); 438 if (a->ls_event > b->ls_event) 439 return (1); 440 441 if (a->ls_lock < b->ls_lock) 442 return (-1); 443 if (a->ls_lock > b->ls_lock) 444 return (1); 445 446 return (b->ls_count - a->ls_count); 447} 448 449static int 450sitecmp_anylock(lsrec_t *a, lsrec_t *b) 451{ 452 int i; 453 454 if (a->ls_event < b->ls_event) 455 return (-1); 456 if (a->ls_event > b->ls_event) 457 return (1); 458 459 for (i = g_stkdepth - 1; i >= 0; i--) { 460 if (a->ls_stack[i] < b->ls_stack[i]) 461 return (-1); 462 if (a->ls_stack[i] > b->ls_stack[i]) 463 return (1); 464 } 465 466 if (a->ls_caller < b->ls_caller) 467 return (-1); 468 if (a->ls_caller > b->ls_caller) 469 return (1); 470 471 return (0); 472} 473 474static int 475site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b) 476{ 477 int i; 478 479 if (a->ls_event < b->ls_event) 480 return (-1); 481 if (a->ls_event > b->ls_event) 482 return (1); 483 484 for (i = g_stkdepth - 1; i >= 0; i--) { 485 if (a->ls_stack[i] < b->ls_stack[i]) 486 return (-1); 487 if (a->ls_stack[i] > b->ls_stack[i]) 488 return (1); 489 } 490 491 if (a->ls_caller < b->ls_caller) 492 return (-1); 493 if (a->ls_caller > b->ls_caller) 494 return (1); 495 496 return (b->ls_count - a->ls_count); 497} 498 499static void 500lsmergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n) 501{ 502 int m = n / 2; 503 int i, j; 504 505 if (m > 1) 506 lsmergesort(cmp, a, b, m); 507 if (n - m > 1) 508 lsmergesort(cmp, a + m, b + m, n - m); 509 for (i = m; i > 0; i--) 510 b[i - 1] = a[i - 1]; 511 for (j = m - 1; j < n - 1; j++) 512 b[n + m - j - 2] = a[j + 1]; 513 while (i < j) 514 *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--]; 515 *a = b[i]; 516} 517 518static void 519coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n) 520{ 521 int i, j; 522 lsrec_t *target, *current; 523 524 target = lock[0]; 525 526 for (i = 1; i < n; i++) { 527 current = lock[i]; 528 if (cmp(current, target) != 0) { 529 target = current; 530 continue; 531 } 532 current->ls_event = LS_MAX_EVENTS; 533 target->ls_count += current->ls_count; 534 target->ls_refcnt += current->ls_refcnt; 535 if (g_recsize < LS_TIME) 536 continue; 537 target->ls_time += current->ls_time; 538 if (g_recsize < LS_HIST) 539 continue; 540 for (j = 0; j < 64; j++) 541 target->ls_hist[j] += current->ls_hist[j]; 542 } 543} 544 545static void 546coalesce_symbol(uintptr_t *addrp) 547{ 548 uintptr_t symoff; 549 size_t symsize; 550 551 if (addr_to_sym(*addrp, &symoff, &symsize) != NULL && symoff < symsize) 552 *addrp -= symoff; 553} 554 555static void 556predicate_add(char **pred, char *what, char *cmp, uintptr_t value) 557{ 558 char *new; 559 int len, newlen; 560 561 if (what == NULL) 562 return; 563 564 if (*pred == NULL) { 565 *pred = malloc(1); 566 *pred[0] = '\0'; 567 } 568 569 len = strlen(*pred); 570 newlen = len + strlen(what) + 32 + strlen("( && )"); 571 new = malloc(newlen); 572 573 if (*pred[0] != '\0') { 574 if (cmp != NULL) { 575 (void) sprintf(new, "(%s) && (%s %s 0x%p)", 576 *pred, what, cmp, (void *)value); 577 } else { 578 (void) sprintf(new, "(%s) && (%s)", *pred, what); 579 } 580 } else { 581 if (cmp != NULL) { 582 (void) sprintf(new, "%s %s 0x%p", 583 what, cmp, (void *)value); 584 } else { 585 (void) sprintf(new, "%s", what); 586 } 587 } 588 589 free(*pred); 590 *pred = new; 591} 592 593static void 594predicate_destroy(char **pred) 595{ 596 free(*pred); 597 *pred = NULL; 598} 599 600static void 601filter_add(char **filt, char *what, uintptr_t base, uintptr_t size) 602{ 603 char buf[256], *c = buf, *new; 604 int len, newlen; 605 606 if (*filt == NULL) { 607 *filt = malloc(1); 608 *filt[0] = '\0'; 609 } 610 611#ifdef illumos 612 (void) sprintf(c, "%s(%s >= 0x%p && %s < 0x%p)", *filt[0] != '\0' ? 613 " || " : "", what, (void *)base, what, (void *)(base + size)); 614#else 615 (void) sprintf(c, "%s(%s >= %p && %s < %p)", *filt[0] != '\0' ? 616 " || " : "", what, (void *)base, what, (void *)(base + size)); 617#endif 618 619 newlen = (len = strlen(*filt) + 1) + strlen(c); 620 new = malloc(newlen); 621 bcopy(*filt, new, len); 622 (void) strcat(new, c); 623 free(*filt); 624 *filt = new; 625} 626 627static void 628filter_destroy(char **filt) 629{ 630 free(*filt); 631 *filt = NULL; 632} 633 634static void 635dprog_add(const char *fmt, ...) 636{ 637 va_list args; 638 int size, offs; 639 char c; 640 641 va_start(args, fmt); 642 size = vsnprintf(&c, 1, fmt, args) + 1; 643 va_end(args); 644 645 if (g_proglen == 0) { 646 offs = 0; 647 } else { 648 offs = g_proglen - 1; 649 } 650 651 g_proglen = offs + size; 652 653 if ((g_prog = realloc(g_prog, g_proglen)) == NULL) 654 fail(1, "failed to reallocate program text"); 655 656 va_start(args, fmt); 657 (void) vsnprintf(&g_prog[offs], size, fmt, args); 658 va_end(args); 659} 660 661/* 662 * This function may read like an open sewer, but keep in mind that programs 663 * that generate other programs are rarely pretty. If one has the unenviable 664 * task of maintaining or -- worse -- extending this code, use the -V option 665 * to examine the D program as generated by this function. 666 */ 667static void 668dprog_addevent(int event) 669{ 670 ls_event_info_t *info = &g_event_info[event]; 671 char *pred = NULL; 672 char stack[20]; 673 const char *arg0, *caller; 674 char *arg1 = "arg1"; 675 char buf[80]; 676 hrtime_t dur; 677 int depth; 678 679 if (info->ev_name[0] == '\0') 680 return; 681 682 if (info->ev_type == 'I') { 683 /* 684 * For interrupt events, arg0 (normally the lock pointer) is 685 * the CPU address plus the current pil, and arg1 (normally 686 * the number of nanoseconds) is the number of nanoseconds 687 * late -- and it's stored in arg2. 688 */ 689#ifdef illumos 690 arg0 = "(uintptr_t)curthread->t_cpu + \n" 691 "\t curthread->t_cpu->cpu_profile_pil"; 692#else 693 arg0 = "(uintptr_t)(curthread->td_oncpu << 16) + \n" 694 "\t 0x01000000 + curthread->td_pri_class"; 695#endif 696 caller = "(uintptr_t)arg0"; 697 arg1 = "arg2"; 698 } else { 699 arg0 = "(uintptr_t)arg0"; 700 caller = "caller"; 701 } 702 703 if (g_recsize > LS_HIST) { 704 for (depth = 0; g_recsize > LS_STACK(depth); depth++) 705 continue; 706 707 if (g_tracing) { 708 (void) sprintf(stack, "\tstack(%d);\n", depth); 709 } else { 710 (void) sprintf(stack, ", stack(%d)", depth); 711 } 712 } else { 713 (void) sprintf(stack, ""); 714 } 715 716 if (info->ev_acquire != NULL) { 717 /* 718 * If this is a hold event, we need to generate an additional 719 * clause for the acquire; the clause for the release will be 720 * generated with the aggregating statement, below. 721 */ 722 dprog_add("%s\n", info->ev_acquire); 723 predicate_add(&pred, info->ev_predicate, NULL, 0); 724 predicate_add(&pred, g_predicate, NULL, 0); 725 if (pred != NULL) 726 dprog_add("/%s/\n", pred); 727 728 dprog_add("{\n"); 729 (void) sprintf(buf, "self->ev%d[(uintptr_t)arg0]", event); 730 731 if (info->ev_type == 'H') { 732 dprog_add("\t%s = timestamp;\n", buf); 733 } else { 734 /* 735 * If this isn't a hold event, it's the recursive 736 * error event. For this, we simply bump the 737 * thread-local, per-lock count. 738 */ 739 dprog_add("\t%s++;\n", buf); 740 } 741 742 dprog_add("}\n\n"); 743 predicate_destroy(&pred); 744 pred = NULL; 745 746 if (info->ev_type == 'E') { 747 /* 748 * If this is the recursive lock error event, we need 749 * to generate an additional clause to decrement the 750 * thread-local, per-lock count. This assures that we 751 * only execute the aggregating clause if we have 752 * recursive entry. 753 */ 754 dprog_add("%s\n", info->ev_name); 755 dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf); 756 } 757 758 predicate_add(&pred, buf, NULL, 0); 759 760 if (info->ev_type == 'H') { 761 (void) sprintf(buf, "timestamp -\n\t " 762 "self->ev%d[(uintptr_t)arg0]", event); 763 } 764 765 arg1 = buf; 766 } else { 767 predicate_add(&pred, info->ev_predicate, NULL, 0); 768 if (info->ev_type != 'I') 769 predicate_add(&pred, g_predicate, NULL, 0); 770 else 771 predicate_add(&pred, g_ipredicate, NULL, 0); 772 } 773 774 if ((dur = g_min_duration[event]) != 0) 775 predicate_add(&pred, arg1, ">=", dur); 776 777 dprog_add("%s\n", info->ev_name); 778 779 if (pred != NULL) 780 dprog_add("/%s/\n", pred); 781 predicate_destroy(&pred); 782 783 dprog_add("{\n"); 784 785 if (g_tracing) { 786 dprog_add("\ttrace(%dULL);\n", event); 787 dprog_add("\ttrace(%s);\n", arg0); 788 dprog_add("\ttrace(%s);\n", caller); 789 dprog_add(stack); 790 } else { 791 /* 792 * The ordering here is important: when we process the 793 * aggregate, we count on the fact that @avg appears before 794 * @hist in program order to assure that @avg is assigned the 795 * first aggregation variable ID and @hist assigned the 796 * second; see the comment in process_aggregate() for details. 797 */ 798 dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n", 799 event, arg0, caller, stack, arg1); 800 801 if (g_recsize >= LS_HIST) { 802 dprog_add("\t@hist[%dULL, %s, %s%s] = quantize" 803 "(%s);\n", event, arg0, caller, stack, arg1); 804 } 805 } 806 807 if (info->ev_acquire != NULL) 808 dprog_add("\tself->ev%d[arg0] = 0;\n", event); 809 810 dprog_add("}\n\n"); 811} 812 813static void 814dprog_compile() 815{ 816 dtrace_prog_t *prog; 817 dtrace_proginfo_t info; 818 819 if (g_Vflag) { 820 (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n"); 821 (void) fputs(g_prog, stderr); 822 (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n"); 823 } 824 825 if ((prog = dtrace_program_strcompile(g_dtp, g_prog, 826 DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL) 827 dfail("failed to compile program"); 828 829 if (dtrace_program_exec(g_dtp, prog, &info) == -1) 830 dfail("failed to enable probes"); 831 832 if (dtrace_go(g_dtp) != 0) 833 dfail("couldn't start tracing"); 834} 835 836static void 837#ifdef illumos 838status_fire(void) 839#else 840status_fire(int i) 841#endif 842{} 843 844static void 845status_init(void) 846{ 847 dtrace_optval_t val, status, agg; 848 struct sigaction act; 849 struct itimerspec ts; 850 struct sigevent ev; 851 timer_t tid; 852 853 if (dtrace_getopt(g_dtp, "statusrate", &status) == -1) 854 dfail("failed to get 'statusrate'"); 855 856 if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1) 857 dfail("failed to get 'statusrate'"); 858 859 /* 860 * We would want to awaken at a rate that is the GCD of the statusrate 861 * and the aggrate -- but that seems a bit absurd. Instead, we'll 862 * simply awaken at a rate that is the more frequent of the two, which 863 * assures that we're never later than the interval implied by the 864 * more frequent rate. 865 */ 866 val = status < agg ? status : agg; 867 868 (void) sigemptyset(&act.sa_mask); 869 act.sa_flags = 0; 870 act.sa_handler = status_fire; 871 (void) sigaction(SIGUSR1, &act, NULL); 872 873 ev.sigev_notify = SIGEV_SIGNAL; 874 ev.sigev_signo = SIGUSR1; 875 876 if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1) 877 dfail("cannot create CLOCK_REALTIME timer"); 878 879 ts.it_value.tv_sec = val / NANOSEC; 880 ts.it_value.tv_nsec = val % NANOSEC; 881 ts.it_interval = ts.it_value; 882 883 if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1) 884 dfail("cannot set time on CLOCK_REALTIME timer"); 885} 886 887static void 888status_check(void) 889{ 890 if (!g_tracing && dtrace_aggregate_snap(g_dtp) != 0) 891 dfail("failed to snap aggregate"); 892 893 if (dtrace_status(g_dtp) == -1) 894 dfail("dtrace_status()"); 895} 896 897static void 898lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data) 899{ 900 bzero(lsrec, g_recsize); 901 lsrec->ls_count = 1; 902 903 if ((g_recsize > LS_HIST && nrecs < 4) || (nrecs < 3)) 904 fail(0, "truncated DTrace record"); 905 906 if (rec->dtrd_size != sizeof (uint64_t)) 907 fail(0, "bad event size in first record"); 908 909 /* LINTED - alignment */ 910 lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec->dtrd_offset)); 911 rec++; 912 913 if (rec->dtrd_size != sizeof (uintptr_t)) 914 fail(0, "bad lock address size in second record"); 915 916 /* LINTED - alignment */ 917 lsrec->ls_lock = *((uintptr_t *)(data + rec->dtrd_offset)); 918 rec++; 919 920 if (rec->dtrd_size != sizeof (uintptr_t)) 921 fail(0, "bad caller size in third record"); 922 923 /* LINTED - alignment */ 924 lsrec->ls_caller = *((uintptr_t *)(data + rec->dtrd_offset)); 925 rec++; 926 927 if (g_recsize > LS_HIST) { 928 int frames, i; 929 pc_t *stack; 930 931 frames = rec->dtrd_size / sizeof (pc_t); 932 /* LINTED - alignment */ 933 stack = (pc_t *)(data + rec->dtrd_offset); 934 935 for (i = 1; i < frames; i++) 936 lsrec->ls_stack[i - 1] = stack[i]; 937 } 938} 939 940/*ARGSUSED*/ 941static int 942count_aggregate(const dtrace_aggdata_t *agg, void *arg) 943{ 944 *((size_t *)arg) += 1; 945 946 return (DTRACE_AGGWALK_NEXT); 947} 948 949static int 950process_aggregate(const dtrace_aggdata_t *agg, void *arg) 951{ 952 const dtrace_aggdesc_t *aggdesc = agg->dtada_desc; 953 caddr_t data = agg->dtada_data; 954 lsdata_t *lsdata = arg; 955 lsrec_t *lsrec = lsdata->lsd_next; 956 const dtrace_recdesc_t *rec; 957 uint64_t *avg, *quantized; 958 int i, j; 959 960 assert(lsdata->lsd_count < g_nrecs); 961 962 /* 963 * Aggregation variable IDs are guaranteed to be generated in program 964 * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE 965 * plus one. As "avg" appears before "hist" in program order, we know 966 * that "avg" will be allocated the first aggregation variable ID, and 967 * "hist" will be allocated the second aggregation variable ID -- and 968 * we therefore use the aggregation variable ID to differentiate the 969 * cases. 970 */ 971 if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) { 972 /* 973 * If this is the histogram entry. We'll copy the quantized 974 * data into lc_hist, and jump over the rest. 975 */ 976 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 977 978 if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2) 979 fail(0, "bad variable ID in aggregation record"); 980 981 if (rec->dtrd_size != 982 DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t)) 983 fail(0, "bad quantize size in aggregation record"); 984 985 /* LINTED - alignment */ 986 quantized = (uint64_t *)(data + rec->dtrd_offset); 987 988 for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0; 989 i < DTRACE_QUANTIZE_NBUCKETS; i++, j++) 990 lsrec->ls_hist[j] = quantized[i]; 991 992 goto out; 993 } 994 995 lsrec_fill(lsrec, &aggdesc->dtagd_rec[1], 996 aggdesc->dtagd_nrecs - 1, data); 997 998 rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1]; 999 1000 if (rec->dtrd_size != 2 * sizeof (uint64_t)) 1001 fail(0, "bad avg size in aggregation record"); 1002 1003 /* LINTED - alignment */ 1004 avg = (uint64_t *)(data + rec->dtrd_offset); 1005 lsrec->ls_count = (uint32_t)avg[0]; 1006 lsrec->ls_time = (uintptr_t)avg[1]; 1007 1008 if (g_recsize >= LS_HIST) 1009 return (DTRACE_AGGWALK_NEXT); 1010 1011out: 1012 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1013 lsdata->lsd_count++; 1014 1015 return (DTRACE_AGGWALK_NEXT); 1016} 1017 1018static int 1019process_trace(const dtrace_probedata_t *pdata, void *arg) 1020{ 1021 lsdata_t *lsdata = arg; 1022 lsrec_t *lsrec = lsdata->lsd_next; 1023 dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc; 1024 caddr_t data = pdata->dtpda_data; 1025 1026 if (lsdata->lsd_count >= g_nrecs) 1027 return (DTRACE_CONSUME_NEXT); 1028 1029 lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data); 1030 1031 lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize); 1032 lsdata->lsd_count++; 1033 1034 return (DTRACE_CONSUME_NEXT); 1035} 1036 1037static int 1038process_data(FILE *out, char *data) 1039{ 1040 lsdata_t lsdata; 1041 1042 /* LINTED - alignment */ 1043 lsdata.lsd_next = (lsrec_t *)data; 1044 lsdata.lsd_count = 0; 1045 1046 if (g_tracing) { 1047 if (dtrace_consume(g_dtp, out, 1048 process_trace, NULL, &lsdata) != 0) 1049 dfail("failed to consume buffer"); 1050 1051 return (lsdata.lsd_count); 1052 } 1053 1054 if (dtrace_aggregate_walk_keyvarsorted(g_dtp, 1055 process_aggregate, &lsdata) != 0) 1056 dfail("failed to walk aggregate"); 1057 1058 return (lsdata.lsd_count); 1059} 1060 1061/*ARGSUSED*/ 1062static int 1063drophandler(const dtrace_dropdata_t *data, void *arg) 1064{ 1065 g_dropped++; 1066 (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg); 1067 return (DTRACE_HANDLE_OK); 1068} 1069 1070int 1071main(int argc, char **argv) 1072{ 1073 char *data_buf; 1074 lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf; 1075 FILE *out = stdout; 1076 int c; 1077 pid_t child; 1078 int status; 1079 int i, j; 1080 hrtime_t duration; 1081 char *addrp, *offp, *sizep, *evp, *lastp, *p; 1082 uintptr_t addr; 1083 size_t size, off; 1084 int events_specified = 0; 1085 int exec_errno = 0; 1086 uint32_t event; 1087 char *filt = NULL, *ifilt = NULL; 1088 static uint64_t ev_count[LS_MAX_EVENTS + 1]; 1089 static uint64_t ev_time[LS_MAX_EVENTS + 1]; 1090 dtrace_optval_t aggsize; 1091 char aggstr[10]; 1092 long ncpus; 1093 int dynvar = 0; 1094 int err; 1095 1096 if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) { 1097 fail(0, "cannot open dtrace library: %s", 1098 dtrace_errmsg(NULL, err)); 1099 } 1100 1101 if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1) 1102 dfail("couldn't establish drop handler"); 1103 1104 if (symtab_init() == -1) 1105 fail(1, "can't load kernel symbols"); 1106 1107 g_nrecs = DEFAULT_NRECS; 1108 1109 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) { 1110 switch (c) { 1111 case 'b': 1112 g_recsize = LS_BASIC; 1113 break; 1114 1115 case 't': 1116 g_recsize = LS_TIME; 1117 break; 1118 1119 case 'h': 1120 g_recsize = LS_HIST; 1121 break; 1122 1123 case 's': 1124 if (!isdigit(optarg[0])) 1125 usage(); 1126 g_stkdepth = atoi(optarg); 1127 if (g_stkdepth > LS_MAX_STACK_DEPTH) 1128 fail(0, "max stack depth is %d", 1129 LS_MAX_STACK_DEPTH); 1130 g_recsize = LS_STACK(g_stkdepth); 1131 break; 1132 1133 case 'n': 1134 if (!isdigit(optarg[0])) 1135 usage(); 1136 g_nrecs = atoi(optarg); 1137 break; 1138 1139 case 'd': 1140 if (!isdigit(optarg[0])) 1141 usage(); 1142 duration = atoll(optarg); 1143 1144 /* 1145 * XXX -- durations really should be per event 1146 * since the units are different, but it's hard 1147 * to express this nicely in the interface. 1148 * Not clear yet what the cleanest solution is. 1149 */ 1150 for (i = 0; i < LS_MAX_EVENTS; i++) 1151 if (g_event_info[i].ev_type != 'E') 1152 g_min_duration[i] = duration; 1153 1154 break; 1155 1156 case 'i': 1157 if (!isdigit(optarg[0])) 1158 usage(); 1159 i = atoi(optarg); 1160 if (i <= 0) 1161 usage(); 1162 if (i > MAX_HZ) 1163 fail(0, "max interrupt rate is %d Hz", MAX_HZ); 1164 1165 for (j = 0; j < LS_MAX_EVENTS; j++) 1166 if (strcmp(g_event_info[j].ev_desc, 1167 "Profiling interrupt") == 0) 1168 break; 1169 1170 (void) sprintf(g_event_info[j].ev_name, 1171 "profile:::profile-%d", i); 1172 break; 1173 1174 case 'l': 1175 case 'f': 1176 addrp = strtok(optarg, ","); 1177 sizep = strtok(NULL, ","); 1178 addrp = strtok(optarg, ",+"); 1179 offp = strtok(NULL, ","); 1180 1181 size = sizep ? strtoul(sizep, NULL, 0) : 1; 1182 off = offp ? strtoul(offp, NULL, 0) : 0; 1183 1184 if (addrp[0] == '0') { 1185 addr = strtoul(addrp, NULL, 16) + off; 1186 } else { 1187 addr = sym_to_addr(addrp) + off; 1188 if (sizep == NULL) 1189 size = sym_size(addrp) - off; 1190 if (addr - off == 0) 1191 fail(0, "symbol '%s' not found", addrp); 1192 if (size == 0) 1193 size = 1; 1194 } 1195 1196 1197 if (c == 'l') { 1198 filter_add(&filt, "arg0", addr, size); 1199 } else { 1200 filter_add(&filt, "caller", addr, size); 1201 filter_add(&ifilt, "arg0", addr, size); 1202 } 1203 break; 1204 1205 case 'e': 1206 evp = strtok_r(optarg, ",", &lastp); 1207 while (evp) { 1208 int ev1, ev2; 1209 char *evp2; 1210 1211 (void) strtok(evp, "-"); 1212 evp2 = strtok(NULL, "-"); 1213 ev1 = atoi(evp); 1214 ev2 = evp2 ? atoi(evp2) : ev1; 1215 if ((uint_t)ev1 >= LS_MAX_EVENTS || 1216 (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2) 1217 fail(0, "-e events out of range"); 1218 for (i = ev1; i <= ev2; i++) 1219 g_enabled[i] = 1; 1220 evp = strtok_r(NULL, ",", &lastp); 1221 } 1222 events_specified = 1; 1223 break; 1224 1225 case 'c': 1226 g_cflag = 1; 1227 break; 1228 1229 case 'k': 1230 g_kflag = 1; 1231 break; 1232 1233 case 'w': 1234 g_wflag = 1; 1235 break; 1236 1237 case 'W': 1238 g_Wflag = 1; 1239 break; 1240 1241 case 'g': 1242 g_gflag = 1; 1243 break; 1244 1245 case 'C': 1246 case 'E': 1247 case 'H': 1248 case 'I': 1249 for (i = 0; i < LS_MAX_EVENTS; i++) 1250 if (g_event_info[i].ev_type == c) 1251 g_enabled[i] = 1; 1252 events_specified = 1; 1253 break; 1254 1255 case 'A': 1256 for (i = 0; i < LS_MAX_EVENTS; i++) 1257 if (strchr("CH", g_event_info[i].ev_type)) 1258 g_enabled[i] = 1; 1259 events_specified = 1; 1260 break; 1261 1262 case 'T': 1263 g_tracing = 1; 1264 break; 1265 1266 case 'D': 1267 if (!isdigit(optarg[0])) 1268 usage(); 1269 g_topn = atoi(optarg); 1270 break; 1271 1272 case 'R': 1273 g_rates = 1; 1274 break; 1275 1276 case 'p': 1277 g_pflag = 1; 1278 break; 1279 1280 case 'P': 1281 g_Pflag = 1; 1282 break; 1283 1284 case 'o': 1285 if ((out = fopen(optarg, "w")) == NULL) 1286 fail(1, "error opening file"); 1287 break; 1288 1289 case 'V': 1290 g_Vflag = 1; 1291 break; 1292 1293 default: 1294 if (strchr(LOCKSTAT_OPTSTR, c) == NULL) 1295 usage(); 1296 } 1297 } 1298 1299 if (filt != NULL) { 1300 predicate_add(&g_predicate, filt, NULL, 0); 1301 filter_destroy(&filt); 1302 } 1303 1304 if (ifilt != NULL) { 1305 predicate_add(&g_ipredicate, ifilt, NULL, 0); 1306 filter_destroy(&ifilt); 1307 } 1308 1309 if (g_recsize == 0) { 1310 if (g_gflag) { 1311 g_stkdepth = LS_MAX_STACK_DEPTH; 1312 g_recsize = LS_STACK(g_stkdepth); 1313 } else { 1314 g_recsize = LS_TIME; 1315 } 1316 } 1317 1318 if (g_gflag && g_recsize <= LS_STACK(0)) 1319 fail(0, "'-g' requires at least '-s 1' data gathering"); 1320 1321 /* 1322 * Make sure the alignment is reasonable 1323 */ 1324 g_recsize = -(-g_recsize & -sizeof (uint64_t)); 1325 1326 for (i = 0; i < LS_MAX_EVENTS; i++) { 1327 /* 1328 * If no events were specified, enable -C. 1329 */ 1330 if (!events_specified && g_event_info[i].ev_type == 'C') 1331 g_enabled[i] = 1; 1332 } 1333 1334 for (i = 0; i < LS_MAX_EVENTS; i++) { 1335 if (!g_enabled[i]) 1336 continue; 1337 1338 if (g_event_info[i].ev_acquire != NULL) { 1339 /* 1340 * If we've enabled a hold event, we must explicitly 1341 * allocate dynamic variable space. 1342 */ 1343 dynvar = 1; 1344 } 1345 1346 dprog_addevent(i); 1347 } 1348 1349 /* 1350 * Make sure there are remaining arguments to specify a child command 1351 * to execute. 1352 */ 1353 if (argc <= optind) 1354 usage(); 1355 1356 if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1) 1357 dfail("couldn't determine number of online CPUs"); 1358 1359 /* 1360 * By default, we set our data buffer size to be the number of records 1361 * multiplied by the size of the record, doubled to account for some 1362 * DTrace slop and divided by the number of CPUs. We silently clamp 1363 * the aggregation size at both a minimum and a maximum to prevent 1364 * absurdly low or high values. 1365 */ 1366 if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE) 1367 aggsize = MIN_AGGSIZE; 1368 1369 if (aggsize > MAX_AGGSIZE) 1370 aggsize = MAX_AGGSIZE; 1371 1372 (void) sprintf(aggstr, "%lld", (long long)aggsize); 1373 1374 if (!g_tracing) { 1375 if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1) 1376 dfail("failed to set 'bufsize'"); 1377 1378 if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1) 1379 dfail("failed to set 'aggsize'"); 1380 1381 if (dynvar) { 1382 /* 1383 * If we're using dynamic variables, we set our 1384 * dynamic variable size to be one megabyte per CPU, 1385 * with a hard-limit of 32 megabytes. This may still 1386 * be too small in some cases, but it can be tuned 1387 * manually via -x if need be. 1388 */ 1389 (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32); 1390 1391 if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1) 1392 dfail("failed to set 'dynvarsize'"); 1393 } 1394 } else { 1395 if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1) 1396 dfail("failed to set 'bufsize'"); 1397 } 1398 1399 if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1) 1400 dfail("failed to set 'statusrate'"); 1401 1402 optind = 1; 1403 while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != GETOPT_EOF) { 1404 switch (c) { 1405 case 'x': 1406 if ((p = strchr(optarg, '=')) != NULL) 1407 *p++ = '\0'; 1408 1409 if (dtrace_setopt(g_dtp, optarg, p) != 0) 1410 dfail("failed to set -x %s", optarg); 1411 break; 1412 } 1413 } 1414 1415 argc -= optind; 1416 argv += optind; 1417 1418 dprog_compile(); 1419 status_init(); 1420 1421 g_elapsed = -gethrtime(); 1422 1423 /* 1424 * Spawn the specified command and wait for it to complete. 1425 */ 1426 child = fork(); 1427 if (child == -1) 1428 fail(1, "cannot fork"); 1429 if (child == 0) { 1430 (void) dtrace_close(g_dtp); 1431 (void) execvp(argv[0], &argv[0]); 1432 exec_errno = errno; 1433 exit(127); 1434 } 1435 1436#ifdef illumos 1437 while (waitpid(child, &status, WEXITED) != child) 1438#else 1439 while (waitpid(child, &status, 0) != child) 1440#endif 1441 status_check(); 1442 1443 g_elapsed += gethrtime(); 1444 1445 if (WIFEXITED(status)) { 1446 if (WEXITSTATUS(status) != 0) { 1447 if (exec_errno != 0) { 1448 errno = exec_errno; 1449 fail(1, "could not execute %s", argv[0]); 1450 } 1451 (void) fprintf(stderr, 1452 "lockstat: warning: %s exited with code %d\n", 1453 argv[0], WEXITSTATUS(status)); 1454 } 1455 } else { 1456 (void) fprintf(stderr, 1457 "lockstat: warning: %s died on signal %d\n", 1458 argv[0], WTERMSIG(status)); 1459 } 1460 1461 if (dtrace_stop(g_dtp) == -1) 1462 dfail("failed to stop dtrace"); 1463 1464 /* 1465 * Before we read out the results, we need to allocate our buffer. 1466 * If we're tracing, then we'll just use the precalculated size. If 1467 * we're not, then we'll take a snapshot of the aggregate, and walk 1468 * it to count the number of records. 1469 */ 1470 if (!g_tracing) { 1471 if (dtrace_aggregate_snap(g_dtp) != 0) 1472 dfail("failed to snap aggregate"); 1473 1474 g_nrecs = 0; 1475 1476 if (dtrace_aggregate_walk(g_dtp, 1477 count_aggregate, &g_nrecs) != 0) 1478 dfail("failed to walk aggregate"); 1479 } 1480 1481#ifdef illumos 1482 if ((data_buf = memalign(sizeof (uint64_t), 1483 (g_nrecs + 1) * g_recsize)) == NULL) 1484#else 1485 if (posix_memalign((void **)&data_buf, sizeof (uint64_t), 1486 (g_nrecs + 1) * g_recsize) ) 1487#endif 1488 fail(1, "Memory allocation failed"); 1489 1490 /* 1491 * Read out the DTrace data. 1492 */ 1493 g_nrecs_used = process_data(out, data_buf); 1494 1495 if (g_nrecs_used > g_nrecs || g_dropped) 1496 (void) fprintf(stderr, "lockstat: warning: " 1497 "ran out of data records (use -n for more)\n"); 1498 1499 /* LINTED - alignment */ 1500 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1501 /* LINTED - alignment */ 1502 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1503 ev_count[lsp->ls_event] += lsp->ls_count; 1504 ev_time[lsp->ls_event] += lsp->ls_time; 1505 } 1506 1507 /* 1508 * If -g was specified, convert stacks into individual records. 1509 */ 1510 if (g_gflag) { 1511 lsrec_t *newlsp, *oldlsp; 1512 1513#ifdef illumos 1514 newlsp = memalign(sizeof (uint64_t), 1515 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1516#else 1517 posix_memalign((void **)&newlsp, sizeof (uint64_t), 1518 g_nrecs_used * LS_TIME * (g_stkdepth + 1)); 1519#endif 1520 if (newlsp == NULL) 1521 fail(1, "Cannot allocate space for -g processing"); 1522 lsp = newlsp; 1523 /* LINTED - alignment */ 1524 for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1525 /* LINTED - alignment */ 1526 oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) { 1527 int fr; 1528 int caller_in_stack = 0; 1529 1530 if (oldlsp->ls_count == 0) 1531 continue; 1532 1533 for (fr = 0; fr < g_stkdepth; fr++) { 1534 if (oldlsp->ls_stack[fr] == 0) 1535 break; 1536 if (oldlsp->ls_stack[fr] == oldlsp->ls_caller) 1537 caller_in_stack = 1; 1538 bcopy(oldlsp, lsp, LS_TIME); 1539 lsp->ls_caller = oldlsp->ls_stack[fr]; 1540 /* LINTED - alignment */ 1541 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1542 } 1543 if (!caller_in_stack) { 1544 bcopy(oldlsp, lsp, LS_TIME); 1545 /* LINTED - alignment */ 1546 lsp = (lsrec_t *)((char *)lsp + LS_TIME); 1547 } 1548 } 1549 g_nrecs = g_nrecs_used = 1550 ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME; 1551 g_recsize = LS_TIME; 1552 g_stkdepth = 0; 1553 free(data_buf); 1554 data_buf = (char *)newlsp; 1555 } 1556 1557 if ((sort_buf = calloc(2 * (g_nrecs + 1), 1558 sizeof (void *))) == NULL) 1559 fail(1, "Sort buffer allocation failed"); 1560 merge_buf = sort_buf + (g_nrecs + 1); 1561 1562 /* 1563 * Build the sort buffer, discarding zero-count records along the way. 1564 */ 1565 /* LINTED - alignment */ 1566 for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++, 1567 /* LINTED - alignment */ 1568 lsp = (lsrec_t *)((char *)lsp + g_recsize)) { 1569 if (lsp->ls_count == 0) 1570 lsp->ls_event = LS_MAX_EVENTS; 1571 sort_buf[i] = lsp; 1572 } 1573 1574 if (g_nrecs_used == 0) 1575 exit(0); 1576 1577 /* 1578 * Add a sentinel after the last record 1579 */ 1580 sort_buf[i] = lsp; 1581 lsp->ls_event = LS_MAX_EVENTS; 1582 1583 if (g_tracing) { 1584 report_trace(out, sort_buf); 1585 return (0); 1586 } 1587 1588 /* 1589 * Application of -g may have resulted in multiple records 1590 * with the same signature; coalesce them. 1591 */ 1592 if (g_gflag) { 1593 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1594 coalesce(lockcmp, sort_buf, g_nrecs_used); 1595 } 1596 1597 /* 1598 * Coalesce locks within the same symbol if -c option specified. 1599 * Coalesce PCs within the same function if -k option specified. 1600 */ 1601 if (g_cflag || g_kflag) { 1602 for (i = 0; i < g_nrecs_used; i++) { 1603 int fr; 1604 lsp = sort_buf[i]; 1605 if (g_cflag) 1606 coalesce_symbol(&lsp->ls_lock); 1607 if (g_kflag) { 1608 for (fr = 0; fr < g_stkdepth; fr++) 1609 coalesce_symbol(&lsp->ls_stack[fr]); 1610 coalesce_symbol(&lsp->ls_caller); 1611 } 1612 } 1613 mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used); 1614 coalesce(lockcmp, sort_buf, g_nrecs_used); 1615 } 1616 1617 /* 1618 * Coalesce callers if -w option specified 1619 */ 1620 if (g_wflag) { 1621 mergesort(lock_and_count_cmp_anywhere, 1622 sort_buf, merge_buf, g_nrecs_used); 1623 coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used); 1624 } 1625 1626 /* 1627 * Coalesce locks if -W option specified 1628 */ 1629 if (g_Wflag) { 1630 mergesort(site_and_count_cmp_anylock, 1631 sort_buf, merge_buf, g_nrecs_used); 1632 coalesce(sitecmp_anylock, sort_buf, g_nrecs_used); 1633 } 1634 1635 /* 1636 * Sort data by contention count (ls_count) or total time (ls_time), 1637 * depending on g_Pflag. Override g_Pflag if time wasn't measured. 1638 */ 1639 if (g_recsize < LS_TIME) 1640 g_Pflag = 0; 1641 1642 if (g_Pflag) 1643 mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used); 1644 else 1645 mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used); 1646 1647 /* 1648 * Display data by event type 1649 */ 1650 first = &sort_buf[0]; 1651 while ((event = (*first)->ls_event) < LS_MAX_EVENTS) { 1652 current = first; 1653 while ((lsp = *current)->ls_event == event) 1654 current++; 1655 report_stats(out, first, current - first, ev_count[event], 1656 ev_time[event]); 1657 first = current; 1658 } 1659 1660 return (0); 1661} 1662 1663static char * 1664format_symbol(char *buf, uintptr_t addr, int show_size) 1665{ 1666 uintptr_t symoff; 1667 char *symname; 1668 size_t symsize; 1669 1670 symname = addr_to_sym(addr, &symoff, &symsize); 1671 1672 if (show_size && symoff == 0) 1673 (void) sprintf(buf, "%s[%ld]", symname, (long)symsize); 1674 else if (symoff == 0) 1675 (void) sprintf(buf, "%s", symname); 1676 else if (symoff < 16 && bcmp(symname, "cpu[", 4) == 0) /* CPU+PIL */ 1677#ifdef illumos 1678 (void) sprintf(buf, "%s+%ld", symname, (long)symoff); 1679#else 1680 (void) sprintf(buf, "%s+%s", symname, g_pri_class[(int)symoff]); 1681#endif 1682 else if (symoff <= symsize || (symoff < 256 && addr != symoff)) 1683 (void) sprintf(buf, "%s+0x%llx", symname, 1684 (unsigned long long)symoff); 1685 else 1686 (void) sprintf(buf, "0x%llx", (unsigned long long)addr); 1687 return (buf); 1688} 1689 1690static void 1691report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count, 1692 uint64_t total_time) 1693{ 1694 uint32_t event = sort_buf[0]->ls_event; 1695 lsrec_t *lsp; 1696 double ptotal = 0.0; 1697 double percent; 1698 int i, j, fr; 1699 int displayed; 1700 int first_bin, last_bin, max_bin_count, total_bin_count; 1701 int rectype; 1702 char buf[256]; 1703 char lhdr[80], chdr[80]; 1704 1705 rectype = g_recsize; 1706 1707 if (g_topn == 0) { 1708 (void) fprintf(out, "%20llu %s\n", 1709 g_rates == 0 ? total_count : 1710 ((unsigned long long)total_count * NANOSEC) / g_elapsed, 1711 g_event_info[event].ev_desc); 1712 return; 1713 } 1714 1715 (void) sprintf(lhdr, "%s%s", 1716 g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr); 1717 (void) sprintf(chdr, "%s%s", 1718 g_wflag ? "Hottest " : "", "Caller"); 1719 1720 if (!g_pflag) 1721 (void) fprintf(out, 1722 "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n", 1723 g_event_info[event].ev_desc, (double)total_count, 1724 (double)g_elapsed / NANOSEC, 1725 (double)total_count * NANOSEC / g_elapsed); 1726 1727 if (!g_pflag && rectype < LS_HIST) { 1728 (void) sprintf(buf, "%s", g_event_info[event].ev_units); 1729 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1730 g_rates ? "ops/s" : "Count", 1731 g_gflag ? "genr" : "indv", 1732 "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr); 1733 (void) fprintf(out, "---------------------------------" 1734 "----------------------------------------------\n"); 1735 } 1736 1737 displayed = 0; 1738 for (i = 0; i < nrecs; i++) { 1739 lsp = sort_buf[i]; 1740 1741 if (displayed++ >= g_topn) 1742 break; 1743 1744 if (g_pflag) { 1745 int j; 1746 1747 (void) fprintf(out, "%u %u", 1748 lsp->ls_event, lsp->ls_count); 1749 (void) fprintf(out, " %s", 1750 format_symbol(buf, lsp->ls_lock, g_cflag)); 1751 (void) fprintf(out, " %s", 1752 format_symbol(buf, lsp->ls_caller, 0)); 1753 (void) fprintf(out, " %f", 1754 (double)lsp->ls_refcnt / lsp->ls_count); 1755 if (rectype >= LS_TIME) 1756 (void) fprintf(out, " %llu", 1757 (unsigned long long)lsp->ls_time); 1758 if (rectype >= LS_HIST) { 1759 for (j = 0; j < 64; j++) 1760 (void) fprintf(out, " %u", 1761 lsp->ls_hist[j]); 1762 } 1763 for (j = 0; j < LS_MAX_STACK_DEPTH; j++) { 1764 if (rectype <= LS_STACK(j) || 1765 lsp->ls_stack[j] == 0) 1766 break; 1767 (void) fprintf(out, " %s", 1768 format_symbol(buf, lsp->ls_stack[j], 0)); 1769 } 1770 (void) fprintf(out, "\n"); 1771 continue; 1772 } 1773 1774 if (rectype >= LS_HIST) { 1775 (void) fprintf(out, "---------------------------------" 1776 "----------------------------------------------\n"); 1777 (void) sprintf(buf, "%s", 1778 g_event_info[event].ev_units); 1779 (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n", 1780 g_rates ? "ops/s" : "Count", 1781 g_gflag ? "genr" : "indv", 1782 "cuml", "rcnt", buf, lhdr, chdr); 1783 } 1784 1785 if (g_Pflag && total_time != 0) 1786 percent = (lsp->ls_time * 100.00) / total_time; 1787 else 1788 percent = (lsp->ls_count * 100.00) / total_count; 1789 1790 ptotal += percent; 1791 1792 if (rectype >= LS_TIME) 1793 (void) sprintf(buf, "%llu", 1794 (unsigned long long)(lsp->ls_time / lsp->ls_count)); 1795 else 1796 buf[0] = '\0'; 1797 1798 (void) fprintf(out, "%5llu ", 1799 g_rates == 0 ? lsp->ls_count : 1800 ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed); 1801 1802 (void) fprintf(out, "%3.0f%% ", percent); 1803 1804 if (g_gflag) 1805 (void) fprintf(out, "---- "); 1806 else 1807 (void) fprintf(out, "%3.0f%% ", ptotal); 1808 1809 (void) fprintf(out, "%4.2f %8s ", 1810 (double)lsp->ls_refcnt / lsp->ls_count, buf); 1811 1812 (void) fprintf(out, "%-22s ", 1813 format_symbol(buf, lsp->ls_lock, g_cflag)); 1814 1815 (void) fprintf(out, "%-24s\n", 1816 format_symbol(buf, lsp->ls_caller, 0)); 1817 1818 if (rectype < LS_HIST) 1819 continue; 1820 1821 (void) fprintf(out, "\n"); 1822 (void) fprintf(out, "%10s %31s %-9s %-24s\n", 1823 g_event_info[event].ev_units, 1824 "------ Time Distribution ------", 1825 g_rates ? "ops/s" : "count", 1826 rectype > LS_STACK(0) ? "Stack" : ""); 1827 1828 first_bin = 0; 1829 while (lsp->ls_hist[first_bin] == 0) 1830 first_bin++; 1831 1832 last_bin = 63; 1833 while (lsp->ls_hist[last_bin] == 0) 1834 last_bin--; 1835 1836 max_bin_count = 0; 1837 total_bin_count = 0; 1838 for (j = first_bin; j <= last_bin; j++) { 1839 total_bin_count += lsp->ls_hist[j]; 1840 if (lsp->ls_hist[j] > max_bin_count) 1841 max_bin_count = lsp->ls_hist[j]; 1842 } 1843 1844 /* 1845 * If we went a few frames below the caller, ignore them 1846 */ 1847 for (fr = 3; fr > 0; fr--) 1848 if (lsp->ls_stack[fr] == lsp->ls_caller) 1849 break; 1850 1851 for (j = first_bin; j <= last_bin; j++) { 1852 uint_t depth = (lsp->ls_hist[j] * 30) / total_bin_count; 1853 (void) fprintf(out, "%10llu |%s%s %-9u ", 1854 1ULL << j, 1855 "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth, 1856 " " + depth, 1857 g_rates == 0 ? lsp->ls_hist[j] : 1858 (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) / 1859 g_elapsed)); 1860 if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) { 1861 (void) fprintf(out, "\n"); 1862 continue; 1863 } 1864 (void) fprintf(out, "%-24s\n", 1865 format_symbol(buf, lsp->ls_stack[fr], 0)); 1866 fr++; 1867 } 1868 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1869 (void) fprintf(out, "%15s %-36s %-24s\n", "", "", 1870 format_symbol(buf, lsp->ls_stack[fr], 0)); 1871 fr++; 1872 } 1873 } 1874 1875 if (!g_pflag) 1876 (void) fprintf(out, "---------------------------------" 1877 "----------------------------------------------\n"); 1878 1879 (void) fflush(out); 1880} 1881 1882static void 1883report_trace(FILE *out, lsrec_t **sort_buf) 1884{ 1885 lsrec_t *lsp; 1886 int i, fr; 1887 int rectype; 1888 char buf[256], buf2[256]; 1889 1890 rectype = g_recsize; 1891 1892 if (!g_pflag) { 1893 (void) fprintf(out, "%5s %7s %11s %-24s %-24s\n", 1894 "Event", "Time", "Owner", "Lock", "Caller"); 1895 (void) fprintf(out, "---------------------------------" 1896 "----------------------------------------------\n"); 1897 } 1898 1899 for (i = 0; i < g_nrecs_used; i++) { 1900 1901 lsp = sort_buf[i]; 1902 1903 if (lsp->ls_event >= LS_MAX_EVENTS || lsp->ls_count == 0) 1904 continue; 1905 1906 (void) fprintf(out, "%2d %10llu %11p %-24s %-24s\n", 1907 lsp->ls_event, (unsigned long long)lsp->ls_time, 1908 (void *)lsp->ls_next, 1909 format_symbol(buf, lsp->ls_lock, 0), 1910 format_symbol(buf2, lsp->ls_caller, 0)); 1911 1912 if (rectype <= LS_STACK(0)) 1913 continue; 1914 1915 /* 1916 * If we went a few frames below the caller, ignore them 1917 */ 1918 for (fr = 3; fr > 0; fr--) 1919 if (lsp->ls_stack[fr] == lsp->ls_caller) 1920 break; 1921 1922 while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) { 1923 (void) fprintf(out, "%53s %-24s\n", "", 1924 format_symbol(buf, lsp->ls_stack[fr], 0)); 1925 fr++; 1926 } 1927 (void) fprintf(out, "\n"); 1928 } 1929 1930 (void) fflush(out); 1931} 1932