1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2008-2022 NetApp, Inc.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 *    notice, this list of conditions and the following disclaimer in the
13 *    documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
19 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27#include <sys/param.h>
28#include <sys/systm.h>
29#include <sys/boottrace.h>
30#include <sys/kernel.h>
31#include <sys/malloc.h>
32#include <sys/pcpu.h>
33#include <sys/proc.h>
34#include <sys/resourcevar.h>
35#include <sys/sbuf.h>
36#include <sys/syscallsubr.h>
37#include <sys/sysctl.h>
38#include <sys/time.h>
39
40#include <machine/stdarg.h>
41
42#define	dprintf(fmt, ...)						\
43	do {								\
44		if (dotrace_debugging)					\
45			printf(fmt, ##__VA_ARGS__);			\
46	} while (0);
47
48static MALLOC_DEFINE(M_BOOTTRACE, "boottrace", "memory for boot tracing");
49
50#define	BT_TABLE_DEFSIZE	3000
51#define	BT_TABLE_RUNSIZE	2000
52#define	BT_TABLE_SHTSIZE	1000
53#define	BT_TABLE_MINSIZE	500
54
55/*
56 * Boot-time & shutdown-time event.
57 */
58struct bt_event {
59	uint64_t tsc;			/* Timestamp */
60	uint64_t tick;			/* Kernel tick */
61	uint64_t cputime;		/* Microseconds of process CPU time */
62	uint32_t cpuid;			/* CPU on which the event ran */
63	uint32_t inblock;		/* # of blocks in */
64	uint32_t oublock;		/* # of blocks out */
65	pid_t pid;			/* Current PID */
66	char name[BT_EVENT_NAMELEN];	/* Event name */
67	char tdname[BT_EVENT_TDNAMELEN]; /* Thread name */
68};
69
70struct bt_table {
71	uint32_t size;			/* Trace table size */
72	uint32_t curr;			/* Trace entry to use */
73	uint32_t wrap;			/* Wrap-around, instead of dropping */
74	uint32_t drops_early;		/* Trace entries dropped before init */
75	uint32_t drops_full;		/* Trace entries dropped after full */
76	struct bt_event *table;
77};
78
79/* Boot-time tracing */
80static struct bt_table bt;
81
82/* Run-time tracing */
83static struct bt_table rt;
84
85/* Shutdown-time tracing */
86static struct bt_table st;
87
88/* Set when system boot is complete, and we've switched to runtime tracing. */
89static bool bootdone;
90
91/* Set when system shutdown has started. */
92static bool shutdown;
93
94/* Turn on dotrace() debug logging to console. */
95static bool dotrace_debugging;
96TUNABLE_BOOL("debug.boottrace.dotrace_debugging", &dotrace_debugging);
97
98/* Trace kernel events */
99static bool dotrace_kernel = true;
100TUNABLE_BOOL("kern.boottrace.dotrace_kernel", &dotrace_kernel);
101
102/* Trace userspace events */
103static bool dotrace_user = true;
104TUNABLE_BOOL("kern.boottrace.dotrace_user", &dotrace_user);
105
106static int sysctl_log(SYSCTL_HANDLER_ARGS);
107static int sysctl_boottrace(SYSCTL_HANDLER_ARGS);
108static int sysctl_runtrace(SYSCTL_HANDLER_ARGS);
109static int sysctl_shuttrace(SYSCTL_HANDLER_ARGS);
110static int sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS);
111
112SYSCTL_NODE(_kern, OID_AUTO, boottrace, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
113    "boottrace statistics");
114
115SYSCTL_PROC(_kern_boottrace, OID_AUTO, log,
116    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
117    0, 0, sysctl_log, "A",
118    "Print a log of the boottrace trace data");
119SYSCTL_PROC(_kern_boottrace, OID_AUTO, boottrace,
120    CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
121    0, 0, sysctl_boottrace, "A",
122    "Capture a boot-time trace event");
123SYSCTL_PROC(_kern_boottrace, OID_AUTO, runtrace,
124    CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
125    0, 0, sysctl_runtrace, "A",
126    "Capture a run-time trace event");
127SYSCTL_PROC(_kern_boottrace, OID_AUTO, shuttrace,
128    CTLTYPE_STRING | CTLFLAG_WR | CTLFLAG_MPSAFE,
129    0, 0, sysctl_shuttrace, "A",
130    "Capture a shutdown-time trace event");
131SYSCTL_PROC(_kern_boottrace, OID_AUTO, reset,
132    CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_MPSAFE,
133    0, 0, sysctl_boottrace_reset, "A",
134    "Reset run-time tracing table");
135
136/*
137 * Global enable.
138 */
139bool __read_mostly boottrace_enabled = false;
140SYSCTL_BOOL(_kern_boottrace, OID_AUTO, enabled, CTLFLAG_RDTUN,
141    &boottrace_enabled, 0,
142    "Boot-time and shutdown-time tracing enabled");
143
144/*
145 * Enable dumping of the shutdown trace entries to console.
146 */
147bool shutdown_trace = false;
148SYSCTL_BOOL(_kern_boottrace, OID_AUTO, shutdown_trace, CTLFLAG_RWTUN,
149   &shutdown_trace, 0,
150   "Enable kernel shutdown tracing to the console");
151
152/*
153 * Set the delta threshold (ms) below which events are ignored, used in
154 * determining what to dump to the console.
155 */
156static int shutdown_trace_threshold;
157SYSCTL_INT(_kern_boottrace, OID_AUTO, shutdown_trace_threshold, CTLFLAG_RWTUN,
158   &shutdown_trace_threshold, 0,
159   "Tracing threshold (ms) below which tracing is ignored");
160
161SYSCTL_UINT(_kern_boottrace, OID_AUTO, table_size,
162    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &bt.size, 0,
163    "Boot-time tracing table size");
164
165/*
166 * Dump a trace to buffer or if buffer is NULL to console.
167 *
168 * Non-zero delta_threshold selectively prints entries based on delta
169 * with current and previous entry. Otherwise, delta_threshold of 0
170 * prints every trace entry and delta.
171 *
172 * Output something like this:
173 *
174 * CPU      msecs      delta process                  event
175 *  11 1228262715          0 init                     shutdown pre sync begin
176 *   3 1228265622       2907 init                     shutdown pre sync complete
177 *   3 1228265623          0 init                     shutdown turned swap off
178 *  18 1228266466        843 init                     shutdown unmounted all filesystems
179 *
180 * How to interpret:
181 *
182 * delta column represents the time in milliseconds between this event and the previous.
183 * Usually that means you can take the previous event, current event, match them
184 * up in the code, and whatever lies between is the culprit taking time.
185 *
186 * For example, above: Pre-Sync is taking 2907ms, and something between swap and unmount
187 * filesystems is taking 843 milliseconds.
188 *
189 * An event with a delta of 0 are 'landmark' events that simply exist in the output
190 * for the developer to know where the time measurement begins. The 0 is an arbitrary
191 * number that can effectively be ignored.
192 */
193#define BTC_DELTA_PRINT(bte, msecs, delta) do {				\
194	if (sbp) {							\
195		sbuf_printf(sbp, fmt, (bte)->cpuid, msecs, delta,	\
196		    (bte)->tdname, (bte)->name, (bte)->pid,		\
197		    (bte)->cputime / 1000000,				\
198		    ((bte)->cputime % 1000000) / 10000,			\
199		    (bte)->inblock, (bte)->oublock);			\
200	} else {							\
201		printf(fmt, (bte)->cpuid, msecs, delta,			\
202		    (bte)->tdname, (bte)->name, (bte)->pid,		\
203		    (bte)->cputime / 1000000,				\
204		    ((bte)->cputime % 1000000) / 10000,			\
205		    (bte)->inblock, (bte)->oublock);			\
206	}								\
207} while (0)
208
209/*
210 * Print the trace entries to the message buffer, or to an sbuf, if provided.
211 *
212 * Entries with a difference less than dthres will not be printed.
213 */
214static void
215boottrace_display(struct sbuf *sbp, struct bt_table *btp, uint64_t dthres)
216{
217	struct bt_event *evtp;
218	struct bt_event *last_evtp;
219	uint64_t msecs;
220	uint64_t first_msecs;
221	uint64_t last_msecs;
222	uint64_t dmsecs;
223	uint64_t last_dmsecs;
224	uint64_t total_dmsecs;
225	uint32_t i;
226	uint32_t curr;
227	const char *fmt     = "%3u %10llu %10llu %-24s %-40s %5d %4d.%02d %5u %5u\n";
228	const char *hdr_fmt = "\n\n%3s %10s %10s %-24s %-40s %5s %6s %5s %5s\n";
229	bool printed;
230	bool last_printed;
231
232	/* Print the header */
233	if (sbp != NULL)
234		sbuf_printf(sbp, hdr_fmt,
235		    "CPU", "msecs", "delta", "process",
236		    "event", "PID", "CPUtime", "IBlks", "OBlks");
237	else
238		printf(hdr_fmt,
239		    "CPU", "msecs", "delta", "process",
240		    "event", "PID", "CPUtime", "IBlks", "OBlks");
241
242	first_msecs = 0;
243	last_evtp = NULL;
244	last_msecs = 0;
245	last_dmsecs = 0;
246	last_printed = false;
247	i = curr = btp->curr;
248
249	do {
250		evtp = &btp->table[i];
251		if (evtp->tsc == 0)
252			goto next;
253
254		msecs = cputick2usec(evtp->tick) / 1000;
255		dmsecs = (last_msecs != 0 && msecs > last_msecs) ?
256		    msecs - last_msecs : 0;
257		printed = false;
258
259		/*
260		 * If a threshold is defined, start filtering events by the
261		 * delta of msecs.
262		 */
263		if (dthres != 0 && (dmsecs > dthres)) {
264			/*
265			 * Print the previous entry as a landmark, even if it
266			 * falls below the threshold.
267			 */
268			if (last_evtp != NULL && !last_printed)
269				BTC_DELTA_PRINT(last_evtp, last_msecs,
270				    last_dmsecs);
271			BTC_DELTA_PRINT(evtp, msecs, dmsecs);
272			printed = true;
273		} else if (dthres == 0) {
274			BTC_DELTA_PRINT(evtp, msecs, dmsecs);
275			printed = true;
276		}
277		if (first_msecs == 0 || msecs < first_msecs)
278			first_msecs = msecs;
279		last_evtp = evtp;
280		last_msecs = msecs;
281		last_dmsecs = dmsecs;
282		last_printed = printed;
283		maybe_yield();
284next:
285		i = (i + 1) % btp->size;
286	} while (i != curr);
287
288	total_dmsecs = last_msecs > first_msecs ?
289	    (last_msecs - first_msecs) : 0;
290	if (sbp != NULL)
291		sbuf_printf(sbp, "Total measured time: %ju msecs\n",
292		    (uintmax_t)total_dmsecs);
293	else
294		printf("Total measured time: %ju msecs\n",
295		    (uintmax_t)total_dmsecs);
296}
297
298/*
299 * Dump trace table entries to the console, given a delta threshold.
300 */
301void
302boottrace_dump_console(void)
303{
304	if (!boottrace_enabled)
305		return;
306
307	if (shutdown || rebooting || KERNEL_PANICKED()) {
308		boottrace_display(NULL, &st, shutdown_trace_threshold);
309	} else {
310		boottrace_display(NULL, &bt, 0);
311		boottrace_display(NULL, &rt, 0);
312	}
313}
314
315/*
316 * Records a new tracing event to the specified table.
317 *
318 * We don't use a lock because we want this to be callable from interrupt
319 * context.
320 */
321static int
322dotrace(struct bt_table *btp, const char *eventname, const char *tdname)
323{
324	uint32_t idx, nxt;
325	struct rusage usage;
326
327	MPASS(boottrace_enabled);
328	if (tdname == NULL)
329		tdname = (curproc->p_flag & P_SYSTEM) ?
330		    curthread->td_name : curproc->p_comm;
331
332	dprintf("dotrace[");
333	dprintf("cpu=%u, pid=%d, tsc=%ju, tick=%d, td='%s', event='%s'",
334	    PCPU_GET(cpuid), curthread->td_proc->p_pid,
335	    (uintmax_t)get_cyclecount(), ticks, tdname, eventname);
336	if (btp->table == NULL) {
337		btp->drops_early++;
338		dprintf(", return=ENOSPC_1]\n");
339		return (ENOSPC);
340	}
341
342	/* Claim a slot in the table. */
343	do {
344		idx = btp->curr;
345		nxt = (idx + 1) % btp->size;
346		if (nxt == 0 && btp->wrap == 0) {
347			btp->drops_full++;
348			dprintf(", return=ENOSPC_2]\n");
349			return (ENOSPC);
350		}
351	} while (!atomic_cmpset_int(&btp->curr, idx, nxt));
352
353	btp->table[idx].cpuid = PCPU_GET(cpuid);
354	btp->table[idx].tsc = get_cyclecount(),
355	btp->table[idx].tick = cpu_ticks();
356	btp->table[idx].pid = curthread->td_proc->p_pid;
357
358	/*
359	 * Get the resource usage for the process. Don't attempt this for the
360	 * kernel proc0 or for critical section activities.
361	 */
362	if ((curthread->td_proc == &proc0) || (curthread->td_critnest != 0)) {
363		btp->table[idx].cputime = 0;
364		btp->table[idx].inblock = 0;
365		btp->table[idx].oublock = 0;
366	} else {
367		kern_getrusage(curthread, RUSAGE_CHILDREN, &usage);
368		btp->table[idx].cputime =
369		    (uint64_t)(usage.ru_utime.tv_sec * 1000000 +
370		    usage.ru_utime.tv_usec + usage.ru_stime.tv_sec * 1000000 +
371		    usage.ru_stime.tv_usec);
372		btp->table[idx].inblock = (uint32_t)usage.ru_inblock;
373		btp->table[idx].oublock = (uint32_t)usage.ru_oublock;
374	}
375	strlcpy(btp->table[idx].name, eventname, BT_EVENT_NAMELEN);
376	strlcpy(btp->table[idx].tdname, tdname, BT_EVENT_TDNAMELEN);
377
378	dprintf(", return=0]\n");
379	return (0);
380}
381
382/*
383 * Log various boot-time events, with the trace message encoded using
384 * printf-like arguments.
385 */
386int
387boottrace(const char *tdname, const char *fmt, ...)
388{
389	char eventname[BT_EVENT_NAMELEN];
390	struct bt_table *btp;
391	va_list ap;
392
393	if (!dotrace_kernel)
394		 return (ENXIO);
395
396	va_start(ap, fmt);
397	vsnprintf(eventname, sizeof(eventname), fmt, ap);
398	va_end(ap);
399
400	btp = &bt;
401	if (bootdone)
402		btp = &rt;
403	if (shutdown || rebooting || KERNEL_PANICKED())
404		btp = &st;
405
406	return (dotrace(btp, eventname, tdname));
407}
408
409/*
410 * Log a run-time event & switch over to run-time tracing mode.
411 */
412static int
413runtrace(const char *eventname, const char *tdname)
414{
415
416	bootdone = true;
417	return (dotrace(&rt, eventname, tdname));
418}
419
420/*
421 * Parse a boottrace message from userspace.
422 *
423 * The input from may contain a ':' to denote tdname. If not, tdname is
424 * inferred from the process' name.
425 *
426 * e.g. reboot(8):SIGINT to init(8)
427 */
428static void
429boottrace_parse_message(char *message, char **eventname, char **tdname)
430{
431	char *delim;
432
433	delim = strchr(message, ':');
434	if (delim != NULL) {
435		*delim = '\0';
436		*tdname = message;
437		*eventname = ++delim;
438	} else {
439		*tdname = curproc->p_comm;
440		*eventname = message;
441	}
442}
443
444static int
445_boottrace_sysctl(struct bt_table *btp, struct sysctl_oid *oidp,
446    struct sysctl_req *req)
447{
448	char message[BT_MSGLEN];
449	char *eventname, *tdname;
450	int error;
451
452	if (!dotrace_user)
453	       return (ENXIO);
454
455	message[0] = '\0';
456	error = sysctl_handle_string(oidp, message, sizeof(message), req);
457	if (error)
458		return (error);
459
460	boottrace_parse_message(message, &eventname, &tdname);
461	error = dotrace(btp, eventname, tdname);
462	if (error == ENOSPC) {
463		/* Ignore table full error. */
464		error = 0;
465	}
466	return (error);
467}
468
469static int
470sysctl_log(SYSCTL_HANDLER_ARGS)
471{
472	struct sbuf *sbuf;
473	int error;
474
475	if (!boottrace_enabled || req->newptr != NULL)
476		return (0);
477
478	sbuf = sbuf_new(NULL, NULL, 0, SBUF_AUTOEXTEND);
479	boottrace_display(sbuf, &bt, 0);
480	boottrace_display(sbuf, &rt, 0);
481	sbuf_finish(sbuf);
482	error = SYSCTL_OUT(req, sbuf_data(sbuf), sbuf_len(sbuf));
483	sbuf_delete(sbuf);
484
485	return (error);
486}
487
488static int
489sysctl_boottrace(SYSCTL_HANDLER_ARGS)
490{
491
492	if (!boottrace_enabled)
493		return (0);
494
495	/* No output */
496	if (req->newptr == NULL)
497		return (0);
498
499	/* Trace to rt if we have reached multi-user. */
500	return (_boottrace_sysctl(bootdone ? &rt : &bt, oidp, req));
501}
502
503/*
504 * Log a run-time event and switch over to run-time tracing mode.
505 */
506static int
507sysctl_runtrace(SYSCTL_HANDLER_ARGS)
508{
509
510	if (!boottrace_enabled)
511		return (0);
512
513	/* No output */
514	if (req->newptr == NULL)
515		return (0);
516
517	bootdone = true;
518	return (_boottrace_sysctl(&rt, oidp, req));
519}
520
521/*
522 * Log a shutdown-time event and switch over to shutdown tracing mode.
523 */
524static int
525sysctl_shuttrace(SYSCTL_HANDLER_ARGS)
526{
527
528	if (!boottrace_enabled)
529		return (0);
530
531	/* No output */
532	if (req->newptr == NULL)
533		return (0);
534
535	shutdown = true;
536	return (_boottrace_sysctl(&st, oidp, req));
537}
538
539/*
540 * Reset the runtime tracing buffer.
541 */
542void
543boottrace_reset(const char *actor)
544{
545	char tmpbuf[64];
546
547	snprintf(tmpbuf, sizeof(tmpbuf), "reset: %s", actor);
548	runtrace(tmpbuf, NULL);
549}
550
551/*
552 * Note that a resize implies a reset, i.e., the index is reset to 0.
553 * We never shrink the array; we can only increase its size.
554 */
555int
556boottrace_resize(u_int newsize)
557{
558
559	if (newsize <= rt.size) {
560		return (EINVAL);
561	}
562	rt.table = realloc(rt.table, newsize * sizeof(struct bt_event),
563	    M_BOOTTRACE, M_WAITOK | M_ZERO);
564	if (rt.table == NULL)
565		return (ENOMEM);
566
567	rt.size = newsize;
568	boottrace_reset("boottrace_resize");
569	return (0);
570}
571
572static int
573sysctl_boottrace_reset(SYSCTL_HANDLER_ARGS)
574{
575
576	if (!boottrace_enabled)
577		return (0);
578
579	if (req->newptr != NULL)
580		boottrace_reset("sysctl_boottrace_reset");
581
582	return (0);
583}
584
585static void
586boottrace_init(void)
587{
588
589	if (!boottrace_enabled)
590		return;
591
592	/* Boottime trace table */
593	bt.size = BT_TABLE_DEFSIZE;
594	TUNABLE_INT_FETCH("kern.boottrace.table_size", &bt.size);
595	bt.size = max(bt.size, BT_TABLE_MINSIZE);
596	bt.table = malloc(bt.size * sizeof(struct bt_event), M_BOOTTRACE,
597	    M_WAITOK | M_ZERO);
598
599	/* Stick in an initial entry. */
600	bt.table[0].cpuid = PCPU_GET(cpuid);
601	strlcpy(bt.table[0].tdname, "boottime", BT_EVENT_TDNAMELEN);
602	strlcpy(bt.table[0].name, "initial event", BT_EVENT_NAMELEN);
603	bt.curr = 1;
604
605	/* Run-time trace table (may wrap-around). */
606	rt.wrap = 1;
607	rt.size = BT_TABLE_RUNSIZE;
608	rt.table = malloc(rt.size * sizeof(struct bt_event), M_BOOTTRACE,
609	    M_WAITOK | M_ZERO);
610
611	/* Shutdown trace table */
612	st.size = BT_TABLE_SHTSIZE;
613	st.table = malloc(st.size * sizeof(struct bt_event), M_BOOTTRACE,
614	    M_WAITOK | M_ZERO);
615}
616SYSINIT(boottrace, SI_SUB_CPU, SI_ORDER_ANY, boottrace_init, NULL);
617