/* subr_devstat.c revision 260385 */
1/*-
2 * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 * 3. The name of the author may not be used to endorse or promote products
14 *    derived from this software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/cdefs.h>
30__FBSDID("$FreeBSD: stable/10/sys/kern/subr_devstat.c 260385 2014-01-07 01:32:23Z scottl $");
31
32#include "opt_kdtrace.h"
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/systm.h>
37#include <sys/bio.h>
38#include <sys/devicestat.h>
39#include <sys/sysctl.h>
40#include <sys/malloc.h>
41#include <sys/lock.h>
42#include <sys/mutex.h>
43#include <sys/conf.h>
44#include <vm/vm.h>
45#include <vm/pmap.h>
46
47#include <machine/atomic.h>
48
#ifdef KDTRACE_HOOKS
#include <sys/dtrace_bsd.h>

dtrace_io_start_probe_func_t dtrace_io_start_probe;
dtrace_io_done_probe_func_t dtrace_io_done_probe;
dtrace_io_wait_start_probe_func_t dtrace_io_wait_start_probe;
dtrace_io_wait_done_probe_func_t dtrace_io_wait_done_probe;

uint32_t	dtio_start_id;
uint32_t	dtio_done_id;
uint32_t	dtio_wait_start_id;
uint32_t	dtio_wait_done_id;

/*
 * Fire the corresponding DTrace I/O provider probe if one is attached.
 * Each macro expects a "struct devstat *ds" (and, for the BIO variants,
 * a "struct bio *bp") to be in scope at the call site.
 *
 * The bodies are wrapped in do { } while (0) so that each invocation is
 * a single statement; a bare "if" here would bind to an "else" at an
 * unbraced call site (dangling-else hazard) and the old trailing
 * semicolons produced empty statements after expansion.
 */
#define DTRACE_DEVSTAT_START() do {					\
	if (dtrace_io_start_probe != NULL)				\
		(*dtrace_io_start_probe)(dtio_start_id, NULL, ds);	\
} while (0)

#define DTRACE_DEVSTAT_BIO_START() do {					\
	if (dtrace_io_start_probe != NULL)				\
		(*dtrace_io_start_probe)(dtio_start_id, bp, ds);	\
} while (0)

#define DTRACE_DEVSTAT_DONE() do {					\
	if (dtrace_io_done_probe != NULL)				\
		(*dtrace_io_done_probe)(dtio_done_id, NULL, ds);	\
} while (0)

#define DTRACE_DEVSTAT_BIO_DONE() do {					\
	if (dtrace_io_done_probe != NULL)				\
		(*dtrace_io_done_probe)(dtio_done_id, bp, ds);		\
} while (0)

#define DTRACE_DEVSTAT_WAIT_START() do {				\
	if (dtrace_io_wait_start_probe != NULL)				\
		(*dtrace_io_wait_start_probe)(dtio_wait_start_id, NULL, ds); \
} while (0)

#define DTRACE_DEVSTAT_WAIT_DONE() do {					\
	if (dtrace_io_wait_done_probe != NULL)				\
		(*dtrace_io_wait_done_probe)(dtio_wait_done_id, NULL, ds); \
} while (0)

#else /* ! KDTRACE_HOOKS */

/* Without KDTRACE_HOOKS the probe macros compile away to no-ops. */
#define DTRACE_DEVSTAT_START()		do { } while (0)

#define DTRACE_DEVSTAT_BIO_START()	do { } while (0)

#define DTRACE_DEVSTAT_DONE()		do { } while (0)

#define DTRACE_DEVSTAT_BIO_DONE()	do { } while (0)

#define DTRACE_DEVSTAT_WAIT_START()	do { } while (0)

#define DTRACE_DEVSTAT_WAIT_DONE()	do { } while (0)
#endif /* KDTRACE_HOOKS */
100
/*
 * Global devstat bookkeeping, all protected by devstat_mutex.
 * devstat_generation is bumped on every list mutation so that readers
 * (sysctl_devstat and mmap(2) consumers) can detect staleness.
 */
static int devstat_num_devs;		/* Entries linked on device_statq. */
static long devstat_generation = 1;	/* List mutation counter. */
static int devstat_version = DEVSTAT_VERSION;	/* Exported ABI version. */
static int devstat_current_devnumber;	/* Next device_number to hand out. */
static struct mtx devstat_mutex;
MTX_SYSINIT(devstat_mutex, &devstat_mutex, "devstat", MTX_DEF);

/* Priority-sorted list of all registered (non-anonymous) devstat entries. */
static struct devstatlist device_statq = STAILQ_HEAD_INITIALIZER(device_statq);
static struct devstat *devstat_alloc(void);
static void devstat_free(struct devstat *);
static void devstat_add_entry(struct devstat *ds, const void *dev_name,
		       int unit_number, uint32_t block_size,
		       devstat_support_flags flags,
		       devstat_type_flags device_type,
		       devstat_priority priority);
116
117/*
118 * Allocate a devstat and initialize it
119 */
120struct devstat *
121devstat_new_entry(const void *dev_name,
122		  int unit_number, uint32_t block_size,
123		  devstat_support_flags flags,
124		  devstat_type_flags device_type,
125		  devstat_priority priority)
126{
127	struct devstat *ds;
128
129	mtx_assert(&devstat_mutex, MA_NOTOWNED);
130
131	ds = devstat_alloc();
132	mtx_lock(&devstat_mutex);
133	if (unit_number == -1) {
134		ds->unit_number = unit_number;
135		ds->id = dev_name;
136		binuptime(&ds->creation_time);
137		devstat_generation++;
138	} else {
139		devstat_add_entry(ds, dev_name, unit_number, block_size,
140				  flags, device_type, priority);
141	}
142	mtx_unlock(&devstat_mutex);
143	return (ds);
144}
145
146/*
147 * Take a malloced and zeroed devstat structure given to us, fill it in
148 * and add it to the queue of devices.
149 */
/*
 * Fill in a zeroed devstat structure and insert it into device_statq,
 * keeping the list sorted by descending priority and, within equal
 * priority, by probe order.  dev_name must be a NUL-terminated string
 * here (the anonymous unit_number == -1 case never reaches this
 * function).  Caller must hold devstat_mutex.
 */
static void
devstat_add_entry(struct devstat *ds, const void *dev_name,
		  int unit_number, uint32_t block_size,
		  devstat_support_flags flags,
		  devstat_type_flags device_type,
		  devstat_priority priority)
{
	struct devstatlist *devstat_head;
	struct devstat *ds_tmp;

	mtx_assert(&devstat_mutex, MA_OWNED);
	devstat_num_devs++;

	devstat_head = &device_statq;

	/*
	 * Priority sort.  Each driver passes in its priority when it adds
	 * its devstat entry.  Drivers are sorted first by priority, and
	 * then by probe order.
	 *
	 * For the first device, we just insert it, since the priority
	 * doesn't really matter yet.  Subsequent devices are inserted into
	 * the list using the order outlined above.
	 */
	if (devstat_num_devs == 1)
		STAILQ_INSERT_TAIL(devstat_head, ds, dev_links);
	else {
		STAILQ_FOREACH(ds_tmp, devstat_head, dev_links) {
			struct devstat *ds_next;

			ds_next = STAILQ_NEXT(ds_tmp, dev_links);

			/*
			 * If we find a break between higher and lower
			 * priority items, and if this item fits in the
			 * break, insert it.  This also applies if the
			 * "lower priority item" is the end of the list.
			 */
			if ((priority <= ds_tmp->priority)
			 && ((ds_next == NULL)
			   || (priority > ds_next->priority))) {
				STAILQ_INSERT_AFTER(devstat_head, ds_tmp, ds,
						    dev_links);
				break;
			} else if (priority > ds_tmp->priority) {
				/*
				 * If this is the case, we should be able
				 * to insert ourselves at the head of the
				 * list.  If we can't, something is wrong.
				 */
				if (ds_tmp == STAILQ_FIRST(devstat_head)) {
					STAILQ_INSERT_HEAD(devstat_head,
							   ds, dev_links);
					break;
				} else {
					/*
					 * Fallback: append and complain so a
					 * broken sort is visible rather than
					 * silently dropping the entry.
					 */
					STAILQ_INSERT_TAIL(devstat_head,
							   ds, dev_links);
					printf("devstat_add_entry: HELP! "
					       "sorting problem detected "
					       "for name %p unit %d\n",
					       dev_name, unit_number);
					break;
				}
			}
		}
	}

	/* Fill in the entry's identity and bump the list generation. */
	ds->device_number = devstat_current_devnumber++;
	ds->unit_number = unit_number;
	strlcpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN);
	ds->block_size = block_size;
	ds->flags = flags;
	ds->device_type = device_type;
	ds->priority = priority;
	binuptime(&ds->creation_time);
	devstat_generation++;
227
228/*
229 * Remove a devstat structure from the list of devices.
230 */
/*
 * Unregister a devstat entry and return its storage to the page pool.
 * Safe to call with ds == NULL (no-op).  Anonymous entries
 * (unit_number == -1) were never linked, so only registered entries
 * are removed from the queue.
 */
void
devstat_remove_entry(struct devstat *ds)
{
	struct devstatlist *devstat_head;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);
	if (ds == NULL)
		return;

	mtx_lock(&devstat_mutex);

	devstat_head = &device_statq;

	/* Remove this entry from the devstat queue */
	/*
	 * Bump sequence1 (without a matching sequence0 bump) so mmap(2)
	 * readers see the structure as permanently inconsistent while it
	 * is being torn down and recycled.
	 */
	atomic_add_acq_int(&ds->sequence1, 1);
	if (ds->unit_number != -1) {
		devstat_num_devs--;
		STAILQ_REMOVE(devstat_head, ds, devstat, dev_links);
	}
	devstat_free(ds);
	devstat_generation++;
	mtx_unlock(&devstat_mutex);
}
254
255/*
256 * Record a transaction start.
257 *
258 * See comments for devstat_end_transaction().  Ordering is very important
259 * here.
260 */
/*
 * Record a transaction start.
 *
 * The sequence1/sequence0 bumps bracket the update so lockless mmap(2)
 * readers can detect a torn snapshot; see the big comment above
 * devstat_end_transaction() for the full protocol.  "now", if non-NULL,
 * supplies the timestamp and avoids a binuptime() call.
 */
void
devstat_start_transaction(struct devstat *ds, struct bintime *now)
{

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/* sanity check */
	if (ds == NULL)
		return;

	atomic_add_acq_int(&ds->sequence1, 1);
	/*
	 * We only want to set the start time when we are going from idle
	 * to busy.  The start time is really the start of the latest busy
	 * period.
	 */
	if (ds->start_count == ds->end_count) {
		if (now != NULL)
			ds->busy_from = *now;
		else
			binuptime(&ds->busy_from);
	}
	ds->start_count++;
	atomic_add_rel_int(&ds->sequence0, 1);
	DTRACE_DEVSTAT_START();
}
287
/*
 * Record a transaction start for a bio: stamp the bio's start time
 * (bio_t0), which devstat_end_transaction_bio_bt() later uses to
 * compute the operation's duration, then account the start.
 */
void
devstat_start_transaction_bio(struct devstat *ds, struct bio *bp)
{

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/* sanity check */
	if (ds == NULL)
		return;

	binuptime(&bp->bio_t0);
	devstat_start_transaction(ds, &bp->bio_t0);
	DTRACE_DEVSTAT_BIO_START();
}
302
303/*
 * Record the ending of a transaction, and increment the various counters.
305 *
306 * Ordering in this function, and in devstat_start_transaction() is VERY
307 * important.  The idea here is to run without locks, so we are very
308 * careful to only modify some fields on the way "down" (i.e. at
309 * transaction start) and some fields on the way "up" (i.e. at transaction
310 * completion).  One exception is busy_from, which we only modify in
311 * devstat_start_transaction() when there are no outstanding transactions,
312 * and thus it can't be modified in devstat_end_transaction()
313 * simultaneously.
314 *
315 * The sequence0 and sequence1 fields are provided to enable an application
316 * spying on the structures with mmap(2) to tell when a structure is in a
317 * consistent state or not.
318 *
319 * For this to work 100% reliably, it is important that the two fields
320 * are at opposite ends of the structure and that they are incremented
321 * in the opposite order of how a memcpy(3) in userland would copy them.
322 * We assume that the copying happens front to back, but there is actually
323 * no way short of writing your own memcpy(3) replacement to guarantee
324 * this will be the case.
325 *
326 * In addition to this, being a kind of locks, they must be updated with
327 * atomic instructions using appropriate memory barriers.
328 */
/*
 * Record a transaction completion: accumulate byte/operation/tag counts,
 * per-flag durations ("then" is the operation's start time, if supplied),
 * and busy time.  See the block comment above for the lockless
 * sequence0/sequence1 snapshot protocol; the update order here is
 * deliberate and must not be rearranged.
 */
void
devstat_end_transaction(struct devstat *ds, uint32_t bytes,
			devstat_tag_type tag_type, devstat_trans_flags flags,
			struct bintime *now, struct bintime *then)
{
	struct bintime dt, lnow;

	/* sanity check */
	if (ds == NULL)
		return;

	/* No completion time supplied: sample the current uptime. */
	if (now == NULL) {
		now = &lnow;
		binuptime(now);
	}

	atomic_add_acq_int(&ds->sequence1, 1);
	/* Update byte and operations counts */
	/* "flags" (read/write/free/no-data) indexes the per-type arrays. */
	ds->bytes[flags] += bytes;
	ds->operations[flags]++;

	/*
	 * Keep a count of the various tag types sent.
	 */
	if ((ds->flags & DEVSTAT_NO_ORDERED_TAGS) == 0 &&
	    tag_type != DEVSTAT_TAG_NONE)
		ds->tag_types[tag_type]++;

	if (then != NULL) {
		/* Update duration of operations */
		dt = *now;
		bintime_sub(&dt, then);
		bintime_add(&ds->duration[flags], &dt);
	}

	/* Accumulate busy time */
	dt = *now;
	bintime_sub(&dt, &ds->busy_from);
	bintime_add(&ds->busy_time, &dt);
	ds->busy_from = *now;

	ds->end_count++;
	atomic_add_rel_int(&ds->sequence0, 1);
	DTRACE_DEVSTAT_DONE();
}
374
/*
 * Convenience wrapper: end a bio transaction using the current time
 * (a NULL "now" makes devstat_end_transaction() sample binuptime()).
 */
void
devstat_end_transaction_bio(struct devstat *ds, struct bio *bp)
{

	devstat_end_transaction_bio_bt(ds, bp, NULL);
}
381
382void
383devstat_end_transaction_bio_bt(struct devstat *ds, struct bio *bp,
384    struct bintime *now)
385{
386	devstat_trans_flags flg;
387
388	/* sanity check */
389	if (ds == NULL)
390		return;
391
392	if (bp->bio_cmd == BIO_DELETE)
393		flg = DEVSTAT_FREE;
394	else if (bp->bio_cmd == BIO_READ)
395		flg = DEVSTAT_READ;
396	else if (bp->bio_cmd == BIO_WRITE)
397		flg = DEVSTAT_WRITE;
398	else
399		flg = DEVSTAT_NO_DATA;
400
401	devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
402				DEVSTAT_TAG_SIMPLE, flg, now, &bp->bio_t0);
403	DTRACE_DEVSTAT_BIO_DONE();
404}
405
406/*
407 * This is the sysctl handler for the devstat package.  The data pushed out
408 * on the kern.devstat.all sysctl variable consists of the current devstat
409 * generation number, and then an array of devstat structures, one for each
410 * device in the system.
411 *
412 * This is more cryptic that obvious, but basically we neither can nor
413 * want to hold the devstat_mutex for any amount of time, so we grab it
414 * only when we need to and keep an eye on devstat_generation all the time.
415 */
/*
 * Handler for kern.devstat.all: emit the generation number followed by
 * one struct devstat per device.  The mutex is taken only briefly
 * between copies; if the generation changes mid-walk the list may have
 * been modified under us, so we bail with EBUSY and let userland retry.
 */
static int
sysctl_devstat(SYSCTL_HANDLER_ARGS)
{
	int error;
	long mygen;
	struct devstat *nds;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);

	/*
	 * XXX devstat_generation should really be "volatile" but that
	 * XXX freaks out the sysctl macro below.  The places where we
	 * XXX change it and inspect it are bracketed in the mutex which
	 * XXX guarantees us proper write barriers.  I don't believe the
	 * XXX compiler is allowed to optimize mygen away across calls
	 * XXX to other functions, so the following is believed to be safe.
	 */
	mygen = devstat_generation;

	error = SYSCTL_OUT(req, &mygen, sizeof(mygen));

	/*
	 * NOTE(review): with no devices we return success even if the
	 * SYSCTL_OUT above failed — confirm this ordering is intentional.
	 */
	if (devstat_num_devs == 0)
		return(0);

	if (error != 0)
		return (error);

	mtx_lock(&devstat_mutex);
	nds = STAILQ_FIRST(&device_statq);
	if (mygen != devstat_generation)
		error = EBUSY;
	mtx_unlock(&devstat_mutex);

	if (error != 0)
		return (error);

	/* Copy entries out one at a time, revalidating the generation. */
	for (;nds != NULL;) {
		error = SYSCTL_OUT(req, nds, sizeof(struct devstat));
		if (error != 0)
			return (error);
		mtx_lock(&devstat_mutex);
		if (mygen != devstat_generation)
			error = EBUSY;
		else
			nds = STAILQ_NEXT(nds, dev_links);
		mtx_unlock(&devstat_mutex);
		if (error != 0)
			return (error);
	}
	return(error);
}
467
468/*
469 * Sysctl entries for devstat.  The first one is a node that all the rest
470 * hang off of.
471 */
472static SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, NULL,
473    "Device Statistics");
474
475SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE,
476    NULL, 0, sysctl_devstat, "S,devstat", "All devices in the devstat list");
477/*
478 * Export the number of devices in the system so that userland utilities
479 * can determine how much memory to allocate to hold all the devices.
480 */
481SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD,
482    &devstat_num_devs, 0, "Number of devices in the devstat list");
483SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD,
484    &devstat_generation, 0, "Devstat list generation");
485SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD,
486    &devstat_version, 0, "Devstat list version number");
487
488/*
489 * Allocator for struct devstat structures.  We sub-allocate these from pages
490 * which we get from malloc.  These pages are exported for mmap(2)'ing through
491 * a miniature device driver
492 */
493
/* Number of devstat structures that fit in one sub-allocated page. */
#define statsperpage (PAGE_SIZE / sizeof(struct devstat))

static d_mmap_t devstat_mmap;

/* Character device exporting the stats pages read-only via mmap(2). */
static struct cdevsw devstat_cdevsw = {
	.d_version =	D_VERSION,
	.d_flags =	D_NEEDGIANT,
	.d_mmap =	devstat_mmap,
	.d_name =	"devstat",
};

/* One page worth of devstat structures plus its free-slot count. */
struct statspage {
	TAILQ_ENTRY(statspage)	list;
	struct devstat		*stat;	/* Base of this page of entries. */
	u_int			nfree;	/* Unallocated entries on the page. */
};

/* List order defines the mmap(2) page order; see devstat_alloc(). */
static TAILQ_HEAD(, statspage)	pagelist = TAILQ_HEAD_INITIALIZER(pagelist);
static MALLOC_DEFINE(M_DEVSTAT, "devstat", "Device statistics");
513
514static int
515devstat_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
516    int nprot, vm_memattr_t *memattr)
517{
518	struct statspage *spp;
519
520	if (nprot != VM_PROT_READ)
521		return (-1);
522	TAILQ_FOREACH(spp, &pagelist, list) {
523		if (offset == 0) {
524			*paddr = vtophys(spp->stat);
525			return (0);
526		}
527		offset -= PAGE_SIZE;
528	}
529	return (-1);
530}
531
/*
 * Hand out one zeroed struct devstat from the page pool, growing the
 * pool with a fresh page if every existing page is full.  malloc() may
 * sleep, so the mutex is dropped around allocation and the free-page
 * search is redone afterwards; a page that turns out to be unneeded is
 * freed after the lock is released.
 */
static struct devstat *
devstat_alloc(void)
{
	struct devstat *dsp;
	struct statspage *spp, *spp2;
	u_int u;
	static int once;

	mtx_assert(&devstat_mutex, MA_NOTOWNED);
	/* First call: create the /dev node userland mmap(2)s. */
	if (!once) {
		make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME,
		    &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0400,
		    DEVSTAT_DEVICE_NAME);
		once = 1;
	}
	spp2 = NULL;
	mtx_lock(&devstat_mutex);
	for (;;) {
		/* Look for any page with a free slot. */
		TAILQ_FOREACH(spp, &pagelist, list) {
			if (spp->nfree > 0)
				break;
		}
		if (spp != NULL)
			break;
		/* All pages full: allocate a new one without the lock. */
		mtx_unlock(&devstat_mutex);
		spp2 = malloc(sizeof *spp, M_DEVSTAT, M_ZERO | M_WAITOK);
		spp2->stat = malloc(PAGE_SIZE, M_DEVSTAT, M_ZERO | M_WAITOK);
		spp2->nfree = statsperpage;

		/*
		 * If free statspages were added while the lock was released
		 * just reuse them.
		 */
		mtx_lock(&devstat_mutex);
		TAILQ_FOREACH(spp, &pagelist, list)
			if (spp->nfree > 0)
				break;
		if (spp == NULL) {
			spp = spp2;

			/*
			 * It would make more sense to add the new page at the
			 * head but the order on the list determine the
			 * sequence of the mapping so we can't do that.
			 */
			TAILQ_INSERT_TAIL(&pagelist, spp, list);
		} else
			break;
	}
	/* Claim the first unallocated slot on the chosen page. */
	dsp = spp->stat;
	for (u = 0; u < statsperpage; u++) {
		if (dsp->allocated == 0)
			break;
		dsp++;
	}
	spp->nfree--;
	dsp->allocated = 1;
	mtx_unlock(&devstat_mutex);
	/* Discard a freshly-allocated page we ended up not using. */
	if (spp2 != NULL && spp2 != spp) {
		free(spp2->stat, M_DEVSTAT);
		free(spp2, M_DEVSTAT);
	}
	return (dsp);
}
596
597static void
598devstat_free(struct devstat *dsp)
599{
600	struct statspage *spp;
601
602	mtx_assert(&devstat_mutex, MA_OWNED);
603	bzero(dsp, sizeof *dsp);
604	TAILQ_FOREACH(spp, &pagelist, list) {
605		if (dsp >= spp->stat && dsp < (spp->stat + statsperpage)) {
606			spp->nfree++;
607			return;
608		}
609	}
610}
611
/* Export sizeof(struct devstat) so userland can sanity-check its layout. */
SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD,
    NULL, sizeof(struct devstat), "sizeof(struct devstat)");
614