kern_mutex.c revision 72224
1/*-
2 * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 * 3. Berkeley Software Design Inc's name may not be used to endorse or
13 *    promote products derived from this software without specific prior
14 *    written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
29 *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
30 * $FreeBSD: head/sys/kern/kern_mutex.c 72224 2001-02-09 15:19:41Z jhb $
31 */
32
33/*
34 * Machine independent bits of mutex implementation and implementation of
35 * `witness' structure & related debugging routines.
36 */
37
38/*
39 *	Main Entry: witness
40 *	Pronunciation: 'wit-n&s
41 *	Function: noun
42 *	Etymology: Middle English witnesse, from Old English witnes knowledge,
43 *	    testimony, witness, from 2wit
44 *	Date: before 12th century
45 *	1 : attestation of a fact or event : TESTIMONY
46 *	2 : one that gives evidence; specifically : one who testifies in
47 *	    a cause or before a judicial tribunal
48 *	3 : one asked to be present at a transaction so as to be able to
49 *	    testify to its having taken place
50 *	4 : one who has personal knowledge of something
51 *	5 a : something serving as evidence or proof : SIGN
52 *	  b : public affirmation by word or example of usually
53 *	      religious faith or conviction <the heroic witness to divine
54 *	      life -- Pilot>
55 *	6 capitalized : a member of the Jehovah's Witnesses
56 */
57
58#include "opt_ddb.h"
59#include "opt_witness.h"
60
61#include <sys/param.h>
62#include <sys/bus.h>
63#include <sys/kernel.h>
64#include <sys/malloc.h>
65#include <sys/proc.h>
66#include <sys/sysctl.h>
67#include <sys/systm.h>
68#include <sys/vmmeter.h>
69#include <sys/ktr.h>
70
71#include <machine/atomic.h>
72#include <machine/bus.h>
73#include <machine/clock.h>
74#include <machine/cpu.h>
75
76#include <ddb/ddb.h>
77
78#include <vm/vm.h>
79#include <vm/vm_extern.h>
80
81#include <sys/mutex.h>
82
83/*
84 * The WITNESS-enabled mutex debug structure.
85 */
86#ifdef WITNESS
87struct mtx_debug {
88	struct witness	*mtxd_witness;
89	LIST_ENTRY(mtx)	mtxd_held;
90	const char	*mtxd_file;
91	int		mtxd_line;
92};
93
94#define mtx_held	mtx_debug->mtxd_held
95#define	mtx_file	mtx_debug->mtxd_file
96#define	mtx_line	mtx_debug->mtxd_line
97#define	mtx_witness	mtx_debug->mtxd_witness
98#endif	/* WITNESS */
99
100/*
101 * Internal utility macros.
102 */
103#define mtx_unowned(m)	((m)->mtx_lock == MTX_UNOWNED)
104
105#define mtx_owner(m)	(mtx_unowned((m)) ? NULL \
106	: (struct proc *)((m)->mtx_lock & MTX_FLAGMASK))
107
108#define RETIP(x)		*(((uintptr_t *)(&x)) - 1)
109#define SET_PRIO(p, pri)	(p)->p_priority = (pri)
110
111/*
112 * Early WITNESS-enabled declarations.
113 */
114#ifdef WITNESS
115
116/*
117 * Internal WITNESS routines which must be prototyped early.
118 *
119 * XXX: When/if witness code is cleaned up, it would be wise to place all
120 *	witness prototyping early in this file.
121 */
122static void witness_init(struct mtx *, int flag);
123static void witness_destroy(struct mtx *);
124static void witness_display(void(*)(const char *fmt, ...));
125
126MALLOC_DEFINE(M_WITNESS, "witness", "witness mtx_debug structure");
127
128/* All mutexes in system (used for debug/panic) */
129static struct mtx_debug all_mtx_debug = { NULL, {NULL, NULL}, NULL, 0 };
130
131/*
132 * This global is set to 0 once it becomes safe to use the witness code.
133 */
134static int witness_cold = 1;
135
136#else	/* WITNESS */
137
138/* XXX XXX XXX
139 * flag++ is sleazoid way of shuting up warning
140 */
141#define witness_init(m, flag) flag++
142#define witness_destroy(m)
143#define witness_try_enter(m, t, f, l)
144#endif	/* WITNESS */
145
146/*
147 * All mutex locks in system are kept on the all_mtx list.
148 */
149static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, 0, "All mutexes queue head",
150	TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
151	{ NULL, NULL }, &all_mtx, &all_mtx,
152#ifdef WITNESS
153	&all_mtx_debug
154#else
155	NULL
156#endif
157	 };
158
159/*
160 * Global variables for book keeping.
161 */
162static int	mtx_cur_cnt;
163static int	mtx_max_cnt;
164
165/*
166 * Prototypes for non-exported routines.
167 *
168 * NOTE: Prototypes for witness routines are placed at the bottom of the file.
169 */
170static void	propagate_priority(struct proc *);
171
172static void
173propagate_priority(struct proc *p)
174{
175	int pri = p->p_priority;
176	struct mtx *m = p->p_blocked;
177
178	mtx_assert(&sched_lock, MA_OWNED);
179	for (;;) {
180		struct proc *p1;
181
182		p = mtx_owner(m);
183
184		if (p == NULL) {
185			/*
186			 * This really isn't quite right. Really
187			 * ought to bump priority of process that
188			 * next acquires the mutex.
189			 */
190			MPASS(m->mtx_lock == MTX_CONTESTED);
191			return;
192		}
193
194		MPASS(p->p_magic == P_MAGIC);
195		KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex"));
196		if (p->p_priority <= pri)
197			return;
198
199		/*
200		 * Bump this process' priority.
201		 */
202		SET_PRIO(p, pri);
203
204		/*
205		 * If lock holder is actually running, just bump priority.
206		 */
207#ifdef SMP
208		/*
209		 * For SMP, we can check the p_oncpu field to see if we are
210		 * running.
211		 */
212		if (p->p_oncpu != 0xff) {
213			MPASS(p->p_stat == SRUN || p->p_stat == SZOMB);
214			return;
215		}
216#else
217		/*
218		 * For UP, we check to see if p is curproc (this shouldn't
219		 * ever happen however as it would mean we are in a deadlock.)
220		 */
221		if (p == curproc) {
222			panic("Deadlock detected");
223			return;
224		}
225#endif
226		/*
227		 * If on run queue move to new run queue, and
228		 * quit.
229		 */
230		if (p->p_stat == SRUN) {
231			printf("XXX: moving proc %d(%s) to a new run queue\n",
232			       p->p_pid, p->p_comm);
233			MPASS(p->p_blocked == NULL);
234			remrunqueue(p);
235			setrunqueue(p);
236			return;
237		}
238
239		/*
240		 * If we aren't blocked on a mutex, we should be.
241		 */
242		KASSERT(p->p_stat == SMTX, (
243		    "process %d(%s):%d holds %s but isn't blocked on a mutex\n",
244		    p->p_pid, p->p_comm, p->p_stat,
245		    m->mtx_description));
246
247		/*
248		 * Pick up the mutex that p is blocked on.
249		 */
250		m = p->p_blocked;
251		MPASS(m != NULL);
252
253		printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid,
254		    p->p_comm, m->mtx_description);
255
256		/*
257		 * Check if the proc needs to be moved up on
258		 * the blocked chain
259		 */
260		if (p == TAILQ_FIRST(&m->mtx_blocked)) {
261			printf("XXX: process at head of run queue\n");
262			continue;
263		}
264
265		p1 = TAILQ_PREV(p, rq, p_procq);
266		if (p1->p_priority <= pri) {
267			printf(
268			   "XXX: previous process %d(%s) has higher priority\n",
269	                    p->p_pid, p->p_comm);
270			continue;
271		}
272
273		/*
274		 * Remove proc from blocked chain and determine where
275		 * it should be moved up to.  Since we know that p1 has
276		 * a lower priority than p, we know that at least one
277		 * process in the chain has a lower priority and that
278		 * p1 will thus not be NULL after the loop.
279		 */
280		TAILQ_REMOVE(&m->mtx_blocked, p, p_procq);
281		TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) {
282			MPASS(p1->p_magic == P_MAGIC);
283			if (p1->p_priority > pri)
284				break;
285		}
286
287		MPASS(p1 != NULL);
288		TAILQ_INSERT_BEFORE(p1, p, p_procq);
289		CTR4(KTR_LOCK,
290		    "propagate_priority: p %p moved before %p on [%p] %s",
291		    p, p1, m, m->mtx_description);
292	}
293}
294
295/*
296 * The important part of mtx_trylock{,_flags}()
297 * Tries to acquire lock `m.' We do NOT handle recursion here; we assume that
298 * if we're called, it's because we know we don't already own this lock.
299 */
300int
301_mtx_trylock(struct mtx *m, int opts, const char *file, int line)
302{
303	int rval;
304
305	KASSERT(CURPROC != NULL, ("curproc is NULL in _mtx_trylock"));
306
307	/*
308	 * _mtx_trylock does not accept MTX_NOSWITCH option.
309	 */
310	MPASS((opts & MTX_NOSWITCH) == 0);
311
312	rval = _obtain_lock(m, CURTHD);
313
314#ifdef WITNESS
315	if (rval && m->mtx_witness != NULL) {
316		/*
317		 * We do not handle recursion in _mtx_trylock; see the
318		 * note at the top of the routine.
319		 */
320		MPASS(!mtx_recursed(m));
321		witness_try_enter(m, (opts | m->mtx_flags), file, line);
322	}
323#endif	/* WITNESS */
324
325	if ((opts & MTX_QUIET) == 0)
326		CTR5(KTR_LOCK, "TRY_ENTER %s [%p] result=%d at %s:%d",
327		    m->mtx_description, m, rval, file, line);
328
329	return rval;
330}
331
332/*
333 * _mtx_lock_sleep: the tougher part of acquiring an MTX_DEF lock.
334 *
335 * We call this if the lock is either contested (i.e. we need to go to
336 * sleep waiting for it), or if we need to recurse on it.
337 */
338void
339_mtx_lock_sleep(struct mtx *m, int opts, const char *file, int line)
340{
341	struct proc *p = CURPROC;
342
343	if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)p) {
344		m->mtx_recurse++;
345		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
346		if ((opts & MTX_QUIET) == 0)
347			CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recurse", m);
348		return;
349	}
350
351	if ((opts & MTX_QUIET) == 0)
352		CTR3(KTR_LOCK, "mtx_lock: %p contested (lock=%p) [%p]", m,
353		    (void *)m->mtx_lock, (void *)RETIP(m));
354
355	/*
356	 * Save our priority. Even though p_nativepri is protected by
357	 * sched_lock, we don't obtain it here as it can be expensive.
358	 * Since this is the only place p_nativepri is set, and since two
359	 * CPUs will not be executing the same process concurrently, we know
360	 * that no other CPU is going to be messing with this. Also,
361	 * p_nativepri is only read when we are blocked on a mutex, so that
362	 * can't be happening right now either.
363	 */
364	p->p_nativepri = p->p_priority;
365
366	while (!_obtain_lock(m, p)) {
367		uintptr_t v;
368		struct proc *p1;
369
370		mtx_lock_spin(&sched_lock);
371		/*
372		 * Check if the lock has been released while spinning for
373		 * the sched_lock.
374		 */
375		if ((v = m->mtx_lock) == MTX_UNOWNED) {
376			mtx_unlock_spin(&sched_lock);
377			continue;
378		}
379
380		/*
381		 * The mutex was marked contested on release. This means that
382		 * there are processes blocked on it.
383		 */
384		if (v == MTX_CONTESTED) {
385			p1 = TAILQ_FIRST(&m->mtx_blocked);
386			KASSERT(p1 != NULL,
387			    ("contested mutex has no contesters"));
388			m->mtx_lock = (uintptr_t)p | MTX_CONTESTED;
389
390			if (p1->p_priority < p->p_priority)
391				SET_PRIO(p, p1->p_priority);
392			mtx_unlock_spin(&sched_lock);
393			return;
394		}
395
396		/*
397		 * If the mutex isn't already contested and a failure occurs
398		 * setting the contested bit, the mutex was either released
399		 * or the state of the MTX_RECURSED bit changed.
400		 */
401		if ((v & MTX_CONTESTED) == 0 &&
402		    !atomic_cmpset_ptr(&m->mtx_lock, (void *)v,
403			(void *)(v | MTX_CONTESTED))) {
404			mtx_unlock_spin(&sched_lock);
405			continue;
406		}
407
408		/*
409		 * We deffinately must sleep for this lock.
410		 */
411		mtx_assert(m, MA_NOTOWNED);
412
413#ifdef notyet
414		/*
415		 * If we're borrowing an interrupted thread's VM context, we
416		 * must clean up before going to sleep.
417		 */
418		if (p->p_flag & (P_ITHD | P_SITHD)) {
419			ithd_t *it = (ithd_t *)p;
420
421			if (it->it_interrupted) {
422				if ((opts & MTX_QUIET) == 0)
423					CTR2(KTR_LOCK,
424					    "mtx_lock: 0x%x interrupted 0x%x",
425					    it, it->it_interrupted);
426				intr_thd_fixup(it);
427			}
428		}
429#endif
430
431		/*
432		 * Put us on the list of threads blocked on this mutex.
433		 */
434		if (TAILQ_EMPTY(&m->mtx_blocked)) {
435			p1 = (struct proc *)(m->mtx_lock & MTX_FLAGMASK);
436			LIST_INSERT_HEAD(&p1->p_contested, m, mtx_contested);
437			TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq);
438		} else {
439			TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq)
440				if (p1->p_priority > p->p_priority)
441					break;
442			if (p1)
443				TAILQ_INSERT_BEFORE(p1, p, p_procq);
444			else
445				TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq);
446		}
447
448		/*
449		 * Save who we're blocked on.
450		 */
451		p->p_blocked = m;
452		p->p_mtxname = m->mtx_description;
453		p->p_stat = SMTX;
454#if 0
455		propagate_priority(p);
456#endif
457
458		if ((opts & MTX_QUIET) == 0)
459			CTR3(KTR_LOCK,
460			    "_mtx_lock_sleep: p %p blocked on [%p] %s", p, m,
461			    m->mtx_description);
462
463		mi_switch();
464
465		if ((opts & MTX_QUIET) == 0)
466			CTR3(KTR_LOCK,
467			  "_mtx_lock_sleep: p %p free from blocked on [%p] %s",
468			  p, m, m->mtx_description);
469
470		mtx_unlock_spin(&sched_lock);
471	}
472
473	return;
474}
475
476/*
477 * _mtx_lock_spin: the tougher part of acquiring an MTX_SPIN lock.
478 *
479 * This is only called if we need to actually spin for the lock. Recursion
480 * is handled inline.
481 */
482void
483_mtx_lock_spin(struct mtx *m, int opts, u_int mtx_intr, const char *file,
484	       int line)
485{
486	int i = 0;
487
488	if ((opts & MTX_QUIET) == 0)
489		CTR1(KTR_LOCK, "mtx_lock_spin: %p spinning", m);
490
491	for (;;) {
492		if (_obtain_lock(m, CURPROC))
493			break;
494
495		while (m->mtx_lock != MTX_UNOWNED) {
496			if (i++ < 1000000)
497				continue;
498			if (i++ < 6000000)
499				DELAY(1);
500#ifdef DDB
501			else if (!db_active)
502#else
503			else
504#endif
505			panic("spin lock %s held by %p for > 5 seconds",
506			    m->mtx_description, (void *)m->mtx_lock);
507		}
508	}
509
510	m->mtx_saveintr = mtx_intr;
511	if ((opts & MTX_QUIET) == 0)
512		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m);
513
514	return;
515}
516
517/*
518 * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
519 *
520 * We are only called here if the lock is recursed or contested (i.e. we
521 * need to wake up a blocked thread).
522 */
523void
524_mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
525{
526	struct proc *p, *p1;
527	struct mtx *m1;
528	int pri;
529
530	p = CURPROC;
531	MPASS4(mtx_owned(m), "mtx_owned(mpp)", file, line);
532
533	if ((opts & MTX_QUIET) == 0)
534		CTR5(KTR_LOCK, "REL %s [%p] r=%d at %s:%d", m->mtx_description,
535		    m, m->mtx_recurse, file, line);
536
537	if (mtx_recursed(m)) {
538		if (--(m->mtx_recurse) == 0)
539			atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
540		if ((opts & MTX_QUIET) == 0)
541			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m);
542		return;
543	}
544
545	mtx_lock_spin(&sched_lock);
546	if ((opts & MTX_QUIET) == 0)
547		CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m);
548
549	p1 = TAILQ_FIRST(&m->mtx_blocked);
550	MPASS(p->p_magic == P_MAGIC);
551	MPASS(p1->p_magic == P_MAGIC);
552
553	TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq);
554
555	if (TAILQ_EMPTY(&m->mtx_blocked)) {
556		LIST_REMOVE(m, mtx_contested);
557		_release_lock_quick(m);
558		if ((opts & MTX_QUIET) == 0)
559			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m);
560	} else
561		atomic_store_rel_ptr(&m->mtx_lock, (void *)MTX_CONTESTED);
562
563	pri = MAXPRI;
564	LIST_FOREACH(m1, &p->p_contested, mtx_contested) {
565		int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority;
566		if (cp < pri)
567			pri = cp;
568	}
569
570	if (pri > p->p_nativepri)
571		pri = p->p_nativepri;
572	SET_PRIO(p, pri);
573
574	if ((opts & MTX_QUIET) == 0)
575		CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p contested setrunqueue %p",
576		    m, p1);
577
578	p1->p_blocked = NULL;
579	p1->p_mtxname = NULL;
580	p1->p_stat = SRUN;
581	setrunqueue(p1);
582
583	if ((opts & MTX_NOSWITCH) == 0 && p1->p_priority < pri) {
584#ifdef notyet
585		if (p->p_flag & (P_ITHD | P_SITHD)) {
586			ithd_t *it = (ithd_t *)p;
587
588			if (it->it_interrupted) {
589				if ((opts & MTX_QUIET) == 0)
590					CTR2(KTR_LOCK,
591				    "_mtx_unlock_sleep: 0x%x interrupted 0x%x",
592					    it, it->it_interrupted);
593				intr_thd_fixup(it);
594			}
595		}
596#endif
597		setrunqueue(p);
598		if ((opts & MTX_QUIET) == 0)
599			CTR2(KTR_LOCK,
600			    "_mtx_unlock_sleep: %p switching out lock=%p", m,
601			    (void *)m->mtx_lock);
602
603		mi_switch();
604		if ((opts & MTX_QUIET) == 0)
605			CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p",
606			    m, (void *)m->mtx_lock);
607	}
608
609	mtx_unlock_spin(&sched_lock);
610
611	return;
612}
613
614/*
615 * All the unlocking of MTX_SPIN locks is done inline.
616 * See the _rel_spin_lock() macro for the details.
617 */
618
619/*
620 * The INVARIANTS-enabled mtx_assert()
621 */
622#ifdef INVARIANTS
623void
624_mtx_assert(struct mtx *m, int what, const char *file, int line)
625{
626	switch ((what)) {
627	case MA_OWNED:
628	case MA_OWNED | MA_RECURSED:
629	case MA_OWNED | MA_NOTRECURSED:
630		if (!mtx_owned((m)))
631			panic("mutex %s not owned at %s:%d",
632			    (m)->mtx_description, file, line);
633		if (mtx_recursed((m))) {
634			if (((what) & MA_NOTRECURSED) != 0)
635				panic("mutex %s recursed at %s:%d",
636				    (m)->mtx_description, file, line);
637		} else if (((what) & MA_RECURSED) != 0) {
638			panic("mutex %s unrecursed at %s:%d",
639			    (m)->mtx_description, file, line);
640		}
641		break;
642	case MA_NOTOWNED:
643		if (mtx_owned((m)))
644			panic("mutex %s owned at %s:%d",
645			    (m)->mtx_description, file, line);
646		break;
647	default:
648		panic("unknown mtx_assert at %s:%d", file, line);
649	}
650}
651#endif
652
653/*
654 * The MUTEX_DEBUG-enabled mtx_validate()
655 */
656#define MV_DESTROY	0	/* validate before destory */
657#define MV_INIT		1	/* validate before init */
658
659#ifdef MUTEX_DEBUG
660
661int mtx_validate __P((struct mtx *, int));
662
663int
664mtx_validate(struct mtx *m, int when)
665{
666	struct mtx *mp;
667	int i;
668	int retval = 0;
669
670#ifdef WITNESS
671	if (witness_cold)
672		return 0;
673#endif
674	if (m == &all_mtx || cold)
675		return 0;
676
677	mtx_lock(&all_mtx);
678/*
679 * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly
680 * we can re-enable the kernacc() checks.
681 */
682#ifndef __alpha__
683	MPASS(kernacc((caddr_t)all_mtx.mtx_next, sizeof(uintptr_t),
684	    VM_PROT_READ) == 1);
685#endif
686	MPASS(all_mtx.mtx_next->mtx_prev == &all_mtx);
687	for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) {
688#ifndef __alpha__
689		if (kernacc((caddr_t)mp->mtx_next, sizeof(uintptr_t),
690		    VM_PROT_READ) != 1) {
691			panic("mtx_validate: mp=%p mp->mtx_next=%p",
692			    mp, mp->mtx_next);
693		}
694#endif
695		i++;
696		if (i > mtx_cur_cnt) {
697			panic("mtx_validate: too many in chain, known=%d\n",
698			    mtx_cur_cnt);
699		}
700	}
701	MPASS(i == mtx_cur_cnt);
702	switch (when) {
703	case MV_DESTROY:
704		for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next)
705			if (mp == m)
706				break;
707		MPASS(mp == m);
708		break;
709	case MV_INIT:
710		for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next)
711		if (mp == m) {
712			/*
713			 * Not good. This mutex already exists.
714			 */
715			printf("re-initing existing mutex %s\n",
716			    m->mtx_description);
717			MPASS(m->mtx_lock == MTX_UNOWNED);
718			retval = 1;
719		}
720	}
721	mtx_unlock(&all_mtx);
722	return (retval);
723}
724#endif
725
726/*
727 * Mutex initialization routine; initialize lock `m' of type contained in
728 * `opts' with options contained in `opts' and description `description.'
729 * Place on "all_mtx" queue.
730 */
731void
732mtx_init(struct mtx *m, const char *description, int opts)
733{
734
735	if ((opts & MTX_QUIET) == 0)
736		CTR2(KTR_LOCK, "mtx_init %p (%s)", m, description);
737
738#ifdef MUTEX_DEBUG
739	/* Diagnostic and error correction */
740	if (mtx_validate(m, MV_INIT))
741		return;
742#endif
743
744	bzero((void *)m, sizeof *m);
745	TAILQ_INIT(&m->mtx_blocked);
746
747#ifdef WITNESS
748	if (!witness_cold) {
749		m->mtx_debug = malloc(sizeof(struct mtx_debug),
750		    M_WITNESS, M_NOWAIT | M_ZERO);
751		MPASS(m->mtx_debug != NULL);
752	}
753#endif
754
755	m->mtx_description = description;
756	m->mtx_flags = opts;
757	m->mtx_lock = MTX_UNOWNED;
758
759	/* Put on all mutex queue */
760	mtx_lock(&all_mtx);
761	m->mtx_next = &all_mtx;
762	m->mtx_prev = all_mtx.mtx_prev;
763	m->mtx_prev->mtx_next = m;
764	all_mtx.mtx_prev = m;
765	if (++mtx_cur_cnt > mtx_max_cnt)
766		mtx_max_cnt = mtx_cur_cnt;
767	mtx_unlock(&all_mtx);
768
769#ifdef WITNESS
770	if (!witness_cold)
771		witness_init(m, opts);
772#endif
773}
774
775/*
776 * Remove lock `m' from all_mtx queue.
777 */
778void
779mtx_destroy(struct mtx *m)
780{
781
782#ifdef WITNESS
783	KASSERT(!witness_cold, ("%s: Cannot destroy while still cold\n",
784	    __FUNCTION__));
785#endif
786
787	CTR2(KTR_LOCK, "mtx_destroy %p (%s)", m, m->mtx_description);
788
789#ifdef MUTEX_DEBUG
790	if (m->mtx_next == NULL)
791		panic("mtx_destroy: %p (%s) already destroyed",
792		    m, m->mtx_description);
793
794	if (!mtx_owned(m)) {
795		MPASS(m->mtx_lock == MTX_UNOWNED);
796	} else {
797		MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);
798	}
799
800	/* diagnostic */
801	mtx_validate(m, MV_DESTROY);
802#endif
803
804#ifdef WITNESS
805	if (m->mtx_witness)
806		witness_destroy(m);
807#endif /* WITNESS */
808
809	/* Remove from the all mutex queue */
810	mtx_lock(&all_mtx);
811	m->mtx_next->mtx_prev = m->mtx_prev;
812	m->mtx_prev->mtx_next = m->mtx_next;
813
814#ifdef MUTEX_DEBUG
815	m->mtx_next = m->mtx_prev = NULL;
816#endif
817
818#ifdef WITNESS
819	free(m->mtx_debug, M_WITNESS);
820	m->mtx_debug = NULL;
821#endif
822
823	mtx_cur_cnt--;
824	mtx_unlock(&all_mtx);
825}
826
827
828/*
829 * The WITNESS-enabled diagnostic code.
830 */
831#ifdef WITNESS
832static void
833witness_fixup(void *dummy __unused)
834{
835	struct mtx *mp;
836
837	/*
838	 * We have to release Giant before initializing its witness
839	 * structure so that WITNESS doesn't get confused.
840	 */
841	mtx_unlock(&Giant);
842	mtx_assert(&Giant, MA_NOTOWNED);
843
844	mtx_lock(&all_mtx);
845
846	/* Iterate through all mutexes and finish up mutex initialization. */
847	for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) {
848
849		mp->mtx_debug = malloc(sizeof(struct mtx_debug),
850		    M_WITNESS, M_NOWAIT | M_ZERO);
851		MPASS(mp->mtx_debug != NULL);
852
853		witness_init(mp, mp->mtx_flags);
854	}
855	mtx_unlock(&all_mtx);
856
857	/* Mark the witness code as being ready for use. */
858	atomic_store_rel_int(&witness_cold, 0);
859
860	mtx_lock(&Giant);
861}
862SYSINIT(wtnsfxup, SI_SUB_MUTEX, SI_ORDER_FIRST, witness_fixup, NULL)
863
864#define WITNESS_COUNT 200
865#define	WITNESS_NCHILDREN 2
866
867int witness_watch = 1;
868
869struct witness {
870	struct witness	*w_next;
871	const char	*w_description;
872	const char	*w_file;
873	int		 w_line;
874	struct witness	*w_morechildren;
875	u_char		 w_childcnt;
876	u_char		 w_Giant_squawked:1;
877	u_char		 w_other_squawked:1;
878	u_char		 w_same_squawked:1;
879	u_char		 w_spin:1;	/* MTX_SPIN type mutex. */
880	u_int		 w_level;
881	struct witness	*w_children[WITNESS_NCHILDREN];
882};
883
884struct witness_blessed {
885	char 	*b_lock1;
886	char	*b_lock2;
887};
888
889#ifdef DDB
890/*
891 * When DDB is enabled and witness_ddb is set to 1, it will cause the system to
892 * drop into kdebug() when:
893 *	- a lock heirarchy violation occurs
894 *	- locks are held when going to sleep.
895 */
896int	witness_ddb;
897#ifdef WITNESS_DDB
898TUNABLE_INT_DECL("debug.witness_ddb", 1, witness_ddb);
899#else
900TUNABLE_INT_DECL("debug.witness_ddb", 0, witness_ddb);
901#endif
902SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, "");
903#endif /* DDB */
904
905int	witness_skipspin;
906#ifdef WITNESS_SKIPSPIN
907TUNABLE_INT_DECL("debug.witness_skipspin", 1, witness_skipspin);
908#else
909TUNABLE_INT_DECL("debug.witness_skipspin", 0, witness_skipspin);
910#endif
911SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0,
912    "");
913
914/*
915 * Witness-enabled globals
916 */
917static struct mtx	w_mtx;
918static struct witness	*w_free;
919static struct witness	*w_all;
920static int		 w_inited;
921static int		 witness_dead;	/* fatal error, probably no memory */
922
923static struct witness	 w_data[WITNESS_COUNT];
924
925/*
926 * Internal witness routine prototypes
927 */
928static struct witness *enroll(const char *description, int flag);
929static int itismychild(struct witness *parent, struct witness *child);
930static void removechild(struct witness *parent, struct witness *child);
931static int isitmychild(struct witness *parent, struct witness *child);
932static int isitmydescendant(struct witness *parent, struct witness *child);
933static int dup_ok(struct witness *);
934static int blessed(struct witness *, struct witness *);
935static void
936    witness_displaydescendants(void(*)(const char *fmt, ...), struct witness *);
937static void witness_leveldescendents(struct witness *parent, int level);
938static void witness_levelall(void);
939static struct witness * witness_get(void);
940static void witness_free(struct witness *m);
941
942static char *ignore_list[] = {
943	"witness lock",
944	NULL
945};
946
947static char *spin_order_list[] = {
948#if defined(__i386__) && defined (SMP)
949	"com",
950#endif
951	"sio",
952#ifdef __i386__
953	"cy",
954#endif
955	"sched lock",
956#ifdef __i386__
957	"clk",
958#endif
959	"callout",
960	/*
961	 * leaf locks
962	 */
963	"ithread table lock",
964	"ithread list lock",
965#ifdef SMP
966#ifdef __i386__
967	"ap boot",
968	"imen",
969#endif
970	"smp rendezvous",
971#endif
972	NULL
973};
974
975static char *order_list[] = {
976	"Giant", "uidinfo hash", "uidinfo struct", NULL,
977	"Giant", "proctree", "allproc", "process lock", NULL,
978	NULL
979};
980
981static char *dup_list[] = {
982	NULL
983};
984
985static char *sleep_list[] = {
986	"Giant",
987	NULL
988};
989
990/*
991 * Pairs of locks which have been blessed
992 * Don't complain about order problems with blessed locks
993 */
994static struct witness_blessed blessed_list[] = {
995};
996static int blessed_count =
997	sizeof(blessed_list) / sizeof(struct witness_blessed);
998
999static void
1000witness_init(struct mtx *m, int flag)
1001{
1002	m->mtx_witness = enroll(m->mtx_description, flag);
1003}
1004
1005static void
1006witness_destroy(struct mtx *m)
1007{
1008	struct mtx *m1;
1009	struct proc *p;
1010	p = CURPROC;
1011	LIST_FOREACH(m1, &p->p_heldmtx, mtx_held) {
1012		if (m1 == m) {
1013			LIST_REMOVE(m, mtx_held);
1014			break;
1015		}
1016	}
1017	return;
1018
1019}
1020
1021static void
1022witness_display(void(*prnt)(const char *fmt, ...))
1023{
1024	struct witness *w, *w1;
1025	int level, found;
1026
1027	KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__));
1028	witness_levelall();
1029
1030	/*
1031	 * First, handle sleep mutexes which have been acquired at least
1032	 * once.
1033	 */
1034	prnt("Sleep mutexes:\n");
1035	for (w = w_all; w; w = w->w_next) {
1036		if (w->w_file == NULL || w->w_spin)
1037			continue;
1038		for (w1 = w_all; w1; w1 = w1->w_next) {
1039			if (isitmychild(w1, w))
1040				break;
1041		}
1042		if (w1 != NULL)
1043			continue;
1044		/*
1045		 * This lock has no anscestors, display its descendants.
1046		 */
1047		witness_displaydescendants(prnt, w);
1048	}
1049
1050	/*
1051	 * Now do spin mutexes which have been acquired at least once.
1052	 */
1053	prnt("\nSpin mutexes:\n");
1054	level = 0;
1055	while (level < sizeof(spin_order_list) / sizeof(char *)) {
1056		found = 0;
1057		for (w = w_all; w; w = w->w_next) {
1058			if (w->w_file == NULL || !w->w_spin)
1059				continue;
1060			if (w->w_level == 1 << level) {
1061				witness_displaydescendants(prnt, w);
1062				level++;
1063				found = 1;
1064			}
1065		}
1066		if (found == 0)
1067			level++;
1068	}
1069
1070	/*
1071	 * Finally, any mutexes which have not been acquired yet.
1072	 */
1073	prnt("\nMutexes which were never acquired:\n");
1074	for (w = w_all; w; w = w->w_next) {
1075		if (w->w_file != NULL)
1076			continue;
1077		prnt("%s\n", w->w_description);
1078	}
1079}
1080
1081void
1082witness_enter(struct mtx *m, int flags, const char *file, int line)
1083{
1084	struct witness *w, *w1;
1085	struct mtx *m1;
1086	struct proc *p;
1087	int i;
1088#ifdef DDB
1089	int go_into_ddb = 0;
1090#endif /* DDB */
1091
1092	if (witness_cold || m->mtx_witness == NULL || panicstr)
1093		return;
1094	w = m->mtx_witness;
1095	p = CURPROC;
1096
1097	if (flags & MTX_SPIN) {
1098		if ((m->mtx_flags & MTX_SPIN) == 0)
1099			panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @"
1100			    " %s:%d", m->mtx_description, file, line);
1101		if (mtx_recursed(m)) {
1102			if ((m->mtx_flags & MTX_RECURSE) == 0)
1103				panic("mutex_enter: recursion on non-recursive"
1104				    " mutex %s @ %s:%d", m->mtx_description,
1105				    file, line);
1106			return;
1107		}
1108		mtx_lock_spin_flags(&w_mtx, MTX_QUIET);
1109		i = PCPU_GET(witness_spin_check);
1110		if (i != 0 && w->w_level < i) {
1111			mtx_unlock_spin_flags(&w_mtx, MTX_QUIET);
1112			panic("mutex_enter(%s:%x, MTX_SPIN) out of order @"
1113			    " %s:%d already holding %s:%x",
1114			    m->mtx_description, w->w_level, file, line,
1115			    spin_order_list[ffs(i)-1], i);
1116		}
1117		PCPU_SET(witness_spin_check, i | w->w_level);
1118		mtx_unlock_spin_flags(&w_mtx, MTX_QUIET);
1119		w->w_file = file;
1120		w->w_line = line;
1121		m->mtx_line = line;
1122		m->mtx_file = file;
1123		return;
1124	}
1125	if ((m->mtx_flags & MTX_SPIN) != 0)
1126		panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
1127		    m->mtx_description, file, line);
1128
1129	if (mtx_recursed(m)) {
1130		if ((m->mtx_flags & MTX_RECURSE) == 0)
1131			panic("mutex_enter: recursion on non-recursive"
1132			    " mutex %s @ %s:%d", m->mtx_description,
1133			    file, line);
1134		return;
1135	}
1136	if (witness_dead)
1137		goto out;
1138	if (cold)
1139		goto out;
1140
1141	if (!mtx_legal2block())
1142		panic("blockable mtx_lock() of %s when not legal @ %s:%d",
1143			    m->mtx_description, file, line);
1144	/*
1145	 * Is this the first mutex acquired
1146	 */
1147	if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL)
1148		goto out;
1149
1150	if ((w1 = m1->mtx_witness) == w) {
1151		if (w->w_same_squawked || dup_ok(w))
1152			goto out;
1153		w->w_same_squawked = 1;
1154		printf("acquring duplicate lock of same type: \"%s\"\n",
1155			m->mtx_description);
1156		printf(" 1st @ %s:%d\n", w->w_file, w->w_line);
1157		printf(" 2nd @ %s:%d\n", file, line);
1158#ifdef DDB
1159		go_into_ddb = 1;
1160#endif /* DDB */
1161		goto out;
1162	}
1163	MPASS(!mtx_owned(&w_mtx));
1164	mtx_lock_spin_flags(&w_mtx, MTX_QUIET);
1165	/*
1166	 * If we have a known higher number just say ok
1167	 */
1168	if (witness_watch > 1 && w->w_level > w1->w_level) {
1169		mtx_unlock_spin_flags(&w_mtx, MTX_QUIET);
1170		goto out;
1171	}
1172	if (isitmydescendant(m1->mtx_witness, w)) {
1173		mtx_unlock_spin_flags(&w_mtx, MTX_QUIET);
1174		goto out;
1175	}
1176	for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) {
1177
1178		MPASS(i < 200);
1179		w1 = m1->mtx_witness;
1180		if (isitmydescendant(w, w1)) {
1181			mtx_unlock_spin_flags(&w_mtx, MTX_QUIET);
1182			if (blessed(w, w1))
1183				goto out;
1184			if (m1 == &Giant) {
1185				if (w1->w_Giant_squawked)
1186					goto out;
1187				else
1188					w1->w_Giant_squawked = 1;
1189			} else {
1190				if (w1->w_other_squawked)
1191					goto out;
1192				else
1193					w1->w_other_squawked = 1;
1194			}
1195			printf("lock order reversal\n");
1196			printf(" 1st %s last acquired @ %s:%d\n",
1197			    w->w_description, w->w_file, w->w_line);
1198			printf(" 2nd %p %s @ %s:%d\n",
1199			    m1, w1->w_description, w1->w_file, w1->w_line);
1200			printf(" 3rd %p %s @ %s:%d\n",
1201			    m, w->w_description, file, line);
1202#ifdef DDB
1203			go_into_ddb = 1;
1204#endif /* DDB */
1205			goto out;
1206		}
1207	}
1208	m1 = LIST_FIRST(&p->p_heldmtx);
1209	if (!itismychild(m1->mtx_witness, w))
1210		mtx_unlock_spin_flags(&w_mtx, MTX_QUIET);
1211
1212out:
1213#ifdef DDB
1214	if (witness_ddb && go_into_ddb)
1215		Debugger("witness_enter");
1216#endif /* DDB */
1217	w->w_file = file;
1218	w->w_line = line;
1219	m->mtx_line = line;
1220	m->mtx_file = file;
1221
1222	/*
1223	 * If this pays off it likely means that a mutex being witnessed
1224	 * is acquired in hardclock. Put it in the ignore list. It is
1225	 * likely not the mutex this assert fails on.
1226	 */
1227	MPASS(m->mtx_held.le_prev == NULL);
1228	LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
1229}
1230
1231void
1232witness_try_enter(struct mtx *m, int flags, const char *file, int line)
1233{
1234	struct proc *p;
1235	struct witness *w = m->mtx_witness;
1236
1237	if (witness_cold)
1238		return;
1239	if (panicstr)
1240		return;
1241	if (flags & MTX_SPIN) {
1242		if ((m->mtx_flags & MTX_SPIN) == 0)
1243			panic("mutex_try_enter: "
1244			    "MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
1245			    m->mtx_description, file, line);
1246		if (mtx_recursed(m)) {
1247			if ((m->mtx_flags & MTX_RECURSE) == 0)
1248				panic("mutex_try_enter: recursion on"
1249				    " non-recursive mutex %s @ %s:%d",
1250				    m->mtx_description, file, line);
1251			return;
1252		}
1253		mtx_lock_spin_flags(&w_mtx, MTX_QUIET);
1254		PCPU_SET(witness_spin_check,
1255		    PCPU_GET(witness_spin_check) | w->w_level);
1256		mtx_unlock_spin_flags(&w_mtx, MTX_QUIET);
1257		w->w_file = file;
1258		w->w_line = line;
1259		m->mtx_line = line;
1260		m->mtx_file = file;
1261		return;
1262	}
1263
1264	if ((m->mtx_flags & MTX_SPIN) != 0)
1265		panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
1266		    m->mtx_description, file, line);
1267
1268	if (mtx_recursed(m)) {
1269		if ((m->mtx_flags & MTX_RECURSE) == 0)
1270			panic("mutex_try_enter: recursion on non-recursive"
1271			    " mutex %s @ %s:%d", m->mtx_description, file,
1272			    line);
1273		return;
1274	}
1275	w->w_file = file;
1276	w->w_line = line;
1277	m->mtx_line = line;
1278	m->mtx_file = file;
1279	p = CURPROC;
1280	MPASS(m->mtx_held.le_prev == NULL);
1281	LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
1282}
1283
1284void
1285witness_exit(struct mtx *m, int flags, const char *file, int line)
1286{
1287	struct witness *w;
1288
1289	if (witness_cold || m->mtx_witness == NULL || panicstr)
1290		return;
1291	w = m->mtx_witness;
1292
1293	if (flags & MTX_SPIN) {
1294		if ((m->mtx_flags & MTX_SPIN) == 0)
1295			panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @"
1296			    " %s:%d", m->mtx_description, file, line);
1297		if (mtx_recursed(m)) {
1298			if ((m->mtx_flags & MTX_RECURSE) == 0)
1299				panic("mutex_exit: recursion on non-recursive"
1300				    " mutex %s @ %s:%d", m->mtx_description,
1301				    file, line);
1302			return;
1303		}
1304		mtx_lock_spin_flags(&w_mtx, MTX_QUIET);
1305		PCPU_SET(witness_spin_check,
1306		    PCPU_GET(witness_spin_check) & ~w->w_level);
1307		mtx_unlock_spin_flags(&w_mtx, MTX_QUIET);
1308		return;
1309	}
1310	if ((m->mtx_flags & MTX_SPIN) != 0)
1311		panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
1312		    m->mtx_description, file, line);
1313
1314	if (mtx_recursed(m)) {
1315		if ((m->mtx_flags & MTX_RECURSE) == 0)
1316			panic("mutex_exit: recursion on non-recursive"
1317			    " mutex %s @ %s:%d", m->mtx_description,
1318			    file, line);
1319		return;
1320	}
1321
1322	if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold)
1323		panic("switchable mtx_unlock() of %s when not legal @ %s:%d",
1324			    m->mtx_description, file, line);
1325	LIST_REMOVE(m, mtx_held);
1326	m->mtx_held.le_prev = NULL;
1327}
1328
1329int
1330witness_sleep(int check_only, struct mtx *mtx, const char *file, int line)
1331{
1332	struct mtx *m;
1333	struct proc *p;
1334	char **sleep;
1335	int n = 0;
1336
1337	KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__));
1338	p = CURPROC;
1339	LIST_FOREACH(m, &p->p_heldmtx, mtx_held) {
1340		if (m == mtx)
1341			continue;
1342		for (sleep = sleep_list; *sleep!= NULL; sleep++)
1343			if (strcmp(m->mtx_description, *sleep) == 0)
1344				goto next;
1345		if (n == 0)
1346			printf("Whee!\n");
1347		printf("%s:%d: %s with \"%s\" locked from %s:%d\n",
1348			file, line, check_only ? "could sleep" : "sleeping",
1349			m->mtx_description,
1350			m->mtx_witness->w_file, m->mtx_witness->w_line);
1351		n++;
1352	next:
1353	}
1354#ifdef DDB
1355	if (witness_ddb && n)
1356		Debugger("witness_sleep");
1357#endif /* DDB */
1358	return (n);
1359}
1360
1361static struct witness *
1362enroll(const char *description, int flag)
1363{
1364	int i;
1365	struct witness *w, *w1;
1366	char **ignore;
1367	char **order;
1368
1369	if (!witness_watch)
1370		return (NULL);
1371	for (ignore = ignore_list; *ignore != NULL; ignore++)
1372		if (strcmp(description, *ignore) == 0)
1373			return (NULL);
1374
1375	if (w_inited == 0) {
1376		mtx_init(&w_mtx, "witness lock", MTX_SPIN);
1377		for (i = 0; i < WITNESS_COUNT; i++) {
1378			w = &w_data[i];
1379			witness_free(w);
1380		}
1381		w_inited = 1;
1382		for (order = order_list; *order != NULL; order++) {
1383			w = enroll(*order, MTX_DEF);
1384			w->w_file = "order list";
1385			for (order++; *order != NULL; order++) {
1386				w1 = enroll(*order, MTX_DEF);
1387				w1->w_file = "order list";
1388				itismychild(w, w1);
1389				w = w1;
1390    	    	    	}
1391		}
1392	}
1393	if ((flag & MTX_SPIN) && witness_skipspin)
1394		return (NULL);
1395	mtx_lock_spin_flags(&w_mtx, MTX_QUIET);
1396	for (w = w_all; w; w = w->w_next) {
1397		if (strcmp(description, w->w_description) == 0) {
1398			mtx_unlock_spin_flags(&w_mtx, MTX_QUIET);
1399			return (w);
1400		}
1401	}
1402	if ((w = witness_get()) == NULL)
1403		return (NULL);
1404	w->w_next = w_all;
1405	w_all = w;
1406	w->w_description = description;
1407	mtx_unlock_spin_flags(&w_mtx, MTX_QUIET);
1408	if (flag & MTX_SPIN) {
1409		w->w_spin = 1;
1410
1411		i = 1;
1412		for (order = spin_order_list; *order != NULL; order++) {
1413			if (strcmp(description, *order) == 0)
1414				break;
1415			i <<= 1;
1416		}
1417		if (*order == NULL)
1418			panic("spin lock %s not in order list", description);
1419		w->w_level = i;
1420	}
1421
1422	return (w);
1423}
1424
1425static int
1426itismychild(struct witness *parent, struct witness *child)
1427{
1428	static int recursed;
1429
1430	/*
1431	 * Insert "child" after "parent"
1432	 */
1433	while (parent->w_morechildren)
1434		parent = parent->w_morechildren;
1435
1436	if (parent->w_childcnt == WITNESS_NCHILDREN) {
1437		if ((parent->w_morechildren = witness_get()) == NULL)
1438			return (1);
1439		parent = parent->w_morechildren;
1440	}
1441	MPASS(child != NULL);
1442	parent->w_children[parent->w_childcnt++] = child;
1443	/*
1444	 * now prune whole tree
1445	 */
1446	if (recursed)
1447		return (0);
1448	recursed = 1;
1449	for (child = w_all; child != NULL; child = child->w_next) {
1450		for (parent = w_all; parent != NULL;
1451		    parent = parent->w_next) {
1452			if (!isitmychild(parent, child))
1453				continue;
1454			removechild(parent, child);
1455			if (isitmydescendant(parent, child))
1456				continue;
1457			itismychild(parent, child);
1458		}
1459	}
1460	recursed = 0;
1461	witness_levelall();
1462	return (0);
1463}
1464
1465static void
1466removechild(struct witness *parent, struct witness *child)
1467{
1468	struct witness *w, *w1;
1469	int i;
1470
1471	for (w = parent; w != NULL; w = w->w_morechildren)
1472		for (i = 0; i < w->w_childcnt; i++)
1473			if (w->w_children[i] == child)
1474				goto found;
1475	return;
1476found:
1477	for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren)
1478		continue;
1479	w->w_children[i] = w1->w_children[--w1->w_childcnt];
1480	MPASS(w->w_children[i] != NULL);
1481
1482	if (w1->w_childcnt != 0)
1483		return;
1484
1485	if (w1 == parent)
1486		return;
1487	for (w = parent; w->w_morechildren != w1; w = w->w_morechildren)
1488		continue;
1489	w->w_morechildren = 0;
1490	witness_free(w1);
1491}
1492
1493static int
1494isitmychild(struct witness *parent, struct witness *child)
1495{
1496	struct witness *w;
1497	int i;
1498
1499	for (w = parent; w != NULL; w = w->w_morechildren) {
1500		for (i = 0; i < w->w_childcnt; i++) {
1501			if (w->w_children[i] == child)
1502				return (1);
1503		}
1504	}
1505	return (0);
1506}
1507
1508static int
1509isitmydescendant(struct witness *parent, struct witness *child)
1510{
1511	struct witness *w;
1512	int i;
1513	int j;
1514
1515	for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) {
1516		MPASS(j < 1000);
1517		for (i = 0; i < w->w_childcnt; i++) {
1518			if (w->w_children[i] == child)
1519				return (1);
1520		}
1521		for (i = 0; i < w->w_childcnt; i++) {
1522			if (isitmydescendant(w->w_children[i], child))
1523				return (1);
1524		}
1525	}
1526	return (0);
1527}
1528
1529void
1530witness_levelall (void)
1531{
1532	struct witness *w, *w1;
1533
1534	for (w = w_all; w; w = w->w_next)
1535		if (!(w->w_spin))
1536			w->w_level = 0;
1537	for (w = w_all; w; w = w->w_next) {
1538		if (w->w_spin)
1539			continue;
1540		for (w1 = w_all; w1; w1 = w1->w_next) {
1541			if (isitmychild(w1, w))
1542				break;
1543		}
1544		if (w1 != NULL)
1545			continue;
1546		witness_leveldescendents(w, 0);
1547	}
1548}
1549
1550static void
1551witness_leveldescendents(struct witness *parent, int level)
1552{
1553	int i;
1554	struct witness *w;
1555
1556	if (parent->w_level < level)
1557		parent->w_level = level;
1558	level++;
1559	for (w = parent; w != NULL; w = w->w_morechildren)
1560		for (i = 0; i < w->w_childcnt; i++)
1561			witness_leveldescendents(w->w_children[i], level);
1562}
1563
1564static void
1565witness_displaydescendants(void(*prnt)(const char *fmt, ...),
1566			   struct witness *parent)
1567{
1568	struct witness *w;
1569	int i;
1570	int level;
1571
1572	level = parent->w_spin ? ffs(parent->w_level) : parent->w_level;
1573
1574	prnt("%d", level);
1575	if (level < 10)
1576		prnt(" ");
1577	for (i = 0; i < level; i++)
1578		prnt(" ");
1579	prnt("%s", parent->w_description);
1580	if (parent->w_file != NULL)
1581		prnt(" -- last acquired @ %s:%d\n", parent->w_file,
1582		    parent->w_line);
1583
1584	for (w = parent; w != NULL; w = w->w_morechildren)
1585		for (i = 0; i < w->w_childcnt; i++)
1586			    witness_displaydescendants(prnt, w->w_children[i]);
1587    }
1588
1589static int
1590dup_ok(struct witness *w)
1591{
1592	char **dup;
1593
1594	for (dup = dup_list; *dup!= NULL; dup++)
1595		if (strcmp(w->w_description, *dup) == 0)
1596			return (1);
1597	return (0);
1598}
1599
1600static int
1601blessed(struct witness *w1, struct witness *w2)
1602{
1603	int i;
1604	struct witness_blessed *b;
1605
1606	for (i = 0; i < blessed_count; i++) {
1607		b = &blessed_list[i];
1608		if (strcmp(w1->w_description, b->b_lock1) == 0) {
1609			if (strcmp(w2->w_description, b->b_lock2) == 0)
1610				return (1);
1611			continue;
1612		}
1613		if (strcmp(w1->w_description, b->b_lock2) == 0)
1614			if (strcmp(w2->w_description, b->b_lock1) == 0)
1615				return (1);
1616	}
1617	return (0);
1618}
1619
1620static struct witness *
1621witness_get()
1622{
1623	struct witness *w;
1624
1625	if ((w = w_free) == NULL) {
1626		witness_dead = 1;
1627		mtx_unlock_spin_flags(&w_mtx, MTX_QUIET);
1628		printf("witness exhausted\n");
1629		return (NULL);
1630	}
1631	w_free = w->w_next;
1632	bzero(w, sizeof(*w));
1633	return (w);
1634}
1635
1636static void
1637witness_free(struct witness *w)
1638{
1639	w->w_next = w_free;
1640	w_free = w;
1641}
1642
1643int
1644witness_list(struct proc *p)
1645{
1646	struct mtx *m;
1647	int nheld;
1648
1649	KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__));
1650	nheld = 0;
1651	LIST_FOREACH(m, &p->p_heldmtx, mtx_held) {
1652		printf("\t\"%s\" (%p) locked at %s:%d\n",
1653		    m->mtx_description, m,
1654		    m->mtx_witness->w_file, m->mtx_witness->w_line);
1655		nheld++;
1656	}
1657
1658	return (nheld);
1659}
1660
1661#ifdef DDB
1662
1663DB_SHOW_COMMAND(mutexes, db_witness_list)
1664{
1665
1666	witness_list(CURPROC);
1667}
1668
1669DB_SHOW_COMMAND(witness, db_witness_display)
1670{
1671
1672	witness_display(db_printf);
1673}
1674#endif
1675
1676void
1677witness_save(struct mtx *m, const char **filep, int *linep)
1678{
1679
1680	KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__));
1681	if (m->mtx_witness == NULL)
1682		return;
1683
1684	*filep = m->mtx_witness->w_file;
1685	*linep = m->mtx_witness->w_line;
1686}
1687
1688void
1689witness_restore(struct mtx *m, const char *file, int line)
1690{
1691
1692	KASSERT(!witness_cold, ("%s: witness_cold\n", __FUNCTION__));
1693	if (m->mtx_witness == NULL)
1694		return;
1695
1696	m->mtx_witness->w_file = file;
1697	m->mtx_witness->w_line = line;
1698}
1699
1700#endif	/* WITNESS */
1701