kern_mutex.c revision 69376
1/*-
2 * Copyright (c) 1998 Berkeley Software Design, Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 * 3. Berkeley Software Design Inc's name may not be used to endorse or
13 *    promote products derived from this software without specific prior
14 *    written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY BERKELEY SOFTWARE DESIGN INC ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL BERKELEY SOFTWARE DESIGN INC BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 *
28 *	from BSDI $Id: mutex_witness.c,v 1.1.2.20 2000/04/27 03:10:27 cp Exp $
29 *	and BSDI $Id: synch_machdep.c,v 2.3.2.39 2000/04/27 03:10:25 cp Exp $
30 * $FreeBSD: head/sys/kern/kern_mutex.c 69376 2000-11-30 00:51:16Z jhb $
31 */
32
33/*
34 *	Main Entry: witness
35 *	Pronunciation: 'wit-n&s
36 *	Function: noun
37 *	Etymology: Middle English witnesse, from Old English witnes knowledge,
38 *	    testimony, witness, from 2wit
39 *	Date: before 12th century
40 *	1 : attestation of a fact or event : TESTIMONY
41 *	2 : one that gives evidence; specifically : one who testifies in
42 *	    a cause or before a judicial tribunal
43 *	3 : one asked to be present at a transaction so as to be able to
44 *	    testify to its having taken place
45 *	4 : one who has personal knowledge of something
46 *	5 a : something serving as evidence or proof : SIGN
47 *	  b : public affirmation by word or example of usually
48 *	      religious faith or conviction <the heroic witness to divine
49 *	      life -- Pilot>
50 *	6 capitalized : a member of the Jehovah's Witnesses
51 */
52
53#include "opt_ddb.h"
54#include "opt_witness.h"
55
56/*
57 * Cause non-inlined mtx_*() to be compiled.
58 * Must be defined early because other system headers may include mutex.h.
59 */
60#define _KERN_MUTEX_C_
61
62#include <sys/param.h>
63#include <sys/bus.h>
64#include <sys/kernel.h>
65#include <sys/malloc.h>
66#include <sys/proc.h>
67#include <sys/sysctl.h>
68#include <sys/systm.h>
69#include <sys/vmmeter.h>
70#include <sys/ktr.h>
71
72#include <machine/atomic.h>
73#include <machine/bus.h>
74#include <machine/clock.h>
75#include <machine/cpu.h>
76
77#include <ddb/ddb.h>
78
79#include <vm/vm.h>
80#include <vm/vm_extern.h>
81
82#include <sys/mutex.h>
83
84/*
85 * Machine independent bits of the mutex implementation
86 */
87/* All mutexes in system (used for debug/panic) */
#ifdef MUTEX_DEBUG
/*
 * Head of the list of all initialized mutexes.  The last two initializers
 * both point back at all_mtx itself, so the list starts out empty and
 * circular; mtx_init() and mtx_destroy() link and unlink entries through
 * mtx_next/mtx_prev.  With MUTEX_DEBUG the description string is carried by
 * the separate mtx_debug structure referenced from the mutex.
 */
static struct mtx_debug all_mtx_debug = { NULL, {NULL, NULL}, NULL, 0,
	"All mutexes queue head" };
static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, &all_mtx_debug,
	TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
	{ NULL, NULL }, &all_mtx, &all_mtx };
#else	/* MUTEX_DEBUG */
/*
 * Same list head as above, but without MUTEX_DEBUG the description string
 * is stored directly in the mutex initializer.
 */
static struct mtx all_mtx = { MTX_UNOWNED, 0, 0, "All mutexes queue head",
	TAILQ_HEAD_INITIALIZER(all_mtx.mtx_blocked),
	{ NULL, NULL }, &all_mtx, &all_mtx };
#endif	/* MUTEX_DEBUG */
99
/*
 * Number of mutexes currently linked on the all_mtx list and the largest
 * value that count has reached (both maintained by mtx_init()/mtx_destroy()).
 */
static int	mtx_cur_cnt;
static int	mtx_max_cnt;

void	_mtx_enter_giant_def(void);
void	_mtx_exit_giant_def(void);
static void propagate_priority(struct proc *);

/* True when the lock word of m is exactly MTX_UNOWNED. */
#define	mtx_unowned(m)	((m)->mtx_lock == MTX_UNOWNED)
/*
 * Owner of m as a struct proc pointer (lock word masked with MTX_FLAGMASK),
 * or NULL when m is unowned.  Note that m is evaluated more than once.
 */
#define	mtx_owner(m)	(mtx_unowned(m) ? NULL \
			    : (struct proc *)((m)->mtx_lock & MTX_FLAGMASK))

/*
 * The pointer-sized word stored immediately below the address of x.
 * NOTE(review): presumably the caller's return address when x is the first
 * function argument on a downward-growing stack -- confirm per architecture.
 * The expansion is not parenthesized as a whole.
 */
#define RETIP(x)		*(((uintptr_t *)(&x)) - 1)
/* Assign scheduling priority pri to process p (expands to a bare assignment). */
#define	SET_PRIO(p, pri)	(p)->p_priority = (pri)
113
114/*
115 * XXX Temporary, for use from assembly language
116 */
117
118void
119_mtx_enter_giant_def(void)
120{
121
122	mtx_enter(&Giant, MTX_DEF);
123}
124
125void
126_mtx_exit_giant_def(void)
127{
128
129	mtx_exit(&Giant, MTX_DEF);
130}
131
132static void
133propagate_priority(struct proc *p)
134{
135	int pri = p->p_priority;
136	struct mtx *m = p->p_blocked;
137
138	mtx_assert(&sched_lock, MA_OWNED);
139	for (;;) {
140		struct proc *p1;
141
142		p = mtx_owner(m);
143
144		if (p == NULL) {
145			/*
146			 * This really isn't quite right. Really
147			 * ought to bump priority of process that
148			 * next acquires the mutex.
149			 */
150			MPASS(m->mtx_lock == MTX_CONTESTED);
151			return;
152		}
153		MPASS(p->p_magic == P_MAGIC);
154		KASSERT(p->p_stat != SSLEEP, ("sleeping process owns a mutex"));
155		if (p->p_priority <= pri)
156			return;
157
158		/*
159		 * Bump this process' priority.
160		 */
161		SET_PRIO(p, pri);
162
163		/*
164		 * If lock holder is actually running, just bump priority.
165		 */
166#ifdef SMP
167		/*
168		 * For SMP, we can check the p_oncpu field to see if we are
169		 * running.
170		 */
171		if (p->p_oncpu != 0xff) {
172			MPASS(p->p_stat == SRUN || p->p_stat == SZOMB);
173			return;
174		}
175#else
176		/*
177		 * For UP, we check to see if p is curproc (this shouldn't
178		 * ever happen however as it would mean we are in a deadlock.)
179		 */
180		if (p == curproc) {
181			panic("Deadlock detected");
182			return;
183		}
184#endif
185		/*
186		 * If on run queue move to new run queue, and
187		 * quit.
188		 */
189		if (p->p_stat == SRUN) {
190			printf("XXX: moving process %d(%s) to a new run queue\n",
191			       p->p_pid, p->p_comm);
192			MPASS(p->p_blocked == NULL);
193			remrunqueue(p);
194			setrunqueue(p);
195			return;
196		}
197
198		/*
199		 * If we aren't blocked on a mutex, we should be.
200		 */
201		KASSERT(p->p_stat == SMTX, (
202		    "process %d(%s):%d holds %s but isn't blocked on a mutex\n",
203		    p->p_pid, p->p_comm, p->p_stat,
204		    m->mtx_description));
205
206		/*
207		 * Pick up the mutex that p is blocked on.
208		 */
209		m = p->p_blocked;
210		MPASS(m != NULL);
211
212		printf("XXX: process %d(%s) is blocked on %s\n", p->p_pid,
213		    p->p_comm, m->mtx_description);
214		/*
215		 * Check if the proc needs to be moved up on
216		 * the blocked chain
217		 */
218		if (p == TAILQ_FIRST(&m->mtx_blocked)) {
219			printf("XXX: process at head of run queue\n");
220			continue;
221		}
222		p1 = TAILQ_PREV(p, rq, p_procq);
223		if (p1->p_priority <= pri) {
224			printf(
225	"XXX: previous process %d(%s) has higher priority\n",
226	                    p->p_pid, p->p_comm);
227			continue;
228		}
229
230		/*
231		 * Remove proc from blocked chain and determine where
232		 * it should be moved up to.  Since we know that p1 has
233		 * a lower priority than p, we know that at least one
234		 * process in the chain has a lower priority and that
235		 * p1 will thus not be NULL after the loop.
236		 */
237		TAILQ_REMOVE(&m->mtx_blocked, p, p_procq);
238		TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq) {
239			MPASS(p1->p_magic == P_MAGIC);
240			if (p1->p_priority > pri)
241				break;
242		}
243		MPASS(p1 != NULL);
244		TAILQ_INSERT_BEFORE(p1, p, p_procq);
245		CTR4(KTR_LOCK,
246		    "propagate_priority: p 0x%p moved before 0x%p on [0x%p] %s",
247		    p, p1, m, m->mtx_description);
248	}
249}
250
/*
 * Out-of-line acquisition of mutex m for the current process.
 *
 * m        - mutex to acquire.
 * type     - MTX_DEF for a blocking mutex; MTX_SPIN, optionally combined
 *            with MTX_FIRST or MTX_TOPHALF, for a spin mutex.
 * saveintr - value recorded in m->mtx_saveintr once a spin mutex has been
 *            obtained (see below).
 *
 * MTX_DEF: a recursive acquisition only bumps mtx_recurse and sets
 * MTX_RECURSE.  Otherwise the function loops until _obtain_lock()
 * succeeds, each time marking the lock contested, queueing the process on
 * m->mtx_blocked in priority order and switching away via mi_switch().
 *
 * MTX_SPIN variants: busy-wait until _obtain_lock() succeeds, panicking if
 * the lock stays held for too long.
 *
 * NOTE(review): the switch has no default case, so any other value of type
 * returns without acquiring anything, whereas mtx_exit_hard() panics on an
 * unsupported type -- confirm callers always mask type appropriately.
 */
void
mtx_enter_hard(struct mtx *m, int type, int saveintr)
{
	struct proc *p = CURPROC;

	KASSERT(p != NULL, ("curproc is NULL in mutex"));

	switch (type) {
	case MTX_DEF:
		/* Already owned by us: record one more level of recursion. */
		if ((m->mtx_lock & MTX_FLAGMASK) == (uintptr_t)p) {
			m->mtx_recurse++;
			atomic_set_ptr(&m->mtx_lock, MTX_RECURSE);
			CTR1(KTR_LOCK, "mtx_enter: 0x%p recurse", m);
			return;
		}
		CTR3(KTR_LOCK, "mtx_enter: 0x%p contested (lock=%p) [0x%p]",
		    m, (void *)m->mtx_lock, (void *)RETIP(m));

		/*
		 * Save our priority.  Even though p_nativepri is protected
		 * by sched_lock, we don't obtain it here as it can be
		 * expensive.  Since this is the only place p_nativepri is
		 * set, and since two CPUs will not be executing the same
		 * process concurrently, we know that no other CPU is going
		 * to be messing with this.  Also, p_nativepri is only read
		 * when we are blocked on a mutex, so that can't be happening
		 * right now either.
		 */
		p->p_nativepri = p->p_priority;
		while (!_obtain_lock(m, p)) {
			uintptr_t v;
			struct proc *p1;

			mtx_enter(&sched_lock, MTX_SPIN | MTX_RLIKELY);
			/*
			 * check if the lock has been released while
			 * waiting for the schedlock.
			 */
			if ((v = m->mtx_lock) == MTX_UNOWNED) {
				mtx_exit(&sched_lock, MTX_SPIN);
				continue;
			}
			/*
			 * The mutex was marked contested on release. This
			 * means that there are processes blocked on it.
			 */
			if (v == MTX_CONTESTED) {
				p1 = TAILQ_FIRST(&m->mtx_blocked);
				KASSERT(p1 != NULL, ("contested mutex has no contesters"));
				KASSERT(p != NULL, ("curproc is NULL for contested mutex"));
				/*
				 * Take ownership directly while keeping the
				 * contested bit, and inherit the priority of
				 * the first waiter if its value is lower.
				 */
				m->mtx_lock = (uintptr_t)p | MTX_CONTESTED;
				if (p1->p_priority < p->p_priority) {
					SET_PRIO(p, p1->p_priority);
				}
				mtx_exit(&sched_lock, MTX_SPIN);
				return;
			}
			/*
			 * If the mutex isn't already contested and
			 * a failure occurs setting the contested bit the
			 * mutex was either release or the
			 * state of the RECURSION bit changed.
			 */
			if ((v & MTX_CONTESTED) == 0 &&
			    !atomic_cmpset_ptr(&m->mtx_lock, (void *)v,
				               (void *)(v | MTX_CONTESTED))) {
				mtx_exit(&sched_lock, MTX_SPIN);
				continue;
			}

			/* We definitely have to sleep for this lock */
			mtx_assert(m, MA_NOTOWNED);

#ifdef notyet
			/*
			 * If we're borrowing an interrupted thread's VM
			 * context must clean up before going to sleep.
			 */
			if (p->p_flag & (P_ITHD | P_SITHD)) {
				ithd_t *it = (ithd_t *)p;

				if (it->it_interrupted) {
					CTR2(KTR_LOCK,
					    "mtx_enter: 0x%x interrupted 0x%x",
					    it, it->it_interrupted);
					intr_thd_fixup(it);
				}
			}
#endif

			/* Put us on the list of procs blocked on this mutex */
			if (TAILQ_EMPTY(&m->mtx_blocked)) {
				/*
				 * First waiter: also put the mutex on the
				 * owner's list of contested mutexes.
				 */
				p1 = (struct proc *)(m->mtx_lock &
						     MTX_FLAGMASK);
				LIST_INSERT_HEAD(&p1->p_contested, m,
						 mtx_contested);
				TAILQ_INSERT_TAIL(&m->mtx_blocked, p, p_procq);
			} else {
				/*
				 * Insert before the first waiter whose
				 * p_priority value is larger than ours, or
				 * at the tail if there is none.
				 */
				TAILQ_FOREACH(p1, &m->mtx_blocked, p_procq)
					if (p1->p_priority > p->p_priority)
						break;
				if (p1)
					TAILQ_INSERT_BEFORE(p1, p, p_procq);
				else
					TAILQ_INSERT_TAIL(&m->mtx_blocked, p,
							  p_procq);
			}

			p->p_blocked = m;	/* Who we're blocked on */
			p->p_mtxname = m->mtx_description;
			p->p_stat = SMTX;
#if 0
			propagate_priority(p);
#endif
			CTR3(KTR_LOCK, "mtx_enter: p 0x%p blocked on [0x%p] %s",
			    p, m, m->mtx_description);
			mi_switch();
			CTR3(KTR_LOCK,
			    "mtx_enter: p 0x%p free from blocked on [0x%p] %s",
			    p, m, m->mtx_description);
			mtx_exit(&sched_lock, MTX_SPIN);
		}
		return;
	case MTX_SPIN:
	case MTX_SPIN | MTX_FIRST:
	case MTX_SPIN | MTX_TOPHALF:
	    {
		int i = 0;

		/* Recursive acquisition of a spin mutex we already own. */
		if (m->mtx_lock == (uintptr_t)p) {
			m->mtx_recurse++;
			return;
		}
		CTR1(KTR_LOCK, "mtx_enter: %p spinning", m);
		for (;;) {
			if (_obtain_lock(m, p))
				break;
			/*
			 * Wait for the lock word to read as unowned before
			 * retrying the atomic acquire.  The first million
			 * polls are back to back; after that each poll is
			 * followed by DELAY(1) until the counter reaches
			 * six million, then we panic (unless DDB is active).
			 * NOTE(review): once past the first million, i is
			 * incremented twice per poll, so the delay phase is
			 * about half as long as the "> 5 seconds" in the
			 * panic message suggests -- confirm intent.
			 */
			while (m->mtx_lock != MTX_UNOWNED) {
				if (i++ < 1000000)
					continue;
				if (i++ < 6000000)
					DELAY (1);
#ifdef DDB
				else if (!db_active)
#else
				else
#endif
					panic(
				"spin lock %s held by 0x%p for > 5 seconds",
					    m->mtx_description,
					    (void *)m->mtx_lock);
			}
		}

		/*
		 * With MUTEX_DEBUG, only a plain MTX_SPIN acquisition stores
		 * saveintr; the other variants store the marker 0xbeefface,
		 * which mtx_exit_hard() asserts it never restores.
		 */
#ifdef MUTEX_DEBUG
		if (type != MTX_SPIN)
			m->mtx_saveintr = 0xbeefface;
		else
#endif
			m->mtx_saveintr = saveintr;
		CTR1(KTR_LOCK, "mtx_enter: 0x%p spin done", m);
		return;
	    }
	}
}
416
/*
 * Out-of-line release of mutex m by the current process.
 *
 * m    - mutex to release.
 * type - MTX_DEF, optionally with MTX_NOSWITCH, for a blocking mutex;
 *        MTX_SPIN, optionally with MTX_FIRST or MTX_TOPHALF, for a spin
 *        mutex.  Any other value panics.
 *
 * For every type, a recursed mutex only has its recursion count
 * decremented.  For MTX_DEF the first process on m->mtx_blocked is made
 * runnable, the releasing process' priority is recomputed, and unless
 * MTX_NOSWITCH was given the releasing process switches away when the
 * woken process has a smaller p_priority value.
 */
void
mtx_exit_hard(struct mtx *m, int type)
{
	struct proc *p, *p1;
	struct mtx *m1;
	int pri;

	p = CURPROC;
	switch (type) {
	case MTX_DEF:
	case MTX_DEF | MTX_NOSWITCH:
		if (m->mtx_recurse != 0) {
			/* Drop the RECURSE flag with the last nested level. */
			if (--(m->mtx_recurse) == 0)
				atomic_clear_ptr(&m->mtx_lock, MTX_RECURSE);
			CTR1(KTR_LOCK, "mtx_exit: 0x%p unrecurse", m);
			return;
		}
		mtx_enter(&sched_lock, MTX_SPIN);
		CTR1(KTR_LOCK, "mtx_exit: 0x%p contested", m);
		/*
		 * p1 is the first waiter; it is dereferenced without a NULL
		 * check, so this path relies on at least one process being
		 * queued on the mutex.
		 */
		p1 = TAILQ_FIRST(&m->mtx_blocked);
		MPASS(p->p_magic == P_MAGIC);
		MPASS(p1->p_magic == P_MAGIC);
		TAILQ_REMOVE(&m->mtx_blocked, p1, p_procq);
		if (TAILQ_EMPTY(&m->mtx_blocked)) {
			/* No waiters left: the mutex is no longer contested. */
			LIST_REMOVE(m, mtx_contested);
			_release_lock_quick(m);
			CTR1(KTR_LOCK, "mtx_exit: 0x%p not held", m);
		} else
			/* Release, but leave the lock marked contested. */
			atomic_store_rel_ptr(&m->mtx_lock,
			    (void *)MTX_CONTESTED);
		/*
		 * Recompute our priority: the smallest p_priority among the
		 * first waiters of the mutexes still on our contested list,
		 * but never a larger value than our saved native priority.
		 */
		pri = MAXPRI;
		LIST_FOREACH(m1, &p->p_contested, mtx_contested) {
			int cp = TAILQ_FIRST(&m1->mtx_blocked)->p_priority;
			if (cp < pri)
				pri = cp;
		}
		if (pri > p->p_nativepri)
			pri = p->p_nativepri;
		SET_PRIO(p, pri);
		CTR2(KTR_LOCK, "mtx_exit: 0x%p contested setrunqueue 0x%p",
		    m, p1);
		/* Wake the waiter: it is no longer blocked on any mutex. */
		p1->p_blocked = NULL;
		p1->p_mtxname = NULL;
		p1->p_stat = SRUN;
		setrunqueue(p1);
		if ((type & MTX_NOSWITCH) == 0 && p1->p_priority < pri) {
#ifdef notyet
			if (p->p_flag & (P_ITHD | P_SITHD)) {
				ithd_t *it = (ithd_t *)p;

				if (it->it_interrupted) {
					CTR2(KTR_LOCK,
					    "mtx_exit: 0x%x interruped 0x%x",
					    it, it->it_interrupted);
					intr_thd_fixup(it);
				}
			}
#endif
			/* Requeue ourselves and switch to another process. */
			setrunqueue(p);
			CTR2(KTR_LOCK, "mtx_exit: 0x%p switching out lock=0x%p",
			    m, (void *)m->mtx_lock);
			mi_switch();
			CTR2(KTR_LOCK, "mtx_exit: 0x%p resuming lock=0x%p",
			    m, (void *)m->mtx_lock);
		}
		mtx_exit(&sched_lock, MTX_SPIN);
		break;
	case MTX_SPIN:
	case MTX_SPIN | MTX_FIRST:
		if (m->mtx_recurse != 0) {
			m->mtx_recurse--;
			return;
		}
		MPASS(mtx_owned(m));
		_release_lock_quick(m);
		/*
		 * MTX_FIRST enables interrupts unconditionally; otherwise
		 * the state saved at acquisition time is restored.
		 */
		if (type & MTX_FIRST)
			enable_intr();	/* XXX is this kosher? */
		else {
			MPASS(m->mtx_saveintr != 0xbeefface);
			restore_intr(m->mtx_saveintr);
		}
		break;
	case MTX_SPIN | MTX_TOPHALF:
		/* Same as above, but the interrupt state is left untouched. */
		if (m->mtx_recurse != 0) {
			m->mtx_recurse--;
			return;
		}
		MPASS(mtx_owned(m));
		_release_lock_quick(m);
		break;
	default:
		panic("mtx_exit_hard: unsupported type 0x%x\n", type);
	}
}
511
#define MV_DESTROY	0	/* validate before destroy */
#define MV_INIT		1	/* validate before init */
514
515#ifdef MUTEX_DEBUG
516
517int mtx_validate __P((struct mtx *, int));
518
519int
520mtx_validate(struct mtx *m, int when)
521{
522	struct mtx *mp;
523	int i;
524	int retval = 0;
525
526	if (m == &all_mtx || cold)
527		return 0;
528
529	mtx_enter(&all_mtx, MTX_DEF);
530/*
531 * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly
532 * we can re-enable the kernacc() checks.
533 */
534#ifndef __alpha__
535	MPASS(kernacc((caddr_t)all_mtx.mtx_next, sizeof(uintptr_t),
536	    VM_PROT_READ) == 1);
537#endif
538	MPASS(all_mtx.mtx_next->mtx_prev == &all_mtx);
539	for (i = 0, mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next) {
540#ifndef __alpha__
541		if (kernacc((caddr_t)mp->mtx_next, sizeof(uintptr_t),
542		    VM_PROT_READ) != 1) {
543			panic("mtx_validate: mp=%p mp->mtx_next=%p",
544			    mp, mp->mtx_next);
545		}
546#endif
547		i++;
548		if (i > mtx_cur_cnt) {
549			panic("mtx_validate: too many in chain, known=%d\n",
550			    mtx_cur_cnt);
551		}
552	}
553	MPASS(i == mtx_cur_cnt);
554	switch (when) {
555	case MV_DESTROY:
556		for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next)
557			if (mp == m)
558				break;
559		MPASS(mp == m);
560		break;
561	case MV_INIT:
562		for (mp = all_mtx.mtx_next; mp != &all_mtx; mp = mp->mtx_next)
563		if (mp == m) {
564			/*
565			 * Not good. This mutex already exists.
566			 */
567			printf("re-initing existing mutex %s\n",
568			    m->mtx_description);
569			MPASS(m->mtx_lock == MTX_UNOWNED);
570			retval = 1;
571		}
572	}
573	mtx_exit(&all_mtx, MTX_DEF);
574	return (retval);
575}
576#endif
577
/*
 * Initialize mutex m and link it at the tail of the list of all mutexes.
 *
 * m    - storage for the mutex; it is zeroed here.
 * t    - description string; only the pointer is stored, not a copy.
 * flag - MTX_* flags.  MTX_COLD is examined here (MUTEX_DEBUG only); the
 *        whole value is passed on to witness_init().
 *
 * With MUTEX_DEBUG, the function returns without doing anything when
 * mtx_validate() reports that m is already on the list.
 */
void
mtx_init(struct mtx *m, const char *t, int flag)
{
#ifdef MUTEX_DEBUG
	struct mtx_debug *debug;
#endif

	CTR2(KTR_LOCK, "mtx_init 0x%p (%s)", m, t);
#ifdef MUTEX_DEBUG
	if (mtx_validate(m, MV_INIT))	/* diagnostic and error correction */
		return;
	/*
	 * With MTX_COLD, reuse the mtx_debug pointer already stored in m;
	 * it is read now because the bzero() below wipes the mutex.
	 */
	if (flag & MTX_COLD)
		debug = m->mtx_debug;
	else
		debug = NULL;
	if (debug == NULL) {
#ifdef DIAGNOSTIC
		if(cold && bootverbose)
			printf("malloc'ing mtx_debug while cold for %s\n", t);
#endif

		/* XXX - should not use DEVBUF */
		debug = malloc(sizeof(struct mtx_debug), M_DEVBUF, M_NOWAIT);
		/* The M_NOWAIT allocation may fail; that is only asserted. */
		MPASS(debug != NULL);
		bzero(debug, sizeof(struct mtx_debug));
	}
#endif
	bzero((void *)m, sizeof *m);
	TAILQ_INIT(&m->mtx_blocked);
#ifdef MUTEX_DEBUG
	m->mtx_debug = debug;
#endif
	m->mtx_description = t;
	m->mtx_lock = MTX_UNOWNED;
	/* Put on all mutex queue */
	mtx_enter(&all_mtx, MTX_DEF);
	m->mtx_next = &all_mtx;
	m->mtx_prev = all_mtx.mtx_prev;
	m->mtx_prev->mtx_next = m;
	all_mtx.mtx_prev = m;
	/* Track the current and the maximum number of live mutexes. */
	if (++mtx_cur_cnt > mtx_max_cnt)
		mtx_max_cnt = mtx_cur_cnt;
	mtx_exit(&all_mtx, MTX_DEF);
	witness_init(m, flag);
}
623
624void
625mtx_destroy(struct mtx *m)
626{
627
628	CTR2(KTR_LOCK, "mtx_destroy 0x%p (%s)", m, m->mtx_description);
629#ifdef MUTEX_DEBUG
630	if (m->mtx_next == NULL)
631		panic("mtx_destroy: %p (%s) already destroyed",
632		    m, m->mtx_description);
633
634	if (!mtx_owned(m)) {
635		MPASS(m->mtx_lock == MTX_UNOWNED);
636	} else {
637		MPASS((m->mtx_lock & (MTX_RECURSE|MTX_CONTESTED)) == 0);
638	}
639	mtx_validate(m, MV_DESTROY);		/* diagnostic */
640#endif
641
642#ifdef WITNESS
643	if (m->mtx_witness)
644		witness_destroy(m);
645#endif /* WITNESS */
646
647	/* Remove from the all mutex queue */
648	mtx_enter(&all_mtx, MTX_DEF);
649	m->mtx_next->mtx_prev = m->mtx_prev;
650	m->mtx_prev->mtx_next = m->mtx_next;
651#ifdef MUTEX_DEBUG
652	m->mtx_next = m->mtx_prev = NULL;
653	free(m->mtx_debug, M_DEVBUF);
654	m->mtx_debug = NULL;
655#endif
656	mtx_cur_cnt--;
657	mtx_exit(&all_mtx, MTX_DEF);
658}
659
660/*
661 * The non-inlined versions of the mtx_*() functions are always built (above),
662 * but the witness code depends on the MUTEX_DEBUG and WITNESS kernel options
663 * being specified.
664 */
665#if (defined(MUTEX_DEBUG) && defined(WITNESS))
666
/* Size of the static pool of witness structures (w_data[]). */
#define WITNESS_COUNT 200
/* Child slots per witness structure; more are chained via w_morechildren. */
#define	WITNESS_NCHILDREN 2

/*
 * Zero disables enrollment of new witnesses; a value greater than 1 also
 * enables the w_level shortcut in witness_enter().
 */
int witness_watch = 1;

/*
 * One witness exists per distinct mutex description and records the lock
 * order relationships observed for mutexes with that description.
 */
struct witness {
	struct witness	*w_next;	/* next entry on w_all */
	const char	*w_description;	/* mutex description (lookup key) */
	const char	*w_file;	/* file of most recent acquisition */
	int		 w_line;	/* line of most recent acquisition */
	struct witness	*w_morechildren; /* overflow block with more children */
	u_char		 w_childcnt;	/* used slots in w_children[] */
	u_char		 w_Giant_squawked:1; /* reversal against Giant reported */
	u_char		 w_other_squawked:1; /* other reversal reported */
	u_char		 w_same_squawked:1; /* duplicate acquisition reported */
	u_char		 w_sleep:1;
	u_char		 w_spin:1;	/* this is a spin mutex */
	u_int		 w_level;	/* for spin mutexes: order bit */
	struct witness	*w_children[WITNESS_NCHILDREN];
};
687
/*
 * A pair of lock names for the blessed_list[] table.
 * NOTE(review): presumably these are mutex descriptions matched by
 * blessed() -- confirm against its definition.
 */
struct witness_blessed {
	char 	*b_lock1;
	char	*b_lock2;
};
692
#ifdef DDB
/*
 * When DDB is enabled and witness_ddb is set to 1, it will cause the system to
 * drop into the debugger (via Debugger()) when:
 *	- a lock hierarchy violation occurs
 *	- locks are held when going to sleep.
 * The default is 1 when the kernel is built with WITNESS_DDB, 0 otherwise;
 * the value is exported read-write as the debug.witness_ddb sysctl.
 */
#ifdef WITNESS_DDB
int	witness_ddb = 1;
#else
int	witness_ddb = 0;
#endif
SYSCTL_INT(_debug, OID_AUTO, witness_ddb, CTLFLAG_RW, &witness_ddb, 0, "");
#endif /* DDB */

/*
 * When non-zero, enroll() does not create witnesses for spin mutexes.
 * The default is 1 when the kernel is built with WITNESS_SKIPSPIN; the
 * value is exported read-only as the debug.witness_skipspin sysctl.
 */
#ifdef WITNESS_SKIPSPIN
int	witness_skipspin = 1;
#else
int	witness_skipspin = 0;
#endif
SYSCTL_INT(_debug, OID_AUTO, witness_skipspin, CTLFLAG_RD, &witness_skipspin, 0,
    "");
715
/* Spin mutex taken around accesses to the witness lists and tree. */
MUTEX_DECLARE(static,w_mtx);
static struct witness	*w_free;	/* unused witness structures */
static struct witness	*w_all;		/* all enrolled witnesses (via w_next) */
static int		 w_inited;	/* set once enroll() has set things up */
static int		 witness_dead;	/* fatal error, probably no memory */

/* Static storage for every witness structure; seeded into the free pool. */
static struct witness	 w_data[WITNESS_COUNT];

static struct witness	 *enroll __P((const char *description, int flag));
static int itismychild __P((struct witness *parent, struct witness *child));
static void removechild __P((struct witness *parent, struct witness *child));
static int isitmychild __P((struct witness *parent, struct witness *child));
static int isitmydescendant __P((struct witness *parent, struct witness *child));
static int dup_ok __P((struct witness *));
static int blessed __P((struct witness *, struct witness *));
static void witness_displaydescendants
    __P((void(*)(const char *fmt, ...), struct witness *));
static void witness_leveldescendents __P((struct witness *parent, int level));
static void witness_levelall __P((void));
static struct witness * witness_get __P((void));
static void witness_free __P((struct witness *m));
737
738
/* Mutex descriptions that enroll() never creates a witness for. */
static char *ignore_list[] = {
	"witness lock",
	NULL
};

/*
 * Every spin mutex must be named here; enroll() panics otherwise.  The
 * index of a name determines the bit stored in w_level (1 << index), and
 * witness_enter() panics when a spin mutex is taken while one with a
 * higher bit is already held.
 */
static char *spin_order_list[] = {
	"sio",
	"sched lock",
#ifdef __i386__
	"clk",
#endif
	"callout",
	/*
	 * leaf locks
	 */
	NULL
};

/*
 * Predefined lock orders for MTX_DEF mutexes.  The table is a sequence of
 * NULL-terminated groups followed by a final NULL; within a group enroll()
 * records each name as a child of the one before it.
 */
static char *order_list[] = {
	"uidinfo hash", "uidinfo struct", NULL,
	NULL
};

/*
 * NOTE(review): presumably the descriptions for which dup_ok() permits
 * holding two mutexes of the same type -- confirm against dup_ok().
 */
static char *dup_list[] = {
	NULL
};

/* Mutexes that witness_sleep() does not complain about being held. */
static char *sleep_list[] = {
	"Giant",
	NULL
};

/*
 * Pairs of locks which have been blessed
 * Don't complain about order problems with blessed locks
 *
 * The table is currently empty, so blessed_count evaluates to 0.
 */
static struct witness_blessed blessed_list[] = {
};
static int blessed_count = sizeof(blessed_list) / sizeof(struct witness_blessed);
778
779void
780witness_init(struct mtx *m, int flag)
781{
782	m->mtx_witness = enroll(m->mtx_description, flag);
783}
784
785void
786witness_destroy(struct mtx *m)
787{
788	struct mtx *m1;
789	struct proc *p;
790	p = CURPROC;
791	for ((m1 = LIST_FIRST(&p->p_heldmtx)); m1 != NULL;
792		m1 = LIST_NEXT(m1, mtx_held)) {
793		if (m1 == m) {
794			LIST_REMOVE(m, mtx_held);
795			break;
796		}
797	}
798	return;
799
800}
801
/*
 * Record the acquisition of mutex m by the current process and check it
 * against the known lock order.
 *
 * m     - mutex being acquired; m->mtx_witness is dereferenced without a
 *         NULL check.
 * flags - MTX_* flags of the acquisition; only MTX_SPIN is examined.
 * file,
 * line  - source location of the acquisition, stored in the witness and in
 *         the mutex.
 *
 * Spin mutexes are checked against the bit mask in witness_spin_check and
 * an out-of-order acquisition panics.  For other mutexes, duplicate
 * acquisitions and lock order reversals are printed (once per witness) and
 * optionally enter the debugger; a previously unseen order is recorded by
 * making the new witness a child of the most recently acquired one.
 * Finally m is put at the head of the process' held-mutex list.
 */
void
witness_enter(struct mtx *m, int flags, const char *file, int line)
{
	struct witness *w, *w1;
	struct mtx *m1;
	struct proc *p;
	int i;
#ifdef DDB
	int go_into_ddb = 0;
#endif /* DDB */

	w = m->mtx_witness;
	p = CURPROC;

	if (flags & MTX_SPIN) {
		if (!w->w_spin)
			panic("mutex_enter: MTX_SPIN on MTX_DEF mutex %s @"
			    " %s:%d", m->mtx_description, file, line);
		/* Recursive acquisitions are not tracked. */
		if (m->mtx_recurse != 0)
			return;
		mtx_enter(&w_mtx, MTX_SPIN);
		/*
		 * i is the mask of spin-lock level bits already held.  A
		 * new lock whose level is numerically below that mask is
		 * being taken out of order.
		 */
		i = witness_spin_check;
		if (i != 0 && w->w_level < i) {
			mtx_exit(&w_mtx, MTX_SPIN);
			panic("mutex_enter(%s:%x, MTX_SPIN) out of order @"
			    " %s:%d already holding %s:%x",
			    m->mtx_description, w->w_level, file, line,
			    spin_order_list[ffs(i)-1], i);
		}
		PCPU_SET(witness_spin_check, i | w->w_level);
		mtx_exit(&w_mtx, MTX_SPIN);
		w->w_file = file;
		w->w_line = line;
		m->mtx_line = line;
		m->mtx_file = file;
		return;
	}
	if (w->w_spin)
		panic("mutex_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
		    m->mtx_description, file, line);

	if (m->mtx_recurse != 0)
		return;
	/* Skip the order checks, but still record the acquisition. */
	if (witness_dead)
		goto out;
	if (cold || panicstr)
		goto out;

	if (!mtx_legal2block())
		panic("blockable mtx_enter() of %s when not legal @ %s:%d",
			    m->mtx_description, file, line);
	/*
	 * Is this the first mutex acquired
	 */
	if ((m1 = LIST_FIRST(&p->p_heldmtx)) == NULL)
		goto out;

	/* Most recently acquired mutex shares our witness: a duplicate. */
	if ((w1 = m1->mtx_witness) == w) {
		if (w->w_same_squawked || dup_ok(w))
			goto out;
		w->w_same_squawked = 1;
		printf("acquring duplicate lock of same type: \"%s\"\n",
			m->mtx_description);
		printf(" 1st @ %s:%d\n", w->w_file, w->w_line);
		printf(" 2nd @ %s:%d\n", file, line);
#ifdef DDB
		go_into_ddb = 1;
#endif /* DDB */
		goto out;
	}
	MPASS(!mtx_owned(&w_mtx));
	mtx_enter(&w_mtx, MTX_SPIN);
	/*
	 * If we have a known higher number just say ok
	 */
	if (witness_watch > 1 && w->w_level > w1->w_level) {
		mtx_exit(&w_mtx, MTX_SPIN);
		goto out;
	}
	/* This order (held lock first, then m) is already on record. */
	if (isitmydescendant(m1->mtx_witness, w)) {
		mtx_exit(&w_mtx, MTX_SPIN);
		goto out;
	}
	/*
	 * Look for a reversal: any held mutex whose witness is recorded as
	 * a descendant of the one being acquired.
	 */
	for (i = 0; m1 != NULL; m1 = LIST_NEXT(m1, mtx_held), i++) {

		MPASS(i < 200);
		w1 = m1->mtx_witness;
		if (isitmydescendant(w, w1)) {
			mtx_exit(&w_mtx, MTX_SPIN);
			if (blessed(w, w1))
				goto out;
			/* Report each kind of reversal once per witness. */
			if (m1 == &Giant) {
				if (w1->w_Giant_squawked)
					goto out;
				else
					w1->w_Giant_squawked = 1;
			} else {
				if (w1->w_other_squawked)
					goto out;
				else
					w1->w_other_squawked = 1;
			}
			printf("lock order reversal\n");
			printf(" 1st %s last acquired @ %s:%d\n",
			    w->w_description, w->w_file, w->w_line);
			printf(" 2nd %p %s @ %s:%d\n",
			    m1, w1->w_description, w1->w_file, w1->w_line);
			printf(" 3rd %p %s @ %s:%d\n",
			    m, w->w_description, file, line);
#ifdef DDB
			go_into_ddb = 1;
#endif /* DDB */
			goto out;
		}
	}
	/*
	 * New order: record w as a child of the most recently acquired
	 * mutex's witness.  w_mtx is only released here when itismychild()
	 * returns 0.
	 * NOTE(review): presumably its failure path has already dropped
	 * w_mtx (inside witness_get()) -- confirm.
	 */
	m1 = LIST_FIRST(&p->p_heldmtx);
	if (!itismychild(m1->mtx_witness, w))
		mtx_exit(&w_mtx, MTX_SPIN);

out:
#ifdef DDB
	if (witness_ddb && go_into_ddb)
		Debugger("witness_enter");
#endif /* DDB */
	w->w_file = file;
	w->w_line = line;
	m->mtx_line = line;
	m->mtx_file = file;

	/*
	 * If this pays off it likely means that a mutex being witnessed
	 * is acquired in hardclock. Put it in the ignore list. It is
	 * likely not the mutex this assert fails on.
	 */
	MPASS(m->mtx_held.le_prev == NULL);
	LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
}
939
940void
941witness_exit(struct mtx *m, int flags, const char *file, int line)
942{
943	struct witness *w;
944
945	w = m->mtx_witness;
946
947	if (flags & MTX_SPIN) {
948		if (!w->w_spin)
949			panic("mutex_exit: MTX_SPIN on MTX_DEF mutex %s @"
950			    " %s:%d", m->mtx_description, file, line);
951		if (m->mtx_recurse != 0)
952			return;
953		mtx_enter(&w_mtx, MTX_SPIN);
954		PCPU_SET(witness_spin_check, witness_spin_check & ~w->w_level);
955		mtx_exit(&w_mtx, MTX_SPIN);
956		return;
957	}
958	if (w->w_spin)
959		panic("mutex_exit: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
960		    m->mtx_description, file, line);
961
962	if (m->mtx_recurse != 0)
963		return;
964
965	if ((flags & MTX_NOSWITCH) == 0 && !mtx_legal2block() && !cold)
966		panic("switchable mtx_exit() of %s when not legal @ %s:%d",
967			    m->mtx_description, file, line);
968	LIST_REMOVE(m, mtx_held);
969	m->mtx_held.le_prev = NULL;
970}
971
972void
973witness_try_enter(struct mtx *m, int flags, const char *file, int line)
974{
975	struct proc *p;
976	struct witness *w = m->mtx_witness;
977
978	if (flags & MTX_SPIN) {
979		if (!w->w_spin)
980			panic("mutex_try_enter: "
981			    "MTX_SPIN on MTX_DEF mutex %s @ %s:%d",
982			    m->mtx_description, file, line);
983		if (m->mtx_recurse != 0)
984			return;
985		mtx_enter(&w_mtx, MTX_SPIN);
986		PCPU_SET(witness_spin_check, witness_spin_check | w->w_level);
987		mtx_exit(&w_mtx, MTX_SPIN);
988		w->w_file = file;
989		w->w_line = line;
990		m->mtx_line = line;
991		m->mtx_file = file;
992		return;
993	}
994
995	if (w->w_spin)
996		panic("mutex_try_enter: MTX_DEF on MTX_SPIN mutex %s @ %s:%d",
997		    m->mtx_description, file, line);
998
999	if (m->mtx_recurse != 0)
1000		return;
1001
1002	w->w_file = file;
1003	w->w_line = line;
1004	m->mtx_line = line;
1005	m->mtx_file = file;
1006	p = CURPROC;
1007	MPASS(m->mtx_held.le_prev == NULL);
1008	LIST_INSERT_HEAD(&p->p_heldmtx, (struct mtx*)m, mtx_held);
1009}
1010
1011void
1012witness_display(void(*prnt)(const char *fmt, ...))
1013{
1014	struct witness *w, *w1;
1015
1016	witness_levelall();
1017
1018	for (w = w_all; w; w = w->w_next) {
1019		if (w->w_file == NULL)
1020			continue;
1021		for (w1 = w_all; w1; w1 = w1->w_next) {
1022			if (isitmychild(w1, w))
1023				break;
1024		}
1025		if (w1 != NULL)
1026			continue;
1027		/*
1028		 * This lock has no anscestors, display its descendants.
1029		 */
1030		witness_displaydescendants(prnt, w);
1031	}
1032	prnt("\nMutex which were never acquired\n");
1033	for (w = w_all; w; w = w->w_next) {
1034		if (w->w_file != NULL)
1035			continue;
1036		prnt("%s\n", w->w_description);
1037	}
1038}
1039
1040int
1041witness_sleep(int check_only, struct mtx *mtx, const char *file, int line)
1042{
1043	struct mtx *m;
1044	struct proc *p;
1045	char **sleep;
1046	int n = 0;
1047
1048	p = CURPROC;
1049	for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
1050	    m = LIST_NEXT(m, mtx_held)) {
1051		if (m == mtx)
1052			continue;
1053		for (sleep = sleep_list; *sleep!= NULL; sleep++)
1054			if (strcmp(m->mtx_description, *sleep) == 0)
1055				goto next;
1056		printf("%s:%d: %s with \"%s\" locked from %s:%d\n",
1057			file, line, check_only ? "could sleep" : "sleeping",
1058			m->mtx_description,
1059			m->mtx_witness->w_file, m->mtx_witness->w_line);
1060		n++;
1061	next:
1062	}
1063#ifdef DDB
1064	if (witness_ddb && n)
1065		Debugger("witness_sleep");
1066#endif /* DDB */
1067	return (n);
1068}
1069
/*
 * Find or create the witness for a mutex description.
 *
 * description - mutex description; it is compared with strcmp() and, for a
 *               new witness, the pointer itself is stored.
 * flag        - MTX_* flags of the mutex; only MTX_SPIN is examined.
 *
 * Returns the witness, or NULL when witnesses are disabled
 * (witness_watch == 0), the description is on ignore_list[], spin mutexes
 * are being skipped, or no witness structure could be obtained.
 *
 * The first call that gets past the ignore list also performs one-time
 * setup: it initializes w_mtx, seeds the free pool from w_data[] and
 * records the orders listed in order_list[].
 */
static struct witness *
enroll(const char *description, int flag)
{
	int i;
	struct witness *w, *w1;
	char **ignore;
	char **order;

	if (!witness_watch)
		return (NULL);
	for (ignore = ignore_list; *ignore != NULL; ignore++)
		if (strcmp(description, *ignore) == 0)
			return (NULL);

	if (w_inited == 0) {
		mtx_init(&w_mtx, "witness lock", MTX_COLD | MTX_DEF);
		for (i = 0; i < WITNESS_COUNT; i++) {
			w = &w_data[i];
			witness_free(w);
		}
		/* Set before the recursive enroll() calls below. */
		w_inited = 1;
		/*
		 * order_list[] holds NULL-terminated groups.  The inner loop
		 * shares the iterator and stops on a group's terminator; the
		 * outer increment then steps to the next group, and the
		 * final NULL ends the outer loop.  The results of the
		 * recursive enroll() calls are used without a NULL check.
		 */
		for (order = order_list; *order != NULL; order++) {
			w = enroll(*order, MTX_DEF);
			w->w_file = "order list";
			for (order++; *order != NULL; order++) {
				w1 = enroll(*order, MTX_DEF);
				w1->w_file = "order list";
				itismychild(w, w1);
				w = w1;
    	    	    	}
		}
	}
	if ((flag & MTX_SPIN) && witness_skipspin)
		return (NULL);
	mtx_enter(&w_mtx, MTX_SPIN);
	/* Reuse the existing witness when the description is known. */
	for (w = w_all; w; w = w->w_next) {
		if (strcmp(description, w->w_description) == 0) {
			mtx_exit(&w_mtx, MTX_SPIN);
			return (w);
		}
	}
	/*
	 * NOTE(review): this return does not release w_mtx here; presumably
	 * witness_get() drops it on failure -- confirm.
	 */
	if ((w = witness_get()) == NULL)
		return (NULL);
	w->w_next = w_all;
	w_all = w;
	w->w_description = description;
	mtx_exit(&w_mtx, MTX_SPIN);
	if (flag & MTX_SPIN) {
		w->w_spin = 1;

		/*
		 * The level of a spin mutex is the bit corresponding to its
		 * position in spin_order_list[].
		 */
		i = 1;
		for (order = spin_order_list; *order != NULL; order++) {
			if (strcmp(description, *order) == 0)
				break;
			i <<= 1;
		}
		if (*order == NULL)
			panic("spin lock %s not in order list", description);
		w->w_level = i;
	}
	return (w);
}
1132
1133static int
1134itismychild(struct witness *parent, struct witness *child)
1135{
1136	static int recursed;
1137
1138	/*
1139	 * Insert "child" after "parent"
1140	 */
1141	while (parent->w_morechildren)
1142		parent = parent->w_morechildren;
1143
1144	if (parent->w_childcnt == WITNESS_NCHILDREN) {
1145		if ((parent->w_morechildren = witness_get()) == NULL)
1146			return (1);
1147		parent = parent->w_morechildren;
1148	}
1149	MPASS(child != NULL);
1150	parent->w_children[parent->w_childcnt++] = child;
1151	/*
1152	 * now prune whole tree
1153	 */
1154	if (recursed)
1155		return (0);
1156	recursed = 1;
1157	for (child = w_all; child != NULL; child = child->w_next) {
1158		for (parent = w_all; parent != NULL;
1159		    parent = parent->w_next) {
1160			if (!isitmychild(parent, child))
1161				continue;
1162			removechild(parent, child);
1163			if (isitmydescendant(parent, child))
1164				continue;
1165			itismychild(parent, child);
1166		}
1167	}
1168	recursed = 0;
1169	witness_levelall();
1170	return (0);
1171}
1172
1173static void
1174removechild(struct witness *parent, struct witness *child)
1175{
1176	struct witness *w, *w1;
1177	int i;
1178
1179	for (w = parent; w != NULL; w = w->w_morechildren)
1180		for (i = 0; i < w->w_childcnt; i++)
1181			if (w->w_children[i] == child)
1182				goto found;
1183	return;
1184found:
1185	for (w1 = w; w1->w_morechildren != NULL; w1 = w1->w_morechildren)
1186		continue;
1187	w->w_children[i] = w1->w_children[--w1->w_childcnt];
1188	MPASS(w->w_children[i] != NULL);
1189
1190	if (w1->w_childcnt != 0)
1191		return;
1192
1193	if (w1 == parent)
1194		return;
1195	for (w = parent; w->w_morechildren != w1; w = w->w_morechildren)
1196		continue;
1197	w->w_morechildren = 0;
1198	witness_free(w1);
1199}
1200
1201static int
1202isitmychild(struct witness *parent, struct witness *child)
1203{
1204	struct witness *w;
1205	int i;
1206
1207	for (w = parent; w != NULL; w = w->w_morechildren) {
1208		for (i = 0; i < w->w_childcnt; i++) {
1209			if (w->w_children[i] == child)
1210				return (1);
1211		}
1212	}
1213	return (0);
1214}
1215
1216static int
1217isitmydescendant(struct witness *parent, struct witness *child)
1218{
1219	struct witness *w;
1220	int i;
1221	int j;
1222
1223	for (j = 0, w = parent; w != NULL; w = w->w_morechildren, j++) {
1224		MPASS(j < 1000);
1225		for (i = 0; i < w->w_childcnt; i++) {
1226			if (w->w_children[i] == child)
1227				return (1);
1228		}
1229		for (i = 0; i < w->w_childcnt; i++) {
1230			if (isitmydescendant(w->w_children[i], child))
1231				return (1);
1232		}
1233	}
1234	return (0);
1235}
1236
1237void
1238witness_levelall (void)
1239{
1240	struct witness *w, *w1;
1241
1242	for (w = w_all; w; w = w->w_next)
1243		if (!w->w_spin)
1244			w->w_level = 0;
1245	for (w = w_all; w; w = w->w_next) {
1246		if (w->w_spin)
1247			continue;
1248		for (w1 = w_all; w1; w1 = w1->w_next) {
1249			if (isitmychild(w1, w))
1250				break;
1251		}
1252		if (w1 != NULL)
1253			continue;
1254		witness_leveldescendents(w, 0);
1255	}
1256}
1257
1258static void
1259witness_leveldescendents(struct witness *parent, int level)
1260{
1261	int i;
1262	struct witness *w;
1263
1264	if (parent->w_level < level)
1265		parent->w_level = level;
1266	level++;
1267	for (w = parent; w != NULL; w = w->w_morechildren)
1268		for (i = 0; i < w->w_childcnt; i++)
1269			witness_leveldescendents(w->w_children[i], level);
1270}
1271
1272static void
1273witness_displaydescendants(void(*prnt)(const char *fmt, ...),
1274			   struct witness *parent)
1275{
1276	struct witness *w;
1277	int i;
1278	int level = parent->w_level;
1279
1280	prnt("%d", level);
1281	if (level < 10)
1282		prnt(" ");
1283	for (i = 0; i < level; i++)
1284		prnt(" ");
1285	prnt("%s", parent->w_description);
1286	if (parent->w_file != NULL) {
1287		prnt(" -- last acquired @ %s", parent->w_file);
1288#ifndef W_USE_WHERE
1289		prnt(":%d", parent->w_line);
1290#endif
1291		prnt("\n");
1292	}
1293
1294	for (w = parent; w != NULL; w = w->w_morechildren)
1295		for (i = 0; i < w->w_childcnt; i++)
1296			    witness_displaydescendants(prnt, w->w_children[i]);
1297    }
1298
1299static int
1300dup_ok(struct witness *w)
1301{
1302	char **dup;
1303
1304	for (dup = dup_list; *dup!= NULL; dup++)
1305		if (strcmp(w->w_description, *dup) == 0)
1306			return (1);
1307	return (0);
1308}
1309
1310static int
1311blessed(struct witness *w1, struct witness *w2)
1312{
1313	int i;
1314	struct witness_blessed *b;
1315
1316	for (i = 0; i < blessed_count; i++) {
1317		b = &blessed_list[i];
1318		if (strcmp(w1->w_description, b->b_lock1) == 0) {
1319			if (strcmp(w2->w_description, b->b_lock2) == 0)
1320				return (1);
1321			continue;
1322		}
1323		if (strcmp(w1->w_description, b->b_lock2) == 0)
1324			if (strcmp(w2->w_description, b->b_lock1) == 0)
1325				return (1);
1326	}
1327	return (0);
1328}
1329
1330static struct witness *
1331witness_get()
1332{
1333	struct witness *w;
1334
1335	if ((w = w_free) == NULL) {
1336		witness_dead = 1;
1337		mtx_exit(&w_mtx, MTX_SPIN);
1338		printf("witness exhausted\n");
1339		return (NULL);
1340	}
1341	w_free = w->w_next;
1342	bzero(w, sizeof(*w));
1343	return (w);
1344}
1345
1346static void
1347witness_free(struct witness *w)
1348{
1349	w->w_next = w_free;
1350	w_free = w;
1351}
1352
1353void
1354witness_list(struct proc *p)
1355{
1356	struct mtx *m;
1357
1358	for ((m = LIST_FIRST(&p->p_heldmtx)); m != NULL;
1359	    m = LIST_NEXT(m, mtx_held)) {
1360		printf("\t\"%s\" (%p) locked at %s:%d\n",
1361		    m->mtx_description, m,
1362		    m->mtx_witness->w_file, m->mtx_witness->w_line);
1363	}
1364}
1365
1366void
1367witness_save(struct mtx *m, const char **filep, int *linep)
1368{
1369	*filep = m->mtx_witness->w_file;
1370	*linep = m->mtx_witness->w_line;
1371}
1372
1373void
1374witness_restore(struct mtx *m, const char *file, int line)
1375{
1376	m->mtx_witness->w_file = file;
1377	m->mtx_witness->w_line = line;
1378}
1379
1380#endif	/* (defined(MUTEX_DEBUG) && defined(WITNESS)) */
1381