/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2007 Stephan Uphoff <ups@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of any co-contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Machine independent bits of reader/writer lock implementation.
 */
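
/*
 * Illustrative consumer-side sketch of the interface implemented below (not
 * compiled here; "foo_lock" and "foo_data" are hypothetical names, see
 * rmlock(9) for the authoritative description):
 *
 *	static struct rmlock foo_lock;
 *	static int foo_data;
 *
 *	rm_init(&foo_lock, "foo");
 *
 *	Read side; the tracker lives on the reader's stack and links the
 *	thread into the per-CPU tracker list for the duration of the read
 *	section:
 *
 *	struct rm_priotracker tracker;
 *
 *	rm_rlock(&foo_lock, &tracker);
 *	(void)foo_data;
 *	rm_runlock(&foo_lock, &tracker);
 *
 *	Write side; expensive, as it rendezvouses with every CPU holding a
 *	read token:
 *
 *	rm_wlock(&foo_lock);
 *	foo_data = 1;
 *	rm_wunlock(&foo_lock);
 *
 *	rm_destroy(&foo_lock);
 */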

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>

#include <sys/kernel.h>
#include <sys/kdb.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rmlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/turnstile.h>
#include <sys/lock_profile.h>
#include <machine/cpu.h>
#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * A cookie to mark destroyed rmlocks.  This is stored in the head of
 * rm_activeReaders.
 */
#define	RM_DESTROYED	((void *)0xdead)

#define	rm_destroyed(rm)						\
	(LIST_FIRST(&(rm)->rm_activeReaders) == RM_DESTROYED)

#define RMPF_ONQUEUE	1
#define RMPF_SIGNAL	2

#ifndef INVARIANTS
#define	_rm_assert(c, what, file, line)
#endif

static void	assert_rm(const struct lock_object *lock, int what);
#ifdef DDB
static void	db_show_rm(const struct lock_object *lock);
#endif
static void	lock_rm(struct lock_object *lock, uintptr_t how);
#ifdef KDTRACE_HOOKS
static int	owner_rm(const struct lock_object *lock, struct thread **owner);
#endif
static uintptr_t unlock_rm(struct lock_object *lock);

struct lock_class lock_class_rm = {
	.lc_name = "rm",
	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
	.lc_assert = assert_rm,
#ifdef DDB
	.lc_ddb_show = db_show_rm,
#endif
	.lc_lock = lock_rm,
	.lc_unlock = unlock_rm,
#ifdef KDTRACE_HOOKS
	.lc_owner = owner_rm,
#endif
};

struct lock_class lock_class_rm_sleepable = {
	.lc_name = "sleepable rm",
	.lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE,
	.lc_assert = assert_rm,
#ifdef DDB
	.lc_ddb_show = db_show_rm,
#endif
	.lc_lock = lock_rm,
	.lc_unlock = unlock_rm,
#ifdef KDTRACE_HOOKS
	.lc_owner = owner_rm,
#endif
};

static void
assert_rm(const struct lock_object *lock, int what)
{

	rm_assert((const struct rmlock *)lock, what);
}

static void
lock_rm(struct lock_object *lock, uintptr_t how)
{
	struct rmlock *rm;
	struct rm_priotracker *tracker;

	rm = (struct rmlock *)lock;
	if (how == 0)
		rm_wlock(rm);
	else {
		tracker = (struct rm_priotracker *)how;
		rm_rlock(rm, tracker);
	}
}

static uintptr_t
unlock_rm(struct lock_object *lock)
{
	struct thread *td;
	struct pcpu *pc;
	struct rmlock *rm;
	struct rm_queue *queue;
	struct rm_priotracker *tracker;
	uintptr_t how;

	rm = (struct rmlock *)lock;
	tracker = NULL;
	how = 0;
	rm_assert(rm, RA_LOCKED | RA_NOTRECURSED);
	if (rm_wowned(rm))
		rm_wunlock(rm);
	else {
		/*
		 * Find the right rm_priotracker structure for curthread.
		 * The guarantee about its uniqueness is given by the fact
		 * that we already asserted the lock wasn't recursively
		 * acquired.
		 */
		critical_enter();
		td = curthread;
		pc = get_pcpu();
		for (queue = pc->pc_rm_queue.rmq_next;
		    queue != &pc->pc_rm_queue; queue = queue->rmq_next) {
			tracker = (struct rm_priotracker *)queue;
			if ((tracker->rmp_rmlock == rm) &&
			    (tracker->rmp_thread == td)) {
				how = (uintptr_t)tracker;
				break;
			}
		}
		KASSERT(tracker != NULL,
		    ("rm_priotracker is non-NULL when lock held in read mode"));
		critical_exit();
		rm_runlock(rm, tracker);
	}
	return (how);
}

#ifdef KDTRACE_HOOKS
static int
owner_rm(const struct lock_object *lock, struct thread **owner)
{
	const struct rmlock *rm;
	struct lock_class *lc;

	rm = (const struct rmlock *)lock;
	lc = LOCK_CLASS(&rm->rm_wlock_object);
	return (lc->lc_owner(&rm->rm_wlock_object, owner));
}
#endif

static struct mtx rm_spinlock;

MTX_SYSINIT(rm_spinlock, &rm_spinlock, "rm_spinlock", MTX_SPIN);

/*
 * Add or remove a tracker from the per-cpu list.
 *
 * The per-cpu list can be traversed at any time in forward direction from an
 * interrupt on the *local* cpu.
 */
static void inline
rm_tracker_add(struct pcpu *pc, struct rm_priotracker *tracker)
{
	struct rm_queue *next;

	/* Initialize all tracker pointers */
	tracker->rmp_cpuQueue.rmq_prev = &pc->pc_rm_queue;
	next = pc->pc_rm_queue.rmq_next;
	tracker->rmp_cpuQueue.rmq_next = next;

	/* rmq_prev is not used during forward traversal. */
	next->rmq_prev = &tracker->rmp_cpuQueue;

	/* Update pointer to first element. */
	pc->pc_rm_queue.rmq_next = &tracker->rmp_cpuQueue;
}

/*
 * Return a count of the number of trackers the thread 'td' already
 * has on this CPU for the lock 'rm'.
 */
static int
rm_trackers_present(const struct pcpu *pc, const struct rmlock *rm,
    const struct thread *td)
{
	struct rm_queue *queue;
	struct rm_priotracker *tracker;
	int count;

	count = 0;
	for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
	    queue = queue->rmq_next) {
		tracker = (struct rm_priotracker *)queue;
		if ((tracker->rmp_rmlock == rm) && (tracker->rmp_thread == td))
			count++;
	}
	return (count);
}

static void inline
rm_tracker_remove(struct pcpu *pc, struct rm_priotracker *tracker)
{
	struct rm_queue *next, *prev;

	next = tracker->rmp_cpuQueue.rmq_next;
	prev = tracker->rmp_cpuQueue.rmq_prev;

	/* Not used during forward traversal. */
	next->rmq_prev = prev;

	/* Remove from list. */
	prev->rmq_next = next;
}

static void
rm_cleanIPI(void *arg)
{
	struct pcpu *pc;
	struct rmlock *rm = arg;
	struct rm_priotracker *tracker;
	struct rm_queue *queue;
	pc = get_pcpu();

	for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
	    queue = queue->rmq_next) {
		tracker = (struct rm_priotracker *)queue;
		if (tracker->rmp_rmlock == rm && tracker->rmp_flags == 0) {
			tracker->rmp_flags = RMPF_ONQUEUE;
			mtx_lock_spin(&rm_spinlock);
			LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
			    rmp_qentry);
			mtx_unlock_spin(&rm_spinlock);
		}
	}
}

void
rm_init_flags(struct rmlock *rm, const char *name, int opts)
{
	struct lock_class *lc;
	int liflags, xflags;

	liflags = 0;
	if (!(opts & RM_NOWITNESS))
		liflags |= LO_WITNESS;
	if (opts & RM_RECURSE)
		liflags |= LO_RECURSABLE;
	if (opts & RM_NEW)
		liflags |= LO_NEW;
	if (opts & RM_DUPOK)
		liflags |= LO_DUPOK;
	rm->rm_writecpus = all_cpus;
	LIST_INIT(&rm->rm_activeReaders);
	if (opts & RM_SLEEPABLE) {
		liflags |= LO_SLEEPABLE;
		lc = &lock_class_rm_sleepable;
		xflags = (opts & RM_NEW ? SX_NEW : 0);
		sx_init_flags(&rm->rm_lock_sx, "rmlock_sx",
		    xflags | SX_NOWITNESS);
	} else {
		lc = &lock_class_rm;
		xflags = (opts & RM_NEW ? MTX_NEW : 0);
		mtx_init(&rm->rm_lock_mtx, name, "rmlock_mtx",
		    xflags | MTX_NOWITNESS);
	}
	lock_init(&rm->lock_object, lc, name, NULL, liflags);
}

void
rm_init(struct rmlock *rm, const char *name)
{

	rm_init_flags(rm, name, 0);
}

void
rm_destroy(struct rmlock *rm)
{

	rm_assert(rm, RA_UNLOCKED);
	LIST_FIRST(&rm->rm_activeReaders) = RM_DESTROYED;
	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		sx_destroy(&rm->rm_lock_sx);
	else
		mtx_destroy(&rm->rm_lock_mtx);
	lock_destroy(&rm->lock_object);
}

int
rm_wowned(const struct rmlock *rm)
{

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		return (sx_xlocked(&rm->rm_lock_sx));
	else
		return (mtx_owned(&rm->rm_lock_mtx));
}

void
rm_sysinit(void *arg)
{
	struct rm_args *args;

	args = arg;
	rm_init_flags(args->ra_rm, args->ra_desc, args->ra_flags);
}

static __noinline int
_rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
{
	struct pcpu *pc;

	critical_enter();
	pc = get_pcpu();

	/* Check if we just need to do a proper critical_exit. */
	if (!CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)) {
		critical_exit();
		return (1);
	}

	/* Remove our tracker from the per-cpu list. */
	rm_tracker_remove(pc, tracker);

	/*
	 * Check to see if the IPI granted us the lock after all.  The load of
	 * rmp_flags must happen after the tracker is removed from the list.
	 */
	atomic_interrupt_fence();
	if (tracker->rmp_flags) {
		/* Just add back tracker - we hold the lock. */
		rm_tracker_add(pc, tracker);
		critical_exit();
		return (1);
	}

	/*
	 * We allow readers to acquire a lock even if a writer is blocked,
	 * provided the lock is recursive and the reader already holds the
	 * lock.
	 */
	if ((rm->lock_object.lo_flags & LO_RECURSABLE) != 0) {
		/*
		 * Just grant the lock if this thread already has a tracker
		 * for this lock on the per-cpu queue.
		 */
		if (rm_trackers_present(pc, rm, curthread) != 0) {
			mtx_lock_spin(&rm_spinlock);
			LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
			    rmp_qentry);
			tracker->rmp_flags = RMPF_ONQUEUE;
			mtx_unlock_spin(&rm_spinlock);
			rm_tracker_add(pc, tracker);
			critical_exit();
			return (1);
		}
	}

	sched_unpin();
	critical_exit();

	if (trylock) {
		if (rm->lock_object.lo_flags & LO_SLEEPABLE) {
			if (!sx_try_xlock(&rm->rm_lock_sx))
				return (0);
		} else {
			if (!mtx_trylock(&rm->rm_lock_mtx))
				return (0);
		}
	} else {
		if (rm->lock_object.lo_flags & LO_SLEEPABLE) {
			THREAD_SLEEPING_OK();
			sx_xlock(&rm->rm_lock_sx);
			THREAD_NO_SLEEPING();
		} else
			mtx_lock(&rm->rm_lock_mtx);
	}

	critical_enter();
	pc = get_pcpu();
	CPU_CLR(pc->pc_cpuid, &rm->rm_writecpus);
	rm_tracker_add(pc, tracker);
	sched_pin();
	critical_exit();

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		sx_xunlock(&rm->rm_lock_sx);
	else
		mtx_unlock(&rm->rm_lock_mtx);

	return (1);
}

int
_rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
{
	struct thread *td = curthread;
	struct pcpu *pc;

	if (SCHEDULER_STOPPED())
		return (1);

	tracker->rmp_flags  = 0;
	tracker->rmp_thread = td;
	tracker->rmp_rmlock = rm;

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		THREAD_NO_SLEEPING();

	td->td_critnest++;	/* critical_enter(); */

	atomic_interrupt_fence();

	pc = cpuid_to_pcpu[td->td_oncpu]; /* pcpu_find(td->td_oncpu); */

	rm_tracker_add(pc, tracker);

	sched_pin();

	atomic_interrupt_fence();

	td->td_critnest--;

	/*
	 * Fast path to combine two common conditions into a single
	 * conditional jump.
	 */
	if (__predict_true(0 == (td->td_owepreempt |
	    CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus))))
		return (1);

	/* We do not have a read token and need to acquire one. */
	return (_rm_rlock_hard(rm, tracker, trylock));
}

static __noinline void
_rm_unlock_hard(struct thread *td, struct rm_priotracker *tracker)
{

	if (td->td_owepreempt) {
		td->td_critnest++;
		critical_exit();
	}

	if (!tracker->rmp_flags)
		return;

	mtx_lock_spin(&rm_spinlock);
	LIST_REMOVE(tracker, rmp_qentry);

	if (tracker->rmp_flags & RMPF_SIGNAL) {
		struct rmlock *rm;
		struct turnstile *ts;

		rm = tracker->rmp_rmlock;

		turnstile_chain_lock(&rm->lock_object);
		mtx_unlock_spin(&rm_spinlock);

		ts = turnstile_lookup(&rm->lock_object);

		turnstile_signal(ts, TS_EXCLUSIVE_QUEUE);
		turnstile_unpend(ts);
		turnstile_chain_unlock(&rm->lock_object);
	} else
		mtx_unlock_spin(&rm_spinlock);
}

void
_rm_runlock(struct rmlock *rm, struct rm_priotracker *tracker)
{
	struct pcpu *pc;
	struct thread *td = tracker->rmp_thread;

	if (SCHEDULER_STOPPED())
		return;

	td->td_critnest++;	/* critical_enter(); */
	pc = cpuid_to_pcpu[td->td_oncpu]; /* pcpu_find(td->td_oncpu); */
	rm_tracker_remove(pc, tracker);
	td->td_critnest--;
	sched_unpin();

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		THREAD_SLEEPING_OK();

	if (__predict_true(0 == (td->td_owepreempt | tracker->rmp_flags)))
		return;

	_rm_unlock_hard(td, tracker);
}

void
_rm_wlock(struct rmlock *rm)
{
	struct rm_priotracker *prio;
	struct turnstile *ts;
	cpuset_t readcpus;

	if (SCHEDULER_STOPPED())
		return;

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		sx_xlock(&rm->rm_lock_sx);
	else
		mtx_lock(&rm->rm_lock_mtx);

	if (CPU_CMP(&rm->rm_writecpus, &all_cpus)) {
		/* Get all read tokens back */
		readcpus = all_cpus;
		CPU_ANDNOT(&readcpus, &rm->rm_writecpus);
		rm->rm_writecpus = all_cpus;

		/*
		 * Assumes rm->rm_writecpus update is visible on other CPUs
		 * before rm_cleanIPI is called.
		 */
#ifdef SMP
		smp_rendezvous_cpus(readcpus,
		    smp_no_rendezvous_barrier,
		    rm_cleanIPI,
		    smp_no_rendezvous_barrier,
		    rm);

#else
		rm_cleanIPI(rm);
#endif

		mtx_lock_spin(&rm_spinlock);
		while ((prio = LIST_FIRST(&rm->rm_activeReaders)) != NULL) {
			ts = turnstile_trywait(&rm->lock_object);
			prio->rmp_flags = RMPF_ONQUEUE | RMPF_SIGNAL;
			mtx_unlock_spin(&rm_spinlock);
			turnstile_wait(ts, prio->rmp_thread,
			    TS_EXCLUSIVE_QUEUE);
			mtx_lock_spin(&rm_spinlock);
		}
		mtx_unlock_spin(&rm_spinlock);
	}
}

void
_rm_wunlock(struct rmlock *rm)
{

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		sx_xunlock(&rm->rm_lock_sx);
	else
		mtx_unlock(&rm->rm_lock_mtx);
}

#if LOCK_DEBUG > 0

void
_rm_wlock_debug(struct rmlock *rm, const char *file, int line)
{

	if (SCHEDULER_STOPPED())
		return;

	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
	    ("rm_wlock() by idle thread %p on rmlock %s @ %s:%d",
	    curthread, rm->lock_object.lo_name, file, line));
	KASSERT(!rm_destroyed(rm),
	    ("rm_wlock() of destroyed rmlock @ %s:%d", file, line));
	_rm_assert(rm, RA_UNLOCKED, file, line);

	WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE,
	    file, line, NULL);

	_rm_wlock(rm);

	LOCK_LOG_LOCK("RMWLOCK", &rm->lock_object, 0, 0, file, line);
	WITNESS_LOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
	TD_LOCKS_INC(curthread);
}

void
_rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
{

	if (SCHEDULER_STOPPED())
		return;

	KASSERT(!rm_destroyed(rm),
	    ("rm_wunlock() of destroyed rmlock @ %s:%d", file, line));
	_rm_assert(rm, RA_WLOCKED, file, line);
	WITNESS_UNLOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
	LOCK_LOG_LOCK("RMWUNLOCK", &rm->lock_object, 0, 0, file, line);
	_rm_wunlock(rm);
	TD_LOCKS_DEC(curthread);
}

int
_rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
    int trylock, const char *file, int line)
{

	if (SCHEDULER_STOPPED())
		return (1);

#ifdef INVARIANTS
	if (!(rm->lock_object.lo_flags & LO_RECURSABLE) && !trylock) {
		critical_enter();
		KASSERT(rm_trackers_present(get_pcpu(), rm,
		    curthread) == 0,
		    ("rm_rlock: recursed on non-recursive rmlock %s @ %s:%d\n",
		    rm->lock_object.lo_name, file, line));
		critical_exit();
	}
#endif
	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
	    ("rm_rlock() by idle thread %p on rmlock %s @ %s:%d",
	    curthread, rm->lock_object.lo_name, file, line));
	KASSERT(!rm_destroyed(rm),
	    ("rm_rlock() of destroyed rmlock @ %s:%d", file, line));
	if (!trylock) {
		KASSERT(!rm_wowned(rm),
		    ("rm_rlock: wlock already held for %s @ %s:%d",
		    rm->lock_object.lo_name, file, line));
		WITNESS_CHECKORDER(&rm->lock_object,
		    LOP_NEWORDER | LOP_NOSLEEP, file, line, NULL);
	}

	if (_rm_rlock(rm, tracker, trylock)) {
		if (trylock)
			LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 1, file,
			    line);
		else
			LOCK_LOG_LOCK("RMRLOCK", &rm->lock_object, 0, 0, file,
			    line);
		WITNESS_LOCK(&rm->lock_object, LOP_NOSLEEP, file, line);
		TD_LOCKS_INC(curthread);
		return (1);
	} else if (trylock)
		LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 0, file, line);

	return (0);
}

void
_rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
    const char *file, int line)
{

	if (SCHEDULER_STOPPED())
		return;

	KASSERT(!rm_destroyed(rm),
	    ("rm_runlock() of destroyed rmlock @ %s:%d", file, line));
	_rm_assert(rm, RA_RLOCKED, file, line);
	WITNESS_UNLOCK(&rm->lock_object, 0, file, line);
	LOCK_LOG_LOCK("RMRUNLOCK", &rm->lock_object, 0, 0, file, line);
	_rm_runlock(rm, tracker);
	TD_LOCKS_DEC(curthread);
}

#else

/*
 * Just strip out file and line arguments if no lock debugging is enabled in
 * the kernel - we are called from a kernel module.
 */
void
_rm_wlock_debug(struct rmlock *rm, const char *file, int line)
{

	_rm_wlock(rm);
}

void
_rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
{

	_rm_wunlock(rm);
}

int
_rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
    int trylock, const char *file, int line)
{

	return (_rm_rlock(rm, tracker, trylock));
}

void
_rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
    const char *file, int line)
{

	_rm_runlock(rm, tracker);
}

#endif

#ifdef INVARIANT_SUPPORT
#ifndef INVARIANTS
#undef _rm_assert
#endif

/*
 * Note that this does not need to use witness_assert() for read lock
 * assertions since an exact count of read locks held by this thread
 * is computable.
 */
void
_rm_assert(const struct rmlock *rm, int what, const char *file, int line)
{
	int count;

	if (SCHEDULER_STOPPED())
		return;
	switch (what) {
	case RA_LOCKED:
	case RA_LOCKED | RA_RECURSED:
	case RA_LOCKED | RA_NOTRECURSED:
	case RA_RLOCKED:
	case RA_RLOCKED | RA_RECURSED:
	case RA_RLOCKED | RA_NOTRECURSED:
		/*
		 * Handle the write-locked case.  Unlike other
		 * primitives, writers can never recurse.
		 */
		if (rm_wowned(rm)) {
			if (what & RA_RLOCKED)
				panic("Lock %s exclusively locked @ %s:%d\n",
				    rm->lock_object.lo_name, file, line);
			if (what & RA_RECURSED)
				panic("Lock %s not recursed @ %s:%d\n",
				    rm->lock_object.lo_name, file, line);
			break;
		}

		critical_enter();
		count = rm_trackers_present(get_pcpu(), rm, curthread);
		critical_exit();

		if (count == 0)
			panic("Lock %s not %slocked @ %s:%d\n",
			    rm->lock_object.lo_name, (what & RA_RLOCKED) ?
			    "read " : "", file, line);
		if (count > 1) {
			if (what & RA_NOTRECURSED)
				panic("Lock %s recursed @ %s:%d\n",
				    rm->lock_object.lo_name, file, line);
		} else if (what & RA_RECURSED)
			panic("Lock %s not recursed @ %s:%d\n",
			    rm->lock_object.lo_name, file, line);
		break;
	case RA_WLOCKED:
		if (!rm_wowned(rm))
			panic("Lock %s not exclusively locked @ %s:%d\n",
			    rm->lock_object.lo_name, file, line);
		break;
	case RA_UNLOCKED:
		if (rm_wowned(rm))
			panic("Lock %s exclusively locked @ %s:%d\n",
			    rm->lock_object.lo_name, file, line);

		critical_enter();
		count = rm_trackers_present(get_pcpu(), rm, curthread);
		critical_exit();

		if (count != 0)
			panic("Lock %s read locked @ %s:%d\n",
			    rm->lock_object.lo_name, file, line);
		break;
	default:
		panic("Unknown rm lock assertion: %d @ %s:%d", what, file,
		    line);
	}
}
#endif /* INVARIANT_SUPPORT */

#ifdef DDB
static void
print_tracker(struct rm_priotracker *tr)
{
	struct thread *td;

	td = tr->rmp_thread;
	db_printf("   thread %p (tid %d, pid %d, \"%s\") {", td, td->td_tid,
	    td->td_proc->p_pid, td->td_name);
	if (tr->rmp_flags & RMPF_ONQUEUE) {
		db_printf("ONQUEUE");
		if (tr->rmp_flags & RMPF_SIGNAL)
			db_printf(",SIGNAL");
	} else
		db_printf("0");
	db_printf("}\n");
}

static void
db_show_rm(const struct lock_object *lock)
{
	struct rm_priotracker *tr;
	struct rm_queue *queue;
	const struct rmlock *rm;
	struct lock_class *lc;
	struct pcpu *pc;

	rm = (const struct rmlock *)lock;
	db_printf(" writecpus: ");
	ddb_display_cpuset(__DEQUALIFY(const cpuset_t *, &rm->rm_writecpus));
	db_printf("\n");
	db_printf(" per-CPU readers:\n");
	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu)
		for (queue = pc->pc_rm_queue.rmq_next;
		    queue != &pc->pc_rm_queue; queue = queue->rmq_next) {
			tr = (struct rm_priotracker *)queue;
			if (tr->rmp_rmlock == rm)
				print_tracker(tr);
		}
	db_printf(" active readers:\n");
	LIST_FOREACH(tr, &rm->rm_activeReaders, rmp_qentry)
		print_tracker(tr);
	lc = LOCK_CLASS(&rm->rm_wlock_object);
	db_printf("Backing write-lock (%s):\n", lc->lc_name);
	lc->lc_ddb_show(&rm->rm_wlock_object);
}
#endif

/*
 * Read-mostly sleepable locks.
 *
 * These primitives allow both readers and writers to sleep. However, neither
 * readers nor writers are tracked and consequently there is no priority
 * propagation.
 *
 * They are intended to be used only when write-locking is almost never needed
 * (e.g., they can guard against unloading a kernel module) while read-locking
 * happens all the time.
 *
 * Concurrent writers take turns taking the lock while going off CPU. If this
 * is of concern for your use case, this is not the right primitive.
 *
 * Neither rms_rlock nor rms_runlock uses thread fences. Instead, interrupt
 * fences are inserted to ensure ordering with the code executed in the IPI
 * handler.
 *
 * No attempt is made to track which CPUs read-locked at least once;
 * consequently, write locking sends IPIs to all of them. This will become a
 * problem at some point. The easiest way to lessen it is to provide a bitmap.
 */
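
/*
 * Illustrative usage sketch (not compiled here; "foo_rms" is a hypothetical
 * name):
 *
 *	static struct rmslock foo_rms;
 *
 *	rms_init(&foo_rms, "foo rms");
 *
 *	Read side; no tracker is needed and sleeping is permitted:
 *
 *	rms_rlock(&foo_rms);
 *	(read-side section, may sleep)
 *	rms_runlock(&foo_rms);
 *
 *	Write side; may sleep waiting for all readers to drain:
 *
 *	rms_wlock(&foo_rms);
 *	(write-side section)
 *	rms_wunlock(&foo_rms);
 *
 *	rms_destroy(&foo_rms);
 */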

#define	RMS_NOOWNER	((void *)0x1)
#define	RMS_TRANSIENT	((void *)0x2)
#define	RMS_FLAGMASK	0xf

struct rmslock_pcpu {
	int influx;
	int readers;
};

_Static_assert(sizeof(struct rmslock_pcpu) == 8, "bad size");

/*
 * Internal routines
 */
static struct rmslock_pcpu *
rms_int_pcpu(struct rmslock *rms)
{

	CRITICAL_ASSERT(curthread);
	return (zpcpu_get(rms->pcpu));
}

static struct rmslock_pcpu *
rms_int_remote_pcpu(struct rmslock *rms, int cpu)
{

	return (zpcpu_get_cpu(rms->pcpu, cpu));
}

static void
rms_int_influx_enter(struct rmslock *rms, struct rmslock_pcpu *pcpu)
{

	CRITICAL_ASSERT(curthread);
	MPASS(pcpu->influx == 0);
	pcpu->influx = 1;
}

static void
rms_int_influx_exit(struct rmslock *rms, struct rmslock_pcpu *pcpu)
{

	CRITICAL_ASSERT(curthread);
	MPASS(pcpu->influx == 1);
	pcpu->influx = 0;
}

#ifdef INVARIANTS
static void
rms_int_debug_readers_inc(struct rmslock *rms)
{
	int old;

	old = atomic_fetchadd_int(&rms->debug_readers, 1);
	KASSERT(old >= 0, ("%s: bad readers count %d\n", __func__, old));
}

static void
rms_int_debug_readers_dec(struct rmslock *rms)
{
	int old;

	old = atomic_fetchadd_int(&rms->debug_readers, -1);
	KASSERT(old > 0, ("%s: bad readers count %d\n", __func__, old));
}
#else
static void
rms_int_debug_readers_inc(struct rmslock *rms)
{
}

static void
rms_int_debug_readers_dec(struct rmslock *rms)
{
}
#endif

static void
rms_int_readers_inc(struct rmslock *rms, struct rmslock_pcpu *pcpu)
{

	CRITICAL_ASSERT(curthread);
	rms_int_debug_readers_inc(rms);
	pcpu->readers++;
}

static void
rms_int_readers_dec(struct rmslock *rms, struct rmslock_pcpu *pcpu)
{

	CRITICAL_ASSERT(curthread);
	rms_int_debug_readers_dec(rms);
	pcpu->readers--;
}

/*
 * Public API
 */
void
rms_init(struct rmslock *rms, const char *name)
{

	rms->owner = RMS_NOOWNER;
	rms->writers = 0;
	rms->readers = 0;
	rms->debug_readers = 0;
	mtx_init(&rms->mtx, name, NULL, MTX_DEF | MTX_NEW);
	rms->pcpu = uma_zalloc_pcpu(pcpu_zone_8, M_WAITOK | M_ZERO);
}

void
rms_destroy(struct rmslock *rms)
{

	MPASS(rms->writers == 0);
	MPASS(rms->readers == 0);
	mtx_destroy(&rms->mtx);
	uma_zfree_pcpu(pcpu_zone_8, rms->pcpu);
}

static void __noinline
rms_rlock_fallback(struct rmslock *rms)
{

	rms_int_influx_exit(rms, rms_int_pcpu(rms));
	critical_exit();

	mtx_lock(&rms->mtx);
	while (rms->writers > 0)
		msleep(&rms->readers, &rms->mtx, PUSER - 1,
		    mtx_name(&rms->mtx), 0);
	critical_enter();
	rms_int_readers_inc(rms, rms_int_pcpu(rms));
	mtx_unlock(&rms->mtx);
	critical_exit();
}

void
rms_rlock(struct rmslock *rms)
{
	struct rmslock_pcpu *pcpu;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
	MPASS(atomic_load_ptr(&rms->owner) != curthread);

	critical_enter();
	pcpu = rms_int_pcpu(rms);
	rms_int_influx_enter(rms, pcpu);
	atomic_interrupt_fence();
	if (__predict_false(rms->writers > 0)) {
		rms_rlock_fallback(rms);
		return;
	}
	atomic_interrupt_fence();
	rms_int_readers_inc(rms, pcpu);
	atomic_interrupt_fence();
	rms_int_influx_exit(rms, pcpu);
	critical_exit();
}

int
rms_try_rlock(struct rmslock *rms)
{
	struct rmslock_pcpu *pcpu;

	MPASS(atomic_load_ptr(&rms->owner) != curthread);

	critical_enter();
	pcpu = rms_int_pcpu(rms);
	rms_int_influx_enter(rms, pcpu);
	atomic_interrupt_fence();
	if (__predict_false(rms->writers > 0)) {
		rms_int_influx_exit(rms, pcpu);
		critical_exit();
		return (0);
	}
	atomic_interrupt_fence();
	rms_int_readers_inc(rms, pcpu);
	atomic_interrupt_fence();
	rms_int_influx_exit(rms, pcpu);
	critical_exit();
	return (1);
}

static void __noinline
rms_runlock_fallback(struct rmslock *rms)
{

	rms_int_influx_exit(rms, rms_int_pcpu(rms));
	critical_exit();

	mtx_lock(&rms->mtx);
	MPASS(rms->writers > 0);
	MPASS(rms->readers > 0);
	MPASS(rms->debug_readers == rms->readers);
	rms_int_debug_readers_dec(rms);
	rms->readers--;
	if (rms->readers == 0)
		wakeup_one(&rms->writers);
	mtx_unlock(&rms->mtx);
}

void
rms_runlock(struct rmslock *rms)
{
	struct rmslock_pcpu *pcpu;

	critical_enter();
	pcpu = rms_int_pcpu(rms);
	rms_int_influx_enter(rms, pcpu);
	atomic_interrupt_fence();
	if (__predict_false(rms->writers > 0)) {
		rms_runlock_fallback(rms);
		return;
	}
	atomic_interrupt_fence();
	rms_int_readers_dec(rms, pcpu);
	atomic_interrupt_fence();
	rms_int_influx_exit(rms, pcpu);
	critical_exit();
}

struct rmslock_ipi {
	struct rmslock *rms;
	struct smp_rendezvous_cpus_retry_arg srcra;
};

static void
rms_action_func(void *arg)
{
	struct rmslock_ipi *rmsipi;
	struct rmslock_pcpu *pcpu;
	struct rmslock *rms;

	rmsipi = __containerof(arg, struct rmslock_ipi, srcra);
	rms = rmsipi->rms;
	pcpu = rms_int_pcpu(rms);

	if (pcpu->influx)
		return;
	if (pcpu->readers != 0) {
		atomic_add_int(&rms->readers, pcpu->readers);
		pcpu->readers = 0;
	}
	smp_rendezvous_cpus_done(arg);
}

static void
rms_wait_func(void *arg, int cpu)
{
	struct rmslock_ipi *rmsipi;
	struct rmslock_pcpu *pcpu;
	struct rmslock *rms;

	rmsipi = __containerof(arg, struct rmslock_ipi, srcra);
	rms = rmsipi->rms;
	pcpu = rms_int_remote_pcpu(rms, cpu);

	while (atomic_load_int(&pcpu->influx))
		cpu_spinwait();
}

#ifdef INVARIANTS
static void
rms_assert_no_pcpu_readers(struct rmslock *rms)
{
	struct rmslock_pcpu *pcpu;
	int cpu;

	CPU_FOREACH(cpu) {
		pcpu = rms_int_remote_pcpu(rms, cpu);
		if (pcpu->readers != 0) {
			panic("%s: got %d readers on cpu %d\n", __func__,
			    pcpu->readers, cpu);
		}
	}
}
#else
static void
rms_assert_no_pcpu_readers(struct rmslock *rms)
{
}
#endif

static void
rms_wlock_switch(struct rmslock *rms)
{
	struct rmslock_ipi rmsipi;

	MPASS(rms->readers == 0);
	MPASS(rms->writers == 1);

	rmsipi.rms = rms;

	smp_rendezvous_cpus_retry(all_cpus,
	    smp_no_rendezvous_barrier,
	    rms_action_func,
	    smp_no_rendezvous_barrier,
	    rms_wait_func,
	    &rmsipi.srcra);
}

void
rms_wlock(struct rmslock *rms)
{

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
	MPASS(atomic_load_ptr(&rms->owner) != curthread);

	mtx_lock(&rms->mtx);
	rms->writers++;
	if (rms->writers > 1) {
		msleep(&rms->owner, &rms->mtx, (PUSER - 1),
		    mtx_name(&rms->mtx), 0);
		MPASS(rms->readers == 0);
		KASSERT(rms->owner == RMS_TRANSIENT,
		    ("%s: unexpected owner value %p\n", __func__,
		    rms->owner));
		goto out_grab;
	}

	KASSERT(rms->owner == RMS_NOOWNER,
	    ("%s: unexpected owner value %p\n", __func__, rms->owner));

	rms_wlock_switch(rms);
	rms_assert_no_pcpu_readers(rms);

	if (rms->readers > 0) {
		msleep(&rms->writers, &rms->mtx, (PUSER - 1),
		    mtx_name(&rms->mtx), 0);
	}

out_grab:
	rms->owner = curthread;
	rms_assert_no_pcpu_readers(rms);
	mtx_unlock(&rms->mtx);
	MPASS(rms->readers == 0);
}

void
rms_wunlock(struct rmslock *rms)
{

	mtx_lock(&rms->mtx);
	KASSERT(rms->owner == curthread,
	    ("%s: unexpected owner value %p\n", __func__, rms->owner));
	MPASS(rms->writers >= 1);
	MPASS(rms->readers == 0);
	rms->writers--;
	if (rms->writers > 0) {
		wakeup_one(&rms->owner);
		rms->owner = RMS_TRANSIENT;
	} else {
		wakeup(&rms->readers);
		rms->owner = RMS_NOOWNER;
	}
	mtx_unlock(&rms->mtx);
}

void
rms_unlock(struct rmslock *rms)
{

	if (rms_wowned(rms))
		rms_wunlock(rms);
	else
		rms_runlock(rms);
}