/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 2007 Stephan Uphoff <ups@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the author nor the names of any co-contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Machine independent bits of reader/writer lock implementation.
 */
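
/*
 * Illustrative consumer sketch (not part of the implementation; the
 * variable names are placeholders).  Readers supply a caller-owned
 * rm_priotracker, writers lock the structure exclusively:
 *
 *	static struct rmlock example_lock;
 *	struct rm_priotracker tracker;
 *
 *	rm_init(&example_lock, "example");
 *
 *	rm_rlock(&example_lock, &tracker);
 *	... read-side work ...
 *	rm_runlock(&example_lock, &tracker);
 *
 *	rm_wlock(&example_lock);
 *	... write-side work ...
 *	rm_wunlock(&example_lock);
 *
 *	rm_destroy(&example_lock);
 */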

#include <sys/cdefs.h>
#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/systm.h>

#include <sys/kernel.h>
#include <sys/kdb.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rmlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/turnstile.h>
#include <sys/lock_profile.h>
#include <machine/cpu.h>
#include <vm/uma.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

/*
 * A cookie to mark destroyed rmlocks.  This is stored in the head of
 * rm_activeReaders.
 */
#define	RM_DESTROYED	((void *)0xdead)

#define	rm_destroyed(rm)						\
	(LIST_FIRST(&(rm)->rm_activeReaders) == RM_DESTROYED)

#define RMPF_ONQUEUE	1
#define RMPF_SIGNAL	2

#ifndef INVARIANTS
#define	_rm_assert(c, what, file, line)
#endif

static void	assert_rm(const struct lock_object *lock, int what);
#ifdef DDB
static void	db_show_rm(const struct lock_object *lock);
#endif
static void	lock_rm(struct lock_object *lock, uintptr_t how);
#ifdef KDTRACE_HOOKS
static int	owner_rm(const struct lock_object *lock, struct thread **owner);
#endif
static uintptr_t unlock_rm(struct lock_object *lock);

struct lock_class lock_class_rm = {
	.lc_name = "rm",
	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
	.lc_assert = assert_rm,
#ifdef DDB
	.lc_ddb_show = db_show_rm,
#endif
	.lc_lock = lock_rm,
	.lc_unlock = unlock_rm,
#ifdef KDTRACE_HOOKS
	.lc_owner = owner_rm,
#endif
};

struct lock_class lock_class_rm_sleepable = {
	.lc_name = "sleepable rm",
	.lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE,
	.lc_assert = assert_rm,
#ifdef DDB
	.lc_ddb_show = db_show_rm,
#endif
	.lc_lock = lock_rm,
	.lc_unlock = unlock_rm,
#ifdef KDTRACE_HOOKS
	.lc_owner = owner_rm,
#endif
};

static void
assert_rm(const struct lock_object *lock, int what)
{

	rm_assert((const struct rmlock *)lock, what);
}

static void
lock_rm(struct lock_object *lock, uintptr_t how)
{
	struct rmlock *rm;
	struct rm_priotracker *tracker;

	rm = (struct rmlock *)lock;
	if (how == 0)
		rm_wlock(rm);
	else {
		tracker = (struct rm_priotracker *)how;
		rm_rlock(rm, tracker);
	}
}

static uintptr_t
unlock_rm(struct lock_object *lock)
{
	struct thread *td;
	struct pcpu *pc;
	struct rmlock *rm;
	struct rm_queue *queue;
	struct rm_priotracker *tracker;
	uintptr_t how;

	rm = (struct rmlock *)lock;
	tracker = NULL;
	how = 0;
	rm_assert(rm, RA_LOCKED | RA_NOTRECURSED);
	if (rm_wowned(rm))
		rm_wunlock(rm);
	else {
		/*
		 * Find the right rm_priotracker structure for curthread.
		 * Its uniqueness is guaranteed by the fact that we already
		 * asserted the lock wasn't recursively acquired.
		 */
		critical_enter();
		td = curthread;
		pc = get_pcpu();
		for (queue = pc->pc_rm_queue.rmq_next;
		    queue != &pc->pc_rm_queue; queue = queue->rmq_next) {
			tracker = (struct rm_priotracker *)queue;
			if ((tracker->rmp_rmlock == rm) &&
			    (tracker->rmp_thread == td)) {
				how = (uintptr_t)tracker;
				break;
			}
		}
		KASSERT(tracker != NULL,
		    ("rm_priotracker not found when lock held in read mode"));
		critical_exit();
		rm_runlock(rm, tracker);
	}
	return (how);
}

#ifdef KDTRACE_HOOKS
static int
owner_rm(const struct lock_object *lock, struct thread **owner)
{
	const struct rmlock *rm;
	struct lock_class *lc;

	rm = (const struct rmlock *)lock;
	lc = LOCK_CLASS(&rm->rm_wlock_object);
	return (lc->lc_owner(&rm->rm_wlock_object, owner));
}
#endif

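/* Spin lock protecting the rm_activeReaders lists of all rmlocks. */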
static struct mtx rm_spinlock;

MTX_SYSINIT(rm_spinlock, &rm_spinlock, "rm_spinlock", MTX_SPIN);

/*
 * Add or remove tracker from per-cpu list.
 *
 * The per-cpu list can be traversed at any time in forward direction from an
 * interrupt on the *local* cpu.
 */
static void inline
rm_tracker_add(struct pcpu *pc, struct rm_priotracker *tracker)
{
	struct rm_queue *next;

	/* Initialize all tracker pointers */
	tracker->rmp_cpuQueue.rmq_prev = &pc->pc_rm_queue;
	next = pc->pc_rm_queue.rmq_next;
	tracker->rmp_cpuQueue.rmq_next = next;

	/* rmq_prev is not used during forward traversal. */
	next->rmq_prev = &tracker->rmp_cpuQueue;

	/* Update pointer to first element. */
	pc->pc_rm_queue.rmq_next = &tracker->rmp_cpuQueue;
}

/*
 * Return a count of the number of trackers the thread 'td' already
 * has on this CPU for the lock 'rm'.
 */
static int
rm_trackers_present(const struct pcpu *pc, const struct rmlock *rm,
    const struct thread *td)
{
	struct rm_queue *queue;
	struct rm_priotracker *tracker;
	int count;

	count = 0;
	for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
	    queue = queue->rmq_next) {
		tracker = (struct rm_priotracker *)queue;
		if ((tracker->rmp_rmlock == rm) && (tracker->rmp_thread == td))
			count++;
	}
	return (count);
}

static void inline
rm_tracker_remove(struct pcpu *pc, struct rm_priotracker *tracker)
{
	struct rm_queue *next, *prev;

	next = tracker->rmp_cpuQueue.rmq_next;
	prev = tracker->rmp_cpuQueue.rmq_prev;

	/* Not used during forward traversal. */
	next->rmq_prev = prev;

	/* Remove from list. */
	prev->rmq_next = next;
}

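/*
 * IPI handler run on each CPU that holds a read token: put every active
 * tracker for this lock on rm_activeReaders so that the pending writer
 * can wait for those readers to drain.
 */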
static void
rm_cleanIPI(void *arg)
{
	struct pcpu *pc;
	struct rmlock *rm = arg;
	struct rm_priotracker *tracker;
	struct rm_queue *queue;

	pc = get_pcpu();

	for (queue = pc->pc_rm_queue.rmq_next; queue != &pc->pc_rm_queue;
	    queue = queue->rmq_next) {
		tracker = (struct rm_priotracker *)queue;
		if (tracker->rmp_rmlock == rm && tracker->rmp_flags == 0) {
			tracker->rmp_flags = RMPF_ONQUEUE;
			mtx_lock_spin(&rm_spinlock);
			LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
			    rmp_qentry);
			mtx_unlock_spin(&rm_spinlock);
		}
	}
}

void
rm_init_flags(struct rmlock *rm, const char *name, int opts)
{
	struct lock_class *lc;
	int liflags, xflags;

	liflags = 0;
	if (!(opts & RM_NOWITNESS))
		liflags |= LO_WITNESS;
	if (opts & RM_RECURSE)
		liflags |= LO_RECURSABLE;
	if (opts & RM_NEW)
		liflags |= LO_NEW;
	if (opts & RM_DUPOK)
		liflags |= LO_DUPOK;
	rm->rm_writecpus = all_cpus;
	LIST_INIT(&rm->rm_activeReaders);
	if (opts & RM_SLEEPABLE) {
		liflags |= LO_SLEEPABLE;
		lc = &lock_class_rm_sleepable;
		xflags = (opts & RM_NEW ? SX_NEW : 0);
		sx_init_flags(&rm->rm_lock_sx, "rmlock_sx",
		    xflags | SX_NOWITNESS);
	} else {
		lc = &lock_class_rm;
		xflags = (opts & RM_NEW ? MTX_NEW : 0);
		mtx_init(&rm->rm_lock_mtx, name, "rmlock_mtx",
		    xflags | MTX_NOWITNESS);
	}
	lock_init(&rm->lock_object, lc, name, NULL, liflags);
}

void
rm_init(struct rmlock *rm, const char *name)
{

	rm_init_flags(rm, name, 0);
}

void
rm_destroy(struct rmlock *rm)
{

	rm_assert(rm, RA_UNLOCKED);
	LIST_FIRST(&rm->rm_activeReaders) = RM_DESTROYED;
	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		sx_destroy(&rm->rm_lock_sx);
	else
		mtx_destroy(&rm->rm_lock_mtx);
	lock_destroy(&rm->lock_object);
}

int
rm_wowned(const struct rmlock *rm)
{

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		return (sx_xlocked(&rm->rm_lock_sx));
	else
		return (mtx_owned(&rm->rm_lock_mtx));
}

void
rm_sysinit(void *arg)
{
	struct rm_args *args;

	args = arg;
	rm_init_flags(args->ra_rm, args->ra_desc, args->ra_flags);
}

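/*
 * Slow path for read locking: this CPU lacks a read token for the lock,
 * or preemption was pending on the fast path.  Re-check the token,
 * honor recursion for recursable locks, and otherwise take the backing
 * write lock briefly in order to pull a read token back to this CPU.
 */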
static __noinline int
_rm_rlock_hard(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
{
	struct pcpu *pc;

	critical_enter();
	pc = get_pcpu();

	/* Check if we just need to do a proper critical_exit. */
	if (!CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus)) {
		critical_exit();
		return (1);
	}

	/* Remove our tracker from the per-cpu list. */
	rm_tracker_remove(pc, tracker);

	/*
	 * Check to see if the IPI granted us the lock after all.  The load of
	 * rmp_flags must happen after the tracker is removed from the list.
	 */
	atomic_interrupt_fence();
	if (tracker->rmp_flags) {
		/* Just add back tracker - we hold the lock. */
		rm_tracker_add(pc, tracker);
		critical_exit();
		return (1);
	}

	/*
	 * We allow readers to acquire a lock even if a writer is blocked if
	 * the lock is recursive and the reader already holds the lock.
	 */
	if ((rm->lock_object.lo_flags & LO_RECURSABLE) != 0) {
		/*
		 * Just grant the lock if this thread already has a tracker
		 * for this lock on the per-cpu queue.
		 */
		if (rm_trackers_present(pc, rm, curthread) != 0) {
			mtx_lock_spin(&rm_spinlock);
			LIST_INSERT_HEAD(&rm->rm_activeReaders, tracker,
			    rmp_qentry);
			tracker->rmp_flags = RMPF_ONQUEUE;
			mtx_unlock_spin(&rm_spinlock);
			rm_tracker_add(pc, tracker);
			critical_exit();
			return (1);
		}
	}

	sched_unpin();
	critical_exit();

	if (trylock) {
		if (rm->lock_object.lo_flags & LO_SLEEPABLE) {
			if (!sx_try_xlock(&rm->rm_lock_sx))
				return (0);
		} else {
			if (!mtx_trylock(&rm->rm_lock_mtx))
				return (0);
		}
	} else {
		if (rm->lock_object.lo_flags & LO_SLEEPABLE) {
			THREAD_SLEEPING_OK();
			sx_xlock(&rm->rm_lock_sx);
			THREAD_NO_SLEEPING();
		} else
			mtx_lock(&rm->rm_lock_mtx);
	}

	critical_enter();
	pc = get_pcpu();
	CPU_CLR(pc->pc_cpuid, &rm->rm_writecpus);
	rm_tracker_add(pc, tracker);
	sched_pin();
	critical_exit();

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		sx_xunlock(&rm->rm_lock_sx);
	else
		mtx_unlock(&rm->rm_lock_mtx);

	return (1);
}

int
_rm_rlock(struct rmlock *rm, struct rm_priotracker *tracker, int trylock)
{
	struct thread *td = curthread;
	struct pcpu *pc;

	if (SCHEDULER_STOPPED())
		return (1);

	tracker->rmp_flags  = 0;
	tracker->rmp_thread = td;
	tracker->rmp_rmlock = rm;

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		THREAD_NO_SLEEPING();

	td->td_critnest++;	/* critical_enter(); */
	atomic_interrupt_fence();

	pc = cpuid_to_pcpu[td->td_oncpu];
	rm_tracker_add(pc, tracker);
	sched_pin();

	atomic_interrupt_fence();
	td->td_critnest--;

	/*
	 * Fast path to combine two common conditions into a single
	 * conditional jump.
	 */
	if (__predict_true(0 == (td->td_owepreempt |
	    CPU_ISSET(pc->pc_cpuid, &rm->rm_writecpus))))
		return (1);

	/* We do not have a read token and need to acquire one. */
	return (_rm_rlock_hard(rm, tracker, trylock));
}

static __noinline void
_rm_unlock_hard(struct thread *td, struct rm_priotracker *tracker)
{

	if (td->td_owepreempt) {
		td->td_critnest++;
		critical_exit();
	}

	if (!tracker->rmp_flags)
		return;

	mtx_lock_spin(&rm_spinlock);
	LIST_REMOVE(tracker, rmp_qentry);

	if (tracker->rmp_flags & RMPF_SIGNAL) {
		struct rmlock *rm;
		struct turnstile *ts;

		rm = tracker->rmp_rmlock;

		turnstile_chain_lock(&rm->lock_object);
		mtx_unlock_spin(&rm_spinlock);

		ts = turnstile_lookup(&rm->lock_object);

		turnstile_signal(ts, TS_EXCLUSIVE_QUEUE);
		turnstile_unpend(ts);
		turnstile_chain_unlock(&rm->lock_object);
	} else
		mtx_unlock_spin(&rm_spinlock);
}

void
_rm_runlock(struct rmlock *rm, struct rm_priotracker *tracker)
{
	struct pcpu *pc;
	struct thread *td = tracker->rmp_thread;

	if (SCHEDULER_STOPPED())
		return;

	td->td_critnest++;	/* critical_enter(); */
	atomic_interrupt_fence();

	pc = cpuid_to_pcpu[td->td_oncpu];
	rm_tracker_remove(pc, tracker);

	atomic_interrupt_fence();
	td->td_critnest--;
	sched_unpin();

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		THREAD_SLEEPING_OK();

	if (__predict_true(0 == (td->td_owepreempt | tracker->rmp_flags)))
		return;

	_rm_unlock_hard(td, tracker);
}

void
_rm_wlock(struct rmlock *rm)
{
	struct rm_priotracker *prio;
	struct turnstile *ts;
	cpuset_t readcpus;

	if (SCHEDULER_STOPPED())
		return;

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		sx_xlock(&rm->rm_lock_sx);
	else
		mtx_lock(&rm->rm_lock_mtx);

	if (CPU_CMP(&rm->rm_writecpus, &all_cpus)) {
		/* Get all read tokens back */
		readcpus = all_cpus;
		CPU_ANDNOT(&readcpus, &readcpus, &rm->rm_writecpus);
		rm->rm_writecpus = all_cpus;

		/*
		 * Assumes rm->rm_writecpus update is visible on other CPUs
		 * before rm_cleanIPI is called.
		 */
#ifdef SMP
		smp_rendezvous_cpus(readcpus,
		    smp_no_rendezvous_barrier,
		    rm_cleanIPI,
		    smp_no_rendezvous_barrier,
		    rm);

#else
		rm_cleanIPI(rm);
#endif

		mtx_lock_spin(&rm_spinlock);
		while ((prio = LIST_FIRST(&rm->rm_activeReaders)) != NULL) {
			ts = turnstile_trywait(&rm->lock_object);
			prio->rmp_flags = RMPF_ONQUEUE | RMPF_SIGNAL;
			mtx_unlock_spin(&rm_spinlock);
			turnstile_wait(ts, prio->rmp_thread,
			    TS_EXCLUSIVE_QUEUE);
			mtx_lock_spin(&rm_spinlock);
		}
		mtx_unlock_spin(&rm_spinlock);
	}
}

void
_rm_wunlock(struct rmlock *rm)
{

	if (rm->lock_object.lo_flags & LO_SLEEPABLE)
		sx_xunlock(&rm->rm_lock_sx);
	else
		mtx_unlock(&rm->rm_lock_mtx);
}

#if LOCK_DEBUG > 0

void
_rm_wlock_debug(struct rmlock *rm, const char *file, int line)
{

	if (SCHEDULER_STOPPED())
		return;

	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
	    ("rm_wlock() by idle thread %p on rmlock %s @ %s:%d",
	    curthread, rm->lock_object.lo_name, file, line));
	KASSERT(!rm_destroyed(rm),
	    ("rm_wlock() of destroyed rmlock @ %s:%d", file, line));
	_rm_assert(rm, RA_UNLOCKED, file, line);

	WITNESS_CHECKORDER(&rm->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE,
	    file, line, NULL);

	_rm_wlock(rm);

	LOCK_LOG_LOCK("RMWLOCK", &rm->lock_object, 0, 0, file, line);
	WITNESS_LOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
	TD_LOCKS_INC(curthread);
}

void
_rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
{

	if (SCHEDULER_STOPPED())
		return;

	KASSERT(!rm_destroyed(rm),
	    ("rm_wunlock() of destroyed rmlock @ %s:%d", file, line));
	_rm_assert(rm, RA_WLOCKED, file, line);
	WITNESS_UNLOCK(&rm->lock_object, LOP_EXCLUSIVE, file, line);
	LOCK_LOG_LOCK("RMWUNLOCK", &rm->lock_object, 0, 0, file, line);
	_rm_wunlock(rm);
	TD_LOCKS_DEC(curthread);
}

int
_rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
    int trylock, const char *file, int line)
{

	if (SCHEDULER_STOPPED())
		return (1);

#ifdef INVARIANTS
	if (!(rm->lock_object.lo_flags & LO_RECURSABLE) && !trylock) {
		critical_enter();
		KASSERT(rm_trackers_present(get_pcpu(), rm,
		    curthread) == 0,
		    ("rm_rlock: recursed on non-recursive rmlock %s @ %s:%d\n",
		    rm->lock_object.lo_name, file, line));
		critical_exit();
	}
#endif
	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
	    ("rm_rlock() by idle thread %p on rmlock %s @ %s:%d",
	    curthread, rm->lock_object.lo_name, file, line));
	KASSERT(!rm_destroyed(rm),
	    ("rm_rlock() of destroyed rmlock @ %s:%d", file, line));
	if (!trylock) {
		KASSERT(!rm_wowned(rm),
		    ("rm_rlock: wlock already held for %s @ %s:%d",
		    rm->lock_object.lo_name, file, line));
		WITNESS_CHECKORDER(&rm->lock_object,
		    LOP_NEWORDER | LOP_NOSLEEP, file, line, NULL);
	}

	if (_rm_rlock(rm, tracker, trylock)) {
		if (trylock)
			LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 1, file,
			    line);
		else
			LOCK_LOG_LOCK("RMRLOCK", &rm->lock_object, 0, 0, file,
			    line);
		WITNESS_LOCK(&rm->lock_object, LOP_NOSLEEP, file, line);
		TD_LOCKS_INC(curthread);
		return (1);
	} else if (trylock)
		LOCK_LOG_TRY("RMRLOCK", &rm->lock_object, 0, 0, file, line);

	return (0);
}

void
_rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
    const char *file, int line)
{

	if (SCHEDULER_STOPPED())
		return;

	KASSERT(!rm_destroyed(rm),
	    ("rm_runlock() of destroyed rmlock @ %s:%d", file, line));
	_rm_assert(rm, RA_RLOCKED, file, line);
	WITNESS_UNLOCK(&rm->lock_object, 0, file, line);
	LOCK_LOG_LOCK("RMRUNLOCK", &rm->lock_object, 0, 0, file, line);
	_rm_runlock(rm, tracker);
	TD_LOCKS_DEC(curthread);
}

#else

/*
 * Just strip out file and line arguments if no lock debugging is enabled in
 * the kernel - we are called from a kernel module.
 */
void
_rm_wlock_debug(struct rmlock *rm, const char *file, int line)
{

	_rm_wlock(rm);
}

void
_rm_wunlock_debug(struct rmlock *rm, const char *file, int line)
{

	_rm_wunlock(rm);
}

int
_rm_rlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
    int trylock, const char *file, int line)
{

	return (_rm_rlock(rm, tracker, trylock));
}

void
_rm_runlock_debug(struct rmlock *rm, struct rm_priotracker *tracker,
    const char *file, int line)
{

	_rm_runlock(rm, tracker);
}

#endif

#ifdef INVARIANT_SUPPORT
#ifndef INVARIANTS
#undef _rm_assert
#endif

/*
 * Note that this does not need to use witness_assert() for read lock
 * assertions since an exact count of read locks held by this thread
 * is computable.
 */
void
_rm_assert(const struct rmlock *rm, int what, const char *file, int line)
{
	int count;

	if (SCHEDULER_STOPPED())
		return;
	switch (what) {
	case RA_LOCKED:
	case RA_LOCKED | RA_RECURSED:
	case RA_LOCKED | RA_NOTRECURSED:
	case RA_RLOCKED:
	case RA_RLOCKED | RA_RECURSED:
	case RA_RLOCKED | RA_NOTRECURSED:
		/*
		 * Handle the write-locked case.  Unlike other
		 * primitives, writers can never recurse.
		 */
		if (rm_wowned(rm)) {
			if (what & RA_RLOCKED)
				panic("Lock %s exclusively locked @ %s:%d\n",
				    rm->lock_object.lo_name, file, line);
			if (what & RA_RECURSED)
				panic("Lock %s not recursed @ %s:%d\n",
				    rm->lock_object.lo_name, file, line);
			break;
		}

		critical_enter();
		count = rm_trackers_present(get_pcpu(), rm, curthread);
		critical_exit();

		if (count == 0)
			panic("Lock %s not %slocked @ %s:%d\n",
			    rm->lock_object.lo_name, (what & RA_RLOCKED) ?
			    "read " : "", file, line);
		if (count > 1) {
			if (what & RA_NOTRECURSED)
				panic("Lock %s recursed @ %s:%d\n",
				    rm->lock_object.lo_name, file, line);
		} else if (what & RA_RECURSED)
			panic("Lock %s not recursed @ %s:%d\n",
			    rm->lock_object.lo_name, file, line);
		break;
	case RA_WLOCKED:
		if (!rm_wowned(rm))
			panic("Lock %s not exclusively locked @ %s:%d\n",
			    rm->lock_object.lo_name, file, line);
		break;
	case RA_UNLOCKED:
		if (rm_wowned(rm))
			panic("Lock %s exclusively locked @ %s:%d\n",
			    rm->lock_object.lo_name, file, line);

		critical_enter();
		count = rm_trackers_present(get_pcpu(), rm, curthread);
		critical_exit();

		if (count != 0)
			panic("Lock %s read locked @ %s:%d\n",
			    rm->lock_object.lo_name, file, line);
		break;
	default:
		panic("Unknown rm lock assertion: %d @ %s:%d", what, file,
		    line);
	}
}
#endif /* INVARIANT_SUPPORT */

#ifdef DDB
static void
print_tracker(struct rm_priotracker *tr)
{
	struct thread *td;

	td = tr->rmp_thread;
	db_printf("   thread %p (tid %d, pid %d, \"%s\") {", td, td->td_tid,
	    td->td_proc->p_pid, td->td_name);
	if (tr->rmp_flags & RMPF_ONQUEUE) {
		db_printf("ONQUEUE");
		if (tr->rmp_flags & RMPF_SIGNAL)
			db_printf(",SIGNAL");
	} else
		db_printf("0");
	db_printf("}\n");
}

static void
db_show_rm(const struct lock_object *lock)
{
	struct rm_priotracker *tr;
	struct rm_queue *queue;
	const struct rmlock *rm;
	struct lock_class *lc;
	struct pcpu *pc;

	rm = (const struct rmlock *)lock;
	db_printf(" writecpus: ");
	ddb_display_cpuset(__DEQUALIFY(const cpuset_t *, &rm->rm_writecpus));
	db_printf("\n");
	db_printf(" per-CPU readers:\n");
	STAILQ_FOREACH(pc, &cpuhead, pc_allcpu)
		for (queue = pc->pc_rm_queue.rmq_next;
		    queue != &pc->pc_rm_queue; queue = queue->rmq_next) {
			tr = (struct rm_priotracker *)queue;
			if (tr->rmp_rmlock == rm)
				print_tracker(tr);
		}
	db_printf(" active readers:\n");
	LIST_FOREACH(tr, &rm->rm_activeReaders, rmp_qentry)
		print_tracker(tr);
	lc = LOCK_CLASS(&rm->rm_wlock_object);
	db_printf("Backing write-lock (%s):\n", lc->lc_name);
	lc->lc_ddb_show(&rm->rm_wlock_object);
}
#endif

/*
 * Read-mostly sleepable locks.
 *
 * These primitives allow both readers and writers to sleep. However, neither
 * readers nor writers are tracked and consequently there is no priority
 * propagation.
 *
 * They are intended to be used only when write-locking is almost never needed
 * (e.g., they can guard against unloading a kernel module) while read-locking
 * happens all the time.
 *
 * Concurrent writers take turns taking the lock while going off CPU. If this
 * is a concern for your use case, this is not the right primitive.
 *
 * Neither rms_rlock nor rms_runlock uses thread fences. Instead, interrupt
 * fences are inserted to ensure ordering with the code executed in the IPI
 * handler.
 *
 * No attempt is made to track which CPUs read-locked at least once,
 * consequently write locking sends IPIs to all of them. This will become a
 * problem at some point. The easiest way to lessen it is to provide a bitmap.
 */
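
/*
 * Illustrative usage sketch for the rms API implemented below (names are
 * placeholders, not part of the implementation):
 *
 *	static struct rmslock example_rms;
 *
 *	rms_init(&example_rms, "example");
 *
 *	rms_rlock(&example_rms);
 *	... read-side work, sleeping is allowed ...
 *	rms_runlock(&example_rms);
 *
 *	rms_wlock(&example_rms);
 *	... write-side work ...
 *	rms_wunlock(&example_rms);
 *
 *	rms_destroy(&example_rms);
 */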

#define	RMS_NOOWNER	((void *)0x1)
#define	RMS_TRANSIENT	((void *)0x2)
#define	RMS_FLAGMASK	0xf

struct rmslock_pcpu {
	int influx;
	int readers;
};

_Static_assert(sizeof(struct rmslock_pcpu) == 8, "bad size");

/*
 * Internal routines
 */
static struct rmslock_pcpu *
rms_int_pcpu(struct rmslock *rms)
{

	CRITICAL_ASSERT(curthread);
	return (zpcpu_get(rms->pcpu));
}

static struct rmslock_pcpu *
rms_int_remote_pcpu(struct rmslock *rms, int cpu)
{

	return (zpcpu_get_cpu(rms->pcpu, cpu));
}

static void
rms_int_influx_enter(struct rmslock *rms, struct rmslock_pcpu *pcpu)
{

	CRITICAL_ASSERT(curthread);
	MPASS(pcpu->influx == 0);
	pcpu->influx = 1;
}

static void
rms_int_influx_exit(struct rmslock *rms, struct rmslock_pcpu *pcpu)
{

	CRITICAL_ASSERT(curthread);
	MPASS(pcpu->influx == 1);
	pcpu->influx = 0;
}

#ifdef INVARIANTS
static void
rms_int_debug_readers_inc(struct rmslock *rms)
{
	int old;

	old = atomic_fetchadd_int(&rms->debug_readers, 1);
	KASSERT(old >= 0, ("%s: bad readers count %d\n", __func__, old));
}

static void
rms_int_debug_readers_dec(struct rmslock *rms)
{
	int old;

	old = atomic_fetchadd_int(&rms->debug_readers, -1);
	KASSERT(old > 0, ("%s: bad readers count %d\n", __func__, old));
}
#else
static void
rms_int_debug_readers_inc(struct rmslock *rms)
{
}

static void
rms_int_debug_readers_dec(struct rmslock *rms)
{
}
#endif

static void
rms_int_readers_inc(struct rmslock *rms, struct rmslock_pcpu *pcpu)
{

	CRITICAL_ASSERT(curthread);
	rms_int_debug_readers_inc(rms);
	pcpu->readers++;
}

static void
rms_int_readers_dec(struct rmslock *rms, struct rmslock_pcpu *pcpu)
{

	CRITICAL_ASSERT(curthread);
	rms_int_debug_readers_dec(rms);
	pcpu->readers--;
}

/*
 * Public API
 */
void
rms_init(struct rmslock *rms, const char *name)
{

	rms->owner = RMS_NOOWNER;
	rms->writers = 0;
	rms->readers = 0;
	rms->debug_readers = 0;
	mtx_init(&rms->mtx, name, NULL, MTX_DEF | MTX_NEW);
	rms->pcpu = uma_zalloc_pcpu(pcpu_zone_8, M_WAITOK | M_ZERO);
}

void
rms_destroy(struct rmslock *rms)
{

	MPASS(rms->writers == 0);
	MPASS(rms->readers == 0);
	mtx_destroy(&rms->mtx);
	uma_zfree_pcpu(pcpu_zone_8, rms->pcpu);
}

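/*
 * Slow path for rms_rlock: a writer is pending or active.  Back out of
 * the per-CPU fast path and sleep until all writers are gone, then
 * register as a reader while still holding the mutex.
 */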
static void __noinline
rms_rlock_fallback(struct rmslock *rms)
{

	rms_int_influx_exit(rms, rms_int_pcpu(rms));
	critical_exit();

	mtx_lock(&rms->mtx);
	while (rms->writers > 0)
		msleep(&rms->readers, &rms->mtx, PUSER - 1,
		    mtx_name(&rms->mtx), 0);
	critical_enter();
	rms_int_readers_inc(rms, rms_int_pcpu(rms));
	mtx_unlock(&rms->mtx);
	critical_exit();
	TD_LOCKS_INC(curthread);
}

void
rms_rlock(struct rmslock *rms)
{
	struct rmslock_pcpu *pcpu;

	rms_assert_rlock_ok(rms);
	MPASS(atomic_load_ptr(&rms->owner) != curthread);

	critical_enter();
	pcpu = rms_int_pcpu(rms);
	rms_int_influx_enter(rms, pcpu);
	atomic_interrupt_fence();
	if (__predict_false(rms->writers > 0)) {
		rms_rlock_fallback(rms);
		return;
	}
	atomic_interrupt_fence();
	rms_int_readers_inc(rms, pcpu);
	atomic_interrupt_fence();
	rms_int_influx_exit(rms, pcpu);
	critical_exit();
	TD_LOCKS_INC(curthread);
}

int
rms_try_rlock(struct rmslock *rms)
{
	struct rmslock_pcpu *pcpu;

	MPASS(atomic_load_ptr(&rms->owner) != curthread);

	critical_enter();
	pcpu = rms_int_pcpu(rms);
	rms_int_influx_enter(rms, pcpu);
	atomic_interrupt_fence();
	if (__predict_false(rms->writers > 0)) {
		rms_int_influx_exit(rms, pcpu);
		critical_exit();
		return (0);
	}
	atomic_interrupt_fence();
	rms_int_readers_inc(rms, pcpu);
	atomic_interrupt_fence();
	rms_int_influx_exit(rms, pcpu);
	critical_exit();
	TD_LOCKS_INC(curthread);
	return (1);
}

static void __noinline
rms_runlock_fallback(struct rmslock *rms)
{

	rms_int_influx_exit(rms, rms_int_pcpu(rms));
	critical_exit();

	mtx_lock(&rms->mtx);
	MPASS(rms->writers > 0);
	MPASS(rms->readers > 0);
	MPASS(rms->debug_readers == rms->readers);
	rms_int_debug_readers_dec(rms);
	rms->readers--;
	if (rms->readers == 0)
		wakeup_one(&rms->writers);
	mtx_unlock(&rms->mtx);
	TD_LOCKS_DEC(curthread);
}

void
rms_runlock(struct rmslock *rms)
{
	struct rmslock_pcpu *pcpu;

	critical_enter();
	pcpu = rms_int_pcpu(rms);
	rms_int_influx_enter(rms, pcpu);
	atomic_interrupt_fence();
	if (__predict_false(rms->writers > 0)) {
		rms_runlock_fallback(rms);
		return;
	}
	atomic_interrupt_fence();
	rms_int_readers_dec(rms, pcpu);
	atomic_interrupt_fence();
	rms_int_influx_exit(rms, pcpu);
	critical_exit();
	TD_LOCKS_DEC(curthread);
}

struct rmslock_ipi {
	struct rmslock *rms;
	struct smp_rendezvous_cpus_retry_arg srcra;
};

static void
rms_action_func(void *arg)
{
	struct rmslock_ipi *rmsipi;
	struct rmslock_pcpu *pcpu;
	struct rmslock *rms;

	rmsipi = __containerof(arg, struct rmslock_ipi, srcra);
	rms = rmsipi->rms;
	pcpu = rms_int_pcpu(rms);

	if (pcpu->influx)
		return;
	if (pcpu->readers != 0) {
		atomic_add_int(&rms->readers, pcpu->readers);
		pcpu->readers = 0;
	}
	smp_rendezvous_cpus_done(arg);
}

static void
rms_wait_func(void *arg, int cpu)
{
	struct rmslock_ipi *rmsipi;
	struct rmslock_pcpu *pcpu;
	struct rmslock *rms;

	rmsipi = __containerof(arg, struct rmslock_ipi, srcra);
	rms = rmsipi->rms;
	pcpu = rms_int_remote_pcpu(rms, cpu);

	while (atomic_load_int(&pcpu->influx))
		cpu_spinwait();
}

#ifdef INVARIANTS
static void
rms_assert_no_pcpu_readers(struct rmslock *rms)
{
	struct rmslock_pcpu *pcpu;
	int cpu;

	CPU_FOREACH(cpu) {
		pcpu = rms_int_remote_pcpu(rms, cpu);
		if (pcpu->readers != 0) {
			panic("%s: got %d readers on cpu %d\n", __func__,
			    pcpu->readers, cpu);
		}
	}
}
#else
static void
rms_assert_no_pcpu_readers(struct rmslock *rms)
{
}
#endif

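/*
 * Collapse all per-CPU reader counts into the central rms->readers
 * counter.  The rendezvous is retried for any CPU caught in the middle
 * of its reader fast path (influx set).
 */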
static void
rms_wlock_switch(struct rmslock *rms)
{
	struct rmslock_ipi rmsipi;

	MPASS(rms->readers == 0);
	MPASS(rms->writers == 1);

	rmsipi.rms = rms;

	smp_rendezvous_cpus_retry(all_cpus,
	    smp_no_rendezvous_barrier,
	    rms_action_func,
	    smp_no_rendezvous_barrier,
	    rms_wait_func,
	    &rmsipi.srcra);
}

void
rms_wlock(struct rmslock *rms)
{

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, __func__);
	MPASS(atomic_load_ptr(&rms->owner) != curthread);

	mtx_lock(&rms->mtx);
	rms->writers++;
	if (rms->writers > 1) {
		msleep(&rms->owner, &rms->mtx, (PUSER - 1),
		    mtx_name(&rms->mtx), 0);
		MPASS(rms->readers == 0);
		KASSERT(rms->owner == RMS_TRANSIENT,
		    ("%s: unexpected owner value %p\n", __func__,
		    rms->owner));
		goto out_grab;
	}

	KASSERT(rms->owner == RMS_NOOWNER,
	    ("%s: unexpected owner value %p\n", __func__, rms->owner));

	rms_wlock_switch(rms);
	rms_assert_no_pcpu_readers(rms);

	if (rms->readers > 0) {
		msleep(&rms->writers, &rms->mtx, (PUSER - 1),
		    mtx_name(&rms->mtx), 0);
	}

out_grab:
	rms->owner = curthread;
	rms_assert_no_pcpu_readers(rms);
	mtx_unlock(&rms->mtx);
	MPASS(rms->readers == 0);
	TD_LOCKS_INC(curthread);
}

void
rms_wunlock(struct rmslock *rms)
{

	mtx_lock(&rms->mtx);
	KASSERT(rms->owner == curthread,
	    ("%s: unexpected owner value %p\n", __func__, rms->owner));
	MPASS(rms->writers >= 1);
	MPASS(rms->readers == 0);
	rms->writers--;
	if (rms->writers > 0) {
		wakeup_one(&rms->owner);
		rms->owner = RMS_TRANSIENT;
	} else {
		wakeup(&rms->readers);
		rms->owner = RMS_NOOWNER;
	}
	mtx_unlock(&rms->mtx);
	TD_LOCKS_DEC(curthread);
}

void
rms_unlock(struct rmslock *rms)
{

	if (rms_wowned(rms))
		rms_wunlock(rms);
	else
		rms_runlock(rms);
}
