/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2006 John Baldwin <jhb@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Machine independent bits of reader/writer lock implementation.
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_hwpmc_hooks.h"
#include "opt_no_adaptive_rwlocks.h"

#include <sys/param.h>
#include <sys/kdb.h>
#include <sys/ktr.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/turnstile.h>

#include <machine/cpu.h>

#if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS)
#define	ADAPTIVE_RWLOCKS
#endif

#ifdef HWPMC_HOOKS
#include <sys/pmckern.h>
PMC_SOFT_DECLARE( , , lock, failed);
#endif

/*
 * Return the rwlock address when the lock cookie address is provided.
 * This functionality assumes that struct rwlock has a member named rw_lock.
 */
#define	rwlock2rw(c)	(__containerof(c, struct rwlock, rw_lock))

#ifdef DDB
#include <ddb/ddb.h>

static void	db_show_rwlock(const struct lock_object *lock);
#endif
static void	assert_rw(const struct lock_object *lock, int what);
static void	lock_rw(struct lock_object *lock, uintptr_t how);
#ifdef KDTRACE_HOOKS
static int	owner_rw(const struct lock_object *lock, struct thread **owner);
#endif
static uintptr_t unlock_rw(struct lock_object *lock);

struct lock_class lock_class_rw = {
	.lc_name = "rw",
	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE,
	.lc_assert = assert_rw,
#ifdef DDB
	.lc_ddb_show = db_show_rwlock,
#endif
	.lc_lock = lock_rw,
	.lc_unlock = unlock_rw,
#ifdef KDTRACE_HOOKS
	.lc_owner = owner_rw,
#endif
};

#ifdef ADAPTIVE_RWLOCKS
#ifdef RWLOCK_CUSTOM_BACKOFF
static u_short __read_frequently rowner_retries;
static u_short __read_frequently rowner_loops;
static SYSCTL_NODE(_debug, OID_AUTO, rwlock,
    CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
    "rwlock debugging");
SYSCTL_U16(_debug_rwlock, OID_AUTO, retry, CTLFLAG_RW, &rowner_retries, 0, "");
SYSCTL_U16(_debug_rwlock, OID_AUTO, loops, CTLFLAG_RW, &rowner_loops, 0, "");

static struct lock_delay_config __read_frequently rw_delay;

SYSCTL_U16(_debug_rwlock, OID_AUTO, delay_base, CTLFLAG_RW, &rw_delay.base,
    0, "");
SYSCTL_U16(_debug_rwlock, OID_AUTO, delay_max, CTLFLAG_RW, &rw_delay.max,
    0, "");

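/*
 * Initialize the adaptive spinning tunables at boot time.
 */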
static void
rw_lock_delay_init(void *arg __unused)
{

	lock_delay_default_init(&rw_delay);
	rowner_retries = 10;
	rowner_loops = max(10000, rw_delay.max);
}
LOCK_DELAY_SYSINIT(rw_lock_delay_init);
#else
#define rw_delay	locks_delay
#define rowner_retries	locks_delay_retries
#define rowner_loops	locks_delay_loops
#endif
#endif

/*
 * Return a pointer to the owning thread if the lock is write-locked or
 * NULL if the lock is unlocked or read-locked.
 */

#define	lv_rw_wowner(v)							\
	((v) & RW_LOCK_READ ? NULL :					\
	 (struct thread *)RW_OWNER((v)))

#define	rw_wowner(rw)	lv_rw_wowner(RW_READ_VALUE(rw))

/*
 * Returns whether a write owner is recursed.  Write ownership is not
 * assured here and should be checked beforehand.
 */
#define	rw_recursed(rw)		((rw)->rw_recurse != 0)

/*
 * Return true if curthread holds the lock.
 */
#define	rw_wlocked(rw)		(rw_wowner((rw)) == curthread)

/*
 * Return a pointer to the owning thread for this lock who should receive
 * any priority lent by threads that block on this lock.  Currently this
 * is identical to rw_wowner().
 */
#define	rw_owner(rw)		rw_wowner(rw)

#ifndef INVARIANTS
#define	__rw_assert(c, what, file, line)
#endif

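/*
 * Generic lock class methods for rw locks; these back the lc_* hooks in
 * lock_class_rw above.
 */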
void
assert_rw(const struct lock_object *lock, int what)
{

	rw_assert((const struct rwlock *)lock, what);
}

void
lock_rw(struct lock_object *lock, uintptr_t how)
{
	struct rwlock *rw;

	rw = (struct rwlock *)lock;
	if (how)
		rw_rlock(rw);
	else
		rw_wlock(rw);
}

uintptr_t
unlock_rw(struct lock_object *lock)
{
	struct rwlock *rw;

	rw = (struct rwlock *)lock;
	rw_assert(rw, RA_LOCKED | LA_NOTRECURSED);
	if (rw->rw_lock & RW_LOCK_READ) {
		rw_runlock(rw);
		return (1);
	} else {
		rw_wunlock(rw);
		return (0);
	}
}

#ifdef KDTRACE_HOOKS
int
owner_rw(const struct lock_object *lock, struct thread **owner)
{
	const struct rwlock *rw = (const struct rwlock *)lock;
	uintptr_t x = rw->rw_lock;

	*owner = rw_wowner(rw);
	return ((x & RW_LOCK_READ) != 0 ?  (RW_READERS(x) != 0) :
	    (*owner != NULL));
}
#endif

void
_rw_init_flags(volatile uintptr_t *c, const char *name, int opts)
{
	struct rwlock *rw;
	int flags;

	rw = rwlock2rw(c);

	MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET |
	    RW_RECURSE | RW_NEW)) == 0);
	ASSERT_ATOMIC_LOAD_PTR(rw->rw_lock,
	    ("%s: rw_lock not aligned for %s: %p", __func__, name,
	    &rw->rw_lock));

	flags = LO_UPGRADABLE;
	if (opts & RW_DUPOK)
		flags |= LO_DUPOK;
	if (opts & RW_NOPROFILE)
		flags |= LO_NOPROFILE;
	if (!(opts & RW_NOWITNESS))
		flags |= LO_WITNESS;
	if (opts & RW_RECURSE)
		flags |= LO_RECURSABLE;
	if (opts & RW_QUIET)
		flags |= LO_QUIET;
	if (opts & RW_NEW)
		flags |= LO_NEW;

	lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags);
	rw->rw_lock = RW_UNLOCKED;
	rw->rw_recurse = 0;
}

void
_rw_destroy(volatile uintptr_t *c)
{
	struct rwlock *rw;

	rw = rwlock2rw(c);

	KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock %p not unlocked", rw));
	KASSERT(rw->rw_recurse == 0, ("rw lock %p still recursed", rw));
	rw->rw_lock = RW_DESTROYED;
	lock_destroy(&rw->lock_object);
}

void
rw_sysinit(void *arg)
{
	struct rw_args *args;

	args = arg;
	rw_init_flags((struct rwlock *)args->ra_rw, args->ra_desc,
	    args->ra_flags);
}

int
_rw_wowned(const volatile uintptr_t *c)
{

	return (rw_wowner(rwlock2rw(c)) == curthread);
}

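/*
 * Acquire a write lock.  The fast path is a single atomic compare-and-set
 * from RW_UNLOCKED to the current thread; contention is handled by the
 * hard path below.
 *
 * Typical consumer usage (a sketch; 'example_lock' is hypothetical):
 *
 *	struct rwlock example_lock;
 *	rw_init(&example_lock, "example");
 *	rw_wlock(&example_lock);
 *	... modify protected data ...
 *	rw_wunlock(&example_lock);
 */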
void
_rw_wlock_cookie(volatile uintptr_t *c, const char *file, int line)
{
	struct rwlock *rw;
	uintptr_t tid, v;

	rw = rwlock2rw(c);

	KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() ||
	    !TD_IS_IDLETHREAD(curthread),
	    ("rw_wlock() by idle thread %p on rwlock %s @ %s:%d",
	    curthread, rw->lock_object.lo_name, file, line));
	KASSERT(rw->rw_lock != RW_DESTROYED,
	    ("rw_wlock() of destroyed rwlock @ %s:%d", file, line));
	WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
	    line, NULL);
	tid = (uintptr_t)curthread;
	v = RW_UNLOCKED;
	if (!_rw_write_lock_fetch(rw, &v, tid))
		_rw_wlock_hard(rw, v, file, line);
	else
		LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw,
		    0, 0, file, line, LOCKSTAT_WRITER);

	LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line);
	WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
	TD_LOCKS_INC(curthread);
}

int
__rw_try_wlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF)
{
	struct thread *td;
	uintptr_t tid, v;
	int rval;
	bool recursed;

	td = curthread;
	tid = (uintptr_t)td;
	if (SCHEDULER_STOPPED())
		return (1);

	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(td),
	    ("rw_try_wlock() by idle thread %p on rwlock %s @ %s:%d",
	    curthread, rw->lock_object.lo_name, file, line));
	KASSERT(rw->rw_lock != RW_DESTROYED,
	    ("rw_try_wlock() of destroyed rwlock @ %s:%d", file, line));

	rval = 1;
	recursed = false;
	v = RW_UNLOCKED;
	for (;;) {
		if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid))
			break;
		if (v == RW_UNLOCKED)
			continue;
		if (v == tid && (rw->lock_object.lo_flags & LO_RECURSABLE)) {
			rw->rw_recurse++;
			atomic_set_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED);
			break;
		}
		rval = 0;
		break;
	}

	LOCK_LOG_TRY("WLOCK", &rw->lock_object, 0, rval, file, line);
	if (rval) {
		WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
		    file, line);
		if (!recursed)
			LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire,
			    rw, 0, 0, file, line, LOCKSTAT_WRITER);
		TD_LOCKS_INC(curthread);
	}
	return (rval);
}

int
__rw_try_wlock(volatile uintptr_t *c, const char *file, int line)
{
	struct rwlock *rw;

	rw = rwlock2rw(c);
	return (__rw_try_wlock_int(rw LOCK_FILE_LINE_ARG));
}

void
_rw_wunlock_cookie(volatile uintptr_t *c, const char *file, int line)
{
	struct rwlock *rw;

	rw = rwlock2rw(c);

	KASSERT(rw->rw_lock != RW_DESTROYED,
	    ("rw_wunlock() of destroyed rwlock @ %s:%d", file, line));
	__rw_assert(c, RA_WLOCKED, file, line);
	WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
	LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file,
	    line);

#ifdef LOCK_PROFILING
	_rw_wunlock_hard(rw, (uintptr_t)curthread, file, line);
#else
	__rw_wunlock(rw, curthread, file, line);
#endif

	TD_LOCKS_DEC(curthread);
}

/*
 * Determines whether a new reader can acquire a lock.  Succeeds if the
 * reader already owns a read lock and the lock is read-locked, which
 * prevents deadlock from reader recursion.  Also succeeds if the lock
 * is unlocked and has no write waiters or spinners.  Failing otherwise
 * prioritizes writers before readers.
 */
static bool __always_inline
__rw_can_read(struct thread *td, uintptr_t v, bool fp)
{

	if ((v & (RW_LOCK_READ | RW_LOCK_WRITE_WAITERS | RW_LOCK_WRITE_SPINNER))
	    == RW_LOCK_READ)
		return (true);
	if (!fp && td->td_rw_rlocks && (v & RW_LOCK_READ))
		return (true);
	return (false);
}

static bool __always_inline
__rw_rlock_try(struct rwlock *rw, struct thread *td, uintptr_t *vp, bool fp
    LOCK_FILE_LINE_ARG_DEF)
{

	/*
	 * Handle the easy case.  If no other thread has a write
	 * lock, then try to bump up the count of read locks.  Note
	 * that we have to preserve the current state of the
	 * RW_LOCK_WRITE_WAITERS flag.  If we fail to acquire a
	 * read lock, then rw_lock must have changed, so restart
	 * the loop.  Note that this handles the case of a
	 * completely unlocked rwlock since such a lock is encoded
	 * as a read lock with no waiters.
	 */
	while (__rw_can_read(td, *vp, fp)) {
		if (atomic_fcmpset_acq_ptr(&rw->rw_lock, vp,
			*vp + RW_ONE_READER)) {
			if (LOCK_LOG_TEST(&rw->lock_object, 0))
				CTR4(KTR_LOCK,
				    "%s: %p succeed %p -> %p", __func__,
				    rw, (void *)*vp,
				    (void *)(*vp + RW_ONE_READER));
			td->td_rw_rlocks++;
			return (true);
		}
	}
	return (false);
}

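/*
 * Slow path for read lock acquisition: under ADAPTIVE_RWLOCKS, spin while
 * the write owner is running on another CPU or while the read state is
 * expected to change shortly; otherwise block on the turnstile.
 */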
static void __noinline
__rw_rlock_hard(struct rwlock *rw, struct thread *td, uintptr_t v
    LOCK_FILE_LINE_ARG_DEF)
{
	struct turnstile *ts;
	struct thread *owner;
#ifdef ADAPTIVE_RWLOCKS
	int spintries = 0;
	int i, n;
#endif
#ifdef LOCK_PROFILING
	uint64_t waittime = 0;
	int contested = 0;
#endif
#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS)
	struct lock_delay_arg lda;
#endif
#ifdef KDTRACE_HOOKS
	u_int sleep_cnt = 0;
	int64_t sleep_time = 0;
	int64_t all_time = 0;
#endif
#if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING)
	uintptr_t state = 0;
	int doing_lockprof = 0;
#endif

#ifdef KDTRACE_HOOKS
	if (LOCKSTAT_PROFILE_ENABLED(rw__acquire)) {
		if (__rw_rlock_try(rw, td, &v, false LOCK_FILE_LINE_ARG))
			goto out_lockstat;
		doing_lockprof = 1;
		all_time -= lockstat_nsecs(&rw->lock_object);
		state = v;
	}
#endif
#ifdef LOCK_PROFILING
	doing_lockprof = 1;
	state = v;
#endif

	if (SCHEDULER_STOPPED())
		return;

#if defined(ADAPTIVE_RWLOCKS)
	lock_delay_arg_init(&lda, &rw_delay);
#elif defined(KDTRACE_HOOKS)
	lock_delay_arg_init_noadapt(&lda);
#endif

#ifdef HWPMC_HOOKS
	PMC_SOFT_CALL( , , lock, failed);
#endif
	lock_profile_obtain_lock_failed(&rw->lock_object, false,
	    &contested, &waittime);

	THREAD_CONTENDS_ON_LOCK(&rw->lock_object);

	for (;;) {
		if (__rw_rlock_try(rw, td, &v, false LOCK_FILE_LINE_ARG))
			break;
#ifdef KDTRACE_HOOKS
		lda.spin_cnt++;
#endif

#ifdef ADAPTIVE_RWLOCKS
		/*
		 * If the owner is running on another CPU, spin until
		 * the owner stops running or the state of the lock
		 * changes.
		 */
		if ((v & RW_LOCK_READ) == 0) {
			owner = (struct thread *)RW_OWNER(v);
			if (TD_IS_RUNNING(owner)) {
				if (LOCK_LOG_TEST(&rw->lock_object, 0))
					CTR3(KTR_LOCK,
					    "%s: spinning on %p held by %p",
					    __func__, rw, owner);
				KTR_STATE1(KTR_SCHED, "thread",
				    sched_tdname(curthread), "spinning",
				    "lockname:\"%s\"", rw->lock_object.lo_name);
				do {
					lock_delay(&lda);
					v = RW_READ_VALUE(rw);
					owner = lv_rw_wowner(v);
				} while (owner != NULL && TD_IS_RUNNING(owner));
				KTR_STATE0(KTR_SCHED, "thread",
				    sched_tdname(curthread), "running");
				continue;
			}
		} else {
			if ((v & RW_LOCK_WRITE_SPINNER) && RW_READERS(v) == 0) {
				MPASS(!__rw_can_read(td, v, false));
				lock_delay_spin(2);
				v = RW_READ_VALUE(rw);
				continue;
			}
			if (spintries < rowner_retries) {
				spintries++;
				KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread),
				    "spinning", "lockname:\"%s\"",
				    rw->lock_object.lo_name);
				n = RW_READERS(v);
				for (i = 0; i < rowner_loops; i += n) {
					lock_delay_spin(n);
					v = RW_READ_VALUE(rw);
					if (!(v & RW_LOCK_READ))
						break;
					n = RW_READERS(v);
					if (n == 0)
						break;
					if (__rw_can_read(td, v, false))
						break;
				}
#ifdef KDTRACE_HOOKS
				lda.spin_cnt += rowner_loops - i;
#endif
				KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
				    "running");
				if (i < rowner_loops)
					continue;
			}
		}
#endif

		/*
		 * Okay, now it's the hard case.  Some other thread already
		 * has a write lock or there are write waiters present;
		 * acquire the turnstile lock so we can begin the process
		 * of blocking.
		 */
		ts = turnstile_trywait(&rw->lock_object);

		/*
		 * The lock might have been released while we spun, so
		 * recheck its state and restart the loop if needed.
		 */
		v = RW_READ_VALUE(rw);
retry_ts:
		if (((v & RW_LOCK_WRITE_SPINNER) && RW_READERS(v) == 0) ||
		    __rw_can_read(td, v, false)) {
			turnstile_cancel(ts);
			continue;
		}

		owner = lv_rw_wowner(v);

#ifdef ADAPTIVE_RWLOCKS
		/*
		 * The current lock owner might have started executing
		 * on another CPU (or the lock could have changed
		 * owners) while we were waiting on the turnstile
		 * chain lock.  If so, drop the turnstile lock and try
		 * again.
		 */
		if (owner != NULL) {
			if (TD_IS_RUNNING(owner)) {
				turnstile_cancel(ts);
				continue;
			}
		}
#endif

		/*
		 * The lock is held in write mode or it already has waiters.
		 */
		MPASS(!__rw_can_read(td, v, false));

		/*
		 * If the RW_LOCK_READ_WAITERS flag is already set, then
		 * we can go ahead and block.  If it is not set then try
		 * to set it.  If we fail to set it drop the turnstile
		 * lock and restart the loop.
		 */
		if (!(v & RW_LOCK_READ_WAITERS)) {
			if (!atomic_fcmpset_ptr(&rw->rw_lock, &v,
			    v | RW_LOCK_READ_WAITERS))
				goto retry_ts;
			if (LOCK_LOG_TEST(&rw->lock_object, 0))
				CTR2(KTR_LOCK, "%s: %p set read waiters flag",
				    __func__, rw);
		}

		/*
		 * We were unable to acquire the lock and the read waiters
		 * flag is set, so we must block on the turnstile.
		 */
		if (LOCK_LOG_TEST(&rw->lock_object, 0))
			CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
			    rw);
#ifdef KDTRACE_HOOKS
		sleep_time -= lockstat_nsecs(&rw->lock_object);
#endif
		MPASS(owner == rw_owner(rw));
		turnstile_wait(ts, owner, TS_SHARED_QUEUE);
#ifdef KDTRACE_HOOKS
		sleep_time += lockstat_nsecs(&rw->lock_object);
		sleep_cnt++;
#endif
		if (LOCK_LOG_TEST(&rw->lock_object, 0))
			CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
			    __func__, rw);
		v = RW_READ_VALUE(rw);
	}
	THREAD_CONTENTION_DONE(&rw->lock_object);
#if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING)
	if (__predict_true(!doing_lockprof))
		return;
#endif
#ifdef KDTRACE_HOOKS
	all_time += lockstat_nsecs(&rw->lock_object);
	if (sleep_time)
		LOCKSTAT_RECORD4(rw__block, rw, sleep_time,
		    LOCKSTAT_READER, (state & RW_LOCK_READ) == 0,
		    (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state));

	/* Record only the loops spinning and not sleeping. */
	if (lda.spin_cnt > sleep_cnt)
		LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time,
		    LOCKSTAT_READER, (state & RW_LOCK_READ) == 0,
		    (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state));
out_lockstat:
#endif
	/*
	 * TODO: acquire "owner of record" here.  Here be turnstile dragons
	 * however.  turnstiles don't like owners changing between calls to
	 * turnstile_wait() currently.
	 */
	LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested,
	    waittime, file, line, LOCKSTAT_READER);
}

void
__rw_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF)
{
	struct thread *td;
	uintptr_t v;

	td = curthread;

	KASSERT(kdb_active != 0 || SCHEDULER_STOPPED() ||
	    !TD_IS_IDLETHREAD(td),
	    ("rw_rlock() by idle thread %p on rwlock %s @ %s:%d",
	    td, rw->lock_object.lo_name, file, line));
	KASSERT(rw->rw_lock != RW_DESTROYED,
	    ("rw_rlock() of destroyed rwlock @ %s:%d", file, line));
	KASSERT(rw_wowner(rw) != td,
	    ("rw_rlock: wlock already held for %s @ %s:%d",
	    rw->lock_object.lo_name, file, line));
	WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line, NULL);

	v = RW_READ_VALUE(rw);
	if (__predict_false(LOCKSTAT_PROFILE_ENABLED(rw__acquire) ||
	    !__rw_rlock_try(rw, td, &v, true LOCK_FILE_LINE_ARG)))
		__rw_rlock_hard(rw, td, v LOCK_FILE_LINE_ARG);
	else
		lock_profile_obtain_lock_success(&rw->lock_object, false, 0, 0,
		    file, line);

	LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line);
	WITNESS_LOCK(&rw->lock_object, 0, file, line);
	TD_LOCKS_INC(curthread);
}

void
__rw_rlock(volatile uintptr_t *c, const char *file, int line)
{
	struct rwlock *rw;

	rw = rwlock2rw(c);
	__rw_rlock_int(rw LOCK_FILE_LINE_ARG);
}

int
__rw_try_rlock_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF)
{
	uintptr_t x;

	if (SCHEDULER_STOPPED())
		return (1);

	KASSERT(kdb_active != 0 || !TD_IS_IDLETHREAD(curthread),
	    ("rw_try_rlock() by idle thread %p on rwlock %s @ %s:%d",
	    curthread, rw->lock_object.lo_name, file, line));

	x = rw->rw_lock;
	for (;;) {
		KASSERT(rw->rw_lock != RW_DESTROYED,
		    ("rw_try_rlock() of destroyed rwlock @ %s:%d", file, line));
		if (!(x & RW_LOCK_READ))
			break;
		if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &x, x + RW_ONE_READER)) {
			LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 1, file,
			    line);
			WITNESS_LOCK(&rw->lock_object, LOP_TRYLOCK, file, line);
			LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire,
			    rw, 0, 0, file, line, LOCKSTAT_READER);
			TD_LOCKS_INC(curthread);
			curthread->td_rw_rlocks++;
			return (1);
		}
	}

	LOCK_LOG_TRY("RLOCK", &rw->lock_object, 0, 0, file, line);
	return (0);
}

int
__rw_try_rlock(volatile uintptr_t *c, const char *file, int line)
{
	struct rwlock *rw;

	rw = rwlock2rw(c);
	return (__rw_try_rlock_int(rw LOCK_FILE_LINE_ARG));
}

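/*
 * Fast path for read unlock: atomically drop one reader as long as we are
 * not the last reader while waiters are present.
 */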
static bool __always_inline
__rw_runlock_try(struct rwlock *rw, struct thread *td, uintptr_t *vp)
{

	for (;;) {
		if (RW_READERS(*vp) > 1 || !(*vp & RW_LOCK_WAITERS)) {
			if (atomic_fcmpset_rel_ptr(&rw->rw_lock, vp,
			    *vp - RW_ONE_READER)) {
				if (LOCK_LOG_TEST(&rw->lock_object, 0))
					CTR4(KTR_LOCK,
					    "%s: %p succeeded %p -> %p",
					    __func__, rw, (void *)*vp,
					    (void *)(*vp - RW_ONE_READER));
				td->td_rw_rlocks--;
				return (true);
			}
			continue;
		}
		break;
	}
	return (false);
}

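/*
 * Slow path for read unlock: we believe we are the last reader and waiters
 * are present, so release the lock under the turnstile chain lock and wake
 * the appropriate queue of waiters.
 */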
static void __noinline
__rw_runlock_hard(struct rwlock *rw, struct thread *td, uintptr_t v
    LOCK_FILE_LINE_ARG_DEF)
{
	struct turnstile *ts;
	uintptr_t setv, queue;

	if (SCHEDULER_STOPPED())
		return;

	if (__rw_runlock_try(rw, td, &v))
		goto out_lockstat;

	/*
	 * Ok, we know we have waiters and we think we are the
	 * last reader, so grab the turnstile lock.
	 */
	turnstile_chain_lock(&rw->lock_object);
	v = RW_READ_VALUE(rw);
	for (;;) {
		if (__rw_runlock_try(rw, td, &v))
			break;

		MPASS(v & RW_LOCK_WAITERS);

		/*
		 * Try to drop our lock, leaving the lock in an unlocked
		 * state.
		 *
		 * If you wanted to do explicit lock handoff you'd have to
		 * do it here.  You'd also want to use turnstile_signal()
		 * and you'd have to handle the race where a higher
		 * priority thread blocks on the write lock before the
		 * thread you wakeup actually runs and have the new thread
		 * "steal" the lock.  For now it's a lot simpler to just
		 * wakeup all of the waiters.
		 *
		 * As above, if we fail, then another thread might have
		 * acquired a read lock, so drop the turnstile lock and
		 * restart.
		 */
		setv = RW_UNLOCKED;
		queue = TS_SHARED_QUEUE;
		if (v & RW_LOCK_WRITE_WAITERS) {
			queue = TS_EXCLUSIVE_QUEUE;
			setv |= (v & RW_LOCK_READ_WAITERS);
		}
		setv |= (v & RW_LOCK_WRITE_SPINNER);
		if (!atomic_fcmpset_rel_ptr(&rw->rw_lock, &v, setv))
			continue;
		if (LOCK_LOG_TEST(&rw->lock_object, 0))
			CTR2(KTR_LOCK, "%s: %p last succeeded with waiters",
			    __func__, rw);

		/*
		 * Ok.  The lock is released and all that's left is to
		 * wake up the waiters.  Note that the lock might not be
		 * free anymore, but in that case the writers will just
		 * block again if they run before the new lock holder(s)
		 * release the lock.
		 */
		ts = turnstile_lookup(&rw->lock_object);
		MPASS(ts != NULL);
		turnstile_broadcast(ts, queue);
		turnstile_unpend(ts);
		td->td_rw_rlocks--;
		break;
	}
	turnstile_chain_unlock(&rw->lock_object);
out_lockstat:
	LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, LOCKSTAT_READER);
}

void
_rw_runlock_cookie_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF)
{
	struct thread *td;
	uintptr_t v;

	KASSERT(rw->rw_lock != RW_DESTROYED,
	    ("rw_runlock() of destroyed rwlock @ %s:%d", file, line));
	__rw_assert(&rw->rw_lock, RA_RLOCKED, file, line);
	WITNESS_UNLOCK(&rw->lock_object, 0, file, line);
	LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line);

	td = curthread;
	v = RW_READ_VALUE(rw);

	if (__predict_false(LOCKSTAT_PROFILE_ENABLED(rw__release) ||
	    !__rw_runlock_try(rw, td, &v)))
		__rw_runlock_hard(rw, td, v LOCK_FILE_LINE_ARG);
	else
		lock_profile_release_lock(&rw->lock_object, false);

	TD_LOCKS_DEC(curthread);
}

void
_rw_runlock_cookie(volatile uintptr_t *c, const char *file, int line)
{
	struct rwlock *rw;

	rw = rwlock2rw(c);
	_rw_runlock_cookie_int(rw LOCK_FILE_LINE_ARG);
}

#ifdef ADAPTIVE_RWLOCKS
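/*
 * Drop the critical section entered while advertising ourselves as a write
 * spinner, unless the RW_LOCK_WRITE_SPINNER bit is still set in 'v'.
 */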
static inline void
rw_drop_critical(uintptr_t v, bool *in_critical, int *extra_work)
{

	if (v & RW_LOCK_WRITE_SPINNER)
		return;
	if (*in_critical) {
		critical_exit();
		*in_critical = false;
		(*extra_work)--;
	}
}
#else
#define rw_drop_critical(v, in_critical, extra_work) do { } while (0)
#endif

/*
 * This function is called when we are unable to obtain a write lock on the
 * first try.  This means that at least one other thread holds either a
 * read or write lock.
 */
void
__rw_wlock_hard(volatile uintptr_t *c, uintptr_t v LOCK_FILE_LINE_ARG_DEF)
{
	uintptr_t tid;
	struct rwlock *rw;
	struct turnstile *ts;
	struct thread *owner;
#ifdef ADAPTIVE_RWLOCKS
	int spintries = 0;
	int i, n;
	enum { READERS, WRITER } sleep_reason = READERS;
	bool in_critical = false;
#endif
	uintptr_t setv;
#ifdef LOCK_PROFILING
	uint64_t waittime = 0;
	int contested = 0;
#endif
#if defined(ADAPTIVE_RWLOCKS) || defined(KDTRACE_HOOKS)
	struct lock_delay_arg lda;
#endif
#ifdef KDTRACE_HOOKS
	u_int sleep_cnt = 0;
	int64_t sleep_time = 0;
	int64_t all_time = 0;
#endif
#if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING)
	uintptr_t state = 0;
	int doing_lockprof = 0;
#endif
	int extra_work = 0;

	tid = (uintptr_t)curthread;
	rw = rwlock2rw(c);

#ifdef KDTRACE_HOOKS
	if (LOCKSTAT_PROFILE_ENABLED(rw__acquire)) {
		while (v == RW_UNLOCKED) {
			if (_rw_write_lock_fetch(rw, &v, tid))
				goto out_lockstat;
		}
		extra_work = 1;
		doing_lockprof = 1;
		all_time -= lockstat_nsecs(&rw->lock_object);
		state = v;
	}
#endif
#ifdef LOCK_PROFILING
	extra_work = 1;
	doing_lockprof = 1;
	state = v;
#endif

	if (SCHEDULER_STOPPED())
		return;

	if (__predict_false(v == RW_UNLOCKED))
		v = RW_READ_VALUE(rw);

	if (__predict_false(lv_rw_wowner(v) == (struct thread *)tid)) {
		KASSERT(rw->lock_object.lo_flags & LO_RECURSABLE,
		    ("%s: recursing but non-recursive rw %s @ %s:%d\n",
		    __func__, rw->lock_object.lo_name, file, line));
		rw->rw_recurse++;
		atomic_set_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED);
		if (LOCK_LOG_TEST(&rw->lock_object, 0))
			CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw);
		return;
	}

	if (LOCK_LOG_TEST(&rw->lock_object, 0))
		CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
		    rw->lock_object.lo_name, (void *)rw->rw_lock, file, line);

#if defined(ADAPTIVE_RWLOCKS)
	lock_delay_arg_init(&lda, &rw_delay);
#elif defined(KDTRACE_HOOKS)
	lock_delay_arg_init_noadapt(&lda);
#endif

#ifdef HWPMC_HOOKS
	PMC_SOFT_CALL( , , lock, failed);
#endif
	lock_profile_obtain_lock_failed(&rw->lock_object, false,
	    &contested, &waittime);

	THREAD_CONTENDS_ON_LOCK(&rw->lock_object);

	for (;;) {
		if (v == RW_UNLOCKED) {
			if (_rw_write_lock_fetch(rw, &v, tid))
				break;
			continue;
		}
#ifdef KDTRACE_HOOKS
		lda.spin_cnt++;
#endif

#ifdef ADAPTIVE_RWLOCKS
		if (v == (RW_LOCK_READ | RW_LOCK_WRITE_SPINNER)) {
			if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid))
				break;
			continue;
		}

		/*
		 * If the lock is write locked and the owner is
		 * running on another CPU, spin until the owner stops
		 * running or the state of the lock changes.
		 */
		if (!(v & RW_LOCK_READ)) {
			rw_drop_critical(v, &in_critical, &extra_work);
			sleep_reason = WRITER;
			owner = lv_rw_wowner(v);
			if (!TD_IS_RUNNING(owner))
				goto ts;
			if (LOCK_LOG_TEST(&rw->lock_object, 0))
				CTR3(KTR_LOCK, "%s: spinning on %p held by %p",
				    __func__, rw, owner);
			KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread),
			    "spinning", "lockname:\"%s\"",
			    rw->lock_object.lo_name);
			do {
				lock_delay(&lda);
				v = RW_READ_VALUE(rw);
				owner = lv_rw_wowner(v);
			} while (owner != NULL && TD_IS_RUNNING(owner));
			KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
			    "running");
			continue;
		} else if (RW_READERS(v) > 0) {
			sleep_reason = READERS;
			if (spintries == rowner_retries)
				goto ts;
			if (!(v & RW_LOCK_WRITE_SPINNER)) {
				if (!in_critical) {
					critical_enter();
					in_critical = true;
					extra_work++;
				}
				if (!atomic_fcmpset_ptr(&rw->rw_lock, &v,
				    v | RW_LOCK_WRITE_SPINNER)) {
					critical_exit();
					in_critical = false;
					extra_work--;
					continue;
				}
			}
			spintries++;
			KTR_STATE1(KTR_SCHED, "thread", sched_tdname(curthread),
			    "spinning", "lockname:\"%s\"",
			    rw->lock_object.lo_name);
			n = RW_READERS(v);
			for (i = 0; i < rowner_loops; i += n) {
				lock_delay_spin(n);
				v = RW_READ_VALUE(rw);
				if (!(v & RW_LOCK_WRITE_SPINNER))
					break;
				if (!(v & RW_LOCK_READ))
					break;
				n = RW_READERS(v);
				if (n == 0)
					break;
			}
#ifdef KDTRACE_HOOKS
			lda.spin_cnt += i;
#endif
			KTR_STATE0(KTR_SCHED, "thread", sched_tdname(curthread),
			    "running");
			if (i < rowner_loops)
				continue;
		}
ts:
#endif
		ts = turnstile_trywait(&rw->lock_object);
		v = RW_READ_VALUE(rw);
retry_ts:
		owner = lv_rw_wowner(v);

#ifdef ADAPTIVE_RWLOCKS
		/*
		 * The current lock owner might have started executing
		 * on another CPU (or the lock could have changed
		 * owners) while we were waiting on the turnstile
		 * chain lock.  If so, drop the turnstile lock and try
		 * again.
		 */
		if (owner != NULL) {
			if (TD_IS_RUNNING(owner)) {
				turnstile_cancel(ts);
				rw_drop_critical(v, &in_critical, &extra_work);
				continue;
			}
		} else if (RW_READERS(v) > 0 && sleep_reason == WRITER) {
			turnstile_cancel(ts);
			rw_drop_critical(v, &in_critical, &extra_work);
			continue;
		}
#endif
		/*
		 * Check the waiter flags on this rwlock.  If the lock was
		 * released without leaving any pending waiters queued,
		 * simply try to acquire it.  If a pending waiters queue is
		 * present, claim lock ownership and take over the pending
		 * queue.
		 */
		setv = v & (RW_LOCK_WAITERS | RW_LOCK_WRITE_SPINNER);
		if ((v & ~setv) == RW_UNLOCKED) {
			setv &= ~RW_LOCK_WRITE_SPINNER;
			if (atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid | setv)) {
				if (setv)
					turnstile_claim(ts);
				else
					turnstile_cancel(ts);
				break;
			}
			goto retry_ts;
		}

#ifdef ADAPTIVE_RWLOCKS
		if (in_critical) {
			if ((v & RW_LOCK_WRITE_SPINNER) ||
			    !((v & RW_LOCK_WRITE_WAITERS))) {
				setv = v & ~RW_LOCK_WRITE_SPINNER;
				setv |= RW_LOCK_WRITE_WAITERS;
				if (!atomic_fcmpset_ptr(&rw->rw_lock, &v, setv))
					goto retry_ts;
			}
			critical_exit();
			in_critical = false;
			extra_work--;
		} else {
#endif
			/*
			 * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to
			 * set it.  If we fail to set it, then loop back and try
			 * again.
			 */
			if (!(v & RW_LOCK_WRITE_WAITERS)) {
				if (!atomic_fcmpset_ptr(&rw->rw_lock, &v,
				    v | RW_LOCK_WRITE_WAITERS))
					goto retry_ts;
				if (LOCK_LOG_TEST(&rw->lock_object, 0))
					CTR2(KTR_LOCK, "%s: %p set write waiters flag",
					    __func__, rw);
			}
#ifdef ADAPTIVE_RWLOCKS
		}
#endif
		/*
		 * We were unable to acquire the lock and the write waiters
		 * flag is set, so we must block on the turnstile.
		 */
		if (LOCK_LOG_TEST(&rw->lock_object, 0))
			CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
			    rw);
#ifdef KDTRACE_HOOKS
		sleep_time -= lockstat_nsecs(&rw->lock_object);
#endif
		MPASS(owner == rw_owner(rw));
		turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE);
#ifdef KDTRACE_HOOKS
		sleep_time += lockstat_nsecs(&rw->lock_object);
		sleep_cnt++;
#endif
		if (LOCK_LOG_TEST(&rw->lock_object, 0))
			CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
			    __func__, rw);
#ifdef ADAPTIVE_RWLOCKS
		spintries = 0;
#endif
		v = RW_READ_VALUE(rw);
	}
	THREAD_CONTENTION_DONE(&rw->lock_object);
	if (__predict_true(!extra_work))
		return;
#ifdef ADAPTIVE_RWLOCKS
	if (in_critical)
		critical_exit();
#endif
#if defined(KDTRACE_HOOKS) || defined(LOCK_PROFILING)
	if (__predict_true(!doing_lockprof))
		return;
#endif
#ifdef KDTRACE_HOOKS
	all_time += lockstat_nsecs(&rw->lock_object);
	if (sleep_time)
		LOCKSTAT_RECORD4(rw__block, rw, sleep_time,
		    LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0,
		    (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state));

	/* Record only the loops spinning and not sleeping. */
	if (lda.spin_cnt > sleep_cnt)
		LOCKSTAT_RECORD4(rw__spin, rw, all_time - sleep_time,
		    LOCKSTAT_WRITER, (state & RW_LOCK_READ) == 0,
		    (state & RW_LOCK_READ) == 0 ? 0 : RW_READERS(state));
out_lockstat:
#endif
	LOCKSTAT_PROFILE_OBTAIN_RWLOCK_SUCCESS(rw__acquire, rw, contested,
	    waittime, file, line, LOCKSTAT_WRITER);
}

/*
 * This function is called if lockstat is active or the first try at releasing
 * a write lock failed.  The latter means that the lock is recursed, or that
 * one of the two waiter bits is set, indicating that at least one thread is
 * waiting on this lock.
 */
void
__rw_wunlock_hard(volatile uintptr_t *c, uintptr_t v LOCK_FILE_LINE_ARG_DEF)
{
	struct rwlock *rw;
	struct turnstile *ts;
	uintptr_t tid, setv;
	int queue;

	tid = (uintptr_t)curthread;
	if (SCHEDULER_STOPPED())
		return;

	rw = rwlock2rw(c);
	if (__predict_false(v == tid))
		v = RW_READ_VALUE(rw);

	if (v & RW_LOCK_WRITER_RECURSED) {
		if (--(rw->rw_recurse) == 0)
			atomic_clear_ptr(&rw->rw_lock, RW_LOCK_WRITER_RECURSED);
		if (LOCK_LOG_TEST(&rw->lock_object, 0))
			CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw);
		return;
	}

	LOCKSTAT_PROFILE_RELEASE_RWLOCK(rw__release, rw, LOCKSTAT_WRITER);
	if (v == tid && _rw_write_unlock(rw, tid))
		return;

	KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS),
	    ("%s: neither of the waiter flags are set", __func__));

	if (LOCK_LOG_TEST(&rw->lock_object, 0))
		CTR2(KTR_LOCK, "%s: %p contested", __func__, rw);

	turnstile_chain_lock(&rw->lock_object);

	/*
	 * Use the same algo as sx locks for now.  Prefer waking up shared
	 * waiters if we have any over writers.  This is probably not ideal.
	 *
	 * 'v' is the value we are going to write back to rw_lock.  If we
	 * have waiters on both queues, we need to preserve the state of
	 * the waiter flag for the queue we don't wake up.  For now this is
	 * hardcoded for the algorithm mentioned above.
	 *
	 * In the case of both readers and writers waiting we wakeup the
	 * readers but leave the RW_LOCK_WRITE_WAITERS flag set.  If a
	 * new writer comes in before a reader it will claim the lock up
	 * above.  There is probably a potential priority inversion in
	 * there that could be worked around either by waking both queues
	 * of waiters or doing some complicated lock handoff gymnastics.
	 */
	setv = RW_UNLOCKED;
	v = RW_READ_VALUE(rw);
	queue = TS_SHARED_QUEUE;
	if (v & RW_LOCK_WRITE_WAITERS) {
		queue = TS_EXCLUSIVE_QUEUE;
		setv |= (v & RW_LOCK_READ_WAITERS);
	}
	atomic_store_rel_ptr(&rw->rw_lock, setv);

	/* Wake up all waiters for the specific queue. */
	if (LOCK_LOG_TEST(&rw->lock_object, 0))
		CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw,
		    queue == TS_SHARED_QUEUE ? "read" : "write");

	ts = turnstile_lookup(&rw->lock_object);
	MPASS(ts != NULL);
	turnstile_broadcast(ts, queue);
	turnstile_unpend(ts);
	turnstile_chain_unlock(&rw->lock_object);
}

/*
 * Attempt to do a non-blocking upgrade from a read lock to a write
 * lock.  This will only succeed if this thread holds a single read
 * lock.  Returns true if the upgrade succeeded and false otherwise.
 */
int
__rw_try_upgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF)
{
	uintptr_t v, setv, tid;
	struct turnstile *ts;
	int success;

	if (SCHEDULER_STOPPED())
		return (1);

	KASSERT(rw->rw_lock != RW_DESTROYED,
	    ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line));
	__rw_assert(&rw->rw_lock, RA_RLOCKED, file, line);

	/*
	 * Attempt to switch from one reader to a writer.  If there
	 * are any write waiters, then we will have to lock the
	 * turnstile first to prevent races with another writer
	 * calling turnstile_wait() before we have claimed this
	 * turnstile.  So, do the simple case of no waiters first.
	 */
	tid = (uintptr_t)curthread;
	success = 0;
	v = RW_READ_VALUE(rw);
	for (;;) {
		if (RW_READERS(v) > 1)
			break;
		if (!(v & RW_LOCK_WAITERS)) {
			success = atomic_fcmpset_acq_ptr(&rw->rw_lock, &v, tid);
			if (!success)
				continue;
			break;
		}

		/*
		 * Ok, we think we have waiters, so lock the turnstile.
		 */
		ts = turnstile_trywait(&rw->lock_object);
		v = RW_READ_VALUE(rw);
retry_ts:
		if (RW_READERS(v) > 1) {
			turnstile_cancel(ts);
			break;
		}
		/*
		 * Try to switch from one reader to a writer again.  This time
		 * we honor the current state of the waiters flags.
		 * If we obtain the lock with the flags set, then claim
		 * ownership of the turnstile.
		 */
		setv = tid | (v & RW_LOCK_WAITERS);
		success = atomic_fcmpset_ptr(&rw->rw_lock, &v, setv);
		if (success) {
			if (v & RW_LOCK_WAITERS)
				turnstile_claim(ts);
			else
				turnstile_cancel(ts);
			break;
		}
		goto retry_ts;
	}
	LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line);
	if (success) {
		curthread->td_rw_rlocks--;
		WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
		    file, line);
		LOCKSTAT_RECORD0(rw__upgrade, rw);
	}
	return (success);
}

int
__rw_try_upgrade(volatile uintptr_t *c, const char *file, int line)
{
	struct rwlock *rw;

	rw = rwlock2rw(c);
	return (__rw_try_upgrade_int(rw LOCK_FILE_LINE_ARG));
}

/*
 * Downgrade a write lock into a single read lock.
 */
void
__rw_downgrade_int(struct rwlock *rw LOCK_FILE_LINE_ARG_DEF)
{
	struct turnstile *ts;
	uintptr_t tid, v;
	int rwait, wwait;

	if (SCHEDULER_STOPPED())
		return;

	KASSERT(rw->rw_lock != RW_DESTROYED,
	    ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line));
	__rw_assert(&rw->rw_lock, RA_WLOCKED | RA_NOTRECURSED, file, line);
#ifndef INVARIANTS
	if (rw_recursed(rw))
		panic("downgrade of a recursed lock");
#endif

	WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line);

	/*
	 * Convert from a writer to a single reader.  First we handle
	 * the easy case with no waiters.  If there are any waiters, we
	 * lock the turnstile and "disown" the lock.
	 */
	tid = (uintptr_t)curthread;
	if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1)))
		goto out;

	/*
	 * Ok, we think we have waiters, so lock the turnstile so we can
	 * read the waiter flags without any races.
	 */
	turnstile_chain_lock(&rw->lock_object);
	v = rw->rw_lock & RW_LOCK_WAITERS;
	rwait = v & RW_LOCK_READ_WAITERS;
	wwait = v & RW_LOCK_WRITE_WAITERS;
	MPASS(rwait | wwait);

	/*
	 * Downgrade from a write lock while preserving waiters flag
	 * and give up ownership of the turnstile.
	 */
	ts = turnstile_lookup(&rw->lock_object);
	MPASS(ts != NULL);
	if (!wwait)
		v &= ~RW_LOCK_READ_WAITERS;
	atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v);
	/*
	 * Wake other readers if there are no writers pending.  Otherwise they
	 * won't be able to acquire the lock anyway.
	 */
	if (rwait && !wwait) {
		turnstile_broadcast(ts, TS_SHARED_QUEUE);
		turnstile_unpend(ts);
	} else
		turnstile_disown(ts);
	turnstile_chain_unlock(&rw->lock_object);
out:
	curthread->td_rw_rlocks++;
	LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line);
	LOCKSTAT_RECORD0(rw__downgrade, rw);
}

void
__rw_downgrade(volatile uintptr_t *c, const char *file, int line)
{
	struct rwlock *rw;

	rw = rwlock2rw(c);
	__rw_downgrade_int(rw LOCK_FILE_LINE_ARG);
}

#ifdef INVARIANT_SUPPORT
#ifndef INVARIANTS
#undef __rw_assert
#endif

/*
 * In the non-WITNESS case, rw_assert() can only detect that at least
 * *some* thread owns an rlock, but it cannot guarantee that *this*
 * thread owns an rlock.
 */
void
__rw_assert(const volatile uintptr_t *c, int what, const char *file, int line)
{
	const struct rwlock *rw;

	if (SCHEDULER_STOPPED())
		return;

	rw = rwlock2rw(c);

	switch (what) {
	case RA_LOCKED:
	case RA_LOCKED | RA_RECURSED:
	case RA_LOCKED | RA_NOTRECURSED:
	case RA_RLOCKED:
	case RA_RLOCKED | RA_RECURSED:
	case RA_RLOCKED | RA_NOTRECURSED:
#ifdef WITNESS
		witness_assert(&rw->lock_object, what, file, line);
#else
		/*
		 * If some other thread has a write lock or we have one
		 * and are asserting a read lock, fail.  Also, if no one
		 * has a lock at all, fail.
		 */
		if (rw->rw_lock == RW_UNLOCKED ||
		    (!(rw->rw_lock & RW_LOCK_READ) && (what & RA_RLOCKED ||
		    rw_wowner(rw) != curthread)))
			panic("Lock %s not %slocked @ %s:%d\n",
			    rw->lock_object.lo_name, (what & RA_RLOCKED) ?
			    "read " : "", file, line);

		if (!(rw->rw_lock & RW_LOCK_READ) && !(what & RA_RLOCKED)) {
			if (rw_recursed(rw)) {
				if (what & RA_NOTRECURSED)
					panic("Lock %s recursed @ %s:%d\n",
					    rw->lock_object.lo_name, file,
					    line);
			} else if (what & RA_RECURSED)
				panic("Lock %s not recursed @ %s:%d\n",
				    rw->lock_object.lo_name, file, line);
		}
#endif
		break;
	case RA_WLOCKED:
	case RA_WLOCKED | RA_RECURSED:
	case RA_WLOCKED | RA_NOTRECURSED:
		if (rw_wowner(rw) != curthread)
			panic("Lock %s not exclusively locked @ %s:%d\n",
			    rw->lock_object.lo_name, file, line);
		if (rw_recursed(rw)) {
			if (what & RA_NOTRECURSED)
				panic("Lock %s recursed @ %s:%d\n",
				    rw->lock_object.lo_name, file, line);
		} else if (what & RA_RECURSED)
			panic("Lock %s not recursed @ %s:%d\n",
			    rw->lock_object.lo_name, file, line);
		break;
	case RA_UNLOCKED:
#ifdef WITNESS
		witness_assert(&rw->lock_object, what, file, line);
#else
		/*
		 * If we hold a write lock, fail.  We can't reliably check
		 * whether we hold a read lock or not.
		 */
		if (rw_wowner(rw) == curthread)
			panic("Lock %s exclusively locked @ %s:%d\n",
			    rw->lock_object.lo_name, file, line);
#endif
		break;
	default:
		panic("Unknown rw lock assertion: %d @ %s:%d", what, file,
		    line);
	}
}
#endif /* INVARIANT_SUPPORT */

#ifdef DDB
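/*
 * DDB "show lock" output for rw locks (the lc_ddb_show callback).
 */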
void
db_show_rwlock(const struct lock_object *lock)
{
	const struct rwlock *rw;
	struct thread *td;

	rw = (const struct rwlock *)lock;

	db_printf(" state: ");
	if (rw->rw_lock == RW_UNLOCKED)
		db_printf("UNLOCKED\n");
	else if (rw->rw_lock == RW_DESTROYED) {
		db_printf("DESTROYED\n");
		return;
	} else if (rw->rw_lock & RW_LOCK_READ)
		db_printf("RLOCK: %ju locks\n",
		    (uintmax_t)(RW_READERS(rw->rw_lock)));
	else {
		td = rw_wowner(rw);
		db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
		    td->td_tid, td->td_proc->p_pid, td->td_name);
		if (rw_recursed(rw))
			db_printf(" recursed: %u\n", rw->rw_recurse);
	}
	db_printf(" waiters: ");
	switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) {
	case RW_LOCK_READ_WAITERS:
		db_printf("readers\n");
		break;
	case RW_LOCK_WRITE_WAITERS:
		db_printf("writers\n");
		break;
	case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS:
		db_printf("readers and writers\n");
		break;
	default:
		db_printf("none\n");
		break;
	}
}

#endif