kern_umtx.c revision 242202
1/*-
2 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice unmodified, this list of conditions, and the following
11 *    disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 242202 2012-10-27 23:42:41Z davide $");
30
31#include "opt_compat.h"
32#include "opt_umtx_profiling.h"
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/limits.h>
37#include <sys/lock.h>
38#include <sys/malloc.h>
39#include <sys/mutex.h>
40#include <sys/priv.h>
41#include <sys/proc.h>
42#include <sys/sched.h>
43#include <sys/smp.h>
44#include <sys/sysctl.h>
45#include <sys/sysent.h>
46#include <sys/systm.h>
47#include <sys/sysproto.h>
48#include <sys/syscallsubr.h>
49#include <sys/eventhandler.h>
50#include <sys/umtx.h>
51
52#include <vm/vm.h>
53#include <vm/vm_param.h>
54#include <vm/pmap.h>
55#include <vm/vm_map.h>
56#include <vm/vm_object.h>
57
58#include <machine/cpu.h>
59
60#ifdef COMPAT_FREEBSD32
61#include <compat/freebsd32/freebsd32_proto.h>
62#endif
63
64#define _UMUTEX_TRY		1
65#define _UMUTEX_WAIT		2
66
67/* Priority inheritance mutex info. */
68struct umtx_pi {
69	/* Owner thread */
70	struct thread		*pi_owner;
71
72	/* Reference count */
73	int			pi_refcount;
74
75	/* List entry to link umtx objects held by a thread */
76	TAILQ_ENTRY(umtx_pi)	pi_link;
77
78	/* List entry in hash */
79	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
80
81	/* List for waiters */
82	TAILQ_HEAD(,umtx_q)	pi_blocked;
83
84	/* Identify a userland lock object */
85	struct umtx_key		pi_key;
86};
87
88/* Per-thread state for a userland synchronization object. */
89struct umtx_q {
90	/* Linked list for the hash. */
91	TAILQ_ENTRY(umtx_q)	uq_link;
92
93	/* Umtx key. */
94	struct umtx_key		uq_key;
95
96	/* Umtx flags. */
97	int			uq_flags;
98#define UQF_UMTXQ	0x0001
99
100	/* The waiting thread. */
101	struct thread		*uq_thread;
102
103	/*
104	 * The PI mutex this thread is blocked on.  Reads may hold either
105	 * the chain lock or umtx_lock; writes must hold both the chain
106	 * lock and umtx_lock.
107	 */
108	struct umtx_pi		*uq_pi_blocked;
109
110	/* On blocked list */
111	TAILQ_ENTRY(umtx_q)	uq_lockq;
112
113	/* PI mutexes owned by us that other threads contend for */
114	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
115
116	/* Inherited priority from PP mutex */
117	u_char			uq_inherited_pri;
118
119	/* Spare queue ready to be reused */
120	struct umtxq_queue	*uq_spare_queue;
121
122	/* The queue we are currently on */
123	struct umtxq_queue	*uq_cur_queue;
124};
125
126TAILQ_HEAD(umtxq_head, umtx_q);
127
128/* Per-key wait-queue */
129struct umtxq_queue {
130	struct umtxq_head	head;
131	struct umtx_key		key;
132	LIST_ENTRY(umtxq_queue)	link;
133	int			length;
134};
135
136LIST_HEAD(umtxq_list, umtxq_queue);
137
138/* Userland lock object's wait-queue chain */
139struct umtxq_chain {
140	/* Lock for this chain. */
141	struct mtx		uc_lock;
142
143	/* List of sleep queues. */
144	struct umtxq_list	uc_queue[2];
145#define UMTX_SHARED_QUEUE	0
146#define UMTX_EXCLUSIVE_QUEUE	1
147
148	LIST_HEAD(, umtxq_queue) uc_spare_queue;
149
150	/* Busy flag */
151	char			uc_busy;
152
153	/* Chain lock waiters */
154	int			uc_waiters;
155
156	/* All PI in the list */
157	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
158
159#ifdef UMTX_PROFILING
160	int 			length;
161	int			max_length;
162#endif
163};
164
165#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
166#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))
167
168/*
169 * Don't propagate time-sharing priority; there is a security reason.
170 * A user could simply create a PI mutex, let thread A lock it, and
171 * let another thread B block on it.  Because B is sleeping, its
172 * priority would be boosted; priority propagation would then boost
173 * A's priority too, and A's priority would never be lowered even if
174 * it were using 100% CPU, which is unfair to other processes.
175 */
176
177#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
178			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
179			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
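
/*
 * Function-form restatement of UPRI() (illustrative sketch only, never
 * compiled): every time-sharing thread is flattened to the worst
 * time-sharing priority, so only real-time user priorities actually
 * take part in propagation.
 */
#if 0
static u_char
upri_sketch(struct thread *td)
{

	if (td->td_user_pri >= PRI_MIN_TIMESHARE &&
	    td->td_user_pri <= PRI_MAX_TIMESHARE)
		return (PRI_MAX_TIMESHARE);	/* flatten time-sharing */
	return (td->td_user_pri);		/* realtime passes through */
}
#endif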
180
181#define	GOLDEN_RATIO_PRIME	2654404609U
182#define	UMTX_CHAINS		512
183#define	UMTX_SHIFTS		(__WORD_BIT - 9)
184
185#define	GET_SHARE(flags)	\
186    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
187
188#define BUSY_SPINS		200
189
190struct abs_timeout {
191	int clockid;
192	struct timespec cur;
193	struct timespec end;
194};
195
196static uma_zone_t		umtx_pi_zone;
197static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
198static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
199static int			umtx_pi_allocated;
200
201static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
202SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
203    &umtx_pi_allocated, 0, "Allocated umtx_pi");
204
205#ifdef UMTX_PROFILING
206static long max_length;
207SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
208static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
209#endif
210
211static void umtxq_sysinit(void *);
212static void umtxq_hash(struct umtx_key *key);
213static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
214static void umtxq_lock(struct umtx_key *key);
215static void umtxq_unlock(struct umtx_key *key);
216static void umtxq_busy(struct umtx_key *key);
217static void umtxq_unbusy(struct umtx_key *key);
218static void umtxq_insert_queue(struct umtx_q *uq, int q);
219static void umtxq_remove_queue(struct umtx_q *uq, int q);
220static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
221static int umtxq_count(struct umtx_key *key);
222static struct umtx_pi *umtx_pi_alloc(int);
223static void umtx_pi_free(struct umtx_pi *pi);
224static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
225static void umtx_thread_cleanup(struct thread *td);
226static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
227	struct image_params *imgp __unused);
228SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
229
230#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
231#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
232#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
233
234static struct mtx umtx_lock;
235
236#ifdef UMTX_PROFILING
237static void
238umtx_init_profiling(void)
239{
240	struct sysctl_oid *chain_oid;
241	char chain_name[10];
242	int i;
243
244	for (i = 0; i < UMTX_CHAINS; ++i) {
245		snprintf(chain_name, sizeof(chain_name), "%d", i);
246		chain_oid = SYSCTL_ADD_NODE(NULL,
247		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
248		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
249		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
250		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
251		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
252		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
253	}
254}
255#endif
256
257static void
258umtxq_sysinit(void *arg __unused)
259{
260	int i, j;
261
262	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
263		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
264	for (i = 0; i < 2; ++i) {
265		for (j = 0; j < UMTX_CHAINS; ++j) {
266			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
267				 MTX_DEF | MTX_DUPOK);
268			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
269			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
270			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
271			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
272			umtxq_chains[i][j].uc_busy = 0;
273			umtxq_chains[i][j].uc_waiters = 0;
274#ifdef UMTX_PROFILING
275			umtxq_chains[i][j].length = 0;
276			umtxq_chains[i][j].max_length = 0;
277#endif
278		}
279	}
280#ifdef UMTX_PROFILING
281	umtx_init_profiling();
282#endif
283	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
284	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
285	    EVENTHANDLER_PRI_ANY);
286}
287
288struct umtx_q *
289umtxq_alloc(void)
290{
291	struct umtx_q *uq;
292
293	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
294	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
295	TAILQ_INIT(&uq->uq_spare_queue->head);
296	TAILQ_INIT(&uq->uq_pi_contested);
297	uq->uq_inherited_pri = PRI_MAX;
298	return (uq);
299}
300
301void
302umtxq_free(struct umtx_q *uq)
303{
304	MPASS(uq->uq_spare_queue != NULL);
305	free(uq->uq_spare_queue, M_UMTX);
306	free(uq, M_UMTX);
307}
308
309static inline void
310umtxq_hash(struct umtx_key *key)
311{
312	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
313	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
314}
315
316static inline struct umtxq_chain *
317umtxq_getchain(struct umtx_key *key)
318{
319	if (key->type <= TYPE_SEM)
320		return (&umtxq_chains[1][key->hash]);
321	return (&umtxq_chains[0][key->hash]);
322}
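
/*
 * Worked example of the chain selection above (hypothetical key
 * values, 32-bit unsigned arithmetic):
 *
 *	n    = key->info.both.a + key->info.both.b,  e.g. 0x800f3ab4
 *	hash = ((n * GOLDEN_RATIO_PRIME) >> (32 - 9)) % 512
 *
 * The multiplier is a prime near 2^32/phi (Fibonacci hashing), so the
 * top 9 bits of the product pick one of the 512 chains fairly evenly
 * even for clustered addresses.  Wait-only object types (<= TYPE_SEM)
 * use the second chain array so they never contend with mutex chains.
 */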
323
324/*
325 * Lock a chain.
326 */
327static inline void
328umtxq_lock(struct umtx_key *key)
329{
330	struct umtxq_chain *uc;
331
332	uc = umtxq_getchain(key);
333	mtx_lock(&uc->uc_lock);
334}
335
336/*
337 * Unlock a chain.
338 */
339static inline void
340umtxq_unlock(struct umtx_key *key)
341{
342	struct umtxq_chain *uc;
343
344	uc = umtxq_getchain(key);
345	mtx_unlock(&uc->uc_lock);
346}
347
348/*
349 * Set the chain to the busy state when a following operation
350 * may block (a kernel mutex cannot be held across it).
351 */
352static inline void
353umtxq_busy(struct umtx_key *key)
354{
355	struct umtxq_chain *uc;
356
357	uc = umtxq_getchain(key);
358	mtx_assert(&uc->uc_lock, MA_OWNED);
359	if (uc->uc_busy) {
360#ifdef SMP
361		if (smp_cpus > 1) {
362			int count = BUSY_SPINS;
363			if (count > 0) {
364				umtxq_unlock(key);
365				while (uc->uc_busy && --count > 0)
366					cpu_spinwait();
367				umtxq_lock(key);
368			}
369		}
370#endif
371		while (uc->uc_busy) {
372			uc->uc_waiters++;
373			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
374			uc->uc_waiters--;
375		}
376	}
377	uc->uc_busy = 1;
378}
379
380/*
381 * Unbusy a chain.
382 */
383static inline void
384umtxq_unbusy(struct umtx_key *key)
385{
386	struct umtxq_chain *uc;
387
388	uc = umtxq_getchain(key);
389	mtx_assert(&uc->uc_lock, MA_OWNED);
390	KASSERT(uc->uc_busy != 0, ("not busy"));
391	uc->uc_busy = 0;
392	if (uc->uc_waiters)
393		wakeup_one(uc);
394}
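
/*
 * The busy/unbusy pair is used throughout this file in the following
 * canonical shape (illustrative fragment): the chain mutex may not be
 * held across a userland access, which can fault and sleep, so the
 * chain is marked busy instead while user memory is touched.
 */
#if 0
	umtxq_lock(&key);
	umtxq_busy(&key);	/* serialize with other sleepers/wakers */
	umtxq_unlock(&key);	/* drop the mutex before touching userland */

	old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

	umtxq_lock(&key);
	umtxq_unbusy(&key);	/* wakes one thread blocked in umtxq_busy() */
	umtxq_unlock(&key);
#endif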
395
396static struct umtxq_queue *
397umtxq_queue_lookup(struct umtx_key *key, int q)
398{
399	struct umtxq_queue *uh;
400	struct umtxq_chain *uc;
401
402	uc = umtxq_getchain(key);
403	UMTXQ_LOCKED_ASSERT(uc);
404	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
405		if (umtx_key_match(&uh->key, key))
406			return (uh);
407	}
408
409	return (NULL);
410}
411
412static inline void
413umtxq_insert_queue(struct umtx_q *uq, int q)
414{
415	struct umtxq_queue *uh;
416	struct umtxq_chain *uc;
417
418	uc = umtxq_getchain(&uq->uq_key);
419	UMTXQ_LOCKED_ASSERT(uc);
420	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
421	uh = umtxq_queue_lookup(&uq->uq_key, q);
422	if (uh != NULL) {
423		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
424	} else {
425		uh = uq->uq_spare_queue;
426		uh->key = uq->uq_key;
427		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
428	}
429	uq->uq_spare_queue = NULL;
430
431	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
432	uh->length++;
433#ifdef UMTX_PROFILING
434	uc->length++;
435	if (uc->length > uc->max_length) {
436		uc->max_length = uc->length;
437		if (uc->max_length > max_length)
438			max_length = uc->max_length;
439	}
440#endif
441	uq->uq_flags |= UQF_UMTXQ;
442	uq->uq_cur_queue = uh;
443	return;
444}
445
446static inline void
447umtxq_remove_queue(struct umtx_q *uq, int q)
448{
449	struct umtxq_chain *uc;
450	struct umtxq_queue *uh;
451
452	uc = umtxq_getchain(&uq->uq_key);
453	UMTXQ_LOCKED_ASSERT(uc);
454	if (uq->uq_flags & UQF_UMTXQ) {
455		uh = uq->uq_cur_queue;
456		TAILQ_REMOVE(&uh->head, uq, uq_link);
457		uh->length--;
458#ifdef UMTX_PROFILING
459		uc->length--;
460#endif
461		uq->uq_flags &= ~UQF_UMTXQ;
462		if (TAILQ_EMPTY(&uh->head)) {
463			KASSERT(uh->length == 0,
464			    ("inconsistent umtxq_queue length"));
465			LIST_REMOVE(uh, link);
466		} else {
467			uh = LIST_FIRST(&uc->uc_spare_queue);
468			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
469			LIST_REMOVE(uh, link);
470		}
471		uq->uq_spare_queue = uh;
472		uq->uq_cur_queue = NULL;
473	}
474}
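
/*
 * Note on the spare-queue dance above: every umtx_q owns exactly one
 * umtxq_queue header except while it sleeps.  umtxq_insert_queue()
 * donates the header (it becomes the per-key queue, or is parked on
 * uc_spare_queue if a per-key queue already exists), and
 * umtxq_remove_queue() takes one back (the emptied per-key header, or
 * any parked spare).  Headers are thus conserved, and the sleep and
 * wakeup paths never need to allocate memory.
 */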
475
476/*
477 * Count the waiters on a key; callers use this to check for multiple waiters.
478 */
479static int
480umtxq_count(struct umtx_key *key)
481{
482	struct umtxq_chain *uc;
483	struct umtxq_queue *uh;
484
485	uc = umtxq_getchain(key);
486	UMTXQ_LOCKED_ASSERT(uc);
487	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
488	if (uh != NULL)
489		return (uh->length);
490	return (0);
491}
492
493/*
494 * Count the PI waiters on a key and return the first
495 * waiter.
496 */
497static int
498umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
499{
500	struct umtxq_chain *uc;
501	struct umtxq_queue *uh;
502
503	*first = NULL;
504	uc = umtxq_getchain(key);
505	UMTXQ_LOCKED_ASSERT(uc);
506	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
507	if (uh != NULL) {
508		*first = TAILQ_FIRST(&uh->head);
509		return (uh->length);
510	}
511	return (0);
512}
513
514/*
515 * Wake up threads waiting on a userland object.
516 */
517
518static int
519umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
520{
521	struct umtxq_chain *uc;
522	struct umtxq_queue *uh;
523	struct umtx_q *uq;
524	int ret;
525
526	ret = 0;
527	uc = umtxq_getchain(key);
528	UMTXQ_LOCKED_ASSERT(uc);
529	uh = umtxq_queue_lookup(key, q);
530	if (uh != NULL) {
531		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
532			umtxq_remove_queue(uq, q);
533			wakeup(uq);
534			if (++ret >= n_wake)
535				return (ret);
536		}
537	}
538	return (ret);
539}
540
541
542/*
543 * Wake up the specified thread.
544 */
545static inline void
546umtxq_signal_thread(struct umtx_q *uq)
547{
548	struct umtxq_chain *uc;
549
550	uc = umtxq_getchain(&uq->uq_key);
551	UMTXQ_LOCKED_ASSERT(uc);
552	umtxq_remove(uq);
553	wakeup(uq);
554}
555
556static inline int
557tstohz(const struct timespec *tsp)
558{
559	struct timeval tv;
560
561	TIMESPEC_TO_TIMEVAL(&tv, tsp);
562	return (tvtohz(&tv));
563}
564
565static void
566abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
567	const struct timespec *timeout)
568{
569
570	timo->clockid = clockid;
571	if (!absolute) {
572		kern_clock_gettime(curthread, clockid, &timo->end);
573		timo->cur = timo->end;
574		timespecadd(&timo->end, timeout);
575	} else {
576		timo->end = *timeout;
577		kern_clock_gettime(curthread, clockid, &timo->cur);
578	}
579}
580
581static void
582abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
583{
584
585	abs_timeout_init(timo, umtxtime->_clockid,
586		(umtxtime->_flags & UMTX_ABSTIME) != 0,
587		&umtxtime->_timeout);
588}
589
590static inline void
591abs_timeout_update(struct abs_timeout *timo)
592{
593	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
594}
595
596static int
597abs_timeout_gethz(struct abs_timeout *timo)
598{
599	struct timespec tts;
600
601	if (timespeccmp(&timo->end, &timo->cur, <=))
602		return (-1);
603	tts = timo->end;
604	timespecsub(&tts, &timo->cur);
605	return (tstohz(&tts));
606}
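
/*
 * Sketch of the timeout flow above with hypothetical values, assuming
 * hz = 1000: end = 5.000000000, cur = 4.998500000.
 */
#if 0
	if (timespeccmp(&end, &cur, <=))	/* already expired? */
		return (-1);			/* caller maps this to ETIMEDOUT */
	tts = end;
	timespecsub(&tts, &cur);		/* tts = 0.0015s left */
	return (tstohz(&tts));			/* tvtohz() rounds up and never
						   returns 0, which would mean
						   "sleep without timeout" */
#endif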
607
608/*
609 * Put the thread into a sleep state; before sleeping, check whether
610 * the thread was removed from the umtx queue.
611 */
612static inline int
613umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
614{
615	struct umtxq_chain *uc;
616	int error, timo;
617
618	uc = umtxq_getchain(&uq->uq_key);
619	UMTXQ_LOCKED_ASSERT(uc);
620	for (;;) {
621		if (!(uq->uq_flags & UQF_UMTXQ))
622			return (0);
623		if (abstime != NULL) {
624			timo = abs_timeout_gethz(abstime);
625			if (timo < 0)
626				return (ETIMEDOUT);
627		} else
628			timo = 0;
629		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
630		if (error != EWOULDBLOCK) {
631			umtxq_lock(&uq->uq_key);
632			break;
633		}
634		if (abstime != NULL)
635			abs_timeout_update(abstime);
636		umtxq_lock(&uq->uq_key);
637	}
638	return (error);
639}
640
641/*
642 * Convert a userspace address into a unique logical address.
643 */
644int
645umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
646{
647	struct thread *td = curthread;
648	vm_map_t map;
649	vm_map_entry_t entry;
650	vm_pindex_t pindex;
651	vm_prot_t prot;
652	boolean_t wired;
653
654	key->type = type;
655	if (share == THREAD_SHARE) {
656		key->shared = 0;
657		key->info.private.vs = td->td_proc->p_vmspace;
658		key->info.private.addr = (uintptr_t)addr;
659	} else {
660		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
661		map = &td->td_proc->p_vmspace->vm_map;
662		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
663		    &entry, &key->info.shared.object, &pindex, &prot,
664		    &wired) != KERN_SUCCESS) {
665			return (EFAULT);
666		}
667
668		if ((share == PROCESS_SHARE) ||
669		    (share == AUTO_SHARE &&
670		     VM_INHERIT_SHARE == entry->inheritance)) {
671			key->shared = 1;
672			key->info.shared.offset = entry->offset + entry->start -
673				(vm_offset_t)addr;
674			vm_object_reference(key->info.shared.object);
675		} else {
676			key->shared = 0;
677			key->info.private.vs = td->td_proc->p_vmspace;
678			key->info.private.addr = (uintptr_t)addr;
679		}
680		vm_map_lookup_done(map, entry);
681	}
682
683	umtxq_hash(key);
684	return (0);
685}
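
/*
 * Worked example (hypothetical addresses): two processes map the same
 * file page at different virtual addresses.  umtx_key_get() still
 * produces equal keys, because a shared key is (vm_object, offset),
 * not a virtual address:
 *
 *	proc A: addr 0x20001000 -> { shared, obj = X, offset = 0x1000 }
 *	proc B: addr 0x7fff0000 -> { shared, obj = X, offset = 0x1000 }
 *
 * A private (THREAD_SHARE) key is simply { vmspace, vaddr }, which
 * avoids the vm_map lookup and the object reference entirely.
 */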
686
687/*
688 * Release key.
689 */
690void
691umtx_key_release(struct umtx_key *key)
692{
693	if (key->shared)
694		vm_object_deallocate(key->info.shared.object);
695}
696
697/*
698 * Lock a umtx object.
699 */
700static int
701do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
702	const struct timespec *timeout)
703{
704	struct abs_timeout timo;
705	struct umtx_q *uq;
706	u_long owner;
707	u_long old;
708	int error = 0;
709
710	uq = td->td_umtxq;
711	if (timeout != NULL)
712		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
713
714	/*
715	 * Care must be exercised when dealing with the umtx structure:
716	 * any access can fault.
717	 */
718	for (;;) {
719		/*
720		 * Try the uncontested case.  This should be done in userland.
721		 */
722		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
723
724		/* The acquire succeeded. */
725		if (owner == UMTX_UNOWNED)
726			return (0);
727
728		/* The address was invalid. */
729		if (owner == -1)
730			return (EFAULT);
731
732		/* If no one owns it but it is contested try to acquire it. */
733		if (owner == UMTX_CONTESTED) {
734			owner = casuword(&umtx->u_owner,
735			    UMTX_CONTESTED, id | UMTX_CONTESTED);
736
737			if (owner == UMTX_CONTESTED)
738				return (0);
739
740			/* The address was invalid. */
741			if (owner == -1)
742				return (EFAULT);
743
744			/* If this failed the lock has changed, restart. */
745			continue;
746		}
747
748		/*
749		 * If we caught a signal, we have already retried, so
750		 * exit immediately.
751		 */
752		if (error != 0)
753			break;
754
755		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
756			AUTO_SHARE, &uq->uq_key)) != 0)
757			return (error);
758
759		umtxq_lock(&uq->uq_key);
760		umtxq_busy(&uq->uq_key);
761		umtxq_insert(uq);
762		umtxq_unbusy(&uq->uq_key);
763		umtxq_unlock(&uq->uq_key);
764
765		/*
766		 * Set the contested bit so that a release in user space
767		 * knows to use the system call for unlock.  If this fails
768		 * either someone else has acquired the lock or it has been
769		 * released.
770		 */
771		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
772
773		/* The address was invalid. */
774		if (old == -1) {
775			umtxq_lock(&uq->uq_key);
776			umtxq_remove(uq);
777			umtxq_unlock(&uq->uq_key);
778			umtx_key_release(&uq->uq_key);
779			return (EFAULT);
780		}
781
782		/*
783		 * If we managed to set the contested bit, sleep; otherwise
784		 * the lock changed and we need to retry, or we lost a race
785		 * to the thread unlocking the umtx.
786		 */
787		umtxq_lock(&uq->uq_key);
788		if (old == owner)
789			error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
790			    &timo);
791		umtxq_remove(uq);
792		umtxq_unlock(&uq->uq_key);
793		umtx_key_release(&uq->uq_key);
794	}
795
796	if (timeout == NULL) {
797		/* Mutex locking is restarted if it is interrupted. */
798		if (error == EINTR)
799			error = ERESTART;
800	} else {
801		/* Timed-locking is not restarted. */
802		if (error == ERESTART)
803			error = EINTR;
804	}
805	return (error);
806}
807
808/*
809 * Unlock a umtx object.
810 */
811static int
812do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
813{
814	struct umtx_key key;
815	u_long owner;
816	u_long old;
817	int error;
818	int count;
819
820	/*
821	 * Make sure we own this mtx.
822	 */
823	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
824	if (owner == -1)
825		return (EFAULT);
826
827	if ((owner & ~UMTX_CONTESTED) != id)
828		return (EPERM);
829
830	/* This should be done in userland */
831	if ((owner & UMTX_CONTESTED) == 0) {
832		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
833		if (old == -1)
834			return (EFAULT);
835		if (old == owner)
836			return (0);
837		owner = old;
838	}
839
840	/* We should only ever be in here for contested locks */
841	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
842		&key)) != 0)
843		return (error);
844
845	umtxq_lock(&key);
846	umtxq_busy(&key);
847	count = umtxq_count(&key);
848	umtxq_unlock(&key);
849
850	/*
851	 * When unlocking the umtx, it must be marked as unowned if
852	 * there is at most one thread waiting on it.
853	 * Otherwise, it must be marked as contested.
854	 */
855	old = casuword(&umtx->u_owner, owner,
856		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
857	umtxq_lock(&key);
858	umtxq_signal(&key, 1);
859	umtxq_unbusy(&key);
860	umtxq_unlock(&key);
861	umtx_key_release(&key);
862	if (old == -1)
863		return (EFAULT);
864	if (old != owner)
865		return (EINVAL);
866	return (0);
867}
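
/*
 * Userland half of the protocol implemented by do_lock_umtx() and
 * do_unlock_umtx() above -- a minimal sketch only, assuming the
 * UMTX_OP_LOCK/UMTX_OP_UNLOCK operations from <sys/umtx.h>; libthr's
 * production code additionally spins and handles errors and timeouts.
 */
#if 0
static void
sketch_lock(struct umtx *mtx, u_long tid)
{

	/* Fast path: UMTX_UNOWNED -> tid, no kernel entry. */
	if (atomic_cmpset_acq_long(&mtx->u_owner, UMTX_UNOWNED, tid))
		return;
	/* Slow path: the kernel sets UMTX_CONTESTED and puts us to sleep. */
	_umtx_op(mtx, UMTX_OP_LOCK, tid, NULL, NULL);
}

static void
sketch_unlock(struct umtx *mtx, u_long tid)
{

	/* Fast path works only while the contested bit is clear. */
	if (atomic_cmpset_rel_long(&mtx->u_owner, tid, UMTX_UNOWNED))
		return;
	/* Contested: let the kernel hand off and wake a waiter. */
	_umtx_op(mtx, UMTX_OP_UNLOCK, tid, NULL, NULL);
}
#endif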
868
869#ifdef COMPAT_FREEBSD32
870
871/*
872 * Lock a umtx object.
873 */
874static int
875do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
876	const struct timespec *timeout)
877{
878	struct abs_timeout timo;
879	struct umtx_q *uq;
880	uint32_t owner;
881	uint32_t old;
882	int error = 0;
883
884	uq = td->td_umtxq;
885
886	if (timeout != NULL)
887		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
888
889	/*
890	 * Care must be exercised when dealing with the umtx structure:
891	 * any access can fault.
892	 */
893	for (;;) {
894		/*
895		 * Try the uncontested case.  This should be done in userland.
896		 */
897		owner = casuword32(m, UMUTEX_UNOWNED, id);
898
899		/* The acquire succeeded. */
900		if (owner == UMUTEX_UNOWNED)
901			return (0);
902
903		/* The address was invalid. */
904		if (owner == -1)
905			return (EFAULT);
906
907		/* If no one owns it but it is contested try to acquire it. */
908		if (owner == UMUTEX_CONTESTED) {
909			owner = casuword32(m,
910			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
911			if (owner == UMUTEX_CONTESTED)
912				return (0);
913
914			/* The address was invalid. */
915			if (owner == -1)
916				return (EFAULT);
917
918			/* If this failed the lock has changed, restart. */
919			continue;
920		}
921
922		/*
923		 * If we caught a signal, we have already retried, so
924		 * exit immediately.
925		 */
926		if (error != 0)
927			return (error);
928
929		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
930			AUTO_SHARE, &uq->uq_key)) != 0)
931			return (error);
932
933		umtxq_lock(&uq->uq_key);
934		umtxq_busy(&uq->uq_key);
935		umtxq_insert(uq);
936		umtxq_unbusy(&uq->uq_key);
937		umtxq_unlock(&uq->uq_key);
938
939		/*
940		 * Set the contested bit so that a release in user space
941		 * knows to use the system call for unlock.  If this fails
942		 * either someone else has acquired the lock or it has been
943		 * released.
944		 */
945		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
946
947		/* The address was invalid. */
948		if (old == -1) {
949			umtxq_lock(&uq->uq_key);
950			umtxq_remove(uq);
951			umtxq_unlock(&uq->uq_key);
952			umtx_key_release(&uq->uq_key);
953			return (EFAULT);
954		}
955
956		/*
957		 * If we managed to set the contested bit, sleep; otherwise
958		 * the lock changed and we need to retry, or we lost a race
959		 * to the thread unlocking the umtx.
960		 */
961		umtxq_lock(&uq->uq_key);
962		if (old == owner)
963			error = umtxq_sleep(uq, "umtx", timeout == NULL ?
964			    NULL : &timo);
965		umtxq_remove(uq);
966		umtxq_unlock(&uq->uq_key);
967		umtx_key_release(&uq->uq_key);
968	}
969
970	if (timeout == NULL) {
971		/* Mutex locking is restarted if it is interrupted. */
972		if (error == EINTR)
973			error = ERESTART;
974	} else {
975		/* Timed-locking is not restarted. */
976		if (error == ERESTART)
977			error = EINTR;
978	}
979	return (error);
980}
981
982/*
983 * Unlock a umtx object.
984 */
985static int
986do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
987{
988	struct umtx_key key;
989	uint32_t owner;
990	uint32_t old;
991	int error;
992	int count;
993
994	/*
995	 * Make sure we own this mtx.
996	 */
997	owner = fuword32(m);
998	if (owner == -1)
999		return (EFAULT);
1000
1001	if ((owner & ~UMUTEX_CONTESTED) != id)
1002		return (EPERM);
1003
1004	/* This should be done in userland */
1005	if ((owner & UMUTEX_CONTESTED) == 0) {
1006		old = casuword32(m, owner, UMUTEX_UNOWNED);
1007		if (old == -1)
1008			return (EFAULT);
1009		if (old == owner)
1010			return (0);
1011		owner = old;
1012	}
1013
1014	/* We should only ever be in here for contested locks */
1015	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
1016		&key)) != 0)
1017		return (error);
1018
1019	umtxq_lock(&key);
1020	umtxq_busy(&key);
1021	count = umtxq_count(&key);
1022	umtxq_unlock(&key);
1023
1024	/*
1025	 * When unlocking the umtx, it must be marked as unowned if
1026	 * there is at most one thread waiting on it.
1027	 * Otherwise, it must be marked as contested.
1028	 */
1029	old = casuword32(m, owner,
1030		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1031	umtxq_lock(&key);
1032	umtxq_signal(&key, 1);
1033	umtxq_unbusy(&key);
1034	umtxq_unlock(&key);
1035	umtx_key_release(&key);
1036	if (old == -1)
1037		return (EFAULT);
1038	if (old != owner)
1039		return (EINVAL);
1040	return (0);
1041}
1042#endif
1043
1044/*
1045 * Fetch and compare value, sleep on the address if value is not changed.
1046 */
1047static int
1048do_wait(struct thread *td, void *addr, u_long id,
1049	struct _umtx_time *timeout, int compat32, int is_private)
1050{
1051	struct abs_timeout timo;
1052	struct umtx_q *uq;
1053	u_long tmp;
1054	int error = 0;
1055
1056	uq = td->td_umtxq;
1057	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
1058		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
1059		return (error);
1060
1061	if (timeout != NULL)
1062		abs_timeout_init2(&timo, timeout);
1063
1064	umtxq_lock(&uq->uq_key);
1065	umtxq_insert(uq);
1066	umtxq_unlock(&uq->uq_key);
1067	if (compat32 == 0)
1068		tmp = fuword(addr);
1069	else
1070		tmp = (unsigned int)fuword32(addr);
1071	umtxq_lock(&uq->uq_key);
1072	if (tmp == id)
1073		error = umtxq_sleep(uq, "uwait", timeout == NULL ?
1074		    NULL : &timo);
1075	if ((uq->uq_flags & UQF_UMTXQ) == 0)
1076		error = 0;
1077	else
1078		umtxq_remove(uq);
1079	umtxq_unlock(&uq->uq_key);
1080	umtx_key_release(&uq->uq_key);
1081	if (error == ERESTART)
1082		error = EINTR;
1083	return (error);
1084}
1085
1086/*
1087 * Wake up threads sleeping on the specified address.
1088 */
1089int
1090kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1091{
1092	struct umtx_key key;
1093	int ret;
1094
1095	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1096		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1097		return (ret);
1098	umtxq_lock(&key);
1099	ret = umtxq_signal(&key, n_wake);
1100	umtxq_unlock(&key);
1101	umtx_key_release(&key);
1102	return (0);
1103}
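
/*
 * Illustrative userland use of the wait/wake pair above (the pattern
 * behind libthr's sleep queues); a sketch only, assuming the
 * UMTX_OP_WAIT_UINT_PRIVATE/UMTX_OP_WAKE_PRIVATE operations:
 */
#if 0
	volatile u_int flag = 0;	/* hypothetical shared word */

	/* Waiter: sleeps only while the word still holds the seen value. */
	while (flag == 0)
		_umtx_op(__DEVOLATILE(u_int *, &flag),
		    UMTX_OP_WAIT_UINT_PRIVATE, 0, NULL, NULL);

	/* Waker: publish the change first, then wake all sleepers. */
	flag = 1;
	_umtx_op(__DEVOLATILE(u_int *, &flag),
	    UMTX_OP_WAKE_PRIVATE, INT_MAX, NULL, NULL);
#endif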
1104
1105/*
1106 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1107 */
1108static int
1109do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
1110	struct _umtx_time *timeout, int mode)
1111{
1112	struct abs_timeout timo;
1113	struct umtx_q *uq;
1114	uint32_t owner, old, id;
1115	int error = 0;
1116
1117	id = td->td_tid;
1118	uq = td->td_umtxq;
1119
1120	if (timeout != NULL)
1121		abs_timeout_init2(&timo, timeout);
1122
1123	/*
1124	 * Care must be exercised when dealing with the umtx structure:
1125	 * any access can fault.
1126	 */
1127	for (;;) {
1128		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
1129		if (mode == _UMUTEX_WAIT) {
1130			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1131				return (0);
1132		} else {
1133			/*
1134			 * Try the uncontested case.  This should be done in userland.
1135			 */
1136			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1137
1138			/* The acquire succeeded. */
1139			if (owner == UMUTEX_UNOWNED)
1140				return (0);
1141
1142			/* The address was invalid. */
1143			if (owner == -1)
1144				return (EFAULT);
1145
1146			/* If no one owns it but it is contested try to acquire it. */
1147			if (owner == UMUTEX_CONTESTED) {
1148				owner = casuword32(&m->m_owner,
1149				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1150
1151				if (owner == UMUTEX_CONTESTED)
1152					return (0);
1153
1154				/* The address was invalid. */
1155				if (owner == -1)
1156					return (EFAULT);
1157
1158				/* If this failed the lock has changed, restart. */
1159				continue;
1160			}
1161		}
1162
1163		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1164		    (owner & ~UMUTEX_CONTESTED) == id)
1165			return (EDEADLK);
1166
1167		if (mode == _UMUTEX_TRY)
1168			return (EBUSY);
1169
1170		/*
1171		 * If we caught a signal, we have already retried, so
1172		 * exit immediately.
1173		 */
1174		if (error != 0)
1175			return (error);
1176
1177		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1178		    GET_SHARE(flags), &uq->uq_key)) != 0)
1179			return (error);
1180
1181		umtxq_lock(&uq->uq_key);
1182		umtxq_busy(&uq->uq_key);
1183		umtxq_insert(uq);
1184		umtxq_unlock(&uq->uq_key);
1185
1186		/*
1187		 * Set the contested bit so that a release in user space
1188		 * knows to use the system call for unlock.  If this fails
1189		 * either someone else has acquired the lock or it has been
1190		 * released.
1191		 */
1192		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1193
1194		/* The address was invalid. */
1195		if (old == -1) {
1196			umtxq_lock(&uq->uq_key);
1197			umtxq_remove(uq);
1198			umtxq_unbusy(&uq->uq_key);
1199			umtxq_unlock(&uq->uq_key);
1200			umtx_key_release(&uq->uq_key);
1201			return (EFAULT);
1202		}
1203
1204		/*
1205		 * If we managed to set the contested bit, sleep; otherwise
1206		 * the lock changed and we need to retry, or we lost a race
1207		 * to the thread unlocking the umtx.
1208		 */
1209		umtxq_lock(&uq->uq_key);
1210		umtxq_unbusy(&uq->uq_key);
1211		if (old == owner)
1212			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
1213			    NULL : &timo);
1214		umtxq_remove(uq);
1215		umtxq_unlock(&uq->uq_key);
1216		umtx_key_release(&uq->uq_key);
1217	}
1218
1219	return (0);
1220}
1221
1222/*
1223 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
1224 */
1225static int
1226do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1227{
1228	struct umtx_key key;
1229	uint32_t owner, old, id;
1230	int error;
1231	int count;
1232
1233	id = td->td_tid;
1234	/*
1235	 * Make sure we own this mtx.
1236	 */
1237	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1238	if (owner == -1)
1239		return (EFAULT);
1240
1241	if ((owner & ~UMUTEX_CONTESTED) != id)
1242		return (EPERM);
1243
1244	if ((owner & UMUTEX_CONTESTED) == 0) {
1245		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1246		if (old == -1)
1247			return (EFAULT);
1248		if (old == owner)
1249			return (0);
1250		owner = old;
1251	}
1252
1253	/* We should only ever be in here for contested locks */
1254	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1255	    &key)) != 0)
1256		return (error);
1257
1258	umtxq_lock(&key);
1259	umtxq_busy(&key);
1260	count = umtxq_count(&key);
1261	umtxq_unlock(&key);
1262
1263	/*
1264	 * When unlocking the umtx, it must be marked as unowned if
1265	 * there is at most one thread waiting on it.
1266	 * Otherwise, it must be marked as contested.
1267	 */
1268	old = casuword32(&m->m_owner, owner,
1269		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1270	umtxq_lock(&key);
1271	umtxq_signal(&key, 1);
1272	umtxq_unbusy(&key);
1273	umtxq_unlock(&key);
1274	umtx_key_release(&key);
1275	if (old == -1)
1276		return (EFAULT);
1277	if (old != owner)
1278		return (EINVAL);
1279	return (0);
1280}
1281
1282/*
1283 * Check if the mutex is available and wake up a waiter;
1284 * this applies only to simple (PRIO_NONE) mutexes.
1285 */
1286static int
1287do_wake_umutex(struct thread *td, struct umutex *m)
1288{
1289	struct umtx_key key;
1290	uint32_t owner;
1291	uint32_t flags;
1292	int error;
1293	int count;
1294
1295	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1296	if (owner == -1)
1297		return (EFAULT);
1298
1299	if ((owner & ~UMUTEX_CONTESTED) != 0)
1300		return (0);
1301
1302	flags = fuword32(&m->m_flags);
1303
1304	/* We should only ever be in here for contested locks */
1305	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1306	    &key)) != 0)
1307		return (error);
1308
1309	umtxq_lock(&key);
1310	umtxq_busy(&key);
1311	count = umtxq_count(&key);
1312	umtxq_unlock(&key);
1313
1314	if (count <= 1)
1315		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1316
1317	umtxq_lock(&key);
1318	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1319		umtxq_signal(&key, 1);
1320	umtxq_unbusy(&key);
1321	umtxq_unlock(&key);
1322	umtx_key_release(&key);
1323	return (0);
1324}
1325
1326/*
1327 * Check if the mutex has waiters and try to repair the contested bit.
1328 */
1329static int
1330do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
1331{
1332	struct umtx_key key;
1333	uint32_t owner, old;
1334	int type;
1335	int error;
1336	int count;
1337
1338	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
1339	case 0:
1340		type = TYPE_NORMAL_UMUTEX;
1341		break;
1342	case UMUTEX_PRIO_INHERIT:
1343		type = TYPE_PI_UMUTEX;
1344		break;
1345	case UMUTEX_PRIO_PROTECT:
1346		type = TYPE_PP_UMUTEX;
1347		break;
1348	default:
1349		return (EINVAL);
1350	}
1351	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
1352	    &key)) != 0)
1353		return (error);
1354
1355	owner = 0;
1356	umtxq_lock(&key);
1357	umtxq_busy(&key);
1358	count = umtxq_count(&key);
1359	umtxq_unlock(&key);
1360	/*
1361	 * Only repair the contested bit if there is a waiter; this means
1362	 * the mutex is still being referenced by userland code.  Otherwise
1363	 * don't touch any memory.
1364	 */
1365	if (count > 1) {
1366		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1367		while ((owner & UMUTEX_CONTESTED) == 0) {
1368			old = casuword32(&m->m_owner, owner,
1369			    owner | UMUTEX_CONTESTED);
1370			if (old == owner)
1371				break;
1372			owner = old;
1373		}
1374	} else if (count == 1) {
1375		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1376		while ((owner & ~UMUTEX_CONTESTED) != 0 &&
1377		       (owner & UMUTEX_CONTESTED) == 0) {
1378			old = casuword32(&m->m_owner, owner,
1379			    owner | UMUTEX_CONTESTED);
1380			if (old == owner)
1381				break;
1382			owner = old;
1383		}
1384	}
1385	umtxq_lock(&key);
1386	if (owner == -1) {
1387		error = EFAULT;
1388		umtxq_signal(&key, INT_MAX);
1389	}
1390	else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1391		umtxq_signal(&key, 1);
1392	umtxq_unbusy(&key);
1393	umtxq_unlock(&key);
1394	umtx_key_release(&key);
1395	return (error);
1396}
1397
1398static inline struct umtx_pi *
1399umtx_pi_alloc(int flags)
1400{
1401	struct umtx_pi *pi;
1402
1403	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1404	TAILQ_INIT(&pi->pi_blocked);
1405	atomic_add_int(&umtx_pi_allocated, 1);
1406	return (pi);
1407}
1408
1409static inline void
1410umtx_pi_free(struct umtx_pi *pi)
1411{
1412	uma_zfree(umtx_pi_zone, pi);
1413	atomic_add_int(&umtx_pi_allocated, -1);
1414}
1415
1416/*
1417 * Adjust the thread's position on the blocked list of a umtx_pi
1418 * after its priority has been changed.
1419 */
1420static int
1421umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1422{
1423	struct umtx_q *uq, *uq1, *uq2;
1424	struct thread *td1;
1425
1426	mtx_assert(&umtx_lock, MA_OWNED);
1427	if (pi == NULL)
1428		return (0);
1429
1430	uq = td->td_umtxq;
1431
1432	/*
1433	 * Check if the thread needs to be moved on the blocked chain.
1434	 * It needs to be moved if either its priority is lower than
1435	 * the previous thread or higher than the next thread.
1436	 */
1437	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1438	uq2 = TAILQ_NEXT(uq, uq_lockq);
1439	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1440	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1441		/*
1442		 * Remove thread from blocked chain and determine where
1443		 * it should be moved to.
1444		 */
1445		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1446		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1447			td1 = uq1->uq_thread;
1448			MPASS(td1->td_proc->p_magic == P_MAGIC);
1449			if (UPRI(td1) > UPRI(td))
1450				break;
1451		}
1452
1453		if (uq1 == NULL)
1454			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1455		else
1456			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1457	}
1458	return (1);
1459}
1460
1461/*
1462 * Propagate priority when a thread is blocked on a POSIX
1463 * PI mutex.
1464 */
1465static void
1466umtx_propagate_priority(struct thread *td)
1467{
1468	struct umtx_q *uq;
1469	struct umtx_pi *pi;
1470	int pri;
1471
1472	mtx_assert(&umtx_lock, MA_OWNED);
1473	pri = UPRI(td);
1474	uq = td->td_umtxq;
1475	pi = uq->uq_pi_blocked;
1476	if (pi == NULL)
1477		return;
1478
1479	for (;;) {
1480		td = pi->pi_owner;
1481		if (td == NULL || td == curthread)
1482			return;
1483
1484		MPASS(td->td_proc != NULL);
1485		MPASS(td->td_proc->p_magic == P_MAGIC);
1486
1487		thread_lock(td);
1488		if (td->td_lend_user_pri > pri)
1489			sched_lend_user_prio(td, pri);
1490		else {
1491			thread_unlock(td);
1492			break;
1493		}
1494		thread_unlock(td);
1495
1496		/*
1497		 * Pick up the lock that td is blocked on.
1498		 */
1499		uq = td->td_umtxq;
1500		pi = uq->uq_pi_blocked;
1501		if (pi == NULL)
1502			break;
1503		/* Resort td on the list if needed. */
1504		umtx_pi_adjust_thread(pi, td);
1505	}
1506}
1507
1508/*
1509 * Unpropagate priority for a PI mutex when a thread blocked on
1510 * it is interrupted by a signal or resumed by others.
1511 */
1512static void
1513umtx_repropagate_priority(struct umtx_pi *pi)
1514{
1515	struct umtx_q *uq, *uq_owner;
1516	struct umtx_pi *pi2;
1517	int pri;
1518
1519	mtx_assert(&umtx_lock, MA_OWNED);
1520
1521	while (pi != NULL && pi->pi_owner != NULL) {
1522		pri = PRI_MAX;
1523		uq_owner = pi->pi_owner->td_umtxq;
1524
1525		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1526			uq = TAILQ_FIRST(&pi2->pi_blocked);
1527			if (uq != NULL) {
1528				if (pri > UPRI(uq->uq_thread))
1529					pri = UPRI(uq->uq_thread);
1530			}
1531		}
1532
1533		if (pri > uq_owner->uq_inherited_pri)
1534			pri = uq_owner->uq_inherited_pri;
1535		thread_lock(pi->pi_owner);
1536		sched_lend_user_prio(pi->pi_owner, pri);
1537		thread_unlock(pi->pi_owner);
1538		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1539			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1540	}
1541}
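
/*
 * The recomputation performed above (and again in do_lock_pp() and
 * do_unlock_pp() below), restated: a thread's lent user priority is
 *
 *	pri = min(uq_inherited_pri,			(PP ceilings)
 *		  min over owned contested PI mutexes of
 *		      UPRI(first blocked thread))
 *
 * i.e. the owner runs at least as urgently as the best waiter it is
 * currently holding up.  The canonical code shape (fragment):
 */
#if 0
	pri = PRI_MAX;
	TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
		uq2 = TAILQ_FIRST(&pi->pi_blocked);	/* best waiter */
		if (uq2 != NULL && pri > UPRI(uq2->uq_thread))
			pri = UPRI(uq2->uq_thread);
	}
	if (pri > uq->uq_inherited_pri)
		pri = uq->uq_inherited_pri;
	thread_lock(td);
	sched_lend_user_prio(td, pri);
	thread_unlock(td);
#endif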
1542
1543/*
1544 * Insert a PI mutex into the owning thread's contested list.
1545 */
1546static void
1547umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1548{
1549	struct umtx_q *uq_owner;
1550
1551	uq_owner = owner->td_umtxq;
1552	mtx_assert(&umtx_lock, MA_OWNED);
1553	if (pi->pi_owner != NULL)
1554		panic("pi_owner != NULL");
1555	pi->pi_owner = owner;
1556	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1557}
1558
1559/*
1560 * Claim ownership of a PI mutex.
1561 */
1562static int
1563umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1564{
1565	struct umtx_q *uq, *uq_owner;
1566
1567	uq_owner = owner->td_umtxq;
1568	mtx_lock_spin(&umtx_lock);
1569	if (pi->pi_owner == owner) {
1570		mtx_unlock_spin(&umtx_lock);
1571		return (0);
1572	}
1573
1574	if (pi->pi_owner != NULL) {
1575		/*
1576		 * Userland may have already messed up the mutex, sigh.
1577		 */
1578		mtx_unlock_spin(&umtx_lock);
1579		return (EPERM);
1580	}
1581	umtx_pi_setowner(pi, owner);
1582	uq = TAILQ_FIRST(&pi->pi_blocked);
1583	if (uq != NULL) {
1584		int pri;
1585
1586		pri = UPRI(uq->uq_thread);
1587		thread_lock(owner);
1588		if (pri < UPRI(owner))
1589			sched_lend_user_prio(owner, pri);
1590		thread_unlock(owner);
1591	}
1592	mtx_unlock_spin(&umtx_lock);
1593	return (0);
1594}
1595
1596/*
1597 * Adjust a thread's position on the blocked list of the PI mutex
1598 * it is blocked on; this may trigger a new round of priority propagation.
1599 */
1600void
1601umtx_pi_adjust(struct thread *td, u_char oldpri)
1602{
1603	struct umtx_q *uq;
1604	struct umtx_pi *pi;
1605
1606	uq = td->td_umtxq;
1607	mtx_lock_spin(&umtx_lock);
1608	/*
1609	 * Pick up the lock that td is blocked on.
1610	 */
1611	pi = uq->uq_pi_blocked;
1612	if (pi != NULL) {
1613		umtx_pi_adjust_thread(pi, td);
1614		umtx_repropagate_priority(pi);
1615	}
1616	mtx_unlock_spin(&umtx_lock);
1617}
1618
1619/*
1620 * Sleep on a PI mutex.
1621 */
1622static int
1623umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1624	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
1625{
1626	struct umtxq_chain *uc;
1627	struct thread *td, *td1;
1628	struct umtx_q *uq1;
1629	int pri;
1630	int error = 0;
1631
1632	td = uq->uq_thread;
1633	KASSERT(td == curthread, ("inconsistent uq_thread"));
1634	uc = umtxq_getchain(&uq->uq_key);
1635	UMTXQ_LOCKED_ASSERT(uc);
1636	UMTXQ_BUSY_ASSERT(uc);
1637	umtxq_insert(uq);
1638	mtx_lock_spin(&umtx_lock);
1639	if (pi->pi_owner == NULL) {
1640		mtx_unlock_spin(&umtx_lock);
1641		/* XXX Only look up thread in current process. */
1642		td1 = tdfind(owner, curproc->p_pid);
1643		mtx_lock_spin(&umtx_lock);
1644		if (td1 != NULL) {
1645			if (pi->pi_owner == NULL)
1646				umtx_pi_setowner(pi, td1);
1647			PROC_UNLOCK(td1->td_proc);
1648		}
1649	}
1650
1651	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1652		pri = UPRI(uq1->uq_thread);
1653		if (pri > UPRI(td))
1654			break;
1655	}
1656
1657	if (uq1 != NULL)
1658		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1659	else
1660		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1661
1662	uq->uq_pi_blocked = pi;
1663	thread_lock(td);
1664	td->td_flags |= TDF_UPIBLOCKED;
1665	thread_unlock(td);
1666	umtx_propagate_priority(td);
1667	mtx_unlock_spin(&umtx_lock);
1668	umtxq_unbusy(&uq->uq_key);
1669
1670	error = umtxq_sleep(uq, wmesg, timo);
1671	umtxq_remove(uq);
1672
1673	mtx_lock_spin(&umtx_lock);
1674	uq->uq_pi_blocked = NULL;
1675	thread_lock(td);
1676	td->td_flags &= ~TDF_UPIBLOCKED;
1677	thread_unlock(td);
1678	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1679	umtx_repropagate_priority(pi);
1680	mtx_unlock_spin(&umtx_lock);
1681	umtxq_unlock(&uq->uq_key);
1682
1683	return (error);
1684}
1685
1686/*
1687 * Add a reference to a PI mutex.
1688 */
1689static void
1690umtx_pi_ref(struct umtx_pi *pi)
1691{
1692	struct umtxq_chain *uc;
1693
1694	uc = umtxq_getchain(&pi->pi_key);
1695	UMTXQ_LOCKED_ASSERT(uc);
1696	pi->pi_refcount++;
1697}
1698
1699/*
1700 * Drop a reference to a PI mutex; when the count
1701 * reaches zero, its memory is freed.
1702 */
1703static void
1704umtx_pi_unref(struct umtx_pi *pi)
1705{
1706	struct umtxq_chain *uc;
1707
1708	uc = umtxq_getchain(&pi->pi_key);
1709	UMTXQ_LOCKED_ASSERT(uc);
1710	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1711	if (--pi->pi_refcount == 0) {
1712		mtx_lock_spin(&umtx_lock);
1713		if (pi->pi_owner != NULL) {
1714			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1715				pi, pi_link);
1716			pi->pi_owner = NULL;
1717		}
1718		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1719			("blocked queue not empty"));
1720		mtx_unlock_spin(&umtx_lock);
1721		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1722		umtx_pi_free(pi);
1723	}
1724}
1725
1726/*
1727 * Find a PI mutex in the hash table.
1728 */
1729static struct umtx_pi *
1730umtx_pi_lookup(struct umtx_key *key)
1731{
1732	struct umtxq_chain *uc;
1733	struct umtx_pi *pi;
1734
1735	uc = umtxq_getchain(key);
1736	UMTXQ_LOCKED_ASSERT(uc);
1737
1738	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1739		if (umtx_key_match(&pi->pi_key, key)) {
1740			return (pi);
1741		}
1742	}
1743	return (NULL);
1744}
1745
1746/*
1747 * Insert a PI mutex into the hash table.
1748 */
1749static inline void
1750umtx_pi_insert(struct umtx_pi *pi)
1751{
1752	struct umtxq_chain *uc;
1753
1754	uc = umtxq_getchain(&pi->pi_key);
1755	UMTXQ_LOCKED_ASSERT(uc);
1756	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1757}
1758
1759/*
1760 * Lock a PI mutex.
1761 */
1762static int
1763do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
1764    struct _umtx_time *timeout, int try)
1765{
1766	struct abs_timeout timo;
1767	struct umtx_q *uq;
1768	struct umtx_pi *pi, *new_pi;
1769	uint32_t id, owner, old;
1770	int error;
1771
1772	id = td->td_tid;
1773	uq = td->td_umtxq;
1774
1775	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1776	    &uq->uq_key)) != 0)
1777		return (error);
1778
1779	if (timeout != NULL)
1780		abs_timeout_init2(&timo, timeout);
1781
1782	umtxq_lock(&uq->uq_key);
1783	pi = umtx_pi_lookup(&uq->uq_key);
1784	if (pi == NULL) {
1785		new_pi = umtx_pi_alloc(M_NOWAIT);
1786		if (new_pi == NULL) {
1787			umtxq_unlock(&uq->uq_key);
1788			new_pi = umtx_pi_alloc(M_WAITOK);
1789			umtxq_lock(&uq->uq_key);
1790			pi = umtx_pi_lookup(&uq->uq_key);
1791			if (pi != NULL) {
1792				umtx_pi_free(new_pi);
1793				new_pi = NULL;
1794			}
1795		}
1796		if (new_pi != NULL) {
1797			new_pi->pi_key = uq->uq_key;
1798			umtx_pi_insert(new_pi);
1799			pi = new_pi;
1800		}
1801	}
1802	umtx_pi_ref(pi);
1803	umtxq_unlock(&uq->uq_key);
1804
1805	/*
1806	 * Care must be exercised when dealing with the umtx structure:
1807	 * any access can fault.
1808	 */
1809	for (;;) {
1810		/*
1811		 * Try the uncontested case.  This should be done in userland.
1812		 */
1813		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1814
1815		/* The acquire succeeded. */
1816		if (owner == UMUTEX_UNOWNED) {
1817			error = 0;
1818			break;
1819		}
1820
1821		/* The address was invalid. */
1822		if (owner == -1) {
1823			error = EFAULT;
1824			break;
1825		}
1826
1827		/* If no one owns it but it is contested try to acquire it. */
1828		if (owner == UMUTEX_CONTESTED) {
1829			owner = casuword32(&m->m_owner,
1830			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1831
1832			if (owner == UMUTEX_CONTESTED) {
1833				umtxq_lock(&uq->uq_key);
1834				umtxq_busy(&uq->uq_key);
1835				error = umtx_pi_claim(pi, td);
1836				umtxq_unbusy(&uq->uq_key);
1837				umtxq_unlock(&uq->uq_key);
1838				break;
1839			}
1840
1841			/* The address was invalid. */
1842			if (owner == -1) {
1843				error = EFAULT;
1844				break;
1845			}
1846
1847			/* If this failed the lock has changed, restart. */
1848			continue;
1849		}
1850
1851		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1852		    (owner & ~UMUTEX_CONTESTED) == id) {
1853			error = EDEADLK;
1854			break;
1855		}
1856
1857		if (try != 0) {
1858			error = EBUSY;
1859			break;
1860		}
1861
1862		/*
1863		 * If we caught a signal, we have already retried, so
1864		 * exit immediately.
1865		 */
1866		if (error != 0)
1867			break;
1868
1869		umtxq_lock(&uq->uq_key);
1870		umtxq_busy(&uq->uq_key);
1871		umtxq_unlock(&uq->uq_key);
1872
1873		/*
1874		 * Set the contested bit so that a release in user space
1875		 * knows to use the system call for unlock.  If this fails
1876		 * either someone else has acquired the lock or it has been
1877		 * released.
1878		 */
1879		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1880
1881		/* The address was invalid. */
1882		if (old == -1) {
1883			umtxq_lock(&uq->uq_key);
1884			umtxq_unbusy(&uq->uq_key);
1885			umtxq_unlock(&uq->uq_key);
1886			error = EFAULT;
1887			break;
1888		}
1889
1890		umtxq_lock(&uq->uq_key);
1891		/*
1892		 * If we managed to set the contested bit, sleep; otherwise
1893		 * the lock changed and we need to retry, or we lost a race
1894		 * to the thread unlocking the umtx.
1895		 */
1896		if (old == owner)
1897			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1898			    "umtxpi", timeout == NULL ? NULL : &timo);
1899		else {
1900			umtxq_unbusy(&uq->uq_key);
1901			umtxq_unlock(&uq->uq_key);
1902		}
1903	}
1904
1905	umtxq_lock(&uq->uq_key);
1906	umtx_pi_unref(pi);
1907	umtxq_unlock(&uq->uq_key);
1908
1909	umtx_key_release(&uq->uq_key);
1910	return (error);
1911}
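
/*
 * Userland view of a PI umutex, as a minimal sketch (libthr's real
 * code lives in lib/libthr/thread/thr_umtx.c): the fast path is the
 * same tid CAS as for a normal umutex; only the contested path enters
 * the kernel, so ownership -- and therefore priority lending -- can be
 * tracked through the umtx_pi above.
 */
#if 0
	struct umutex m = DEFAULT_UMUTEX;	/* then set UMUTEX_PRIO_INHERIT
						   in m.m_flags */
	long tid;
	uint32_t id;

	thr_self(&tid);				/* kernel tid is the lock value */
	id = (uint32_t)tid;

	if (!atomic_cmpset_acq_32((volatile uint32_t *)&m.m_owner,
	    UMUTEX_UNOWNED, id))
		_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);
	/* ... critical section ... */
	if (!atomic_cmpset_rel_32((volatile uint32_t *)&m.m_owner,
	    id, UMUTEX_UNOWNED))
		_umtx_op(&m, UMTX_OP_MUTEX_UNLOCK, 0, NULL, NULL);
#endif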
1912
1913/*
1914 * Unlock a PI mutex.
1915 */
1916static int
1917do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1918{
1919	struct umtx_key key;
1920	struct umtx_q *uq_first, *uq_first2, *uq_me;
1921	struct umtx_pi *pi, *pi2;
1922	uint32_t owner, old, id;
1923	int error;
1924	int count;
1925	int pri;
1926
1927	id = td->td_tid;
1928	/*
1929	 * Make sure we own this mtx.
1930	 */
1931	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1932	if (owner == -1)
1933		return (EFAULT);
1934
1935	if ((owner & ~UMUTEX_CONTESTED) != id)
1936		return (EPERM);
1937
1938	/* This should be done in userland */
1939	if ((owner & UMUTEX_CONTESTED) == 0) {
1940		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1941		if (old == -1)
1942			return (EFAULT);
1943		if (old == owner)
1944			return (0);
1945		owner = old;
1946	}
1947
1948	/* We should only ever be in here for contested locks */
1949	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1950	    &key)) != 0)
1951		return (error);
1952
1953	umtxq_lock(&key);
1954	umtxq_busy(&key);
1955	count = umtxq_count_pi(&key, &uq_first);
1956	if (uq_first != NULL) {
1957		mtx_lock_spin(&umtx_lock);
1958		pi = uq_first->uq_pi_blocked;
1959		KASSERT(pi != NULL, ("pi == NULL?"));
1960		if (pi->pi_owner != curthread) {
1961			mtx_unlock_spin(&umtx_lock);
1962			umtxq_unbusy(&key);
1963			umtxq_unlock(&key);
1964			umtx_key_release(&key);
1965			/* userland messed up the mutex */
1966			return (EPERM);
1967		}
1968		uq_me = curthread->td_umtxq;
1969		pi->pi_owner = NULL;
1970		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1971		/* Get the highest-priority thread that is still sleeping. */
1972		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1973		while (uq_first != NULL &&
1974		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1975			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1976		}
1977		pri = PRI_MAX;
1978		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1979			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1980			if (uq_first2 != NULL) {
1981				if (pri > UPRI(uq_first2->uq_thread))
1982					pri = UPRI(uq_first2->uq_thread);
1983			}
1984		}
1985		thread_lock(curthread);
1986		sched_lend_user_prio(curthread, pri);
1987		thread_unlock(curthread);
1988		mtx_unlock_spin(&umtx_lock);
1989		if (uq_first)
1990			umtxq_signal_thread(uq_first);
1991	}
1992	umtxq_unlock(&key);
1993
1994	/*
1995	 * When unlocking the umtx, it must be marked as unowned if
1996	 * there is at most one thread waiting on it.
1997	 * Otherwise, it must be marked as contested.
1998	 */
1999	old = casuword32(&m->m_owner, owner,
2000		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
2001
2002	umtxq_lock(&key);
2003	umtxq_unbusy(&key);
2004	umtxq_unlock(&key);
2005	umtx_key_release(&key);
2006	if (old == -1)
2007		return (EFAULT);
2008	if (old != owner)
2009		return (EINVAL);
2010	return (0);
2011}
2012
2013/*
2014 * Lock a PP mutex.
2015 */
2016static int
2017do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2018    struct _umtx_time *timeout, int try)
2019{
2020	struct abs_timeout timo;
2021	struct umtx_q *uq, *uq2;
2022	struct umtx_pi *pi;
2023	uint32_t ceiling;
2024	uint32_t owner, id;
2025	int error, pri, old_inherited_pri, su;
2026
2027	id = td->td_tid;
2028	uq = td->td_umtxq;
2029	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2030	    &uq->uq_key)) != 0)
2031		return (error);
2032
2033	if (timeout != NULL)
2034		abs_timeout_init2(&timo, timeout);
2035
2036	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2037	for (;;) {
2038		old_inherited_pri = uq->uq_inherited_pri;
2039		umtxq_lock(&uq->uq_key);
2040		umtxq_busy(&uq->uq_key);
2041		umtxq_unlock(&uq->uq_key);
2042
2043		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
2044		if (ceiling > RTP_PRIO_MAX) {
2045			error = EINVAL;
2046			goto out;
2047		}
2048
2049		mtx_lock_spin(&umtx_lock);
2050		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2051			mtx_unlock_spin(&umtx_lock);
2052			error = EINVAL;
2053			goto out;
2054		}
2055		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2056			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2057			thread_lock(td);
2058			if (uq->uq_inherited_pri < UPRI(td))
2059				sched_lend_user_prio(td, uq->uq_inherited_pri);
2060			thread_unlock(td);
2061		}
2062		mtx_unlock_spin(&umtx_lock);
2063
2064		owner = casuword32(&m->m_owner,
2065		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2066
2067		if (owner == UMUTEX_CONTESTED) {
2068			error = 0;
2069			break;
2070		}
2071
2072		/* The address was invalid. */
2073		if (owner == -1) {
2074			error = EFAULT;
2075			break;
2076		}
2077
2078		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2079		    (owner & ~UMUTEX_CONTESTED) == id) {
2080			error = EDEADLK;
2081			break;
2082		}
2083
2084		if (try != 0) {
2085			error = EBUSY;
2086			break;
2087		}
2088
2089		/*
2090		 * If we caught a signal, we have already retried, so
2091		 * exit immediately.
2092		 */
2093		if (error != 0)
2094			break;
2095
2096		umtxq_lock(&uq->uq_key);
2097		umtxq_insert(uq);
2098		umtxq_unbusy(&uq->uq_key);
2099		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2100		    NULL : &timo);
2101		umtxq_remove(uq);
2102		umtxq_unlock(&uq->uq_key);
2103
2104		mtx_lock_spin(&umtx_lock);
2105		uq->uq_inherited_pri = old_inherited_pri;
2106		pri = PRI_MAX;
2107		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2108			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2109			if (uq2 != NULL) {
2110				if (pri > UPRI(uq2->uq_thread))
2111					pri = UPRI(uq2->uq_thread);
2112			}
2113		}
2114		if (pri > uq->uq_inherited_pri)
2115			pri = uq->uq_inherited_pri;
2116		thread_lock(td);
2117		sched_lend_user_prio(td, pri);
2118		thread_unlock(td);
2119		mtx_unlock_spin(&umtx_lock);
2120	}
2121
2122	if (error != 0) {
2123		mtx_lock_spin(&umtx_lock);
2124		uq->uq_inherited_pri = old_inherited_pri;
2125		pri = PRI_MAX;
2126		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2127			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2128			if (uq2 != NULL) {
2129				if (pri > UPRI(uq2->uq_thread))
2130					pri = UPRI(uq2->uq_thread);
2131			}
2132		}
2133		if (pri > uq->uq_inherited_pri)
2134			pri = uq->uq_inherited_pri;
2135		thread_lock(td);
2136		sched_lend_user_prio(td, pri);
2137		thread_unlock(td);
2138		mtx_unlock_spin(&umtx_lock);
2139	}
2140
2141out:
2142	umtxq_lock(&uq->uq_key);
2143	umtxq_unbusy(&uq->uq_key);
2144	umtxq_unlock(&uq->uq_key);
2145	umtx_key_release(&uq->uq_key);
2146	return (error);
2147}
2148
2149/*
2150 * Unlock a PP mutex.
2151 */
2152static int
2153do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2154{
2155	struct umtx_key key;
2156	struct umtx_q *uq, *uq2;
2157	struct umtx_pi *pi;
2158	uint32_t owner, id;
2159	uint32_t rceiling;
2160	int error, pri, new_inherited_pri, su;
2161
2162	id = td->td_tid;
2163	uq = td->td_umtxq;
2164	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2165
2166	/*
2167	 * Make sure we own this mtx.
2168	 */
2169	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2170	if (owner == -1)
2171		return (EFAULT);
2172
2173	if ((owner & ~UMUTEX_CONTESTED) != id)
2174		return (EPERM);
2175
2176	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2177	if (error != 0)
2178		return (error);
2179
2180	if (rceiling == -1)
2181		new_inherited_pri = PRI_MAX;
2182	else {
2183		rceiling = RTP_PRIO_MAX - rceiling;
2184		if (rceiling > RTP_PRIO_MAX)
2185			return (EINVAL);
2186		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2187	}
2188
2189	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2190	    &key)) != 0)
2191		return (error);
2192	umtxq_lock(&key);
2193	umtxq_busy(&key);
2194	umtxq_unlock(&key);
2195	/*
2196	 * For a priority-protected mutex, always set the unlocked state
2197	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2198	 * to lock the mutex.  This is necessary because the thread
2199	 * priority has to be adjusted for such mutexes.
2200	 */
2201	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2202		UMUTEX_CONTESTED);
2203
2204	umtxq_lock(&key);
2205	if (error == 0)
2206		umtxq_signal(&key, 1);
2207	umtxq_unbusy(&key);
2208	umtxq_unlock(&key);
2209
2210	if (error == -1)
2211		error = EFAULT;
2212	else {
2213		mtx_lock_spin(&umtx_lock);
2214		if (su != 0)
2215			uq->uq_inherited_pri = new_inherited_pri;
2216		pri = PRI_MAX;
2217		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2218			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2219			if (uq2 != NULL) {
2220				if (pri > UPRI(uq2->uq_thread))
2221					pri = UPRI(uq2->uq_thread);
2222			}
2223		}
2224		if (pri > uq->uq_inherited_pri)
2225			pri = uq->uq_inherited_pri;
2226		thread_lock(td);
2227		sched_lend_user_prio(td, pri);
2228		thread_unlock(td);
2229		mtx_unlock_spin(&umtx_lock);
2230	}
2231	umtx_key_release(&key);
2232	return (error);
2233}
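
/*
 * Editorial note: because do_unlock_pp() stores UMUTEX_CONTESTED instead
 * of UMUTEX_UNOWNED, a conventional userland fast path can never acquire
 * a priority-protected mutex without entering the kernel.  A minimal
 * sketch of such a fast path (hypothetical helper, not part of this
 * file):
 */
#if 0	/* illustrative only, not compiled */
static int
umutex_trylock_fastpath(struct umutex *m, uint32_t tid)
{
	/* Succeeds only when m_owner is UMUTEX_UNOWNED (0). */
	if (atomic_cmpset_acq_32(&m->m_owner, UMUTEX_UNOWNED, tid))
		return (0);
	/* PP mutexes idle at UMUTEX_CONTESTED, so we always end up here. */
	return (EBUSY);		/* caller falls back to _umtx_op(2) */
}
#endif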
2234
2235static int
2236do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2237	uint32_t *old_ceiling)
2238{
2239	struct umtx_q *uq;
2240	uint32_t save_ceiling;
2241	uint32_t owner, id;
2242	uint32_t flags;
2243	int error;
2244
2245	flags = fuword32(&m->m_flags);
2246	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2247		return (EINVAL);
2248	if (ceiling > RTP_PRIO_MAX)
2249		return (EINVAL);
2250	id = td->td_tid;
2251	uq = td->td_umtxq;
2252	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2253	   &uq->uq_key)) != 0)
2254		return (error);
2255	for (;;) {
2256		umtxq_lock(&uq->uq_key);
2257		umtxq_busy(&uq->uq_key);
2258		umtxq_unlock(&uq->uq_key);
2259
2260		save_ceiling = fuword32(&m->m_ceilings[0]);
2261
2262		owner = casuword32(&m->m_owner,
2263		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2264
2265		if (owner == UMUTEX_CONTESTED) {
2266			suword32(&m->m_ceilings[0], ceiling);
2267			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2268				UMUTEX_CONTESTED);
2269			error = 0;
2270			break;
2271		}
2272
2273		/* The address was invalid. */
2274		if (owner == -1) {
2275			error = EFAULT;
2276			break;
2277		}
2278
2279		if ((owner & ~UMUTEX_CONTESTED) == id) {
2280			suword32(&m->m_ceilings[0], ceiling);
2281			error = 0;
2282			break;
2283		}
2284
2285		/*
2286		 * If we caught a signal, we have already retried;
2287		 * exit immediately now.
2288		 */
2289		if (error != 0)
2290			break;
2291
2292		/*
2293		 * If we set the contested bit, sleep.  Otherwise the lock
2294		 * changed and we need to retry, or we lost a race with the
2295		 * thread unlocking the umtx.
2296		 */
2297		umtxq_lock(&uq->uq_key);
2298		umtxq_insert(uq);
2299		umtxq_unbusy(&uq->uq_key);
2300		error = umtxq_sleep(uq, "umtxpp", NULL);
2301		umtxq_remove(uq);
2302		umtxq_unlock(&uq->uq_key);
2303	}
2304	umtxq_lock(&uq->uq_key);
2305	if (error == 0)
2306		umtxq_signal(&uq->uq_key, INT_MAX);
2307	umtxq_unbusy(&uq->uq_key);
2308	umtxq_unlock(&uq->uq_key);
2309	umtx_key_release(&uq->uq_key);
2310	if (error == 0 && old_ceiling != NULL)
2311		suword32(old_ceiling, save_ceiling);
2312	return (error);
2313}
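
/*
 * Editorial note: a minimal userland sketch of driving do_set_ceiling()
 * through UMTX_OP_SET_CEILING; the previous ceiling is written back via
 * uaddr1.  (Hypothetical snippet; assumes the _umtx_op(2) prototype from
 * <sys/umtx.h> and err(3).)
 */
#if 0	/* illustrative only, not compiled */
	uint32_t old_ceiling;

	if (_umtx_op(&m, UMTX_OP_SET_CEILING, new_ceiling,
	    &old_ceiling, NULL) == -1)
		err(1, "UMTX_OP_SET_CEILING");
#endif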
2314
2315/*
2316 * Lock a userland POSIX mutex.
2317 */
2318static int
2319do_lock_umutex(struct thread *td, struct umutex *m,
2320    struct _umtx_time *timeout, int mode)
2321{
2322	uint32_t flags;
2323	int error;
2324
2325	flags = fuword32(&m->m_flags);
2326	if (flags == -1)
2327		return (EFAULT);
2328
2329	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2330	case 0:
2331		error = do_lock_normal(td, m, flags, timeout, mode);
2332		break;
2333	case UMUTEX_PRIO_INHERIT:
2334		error = do_lock_pi(td, m, flags, timeout, mode);
2335		break;
2336	case UMUTEX_PRIO_PROTECT:
2337		error = do_lock_pp(td, m, flags, timeout, mode);
2338		break;
2339	default:
2340		return (EINVAL);
2341	}
2342	if (timeout == NULL) {
2343		if (error == EINTR && mode != _UMUTEX_WAIT)
2344			error = ERESTART;
2345	} else {
2346		/* Timed-locking is not restarted. */
2347		if (error == ERESTART)
2348			error = EINTR;
2349	}
2350	return (error);
2351}
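
/*
 * Editorial note: the error translation above makes the caller-visible
 * semantics differ: an untimed lock interrupted by a signal is restarted
 * transparently, while a timed lock reports EINTR.  A minimal sketch
 * (hypothetical userland code):
 */
#if 0	/* illustrative only, not compiled */
	struct _umtx_time ut = {
		._timeout = { .tv_sec = 1, .tv_nsec = 0 },
		._flags = 0,			/* relative timeout */
		._clockid = CLOCK_REALTIME,
	};

	/* Timed: may fail with errno set to ETIMEDOUT or EINTR. */
	_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, (void *)sizeof(ut), &ut);

	/* Untimed: never fails with EINTR; the syscall is restarted. */
	_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);
#endif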
2352
2353/*
2354 * Unlock a userland POSIX mutex.
2355 */
2356static int
2357do_unlock_umutex(struct thread *td, struct umutex *m)
2358{
2359	uint32_t flags;
2360
2361	flags = fuword32(&m->m_flags);
2362	if (flags == -1)
2363		return (EFAULT);
2364
2365	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2366	case 0:
2367		return (do_unlock_normal(td, m, flags));
2368	case UMUTEX_PRIO_INHERIT:
2369		return (do_unlock_pi(td, m, flags));
2370	case UMUTEX_PRIO_PROTECT:
2371		return (do_unlock_pp(td, m, flags));
2372	}
2373
2374	return (EINVAL);
2375}
2376
2377static int
2378do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2379	struct timespec *timeout, u_long wflags)
2380{
2381	struct abs_timeout timo;
2382	struct umtx_q *uq;
2383	uint32_t flags;
2384	uint32_t clockid;
2385	int error;
2386
2387	uq = td->td_umtxq;
2388	flags = fuword32(&cv->c_flags);
2389	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2390	if (error != 0)
2391		return (error);
2392
2393	if ((wflags & CVWAIT_CLOCKID) != 0) {
2394		clockid = fuword32(&cv->c_clockid);
2395		if (clockid < CLOCK_REALTIME ||
2396		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2397			/* Only the predefined clock ids are valid. */
			umtx_key_release(&uq->uq_key);
2398			return (EINVAL);
2399		}
2400	} else {
2401		clockid = CLOCK_REALTIME;
2402	}
2403
2404	umtxq_lock(&uq->uq_key);
2405	umtxq_busy(&uq->uq_key);
2406	umtxq_insert(uq);
2407	umtxq_unlock(&uq->uq_key);
2408
2409	/*
2410	 * Set c_has_waiters to 1 before releasing the user mutex;
2411	 * avoid dirtying the cache line when it is not necessary.
2412	 */
2413	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2414		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2415
2416	umtxq_lock(&uq->uq_key);
2417	umtxq_unbusy(&uq->uq_key);
2418	umtxq_unlock(&uq->uq_key);
2419
2420	error = do_unlock_umutex(td, m);
2421
2422	if (timeout != NULL)
2423		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2424			timeout);
2425
2426	umtxq_lock(&uq->uq_key);
2427	if (error == 0) {
2428		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2429		    NULL : &timo);
2430	}
2431
2432	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2433		error = 0;
2434	else {
2435		/*
2436		 * This must be a timeout, an interruption by a signal,
2437		 * or a spurious wakeup; clear the c_has_waiters flag
2438		 * when necessary.
2439		 */
2440		umtxq_busy(&uq->uq_key);
2441		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2442			int oldlen = uq->uq_cur_queue->length;
2443			umtxq_remove(uq);
2444			if (oldlen == 1) {
2445				umtxq_unlock(&uq->uq_key);
2446				suword32(
2447				    __DEVOLATILE(uint32_t *,
2448					 &cv->c_has_waiters), 0);
2449				umtxq_lock(&uq->uq_key);
2450			}
2451		}
2452		umtxq_unbusy(&uq->uq_key);
2453		if (error == ERESTART)
2454			error = EINTR;
2455	}
2456
2457	umtxq_unlock(&uq->uq_key);
2458	umtx_key_release(&uq->uq_key);
2459	return (error);
2460}
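
/*
 * Editorial note: do_cv_wait() expects the caller to hold the user
 * mutex m; c_has_waiters is set before the mutex is dropped, which is
 * what closes the classic missed-wakeup window.  A minimal userland
 * sketch (hypothetical my_mutex_* wrappers):
 */
#if 0	/* illustrative only, not compiled */
	my_mutex_lock(&m);
	while (!predicate) {
		/* Atomically: queue on cv, unlock m, then sleep. */
		_umtx_op(&cv, UMTX_OP_CV_WAIT, 0, &m, NULL);
		my_mutex_lock(&m);	/* re-acquire before re-testing */
	}
	my_mutex_unlock(&m);
#endif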
2461
2462/*
2463 * Signal a userland condition variable.
2464 */
2465static int
2466do_cv_signal(struct thread *td, struct ucond *cv)
2467{
2468	struct umtx_key key;
2469	int error, cnt, nwake;
2470	uint32_t flags;
2471
2472	flags = fuword32(&cv->c_flags);
2473	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2474		return (error);
2475	umtxq_lock(&key);
2476	umtxq_busy(&key);
2477	cnt = umtxq_count(&key);
2478	nwake = umtxq_signal(&key, 1);
2479	if (cnt <= nwake) {
2480		umtxq_unlock(&key);
2481		error = suword32(
2482		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2483		umtxq_lock(&key);
2484	}
2485	umtxq_unbusy(&key);
2486	umtxq_unlock(&key);
2487	umtx_key_release(&key);
2488	return (error);
2489}
2490
2491static int
2492do_cv_broadcast(struct thread *td, struct ucond *cv)
2493{
2494	struct umtx_key key;
2495	int error;
2496	uint32_t flags;
2497
2498	flags = fuword32(&cv->c_flags);
2499	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2500		return (error);
2501
2502	umtxq_lock(&key);
2503	umtxq_busy(&key);
2504	umtxq_signal(&key, INT_MAX);
2505	umtxq_unlock(&key);
2506
2507	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2508
2509	umtxq_lock(&key);
2510	umtxq_unbusy(&key);
2511	umtxq_unlock(&key);
2512
2513	umtx_key_release(&key);
2514	return (error);
2515}
2516
2517static int
2518do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2519{
2520	struct abs_timeout timo;
2521	struct umtx_q *uq;
2522	uint32_t flags, wrflags;
2523	int32_t state, oldstate;
2524	int32_t blocked_readers;
2525	int error;
2526
2527	uq = td->td_umtxq;
2528	flags = fuword32(&rwlock->rw_flags);
2529	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2530	if (error != 0)
2531		return (error);
2532
2533	if (timeout != NULL)
2534		abs_timeout_init2(&timo, timeout);
2535
2536	wrflags = URWLOCK_WRITE_OWNER;
2537	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2538		wrflags |= URWLOCK_WRITE_WAITERS;
2539
2540	for (;;) {
2541		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2542		/* try to lock it */
2543		while (!(state & wrflags)) {
2544			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2545				umtx_key_release(&uq->uq_key);
2546				return (EAGAIN);
2547			}
2548			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2549			if (oldstate == state) {
2550				umtx_key_release(&uq->uq_key);
2551				return (0);
2552			}
2553			state = oldstate;
2554		}
2555
2556		if (error)
2557			break;
2558
2559		/* grab monitor lock */
2560		umtxq_lock(&uq->uq_key);
2561		umtxq_busy(&uq->uq_key);
2562		umtxq_unlock(&uq->uq_key);
2563
2564		/*
2565		 * Re-read the state, in case it changed between the
2566		 * try-lock above and the check below.
2567		 */
2568		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2569
2570		/* set read contention bit */
2571		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2572			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2573			if (oldstate == state)
2574				goto sleep;
2575			state = oldstate;
2576		}
2577
2578		/* The state changed while we were setting flags; restart. */
2579		if (!(state & wrflags)) {
2580			umtxq_lock(&uq->uq_key);
2581			umtxq_unbusy(&uq->uq_key);
2582			umtxq_unlock(&uq->uq_key);
2583			continue;
2584		}
2585
2586sleep:
2587		/* The contention bit is set; bump the read waiter count before sleeping. */
2588		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2589		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2590
2591		while (state & wrflags) {
2592			umtxq_lock(&uq->uq_key);
2593			umtxq_insert(uq);
2594			umtxq_unbusy(&uq->uq_key);
2595
2596			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2597			    NULL : &timo);
2598
2599			umtxq_busy(&uq->uq_key);
2600			umtxq_remove(uq);
2601			umtxq_unlock(&uq->uq_key);
2602			if (error)
2603				break;
2604			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2605		}
2606
2607		/* Decrease the read waiter count; the last waiter clears the contention bit. */
2608		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2609		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2610		if (blocked_readers == 1) {
2611			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2612			for (;;) {
2613				oldstate = casuword32(&rwlock->rw_state, state,
2614					 state & ~URWLOCK_READ_WAITERS);
2615				if (oldstate == state)
2616					break;
2617				state = oldstate;
2618			}
2619		}
2620
2621		umtxq_lock(&uq->uq_key);
2622		umtxq_unbusy(&uq->uq_key);
2623		umtxq_unlock(&uq->uq_key);
2624	}
2625	umtx_key_release(&uq->uq_key);
2626	if (error == ERESTART)
2627		error = EINTR;
2628	return (error);
2629}
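
/*
 * Editorial note: the "state + 1" CAS above works because rw_state packs
 * the reader count into the low bits and keeps the flags in the high
 * bits (URWLOCK_WRITE_OWNER, URWLOCK_WRITE_WAITERS, URWLOCK_READ_WAITERS
 * and URWLOCK_READER_COUNT() in <sys/umtx.h>), so adding one takes a
 * read lock without touching the flag bits.  Sketch:
 */
#if 0	/* illustrative only, not compiled */
	int32_t state, new_state;

	state = rw_state;
	if ((state & (URWLOCK_WRITE_OWNER | URWLOCK_WRITE_WAITERS)) == 0 &&
	    URWLOCK_READER_COUNT(state) < URWLOCK_MAX_READERS)
		new_state = state + 1;	/* one more reader, flags intact */
#endif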
2630
2631static int
2632do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2633{
2634	struct abs_timeout timo;
2635	struct umtx_q *uq;
2636	uint32_t flags;
2637	int32_t state, oldstate;
2638	int32_t blocked_writers;
2639	int32_t blocked_readers;
2640	int error;
2641
2642	uq = td->td_umtxq;
2643	flags = fuword32(&rwlock->rw_flags);
2644	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2645	if (error != 0)
2646		return (error);
2647
2648	if (timeout != NULL)
2649		abs_timeout_init2(&timo, timeout);
2650
2651	blocked_readers = 0;
2652	for (;;) {
2653		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2654		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2655			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2656			if (oldstate == state) {
2657				umtx_key_release(&uq->uq_key);
2658				return (0);
2659			}
2660			state = oldstate;
2661		}
2662
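		/*
		 * A sleep in a previous iteration may have failed with a
		 * timeout or a signal.  If no write owner or write waiter
		 * remains but readers are still blocked behind the
		 * URWLOCK_WRITE_WAITERS bit we set, wake them all, or
		 * they could sleep forever.
		 */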
2663		if (error) {
2664			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2665			    blocked_readers != 0) {
2666				umtxq_lock(&uq->uq_key);
2667				umtxq_busy(&uq->uq_key);
2668				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2669				umtxq_unbusy(&uq->uq_key);
2670				umtxq_unlock(&uq->uq_key);
2671			}
2672
2673			break;
2674		}
2675
2676		/* grab monitor lock */
2677		umtxq_lock(&uq->uq_key);
2678		umtxq_busy(&uq->uq_key);
2679		umtxq_unlock(&uq->uq_key);
2680
2681		/*
2682		 * Re-read the state, in case it changed between the
2683		 * try-lock above and the check below.
2684		 */
2685		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2686
2687		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2688		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2689			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2690			if (oldstate == state)
2691				goto sleep;
2692			state = oldstate;
2693		}
2694
2695		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2696			umtxq_lock(&uq->uq_key);
2697			umtxq_unbusy(&uq->uq_key);
2698			umtxq_unlock(&uq->uq_key);
2699			continue;
2700		}
2701sleep:
2702		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2703		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2704
2705		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2706			umtxq_lock(&uq->uq_key);
2707			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2708			umtxq_unbusy(&uq->uq_key);
2709
2710			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
2711			    NULL : &timo);
2712
2713			umtxq_busy(&uq->uq_key);
2714			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2715			umtxq_unlock(&uq->uq_key);
2716			if (error)
2717				break;
2718			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2719		}
2720
2721		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2722		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2723		if (blocked_writers == 1) {
2724			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2725			for (;;) {
2726				oldstate = casuword32(&rwlock->rw_state, state,
2727					 state & ~URWLOCK_WRITE_WAITERS);
2728				if (oldstate == state)
2729					break;
2730				state = oldstate;
2731			}
2732			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2733		} else
2734			blocked_readers = 0;
2735
2736		umtxq_lock(&uq->uq_key);
2737		umtxq_unbusy(&uq->uq_key);
2738		umtxq_unlock(&uq->uq_key);
2739	}
2740
2741	umtx_key_release(&uq->uq_key);
2742	if (error == ERESTART)
2743		error = EINTR;
2744	return (error);
2745}
2746
2747static int
2748do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2749{
2750	struct umtx_q *uq;
2751	uint32_t flags;
2752	int32_t state, oldstate;
2753	int error, q, count;
2754
2755	uq = td->td_umtxq;
2756	flags = fuword32(&rwlock->rw_flags);
2757	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2758	if (error != 0)
2759		return (error);
2760
2761	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2762	if (state & URWLOCK_WRITE_OWNER) {
2763		for (;;) {
2764			oldstate = casuword32(&rwlock->rw_state, state,
2765				state & ~URWLOCK_WRITE_OWNER);
2766			if (oldstate != state) {
2767				state = oldstate;
2768				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2769					error = EPERM;
2770					goto out;
2771				}
2772			} else
2773				break;
2774		}
2775	} else if (URWLOCK_READER_COUNT(state) != 0) {
2776		for (;;) {
2777			oldstate = casuword32(&rwlock->rw_state, state,
2778				state - 1);
2779			if (oldstate != state) {
2780				state = oldstate;
2781				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2782					error = EPERM;
2783					goto out;
2784				}
2785			}
2786			else
2787				break;
2788		}
2789	} else {
2790		error = EPERM;
2791		goto out;
2792	}
2793
2794	count = 0;
2795
2796	if (!(flags & URWLOCK_PREFER_READER)) {
2797		if (state & URWLOCK_WRITE_WAITERS) {
2798			count = 1;
2799			q = UMTX_EXCLUSIVE_QUEUE;
2800		} else if (state & URWLOCK_READ_WAITERS) {
2801			count = INT_MAX;
2802			q = UMTX_SHARED_QUEUE;
2803		}
2804	} else {
2805		if (state & URWLOCK_READ_WAITERS) {
2806			count = INT_MAX;
2807			q = UMTX_SHARED_QUEUE;
2808		} else if (state & URWLOCK_WRITE_WAITERS) {
2809			count = 1;
2810			q = UMTX_EXCLUSIVE_QUEUE;
2811		}
2812	}
2813
2814	if (count) {
2815		umtxq_lock(&uq->uq_key);
2816		umtxq_busy(&uq->uq_key);
2817		umtxq_signal_queue(&uq->uq_key, count, q);
2818		umtxq_unbusy(&uq->uq_key);
2819		umtxq_unlock(&uq->uq_key);
2820	}
2821out:
2822	umtx_key_release(&uq->uq_key);
2823	return (error);
2824}
2825
2826static int
2827do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2828{
2829	struct abs_timeout timo;
2830	struct umtx_q *uq;
2831	uint32_t flags, count;
2832	int error;
2833
2834	uq = td->td_umtxq;
2835	flags = fuword32(&sem->_flags);
2836	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2837	if (error != 0)
2838		return (error);
2839
2840	if (timeout != NULL)
2841		abs_timeout_init2(&timo, timeout);
2842
2843	umtxq_lock(&uq->uq_key);
2844	umtxq_busy(&uq->uq_key);
2845	umtxq_insert(uq);
2846	umtxq_unlock(&uq->uq_key);
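	/*
	 * Publish _has_waiters before re-reading _count: a poster that
	 * increments _count after this point will see the flag and call
	 * UMTX_OP_SEM_WAKE, so the wakeup cannot be lost.
	 */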
2847	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2848	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2849	if (count != 0) {
2850		umtxq_lock(&uq->uq_key);
2851		umtxq_unbusy(&uq->uq_key);
2852		umtxq_remove(uq);
2853		umtxq_unlock(&uq->uq_key);
2854		umtx_key_release(&uq->uq_key);
2855		return (0);
2856	}
2857	umtxq_lock(&uq->uq_key);
2858	umtxq_unbusy(&uq->uq_key);
2859
2860	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
2861
2862	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2863		error = 0;
2864	else {
2865		umtxq_remove(uq);
2866		if (error == ERESTART)
2867			error = EINTR;
2868	}
2869	umtxq_unlock(&uq->uq_key);
2870	umtx_key_release(&uq->uq_key);
2871	return (error);
2872}
2873
2874/*
2875 * Wake up threads waiting on a userland semaphore.
2876 */
2877static int
2878do_sem_wake(struct thread *td, struct _usem *sem)
2879{
2880	struct umtx_key key;
2881	int error, cnt;
2882	uint32_t flags;
2883
2884	flags = fuword32(&sem->_flags);
2885	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2886		return (error);
2887	umtxq_lock(&key);
2888	umtxq_busy(&key);
2889	cnt = umtxq_count(&key);
2890	if (cnt > 0) {
2891		umtxq_signal(&key, 1);
2892		/*
2893		 * A count greater than zero means the memory is still
2894		 * being referenced by a waiting thread, so we can safely
2895		 * update the _has_waiters flag.
2896		 */
2897		if (cnt == 1) {
2898			umtxq_unlock(&key);
2899			error = suword32(
2900			    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2901			umtxq_lock(&key);
2902		}
2903	}
2904	umtxq_unbusy(&key);
2905	umtxq_unlock(&key);
2906	umtx_key_release(&key);
2907	return (error);
2908}
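
/*
 * Editorial note: a minimal userland sketch of the semaphore fast path
 * these two functions back (hypothetical helpers; the real code lives
 * in libc/libthr):
 */
#if 0	/* illustrative only, not compiled */
static void
my_sem_wait(struct _usem *sem)
{
	uint32_t count;

	for (;;) {
		/* Fast path: decrement a non-zero count in userland. */
		count = sem->_count;
		if (count > 0 &&
		    atomic_cmpset_acq_32(&sem->_count, count, count - 1))
			return;
		/* Slow path: sleep in the kernel until _count != 0. */
		_umtx_op(sem, UMTX_OP_SEM_WAIT, 0, NULL, NULL);
	}
}

static void
my_sem_post(struct _usem *sem)
{
	atomic_add_rel_32(&sem->_count, 1);
	/* Enter the kernel only when a sleeper may exist. */
	if (sem->_has_waiters)
		_umtx_op(sem, UMTX_OP_SEM_WAKE, 0, NULL, NULL);
}
#endif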
2909
2910int
2911sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2912    /* struct umtx *umtx */
2913{
2914	return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2915}
2916
2917int
2918sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2919    /* struct umtx *umtx */
2920{
2921	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2922}
2923
2924inline int
2925umtx_copyin_timeout(const void *addr, struct timespec *tsp)
2926{
2927	int error;
2928
2929	error = copyin(addr, tsp, sizeof(struct timespec));
2930	if (error == 0) {
2931		if (tsp->tv_sec < 0 ||
2932		    tsp->tv_nsec >= 1000000000 ||
2933		    tsp->tv_nsec < 0)
2934			error = EINVAL;
2935	}
2936	return (error);
2937}
2938
2939static inline int
2940umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
2941{
2942	int error;
2943
2944	if (size <= sizeof(struct timespec)) {
2945		tp->_clockid = CLOCK_REALTIME;
2946		tp->_flags = 0;
2947		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
2948	} else
2949		error = copyin(addr, tp, sizeof(struct _umtx_time));
2950	if (error != 0)
2951		return (error);
2952	if (tp->_timeout.tv_sec < 0 ||
2953	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
2954		return (EINVAL);
2955	return (0);
2956}
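
/*
 * Editorial note: callers select the layout through the size argument.
 * A minimal sketch of the two accepted forms (hypothetical userland
 * code):
 */
#if 0	/* illustrative only, not compiled */
	/* Old form: a bare timespec, taken as relative CLOCK_REALTIME. */
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };
	_umtx_op(obj, UMTX_OP_WAIT, val, (void *)sizeof(ts), &ts);

	/* New form: a full _umtx_time with explicit flags and clock. */
	struct _umtx_time ut = {
		._timeout = { .tv_sec = 1, .tv_nsec = 0 },
		._flags = UMTX_ABSTIME,
		._clockid = CLOCK_MONOTONIC,
	};
	_umtx_op(obj, UMTX_OP_WAIT, val, (void *)sizeof(ut), &ut);
#endif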
2957
2958static int
2959__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2960{
2961	struct timespec *ts, timeout;
2962	int error;
2963
2964	/* Allow a null timespec (wait forever). */
2965	if (uap->uaddr2 == NULL)
2966		ts = NULL;
2967	else {
2968		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
2969		if (error != 0)
2970			return (error);
2971		ts = &timeout;
2972	}
2973	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2974}
2975
2976static int
2977__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2978{
2979	return (do_unlock_umtx(td, uap->obj, uap->val));
2980}
2981
2982static int
2983__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2984{
2985	struct _umtx_time timeout, *tm_p;
2986	int error;
2987
2988	if (uap->uaddr2 == NULL)
2989		tm_p = NULL;
2990	else {
2991		error = umtx_copyin_umtx_time(
2992		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
2993		if (error != 0)
2994			return (error);
2995		tm_p = &timeout;
2996	}
2997	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
2998}
2999
3000static int
3001__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3002{
3003	struct _umtx_time timeout, *tm_p;
3004	int error;
3005
3006	if (uap->uaddr2 == NULL)
3007		tm_p = NULL;
3008	else {
3009		error = umtx_copyin_umtx_time(
3010		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3011		if (error != 0)
3012			return (error);
3013		tm_p = &timeout;
3014	}
3015	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3016}
3017
3018static int
3019__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3020{
3021	struct _umtx_time *tm_p, timeout;
3022	int error;
3023
3024	if (uap->uaddr2 == NULL)
3025		tm_p = NULL;
3026	else {
3027		error = umtx_copyin_umtx_time(
3028		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3029		if (error != 0)
3030			return (error);
3031		tm_p = &timeout;
3032	}
3033	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3034}
3035
3036static int
3037__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3038{
3039	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3040}
3041
3042#define BATCH_SIZE	128
3043static int
3044__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3045{
3046	int count = uap->val;
3047	void *uaddrs[BATCH_SIZE];
3048	char **upp = (char **)uap->obj;
3049	int tocopy;
3050	int error = 0;
3051	int i, pos = 0;
3052
3053	while (count > 0) {
3054		tocopy = count;
3055		if (tocopy > BATCH_SIZE)
3056			tocopy = BATCH_SIZE;
3057		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3058		if (error != 0)
3059			break;
3060		for (i = 0; i < tocopy; ++i)
3061			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3062		count -= tocopy;
3063		pos += tocopy;
3064	}
3065	return (error);
3066}
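
/*
 * Editorial note: UMTX_OP_NWAKE_PRIVATE takes an array of addresses and
 * wakes every waiter on each one, copying the array in at most
 * BATCH_SIZE pointers per copyin().  A minimal sketch (hypothetical
 * userland code):
 */
#if 0	/* illustrative only, not compiled */
	void *addrs[3] = { &futex_a, &futex_b, &futex_c };

	/* obj = the address array, val = the number of entries. */
	_umtx_op(addrs, UMTX_OP_NWAKE_PRIVATE, 3, NULL, NULL);
#endif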
3067
3068static int
3069__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3070{
3071	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3072}
3073
3074static int
3075__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3076{
3077	struct _umtx_time *tm_p, timeout;
3078	int error;
3079
3080	/* Allow a null timespec (wait forever). */
3081	if (uap->uaddr2 == NULL)
3082		tm_p = NULL;
3083	else {
3084		error = umtx_copyin_umtx_time(
3085		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3086		if (error != 0)
3087			return (error);
3088		tm_p = &timeout;
3089	}
3090	return do_lock_umutex(td, uap->obj, tm_p, 0);
3091}
3092
3093static int
3094__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3095{
3096	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3097}
3098
3099static int
3100__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3101{
3102	struct _umtx_time *tm_p, timeout;
3103	int error;
3104
3105	/* Allow a null timespec (wait forever). */
3106	if (uap->uaddr2 == NULL)
3107		tm_p = NULL;
3108	else {
3109		error = umtx_copyin_umtx_time(
3110		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3111		if (error != 0)
3112			return (error);
3113		tm_p = &timeout;
3114	}
3115	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3116}
3117
3118static int
3119__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3120{
3121	return do_wake_umutex(td, uap->obj);
3122}
3123
3124static int
3125__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3126{
3127	return do_unlock_umutex(td, uap->obj);
3128}
3129
3130static int
3131__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3132{
3133	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3134}
3135
3136static int
3137__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3138{
3139	struct timespec *ts, timeout;
3140	int error;
3141
3142	/* Allow a null timespec (wait forever). */
3143	if (uap->uaddr2 == NULL)
3144		ts = NULL;
3145	else {
3146		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3147		if (error != 0)
3148			return (error);
3149		ts = &timeout;
3150	}
3151	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3152}
3153
3154static int
3155__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3156{
3157	return do_cv_signal(td, uap->obj);
3158}
3159
3160static int
3161__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3162{
3163	return do_cv_broadcast(td, uap->obj);
3164}
3165
3166static int
3167__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3168{
3169	struct _umtx_time timeout;
3170	int error;
3171
3172	/* Allow a null timespec (wait forever). */
3173	if (uap->uaddr2 == NULL) {
3174		error = do_rw_rdlock(td, uap->obj, uap->val, NULL);
3175	} else {
3176		error = umtx_copyin_umtx_time(uap->uaddr2,
3177		   (size_t)uap->uaddr1, &timeout);
3178		if (error != 0)
3179			return (error);
3180		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3181	}
3182	return (error);
3183}
3184
3185static int
3186__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3187{
3188	struct _umtx_time timeout;
3189	int error;
3190
3191	/* Allow a null timespec (wait forever). */
3192	if (uap->uaddr2 == NULL) {
3193		error = do_rw_wrlock(td, uap->obj, NULL);
3194	} else {
3195		error = umtx_copyin_umtx_time(uap->uaddr2,
3196		   (size_t)uap->uaddr1, &timeout);
3197		if (error != 0)
3198			return (error);
3199
3200		error = do_rw_wrlock(td, uap->obj, &timeout);
3201	}
3202	return (error);
3203}
3204
3205static int
3206__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3207{
3208	return do_rw_unlock(td, uap->obj);
3209}
3210
3211static int
3212__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3213{
3214	struct _umtx_time *tm_p, timeout;
3215	int error;
3216
3217	/* Allow a null timespec (wait forever). */
3218	if (uap->uaddr2 == NULL)
3219		tm_p = NULL;
3220	else {
3221		error = umtx_copyin_umtx_time(
3222		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3223		if (error != 0)
3224			return (error);
3225		tm_p = &timeout;
3226	}
3227	return (do_sem_wait(td, uap->obj, tm_p));
3228}
3229
3230static int
3231__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3232{
3233	return do_sem_wake(td, uap->obj);
3234}
3235
3236static int
3237__umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3238{
3239	return do_wake2_umutex(td, uap->obj, uap->val);
3240}
3241
3242typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3243
3244static _umtx_op_func op_table[] = {
3245	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3246	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3247	__umtx_op_wait,			/* UMTX_OP_WAIT */
3248	__umtx_op_wake,			/* UMTX_OP_WAKE */
3249	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3250	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3251	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3252	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3253	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3254	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3255	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3256	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3257	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3258	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3259	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3260	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3261	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3262	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3263	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3264	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3265	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3266	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
3267	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3268};
3269
3270int
3271sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3272{
3273	if ((unsigned)uap->op < UMTX_OP_MAX)
3274		return (*op_table[uap->op])(td, uap);
3275	return (EINVAL);
3276}
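
/*
 * Editorial note: every operation above is reached through this single
 * entry point; op selects the table slot and the remaining arguments
 * are interpreted per-op.  The userland prototype from <sys/umtx.h>:
 *
 *	int _umtx_op(void *obj, int op, u_long val, void *uaddr,
 *	    void *uaddr2);
 *
 * For example, waking up to INT_MAX waiters sleeping on a word:
 */
#if 0	/* illustrative only, not compiled */
	_umtx_op(&word, UMTX_OP_WAKE, INT_MAX, NULL, NULL);
#endif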
3277
3278#ifdef COMPAT_FREEBSD32
3279int
3280freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3281    /* struct umtx *umtx */
3282{
3283	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3284}
3285
3286int
3287freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3288    /* struct umtx *umtx */
3289{
3290	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3291}
3292
3293struct timespec32 {
3294	int32_t tv_sec;
3295	int32_t tv_nsec;
3296};
3297
3298struct umtx_time32 {
3299	struct	timespec32	timeout;
3300	uint32_t		flags;
3301	uint32_t		clockid;
3302};
3303
3304static inline int
3305umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3306{
3307	struct timespec32 ts32;
3308	int error;
3309
3310	error = copyin(addr, &ts32, sizeof(struct timespec32));
3311	if (error == 0) {
3312		if (ts32.tv_sec < 0 ||
3313		    ts32.tv_nsec >= 1000000000 ||
3314		    ts32.tv_nsec < 0)
3315			error = EINVAL;
3316		else {
3317			tsp->tv_sec = ts32.tv_sec;
3318			tsp->tv_nsec = ts32.tv_nsec;
3319		}
3320	}
3321	return (error);
3322}
3323
3324static inline int
3325umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3326{
3327	struct umtx_time32 t32;
3328	int error;
3329
3330	t32.clockid = CLOCK_REALTIME;
3331	t32.flags   = 0;
3332	if (size <= sizeof(struct timespec32))
3333		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3334	else
3335		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3336	if (error != 0)
3337		return (error);
3338	if (t32.timeout.tv_sec < 0 ||
3339	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3340		return (EINVAL);
3341	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3342	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3343	tp->_flags = t32.flags;
3344	tp->_clockid = t32.clockid;
3345	return (0);
3346}
3347
3348static int
3349__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3350{
3351	struct timespec *ts, timeout;
3352	int error;
3353
3354	/* Allow a null timespec (wait forever). */
3355	if (uap->uaddr2 == NULL)
3356		ts = NULL;
3357	else {
3358		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3359		if (error != 0)
3360			return (error);
3361		ts = &timeout;
3362	}
3363	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3364}
3365
3366static int
3367__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3368{
3369	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3370}
3371
3372static int
3373__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3374{
3375	struct _umtx_time *tm_p, timeout;
3376	int error;
3377
3378	if (uap->uaddr2 == NULL)
3379		tm_p = NULL;
3380	else {
3381		error = umtx_copyin_umtx_time32(uap->uaddr2,
3382			(size_t)uap->uaddr1, &timeout);
3383		if (error != 0)
3384			return (error);
3385		tm_p = &timeout;
3386	}
3387	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3388}
3389
3390static int
3391__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3392{
3393	struct _umtx_time *tm_p, timeout;
3394	int error;
3395
3396	/* Allow a null timespec (wait forever). */
3397	if (uap->uaddr2 == NULL)
3398		tm_p = NULL;
3399	else {
3400		error = umtx_copyin_umtx_time32(uap->uaddr2,
3401			    (size_t)uap->uaddr1, &timeout);
3402		if (error != 0)
3403			return (error);
3404		tm_p = &timeout;
3405	}
3406	return do_lock_umutex(td, uap->obj, tm_p, 0);
3407}
3408
3409static int
3410__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3411{
3412	struct _umtx_time *tm_p, timeout;
3413	int error;
3414
3415	/* Allow a null timespec (wait forever). */
3416	if (uap->uaddr2 == NULL)
3417		tm_p = NULL;
3418	else {
3419		error = umtx_copyin_umtx_time32(uap->uaddr2,
3420		    (size_t)uap->uaddr1, &timeout);
3421		if (error != 0)
3422			return (error);
3423		tm_p = &timeout;
3424	}
3425	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3426}
3427
3428static int
3429__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3430{
3431	struct timespec *ts, timeout;
3432	int error;
3433
3434	/* Allow a null timespec (wait forever). */
3435	if (uap->uaddr2 == NULL)
3436		ts = NULL;
3437	else {
3438		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3439		if (error != 0)
3440			return (error);
3441		ts = &timeout;
3442	}
3443	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3444}
3445
3446static int
3447__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3448{
3449	struct _umtx_time timeout;
3450	int error;
3451
3452	/* Allow a null timespec (wait forever). */
3453	if (uap->uaddr2 == NULL) {
3454		error = do_rw_rdlock(td, uap->obj, uap->val, NULL);
3455	} else {
3456		error = umtx_copyin_umtx_time32(uap->uaddr2,
3457		    (size_t)uap->uaddr1, &timeout);
3458		if (error != 0)
3459			return (error);
3460		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3461	}
3462	return (error);
3463}
3464
3465static int
3466__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3467{
3468	struct _umtx_time timeout;
3469	int error;
3470
3471	/* Allow a null timespec (wait forever). */
3472	if (uap->uaddr2 == NULL) {
3473		error = do_rw_wrlock(td, uap->obj, NULL);
3474	} else {
3475		error = umtx_copyin_umtx_time32(uap->uaddr2,
3476		    (size_t)uap->uaddr1, &timeout);
3477		if (error != 0)
3478			return (error);
3479		error = do_rw_wrlock(td, uap->obj, &timeout);
3480	}
3481	return (error);
3482}
3483
3484static int
3485__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3486{
3487	struct _umtx_time *tm_p, timeout;
3488	int error;
3489
3490	if (uap->uaddr2 == NULL)
3491		tm_p = NULL;
3492	else {
3493		error = umtx_copyin_umtx_time32(
3494		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3495		if (error != 0)
3496			return (error);
3497		tm_p = &timeout;
3498	}
3499	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3500}
3501
3502static int
3503__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3504{
3505	struct _umtx_time *tm_p, timeout;
3506	int error;
3507
3508	/* Allow a null timespec (wait forever). */
3509	if (uap->uaddr2 == NULL)
3510		tm_p = NULL;
3511	else {
3512		error = umtx_copyin_umtx_time32(uap->uaddr2,
3513		    (size_t)uap->uaddr1, &timeout);
3514		if (error != 0)
3515			return (error);
3516		tm_p = &timeout;
3517	}
3518	return (do_sem_wait(td, uap->obj, tm_p));
3519}
3520
3521static int
3522__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3523{
3524	int count = uap->val;
3525	uint32_t uaddrs[BATCH_SIZE];
3526	uint32_t **upp = (uint32_t **)uap->obj;
3527	int tocopy;
3528	int error = 0;
3529	int i, pos = 0;
3530
3531	while (count > 0) {
3532		tocopy = count;
3533		if (tocopy > BATCH_SIZE)
3534			tocopy = BATCH_SIZE;
3535		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3536		if (error != 0)
3537			break;
3538		for (i = 0; i < tocopy; ++i)
3539			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3540				INT_MAX, 1);
3541		count -= tocopy;
3542		pos += tocopy;
3543	}
3544	return (error);
3545}
3546
3547static _umtx_op_func op_table_compat32[] = {
3548	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3549	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3550	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3551	__umtx_op_wake,			/* UMTX_OP_WAKE */
3552	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3553	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3554	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3555	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3556	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3557	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3558	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3559	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3560	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3561	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3562	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3563	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3564	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3565	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3566	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3567	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3568	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3569	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
3570	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3571};
3572
3573int
3574freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3575{
3576	if ((unsigned)uap->op < UMTX_OP_MAX)
3577		return (*op_table_compat32[uap->op])(td,
3578			(struct _umtx_op_args *)uap);
3579	return (EINVAL);
3580}
3581#endif
3582
3583void
3584umtx_thread_init(struct thread *td)
3585{
3586	td->td_umtxq = umtxq_alloc();
3587	td->td_umtxq->uq_thread = td;
3588}
3589
3590void
3591umtx_thread_fini(struct thread *td)
3592{
3593	umtxq_free(td->td_umtxq);
3594}
3595
3596/*
3597 * Called when a new thread is created, e.g. by fork().
3598 */
3599void
3600umtx_thread_alloc(struct thread *td)
3601{
3602	struct umtx_q *uq;
3603
3604	uq = td->td_umtxq;
3605	uq->uq_inherited_pri = PRI_MAX;
3606
3607	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3608	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3609	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3610	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3611}
3612
3613/*
3614 * exec() hook.
3615 */
3616static void
3617umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3618	struct image_params *imgp __unused)
3619{
3620	umtx_thread_cleanup(curthread);
3621}
3622
3623/*
3624 * thread_exit() hook.
3625 */
3626void
3627umtx_thread_exit(struct thread *td)
3628{
3629	umtx_thread_cleanup(td);
3630}
3631
3632/*
3633 * Clean up umtx data.
3634 */
3635static void
3636umtx_thread_cleanup(struct thread *td)
3637{
3638	struct umtx_q *uq;
3639	struct umtx_pi *pi;
3640
3641	if ((uq = td->td_umtxq) == NULL)
3642		return;
3643
3644	mtx_lock_spin(&umtx_lock);
3645	uq->uq_inherited_pri = PRI_MAX;
3646	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3647		pi->pi_owner = NULL;
3648		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3649	}
3650	mtx_unlock_spin(&umtx_lock);
3651	thread_lock(td);
3652	sched_lend_user_prio(td, PRI_MAX);
3653	thread_unlock(td);
3654}
3655