kern_umtx.c revision 201472
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 201472 2010-01-04 05:27:49Z davidxu $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/cpu.h>

#ifdef COMPAT_IA32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

#define TYPE_SIMPLE_WAIT	0
#define TYPE_CV			1
#define TYPE_SIMPLE_LOCK	2
#define TYPE_NORMAL_UMUTEX	3
#define TYPE_PI_UMUTEX		4
#define TYPE_PP_UMUTEX		5
#define TYPE_RWLOCK		6

#define _UMUTEX_TRY		1
#define _UMUTEX_WAIT		2

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry linking the umtx objects held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A userland synchronization object waiter. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The thread this queue entry belongs to. */
	struct thread		*uq_thread;

	/*
	 * Blocked on a PI mutex.  Reads may use either the chain lock
	 * or umtx_lock; writes must hold both the chain lock and
	 * umtx_lock.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes owned by this thread that others contend for */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_head	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI in the list */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))

/*
 * Don't propagate time-sharing priority; there is a security reason:
 * a user can simply create a PI mutex, let thread A lock it, and let
 * another thread B block on it.  Because B is sleeping, its priority
 * would be boosted, and priority propagation would then boost A's
 * priority as well.  A's priority would never be lowered even if A
 * were using 100% CPU, which is unfair to other processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

#define BUSY_SPINS		200

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

static struct mtx umtx_lock;

static void
umtxq_sysinit(void *arg __unused)
{
	int i, j;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < 2; ++i) {
		for (j = 0; j < UMTX_CHAINS; ++j) {
			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
				 MTX_DEF | MTX_DUPOK);
			TAILQ_INIT(&umtxq_chains[i][j].uc_queue[0]);
			TAILQ_INIT(&umtxq_chains[i][j].uc_queue[1]);
			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
			umtxq_chains[i][j].uc_busy = 0;
			umtxq_chains[i][j].uc_waiters = 0;
		}
	}
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
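
/*
 * Illustrative sketch (not compiled into the kernel): the multiplicative
 * hash above is the golden-ratio scheme, keeping the well-mixed
 * high-order bits of the 32-bit product.  Since __WORD_BIT is 32,
 * UMTX_SHIFTS is 25, so the shift leaves exactly 7 bits, enough to
 * index the 128 chains.  A standalone userland equivalent:
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static unsigned
umtx_hash_sketch(uintptr_t a, uintptr_t b)
{
	uint32_t n = (uint32_t)(a + b);

	return (((n * 2654404609U) >> 25) % 128);
}

int
main(void)
{
	uintptr_t addr;

	/* Nearby addresses should scatter across the 128 chains. */
	for (addr = 0x1000; addr < 0x1000 + 16 * 8; addr += 8)
		printf("%#lx -> %u\n", (unsigned long)addr,
		    umtx_hash_sketch(addr, 0));
	return (0);
}
#endif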

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	if (key->type <= TYPE_CV)
		return (&umtxq_chains[1][key->hash]);
	return (&umtxq_chains[0][key->hash]);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Set the chain to the busy state when the following operation
 * may block (a kernel mutex can not be held across it).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	if (uc->uc_busy) {
#ifdef SMP
		if (smp_cpus > 1) {
			int count = BUSY_SPINS;
			if (count > 0) {
				umtxq_unlock(key);
				while (uc->uc_busy && --count > 0)
					cpu_spinwait();
				umtxq_lock(key);
			}
		}
#endif
		while (uc->uc_busy) {
			uc->uc_waiters++;
			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
			uc->uc_waiters--;
		}
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}
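
/*
 * Usage sketch: the busy flag acts as a sleepable, long-term "lock" on
 * a chain for operations that may fault on user memory, where uc_lock
 * itself cannot remain held.  The pattern used throughout this file is:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);		-- may sleep until unbusied
 *	umtxq_unlock(&key);
 *	... faultable work, e.g. casuword32() on the user word ...
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);		-- wakes one waiter, if any
 *	umtxq_unlock(&key);
 */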

static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_queue[q], uq, uq_link);
	uq->uq_flags |= UQF_UMTXQ;
}

static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		TAILQ_REMOVE(&uc->uc_queue[q], uq, uq_link);
		uq->uq_flags &= ~UQF_UMTXQ;
	}
}

/*
 * Check if there are multiple waiters
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
		}
	}
	return (count);
}

/*
 * Check if there are multiple PI waiters and return the first
 * waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue[UMTX_SHARED_QUEUE], uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
			*first = uq;
		}
	}
	return (count);
}

/*
 * Wake up threads waiting on a userland object.
 */
static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq, *next;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue[q], uq_link, next) {
		if (umtx_key_match(&uq->uq_key, key)) {
			umtxq_remove_queue(uq, q);
			wakeup(uq);
			if (++ret >= n_wake)
				break;
		}
	}
	return (ret);
}

/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleep state; before sleeping, check whether
 * it was already removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}

/*
 * Convert a userspace address into a unique logical address.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
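
/*
 * Key identity sketch: for a shared mapping (PROCESS_SHARE, or
 * AUTO_SHARE over a VM_INHERIT_SHARE entry) the key is the backing
 * vm_object plus an offset token derived from the map entry, so
 * processes sharing that mapping produce matching keys even when the
 * object is mapped at different virtual addresses.  For THREAD_SHARE
 * the key is (vmspace, address), which only matches within a single
 * process.  The vm_object_reference() keeps the object alive for as
 * long as the key is held.
 */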

/*
 * Release key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we successfully set the contested bit, sleep.
		 * Otherwise the lock changed and we need to retry, or we
		 * lost a race to the thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
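
/*
 * Userland counterpart sketch (illustrative; not the libthr sources):
 * the uncontested acquire is expected to happen entirely in user space
 * with one atomic compare-and-set, and only a contended lock enters
 * the kernel via the _umtx_lock(2) path handled above.
 */
#if 0
#include <machine/atomic.h>
#include <sys/umtx.h>

static int
umtx_lock_sketch(struct umtx *mtx, u_long id)
{
	/* Fast path: UMTX_UNOWNED -> id, no system call. */
	if (atomic_cmpset_acq_long(&mtx->u_owner, UMTX_UNOWNED, id))
		return (0);
	/* Contended: the kernel sets UMTX_CONTESTED and sleeps us. */
	return (_umtx_lock(mtx));
}
#endif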

/*
 * Lock a umtx object, with an optional timeout.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
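
/*
 * Timeout sketch: the deadline is absolute against the monotonic clock
 * (getnanouptime), and every ETIMEDOUT wakeup is re-checked against it.
 * E.g. with hz = 1000 and a 1.5 s timeout, the first sleep is armed
 * with roughly tvtohz(1.5 s) ~ 1501 ticks; if tick rounding returns
 * ETIMEDOUT slightly early, the loop re-arms with the remaining time
 * instead of restarting the full interval.
 */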

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one waiting threads.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

#ifdef COMPAT_IA32

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we successfully set the contested bit, sleep.
		 * Otherwise the lock changed and we need to retry, or we
		 * lost a race to the thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object, with an optional timeout.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one waiting threads.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value
 * is unchanged.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32, int is_private)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = (unsigned int)fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}
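
/*
 * Illustrative userland pairing (sketch): UMTX_OP_WAIT / UMTX_OP_WAKE
 * provide futex-style semantics over do_wait() and kern_umtx_wake().
 * The waiter only sleeps while *addr still equals the expected value,
 * which closes the race with a waker that changes the word first.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <limits.h>

static void
wait_for_change_sketch(u_long *addr, u_long expected)
{
	while (*addr == expected)
		_umtx_op(addr, UMTX_OP_WAIT, expected, NULL, NULL);
}

static void
wake_all_sketch(u_long *addr, u_long newval)
{
	*addr = newval;
	_umtx_op(addr, UMTX_OP_WAKE, INT_MAX, NULL, NULL);
}
#endif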

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int mode)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
		if (mode == _UMUTEX_WAIT) {
			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
				return (0);
		} else {
			/*
			 * Try the uncontested case.  This should be done in userland.
			 */
			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

			/* The acquire succeeded. */
			if (owner == UMUTEX_UNOWNED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If no one owns it but it is contested try to acquire it. */
			if (owner == UMUTEX_CONTESTED) {
				owner = casuword32(&m->m_owner,
				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

				if (owner == UMUTEX_CONTESTED)
					return (0);

				/* The address was invalid. */
				if (owner == -1)
					return (EFAULT);

				/* If this failed the lock has changed, restart. */
				continue;
			}
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we successfully set the contested bit, sleep.
		 * Otherwise the lock changed and we need to retry, or we
		 * lost a race to the thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one waiting threads.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Check if the mutex is available and wake up a waiter;
 * this applies only to a simple mutex.
 */
static int
do_wake_umutex(struct thread *td, struct umutex *m)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t flags;
	int error;
	int count;

	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != 0)
		return (0);

	flags = fuword32(&m->m_flags);

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	if (count <= 1)
		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);

	umtxq_lock(&key);
	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on a pi_state after its priority has been
 * changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		if (UPRI(td) <= pri)
			return;

		thread_lock(td);
		sched_lend_user_prio(td, pri);
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}
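
/*
 * Illustrative scenario: thread A (user priority 200) owns a PI mutex
 * and thread B (user priority 150, numerically lower = more important)
 * blocks on it.  The loop above lends B's priority to A via
 * sched_lend_user_prio().  If A is itself blocked on another PI mutex
 * owned by C, the lending repeats up the chain until an owner is
 * already at least as important, or the chain ends.  When a waiter
 * leaves, umtx_unpropagate_priority() below recomputes each owner's
 * priority from its remaining contested mutexes.
 */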

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri, oldpri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		oldpri = pi->pi_owner->td_user_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		if (uq_owner->uq_pi_blocked != NULL)
			umtx_pi_adjust_locked(pi->pi_owner, oldpri);
		pi = uq_owner->uq_pi_blocked;
	}
}

/*
 * Insert a PI mutex into the owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

static void
umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Resort the turnstile on the list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Adjust a thread's position in the blocked list of its PI mutex;
 * this may trigger a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	if (pi != NULL)
		umtx_pi_adjust_locked(td, oldpri);
	mtx_unlock_spin(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	UMTXQ_BUSY_ASSERT(uc);
	umtxq_insert(uq);
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Currently, we only support process-private PI mutexes;
		 * non-contended PI mutexes are locked in userland.
		 * A process-shared PI mutex should always be initialized
		 * and registered by the kernel, and locking should always
		 * be done by the kernel to avoid security problems.
		 * For a process-private PI mutex, we can find the owner
		 * thread and boost its priority safely.
		 */
		mtx_unlock_spin(&umtx_lock);
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unbusy(&uq->uq_key);

	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_remove(uq);
		}
	}
	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	return (error);
}

/*
 * Add a reference count for a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrease the reference count for a PI mutex; if the counter
 * drops to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		umtx_pi_free(pi);
	}
}

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		/*
		 * If we successfully set the contested bit, sleep.
		 * Otherwise the lock changed and we need to retry, or we
		 * lost a race to the thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		else {
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
		}
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		mtx_lock_spin(&umtx_lock);
		pi = uq_first->uq_pi_blocked;
		KASSERT(pi != NULL, ("pi == NULL?"));
		if (pi->pi_owner != curthread) {
			mtx_unlock_spin(&umtx_lock);
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			umtx_key_release(&key);
			/* userland messed the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		/* get the highest priority thread which is still sleeping. */
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		while (uq_first != NULL &&
		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
		}
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		thread_lock(curthread);
		sched_unlend_user_prio(curthread, pri);
		thread_unlock(curthread);
		mtx_unlock_spin(&umtx_lock);
		if (uq_first)
			umtxq_signal_thread(uq_first);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there are zero or one waiting threads.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&umtx_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&umtx_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			thread_lock(td);
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
			thread_unlock(td);
		}
		mtx_unlock_spin(&umtx_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&umtx_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
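
/*
 * Ceiling arithmetic sketch: m_ceilings[0] carries a POSIX-style
 * ceiling (0..RTP_PRIO_MAX, larger = more important), while kernel
 * priorities grow downward (smaller = more important), so:
 *
 *	kernel_pri = PRI_MIN_REALTIME + (RTP_PRIO_MAX - m_ceilings[0])
 *
 * A ceiling of RTP_PRIO_MAX maps to PRI_MIN_REALTIME, the strongest
 * realtime priority.  The EINVAL check above rejects a locking thread
 * that is already more important than the ceiling, i.e. when
 * UPRI(td) < PRI_MIN_REALTIME + ceiling.
 */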

/*
 * Unlock a PP mutex.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For a priority-protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED so that userland always enters the kernel
	 * to lock the mutex.  This is necessary because thread priority
	 * has to be adjusted for such a mutex.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock_spin(&umtx_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		thread_lock(td);
		sched_unlend_user_prio(td, pri);
		thread_unlock(td);
		mtx_unlock_spin(&umtx_lock);
	}
	umtx_key_release(&key);
	return (error);
}
2119
2120static int
2121do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2122	uint32_t *old_ceiling)
2123{
2124	struct umtx_q *uq;
2125	uint32_t save_ceiling;
2126	uint32_t owner, id;
2127	uint32_t flags;
2128	int error;
2129
2130	flags = fuword32(&m->m_flags);
2131	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2132		return (EINVAL);
2133	if (ceiling > RTP_PRIO_MAX)
2134		return (EINVAL);
2135	id = td->td_tid;
2136	uq = td->td_umtxq;
2137	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2138	   &uq->uq_key)) != 0)
2139		return (error);
2140	for (;;) {
2141		umtxq_lock(&uq->uq_key);
2142		umtxq_busy(&uq->uq_key);
2143		umtxq_unlock(&uq->uq_key);
2144
2145		save_ceiling = fuword32(&m->m_ceilings[0]);
2146
2147		owner = casuword32(&m->m_owner,
2148		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2149
2150		if (owner == UMUTEX_CONTESTED) {
2151			suword32(&m->m_ceilings[0], ceiling);
2152			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2153				UMUTEX_CONTESTED);
2154			error = 0;
2155			break;
2156		}
2157
2158		/* The address was invalid. */
2159		if (owner == -1) {
2160			error = EFAULT;
2161			break;
2162		}
2163
2164		if ((owner & ~UMUTEX_CONTESTED) == id) {
2165			suword32(&m->m_ceilings[0], ceiling);
2166			error = 0;
2167			break;
2168		}
2169
2170		/*
2171		 * If a previous sleep was interrupted by a signal, we
2172		 * have already retried the lock above; exit with the error.
2173		 */
2174		if (error != 0)
2175			break;
2176
2177		/*
2178		 * The mutex is held by another thread: queue ourselves and
2179		 * sleep until the holder wakes us on unlock, then retry
2180		 * from the top of the loop.
2181		 */
2182		umtxq_lock(&uq->uq_key);
2183		umtxq_insert(uq);
2184		umtxq_unbusy(&uq->uq_key);
2185		error = umtxq_sleep(uq, "umtxpp", 0);
2186		umtxq_remove(uq);
2187		umtxq_unlock(&uq->uq_key);
2188	}
2189	umtxq_lock(&uq->uq_key);
2190	if (error == 0)
2191		umtxq_signal(&uq->uq_key, INT_MAX);
2192	umtxq_unbusy(&uq->uq_key);
2193	umtxq_unlock(&uq->uq_key);
2194	umtx_key_release(&uq->uq_key);
2195	if (error == 0 && old_ceiling != NULL)
2196		suword32(old_ceiling, save_ceiling);
2197	return (error);
2198}
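
/*
 * Editorial sketch (not compiled): do_set_ceiling() is reached
 * through UMTX_OP_SET_CEILING, where uap->val carries the new ceiling
 * and uap->uaddr1, if non-NULL, receives the old one.  "m" and
 * "old_ceiling" are hypothetical.
 */
#if 0
	uint32_t old_ceiling;

	if (_umtx_op(&m, UMTX_OP_SET_CEILING, 20, &old_ceiling, NULL) == -1)
		err(1, "UMTX_OP_SET_CEILING");
#endif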
2199
2200static int
2201_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2202	int mode)
2203{
2204	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2205	case 0:
2206		return (_do_lock_normal(td, m, flags, timo, mode));
2207	case UMUTEX_PRIO_INHERIT:
2208		return (_do_lock_pi(td, m, flags, timo, mode));
2209	case UMUTEX_PRIO_PROTECT:
2210		return (_do_lock_pp(td, m, flags, timo, mode));
2211	}
2212	return (EINVAL);
2213}
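
/*
 * Editorial note: the locking protocol is selected purely by m_flags,
 * so one system call entry serves plain, priority-inheritance and
 * priority-protected mutexes alike.  Sketch of the userland side
 * (not compiled; names hypothetical):
 */
#if 0
	struct umutex m = { .m_owner = UMUTEX_UNOWNED };

	m.m_flags = UMUTEX_PRIO_INHERIT;	/* routes to _do_lock_pi() */
	_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);
#endif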
2214
2215/*
2216 * Lock a userland POSIX mutex.
2217 */
2218static int
2219do_lock_umutex(struct thread *td, struct umutex *m,
2220	struct timespec *timeout, int mode)
2221{
2222	struct timespec ts, ts2, ts3;
2223	struct timeval tv;
2224	uint32_t flags;
2225	int error;
2226
2227	flags = fuword32(&m->m_flags);
2228	if (flags == -1)
2229		return (EFAULT);
2230
2231	if (timeout == NULL) {
2232		error = _do_lock_umutex(td, m, flags, 0, mode);
2233		/* Mutex locking is restarted if it is interrupted. */
2234		if (error == EINTR && mode != _UMUTEX_WAIT)
2235			error = ERESTART;
2236	} else {
2237		getnanouptime(&ts);
2238		timespecadd(&ts, timeout);
2239		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2240		for (;;) {
2241			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
2242			if (error != ETIMEDOUT)
2243				break;
2244			getnanouptime(&ts2);
2245			if (timespeccmp(&ts2, &ts, >=)) {
2246				error = ETIMEDOUT;
2247				break;
2248			}
2249			ts3 = ts;
2250			timespecsub(&ts3, &ts2);
2251			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2252		}
2253		/* Timed-locking is not restarted. */
2254		if (error == ERESTART)
2255			error = EINTR;
2256	}
2257	return (error);
2258}
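
/*
 * Editorial sketch (not compiled): a timed lock from userland.  The
 * timeout passed in uaddr2 is relative; the loop above pins it to an
 * absolute uptime so retries after signals do not stretch the total
 * wait, and a timed lock reports EINTR instead of being restarted.
 * "m" is hypothetical.
 */
#if 0
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	if (_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, &ts) == -1 &&
	    errno == ETIMEDOUT)
		warnx("lock timed out");
#endif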
2259
2260/*
2261 * Unlock a userland POSIX mutex.
2262 */
2263static int
2264do_unlock_umutex(struct thread *td, struct umutex *m)
2265{
2266	uint32_t flags;
2267
2268	flags = fuword32(&m->m_flags);
2269	if (flags == -1)
2270		return (EFAULT);
2271
2272	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2273	case 0:
2274		return (do_unlock_normal(td, m, flags));
2275	case UMUTEX_PRIO_INHERIT:
2276		return (do_unlock_pi(td, m, flags));
2277	case UMUTEX_PRIO_PROTECT:
2278		return (do_unlock_pp(td, m, flags));
2279	}
2280
2281	return (EINVAL);
2282}
2283
2284static int
2285do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2286	struct timespec *timeout, u_long wflags)
2287{
2288	struct umtx_q *uq;
2289	struct timeval tv;
2290	struct timespec cts, ets, tts;
2291	uint32_t flags;
2292	int error;
2293
2294	uq = td->td_umtxq;
2295	flags = fuword32(&cv->c_flags);
2296	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2297	if (error != 0)
2298		return (error);
2299	umtxq_lock(&uq->uq_key);
2300	umtxq_busy(&uq->uq_key);
2301	umtxq_insert(uq);
2302	umtxq_unlock(&uq->uq_key);
2303
2304	/*
2305	 * c_has_waiters must be set to 1 before the user mutex is
2306	 * released, so that a concurrent signaller cannot miss us.
2307	 */
2308	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2309
2310	umtxq_lock(&uq->uq_key);
2311	umtxq_unbusy(&uq->uq_key);
2312	umtxq_unlock(&uq->uq_key);
2313
2314	error = do_unlock_umutex(td, m);
2315
2316	umtxq_lock(&uq->uq_key);
2317	if (error == 0) {
2318		if ((wflags & UMTX_CHECK_UNPARKING) &&
2319		    (td->td_pflags & TDP_WAKEUP)) {
2320			td->td_pflags &= ~TDP_WAKEUP;
2321			error = EINTR;
2322		} else if (timeout == NULL) {
2323			error = umtxq_sleep(uq, "ucond", 0);
2324		} else {
2325			getnanouptime(&ets);
2326			timespecadd(&ets, timeout);
2327			TIMESPEC_TO_TIMEVAL(&tv, timeout);
2328			for (;;) {
2329				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2330				if (error != ETIMEDOUT)
2331					break;
2332				getnanouptime(&cts);
2333				if (timespeccmp(&cts, &ets, >=)) {
2334					error = ETIMEDOUT;
2335					break;
2336				}
2337				tts = ets;
2338				timespecsub(&tts, &cts);
2339				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2340			}
2341		}
2342	}
2343
2344	if (error != 0) {
2345		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2346			/*
2347			 * If do_cv_signal() fired concurrently while we
2348			 * failed with an error, a UNIX signal or a timeout,
2349			 * perform another umtxq_signal() to avoid consuming
2350			 * that wakeup.  This may cause a spurious wakeup of
2351			 * another thread which was just queued, but SUSv3
2352			 * explicitly allows spurious wakeups to occur, and
2353			 * indeed a kernel-based implementation cannot avoid
2354			 * them.
2355			 */
2356			if (!umtxq_signal(&uq->uq_key, 1))
2357				error = 0;
2358		}
2359		if (error == ERESTART)
2360			error = EINTR;
2361	}
2362	umtxq_remove(uq);
2363	umtxq_unlock(&uq->uq_key);
2364	umtx_key_release(&uq->uq_key);
2365	return (error);
2366}
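
/*
 * Editorial sketch (not compiled): the userland side of do_cv_wait().
 * The mutex must be held on entry; the kernel queues the waiter and
 * then releases the mutex itself, which makes the unlock-and-sleep
 * atomic.  The caller re-locks the mutex before re-testing.  "cv",
 * "m" and "predicate" are hypothetical.
 */
#if 0
	_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);
	while (!predicate) {
		/* Atomically unlocks m and sleeps on cv. */
		_umtx_op(&cv, UMTX_OP_CV_WAIT, 0, &m, NULL);
		_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, NULL);
	}
	_umtx_op(&m, UMTX_OP_MUTEX_UNLOCK, 0, NULL, NULL);
#endif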
2367
2368/*
2369 * Signal a userland condition variable.
2370 */
2371static int
2372do_cv_signal(struct thread *td, struct ucond *cv)
2373{
2374	struct umtx_key key;
2375	int error, cnt, nwake;
2376	uint32_t flags;
2377
2378	flags = fuword32(&cv->c_flags);
2379	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2380		return (error);
2381	umtxq_lock(&key);
2382	umtxq_busy(&key);
2383	cnt = umtxq_count(&key);
2384	nwake = umtxq_signal(&key, 1);
2385	if (cnt <= nwake) {
2386		umtxq_unlock(&key);
2387		error = suword32(
2388		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2389		umtxq_lock(&key);
2390	}
2391	umtxq_unbusy(&key);
2392	umtxq_unlock(&key);
2393	umtx_key_release(&key);
2394	return (error);
2395}
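
/*
 * Editorial note: because the kernel clears c_has_waiters once the
 * sleep queue drains, a userland signaller may skip the system call
 * when nobody waits (sketch, not compiled; "cv" is hypothetical):
 */
#if 0
	if (cv.c_has_waiters)
		_umtx_op(&cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL);
#endif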
2396
2397static int
2398do_cv_broadcast(struct thread *td, struct ucond *cv)
2399{
2400	struct umtx_key key;
2401	int error;
2402	uint32_t flags;
2403
2404	flags = fuword32(&cv->c_flags);
2405	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2406		return (error);
2407
2408	umtxq_lock(&key);
2409	umtxq_busy(&key);
2410	umtxq_signal(&key, INT_MAX);
2411	umtxq_unlock(&key);
2412
2413	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2414
2415	umtxq_lock(&key);
2416	umtxq_unbusy(&key);
2417	umtxq_unlock(&key);
2418
2419	umtx_key_release(&key);
2420	return (error);
2421}
2422
2423static int
2424do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2425{
2426	struct umtx_q *uq;
2427	uint32_t flags, wrflags;
2428	int32_t state, oldstate;
2429	int32_t blocked_readers;
2430	int error;
2431
2432	uq = td->td_umtxq;
2433	flags = fuword32(&rwlock->rw_flags);
2434	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2435	if (error != 0)
2436		return (error);
2437
2438	wrflags = URWLOCK_WRITE_OWNER;
2439	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2440		wrflags |= URWLOCK_WRITE_WAITERS;
2441
2442	for (;;) {
2443		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2444		/* try to lock it */
2445		while (!(state & wrflags)) {
2446			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2447				umtx_key_release(&uq->uq_key);
2448				return (EAGAIN);
2449			}
2450			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2451			if (oldstate == state) {
2452				umtx_key_release(&uq->uq_key);
2453				return (0);
2454			}
2455			state = oldstate;
2456		}
2457
2458		if (error)
2459			break;
2460
2461		/* grab monitor lock */
2462		umtxq_lock(&uq->uq_key);
2463		umtxq_busy(&uq->uq_key);
2464		umtxq_unlock(&uq->uq_key);
2465
2466		/* set read contention bit */
2467		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2468			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2469			if (oldstate == state)
2470				goto sleep;
2471			state = oldstate;
2472		}
2473
2474		/* The state changed while we were setting the flag; restart. */
2475		if (!(state & wrflags)) {
2476			umtxq_lock(&uq->uq_key);
2477			umtxq_unbusy(&uq->uq_key);
2478			umtxq_unlock(&uq->uq_key);
2479			continue;
2480		}
2481
2482sleep:
2483		/* Contention bit is set; bump the read waiter count before sleeping. */
2484		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2485		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2486
2487		while (state & wrflags) {
2488			umtxq_lock(&uq->uq_key);
2489			umtxq_insert(uq);
2490			umtxq_unbusy(&uq->uq_key);
2491
2492			error = umtxq_sleep(uq, "urdlck", timo);
2493
2494			umtxq_busy(&uq->uq_key);
2495			umtxq_remove(uq);
2496			umtxq_unlock(&uq->uq_key);
2497			if (error)
2498				break;
2499			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2500		}
2501
2502		/* Drop the read waiter count; the last waiter clears the contention bit. */
2503		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2504		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2505		if (blocked_readers == 1) {
2506			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2507			for (;;) {
2508				oldstate = casuword32(&rwlock->rw_state, state,
2509					 state & ~URWLOCK_READ_WAITERS);
2510				if (oldstate == state)
2511					break;
2512				state = oldstate;
2513			}
2514		}
2515
2516		umtxq_lock(&uq->uq_key);
2517		umtxq_unbusy(&uq->uq_key);
2518		umtxq_unlock(&uq->uq_key);
2519	}
2520	umtx_key_release(&uq->uq_key);
2521	return (error);
2522}
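
/*
 * Editorial sketch (not compiled): the userland read-lock fast path
 * mirrors the CAS loop at the top of do_rw_rdlock(); the kernel is
 * entered only when a writer owns the lock or (by default) writers
 * are waiting.  Bit names are from <sys/umtx.h>; "rw" is
 * hypothetical.
 */
#if 0
	int32_t state = rw->rw_state;

	while (!(state & (URWLOCK_WRITE_OWNER | URWLOCK_WRITE_WAITERS)) &&
	    URWLOCK_READER_COUNT(state) < URWLOCK_MAX_READERS) {
		if (atomic_cmpset_acq_32(&rw->rw_state, state, state + 1))
			return (0);	/* got the read lock */
		state = rw->rw_state;
	}
	return (_umtx_op(rw, UMTX_OP_RW_RDLOCK, 0, NULL, NULL));
#endif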
2523
2524static int
2525do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
2526{
2527	struct timespec ts, ts2, ts3;
2528	struct timeval tv;
2529	int error;
2530
2531	getnanouptime(&ts);
2532	timespecadd(&ts, timeout);
2533	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2534	for (;;) {
2535		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
2536		if (error != ETIMEDOUT)
2537			break;
2538		getnanouptime(&ts2);
2539		if (timespeccmp(&ts2, &ts, >=)) {
2540			error = ETIMEDOUT;
2541			break;
2542		}
2543		ts3 = ts;
2544		timespecsub(&ts3, &ts2);
2545		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2546	}
2547	if (error == ERESTART)
2548		error = EINTR;
2549	return (error);
2550}
2551
2552static int
2553do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2554{
2555	struct umtx_q *uq;
2556	uint32_t flags;
2557	int32_t state, oldstate;
2558	int32_t blocked_writers;
2559	int32_t blocked_readers;
2560	int error;
2561
2562	uq = td->td_umtxq;
2563	flags = fuword32(&rwlock->rw_flags);
2564	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2565	if (error != 0)
2566		return (error);
2567
2568	blocked_readers = 0;
2569	for (;;) {
2570		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2571		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2572			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2573			if (oldstate == state) {
2574				umtx_key_release(&uq->uq_key);
2575				return (0);
2576			}
2577			state = oldstate;
2578		}
2579
2580		if (error) {
2581			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2582			    blocked_readers != 0) {
2583				umtxq_lock(&uq->uq_key);
2584				umtxq_busy(&uq->uq_key);
2585				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2586				umtxq_unbusy(&uq->uq_key);
2587				umtxq_unlock(&uq->uq_key);
2588			}
2589
2590			break;
2591		}
2592
2593		/* grab monitor lock */
2594		umtxq_lock(&uq->uq_key);
2595		umtxq_busy(&uq->uq_key);
2596		umtxq_unlock(&uq->uq_key);
2597
2598		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2599		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2600			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2601			if (oldstate == state)
2602				goto sleep;
2603			state = oldstate;
2604		}
2605
2606		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2607			umtxq_lock(&uq->uq_key);
2608			umtxq_unbusy(&uq->uq_key);
2609			umtxq_unlock(&uq->uq_key);
2610			continue;
2611		}
2612sleep:
2613		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2614		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2615
2616		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2617			umtxq_lock(&uq->uq_key);
2618			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2619			umtxq_unbusy(&uq->uq_key);
2620
2621			error = umtxq_sleep(uq, "uwrlck", timo);
2622
2623			umtxq_busy(&uq->uq_key);
2624			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2625			umtxq_unlock(&uq->uq_key);
2626			if (error)
2627				break;
2628			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2629		}
2630
2631		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2632		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2633		if (blocked_writers == 1) {
2634			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2635			for (;;) {
2636				oldstate = casuword32(&rwlock->rw_state, state,
2637					 state & ~URWLOCK_WRITE_WAITERS);
2638				if (oldstate == state)
2639					break;
2640				state = oldstate;
2641			}
2642			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2643		} else
2644			blocked_readers = 0;
2645
2646		umtxq_lock(&uq->uq_key);
2647		umtxq_unbusy(&uq->uq_key);
2648		umtxq_unlock(&uq->uq_key);
2649	}
2650
2651	umtx_key_release(&uq->uq_key);
2652	return (error);
2653}
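
/*
 * Editorial sketch (not compiled): the corresponding userland write
 * fast path; the kernel path above handles contention once a single
 * CAS from the unowned, reader-free state fails ("rw" hypothetical):
 */
#if 0
	if (atomic_cmpset_acq_32(&rw->rw_state, 0, URWLOCK_WRITE_OWNER))
		return (0);	/* got the write lock */
	return (_umtx_op(rw, UMTX_OP_RW_WRLOCK, 0, NULL, NULL));
#endif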
2654
2655static int
2656do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
2657{
2658	struct timespec ts, ts2, ts3;
2659	struct timeval tv;
2660	int error;
2661
2662	getnanouptime(&ts);
2663	timespecadd(&ts, timeout);
2664	TIMESPEC_TO_TIMEVAL(&tv, timeout);
2665	for (;;) {
2666		error = do_rw_wrlock(td, obj, tvtohz(&tv));
2667		if (error != ETIMEDOUT)
2668			break;
2669		getnanouptime(&ts2);
2670		if (timespeccmp(&ts2, &ts, >=)) {
2671			error = ETIMEDOUT;
2672			break;
2673		}
2674		ts3 = ts;
2675		timespecsub(&ts3, &ts2);
2676		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2677	}
2678	if (error == ERESTART)
2679		error = EINTR;
2680	return (error);
2681}
2682
2683static int
2684do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2685{
2686	struct umtx_q *uq;
2687	uint32_t flags;
2688	int32_t state, oldstate;
2689	int error, q, count;
2690
2691	uq = td->td_umtxq;
2692	flags = fuword32(&rwlock->rw_flags);
2693	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2694	if (error != 0)
2695		return (error);
2696
2697	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2698	if (state & URWLOCK_WRITE_OWNER) {
2699		for (;;) {
2700			oldstate = casuword32(&rwlock->rw_state, state,
2701				state & ~URWLOCK_WRITE_OWNER);
2702			if (oldstate != state) {
2703				state = oldstate;
2704				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2705					error = EPERM;
2706					goto out;
2707				}
2708			} else
2709				break;
2710		}
2711	} else if (URWLOCK_READER_COUNT(state) != 0) {
2712		for (;;) {
2713			oldstate = casuword32(&rwlock->rw_state, state,
2714				state - 1);
2715			if (oldstate != state) {
2716				state = oldstate;
2717				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2718					error = EPERM;
2719					goto out;
2720				}
2721			}
2722			else
2723				break;
2724		}
2725	} else {
2726		error = EPERM;
2727		goto out;
2728	}
2729
2730	count = 0;
2731
2732	if (!(flags & URWLOCK_PREFER_READER)) {
2733		if (state & URWLOCK_WRITE_WAITERS) {
2734			count = 1;
2735			q = UMTX_EXCLUSIVE_QUEUE;
2736		} else if (state & URWLOCK_READ_WAITERS) {
2737			count = INT_MAX;
2738			q = UMTX_SHARED_QUEUE;
2739		}
2740	} else {
2741		if (state & URWLOCK_READ_WAITERS) {
2742			count = INT_MAX;
2743			q = UMTX_SHARED_QUEUE;
2744		} else if (state & URWLOCK_WRITE_WAITERS) {
2745			count = 1;
2746			q = UMTX_EXCLUSIVE_QUEUE;
2747		}
2748	}
2749
2750	if (count) {
2751		umtxq_lock(&uq->uq_key);
2752		umtxq_busy(&uq->uq_key);
2753		umtxq_signal_queue(&uq->uq_key, count, q);
2754		umtxq_unbusy(&uq->uq_key);
2755		umtxq_unlock(&uq->uq_key);
2756	}
2757out:
2758	umtx_key_release(&uq->uq_key);
2759	return (error);
2760}
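
/*
 * Editorial note: do_rw_unlock() both releases the state word and
 * picks whom to wake; by default one waiting writer is preferred
 * over all readers unless URWLOCK_PREFER_READER is set.  Userland
 * simply issues the operation (sketch, not compiled):
 */
#if 0
	_umtx_op(rw, UMTX_OP_RW_UNLOCK, 0, NULL, NULL);
#endif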
2761
2762static int
2763do_sem_wait(struct thread *td, struct _usem *sem, struct timespec *timeout)
2764{
2765	struct umtx_q *uq;
2766	struct timeval tv;
2767	struct timespec cts, ets, tts;
2768	uint32_t flags, count;
2769	int error;
2770
2771	uq = td->td_umtxq;
2772	flags = fuword32(&sem->_flags);
2773	error = umtx_key_get(sem, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2774	if (error != 0)
2775		return (error);
2776	umtxq_lock(&uq->uq_key);
2777	umtxq_busy(&uq->uq_key);
2778	umtxq_insert(uq);
2779	umtxq_unlock(&uq->uq_key);
2780
2781	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2782	if (count != 0) {
2783		umtxq_lock(&uq->uq_key);
2784		umtxq_unbusy(&uq->uq_key);
2785		umtxq_remove(uq);
2786		umtxq_unlock(&uq->uq_key);
2787		umtx_key_release(&uq->uq_key);
2788		return (0);
2789	}
2790
2791	/*
2792	 * _has_waiters must be set to 1 before we sleep, so that a
2793	 * userland post will enter the kernel to wake us up.
2794	 */
2795	suword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 1);
2796
2797	umtxq_lock(&uq->uq_key);
2798	umtxq_unbusy(&uq->uq_key);
2799	umtxq_unlock(&uq->uq_key);
2800
2801	umtxq_lock(&uq->uq_key);
2802	if (timeout == NULL) {
2803		error = umtxq_sleep(uq, "usem", 0);
2804	} else {
2805		getnanouptime(&ets);
2806		timespecadd(&ets, timeout);
2807		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2808		for (;;) {
2809			error = umtxq_sleep(uq, "usem", tvtohz(&tv));
2810			if (error != ETIMEDOUT)
2811				break;
2812			getnanouptime(&cts);
2813			if (timespeccmp(&cts, &ets, >=)) {
2814				error = ETIMEDOUT;
2815				break;
2816			}
2817			tts = ets;
2818			timespecsub(&tts, &cts);
2819			TIMESPEC_TO_TIMEVAL(&tv, &tts);
2820		}
2821	}
2822
2823	if (error != 0) {
2824		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2825			if (!umtxq_signal(&uq->uq_key, 1))
2826				error = 0;
2827		}
2828		if (error == ERESTART)
2829			error = EINTR;
2830	}
2831	umtxq_remove(uq);
2832	umtxq_unlock(&uq->uq_key);
2833	umtx_key_release(&uq->uq_key);
2834	return (error);
2835}
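
/*
 * Editorial sketch (not compiled): the matching userland semaphore
 * protocol, assuming struct _usem from <sys/umtx.h>.  A post
 * increments _count and enters the kernel only when _has_waiters is
 * set; the wait side retries its decrement around UMTX_OP_SEM_WAIT,
 * since the kernel returns immediately when _count is non-zero.
 * "sem" is hypothetical.
 */
#if 0
	/* wait */
	for (;;) {
		uint32_t count = sem->_count;

		if (count > 0 &&
		    atomic_cmpset_acq_32(&sem->_count, count, count - 1))
			break;
		_umtx_op(sem, UMTX_OP_SEM_WAIT, 0, NULL, NULL);
	}

	/* post */
	atomic_add_rel_32(&sem->_count, 1);
	if (sem->_has_waiters)
		_umtx_op(sem, UMTX_OP_SEM_WAKE, 0, NULL, NULL);
#endif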
2836
2837/*
2838 * Wake up a waiter on a userland semaphore.
2839 */
2840static int
2841do_sem_wake(struct thread *td, struct _usem *sem)
2842{
2843	struct umtx_key key;
2844	int error, cnt, nwake;
2845	uint32_t flags;
2846
2847	flags = fuword32(&sem->_flags);
2848	if ((error = umtx_key_get(sem, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2849		return (error);
2850	umtxq_lock(&key);
2851	umtxq_busy(&key);
2852	cnt = umtxq_count(&key);
2853	nwake = umtxq_signal(&key, 1);
2854	if (cnt <= nwake) {
2855		umtxq_unlock(&key);
2856		error = suword32(
2857		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2858		umtxq_lock(&key);
2859	}
2860	umtxq_unbusy(&key);
2861	umtxq_unlock(&key);
2862	umtx_key_release(&key);
2863	return (error);
2864}
2865
2866int
2867_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2868    /* struct umtx *umtx */
2869{
2870	return (_do_lock_umtx(td, uap->umtx, td->td_tid, 0));
2871}
2872
2873int
2874_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2875    /* struct umtx *umtx */
2876{
2877	return (do_unlock_umtx(td, uap->umtx, td->td_tid));
2878}
2879
2880static int
2881__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2882{
2883	struct timespec *ts, timeout;
2884	int error;
2885
2886	/* Allow a null timespec (wait forever). */
2887	if (uap->uaddr2 == NULL)
2888		ts = NULL;
2889	else {
2890		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2891		if (error != 0)
2892			return (error);
2893		if (timeout.tv_nsec >= 1000000000 ||
2894		    timeout.tv_nsec < 0) {
2895			return (EINVAL);
2896		}
2897		ts = &timeout;
2898	}
2899	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2900}
2901
2902static int
2903__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2904{
2905	return (do_unlock_umtx(td, uap->obj, uap->val));
2906}
2907
2908static int
2909__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2910{
2911	struct timespec *ts, timeout;
2912	int error;
2913
2914	if (uap->uaddr2 == NULL)
2915		ts = NULL;
2916	else {
2917		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2918		if (error != 0)
2919			return (error);
2920		if (timeout.tv_nsec >= 1000000000 ||
2921		    timeout.tv_nsec < 0)
2922			return (EINVAL);
2923		ts = &timeout;
2924	}
2925	return (do_wait(td, uap->obj, uap->val, ts, 0, 0));
2926}
2927
2928static int
2929__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
2930{
2931	struct timespec *ts, timeout;
2932	int error;
2933
2934	if (uap->uaddr2 == NULL)
2935		ts = NULL;
2936	else {
2937		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2938		if (error != 0)
2939			return (error);
2940		if (timeout.tv_nsec >= 1000000000 ||
2941		    timeout.tv_nsec < 0)
2942			return (EINVAL);
2943		ts = &timeout;
2944	}
2945	return (do_wait(td, uap->obj, uap->val, ts, 1, 0));
2946}
2947
2948static int
2949__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
2950{
2951	struct timespec *ts, timeout;
2952	int error;
2953
2954	if (uap->uaddr2 == NULL)
2955		ts = NULL;
2956	else {
2957		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2958		if (error != 0)
2959			return (error);
2960		if (timeout.tv_nsec >= 1000000000 ||
2961		    timeout.tv_nsec < 0)
2962			return (EINVAL);
2963		ts = &timeout;
2964	}
2965	return (do_wait(td, uap->obj, uap->val, ts, 1, 1));
2966}
2967
2968static int
2969__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2970{
2971	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
2972}
2973
2974static int
2975__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
2976{
2977	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
2978}
2979
2980static int
2981__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2982{
2983	struct timespec *ts, timeout;
2984	int error;
2985
2986	/* Allow a null timespec (wait forever). */
2987	if (uap->uaddr2 == NULL)
2988		ts = NULL;
2989	else {
2990		error = copyin(uap->uaddr2, &timeout,
2991		    sizeof(timeout));
2992		if (error != 0)
2993			return (error);
2994		if (timeout.tv_nsec >= 1000000000 ||
2995		    timeout.tv_nsec < 0) {
2996			return (EINVAL);
2997		}
2998		ts = &timeout;
2999	}
3000	return (do_lock_umutex(td, uap->obj, ts, 0));
3001}
3002
3003static int
3004__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3005{
3006	return (do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY));
3007}
3008
3009static int
3010__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3011{
3012	struct timespec *ts, timeout;
3013	int error;
3014
3015	/* Allow a null timespec (wait forever). */
3016	if (uap->uaddr2 == NULL)
3017		ts = NULL;
3018	else {
3019		error = copyin(uap->uaddr2, &timeout,
3020		    sizeof(timeout));
3021		if (error != 0)
3022			return (error);
3023		if (timeout.tv_nsec >= 1000000000 ||
3024		    timeout.tv_nsec < 0) {
3025			return (EINVAL);
3026		}
3027		ts = &timeout;
3028	}
3029	return (do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT));
3030}
3031
3032static int
3033__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3034{
3035	return (do_wake_umutex(td, uap->obj));
3036}
3037
3038static int
3039__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3040{
3041	return (do_unlock_umutex(td, uap->obj));
3042}
3043
3044static int
3045__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3046{
3047	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
3048}
3049
3050static int
3051__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3052{
3053	struct timespec *ts, timeout;
3054	int error;
3055
3056	/* Allow a null timespec (wait forever). */
3057	if (uap->uaddr2 == NULL)
3058		ts = NULL;
3059	else {
3060		error = copyin(uap->uaddr2, &timeout,
3061		    sizeof(timeout));
3062		if (error != 0)
3063			return (error);
3064		if (timeout.tv_nsec >= 1000000000 ||
3065		    timeout.tv_nsec < 0) {
3066			return (EINVAL);
3067		}
3068		ts = &timeout;
3069	}
3070	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3071}
3072
3073static int
3074__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3075{
3076	return (do_cv_signal(td, uap->obj));
3077}
3078
3079static int
3080__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3081{
3082	return (do_cv_broadcast(td, uap->obj));
3083}
3084
3085static int
3086__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3087{
3088	struct timespec timeout;
3089	int error;
3090
3091	/* Allow a null timespec (wait forever). */
3092	if (uap->uaddr2 == NULL) {
3093		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3094	} else {
3095		error = copyin(uap->uaddr2, &timeout,
3096		    sizeof(timeout));
3097		if (error != 0)
3098			return (error);
3099		if (timeout.tv_nsec >= 1000000000 ||
3100		    timeout.tv_nsec < 0) {
3101			return (EINVAL);
3102		}
3103		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3104	}
3105	return (error);
3106}
3107
3108static int
3109__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3110{
3111	struct timespec timeout;
3112	int error;
3113
3114	/* Allow a null timespec (wait forever). */
3115	if (uap->uaddr2 == NULL) {
3116		error = do_rw_wrlock(td, uap->obj, 0);
3117	} else {
3118		error = copyin(uap->uaddr2, &timeout,
3119		    sizeof(timeout));
3120		if (error != 0)
3121			return (error);
3122		if (timeout.tv_nsec >= 1000000000 ||
3123		    timeout.tv_nsec < 0) {
3124			return (EINVAL);
3125		}
3126
3127		error = do_rw_wrlock2(td, uap->obj, &timeout);
3128	}
3129	return (error);
3130}
3131
3132static int
3133__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3134{
3135	return (do_rw_unlock(td, uap->obj));
3136}
3137
3138static int
3139__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3140{
3141	struct timespec *ts, timeout;
3142	int error;
3143
3144	/* Allow a null timespec (wait forever). */
3145	if (uap->uaddr2 == NULL)
3146		ts = NULL;
3147	else {
3148		error = copyin(uap->uaddr2, &timeout,
3149		    sizeof(timeout));
3150		if (error != 0)
3151			return (error);
3152		if (timeout.tv_nsec >= 1000000000 ||
3153		    timeout.tv_nsec < 0) {
3154			return (EINVAL);
3155		}
3156		ts = &timeout;
3157	}
3158	return (do_sem_wait(td, uap->obj, ts));
3159}
3160
3161static int
3162__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3163{
3164	return (do_sem_wake(td, uap->obj));
3165}
3166
3167typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3168
3169static _umtx_op_func op_table[] = {
3170	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3171	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3172	__umtx_op_wait,			/* UMTX_OP_WAIT */
3173	__umtx_op_wake,			/* UMTX_OP_WAKE */
3174	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3175	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3176	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3177	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3178	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT*/
3179	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3180	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3181	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3182	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3183	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3184	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3185	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3186	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3187	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3188	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3189	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3190	__umtx_op_sem_wake		/* UMTX_OP_SEM_WAKE */
3191};
3192
3193int
3194_umtx_op(struct thread *td, struct _umtx_op_args *uap)
3195{
3196	if ((unsigned)uap->op < UMTX_OP_MAX)
3197		return (*op_table[uap->op])(td, uap);
3198	return (EINVAL);
3199}
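
/*
 * Editorial sketch (not compiled): the userland prototype is
 *
 *	int _umtx_op(void *obj, int op, u_long val, void *uaddr,
 *	    void *uaddr2);
 *
 * and the private wait/wake pair gives a minimal futex-style usage
 * ("word" is hypothetical):
 */
#if 0
	u_int word = 0;

	/* Sleep while word still equals 0; NULL uaddr2 waits forever. */
	_umtx_op(&word, UMTX_OP_WAIT_UINT_PRIVATE, 0, NULL, NULL);

	/* From another thread: wake at most one sleeper on the word. */
	_umtx_op(&word, UMTX_OP_WAKE_PRIVATE, 1, NULL, NULL);
#endif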
3200
3201#ifdef COMPAT_IA32
3202int
3203freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3204    /* struct umtx *umtx */
3205{
3206	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3207}
3208
3209int
3210freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3211    /* struct umtx *umtx */
3212{
3213	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3214}
3215
3216struct timespec32 {
3217	u_int32_t tv_sec;
3218	u_int32_t tv_nsec;
3219};
3220
3221static inline int
3222copyin_timeout32(void *addr, struct timespec *tsp)
3223{
3224	struct timespec32 ts32;
3225	int error;
3226
3227	error = copyin(addr, &ts32, sizeof(struct timespec32));
3228	if (error == 0) {
3229		tsp->tv_sec = ts32.tv_sec;
3230		tsp->tv_nsec = ts32.tv_nsec;
3231	}
3232	return (error);
3233}
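
/*
 * Editorial note: a 32-bit process passes this narrow layout wherever
 * the native ABI passes a struct timespec, and the helper above
 * widens it.  Sketch of the 32-bit caller's view (not compiled; "m"
 * is hypothetical):
 */
#if 0
	struct timespec32 ts32 = { .tv_sec = 1, .tv_nsec = 0 };

	_umtx_op(&m, UMTX_OP_MUTEX_LOCK, 0, NULL, &ts32);
#endif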
3234
3235static int
3236__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3237{
3238	struct timespec *ts, timeout;
3239	int error;
3240
3241	/* Allow a null timespec (wait forever). */
3242	if (uap->uaddr2 == NULL)
3243		ts = NULL;
3244	else {
3245		error = copyin_timeout32(uap->uaddr2, &timeout);
3246		if (error != 0)
3247			return (error);
3248		if (timeout.tv_nsec >= 1000000000 ||
3249		    timeout.tv_nsec < 0) {
3250			return (EINVAL);
3251		}
3252		ts = &timeout;
3253	}
3254	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3255}
3256
3257static int
3258__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3259{
3260	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3261}
3262
3263static int
3264__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3265{
3266	struct timespec *ts, timeout;
3267	int error;
3268
3269	if (uap->uaddr2 == NULL)
3270		ts = NULL;
3271	else {
3272		error = copyin_timeout32(uap->uaddr2, &timeout);
3273		if (error != 0)
3274			return (error);
3275		if (timeout.tv_nsec >= 1000000000 ||
3276		    timeout.tv_nsec < 0)
3277			return (EINVAL);
3278		ts = &timeout;
3279	}
3280	return (do_wait(td, uap->obj, uap->val, ts, 1, 0));
3281}
3282
3283static int
3284__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3285{
3286	struct timespec *ts, timeout;
3287	int error;
3288
3289	/* Allow a null timespec (wait forever). */
3290	if (uap->uaddr2 == NULL)
3291		ts = NULL;
3292	else {
3293		error = copyin_timeout32(uap->uaddr2, &timeout);
3294		if (error != 0)
3295			return (error);
3296		if (timeout.tv_nsec >= 1000000000 ||
3297		    timeout.tv_nsec < 0)
3298			return (EINVAL);
3299		ts = &timeout;
3300	}
3301	return (do_lock_umutex(td, uap->obj, ts, 0));
3302}
3303
3304static int
3305__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3306{
3307	struct timespec *ts, timeout;
3308	int error;
3309
3310	/* Allow a null timespec (wait forever). */
3311	if (uap->uaddr2 == NULL)
3312		ts = NULL;
3313	else {
3314		error = copyin_timeout32(uap->uaddr2, &timeout);
3315		if (error != 0)
3316			return (error);
3317		if (timeout.tv_nsec >= 1000000000 ||
3318		    timeout.tv_nsec < 0)
3319			return (EINVAL);
3320		ts = &timeout;
3321	}
3322	return (do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT));
3323}
3324
3325static int
3326__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3327{
3328	struct timespec *ts, timeout;
3329	int error;
3330
3331	/* Allow a null timespec (wait forever). */
3332	if (uap->uaddr2 == NULL)
3333		ts = NULL;
3334	else {
3335		error = copyin_timeout32(uap->uaddr2, &timeout);
3336		if (error != 0)
3337			return (error);
3338		if (timeout.tv_nsec >= 1000000000 ||
3339		    timeout.tv_nsec < 0)
3340			return (EINVAL);
3341		ts = &timeout;
3342	}
3343	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3344}
3345
3346static int
3347__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3348{
3349	struct timespec timeout;
3350	int error;
3351
3352	/* Allow a null timespec (wait forever). */
3353	if (uap->uaddr2 == NULL) {
3354		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3355	} else {
3356		/* A 32-bit process passes a struct timespec32 here. */
3357		error = copyin_timeout32(uap->uaddr2, &timeout);
3358		if (error != 0)
3359			return (error);
3360		if (timeout.tv_nsec >= 1000000000 ||
3361		    timeout.tv_nsec < 0) {
3362			return (EINVAL);
3363		}
3364		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3365	}
3366	return (error);
3367}
3368
3369static int
3370__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3371{
3372	struct timespec timeout;
3373	int error;
3374
3375	/* Allow a null timespec (wait forever). */
3376	if (uap->uaddr2 == NULL) {
3377		error = do_rw_wrlock(td, uap->obj, 0);
3378	} else {
3379		error = copyin_timeout32(uap->uaddr2, &timeout);
3380		if (error != 0)
3381			return (error);
3382		if (timeout.tv_nsec >= 1000000000 ||
3383		    timeout.tv_nsec < 0) {
3384			return (EINVAL);
3385		}
3386
3387		error = do_rw_wrlock2(td, uap->obj, &timeout);
3388	}
3389	return (error);
3390}
3391
3392static int
3393__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3394{
3395	struct timespec *ts, timeout;
3396	int error;
3397
3398	if (uap->uaddr2 == NULL)
3399		ts = NULL;
3400	else {
3401		error = copyin_timeout32(uap->uaddr2, &timeout);
3402		if (error != 0)
3403			return (error);
3404		if (timeout.tv_nsec >= 1000000000 ||
3405		    timeout.tv_nsec < 0)
3406			return (EINVAL);
3407		ts = &timeout;
3408	}
3409	return (do_wait(td, uap->obj, uap->val, ts, 1, 1));
3410}
3411
3412static int
3413__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3414{
3415	struct timespec *ts, timeout;
3416	int error;
3417
3418	/* Allow a null timespec (wait forever). */
3419	if (uap->uaddr2 == NULL)
3420		ts = NULL;
3421	else {
3422		error = copyin_timeout32(uap->uaddr2, &timeout);
3423		if (error != 0)
3424			return (error);
3425		if (timeout.tv_nsec >= 1000000000 ||
3426		    timeout.tv_nsec < 0)
3427			return (EINVAL);
3428		ts = &timeout;
3429	}
3430	return (do_sem_wait(td, uap->obj, ts));
3431}
3432
3433static _umtx_op_func op_table_compat32[] = {
3434	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3435	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3436	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3437	__umtx_op_wake,			/* UMTX_OP_WAKE */
3438	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3439	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3440	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
3441	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3442	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT*/
3443	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3444	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3445	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3446	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3447	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3448	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3449	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3450	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3451	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3452	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3453	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3454	__umtx_op_sem_wake		/* UMTX_OP_SEM_WAKE */
3455};
3456
3457int
3458freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3459{
3460	if ((unsigned)uap->op < UMTX_OP_MAX)
3461		return (*op_table_compat32[uap->op])(td,
3462			(struct _umtx_op_args *)uap);
3463	return (EINVAL);
3464}
3465#endif
3466
3467void
3468umtx_thread_init(struct thread *td)
3469{
3470	td->td_umtxq = umtxq_alloc();
3471	td->td_umtxq->uq_thread = td;
3472}
3473
3474void
3475umtx_thread_fini(struct thread *td)
3476{
3477	umtxq_free(td->td_umtxq);
3478}
3479
3480/*
3481 * Called when a new thread is created, e.g. by fork().
3482 */
3483void
3484umtx_thread_alloc(struct thread *td)
3485{
3486	struct umtx_q *uq;
3487
3488	uq = td->td_umtxq;
3489	uq->uq_inherited_pri = PRI_MAX;
3490
3491	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3492	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3493	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3494	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3495}
3496
3497/*
3498 * exec() hook.
3499 */
3500static void
3501umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3502	struct image_params *imgp __unused)
3503{
3504	umtx_thread_cleanup(curthread);
3505}
3506
3507/*
3508 * thread_exit() hook.
3509 */
3510void
3511umtx_thread_exit(struct thread *td)
3512{
3513	umtx_thread_cleanup(td);
3514}
3515
3516/*
3517 * Clean up umtx data.
3518 */
3519static void
3520umtx_thread_cleanup(struct thread *td)
3521{
3522	struct umtx_q *uq;
3523	struct umtx_pi *pi;
3524
3525	if ((uq = td->td_umtxq) == NULL)
3526		return;
3527
3528	mtx_lock_spin(&umtx_lock);
3529	uq->uq_inherited_pri = PRI_MAX;
3530	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3531		pi->pi_owner = NULL;
3532		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3533	}
3534	thread_lock(td);
3535	td->td_flags &= ~TDF_UBORROWING;
3536	thread_unlock(td);
3537	mtx_unlock_spin(&umtx_lock);
3538}
3539