kern_umtx.c revision 233045
1/*-
2 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice unmodified, this list of conditions, and the following
11 *    disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 233045 2012-03-16 20:32:11Z davide $");
30
31#include "opt_compat.h"
32#include "opt_umtx_profiling.h"
33
34#include <sys/param.h>
35#include <sys/kernel.h>
36#include <sys/limits.h>
37#include <sys/lock.h>
38#include <sys/malloc.h>
39#include <sys/mutex.h>
40#include <sys/priv.h>
41#include <sys/proc.h>
42#include <sys/sched.h>
43#include <sys/smp.h>
44#include <sys/sysctl.h>
45#include <sys/sysent.h>
46#include <sys/systm.h>
47#include <sys/sysproto.h>
48#include <sys/syscallsubr.h>
49#include <sys/eventhandler.h>
50#include <sys/umtx.h>
51
52#include <vm/vm.h>
53#include <vm/vm_param.h>
54#include <vm/pmap.h>
55#include <vm/vm_map.h>
56#include <vm/vm_object.h>
57
58#include <machine/cpu.h>
59
60#ifdef COMPAT_FREEBSD32
61#include <compat/freebsd32/freebsd32_proto.h>
62#endif
63
64#define _UMUTEX_TRY		1
65#define _UMUTEX_WAIT		2
66
67/* Priority inheritance mutex info. */
68struct umtx_pi {
69	/* Owner thread */
70	struct thread		*pi_owner;
71
72	/* Reference count */
73	int			pi_refcount;
74
75	/* List entry linking this PI mutex onto the holding thread's list */
76	TAILQ_ENTRY(umtx_pi)	pi_link;
77
78	/* List entry in hash */
79	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
80
81	/* List for waiters */
82	TAILQ_HEAD(,umtx_q)	pi_blocked;
83
84	/* Identify a userland lock object */
85	struct umtx_key		pi_key;
86};
87
88/* A userland synchronous object user. */
89struct umtx_q {
90	/* Linked list for the hash. */
91	TAILQ_ENTRY(umtx_q)	uq_link;
92
93	/* Umtx key. */
94	struct umtx_key		uq_key;
95
96	/* Umtx flags. */
97	int			uq_flags;
98#define UQF_UMTXQ	0x0001
99
100	/* The thread that is waiting. */
101	struct thread		*uq_thread;
102
103	/*
104	 * The PI mutex this thread is blocked on.  Reads may use
105	 * either the chain lock or umtx_lock; writes must hold both
106	 * the chain lock and umtx_lock.
107	 */
108	struct umtx_pi		*uq_pi_blocked;
109
110	/* On blocked list */
111	TAILQ_ENTRY(umtx_q)	uq_lockq;
112
113	/* Contested PI mutexes owned by this thread */
114	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
115
116	/* Inherited priority from PP mutex */
117	u_char			uq_inherited_pri;
118
119	/* Spare queue ready to be reused */
120	struct umtxq_queue	*uq_spare_queue;
121
122	/* The queue we are on */
123	struct umtxq_queue	*uq_cur_queue;
124};
125
126TAILQ_HEAD(umtxq_head, umtx_q);
127
128/* Per-key wait-queue */
129struct umtxq_queue {
130	struct umtxq_head	head;
131	struct umtx_key		key;
132	LIST_ENTRY(umtxq_queue)	link;
133	int			length;
134};
135
136LIST_HEAD(umtxq_list, umtxq_queue);
137
138/* Userland lock object's wait-queue chain */
139struct umtxq_chain {
140	/* Lock for this chain. */
141	struct mtx		uc_lock;
142
143	/* List of sleep queues. */
144	struct umtxq_list	uc_queue[2];
145#define UMTX_SHARED_QUEUE	0
146#define UMTX_EXCLUSIVE_QUEUE	1
147
148	LIST_HEAD(, umtxq_queue) uc_spare_queue;
149
150	/* Busy flag */
151	char			uc_busy;
152
153	/* Chain lock waiters */
154	int			uc_waiters;
155
156	/* All PI in the list */
157	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
158
159#ifdef UMTX_PROFILING
160	int 			length;
161	int			max_length;
162#endif
163};
164
165#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
166#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy, ("umtx chain is not busy"))
167
168/*
169 * Don't propagate time-sharing priority; there is a security
170 * reason.  A user could create a PI mutex, let thread A lock it,
171 * and let another thread B block on it.  Because B is sleeping,
172 * its priority is boosted, which boosts A's priority via
173 * propagation as well; A's priority would then never be lowered,
174 * even while using 100% CPU, which is unfair to other processes.
175 */
176
177#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
178			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
179			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
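/*
 * UPRI() maps every time-sharing thread to PRI_MAX_TIMESHARE, the
 * weakest time-sharing priority, so comparisons between such threads
 * tie and effectively only real-time priorities propagate.
 */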
180
181#define	GOLDEN_RATIO_PRIME	2654404609U
182#define	UMTX_CHAINS		512
183#define	UMTX_SHIFTS		(__WORD_BIT - 9)
184
185#define	GET_SHARE(flags)	\
186    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
187
188#define BUSY_SPINS		200
189
190static uma_zone_t		umtx_pi_zone;
191static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
192static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
193static int			umtx_pi_allocated;
194
195static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
196SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
197    &umtx_pi_allocated, 0, "Allocated umtx_pi");
198
199#ifdef UMTX_PROFILING
200static long max_length;
201SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
202static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
203#endif
204
205static void umtxq_sysinit(void *);
206static void umtxq_hash(struct umtx_key *key);
207static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
208static void umtxq_lock(struct umtx_key *key);
209static void umtxq_unlock(struct umtx_key *key);
210static void umtxq_busy(struct umtx_key *key);
211static void umtxq_unbusy(struct umtx_key *key);
212static void umtxq_insert_queue(struct umtx_q *uq, int q);
213static void umtxq_remove_queue(struct umtx_q *uq, int q);
214static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
215static int umtxq_count(struct umtx_key *key);
216static struct umtx_pi *umtx_pi_alloc(int);
217static void umtx_pi_free(struct umtx_pi *pi);
218static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
219static void umtx_thread_cleanup(struct thread *td);
220static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
221	struct image_params *imgp __unused);
222SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
223
224#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
225#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
226#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)
227
228static struct mtx umtx_lock;
229
230#ifdef UMTX_PROFILING
231static void
232umtx_init_profiling(void)
233{
234	struct sysctl_oid *chain_oid;
235	char chain_name[10];
236	int i;
237
238	for (i = 0; i < UMTX_CHAINS; ++i) {
239		snprintf(chain_name, sizeof(chain_name), "%d", i);
240		chain_oid = SYSCTL_ADD_NODE(NULL,
241		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
242		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
243		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
244		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
245		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
246		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
247	}
248}
249#endif
250
251static void
252umtxq_sysinit(void *arg __unused)
253{
254	int i, j;
255
256	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
257		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
258	for (i = 0; i < 2; ++i) {
259		for (j = 0; j < UMTX_CHAINS; ++j) {
260			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
261				 MTX_DEF | MTX_DUPOK);
262			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
263			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
264			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
265			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
266			umtxq_chains[i][j].uc_busy = 0;
267			umtxq_chains[i][j].uc_waiters = 0;
268			#ifdef UMTX_PROFILING
269			umtxq_chains[i][j].length = 0;
270			umtxq_chains[i][j].max_length = 0;
271			#endif
272		}
273	}
274	#ifdef UMTX_PROFILING
275	umtx_init_profiling();
276	#endif
277	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
278	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
279	    EVENTHANDLER_PRI_ANY);
280}
281
282struct umtx_q *
283umtxq_alloc(void)
284{
285	struct umtx_q *uq;
286
287	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
288	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
289	TAILQ_INIT(&uq->uq_spare_queue->head);
290	TAILQ_INIT(&uq->uq_pi_contested);
291	uq->uq_inherited_pri = PRI_MAX;
292	return (uq);
293}
294
295void
296umtxq_free(struct umtx_q *uq)
297{
298	MPASS(uq->uq_spare_queue != NULL);
299	free(uq->uq_spare_queue, M_UMTX);
300	free(uq, M_UMTX);
301}
302
303static inline void
304umtxq_hash(struct umtx_key *key)
305{
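	/*
	 * Multiplicative (Fibonacci) hash: GOLDEN_RATIO_PRIME mixes the
	 * key bits, the shift keeps the high-order bits of the product,
	 * and the modulo is only a safety net since UMTX_CHAINS is a
	 * power of two.
	 */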
306	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
307	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
308}
309
310static inline struct umtxq_chain *
311umtxq_getchain(struct umtx_key *key)
312{
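	/*
	 * Object types up to TYPE_SEM (simple wait, condvar and
	 * semaphore objects) hash into the second chain table; all
	 * other types use the first.
	 */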
313	if (key->type <= TYPE_SEM)
314		return (&umtxq_chains[1][key->hash]);
315	return (&umtxq_chains[0][key->hash]);
316}
317
318/*
319 * Lock a chain.
320 */
321static inline void
322umtxq_lock(struct umtx_key *key)
323{
324	struct umtxq_chain *uc;
325
326	uc = umtxq_getchain(key);
327	mtx_lock(&uc->uc_lock);
328}
329
330/*
331 * Unlock a chain.
332 */
333static inline void
334umtxq_unlock(struct umtx_key *key)
335{
336	struct umtxq_chain *uc;
337
338	uc = umtxq_getchain(key);
339	mtx_unlock(&uc->uc_lock);
340}
341
342/*
343 * Set the chain to the busy state when the following operation
344 * may block (a kernel mutex cannot be used to cover it).
345 */
346static inline void
347umtxq_busy(struct umtx_key *key)
348{
349	struct umtxq_chain *uc;
350
351	uc = umtxq_getchain(key);
352	mtx_assert(&uc->uc_lock, MA_OWNED);
353	if (uc->uc_busy) {
354#ifdef SMP
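		/*
		 * Spin briefly in the hope that the current holder
		 * unbusies the chain soon, before falling back to
		 * sleeping on it below.
		 */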
355		if (smp_cpus > 1) {
356			int count = BUSY_SPINS;
357			if (count > 0) {
358				umtxq_unlock(key);
359				while (uc->uc_busy && --count > 0)
360					cpu_spinwait();
361				umtxq_lock(key);
362			}
363		}
364#endif
365		while (uc->uc_busy) {
366			uc->uc_waiters++;
367			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
368			uc->uc_waiters--;
369		}
370	}
371	uc->uc_busy = 1;
372}
373
374/*
375 * Unbusy a chain.
376 */
377static inline void
378umtxq_unbusy(struct umtx_key *key)
379{
380	struct umtxq_chain *uc;
381
382	uc = umtxq_getchain(key);
383	mtx_assert(&uc->uc_lock, MA_OWNED);
384	KASSERT(uc->uc_busy != 0, ("not busy"));
385	uc->uc_busy = 0;
386	if (uc->uc_waiters)
387		wakeup_one(uc);
388}
389
390static struct umtxq_queue *
391umtxq_queue_lookup(struct umtx_key *key, int q)
392{
393	struct umtxq_queue *uh;
394	struct umtxq_chain *uc;
395
396	uc = umtxq_getchain(key);
397	UMTXQ_LOCKED_ASSERT(uc);
398	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
399		if (umtx_key_match(&uh->key, key))
400			return (uh);
401	}
402
403	return (NULL);
404}
405
406static inline void
407umtxq_insert_queue(struct umtx_q *uq, int q)
408{
409	struct umtxq_queue *uh;
410	struct umtxq_chain *uc;
411
412	uc = umtxq_getchain(&uq->uq_key);
413	UMTXQ_LOCKED_ASSERT(uc);
414	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
415	uh = umtxq_queue_lookup(&uq->uq_key, q);
416	if (uh != NULL) {
417		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
418	} else {
419		uh = uq->uq_spare_queue;
420		uh->key = uq->uq_key;
421		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
422	}
423	uq->uq_spare_queue = NULL;
424
425	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
426	uh->length++;
427	#ifdef UMTX_PROFILING
428	uc->length++;
429	if (uc->length > uc->max_length) {
430		uc->max_length = uc->length;
431		if (uc->max_length > max_length)
432			max_length = uc->max_length;
433	}
434	#endif
435	uq->uq_flags |= UQF_UMTXQ;
436	uq->uq_cur_queue = uh;
437	return;
438}
439
440static inline void
441umtxq_remove_queue(struct umtx_q *uq, int q)
442{
443	struct umtxq_chain *uc;
444	struct umtxq_queue *uh;
445
446	uc = umtxq_getchain(&uq->uq_key);
447	UMTXQ_LOCKED_ASSERT(uc);
448	if (uq->uq_flags & UQF_UMTXQ) {
449		uh = uq->uq_cur_queue;
450		TAILQ_REMOVE(&uh->head, uq, uq_link);
451		uh->length--;
452		#ifdef UMTX_PROFILING
453		uc->length--;
454		#endif
455		uq->uq_flags &= ~UQF_UMTXQ;
456		if (TAILQ_EMPTY(&uh->head)) {
457			KASSERT(uh->length == 0,
458			    ("inconsistent umtxq_queue length"));
459			LIST_REMOVE(uh, link);
460		} else {
461			uh = LIST_FIRST(&uc->uc_spare_queue);
462			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
463			LIST_REMOVE(uh, link);
464		}
465		uq->uq_spare_queue = uh;
466		uq->uq_cur_queue = NULL;
467	}
468}
469
470/*
471 * Return the number of waiters on the key's shared queue.
472 */
473static int
474umtxq_count(struct umtx_key *key)
475{
476	struct umtxq_chain *uc;
477	struct umtxq_queue *uh;
478
479	uc = umtxq_getchain(key);
480	UMTXQ_LOCKED_ASSERT(uc);
481	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
482	if (uh != NULL)
483		return (uh->length);
484	return (0);
485}
486
487/*
488 * Return the number of PI waiters and, via *first, the first
489 * waiter.
490 */
491static int
492umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
493{
494	struct umtxq_chain *uc;
495	struct umtxq_queue *uh;
496
497	*first = NULL;
498	uc = umtxq_getchain(key);
499	UMTXQ_LOCKED_ASSERT(uc);
500	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
501	if (uh != NULL) {
502		*first = TAILQ_FIRST(&uh->head);
503		return (uh->length);
504	}
505	return (0);
506}
507
508/*
509 * Wake up threads waiting on an userland object.
510 */
511
512static int
513umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
514{
515	struct umtxq_chain *uc;
516	struct umtxq_queue *uh;
517	struct umtx_q *uq;
518	int ret;
519
520	ret = 0;
521	uc = umtxq_getchain(key);
522	UMTXQ_LOCKED_ASSERT(uc);
523	uh = umtxq_queue_lookup(key, q);
524	if (uh != NULL) {
525		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
526			umtxq_remove_queue(uq, q);
527			wakeup(uq);
528			if (++ret >= n_wake)
529				return (ret);
530		}
531	}
532	return (ret);
533}
534
535
536/*
537 * Wake up specified thread.
538 */
539static inline void
540umtxq_signal_thread(struct umtx_q *uq)
541{
542	struct umtxq_chain *uc;
543
544	uc = umtxq_getchain(&uq->uq_key);
545	UMTXQ_LOCKED_ASSERT(uc);
546	umtxq_remove(uq);
547	wakeup(uq);
548}
549
550/*
551 * Put the thread into a sleep state; before sleeping, check
552 * whether the thread was already removed from the umtx queue.
553 */
554static inline int
555umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
556{
557	struct umtxq_chain *uc;
558	int error;
559
560	uc = umtxq_getchain(&uq->uq_key);
561	UMTXQ_LOCKED_ASSERT(uc);
562	if (!(uq->uq_flags & UQF_UMTXQ))
563		return (0);
564	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
565	if (error == EWOULDBLOCK)
566		error = ETIMEDOUT;
567	return (error);
568}
569
570/*
571 * Convert a userspace address into a unique logical address.
572 */
573int
574umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
575{
576	struct thread *td = curthread;
577	vm_map_t map;
578	vm_map_entry_t entry;
579	vm_pindex_t pindex;
580	vm_prot_t prot;
581	boolean_t wired;
582
583	key->type = type;
584	if (share == THREAD_SHARE) {
585		key->shared = 0;
586		key->info.private.vs = td->td_proc->p_vmspace;
587		key->info.private.addr = (uintptr_t)addr;
588	} else {
589		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
590		map = &td->td_proc->p_vmspace->vm_map;
591		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
592		    &entry, &key->info.shared.object, &pindex, &prot,
593		    &wired) != KERN_SUCCESS) {
594			return EFAULT;
595		}
596
597		if ((share == PROCESS_SHARE) ||
598		    (share == AUTO_SHARE &&
599		     VM_INHERIT_SHARE == entry->inheritance)) {
600			key->shared = 1;
601			key->info.shared.offset = entry->offset + entry->start -
602				(vm_offset_t)addr;
603			vm_object_reference(key->info.shared.object);
604		} else {
605			key->shared = 0;
606			key->info.private.vs = td->td_proc->p_vmspace;
607			key->info.private.addr = (uintptr_t)addr;
608		}
609		vm_map_lookup_done(map, entry);
610	}
611
612	umtxq_hash(key);
613	return (0);
614}
615
616/*
617 * Release key.
618 */
619void
620umtx_key_release(struct umtx_key *key)
621{
622	if (key->shared)
623		vm_object_deallocate(key->info.shared.object);
624}
625
626/*
627 * Lock a umtx object.
628 */
629static int
630_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
631{
632	struct umtx_q *uq;
633	u_long owner;
634	u_long old;
635	int error = 0;
636
637	uq = td->td_umtxq;
638
639	/*
640	 * Care must be exercised when dealing with the umtx
641	 * structure; it can fault on any access.
642	 */
643	for (;;) {
644		/*
645		 * Try the uncontested case.  This should be done in userland.
646		 */
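		/*
		 * A minimal sketch of that userland fast path, assuming
		 * a libthr-style wrapper (illustrative only, not the
		 * actual library code):
		 *
		 *	if (atomic_cmpset_acq_long(&umtx->u_owner,
		 *	    UMTX_UNOWNED, (u_long)id))
		 *		return (0);	(uncontested acquire)
		 */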
647		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
648
649		/* The acquire succeeded. */
650		if (owner == UMTX_UNOWNED)
651			return (0);
652
653		/* The address was invalid. */
654		if (owner == -1)
655			return (EFAULT);
656
657		/* If no one owns it but it is contested try to acquire it. */
658		if (owner == UMTX_CONTESTED) {
659			owner = casuword(&umtx->u_owner,
660			    UMTX_CONTESTED, id | UMTX_CONTESTED);
661
662			if (owner == UMTX_CONTESTED)
663				return (0);
664
665			/* The address was invalid. */
666			if (owner == -1)
667				return (EFAULT);
668
669			/* If this failed the lock has changed, restart. */
670			continue;
671		}
672
673		/*
674		 * If we caught a signal, we have already retried and
675		 * now exit immediately.
676		 */
677		if (error != 0)
678			return (error);
679
680		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
681			AUTO_SHARE, &uq->uq_key)) != 0)
682			return (error);
683
684		umtxq_lock(&uq->uq_key);
685		umtxq_busy(&uq->uq_key);
686		umtxq_insert(uq);
687		umtxq_unbusy(&uq->uq_key);
688		umtxq_unlock(&uq->uq_key);
689
690		/*
691		 * Set the contested bit so that a release in user space
692		 * knows to use the system call for unlock.  If this
693		 * fails, either someone else has acquired the lock or
694		 * it has been released.
695		 */
696		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
697
698		/* The address was invalid. */
699		if (old == -1) {
700			umtxq_lock(&uq->uq_key);
701			umtxq_remove(uq);
702			umtxq_unlock(&uq->uq_key);
703			umtx_key_release(&uq->uq_key);
704			return (EFAULT);
705		}
706
707		/*
708		 * If we set the contested bit, sleep.  Otherwise the
709		 * lock changed and we need to retry, or we lost a race
710		 * to the thread unlocking the umtx.
711		 */
712		umtxq_lock(&uq->uq_key);
713		if (old == owner)
714			error = umtxq_sleep(uq, "umtx", timo);
715		umtxq_remove(uq);
716		umtxq_unlock(&uq->uq_key);
717		umtx_key_release(&uq->uq_key);
718	}
719
720	return (0);
721}
722
723/*
724 * Lock a umtx object.
725 */
726static int
727do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
728	struct timespec *timeout)
729{
730	struct timespec ts, ts2, ts3;
731	struct timeval tv;
732	int error;
733
734	if (timeout == NULL) {
735		error = _do_lock_umtx(td, umtx, id, 0);
736		/* Mutex locking is restarted if it is interrupted. */
737		if (error == EINTR)
738			error = ERESTART;
739	} else {
740		getnanouptime(&ts);
741		timespecadd(&ts, timeout);
742		TIMESPEC_TO_TIMEVAL(&tv, timeout);
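		/*
		 * Sleep with a one-shot tick timeout and, after each
		 * ETIMEDOUT wakeup, re-arm it with the time remaining
		 * until the absolute deadline computed above.
		 */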
743		for (;;) {
744			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
745			if (error != ETIMEDOUT)
746				break;
747			getnanouptime(&ts2);
748			if (timespeccmp(&ts2, &ts, >=)) {
749				error = ETIMEDOUT;
750				break;
751			}
752			ts3 = ts;
753			timespecsub(&ts3, &ts2);
754			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
755		}
756		/* Timed-locking is not restarted. */
757		if (error == ERESTART)
758			error = EINTR;
759	}
760	return (error);
761}
762
763/*
764 * Unlock a umtx object.
765 */
766static int
767do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
768{
769	struct umtx_key key;
770	u_long owner;
771	u_long old;
772	int error;
773	int count;
774
775	/*
776	 * Make sure we own this mtx.
777	 */
778	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
779	if (owner == -1)
780		return (EFAULT);
781
782	if ((owner & ~UMTX_CONTESTED) != id)
783		return (EPERM);
784
785	/* This should be done in userland */
786	if ((owner & UMTX_CONTESTED) == 0) {
787		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
788		if (old == -1)
789			return (EFAULT);
790		if (old == owner)
791			return (0);
792		owner = old;
793	}
794
795	/* We should only ever be in here for contested locks */
796	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
797		&key)) != 0)
798		return (error);
799
800	umtxq_lock(&key);
801	umtxq_busy(&key);
802	count = umtxq_count(&key);
803	umtxq_unlock(&key);
804
805	/*
806	 * When unlocking the umtx, it must be marked as unowned if
807	 * only zero or one thread is waiting for it.
808	 * Otherwise, it must be marked as contested.
809	 */
810	old = casuword(&umtx->u_owner, owner,
811		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
812	umtxq_lock(&key);
813	umtxq_signal(&key, 1);
814	umtxq_unbusy(&key);
815	umtxq_unlock(&key);
816	umtx_key_release(&key);
817	if (old == -1)
818		return (EFAULT);
819	if (old != owner)
820		return (EINVAL);
821	return (0);
822}
823
824#ifdef COMPAT_FREEBSD32
825
826/*
827 * Lock a umtx object.
828 */
829static int
830_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
831{
832	struct umtx_q *uq;
833	uint32_t owner;
834	uint32_t old;
835	int error = 0;
836
837	uq = td->td_umtxq;
838
839	/*
840	 * Care must be exercised when dealing with the umtx
841	 * structure; it can fault on any access.
842	 */
843	for (;;) {
844		/*
845		 * Try the uncontested case.  This should be done in userland.
846		 */
847		owner = casuword32(m, UMUTEX_UNOWNED, id);
848
849		/* The acquire succeeded. */
850		if (owner == UMUTEX_UNOWNED)
851			return (0);
852
853		/* The address was invalid. */
854		if (owner == -1)
855			return (EFAULT);
856
857		/* If no one owns it but it is contested try to acquire it. */
858		if (owner == UMUTEX_CONTESTED) {
859			owner = casuword32(m,
860			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
861			if (owner == UMUTEX_CONTESTED)
862				return (0);
863
864			/* The address was invalid. */
865			if (owner == -1)
866				return (EFAULT);
867
868			/* If this failed the lock has changed, restart. */
869			continue;
870		}
871
872		/*
873		 * If we caught a signal, we have already retried and
874		 * now exit immediately.
875		 */
876		if (error != 0)
877			return (error);
878
879		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
880			AUTO_SHARE, &uq->uq_key)) != 0)
881			return (error);
882
883		umtxq_lock(&uq->uq_key);
884		umtxq_busy(&uq->uq_key);
885		umtxq_insert(uq);
886		umtxq_unbusy(&uq->uq_key);
887		umtxq_unlock(&uq->uq_key);
888
889		/*
890		 * Set the contested bit so that a release in user space
891		 * knows to use the system call for unlock.  If this
892		 * fails, either someone else has acquired the lock or
893		 * it has been released.
894		 */
895		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
896
897		/* The address was invalid. */
898		if (old == -1) {
899			umtxq_lock(&uq->uq_key);
900			umtxq_remove(uq);
901			umtxq_unlock(&uq->uq_key);
902			umtx_key_release(&uq->uq_key);
903			return (EFAULT);
904		}
905
906		/*
907		 * If we set the contested bit, sleep.  Otherwise the
908		 * lock changed and we need to retry, or we lost a race
909		 * to the thread unlocking the umtx.
910		 */
911		umtxq_lock(&uq->uq_key);
912		if (old == owner)
913			error = umtxq_sleep(uq, "umtx", timo);
914		umtxq_remove(uq);
915		umtxq_unlock(&uq->uq_key);
916		umtx_key_release(&uq->uq_key);
917	}
918
919	return (0);
920}
921
922/*
923 * Lock a umtx object.
924 */
925static int
926do_lock_umtx32(struct thread *td, void *m, uint32_t id,
927	struct timespec *timeout)
928{
929	struct timespec ts, ts2, ts3;
930	struct timeval tv;
931	int error;
932
933	if (timeout == NULL) {
934		error = _do_lock_umtx32(td, m, id, 0);
935		/* Mutex locking is restarted if it is interrupted. */
936		if (error == EINTR)
937			error = ERESTART;
938	} else {
939		getnanouptime(&ts);
940		timespecadd(&ts, timeout);
941		TIMESPEC_TO_TIMEVAL(&tv, timeout);
942		for (;;) {
943			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
944			if (error != ETIMEDOUT)
945				break;
946			getnanouptime(&ts2);
947			if (timespeccmp(&ts2, &ts, >=)) {
948				error = ETIMEDOUT;
949				break;
950			}
951			ts3 = ts;
952			timespecsub(&ts3, &ts2);
953			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
954		}
955		/* Timed-locking is not restarted. */
956		if (error == ERESTART)
957			error = EINTR;
958	}
959	return (error);
960}
961
962/*
963 * Unlock a umtx object.
964 */
965static int
966do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
967{
968	struct umtx_key key;
969	uint32_t owner;
970	uint32_t old;
971	int error;
972	int count;
973
974	/*
975	 * Make sure we own this mtx.
976	 */
977	owner = fuword32(m);
978	if (owner == -1)
979		return (EFAULT);
980
981	if ((owner & ~UMUTEX_CONTESTED) != id)
982		return (EPERM);
983
984	/* This should be done in userland */
985	if ((owner & UMUTEX_CONTESTED) == 0) {
986		old = casuword32(m, owner, UMUTEX_UNOWNED);
987		if (old == -1)
988			return (EFAULT);
989		if (old == owner)
990			return (0);
991		owner = old;
992	}
993
994	/* We should only ever be in here for contested locks */
995	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
996		&key)) != 0)
997		return (error);
998
999	umtxq_lock(&key);
1000	umtxq_busy(&key);
1001	count = umtxq_count(&key);
1002	umtxq_unlock(&key);
1003
1004	/*
1005	 * When unlocking the umtx, it must be marked as unowned if
1006	 * only zero or one thread is waiting for it.
1007	 * Otherwise, it must be marked as contested.
1008	 */
1009	old = casuword32(m, owner,
1010		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1011	umtxq_lock(&key);
1012	umtxq_signal(&key, 1);
1013	umtxq_unbusy(&key);
1014	umtxq_unlock(&key);
1015	umtx_key_release(&key);
1016	if (old == -1)
1017		return (EFAULT);
1018	if (old != owner)
1019		return (EINVAL);
1020	return (0);
1021}
1022#endif
1023
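/*
 * Convert a timespec into a tick count suitable for msleep(9).
 */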
1024static inline int
1025tstohz(const struct timespec *tsp)
1026{
1027	struct timeval tv;
1028
1029	TIMESPEC_TO_TIMEVAL(&tv, tsp);
1030	return tvtohz(&tv);
1031}
1032
1033/*
1034 * Fetch and compare the value; sleep on the address if it is unchanged.
1035 */
1036static int
1037do_wait(struct thread *td, void *addr, u_long id,
1038	struct _umtx_time *timeout, int compat32, int is_private)
1039{
1040	struct umtx_q *uq;
1041	struct timespec ets, cts, tts;
1042	u_long tmp;
1043	int error = 0;
1044
1045	uq = td->td_umtxq;
1046	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
1047		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
1048		return (error);
1049
1050	umtxq_lock(&uq->uq_key);
1051	umtxq_insert(uq);
1052	umtxq_unlock(&uq->uq_key);
1053	if (compat32 == 0)
1054		tmp = fuword(addr);
1055	else
1056		tmp = (unsigned int)fuword32(addr);
1057	if (tmp != id) {
1058		umtxq_lock(&uq->uq_key);
1059		umtxq_remove(uq);
1060		umtxq_unlock(&uq->uq_key);
1061	} else if (timeout == NULL) {
1062		umtxq_lock(&uq->uq_key);
1063		error = umtxq_sleep(uq, "uwait", 0);
1064		umtxq_remove(uq);
1065		umtxq_unlock(&uq->uq_key);
1066	} else {
1067		kern_clock_gettime(td, timeout->_clockid, &cts);
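		/*
		 * Compute the absolute end time: a relative timeout is
		 * added to the current time on the requested clock.
		 */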
1068		if ((timeout->_flags & UMTX_ABSTIME) == 0) {
1069			ets = cts;
1070			timespecadd(&ets, &timeout->_timeout);
1071		} else {
1072			ets = timeout->_timeout;
1073		}
1074		umtxq_lock(&uq->uq_key);
1075		for (;;) {
1076			if (timespeccmp(&cts, &ets, >=)) {
1077				error = ETIMEDOUT;
1078				break;
1079			}
1080			tts = ets;
1081			timespecsub(&tts, &cts);
1082			error = umtxq_sleep(uq, "uwait", tstohz(&tts));
1083			if (!(uq->uq_flags & UQF_UMTXQ)) {
1084				error = 0;
1085				break;
1086			}
1087			if (error != ETIMEDOUT)
1088				break;
1089			umtxq_unlock(&uq->uq_key);
1090			kern_clock_gettime(td, timeout->_clockid, &cts);
1091			umtxq_lock(&uq->uq_key);
1092		}
1093		umtxq_remove(uq);
1094		umtxq_unlock(&uq->uq_key);
1095	}
1096	umtx_key_release(&uq->uq_key);
1097	if (error == ERESTART)
1098		error = EINTR;
1099	return (error);
1100}
1101
1102/*
1103 * Wake up threads sleeping on the specified address.
1104 */
1105int
1106kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
1107{
1108	struct umtx_key key;
1109	int ret;
1110
1111	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
1112		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
1113		return (ret);
1114	umtxq_lock(&key);
1115	ret = umtxq_signal(&key, n_wake);
1116	umtxq_unlock(&key);
1117	umtx_key_release(&key);
1118	return (0);
1119}
1120
1121/*
1122 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
1123 */
1124static int
1125_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1126	int mode)
1127{
1128	struct umtx_q *uq;
1129	uint32_t owner, old, id;
1130	int error = 0;
1131
1132	id = td->td_tid;
1133	uq = td->td_umtxq;
1134
1135	/*
1136	 * Care must be exercised when dealing with the umtx
1137	 * structure; it can fault on any access.
1138	 */
1139	for (;;) {
1140		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
1141		if (mode == _UMUTEX_WAIT) {
1142			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
1143				return (0);
1144		} else {
1145			/*
1146			 * Try the uncontested case.  This should be done in userland.
1147			 */
1148			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1149
1150			/* The acquire succeeded. */
1151			if (owner == UMUTEX_UNOWNED)
1152				return (0);
1153
1154			/* The address was invalid. */
1155			if (owner == -1)
1156				return (EFAULT);
1157
1158			/* If no one owns it but it is contested try to acquire it. */
1159			if (owner == UMUTEX_CONTESTED) {
1160				owner = casuword32(&m->m_owner,
1161				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1162
1163				if (owner == UMUTEX_CONTESTED)
1164					return (0);
1165
1166				/* The address was invalid. */
1167				if (owner == -1)
1168					return (EFAULT);
1169
1170				/* If this failed the lock has changed, restart. */
1171				continue;
1172			}
1173		}
1174
1175		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1176		    (owner & ~UMUTEX_CONTESTED) == id)
1177			return (EDEADLK);
1178
1179		if (mode == _UMUTEX_TRY)
1180			return (EBUSY);
1181
1182		/*
1183		 * If we caught a signal, we have already retried and
1184		 * now exit immediately.
1185		 */
1186		if (error != 0)
1187			return (error);
1188
1189		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
1190		    GET_SHARE(flags), &uq->uq_key)) != 0)
1191			return (error);
1192
1193		umtxq_lock(&uq->uq_key);
1194		umtxq_busy(&uq->uq_key);
1195		umtxq_insert(uq);
1196		umtxq_unlock(&uq->uq_key);
1197
1198		/*
1199		 * Set the contested bit so that a release in user space
1200		 * knows to use the system call for unlock.  If this
1201		 * fails, either someone else has acquired the lock or
1202		 * it has been released.
1203		 */
1204		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1205
1206		/* The address was invalid. */
1207		if (old == -1) {
1208			umtxq_lock(&uq->uq_key);
1209			umtxq_remove(uq);
1210			umtxq_unbusy(&uq->uq_key);
1211			umtxq_unlock(&uq->uq_key);
1212			umtx_key_release(&uq->uq_key);
1213			return (EFAULT);
1214		}
1215
1216		/*
1217		 * If we set the contested bit, sleep.  Otherwise the
1218		 * lock changed and we need to retry, or we lost a race
1219		 * to the thread unlocking the umtx.
1220		 */
1221		umtxq_lock(&uq->uq_key);
1222		umtxq_unbusy(&uq->uq_key);
1223		if (old == owner)
1224			error = umtxq_sleep(uq, "umtxn", timo);
1225		umtxq_remove(uq);
1226		umtxq_unlock(&uq->uq_key);
1227		umtx_key_release(&uq->uq_key);
1228	}
1229
1230	return (0);
1231}
1232
1236/*
1237 * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
1238 */
1239static int
1240do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
1241{
1242	struct umtx_key key;
1243	uint32_t owner, old, id;
1244	int error;
1245	int count;
1246
1247	id = td->td_tid;
1248	/*
1249	 * Make sure we own this mtx.
1250	 */
1251	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1252	if (owner == -1)
1253		return (EFAULT);
1254
1255	if ((owner & ~UMUTEX_CONTESTED) != id)
1256		return (EPERM);
1257
1258	if ((owner & UMUTEX_CONTESTED) == 0) {
1259		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1260		if (old == -1)
1261			return (EFAULT);
1262		if (old == owner)
1263			return (0);
1264		owner = old;
1265	}
1266
1267	/* We should only ever be in here for contested locks */
1268	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1269	    &key)) != 0)
1270		return (error);
1271
1272	umtxq_lock(&key);
1273	umtxq_busy(&key);
1274	count = umtxq_count(&key);
1275	umtxq_unlock(&key);
1276
1277	/*
1278	 * When unlocking the umtx, it must be marked as unowned if
1279	 * only zero or one thread is waiting for it.
1280	 * Otherwise, it must be marked as contested.
1281	 */
1282	old = casuword32(&m->m_owner, owner,
1283		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1284	umtxq_lock(&key);
1285	umtxq_signal(&key, 1);
1286	umtxq_unbusy(&key);
1287	umtxq_unlock(&key);
1288	umtx_key_release(&key);
1289	if (old == -1)
1290		return (EFAULT);
1291	if (old != owner)
1292		return (EINVAL);
1293	return (0);
1294}
1295
1296/*
1297 * Check if the mutex is available and wake up a waiter;
1298 * for simple (non-PI, non-PP) mutexes only.
1299 */
1300static int
1301do_wake_umutex(struct thread *td, struct umutex *m)
1302{
1303	struct umtx_key key;
1304	uint32_t owner;
1305	uint32_t flags;
1306	int error;
1307	int count;
1308
1309	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1310	if (owner == -1)
1311		return (EFAULT);
1312
1313	if ((owner & ~UMUTEX_CONTESTED) != 0)
1314		return (0);
1315
1316	flags = fuword32(&m->m_flags);
1317
1318	/* We should only ever be in here for contested locks */
1319	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
1320	    &key)) != 0)
1321		return (error);
1322
1323	umtxq_lock(&key);
1324	umtxq_busy(&key);
1325	count = umtxq_count(&key);
1326	umtxq_unlock(&key);
1327
1328	if (count <= 1)
1329		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
1330
1331	umtxq_lock(&key);
1332	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
1333		umtxq_signal(&key, 1);
1334	umtxq_unbusy(&key);
1335	umtxq_unlock(&key);
1336	umtx_key_release(&key);
1337	return (0);
1338}
1339
1340static inline struct umtx_pi *
1341umtx_pi_alloc(int flags)
1342{
1343	struct umtx_pi *pi;
1344
1345	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1346	TAILQ_INIT(&pi->pi_blocked);
1347	atomic_add_int(&umtx_pi_allocated, 1);
1348	return (pi);
1349}
1350
1351static inline void
1352umtx_pi_free(struct umtx_pi *pi)
1353{
1354	uma_zfree(umtx_pi_zone, pi);
1355	atomic_add_int(&umtx_pi_allocated, -1);
1356}
1357
1358/*
1359 * Adjust the thread's position on a PI mutex's blocked list after
1360 * its priority has been changed.
1361 */
1362static int
1363umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1364{
1365	struct umtx_q *uq, *uq1, *uq2;
1366	struct thread *td1;
1367
1368	mtx_assert(&umtx_lock, MA_OWNED);
1369	if (pi == NULL)
1370		return (0);
1371
1372	uq = td->td_umtxq;
1373
1374	/*
1375	 * Check if the thread needs to be moved on the blocked chain.
1376	 * It needs to be moved if its priority value is either lower
1377	 * than the previous thread's or higher than the next thread's.
1378	 */
1379	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1380	uq2 = TAILQ_NEXT(uq, uq_lockq);
1381	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1382	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1383		/*
1384		 * Remove thread from blocked chain and determine where
1385		 * it should be moved to.
1386		 */
1387		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1388		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1389			td1 = uq1->uq_thread;
1390			MPASS(td1->td_proc->p_magic == P_MAGIC);
1391			if (UPRI(td1) > UPRI(td))
1392				break;
1393		}
1394
1395		if (uq1 == NULL)
1396			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1397		else
1398			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1399	}
1400	return (1);
1401}
1402
1403/*
1404 * Propagate priority when a thread is blocked on a POSIX
1405 * PI mutex.
1406 */
1407static void
1408umtx_propagate_priority(struct thread *td)
1409{
1410	struct umtx_q *uq;
1411	struct umtx_pi *pi;
1412	int pri;
1413
1414	mtx_assert(&umtx_lock, MA_OWNED);
1415	pri = UPRI(td);
1416	uq = td->td_umtxq;
1417	pi = uq->uq_pi_blocked;
1418	if (pi == NULL)
1419		return;
1420
1421	for (;;) {
1422		td = pi->pi_owner;
1423		if (td == NULL || td == curthread)
1424			return;
1425
1426		MPASS(td->td_proc != NULL);
1427		MPASS(td->td_proc->p_magic == P_MAGIC);
1428
1429		thread_lock(td);
1430		if (td->td_lend_user_pri > pri)
1431			sched_lend_user_prio(td, pri);
1432		else {
1433			thread_unlock(td);
1434			break;
1435		}
1436		thread_unlock(td);
1437
1438		/*
1439		 * Pick up the lock that td is blocked on.
1440		 */
1441		uq = td->td_umtxq;
1442		pi = uq->uq_pi_blocked;
1443		if (pi == NULL)
1444			break;
1445		/* Resort td on the list if needed. */
1446		umtx_pi_adjust_thread(pi, td);
1447	}
1448}
1449
1450/*
1451 * Unpropagate priority for a PI mutex when a thread blocked on
1452 * it is interrupted by a signal or resumed by another thread.
1453 */
1454static void
1455umtx_repropagate_priority(struct umtx_pi *pi)
1456{
1457	struct umtx_q *uq, *uq_owner;
1458	struct umtx_pi *pi2;
1459	int pri;
1460
1461	mtx_assert(&umtx_lock, MA_OWNED);
1462
1463	while (pi != NULL && pi->pi_owner != NULL) {
1464		pri = PRI_MAX;
1465		uq_owner = pi->pi_owner->td_umtxq;
1466
1467		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1468			uq = TAILQ_FIRST(&pi2->pi_blocked);
1469			if (uq != NULL) {
1470				if (pri > UPRI(uq->uq_thread))
1471					pri = UPRI(uq->uq_thread);
1472			}
1473		}
1474
1475		if (pri > uq_owner->uq_inherited_pri)
1476			pri = uq_owner->uq_inherited_pri;
1477		thread_lock(pi->pi_owner);
1478		sched_lend_user_prio(pi->pi_owner, pri);
1479		thread_unlock(pi->pi_owner);
1480		if ((pi = uq_owner->uq_pi_blocked) != NULL)
1481			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
1482	}
1483}
1484
1485/*
1486 * Insert a PI mutex into the owning thread's contested list.
1487 */
1488static void
1489umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1490{
1491	struct umtx_q *uq_owner;
1492
1493	uq_owner = owner->td_umtxq;
1494	mtx_assert(&umtx_lock, MA_OWNED);
1495	if (pi->pi_owner != NULL)
1496		panic("pi_owner != NULL");
1497	pi->pi_owner = owner;
1498	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1499}
1500
1501/*
1502 * Claim ownership of a PI mutex.
1503 */
1504static int
1505umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1506{
1507	struct umtx_q *uq, *uq_owner;
1508
1509	uq_owner = owner->td_umtxq;
1510	mtx_lock_spin(&umtx_lock);
1511	if (pi->pi_owner == owner) {
1512		mtx_unlock_spin(&umtx_lock);
1513		return (0);
1514	}
1515
1516	if (pi->pi_owner != NULL) {
1517		/*
1518		 * Userland may have already messed up the mutex; sigh.
1519		 */
1520		mtx_unlock_spin(&umtx_lock);
1521		return (EPERM);
1522	}
1523	umtx_pi_setowner(pi, owner);
1524	uq = TAILQ_FIRST(&pi->pi_blocked);
1525	if (uq != NULL) {
1526		int pri;
1527
1528		pri = UPRI(uq->uq_thread);
1529		thread_lock(owner);
1530		if (pri < UPRI(owner))
1531			sched_lend_user_prio(owner, pri);
1532		thread_unlock(owner);
1533	}
1534	mtx_unlock_spin(&umtx_lock);
1535	return (0);
1536}
1537
1538/*
1539 * Adjust a thread's position on the blocked list of its PI mutex;
1540 * this may trigger a new round of priority propagation.
1541 */
1542void
1543umtx_pi_adjust(struct thread *td, u_char oldpri)
1544{
1545	struct umtx_q *uq;
1546	struct umtx_pi *pi;
1547
1548	uq = td->td_umtxq;
1549	mtx_lock_spin(&umtx_lock);
1550	/*
1551	 * Pick up the lock that td is blocked on.
1552	 */
1553	pi = uq->uq_pi_blocked;
1554	if (pi != NULL) {
1555		umtx_pi_adjust_thread(pi, td);
1556		umtx_repropagate_priority(pi);
1557	}
1558	mtx_unlock_spin(&umtx_lock);
1559}
1560
1561/*
1562 * Sleep on a PI mutex.
1563 */
1564static int
1565umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1566	uint32_t owner, const char *wmesg, int timo)
1567{
1568	struct umtxq_chain *uc;
1569	struct thread *td, *td1;
1570	struct umtx_q *uq1;
1571	int pri;
1572	int error = 0;
1573
1574	td = uq->uq_thread;
1575	KASSERT(td == curthread, ("inconsistent uq_thread"));
1576	uc = umtxq_getchain(&uq->uq_key);
1577	UMTXQ_LOCKED_ASSERT(uc);
1578	UMTXQ_BUSY_ASSERT(uc);
1579	umtxq_insert(uq);
1580	mtx_lock_spin(&umtx_lock);
1581	if (pi->pi_owner == NULL) {
1582		mtx_unlock_spin(&umtx_lock);
1583		/* XXX Only look up thread in current process. */
1584		td1 = tdfind(owner, curproc->p_pid);
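		/* tdfind() returns with the thread's proc locked. */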
1585		mtx_lock_spin(&umtx_lock);
1586		if (td1 != NULL) {
1587			if (pi->pi_owner == NULL)
1588				umtx_pi_setowner(pi, td1);
1589			PROC_UNLOCK(td1->td_proc);
1590		}
1591	}
1592
1593	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1594		pri = UPRI(uq1->uq_thread);
1595		if (pri > UPRI(td))
1596			break;
1597	}
1598
1599	if (uq1 != NULL)
1600		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1601	else
1602		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1603
1604	uq->uq_pi_blocked = pi;
1605	thread_lock(td);
1606	td->td_flags |= TDF_UPIBLOCKED;
1607	thread_unlock(td);
1608	umtx_propagate_priority(td);
1609	mtx_unlock_spin(&umtx_lock);
1610	umtxq_unbusy(&uq->uq_key);
1611
1612	if (uq->uq_flags & UQF_UMTXQ) {
1613		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
1614		if (error == EWOULDBLOCK)
1615			error = ETIMEDOUT;
1616		if (uq->uq_flags & UQF_UMTXQ) {
1617			umtxq_remove(uq);
1618		}
1619	}
1620	mtx_lock_spin(&umtx_lock);
1621	uq->uq_pi_blocked = NULL;
1622	thread_lock(td);
1623	td->td_flags &= ~TDF_UPIBLOCKED;
1624	thread_unlock(td);
1625	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1626	umtx_repropagate_priority(pi);
1627	mtx_unlock_spin(&umtx_lock);
1628	umtxq_unlock(&uq->uq_key);
1629
1630	return (error);
1631}
1632
1633/*
1634 * Increment the reference count of a PI mutex.
1635 */
1636static void
1637umtx_pi_ref(struct umtx_pi *pi)
1638{
1639	struct umtxq_chain *uc;
1640
1641	uc = umtxq_getchain(&pi->pi_key);
1642	UMTXQ_LOCKED_ASSERT(uc);
1643	pi->pi_refcount++;
1644}
1645
1646/*
1647 * Decrease the reference count of a PI mutex; when the count
1648 * drops to zero, its memory is freed.
1649 */
1650static void
1651umtx_pi_unref(struct umtx_pi *pi)
1652{
1653	struct umtxq_chain *uc;
1654
1655	uc = umtxq_getchain(&pi->pi_key);
1656	UMTXQ_LOCKED_ASSERT(uc);
1657	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1658	if (--pi->pi_refcount == 0) {
1659		mtx_lock_spin(&umtx_lock);
1660		if (pi->pi_owner != NULL) {
1661			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1662				pi, pi_link);
1663			pi->pi_owner = NULL;
1664		}
1665		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1666			("blocked queue not empty"));
1667		mtx_unlock_spin(&umtx_lock);
1668		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1669		umtx_pi_free(pi);
1670	}
1671}
1672
1673/*
1674 * Find a PI mutex in the hash table.
1675 */
1676static struct umtx_pi *
1677umtx_pi_lookup(struct umtx_key *key)
1678{
1679	struct umtxq_chain *uc;
1680	struct umtx_pi *pi;
1681
1682	uc = umtxq_getchain(key);
1683	UMTXQ_LOCKED_ASSERT(uc);
1684
1685	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1686		if (umtx_key_match(&pi->pi_key, key)) {
1687			return (pi);
1688		}
1689	}
1690	return (NULL);
1691}
1692
1693/*
1694 * Insert a PI mutex into the hash table.
1695 */
1696static inline void
1697umtx_pi_insert(struct umtx_pi *pi)
1698{
1699	struct umtxq_chain *uc;
1700
1701	uc = umtxq_getchain(&pi->pi_key);
1702	UMTXQ_LOCKED_ASSERT(uc);
1703	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1704}
1705
1706/*
1707 * Lock a PI mutex.
1708 */
1709static int
1710_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1711	int try)
1712{
1713	struct umtx_q *uq;
1714	struct umtx_pi *pi, *new_pi;
1715	uint32_t id, owner, old;
1716	int error;
1717
1718	id = td->td_tid;
1719	uq = td->td_umtxq;
1720
1721	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1722	    &uq->uq_key)) != 0)
1723		return (error);
1724	umtxq_lock(&uq->uq_key);
1725	pi = umtx_pi_lookup(&uq->uq_key);
1726	if (pi == NULL) {
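		/*
		 * Try a non-sleeping allocation first; if it fails, drop
		 * the chain lock, allocate with M_WAITOK, and re-check
		 * for a PI that may have been inserted while we slept.
		 */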
1727		new_pi = umtx_pi_alloc(M_NOWAIT);
1728		if (new_pi == NULL) {
1729			umtxq_unlock(&uq->uq_key);
1730			new_pi = umtx_pi_alloc(M_WAITOK);
1731			umtxq_lock(&uq->uq_key);
1732			pi = umtx_pi_lookup(&uq->uq_key);
1733			if (pi != NULL) {
1734				umtx_pi_free(new_pi);
1735				new_pi = NULL;
1736			}
1737		}
1738		if (new_pi != NULL) {
1739			new_pi->pi_key = uq->uq_key;
1740			umtx_pi_insert(new_pi);
1741			pi = new_pi;
1742		}
1743	}
1744	umtx_pi_ref(pi);
1745	umtxq_unlock(&uq->uq_key);
1746
1747	/*
1748	 * Care must be exercised when dealing with the umtx
1749	 * structure; it can fault on any access.
1750	 */
1751	for (;;) {
1752		/*
1753		 * Try the uncontested case.  This should be done in userland.
1754		 */
1755		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1756
1757		/* The acquire succeeded. */
1758		if (owner == UMUTEX_UNOWNED) {
1759			error = 0;
1760			break;
1761		}
1762
1763		/* The address was invalid. */
1764		if (owner == -1) {
1765			error = EFAULT;
1766			break;
1767		}
1768
1769		/* If no one owns it but it is contested try to acquire it. */
1770		if (owner == UMUTEX_CONTESTED) {
1771			owner = casuword32(&m->m_owner,
1772			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1773
1774			if (owner == UMUTEX_CONTESTED) {
1775				umtxq_lock(&uq->uq_key);
1776				umtxq_busy(&uq->uq_key);
1777				error = umtx_pi_claim(pi, td);
1778				umtxq_unbusy(&uq->uq_key);
1779				umtxq_unlock(&uq->uq_key);
1780				break;
1781			}
1782
1783			/* The address was invalid. */
1784			if (owner == -1) {
1785				error = EFAULT;
1786				break;
1787			}
1788
1789			/* If this failed the lock has changed, restart. */
1790			continue;
1791		}
1792
1793		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1794		    (owner & ~UMUTEX_CONTESTED) == id) {
1795			error = EDEADLK;
1796			break;
1797		}
1798
1799		if (try != 0) {
1800			error = EBUSY;
1801			break;
1802		}
1803
1804		/*
1805		 * If we caught a signal, we have already retried and
1806		 * now exit immediately.
1807		 */
1808		if (error != 0)
1809			break;
1810
1811		umtxq_lock(&uq->uq_key);
1812		umtxq_busy(&uq->uq_key);
1813		umtxq_unlock(&uq->uq_key);
1814
1815		/*
1816		 * Set the contested bit so that a release in user space
1817		 * knows to use the system call for unlock.  If this
1818		 * fails, either someone else has acquired the lock or
1819		 * it has been released.
1820		 */
1821		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1822
1823		/* The address was invalid. */
1824		if (old == -1) {
1825			umtxq_lock(&uq->uq_key);
1826			umtxq_unbusy(&uq->uq_key);
1827			umtxq_unlock(&uq->uq_key);
1828			error = EFAULT;
1829			break;
1830		}
1831
1832		umtxq_lock(&uq->uq_key);
1833		/*
1834		 * If we set the contested bit, sleep.  Otherwise the
1835		 * lock changed and we need to retry, or we lost a race
1836		 * to the thread unlocking the umtx.
1837		 */
1838		if (old == owner)
1839			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1840				 "umtxpi", timo);
1841		else {
1842			umtxq_unbusy(&uq->uq_key);
1843			umtxq_unlock(&uq->uq_key);
1844		}
1845	}
1846
1847	umtxq_lock(&uq->uq_key);
1848	umtx_pi_unref(pi);
1849	umtxq_unlock(&uq->uq_key);
1850
1851	umtx_key_release(&uq->uq_key);
1852	return (error);
1853}
1854
1855/*
1856 * Unlock a PI mutex.
1857 */
1858static int
1859do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1860{
1861	struct umtx_key key;
1862	struct umtx_q *uq_first, *uq_first2, *uq_me;
1863	struct umtx_pi *pi, *pi2;
1864	uint32_t owner, old, id;
1865	int error;
1866	int count;
1867	int pri;
1868
1869	id = td->td_tid;
1870	/*
1871	 * Make sure we own this mtx.
1872	 */
1873	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1874	if (owner == -1)
1875		return (EFAULT);
1876
1877	if ((owner & ~UMUTEX_CONTESTED) != id)
1878		return (EPERM);
1879
1880	/* This should be done in userland */
1881	if ((owner & UMUTEX_CONTESTED) == 0) {
1882		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1883		if (old == -1)
1884			return (EFAULT);
1885		if (old == owner)
1886			return (0);
1887		owner = old;
1888	}
1889
1890	/* We should only ever be in here for contested locks */
1891	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1892	    &key)) != 0)
1893		return (error);
1894
1895	umtxq_lock(&key);
1896	umtxq_busy(&key);
1897	count = umtxq_count_pi(&key, &uq_first);
1898	if (uq_first != NULL) {
1899		mtx_lock_spin(&umtx_lock);
1900		pi = uq_first->uq_pi_blocked;
1901		KASSERT(pi != NULL, ("pi == NULL?"));
1902		if (pi->pi_owner != curthread) {
1903			mtx_unlock_spin(&umtx_lock);
1904			umtxq_unbusy(&key);
1905			umtxq_unlock(&key);
1906			umtx_key_release(&key);
1907			/* userland messed up the mutex */
1908			return (EPERM);
1909		}
1910		uq_me = curthread->td_umtxq;
1911		pi->pi_owner = NULL;
1912		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1913		/* Get the highest-priority thread that is still sleeping. */
1914		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1915		while (uq_first != NULL &&
1916		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
1917			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
1918		}
1919		pri = PRI_MAX;
1920		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1921			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1922			if (uq_first2 != NULL) {
1923				if (pri > UPRI(uq_first2->uq_thread))
1924					pri = UPRI(uq_first2->uq_thread);
1925			}
1926		}
1927		thread_lock(curthread);
1928		sched_lend_user_prio(curthread, pri);
1929		thread_unlock(curthread);
1930		mtx_unlock_spin(&umtx_lock);
1931		if (uq_first)
1932			umtxq_signal_thread(uq_first);
1933	}
1934	umtxq_unlock(&key);
1935
1936	/*
1937	 * When unlocking the umtx, it must be marked as unowned if
1938	 * only zero or one thread is waiting for it.
1939	 * Otherwise, it must be marked as contested.
1940	 */
1941	old = casuword32(&m->m_owner, owner,
1942		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1943
1944	umtxq_lock(&key);
1945	umtxq_unbusy(&key);
1946	umtxq_unlock(&key);
1947	umtx_key_release(&key);
1948	if (old == -1)
1949		return (EFAULT);
1950	if (old != owner)
1951		return (EINVAL);
1952	return (0);
1953}
1954
1955/*
1956 * Lock a PP mutex.
1957 */
1958static int
1959_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1960	int try)
1961{
1962	struct umtx_q *uq, *uq2;
1963	struct umtx_pi *pi;
1964	uint32_t ceiling;
1965	uint32_t owner, id;
1966	int error, pri, old_inherited_pri, su;
1967
1968	id = td->td_tid;
1969	uq = td->td_umtxq;
1970	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1971	    &uq->uq_key)) != 0)
1972		return (error);
1973	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1974	for (;;) {
1975		old_inherited_pri = uq->uq_inherited_pri;
1976		umtxq_lock(&uq->uq_key);
1977		umtxq_busy(&uq->uq_key);
1978		umtxq_unlock(&uq->uq_key);
1979
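		/*
		 * Invert the userland ceiling onto the kernel priority
		 * scale; an out-of-range ceiling, including the -1 that
		 * fuword32() returns on a fault, wraps above RTP_PRIO_MAX
		 * and is rejected just below.
		 */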
1980		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1981		if (ceiling > RTP_PRIO_MAX) {
1982			error = EINVAL;
1983			goto out;
1984		}
1985
1986		mtx_lock_spin(&umtx_lock);
1987		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1988			mtx_unlock_spin(&umtx_lock);
1989			error = EINVAL;
1990			goto out;
1991		}
1992		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1993			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1994			thread_lock(td);
1995			if (uq->uq_inherited_pri < UPRI(td))
1996				sched_lend_user_prio(td, uq->uq_inherited_pri);
1997			thread_unlock(td);
1998		}
1999		mtx_unlock_spin(&umtx_lock);
2000
2001		owner = casuword32(&m->m_owner,
2002		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2003
2004		if (owner == UMUTEX_CONTESTED) {
2005			error = 0;
2006			break;
2007		}
2008
2009		/* The address was invalid. */
2010		if (owner == -1) {
2011			error = EFAULT;
2012			break;
2013		}
2014
2015		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2016		    (owner & ~UMUTEX_CONTESTED) == id) {
2017			error = EDEADLK;
2018			break;
2019		}
2020
2021		if (try != 0) {
2022			error = EBUSY;
2023			break;
2024		}
2025
2026		/*
2027		 * If we caught a signal, we have already retried and
2028		 * now exit immediately.
2029		 */
2030		if (error != 0)
2031			break;
2032
2033		umtxq_lock(&uq->uq_key);
2034		umtxq_insert(uq);
2035		umtxq_unbusy(&uq->uq_key);
2036		error = umtxq_sleep(uq, "umtxpp", timo);
2037		umtxq_remove(uq);
2038		umtxq_unlock(&uq->uq_key);
2039
2040		mtx_lock_spin(&umtx_lock);
2041		uq->uq_inherited_pri = old_inherited_pri;
2042		pri = PRI_MAX;
2043		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2044			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2045			if (uq2 != NULL) {
2046				if (pri > UPRI(uq2->uq_thread))
2047					pri = UPRI(uq2->uq_thread);
2048			}
2049		}
2050		if (pri > uq->uq_inherited_pri)
2051			pri = uq->uq_inherited_pri;
2052		thread_lock(td);
2053		sched_lend_user_prio(td, pri);
2054		thread_unlock(td);
2055		mtx_unlock_spin(&umtx_lock);
2056	}
2057
2058	if (error != 0) {
2059		mtx_lock_spin(&umtx_lock);
2060		uq->uq_inherited_pri = old_inherited_pri;
2061		pri = PRI_MAX;
2062		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2063			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2064			if (uq2 != NULL) {
2065				if (pri > UPRI(uq2->uq_thread))
2066					pri = UPRI(uq2->uq_thread);
2067			}
2068		}
2069		if (pri > uq->uq_inherited_pri)
2070			pri = uq->uq_inherited_pri;
2071		thread_lock(td);
2072		sched_lend_user_prio(td, pri);
2073		thread_unlock(td);
2074		mtx_unlock_spin(&umtx_lock);
2075	}
2076
2077out:
2078	umtxq_lock(&uq->uq_key);
2079	umtxq_unbusy(&uq->uq_key);
2080	umtxq_unlock(&uq->uq_key);
2081	umtx_key_release(&uq->uq_key);
2082	return (error);
2083}
2084
2085/*
2086 * Unlock a PP mutex.
2087 */
2088static int
2089do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2090{
2091	struct umtx_key key;
2092	struct umtx_q *uq, *uq2;
2093	struct umtx_pi *pi;
2094	uint32_t owner, id;
2095	uint32_t rceiling;
2096	int error, pri, new_inherited_pri, su;
2097
2098	id = td->td_tid;
2099	uq = td->td_umtxq;
2100	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2101
2102	/*
2103	 * Make sure we own this mtx.
2104	 */
2105	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2106	if (owner == -1)
2107		return (EFAULT);
2108
2109	if ((owner & ~UMUTEX_CONTESTED) != id)
2110		return (EPERM);
2111
2112	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2113	if (error != 0)
2114		return (error);
2115
2116	if (rceiling == -1)
2117		new_inherited_pri = PRI_MAX;
2118	else {
2119		rceiling = RTP_PRIO_MAX - rceiling;
2120		if (rceiling > RTP_PRIO_MAX)
2121			return (EINVAL);
2122		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2123	}
2124
2125	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2126	    &key)) != 0)
2127		return (error);
2128	umtxq_lock(&key);
2129	umtxq_busy(&key);
2130	umtxq_unlock(&key);
2131	/*
2132	 * For a priority-protected mutex, always set the unlocked
2133	 * state to UMUTEX_CONTESTED so that userland always enters
2134	 * the kernel to lock the mutex.  This is necessary because
2135	 * the thread priority has to be adjusted for such mutexes.
2136	 */
2137	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2138		UMUTEX_CONTESTED);
2139
2140	umtxq_lock(&key);
2141	if (error == 0)
2142		umtxq_signal(&key, 1);
2143	umtxq_unbusy(&key);
2144	umtxq_unlock(&key);
2145
2146	if (error == -1)
2147		error = EFAULT;
2148	else {
2149		mtx_lock_spin(&umtx_lock);
2150		if (su != 0)
2151			uq->uq_inherited_pri = new_inherited_pri;
2152		pri = PRI_MAX;
2153		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2154			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2155			if (uq2 != NULL) {
2156				if (pri > UPRI(uq2->uq_thread))
2157					pri = UPRI(uq2->uq_thread);
2158			}
2159		}
2160		if (pri > uq->uq_inherited_pri)
2161			pri = uq->uq_inherited_pri;
2162		thread_lock(td);
2163		sched_lend_user_prio(td, pri);
2164		thread_unlock(td);
2165		mtx_unlock_spin(&umtx_lock);
2166	}
2167	umtx_key_release(&key);
2168	return (error);
2169}
2170
2171static int
2172do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2173	uint32_t *old_ceiling)
2174{
2175	struct umtx_q *uq;
2176	uint32_t save_ceiling;
2177	uint32_t owner, id;
2178	uint32_t flags;
2179	int error;
2180
2181	flags = fuword32(&m->m_flags);
2182	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2183		return (EINVAL);
2184	if (ceiling > RTP_PRIO_MAX)
2185		return (EINVAL);
2186	id = td->td_tid;
2187	uq = td->td_umtxq;
2188	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2189	   &uq->uq_key)) != 0)
2190		return (error);
2191	for (;;) {
2192		umtxq_lock(&uq->uq_key);
2193		umtxq_busy(&uq->uq_key);
2194		umtxq_unlock(&uq->uq_key);
2195
2196		save_ceiling = fuword32(&m->m_ceilings[0]);
2197
2198		owner = casuword32(&m->m_owner,
2199		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
2200
2201		if (owner == UMUTEX_CONTESTED) {
2202			suword32(&m->m_ceilings[0], ceiling);
2203			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2204				UMUTEX_CONTESTED);
2205			error = 0;
2206			break;
2207		}
2208
2209		/* The address was invalid. */
2210		if (owner == -1) {
2211			error = EFAULT;
2212			break;
2213		}
2214
2215		if ((owner & ~UMUTEX_CONTESTED) == id) {
2216			suword32(&m->m_ceilings[0], ceiling);
2217			error = 0;
2218			break;
2219		}
2220
2221		/*
2222		 * If we caught a signal, we have retried and now
2223		 * exit immediately.
2224		 */
2225		if (error != 0)
2226			break;
2227
2228		/*
2229		 * We set the contested bit, so sleep.  Otherwise the lock
2230		 * state changed and we need to retry, or we lost a race with
2231		 * the thread unlocking the umtx.
2232		 */
2233		umtxq_lock(&uq->uq_key);
2234		umtxq_insert(uq);
2235		umtxq_unbusy(&uq->uq_key);
2236		error = umtxq_sleep(uq, "umtxpp", 0);
2237		umtxq_remove(uq);
2238		umtxq_unlock(&uq->uq_key);
2239	}
2240	umtxq_lock(&uq->uq_key);
2241	if (error == 0)
2242		umtxq_signal(&uq->uq_key, INT_MAX);
2243	umtxq_unbusy(&uq->uq_key);
2244	umtxq_unlock(&uq->uq_key);
2245	umtx_key_release(&uq->uq_key);
2246	if (error == 0 && old_ceiling != NULL)
2247		suword32(old_ceiling, save_ceiling);
2248	return (error);
2249}
2250
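/*
 * Editorial sketch, not part of the original file: how a userland
 * caller reaches do_set_ceiling() through the _umtx_op(2) syscall
 * (prototype assumed from <sys/umtx.h>; the helper name is invented).
 * 'val' carries the new ceiling and 'uaddr1' receives the old one,
 * matching __umtx_op_set_ceiling() below.  Guarded by #if 0 so the
 * file still compiles.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>

static int
set_pp_ceiling(struct umutex *m, uint32_t ceiling, uint32_t *old_ceiling)
{
	/* Returns 0, or -1 with errno set (e.g. EINVAL for a bad ceiling). */
	return (_umtx_op(m, UMTX_OP_SET_CEILING, ceiling, old_ceiling, NULL));
}
#endif
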
2251static int
2252_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2253	int mode)
2254{
2255	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2256	case 0:
2257		return (_do_lock_normal(td, m, flags, timo, mode));
2258	case UMUTEX_PRIO_INHERIT:
2259		return (_do_lock_pi(td, m, flags, timo, mode));
2260	case UMUTEX_PRIO_PROTECT:
2261		return (_do_lock_pp(td, m, flags, timo, mode));
2262	}
2263	return (EINVAL);
2264}
2265
2266/*
2267 * Lock a userland POSIX mutex.
2268 */
2269static int
2270do_lock_umutex(struct thread *td, struct umutex *m,
2271	struct _umtx_time *timeout, int mode)
2272{
2273	struct timespec cts, ets, tts;
2274	uint32_t flags;
2275	int error;
2276
2277	flags = fuword32(&m->m_flags);
2278	if (flags == -1)
2279		return (EFAULT);
2280
2281	if (timeout == NULL) {
2282		error = _do_lock_umutex(td, m, flags, 0, mode);
2283		/* Mutex locking is restarted if it is interrupted. */
2284		if (error == EINTR && mode != _UMUTEX_WAIT)
2285			error = ERESTART;
2286	} else {
2287		kern_clock_gettime(td, timeout->_clockid, &cts);
2288		if ((timeout->_flags & UMTX_ABSTIME) == 0) {
2289			ets = cts;
2290			timespecadd(&ets, &timeout->_timeout);
2291			tts = timeout->_timeout;
2292		} else {
2293			ets = timeout->_timeout;
2294			tts = timeout->_timeout;
2295			timespecsub(&tts, &cts);
2296		}
2297		for (;;) {
2298			error = _do_lock_umutex(td, m, flags, tstohz(&tts), mode);
2299			if (error != ETIMEDOUT)
2300				break;
2301			kern_clock_gettime(td, timeout->_clockid, &cts);
2302			if (timespeccmp(&cts, &ets, >=))
2303				break;
2304			tts = ets;
2305			timespecsub(&tts, &cts);
2306		}
2307		/* Timed-locking is not restarted. */
2308		if (error == ERESTART)
2309			error = EINTR;
2310	}
2311	return (error);
2312}
2313
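/*
 * The ETIMEDOUT retry loop above re-derives the remaining sleep time
 * from the absolute deadline on every wakeup.  Editorial sketch, not
 * part of the original file: a plain-C userland analogue of that
 * computation, without the kernel timespec macros.
 */
#if 0
#include <time.h>

/*
 * Recompute the time remaining until the absolute deadline 'ets', as
 * the loop above does with timespecsub().  Returns 0 once the
 * deadline has passed.
 */
static int
deadline_remaining(clockid_t clockid, const struct timespec *ets,
    struct timespec *tts)
{
	struct timespec cts;

	clock_gettime(clockid, &cts);
	if (cts.tv_sec > ets->tv_sec ||
	    (cts.tv_sec == ets->tv_sec && cts.tv_nsec >= ets->tv_nsec))
		return (0);
	tts->tv_sec = ets->tv_sec - cts.tv_sec;
	tts->tv_nsec = ets->tv_nsec - cts.tv_nsec;
	if (tts->tv_nsec < 0) {
		tts->tv_sec--;
		tts->tv_nsec += 1000000000;
	}
	return (1);
}
#endif
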
2314/*
2315 * Unlock a userland POSIX mutex.
2316 */
2317static int
2318do_unlock_umutex(struct thread *td, struct umutex *m)
2319{
2320	uint32_t flags;
2321
2322	flags = fuword32(&m->m_flags);
2323	if (flags == -1)
2324		return (EFAULT);
2325
2326	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2327	case 0:
2328		return (do_unlock_normal(td, m, flags));
2329	case UMUTEX_PRIO_INHERIT:
2330		return (do_unlock_pi(td, m, flags));
2331	case UMUTEX_PRIO_PROTECT:
2332		return (do_unlock_pp(td, m, flags));
2333	}
2334
2335	return (EINVAL);
2336}
2337
2338static int
2339do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2340	struct timespec *timeout, u_long wflags)
2341{
2342	struct umtx_q *uq;
2343	struct timespec cts, ets, tts;
2344	uint32_t flags;
2345	uint32_t clockid;
2346	int error;
2347
2348	uq = td->td_umtxq;
2349	flags = fuword32(&cv->c_flags);
2350	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2351	if (error != 0)
2352		return (error);
2353
2354	if ((wflags & CVWAIT_CLOCKID) != 0) {
2355		clockid = fuword32(&cv->c_clockid);
2356		if (clockid < CLOCK_REALTIME ||
2357		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2358			/* Only predefined, fixed clock ids are allowed. */
			umtx_key_release(&uq->uq_key);
2359			return (EINVAL);
2360		}
2361	} else {
2362		clockid = CLOCK_REALTIME;
2363	}
2364
2365	umtxq_lock(&uq->uq_key);
2366	umtxq_busy(&uq->uq_key);
2367	umtxq_insert(uq);
2368	umtxq_unlock(&uq->uq_key);
2369
2370	/*
2371	 * Set c_has_waiters to 1 before releasing the user mutex, but
2372	 * avoid dirtying the cache line when it is unnecessary.
2373	 */
2374	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
2375		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2376
2377	umtxq_lock(&uq->uq_key);
2378	umtxq_unbusy(&uq->uq_key);
2379	umtxq_unlock(&uq->uq_key);
2380
2381	error = do_unlock_umutex(td, m);
2382
2383	umtxq_lock(&uq->uq_key);
2384	if (error == 0) {
2385		if (timeout == NULL) {
2386			error = umtxq_sleep(uq, "ucond", 0);
2387		} else {
2388			if ((wflags & CVWAIT_ABSTIME) == 0) {
2389				kern_clock_gettime(td, clockid, &ets);
2390				timespecadd(&ets, timeout);
2391				tts = *timeout;
2392			} else { /* absolute time */
2393				ets = *timeout;
2394				tts = *timeout;
2395				kern_clock_gettime(td, clockid, &cts);
2396				timespecsub(&tts, &cts);
2397			}
2398			for (;;) {
2399				error = umtxq_sleep(uq, "ucond", tstohz(&tts));
2400				if (error != ETIMEDOUT)
2401					break;
2402				kern_clock_gettime(td, clockid, &cts);
2403				if (timespeccmp(&cts, &ets, >=)) {
2404					error = ETIMEDOUT;
2405					break;
2406				}
2407				tts = ets;
2408				timespecsub(&tts, &cts);
2409			}
2410		}
2411	}
2412
2413	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2414		error = 0;
2415	else {
2416		/*
2417		 * This must be a timeout, an interruption by a signal,
2418		 * or a spurious wakeup; clear the c_has_waiters flag
2419		 * when necessary.
2420		 */
2421		umtxq_busy(&uq->uq_key);
2422		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2423			int oldlen = uq->uq_cur_queue->length;
2424			umtxq_remove(uq);
2425			if (oldlen == 1) {
2426				umtxq_unlock(&uq->uq_key);
2427				suword32(
2428				    __DEVOLATILE(uint32_t *,
2429					 &cv->c_has_waiters), 0);
2430				umtxq_lock(&uq->uq_key);
2431			}
2432		}
2433		umtxq_unbusy(&uq->uq_key);
2434		if (error == ERESTART)
2435			error = EINTR;
2436	}
2437
2438	umtxq_unlock(&uq->uq_key);
2439	umtx_key_release(&uq->uq_key);
2440	return (error);
2441}
2442
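/*
 * Editorial sketch, not part of the original file: a condvar wait
 * through UMTX_OP_CV_WAIT.  Per do_cv_wait() above, the kernel marks
 * c_has_waiters, unlocks 'm', and sleeps; the caller re-acquires 'm'
 * afterwards.  The helper name is invented and the _umtx_op(2)
 * prototype is assumed from <sys/umtx.h>.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>

static int
cv_wait_kernel(struct ucond *cv, struct umutex *m, struct timespec *abstime)
{
	int error;

	/* A NULL 'abstime' waits forever. */
	error = _umtx_op(cv, UMTX_OP_CV_WAIT,
	    abstime != NULL ? CVWAIT_ABSTIME : 0, m, abstime);
	/* The caller must re-lock 'm', e.g. via UMTX_OP_MUTEX_LOCK. */
	return (error);
}
#endif
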
2443/*
2444 * Signal a userland condition variable.
2445 */
2446static int
2447do_cv_signal(struct thread *td, struct ucond *cv)
2448{
2449	struct umtx_key key;
2450	int error, cnt, nwake;
2451	uint32_t flags;
2452
2453	flags = fuword32(&cv->c_flags);
2454	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2455		return (error);
2456	umtxq_lock(&key);
2457	umtxq_busy(&key);
2458	cnt = umtxq_count(&key);
2459	nwake = umtxq_signal(&key, 1);
2460	if (cnt <= nwake) {
2461		umtxq_unlock(&key);
2462		error = suword32(
2463		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2464		umtxq_lock(&key);
2465	}
2466	umtxq_unbusy(&key);
2467	umtxq_unlock(&key);
2468	umtx_key_release(&key);
2469	return (error);
2470}
2471
2472static int
2473do_cv_broadcast(struct thread *td, struct ucond *cv)
2474{
2475	struct umtx_key key;
2476	int error;
2477	uint32_t flags;
2478
2479	flags = fuword32(&cv->c_flags);
2480	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2481		return (error);
2482
2483	umtxq_lock(&key);
2484	umtxq_busy(&key);
2485	umtxq_signal(&key, INT_MAX);
2486	umtxq_unlock(&key);
2487
2488	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2489
2490	umtxq_lock(&key);
2491	umtxq_unbusy(&key);
2492	umtxq_unlock(&key);
2493
2494	umtx_key_release(&key);
2495	return (error);
2496}
2497
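/*
 * Editorial sketch, not part of the original file: the userland
 * wake-ups that pair with the two handlers above.  Helper names are
 * invented; the _umtx_op(2) prototype is assumed from <sys/umtx.h>.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>

static void
cv_signal_kernel(struct ucond *cv)
{
	/* Wake at most one waiter; val and both uaddrs are unused. */
	(void)_umtx_op(cv, UMTX_OP_CV_SIGNAL, 0, NULL, NULL);
}

static void
cv_broadcast_kernel(struct ucond *cv)
{
	/* Wake every waiter and clear c_has_waiters. */
	(void)_umtx_op(cv, UMTX_OP_CV_BROADCAST, 0, NULL, NULL);
}
#endif
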
2498static int
2499do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
2500{
2501	struct umtx_q *uq;
2502	uint32_t flags, wrflags;
2503	int32_t state, oldstate;
2504	int32_t blocked_readers;
2505	int error;
2506
2507	uq = td->td_umtxq;
2508	flags = fuword32(&rwlock->rw_flags);
2509	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2510	if (error != 0)
2511		return (error);
2512
2513	wrflags = URWLOCK_WRITE_OWNER;
2514	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2515		wrflags |= URWLOCK_WRITE_WAITERS;
2516
2517	for (;;) {
2518		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2519		/* try to lock it */
2520		while (!(state & wrflags)) {
2521			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2522				umtx_key_release(&uq->uq_key);
2523				return (EAGAIN);
2524			}
2525			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
2526			if (oldstate == state) {
2527				umtx_key_release(&uq->uq_key);
2528				return (0);
2529			}
2530			state = oldstate;
2531		}
2532
2533		if (error)
2534			break;
2535
2536		/* grab monitor lock */
2537		umtxq_lock(&uq->uq_key);
2538		umtxq_busy(&uq->uq_key);
2539		umtxq_unlock(&uq->uq_key);
2540
2541		/*
2542		 * re-read the state, in case it changed between the try-lock above
2543		 * and the check below
2544		 */
2545		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2546
2547		/* set read contention bit */
2548		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
2549			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
2550			if (oldstate == state)
2551				goto sleep;
2552			state = oldstate;
2553		}
2554
2555		/* The state changed while we were setting flags; restart. */
2556		if (!(state & wrflags)) {
2557			umtxq_lock(&uq->uq_key);
2558			umtxq_unbusy(&uq->uq_key);
2559			umtxq_unlock(&uq->uq_key);
2560			continue;
2561		}
2562
2563sleep:
2564		/* The contention bit is set; increase the read-waiter count before sleeping. */
2565		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2566		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2567
2568		while (state & wrflags) {
2569			umtxq_lock(&uq->uq_key);
2570			umtxq_insert(uq);
2571			umtxq_unbusy(&uq->uq_key);
2572
2573			error = umtxq_sleep(uq, "urdlck", timo);
2574
2575			umtxq_busy(&uq->uq_key);
2576			umtxq_remove(uq);
2577			umtxq_unlock(&uq->uq_key);
2578			if (error)
2579				break;
2580			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2581		}
2582
2583		/* Decrease the read-waiter count, and possibly clear the read contention bit. */
2584		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2585		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2586		if (blocked_readers == 1) {
2587			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2588			for (;;) {
2589				oldstate = casuword32(&rwlock->rw_state, state,
2590					 state & ~URWLOCK_READ_WAITERS);
2591				if (oldstate == state)
2592					break;
2593				state = oldstate;
2594			}
2595		}
2596
2597		umtxq_lock(&uq->uq_key);
2598		umtxq_unbusy(&uq->uq_key);
2599		umtxq_unlock(&uq->uq_key);
2600	}
2601	umtx_key_release(&uq->uq_key);
2602	return (error);
2603}
2604
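/*
 * Editorial sketch, not part of the original file: the userland fast
 * path that pairs with do_rw_rdlock() above.  Readers bump the count
 * packed into rw_state with a CAS while no writer bit is set, and
 * fall back to the kernel on contention.  The helper name is
 * invented, and writer preference (the extra URWLOCK_WRITE_WAITERS
 * test above) is ignored for brevity.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>
#include <errno.h>

static int
rw_tryrdlock(struct urwlock *rw)
{
	int32_t state;

	state = rw->rw_state;
	while (!(state & URWLOCK_WRITE_OWNER) &&
	    URWLOCK_READER_COUNT(state) < URWLOCK_MAX_READERS) {
		if (atomic_cmpset_acq_32(
		    __DEVOLATILE(uint32_t *, &rw->rw_state),
		    state, state + 1))
			return (0);
		state = rw->rw_state;
	}
	return (EBUSY);	/* caller would use UMTX_OP_RW_RDLOCK here */
}
#endif
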
2605static int
2606do_rw_rdlock2(struct thread *td, void *obj, long val, struct _umtx_time *timeout)
2607{
2608	struct timespec cts, ets, tts;
2609	int error;
2610
2611	kern_clock_gettime(td, timeout->_clockid, &cts);
2612	if ((timeout->_flags & UMTX_ABSTIME) == 0) {
2613		ets = cts;
2614		timespecadd(&ets, &timeout->_timeout);
2615		tts = timeout->_timeout;
2616	} else {
2617		ets = timeout->_timeout;
2618		tts = timeout->_timeout;
2619		timespecsub(&tts, &cts);
2620	}
2621	for (;;) {
2622		error = do_rw_rdlock(td, obj, val, tstohz(&tts));
2623		if (error != ETIMEDOUT)
2624			break;
2625		kern_clock_gettime(td, timeout->_clockid, &cts);
2626		if (timespeccmp(&cts, &ets, >=))
2627			break;
2628		tts = ets;
2629		timespecsub(&tts, &cts);
2630	}
2631	if (error == ERESTART)
2632		error = EINTR;
2633	return (error);
2634}
2635
2636static int
2637do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
2638{
2639	struct umtx_q *uq;
2640	uint32_t flags;
2641	int32_t state, oldstate;
2642	int32_t blocked_writers;
2643	int32_t blocked_readers;
2644	int error;
2645
2646	uq = td->td_umtxq;
2647	flags = fuword32(&rwlock->rw_flags);
2648	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2649	if (error != 0)
2650		return (error);
2651
2652	blocked_readers = 0;
2653	for (;;) {
2654		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2655		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2656			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
2657			if (oldstate == state) {
2658				umtx_key_release(&uq->uq_key);
2659				return (0);
2660			}
2661			state = oldstate;
2662		}
2663
2664		if (error) {
2665			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2666			    blocked_readers != 0) {
2667				umtxq_lock(&uq->uq_key);
2668				umtxq_busy(&uq->uq_key);
2669				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2670				umtxq_unbusy(&uq->uq_key);
2671				umtxq_unlock(&uq->uq_key);
2672			}
2673
2674			break;
2675		}
2676
2677		/* grab monitor lock */
2678		umtxq_lock(&uq->uq_key);
2679		umtxq_busy(&uq->uq_key);
2680		umtxq_unlock(&uq->uq_key);
2681
2682		/*
2683		 * re-read the state, in case it changed between the try-lock above
2684		 * and the check below
2685		 */
2686		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2687
2688		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
2689		       (state & URWLOCK_WRITE_WAITERS) == 0) {
2690			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
2691			if (oldstate == state)
2692				goto sleep;
2693			state = oldstate;
2694		}
2695
2696		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2697			umtxq_lock(&uq->uq_key);
2698			umtxq_unbusy(&uq->uq_key);
2699			umtxq_unlock(&uq->uq_key);
2700			continue;
2701		}
2702sleep:
2703		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2704		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
2705
2706		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
2707			umtxq_lock(&uq->uq_key);
2708			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2709			umtxq_unbusy(&uq->uq_key);
2710
2711			error = umtxq_sleep(uq, "uwrlck", timo);
2712
2713			umtxq_busy(&uq->uq_key);
2714			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
2715			umtxq_unlock(&uq->uq_key);
2716			if (error)
2717				break;
2718			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2719		}
2720
2721		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
2722		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
2723		if (blocked_writers == 1) {
2724			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2725			for (;;) {
2726				oldstate = casuword32(&rwlock->rw_state, state,
2727					 state & ~URWLOCK_WRITE_WAITERS);
2728				if (oldstate == state)
2729					break;
2730				state = oldstate;
2731			}
2732			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
2733		} else
2734			blocked_readers = 0;
2735
2736		umtxq_lock(&uq->uq_key);
2737		umtxq_unbusy(&uq->uq_key);
2738		umtxq_unlock(&uq->uq_key);
2739	}
2740
2741	umtx_key_release(&uq->uq_key);
2742	return (error);
2743}
2744
2745static int
2746do_rw_wrlock2(struct thread *td, void *obj, struct _umtx_time *timeout)
2747{
2748	struct timespec cts, ets, tts;
2749	int error;
2750
2751	kern_clock_gettime(td, timeout->_clockid, &cts);
2752	if ((timeout->_flags & UMTX_ABSTIME) == 0) {
2753		ets = cts;
2754		timespecadd(&ets, &timeout->_timeout);
2755		tts = timeout->_timeout;
2756	} else {
2757		ets = timeout->_timeout;
2758		tts = timeout->_timeout;
2759		timespecsub(&tts, &cts);
2760	}
2761	for (;;) {
2762		error = do_rw_wrlock(td, obj, tstohz(&tts));
2763		if (error != ETIMEDOUT)
2764			break;
2765		kern_clock_gettime(td, timeout->_clockid, &cts);
2766		if (timespeccmp(&cts, &ets, >=))
2767			break;
2768		tts = ets;
2769		timespecsub(&tts, &cts);
2770	}
2771	if (error == ERESTART)
2772		error = EINTR;
2773	return (error);
2774}
2775
2776static int
2777do_rw_unlock(struct thread *td, struct urwlock *rwlock)
2778{
2779	struct umtx_q *uq;
2780	uint32_t flags;
2781	int32_t state, oldstate;
2782	int error, q, count;
2783
2784	uq = td->td_umtxq;
2785	flags = fuword32(&rwlock->rw_flags);
2786	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2787	if (error != 0)
2788		return (error);
2789
2790	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
2791	if (state & URWLOCK_WRITE_OWNER) {
2792		for (;;) {
2793			oldstate = casuword32(&rwlock->rw_state, state,
2794				state & ~URWLOCK_WRITE_OWNER);
2795			if (oldstate != state) {
2796				state = oldstate;
2797				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
2798					error = EPERM;
2799					goto out;
2800				}
2801			} else
2802				break;
2803		}
2804	} else if (URWLOCK_READER_COUNT(state) != 0) {
2805		for (;;) {
2806			oldstate = casuword32(&rwlock->rw_state, state,
2807				state - 1);
2808			if (oldstate != state) {
2809				state = oldstate;
2810				if (URWLOCK_READER_COUNT(oldstate) == 0) {
2811					error = EPERM;
2812					goto out;
2813				}
2814			} else
2815				break;
2817		}
2818	} else {
2819		error = EPERM;
2820		goto out;
2821	}
2822
2823	count = 0;
2824
2825	if (!(flags & URWLOCK_PREFER_READER)) {
2826		if (state & URWLOCK_WRITE_WAITERS) {
2827			count = 1;
2828			q = UMTX_EXCLUSIVE_QUEUE;
2829		} else if (state & URWLOCK_READ_WAITERS) {
2830			count = INT_MAX;
2831			q = UMTX_SHARED_QUEUE;
2832		}
2833	} else {
2834		if (state & URWLOCK_READ_WAITERS) {
2835			count = INT_MAX;
2836			q = UMTX_SHARED_QUEUE;
2837		} else if (state & URWLOCK_WRITE_WAITERS) {
2838			count = 1;
2839			q = UMTX_EXCLUSIVE_QUEUE;
2840		}
2841	}
2842
2843	if (count) {
2844		umtxq_lock(&uq->uq_key);
2845		umtxq_busy(&uq->uq_key);
2846		umtxq_signal_queue(&uq->uq_key, count, q);
2847		umtxq_unbusy(&uq->uq_key);
2848		umtxq_unlock(&uq->uq_key);
2849	}
2850out:
2851	umtx_key_release(&uq->uq_key);
2852	return (error);
2853}
2854
2855static int
2856do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
2857{
2858	struct umtx_q *uq;
2859	struct timespec cts, ets, tts;
2860	uint32_t flags, count;
2861	int error;
2862
2863	uq = td->td_umtxq;
2864	flags = fuword32(&sem->_flags);
2865	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
2866	if (error != 0)
2867		return (error);
2868	umtxq_lock(&uq->uq_key);
2869	umtxq_busy(&uq->uq_key);
2870	umtxq_insert(uq);
2871	umtxq_unlock(&uq->uq_key);
2872
2873	casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
2874	rmb();
2875	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
2876	if (count != 0) {
2877		umtxq_lock(&uq->uq_key);
2878		umtxq_unbusy(&uq->uq_key);
2879		umtxq_remove(uq);
2880		umtxq_unlock(&uq->uq_key);
2881		umtx_key_release(&uq->uq_key);
2882		return (0);
2883	}
2884
2885	umtxq_lock(&uq->uq_key);
2886	umtxq_unbusy(&uq->uq_key);
2887
2888	if (timeout == NULL) {
2889		error = umtxq_sleep(uq, "usem", 0);
2890	} else {
2891		umtxq_unlock(&uq->uq_key);
2892		kern_clock_gettime(td, timeout->_clockid, &cts);
2893		if ((timeout->_flags & UMTX_ABSTIME) == 0) {
2894			ets = cts;
2895			timespecadd(&ets, &timeout->_timeout);
2896		} else {
2897			ets = timeout->_timeout;
2898		}
2899		umtxq_lock(&uq->uq_key);
2900		for (;;) {
2901			if (timespeccmp(&cts, &ets, >=)) {
2902				error = ETIMEDOUT;
2903				break;
2904			}
2905			tts = ets;
2906			timespecsub(&tts, &cts);
2907			error = umtxq_sleep(uq, "usem", tstohz(&tts));
2908			if (error != ETIMEDOUT)
2909				break;
2910			umtxq_unlock(&uq->uq_key);
2911			kern_clock_gettime(td, timeout->_clockid, &cts);
2912			umtxq_lock(&uq->uq_key);
2913		}
2914	}
2915
2916	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2917		error = 0;
2918	else {
2919		umtxq_remove(uq);
2920		if (error == ERESTART)
2921			error = EINTR;
2922	}
2923	umtxq_unlock(&uq->uq_key);
2924	umtx_key_release(&uq->uq_key);
2925	return (error);
2926}
2927
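/*
 * Editorial sketch, not part of the original file: entering the
 * handler above from userland once the fast path saw _count == 0.
 * As in __umtx_op_sem_wait() below, uaddr1 carries the size of the
 * timeout structure and uaddr2 points at it.  The helper name is
 * invented.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>

static int
sem_wait_kernel(struct _usem *sem, struct _umtx_time *timo)
{
	/* A NULL 'timo' waits forever; sizeof() is not evaluated. */
	return (_umtx_op(sem, UMTX_OP_SEM_WAIT, 0,
	    (void *)(uintptr_t)sizeof(*timo), timo));
}
#endif
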
2928/*
2929 * Wake up a waiter on a userland semaphore.
2930 */
2931static int
2932do_sem_wake(struct thread *td, struct _usem *sem)
2933{
2934	struct umtx_key key;
2935	int error, cnt, nwake;
2936	uint32_t flags;
2937
2938	flags = fuword32(&sem->_flags);
2939	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
2940		return (error);
2941	umtxq_lock(&key);
2942	umtxq_busy(&key);
2943	cnt = umtxq_count(&key);
2944	nwake = umtxq_signal(&key, 1);
2945	if (cnt <= nwake) {
2946		umtxq_unlock(&key);
2947		error = suword32(
2948		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
2949		umtxq_lock(&key);
2950	}
2951	umtxq_unbusy(&key);
2952	umtxq_unlock(&key);
2953	umtx_key_release(&key);
2954	return (error);
2955}
2956
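/*
 * Editorial sketch, not part of the original file: the posting side
 * that pairs with do_sem_wake() above.  The count is bumped first;
 * the syscall is made only when the _has_waiters hint is set.  This
 * is a simplified sketch, not the exact libc sem_post() logic.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <machine/atomic.h>

static void
sem_post_kernel(struct _usem *sem)
{
	atomic_add_rel_32(&sem->_count, 1);
	if (sem->_has_waiters)
		(void)_umtx_op(sem, UMTX_OP_SEM_WAKE, 0, NULL, NULL);
}
#endif
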
2957int
2958sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2959    /* struct umtx *umtx */
2960{
2961	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2962}
2963
2964int
2965sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2966    /* struct umtx *umtx */
2967{
2968	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2969}
2970
2971inline int
2972umtx_copyin_timeout(const void *addr, struct timespec *tsp)
2973{
2974	int error;
2975
2976	error = copyin(addr, tsp, sizeof(struct timespec));
2977	if (error == 0) {
2978		if (tsp->tv_sec < 0 ||
2979		    tsp->tv_nsec >= 1000000000 ||
2980		    tsp->tv_nsec < 0)
2981			error = EINVAL;
2982	}
2983	return (error);
2984}
2985
2986static inline int
2987umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
2988{
2989	int error;
2990
2991	if (size <= sizeof(struct timespec)) {
2992		tp->_clockid = CLOCK_REALTIME;
2993		tp->_flags = 0;
2994		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
2995	} else
2996		error = copyin(addr, tp, sizeof(struct _umtx_time));
2997	if (error != 0)
2998		return (error);
2999	if (tp->_timeout.tv_sec < 0 ||
3000	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3001		return (EINVAL);
3002	return (0);
3003}
3004
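/*
 * Editorial sketch, not part of the original file: the two layouts
 * the decoder above accepts.  Passing a bare struct timespec yields
 * a relative CLOCK_REALTIME timeout; a full struct _umtx_time
 * selects the clock and may make the deadline absolute.  The helper
 * name is invented.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>
#include <time.h>

static void
timeout_forms(u_int *obj, u_long val)
{
	struct timespec ts;
	struct _umtx_time ut;

	/* Form 1: bare timespec, relative, CLOCK_REALTIME. */
	ts.tv_sec = 1;
	ts.tv_nsec = 0;
	(void)_umtx_op(obj, UMTX_OP_WAIT_UINT, val,
	    (void *)(uintptr_t)sizeof(ts), &ts);

	/* Form 2: full _umtx_time, absolute deadline on a chosen clock. */
	clock_gettime(CLOCK_MONOTONIC, &ut._timeout);
	ut._timeout.tv_sec += 1;
	ut._flags = UMTX_ABSTIME;
	ut._clockid = CLOCK_MONOTONIC;
	(void)_umtx_op(obj, UMTX_OP_WAIT_UINT, val,
	    (void *)(uintptr_t)sizeof(ut), &ut);
}
#endif
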
3005static int
3006__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
3007{
3008	struct timespec *ts, timeout;
3009	int error;
3010
3011	/* Allow a null timespec (wait forever). */
3012	if (uap->uaddr2 == NULL)
3013		ts = NULL;
3014	else {
3015		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3016		if (error != 0)
3017			return (error);
3018		ts = &timeout;
3019	}
3020	return (do_lock_umtx(td, uap->obj, uap->val, ts));
3021}
3022
3023static int
3024__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
3025{
3026	return (do_unlock_umtx(td, uap->obj, uap->val));
3027}
3028
3029static int
3030__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3031{
3032	struct _umtx_time timeout, *tm_p;
3033	int error;
3034
3035	if (uap->uaddr2 == NULL)
3036		tm_p = NULL;
3037	else {
3038		error = umtx_copyin_umtx_time(
3039		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3040		if (error != 0)
3041			return (error);
3042		tm_p = &timeout;
3043	}
3044	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
3045}
3046
3047static int
3048__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3049{
3050	struct _umtx_time timeout, *tm_p;
3051	int error;
3052
3053	if (uap->uaddr2 == NULL)
3054		tm_p = NULL;
3055	else {
3056		error = umtx_copyin_umtx_time(
3057		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3058		if (error != 0)
3059			return (error);
3060		tm_p = &timeout;
3061	}
3062	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3063}
3064
3065static int
3066__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3067{
3068	struct _umtx_time *tm_p, timeout;
3069	int error;
3070
3071	if (uap->uaddr2 == NULL)
3072		tm_p = NULL;
3073	else {
3074		error = umtx_copyin_umtx_time(
3075		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3076		if (error != 0)
3077			return (error);
3078		tm_p = &timeout;
3079	}
3080	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3081}
3082
3083static int
3084__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3085{
3086	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3087}
3088
3089#define BATCH_SIZE	128
3090static int
3091__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3092{
3093	int count = uap->val;
3094	void *uaddrs[BATCH_SIZE];
3095	char **upp = (char **)uap->obj;
3096	int tocopy;
3097	int error = 0;
3098	int i, pos = 0;
3099
3100	while (count > 0) {
3101		tocopy = count;
3102		if (tocopy > BATCH_SIZE)
3103			tocopy = BATCH_SIZE;
3104		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3105		if (error != 0)
3106			break;
3107		for (i = 0; i < tocopy; ++i)
3108			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3109		count -= tocopy;
3110		pos += tocopy;
3111	}
3112	return (error);
3113}
3114
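/*
 * Editorial sketch, not part of the original file: waking the
 * waiters on many private words with a single syscall, as handled
 * above.  'obj' is the pointer array and 'val' its length; the
 * kernel copies the array in BATCH_SIZE chunks.  The helper name is
 * invented.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>

static int
wake_many(void **addrs, int naddrs)
{
	return (_umtx_op(addrs, UMTX_OP_NWAKE_PRIVATE, naddrs, NULL, NULL));
}
#endif
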
3115static int
3116__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3117{
3118	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3119}
3120
3121static int
3122__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3123{
3124	struct _umtx_time *tm_p, timeout;
3125	int error;
3126
3127	/* Allow a null timespec (wait forever). */
3128	if (uap->uaddr2 == NULL)
3129		tm_p = NULL;
3130	else {
3131		error = umtx_copyin_umtx_time(
3132		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3133		if (error != 0)
3134			return (error);
3135		tm_p = &timeout;
3136	}
3137	return do_lock_umutex(td, uap->obj, tm_p, 0);
3138}
3139
3140static int
3141__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3142{
3143	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3144}
3145
3146static int
3147__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3148{
3149	struct _umtx_time *tm_p, timeout;
3150	int error;
3151
3152	/* Allow a null timespec (wait forever). */
3153	if (uap->uaddr2 == NULL)
3154		tm_p = NULL;
3155	else {
3156		error = umtx_copyin_umtx_time(
3157		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3158		if (error != 0)
3159			return (error);
3160		tm_p = &timeout;
3161	}
3162	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3163}
3164
3165static int
3166__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3167{
3168	return do_wake_umutex(td, uap->obj);
3169}
3170
3171static int
3172__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3173{
3174	return do_unlock_umutex(td, uap->obj);
3175}
3176
3177static int
3178__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3179{
3180	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3181}
3182
3183static int
3184__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3185{
3186	struct timespec *ts, timeout;
3187	int error;
3188
3189	/* Allow a null timespec (wait forever). */
3190	if (uap->uaddr2 == NULL)
3191		ts = NULL;
3192	else {
3193		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3194		if (error != 0)
3195			return (error);
3196		ts = &timeout;
3197	}
3198	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3199}
3200
3201static int
3202__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3203{
3204	return do_cv_signal(td, uap->obj);
3205}
3206
3207static int
3208__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3209{
3210	return do_cv_broadcast(td, uap->obj);
3211}
3212
3213static int
3214__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3215{
3216	struct _umtx_time timeout;
3217	int error;
3218
3219	/* Allow a null timespec (wait forever). */
3220	if (uap->uaddr2 == NULL) {
3221		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3222	} else {
3223		error = umtx_copyin_umtx_time(uap->uaddr2,
3224		   (size_t)uap->uaddr1, &timeout);
3225		if (error != 0)
3226			return (error);
3227		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3228	}
3229	return (error);
3230}
3231
3232static int
3233__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3234{
3235	struct _umtx_time timeout;
3236	int error;
3237
3238	/* Allow a null timespec (wait forever). */
3239	if (uap->uaddr2 == NULL) {
3240		error = do_rw_wrlock(td, uap->obj, 0);
3241	} else {
3242		error = umtx_copyin_umtx_time(uap->uaddr2,
3243		   (size_t)uap->uaddr1, &timeout);
3244		if (error != 0)
3245			return (error);
3246
3247		error = do_rw_wrlock2(td, uap->obj, &timeout);
3248	}
3249	return (error);
3250}
3251
3252static int
3253__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3254{
3255	return do_rw_unlock(td, uap->obj);
3256}
3257
3258static int
3259__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3260{
3261	struct _umtx_time *tm_p, timeout;
3262	int error;
3263
3264	/* Allow a null timespec (wait forever). */
3265	if (uap->uaddr2 == NULL)
3266		tm_p = NULL;
3267	else {
3268		error = umtx_copyin_umtx_time(
3269		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3270		if (error != 0)
3271			return (error);
3272		tm_p = &timeout;
3273	}
3274	return (do_sem_wait(td, uap->obj, tm_p));
3275}
3276
3277static int
3278__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3279{
3280	return do_sem_wake(td, uap->obj);
3281}
3282
3283typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3284
3285static _umtx_op_func op_table[] = {
3286	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3287	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3288	__umtx_op_wait,			/* UMTX_OP_WAIT */
3289	__umtx_op_wake,			/* UMTX_OP_WAKE */
3290	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3291	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3292	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3293	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3294	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
3295	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3296	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3297	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3298	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3299	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3300	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3301	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3302	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3303	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3304	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3305	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3306	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3307	__umtx_op_nwake_private		/* UMTX_OP_NWAKE_PRIVATE */
3308};
3309
3310int
3311sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3312{
3313	if ((unsigned)uap->op < UMTX_OP_MAX)
3314		return (*op_table[uap->op])(td, uap);
3315	return (EINVAL);
3316}
3317
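/*
 * Editorial sketch, not part of the original file: the canonical
 * wait/wake pairing dispatched through op_table above, in the style
 * of a futex.  The sleep is taken only while *addr still holds
 * 'expected'; a NULL uaddr2 means no timeout.  Helper names are
 * invented; the _umtx_op(2) prototype is assumed from <sys/umtx.h>.
 */
#if 0
#include <sys/types.h>
#include <sys/umtx.h>

static int
word_wait(u_int *addr, u_int expected)
{
	return (_umtx_op(addr, UMTX_OP_WAIT_UINT_PRIVATE, expected,
	    NULL, NULL));
}

static int
word_wake(u_int *addr, int nwake)
{
	return (_umtx_op(addr, UMTX_OP_WAKE_PRIVATE, nwake, NULL, NULL));
}
#endif
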
3318#ifdef COMPAT_FREEBSD32
3319int
3320freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3321    /* struct umtx *umtx */
3322{
3323	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3324}
3325
3326int
3327freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3328    /* struct umtx *umtx */
3329{
3330	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3331}
3332
3333struct timespec32 {
3334	int32_t tv_sec;
3335	int32_t tv_nsec;
3336};
3337
3338struct umtx_time32 {
3339	struct	timespec32	timeout;
3340	uint32_t		flags;
3341	uint32_t		clockid;
3342};
3343
3344static inline int
3345umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3346{
3347	struct timespec32 ts32;
3348	int error;
3349
3350	error = copyin(addr, &ts32, sizeof(struct timespec32));
3351	if (error == 0) {
3352		if (ts32.tv_sec < 0 ||
3353		    ts32.tv_nsec >= 1000000000 ||
3354		    ts32.tv_nsec < 0)
3355			error = EINVAL;
3356		else {
3357			tsp->tv_sec = ts32.tv_sec;
3358			tsp->tv_nsec = ts32.tv_nsec;
3359		}
3360	}
3361	return (error);
3362}
3363
3364static inline int
3365umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3366{
3367	struct umtx_time32 t32;
3368	int error;
3369
3370	t32.clockid = CLOCK_REALTIME;
3371	t32.flags   = 0;
3372	if (size <= sizeof(struct timespec32))
3373		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3374	else
3375		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3376	if (error != 0)
3377		return (error);
3378	if (t32.timeout.tv_sec < 0 ||
3379	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3380		return (EINVAL);
3381	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3382	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3383	tp->_flags = t32.flags;
3384	tp->_clockid = t32.clockid;
3385	return (0);
3386}
3387
3388static int
3389__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3390{
3391	struct timespec *ts, timeout;
3392	int error;
3393
3394	/* Allow a null timespec (wait forever). */
3395	if (uap->uaddr2 == NULL)
3396		ts = NULL;
3397	else {
3398		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3399		if (error != 0)
3400			return (error);
3401		ts = &timeout;
3402	}
3403	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3404}
3405
3406static int
3407__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3408{
3409	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3410}
3411
3412static int
3413__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3414{
3415	struct _umtx_time *tm_p, timeout;
3416	int error;
3417
3418	if (uap->uaddr2 == NULL)
3419		tm_p = NULL;
3420	else {
3421		error = umtx_copyin_umtx_time32(uap->uaddr2,
3422			(size_t)uap->uaddr1, &timeout);
3423		if (error != 0)
3424			return (error);
3425		tm_p = &timeout;
3426	}
3427	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3428}
3429
3430static int
3431__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3432{
3433	struct _umtx_time *tm_p, timeout;
3434	int error;
3435
3436	/* Allow a null timespec (wait forever). */
3437	if (uap->uaddr2 == NULL)
3438		tm_p = NULL;
3439	else {
3440		error = umtx_copyin_umtx_time32(uap->uaddr2,
3441			    (size_t)uap->uaddr1, &timeout);
3442		if (error != 0)
3443			return (error);
3444		tm_p = &timeout;
3445	}
3446	return do_lock_umutex(td, uap->obj, tm_p, 0);
3447}
3448
3449static int
3450__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3451{
3452	struct _umtx_time *tm_p, timeout;
3453	int error;
3454
3455	/* Allow a null timespec (wait forever). */
3456	if (uap->uaddr2 == NULL)
3457		tm_p = NULL;
3458	else {
3459		error = umtx_copyin_umtx_time32(uap->uaddr2,
3460		    (size_t)uap->uaddr1, &timeout);
3461		if (error != 0)
3462			return (error);
3463		tm_p = &timeout;
3464	}
3465	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3466}
3467
3468static int
3469__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3470{
3471	struct timespec *ts, timeout;
3472	int error;
3473
3474	/* Allow a null timespec (wait forever). */
3475	if (uap->uaddr2 == NULL)
3476		ts = NULL;
3477	else {
3478		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3479		if (error != 0)
3480			return (error);
3481		ts = &timeout;
3482	}
3483	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3484}
3485
3486static int
3487__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3488{
3489	struct _umtx_time timeout;
3490	int error;
3491
3492	/* Allow a null timespec (wait forever). */
3493	if (uap->uaddr2 == NULL) {
3494		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3495	} else {
3496		error = umtx_copyin_umtx_time32(uap->uaddr2,
3497		    (size_t)uap->uaddr1, &timeout);
3498		if (error != 0)
3499			return (error);
3500		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
3501	}
3502	return (error);
3503}
3504
3505static int
3506__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3507{
3508	struct _umtx_time timeout;
3509	int error;
3510
3511	/* Allow a null timespec (wait forever). */
3512	if (uap->uaddr2 == NULL) {
3513		error = do_rw_wrlock(td, uap->obj, 0);
3514	} else {
3515		error = umtx_copyin_umtx_time32(uap->uaddr2,
3516		    (size_t)uap->uaddr1, &timeout);
3517		if (error != 0)
3518			return (error);
3519		error = do_rw_wrlock2(td, uap->obj, &timeout);
3520	}
3521	return (error);
3522}
3523
3524static int
3525__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3526{
3527	struct _umtx_time *tm_p, timeout;
3528	int error;
3529
3530	if (uap->uaddr2 == NULL)
3531		tm_p = NULL;
3532	else {
3533		error = umtx_copyin_umtx_time32(
3534		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3535		if (error != 0)
3536			return (error);
3537		tm_p = &timeout;
3538	}
3539	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3540}
3541
3542static int
3543__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3544{
3545	struct _umtx_time *tm_p, timeout;
3546	int error;
3547
3548	/* Allow a null timespec (wait forever). */
3549	if (uap->uaddr2 == NULL)
3550		tm_p = NULL;
3551	else {
3552		error = umtx_copyin_umtx_time32(uap->uaddr2,
3553		    (size_t)uap->uaddr1, &timeout);
3554		if (error != 0)
3555			return (error);
3556		tm_p = &timeout;
3557	}
3558	return (do_sem_wait(td, uap->obj, tm_p));
3559}
3560
3561static int
3562__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3563{
3564	int count = uap->val;
3565	uint32_t uaddrs[BATCH_SIZE];
3566	uint32_t *upp = (uint32_t *)uap->obj;
3567	int tocopy;
3568	int error = 0;
3569	int i, pos = 0;
3570
3571	while (count > 0) {
3572		tocopy = count;
3573		if (tocopy > BATCH_SIZE)
3574			tocopy = BATCH_SIZE;
3575		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3576		if (error != 0)
3577			break;
3578		for (i = 0; i < tocopy; ++i)
3579			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3580				INT_MAX, 1);
3581		count -= tocopy;
3582		pos += tocopy;
3583	}
3584	return (error);
3585}
3586
3587static _umtx_op_func op_table_compat32[] = {
3588	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3589	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3590	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3591	__umtx_op_wake,			/* UMTX_OP_WAKE */
3592	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3593	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3594	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3595	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3596	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
3597	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3598	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3599	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3600	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3601	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3602	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3603	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3604	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3605	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3606	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3607	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3608	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3609	__umtx_op_nwake_private32	/* UMTX_OP_NWAKE_PRIVATE */
3610};
3611
3612int
3613freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3614{
3615	if ((unsigned)uap->op < UMTX_OP_MAX)
3616		return (*op_table_compat32[uap->op])(td,
3617			(struct _umtx_op_args *)uap);
3618	return (EINVAL);
3619}
3620#endif
3621
3622void
3623umtx_thread_init(struct thread *td)
3624{
3625	td->td_umtxq = umtxq_alloc();
3626	td->td_umtxq->uq_thread = td;
3627}
3628
3629void
3630umtx_thread_fini(struct thread *td)
3631{
3632	umtxq_free(td->td_umtxq);
3633}
3634
3635/*
3636 * Called when a new thread is created, e.g. by fork().
3637 */
3638void
3639umtx_thread_alloc(struct thread *td)
3640{
3641	struct umtx_q *uq;
3642
3643	uq = td->td_umtxq;
3644	uq->uq_inherited_pri = PRI_MAX;
3645
3646	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3647	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3648	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3649	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3650}
3651
3652/*
3653 * exec() hook.
3654 */
3655static void
3656umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3657	struct image_params *imgp __unused)
3658{
3659	umtx_thread_cleanup(curthread);
3660}
3661
3662/*
3663 * thread_exit() hook.
3664 */
3665void
3666umtx_thread_exit(struct thread *td)
3667{
3668	umtx_thread_cleanup(td);
3669}
3670
3671/*
3672 * Clean up the thread's umtx data.
3673 */
3674static void
3675umtx_thread_cleanup(struct thread *td)
3676{
3677	struct umtx_q *uq;
3678	struct umtx_pi *pi;
3679
3680	if ((uq = td->td_umtxq) == NULL)
3681		return;
3682
3683	mtx_lock_spin(&umtx_lock);
3684	uq->uq_inherited_pri = PRI_MAX;
3685	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
3686		pi->pi_owner = NULL;
3687		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
3688	}
3689	mtx_unlock_spin(&umtx_lock);
3690	thread_lock(td);
3691	sched_lend_user_prio(td, PRI_MAX);
3692	thread_unlock(td);
3693}
3694