/* kern_umtx.c revision 163449 */
1/*-
2 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice unmodified, this list of conditions, and the following
11 *    disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 163449 2006-10-17 02:24:47Z davidxu $");
30
31#include "opt_compat.h"
32#include <sys/param.h>
33#include <sys/kernel.h>
34#include <sys/limits.h>
35#include <sys/lock.h>
36#include <sys/malloc.h>
37#include <sys/mutex.h>
38#include <sys/proc.h>
39#include <sys/sched.h>
40#include <sys/sysctl.h>
41#include <sys/sysent.h>
42#include <sys/systm.h>
43#include <sys/sysproto.h>
44#include <sys/eventhandler.h>
45#include <sys/umtx.h>
46
47#include <vm/vm.h>
48#include <vm/vm_param.h>
49#include <vm/pmap.h>
50#include <vm/vm_map.h>
51#include <vm/vm_object.h>
52
53#ifdef COMPAT_IA32
54#include <compat/freebsd32/freebsd32_proto.h>
55#endif
56
57#define TYPE_SIMPLE_LOCK	0
58#define TYPE_SIMPLE_WAIT	1
59#define TYPE_NORMAL_UMUTEX	2
60#define TYPE_PI_UMUTEX		3
61#define TYPE_PP_UMUTEX		4
62#define TYPE_CV			5
63
/* Key to represent a unique userland synchronous object */
struct umtx_key {
	int	hash;		/* Bucket index into umtxq_chains[] */
	int	type;		/* TYPE_* class of the object */
	int	shared;		/* Non-zero: process-shared (holds object ref) */
	union {
		/* Process-shared: identified by backing VM object + offset. */
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		/* Process-private: identified by vmspace + virtual address. */
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		/* Type-punned view used for hashing and comparison. */
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};
84
/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry linking PI mutexes held by the owner thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List of waiters, kept sorted by user priority */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};
105
/* A userland synchronous object user (one per thread). */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key identifying the object currently waited on. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001	/* Entry is on a chain's wait queue */

	/* The thread this queue entry belongs to. */
	struct thread		*uq_thread;

	/*
	 * PI mutex this thread is blocked on, or NULL.  Read can use
	 * chain lock or sched_lock; write must hold both the chain
	 * lock and sched_lock.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* Entry on a PI mutex's pi_blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes this thread holds that other threads contend on */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;
};
137
138TAILQ_HEAD(umtxq_head, umtx_q);
139
/* Userland lock object's wait-queue chain (one hash bucket) */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_head	uc_queue;

	/* Busy flag: set while a chain operation may block */
	char			uc_busy;

	/* Number of threads sleeping for the busy flag to clear */
	int			uc_waiters;

	/* All PI mutexes hashed to this chain */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};
157
158#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
159
160/*
161 * Don't propagate time-sharing priority, there is a security reason,
162 * a user can simply introduce PI-mutex, let thread A lock the mutex,
163 * and let another thread B block on the mutex, because B is
164 * sleeping, its priority will be boosted, this causes A's priority to
165 * be boosted via priority propagating too and will never be lowered even
166 * if it is using 100%CPU, this is unfair to other processes.
167 */
168
169#define UPRI(td)	(((td)->td_ksegrp->kg_user_pri >= PRI_MIN_TIMESHARE &&\
170			  (td)->td_ksegrp->kg_user_pri <= PRI_MAX_TIMESHARE) ?\
171			 PRI_MAX_TIMESHARE : (td)->td_ksegrp->kg_user_pri)
172
173#define	GOLDEN_RATIO_PRIME	2654404609U
174#define	UMTX_CHAINS		128
175#define	UMTX_SHIFTS		(__WORD_BIT - 7)
176
177#define THREAD_SHARE		0
178#define PROCESS_SHARE		1
179#define AUTO_SHARE		2
180
181#define	GET_SHARE(flags)	\
182    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
183
184static uma_zone_t		umtx_pi_zone;
185static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
186static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
187static int			umtx_pi_allocated;
188
189SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
190SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
191    &umtx_pi_allocated, 0, "Allocated umtx_pi");
192
193static void umtxq_sysinit(void *);
194static void umtxq_hash(struct umtx_key *key);
195static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
196static void umtxq_lock(struct umtx_key *key);
197static void umtxq_unlock(struct umtx_key *key);
198static void umtxq_busy(struct umtx_key *key);
199static void umtxq_unbusy(struct umtx_key *key);
200static void umtxq_insert(struct umtx_q *uq);
201static void umtxq_remove(struct umtx_q *uq);
202static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
203static int umtxq_count(struct umtx_key *key);
204static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
205static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
206static int umtx_key_get(void *addr, int type, int share,
207	struct umtx_key *key);
208static void umtx_key_release(struct umtx_key *key);
209static struct umtx_pi *umtx_pi_alloc(void);
210static void umtx_pi_free(struct umtx_pi *pi);
211static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
212static void umtx_thread_cleanup(struct thread *td);
213static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
214	struct image_params *imgp __unused);
215SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
216
217static void
218umtxq_sysinit(void *arg __unused)
219{
220	int i;
221
222	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
223		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
224	for (i = 0; i < UMTX_CHAINS; ++i) {
225		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
226			 MTX_DEF | MTX_DUPOK);
227		TAILQ_INIT(&umtxq_chains[i].uc_queue);
228		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
229		umtxq_chains[i].uc_busy = 0;
230		umtxq_chains[i].uc_waiters = 0;
231	}
232	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
233	    EVENTHANDLER_PRI_ANY);
234}
235
236struct umtx_q *
237umtxq_alloc(void)
238{
239	struct umtx_q *uq;
240
241	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
242	TAILQ_INIT(&uq->uq_pi_contested);
243	uq->uq_inherited_pri = PRI_MAX;
244	return (uq);
245}
246
/*
 * Release a umtx queue entry allocated by umtxq_alloc().
 */
void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}
252
253static inline void
254umtxq_hash(struct umtx_key *key)
255{
256	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
257	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
258}
259
260static inline int
261umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
262{
263	return (k1->type == k2->type &&
264		k1->info.both.a == k2->info.both.a &&
265	        k1->info.both.b == k2->info.both.b);
266}
267
/*
 * Map a hashed key to its wait-queue chain bucket.
 */
static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	return (&umtxq_chains[key->hash]);
}
273
274/*
275 * Set chain to busy state when following operation
276 * may be blocked (kernel mutex can not be used).
277 */
278static inline void
279umtxq_busy(struct umtx_key *key)
280{
281	struct umtxq_chain *uc;
282
283	uc = umtxq_getchain(key);
284	mtx_assert(&uc->uc_lock, MA_OWNED);
285	while (uc->uc_busy != 0) {
286		uc->uc_waiters++;
287		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
288		uc->uc_waiters--;
289	}
290	uc->uc_busy = 1;
291}
292
293/*
294 * Unbusy a chain.
295 */
296static inline void
297umtxq_unbusy(struct umtx_key *key)
298{
299	struct umtxq_chain *uc;
300
301	uc = umtxq_getchain(key);
302	mtx_assert(&uc->uc_lock, MA_OWNED);
303	KASSERT(uc->uc_busy != 0, ("not busy"));
304	uc->uc_busy = 0;
305	if (uc->uc_waiters)
306		wakeup_one(uc);
307}
308
309/*
310 * Lock a chain.
311 */
312static inline void
313umtxq_lock(struct umtx_key *key)
314{
315	struct umtxq_chain *uc;
316
317	uc = umtxq_getchain(key);
318	mtx_lock(&uc->uc_lock);
319}
320
321/*
322 * Unlock a chain.
323 */
324static inline void
325umtxq_unlock(struct umtx_key *key)
326{
327	struct umtxq_chain *uc;
328
329	uc = umtxq_getchain(key);
330	mtx_unlock(&uc->uc_lock);
331}
332
333/*
334 * Insert a thread onto the umtx queue.
335 */
336static inline void
337umtxq_insert(struct umtx_q *uq)
338{
339	struct umtxq_chain *uc;
340
341	uc = umtxq_getchain(&uq->uq_key);
342	UMTXQ_LOCKED_ASSERT(uc);
343	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
344	uq->uq_flags |= UQF_UMTXQ;
345}
346
347/*
348 * Remove thread from the umtx queue.
349 */
350static inline void
351umtxq_remove(struct umtx_q *uq)
352{
353	struct umtxq_chain *uc;
354
355	uc = umtxq_getchain(&uq->uq_key);
356	UMTXQ_LOCKED_ASSERT(uc);
357	if (uq->uq_flags & UQF_UMTXQ) {
358		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
359		uq->uq_flags &= ~UQF_UMTXQ;
360	}
361}
362
363/*
364 * Check if there are multiple waiters
365 */
366static int
367umtxq_count(struct umtx_key *key)
368{
369	struct umtxq_chain *uc;
370	struct umtx_q *uq;
371	int count = 0;
372
373	uc = umtxq_getchain(key);
374	UMTXQ_LOCKED_ASSERT(uc);
375	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
376		if (umtx_key_match(&uq->uq_key, key)) {
377			if (++count > 1)
378				break;
379		}
380	}
381	return (count);
382}
383
384/*
385 * Check if there are multiple PI waiters and returns first
386 * waiter.
387 */
388static int
389umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
390{
391	struct umtxq_chain *uc;
392	struct umtx_q *uq;
393	int count = 0;
394
395	*first = NULL;
396	uc = umtxq_getchain(key);
397	UMTXQ_LOCKED_ASSERT(uc);
398	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
399		if (umtx_key_match(&uq->uq_key, key)) {
400			if (++count > 1)
401				break;
402			*first = uq;
403		}
404	}
405	return (count);
406}
407
408/*
409 * Wake up threads waiting on an userland object.
410 */
411static int
412umtxq_signal(struct umtx_key *key, int n_wake)
413{
414	struct umtxq_chain *uc;
415	struct umtx_q *uq, *next;
416	int ret;
417
418	ret = 0;
419	uc = umtxq_getchain(key);
420	UMTXQ_LOCKED_ASSERT(uc);
421	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
422		if (umtx_key_match(&uq->uq_key, key)) {
423			umtxq_remove(uq);
424			wakeup(uq);
425			if (++ret >= n_wake)
426				break;
427		}
428	}
429	return (ret);
430}
431
432/*
433 * Wake up specified thread.
434 */
435static inline void
436umtxq_signal_thread(struct umtx_q *uq)
437{
438	struct umtxq_chain *uc;
439
440	uc = umtxq_getchain(&uq->uq_key);
441	UMTXQ_LOCKED_ASSERT(uc);
442	umtxq_remove(uq);
443	wakeup(uq);
444}
445
446/*
447 * Put thread into sleep state, before sleeping, check if
448 * thread was removed from umtx queue.
449 */
450static inline int
451umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
452{
453	struct umtxq_chain *uc;
454	int error;
455
456	uc = umtxq_getchain(&uq->uq_key);
457	UMTXQ_LOCKED_ASSERT(uc);
458	if (!(uq->uq_flags & UQF_UMTXQ))
459		return (0);
460	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
461	if (error == EWOULDBLOCK)
462		error = ETIMEDOUT;
463	return (error);
464}
465
/*
 * Convert a userspace address into a unique logical key.
 *
 * THREAD_SHARE keys are (vmspace, address) pairs; PROCESS_SHARE
 * keys are (vm_object, offset) pairs and hold a reference on the
 * object (dropped by umtx_key_release()).  AUTO_SHARE chooses
 * between the two based on the map entry's inheritance attribute.
 * Returns 0 or EFAULT.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		/* Private: the (vmspace, virtual address) pair is unique. */
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else if (share == PROCESS_SHARE || share == AUTO_SHARE) {
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return EFAULT;
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			/*
			 * Shared: identify by backing object + offset so
			 * every process mapping the page computes the same
			 * key.  NOTE(review): offset is computed as
			 * entry->offset + entry->start - addr; as long as
			 * it is computed the same way everywhere the key is
			 * unique — confirm the sign convention is intended.
			 */
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
510
/*
 * Release a key: drop the vm_object reference taken by
 * umtx_key_get() for process-shared keys; private keys hold no
 * resources.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}
520
/*
 * Lock a umtx object, sleeping up to "timo" ticks when contested.
 * Returns 0 on success, EFAULT on a bad address, or the sleep
 * error (ETIMEDOUT / EINTR / ERESTART) after one retry.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		/*
		 * Queue ourselves BEFORE publishing the contested bit so
		 * a concurrent unlocker cannot miss us.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
617
/*
 * Lock a umtx object, with an optional relative timeout.
 *
 * Without a timeout the syscall is restartable (EINTR is mapped to
 * ERESTART); a timed lock instead reports EINTR so the caller
 * observes the interruption.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		/* Compute the absolute deadline on the uptime clock. */
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			/* Timed out; retry with the remaining time, if any. */
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
657
/*
 * Unlock a umtx object held by the calling thread.
 *
 * Returns EFAULT on a bad address, EPERM if "id" is not the owner,
 * EINVAL if the lock word changed under us.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	/*
	 * Busy the chain while counting waiters so the count stays
	 * meaningful across the userland CAS below.
	 */
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key,1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
718
719#ifdef COMPAT_IA32
720
/*
 * Lock a 32-bit umtx object on behalf of a COMPAT_IA32 process.
 * Mirrors _do_lock_umtx() but operates on a 32-bit lock word.
 *
 * NOTE(review): this path uses the UMUTEX_* constants on the plain
 * 32-bit word; confirm they match the 32-bit umtx ABI (contested
 * bit 0x80000000).
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		/*
		 * Queue ourselves BEFORE publishing the contested bit so
		 * a concurrent unlocker cannot miss us.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
816
/*
 * Lock a 32-bit umtx object with an optional relative timeout;
 * 32-bit counterpart of do_lock_umtx() with identical restart and
 * timeout semantics.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		/* Compute the absolute deadline on the uptime clock. */
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			/* Timed out; retry with the remaining time, if any. */
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
856
/*
 * Unlock a 32-bit umtx object held by the calling thread; 32-bit
 * counterpart of do_unlock_umtx() with identical error semantics.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	/*
	 * Busy the chain while counting waiters so the count stays
	 * meaningful across the userland CAS below.
	 */
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key,1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
917#endif
918
/*
 * Fetch the word at "addr" and, if it still equals "id", sleep on
 * the address, optionally with a relative timeout.  compat32
 * selects a 32-bit fetch for 32-bit processes (the caller is
 * assumed to pass a zero-extended id in that case — TODO confirm).
 *
 * Returns 0 when woken or when the value already changed,
 * ETIMEDOUT, or EINTR.  ERESTART is mapped to EINTR so a wakeup is
 * never silently consumed by a syscall restart.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &uq->uq_key)) != 0)
		return (error);

	/*
	 * Queue first, then re-read the word: a waker that changes
	 * the value after this point is guaranteed to see us queued.
	 */
	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = fuword32(addr);
	if (tmp != id) {
		/* Value already changed; do not sleep. */
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "ucond", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		/* Compute the absolute deadline on the uptime clock. */
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
			/* Already woken (dequeued) — success. */
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			/* Timed out; retry with the remaining time, if any. */
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
984
985/*
986 * Wake up threads sleeping on the specified address.
987 */
988int
989kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
990{
991	struct umtx_key key;
992	int ret;
993
994	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
995	   &key)) != 0)
996		return (ret);
997	umtxq_lock(&key);
998	ret = umtxq_signal(&key, n_wake);
999	umtxq_unlock(&key);
1000	umtx_key_release(&key);
1001	return (0);
1002}
1003
/*
 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
 *
 * A non-zero "try" makes contention fail immediately with EBUSY
 * instead of sleeping; with UMUTEX_ERROR_CHECK, relocking by the
 * owner returns EDEADLK.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/* Error-checking mutexes detect relock by the owner. */
		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (try != 0)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		/*
		 * Queue ourselves BEFORE publishing the contested bit so
		 * a concurrent unlocker cannot miss us.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
1108
/*
 * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	/*
	 * Unlock a PTHREAD_PRIO_NONE mutex held by the calling thread.
	 * Returns EFAULT on a bad address, EPERM if not the owner,
	 * EINVAL if the lock word changed under us.
	 */
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	/*
	 * Busy the chain while counting waiters so the count stays
	 * meaningful across the userland CAS below.
	 */
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key,1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
1172
1173static inline struct umtx_pi *
1174umtx_pi_alloc(void)
1175{
1176	struct umtx_pi *pi;
1177
1178	pi = uma_zalloc(umtx_pi_zone, M_ZERO | M_WAITOK);
1179	TAILQ_INIT(&pi->pi_blocked);
1180	atomic_add_int(&umtx_pi_allocated, 1);
1181	return (pi);
1182}
1183
/*
 * Free a PI mutex record and drop the allocation counter.
 */
static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}
1190
/*
 * Adjust the thread's position on a pi_state after its priority has been
 * changed, keeping the PI mutex's blocked list sorted by user priority.
 *
 * Caller holds sched_lock.  Returns 0 when pi is NULL, non-zero
 * otherwise (whether or not the thread was actually moved).
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&sched_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			/* Insert before the first lower-priority waiter. */
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}
1235
1236/*
1237 * Propagate priority when a thread is blocked on POSIX
1238 * PI mutex.
1239 */
/*
 * Propagate priority when a thread is blocked on POSIX
 * PI mutex.  Walks the chain of mutex owners, lending td's priority
 * to each owner whose priority is worse, until the chain ends or an
 * owner is not itself blocked on a PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	/* Nothing to do if td is not blocked on a PI mutex. */
	if (pi == NULL)
		return;

	for (;;) {
		/* Owner may be unknown if userland locked without kernel. */
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		/* Owner already runs at an equal or better priority. */
		if (UPRI(td) <= pri)
			return;

		sched_lend_user_prio(td, pri);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}
1277
1278/*
1279 * Unpropagate priority for a PI mutex when a thread blocked on
1280 * it is interrupted by signal or resumed by others.
1281 */
/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by signal or resumed by others.  For each owner
 * along the chain, recompute the best priority among all waiters of
 * all PI mutexes the owner holds, and unlend down to that.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		/*
		 * Scan every contested PI mutex the owner holds; the head
		 * of each blocked queue is its highest-priority waiter.
		 */
		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		/* Never drop below the owner's PP-inherited ceiling. */
		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		/* Continue up the chain if the owner is itself blocked. */
		pi = uq_owner->uq_pi_blocked;
	}
}
1309
1310/*
1311 * Insert a PI mutex into owned list.
1312 */
1313static void
1314umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1315{
1316	struct umtx_q *uq_owner;
1317
1318	uq_owner = owner->td_umtxq;
1319	mtx_assert(&sched_lock, MA_OWNED);
1320	if (pi->pi_owner != NULL)
1321		panic("pi_ower != NULL");
1322	pi->pi_owner = owner;
1323	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1324}
1325
1326/*
1327 * Claim ownership of a PI mutex.
1328 */
/*
 * Claim ownership of a PI mutex.  Returns 0 on success (including the
 * case where the caller already owns it), or EPERM if a different
 * thread already owns it (userland inconsistency).
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&sched_lock);
	/* Already ours: nothing to do. */
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&sched_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed the mutex, sigh.
		 */
		mtx_unlock_spin(&sched_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	/*
	 * If waiters exist, immediately lend the best waiter's priority
	 * to the new owner so it cannot be preempted by them.
	 */
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
	}
	mtx_unlock_spin(&sched_lock);
	return (0);
}
1360
1361/*
1362 * Adjust a thread's order position in its blocked PI mutex,
1363 * this may result new priority propagating process.
1364 */
1365void
1366umtx_pi_adjust(struct thread *td, u_char oldpri)
1367{
1368	struct umtx_q *uq;
1369	struct umtx_pi *pi;
1370
1371	uq = td->td_umtxq;
1372
1373	mtx_assert(&sched_lock, MA_OWNED);
1374	MPASS(TD_ON_UPILOCK(td));
1375
1376	/*
1377	 * Pick up the lock that td is blocked on.
1378	 */
1379	pi = uq->uq_pi_blocked;
1380	MPASS(pi != NULL);
1381
1382	/* Resort the turnstile on the list. */
1383	if (!umtx_pi_adjust_thread(pi, td))
1384		return;
1385
1386	/*
1387	 * If our priority was lowered and we are at the head of the
1388	 * turnstile, then propagate our new priority up the chain.
1389	 */
1390	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
1391		umtx_propagate_priority(td);
1392}
1393
1394/*
1395 * Sleep on a PI mutex.
1396 */
/*
 * Sleep on a PI mutex.  The caller holds the umtxq chain lock; uq is
 * inserted on the sleep queue, the sleeping thread is queued on the
 * mutex's priority-sorted blocked list, priority is propagated to the
 * owner, and the thread sleeps until woken, signalled, or timed out.
 * Returns 0 or an errno (ETIMEDOUT, EINTR, ...).
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_insert(uq);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Current, We only support process private PI-mutex,
		 * non-contended PI-mutexes are locked in userland.
		 * Process shared PI-mutex should always be initialized
		 * by kernel and be registered in kernel, locking should
		 * always be done by kernel to avoid security problems.
		 * For process private PI-mutex, we can find owner
		 * thread and boost its priority safely.
		 */
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&sched_lock);
		/* Re-check: another thread may have set the owner. */
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	} else {
		mtx_lock_spin(&sched_lock);
	}

	/* Insert uq in priority order on the mutex's blocked queue. */
	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	td->td_flags |= TDF_UPIBLOCKED;
	mtx_unlock_spin(&sched_lock);
	umtxq_unlock(&uq->uq_key);

	/* Lend our priority along the owner chain before sleeping. */
	mtx_lock_spin(&sched_lock);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);
	/* Only sleep if we were not already woken while unlocked. */
	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		/* Still queued: interrupted/timed out; dequeue ourselves. */
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
		}
	}
	umtxq_unlock(&uq->uq_key);

	/* No longer blocked on the PI mutex; undo priority lending. */
	mtx_lock_spin(&sched_lock);
	uq->uq_pi_blocked = NULL;
	td->td_flags &= ~TDF_UPIBLOCKED;
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&sched_lock);

	/* Return with the chain lock held, as the caller expects. */
	umtxq_lock(&uq->uq_key);

	return (error);
}
1478
1479/*
1480 * Add reference count for a PI mutex.
1481 */
1482static void
1483umtx_pi_ref(struct umtx_pi *pi)
1484{
1485	struct umtxq_chain *uc;
1486
1487	uc = umtxq_getchain(&pi->pi_key);
1488	UMTXQ_LOCKED_ASSERT(uc);
1489	pi->pi_refcount++;
1490}
1491
1492/*
1493 * Decrease reference count for a PI mutex, if the counter
1494 * is decreased to zero, its memory space is freed.
1495 */
1496static void
1497umtx_pi_unref(struct umtx_pi *pi)
1498{
1499	struct umtxq_chain *uc;
1500	int free = 0;
1501
1502	uc = umtxq_getchain(&pi->pi_key);
1503	UMTXQ_LOCKED_ASSERT(uc);
1504	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1505	if (--pi->pi_refcount == 0) {
1506		mtx_lock_spin(&sched_lock);
1507		if (pi->pi_owner != NULL) {
1508			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1509				pi, pi_link);
1510			pi->pi_owner = NULL;
1511		}
1512		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1513			("blocked queue not empty"));
1514		mtx_unlock_spin(&sched_lock);
1515		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1516		free = 1;
1517	}
1518	if (free)
1519		umtx_pi_free(pi);
1520}
1521
1522/*
1523 * Find a PI mutex in hash table.
1524 */
1525static struct umtx_pi *
1526umtx_pi_lookup(struct umtx_key *key)
1527{
1528	struct umtxq_chain *uc;
1529	struct umtx_pi *pi;
1530
1531	uc = umtxq_getchain(key);
1532	UMTXQ_LOCKED_ASSERT(uc);
1533
1534	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1535		if (umtx_key_match(&pi->pi_key, key)) {
1536			return (pi);
1537		}
1538	}
1539	return (NULL);
1540}
1541
1542/*
1543 * Insert a PI mutex into hash table.
1544 */
1545static inline void
1546umtx_pi_insert(struct umtx_pi *pi)
1547{
1548	struct umtxq_chain *uc;
1549
1550	uc = umtxq_getchain(&pi->pi_key);
1551	UMTXQ_LOCKED_ASSERT(uc);
1552	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1553}
1554
1555/*
1556 * Lock a PI mutex.
1557 */
/*
 * Lock a PI mutex.  Retries the userland CAS protocol until the lock is
 * acquired, an error occurs, or (for try-locks) the mutex is busy.
 * Returns 0, EFAULT, EDEADLK, EBUSY, ETIMEDOUT, or a signal error.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		pi = NULL;
		umtxq_lock(&uq->uq_key);
		pi = umtx_pi_lookup(&uq->uq_key);
		if (pi == NULL) {
			/*
			 * No record yet; allocate one unlocked (it may
			 * sleep), then re-lookup in case we raced.
			 */
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc();
			new_pi->pi_key = uq->uq_key;
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL)
				umtx_pi_free(new_pi);
			else {
				umtx_pi_insert(new_pi);
				pi = new_pi;
			}
		}

		umtx_pi_ref(pi);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Care must be exercised when dealing with umtx structure.  It
		 * can fault on any access.
		 */

		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				/* We got it; record kernel-side ownership. */
				umtxq_lock(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			umtxq_lock(&uq->uq_key);
			umtx_pi_unref(pi);
			umtxq_unlock(&uq->uq_key);
			pi = NULL;
			continue;
		}

		/* Error-checking mutexes detect self-deadlock. */
		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		umtx_pi_unref(pi);
		umtxq_unlock(&uq->uq_key);
		pi = NULL;
	}

	/* Drop the reference taken at the top of the last iteration. */
	if (pi != NULL) {
		umtxq_lock(&uq->uq_key);
		umtx_pi_unref(pi);
		umtxq_unlock(&uq->uq_key);
	}

	umtx_key_release(&uq->uq_key);
	return (error);
}
1705
1706/*
1707 * Unlock a PI mutex.
1708 */
/*
 * Unlock a PI mutex.  Verifies ownership, hands the mutex to the
 * highest-priority waiter (if any), restores the caller's priority
 * from remaining contested mutexes, and updates the userland word.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		/* Lock became contested between fuword32 and the CAS. */
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		pi = uq_first->uq_pi_blocked;
		if (pi->pi_owner != curthread) {
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			/* userland messed the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		/* Disown the mutex and recompute our user priority. */
		mtx_lock_spin(&sched_lock);
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		pri = PRI_MAX;
		/* Best waiter priority across remaining held PI mutexes. */
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		sched_unlend_user_prio(curthread, pri);
		mtx_unlock_spin(&sched_lock);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	/* Wake the highest-priority waiter picked above. */
	if (uq_first != NULL)
		umtxq_signal_thread(uq_first);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
1795
1796/*
1797 * Lock a PP mutex.
1798 */
/*
 * Lock a PP (priority-protect) mutex.  The thread's priority is raised
 * to the mutex's ceiling before the lock attempt, and restored from
 * its other held mutexes on failure or retry.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	/* Only privileged threads may actually boost their priority. */
	su = (suser(td) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Map the userland ceiling onto the kernel priority range.
		 * NOTE(review): a fuword32 fault (-1) also lands in the
		 * EINVAL branch below rather than EFAULT — confirm intended.
		 */
		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&sched_lock);
		/* Caller's priority must not exceed the ceiling. */
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&sched_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
		}
		mtx_unlock_spin(&sched_lock);

		/* PP mutexes are always locked via the kernel CAS. */
		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		/* Restore priority before retrying the loop. */
		mtx_lock_spin(&sched_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}

	/* On failure, undo the ceiling boost applied this attempt. */
	if (error != 0) {
		mtx_lock_spin(&sched_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
1919
1920/*
1921 * Unlock a PP mutex.
1922 */
/*
 * Unlock a PP mutex.  Verifies ownership, computes the priority the
 * caller should fall back to from the mutex's stored ceiling, writes
 * the unlocked (always CONTESTED) state back, and wakes one waiter.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (suser(td) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* m_ceilings[1] is the priority to restore after unlock. */
	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For priority protected mutex, always set unlocked state
	 * to UMUTEX_CONTESTED, so that userland always enters kernel
	 * to lock the mutex, it is necessary because thread priority
	 * has to be adjusted for such mutex.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		/* Recompute our priority from remaining held PI mutexes. */
		mtx_lock_spin(&sched_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}
	umtx_key_release(&key);
	return (error);
}
2003
/*
 * Set the priority ceiling of a PP mutex, storing the previous ceiling
 * through old_ceiling if requested.  The caller must either acquire the
 * (unowned) mutex or already own it before the ceiling may be changed;
 * otherwise it sleeps until the mutex is released.
 */
static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	/* Ceilings only make sense on priority-protect mutexes. */
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	   &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		save_ceiling = fuword32(&m->m_ceilings[0]);

		/* Try to take the unowned mutex so the update is atomic. */
		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			/* Got it: update the ceiling, then release. */
			suword32(&m->m_ceilings[0], ceiling);
			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
				UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* We already own the mutex; update in place. */
		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtxq_lock(&uq->uq_key);
	/* Wake everyone so they observe the new ceiling. */
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}
2083
2084static int
2085_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2086	int try)
2087{
2088	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2089	case 0:
2090		return (_do_lock_normal(td, m, flags, timo, try));
2091	case UMUTEX_PRIO_INHERIT:
2092		return (_do_lock_pi(td, m, flags, timo, try));
2093	case UMUTEX_PRIO_PROTECT:
2094		return (_do_lock_pp(td, m, flags, timo, try));
2095	}
2096	return (EINVAL);
2097}
2098
2099/*
2100 * Lock a userland POSIX mutex.
2101 */
/*
 * Lock a userland POSIX mutex.  With a timeout the lock attempt is
 * retried with a shrinking timeout until the absolute deadline passes;
 * without one it waits indefinitely and restarts on interruption.
 */
static int
do_lock_umutex(struct thread *td, struct umutex *m,
	struct timespec *timeout, int try)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if (flags == -1)
		return (EFAULT);

	if (timeout == NULL) {
		error = _do_lock_umutex(td, m, flags, 0, try);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		/* ts holds the absolute deadline on the uptime clock. */
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			/* Recompute the remaining time and retry. */
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
2143
2144/*
2145 * Unlock a userland POSIX mutex.
2146 */
2147static int
2148do_unlock_umutex(struct thread *td, struct umutex *m)
2149{
2150	uint32_t flags;
2151
2152	flags = fuword32(&m->m_flags);
2153	if (flags == -1)
2154		return (EFAULT);
2155
2156	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2157	case 0:
2158		return (do_unlock_normal(td, m, flags));
2159	case UMUTEX_PRIO_INHERIT:
2160		return (do_unlock_pi(td, m, flags));
2161	case UMUTEX_PRIO_PROTECT:
2162		return (do_unlock_pp(td, m, flags));
2163	}
2164
2165	return (EINVAL);
2166}
2167
2168int
2169_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2170    /* struct umtx *umtx */
2171{
2172	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2173}
2174
2175int
2176_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2177    /* struct umtx *umtx */
2178{
2179	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2180}
2181
2182static int
2183__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2184{
2185	struct timespec *ts, timeout;
2186	int error;
2187
2188	/* Allow a null timespec (wait forever). */
2189	if (uap->uaddr2 == NULL)
2190		ts = NULL;
2191	else {
2192		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2193		if (error != 0)
2194			return (error);
2195		if (timeout.tv_nsec >= 1000000000 ||
2196		    timeout.tv_nsec < 0) {
2197			return (EINVAL);
2198		}
2199		ts = &timeout;
2200	}
2201	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2202}
2203
2204static int
2205__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2206{
2207	return (do_unlock_umtx(td, uap->obj, uap->val));
2208}
2209
2210static int
2211__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2212{
2213	struct timespec *ts, timeout;
2214	int error;
2215
2216	if (uap->uaddr2 == NULL)
2217		ts = NULL;
2218	else {
2219		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2220		if (error != 0)
2221			return (error);
2222		if (timeout.tv_nsec >= 1000000000 ||
2223		    timeout.tv_nsec < 0)
2224			return (EINVAL);
2225		ts = &timeout;
2226	}
2227	return do_wait(td, uap->obj, uap->val, ts, 0);
2228}
2229
2230static int
2231__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2232{
2233	return (kern_umtx_wake(td, uap->obj, uap->val));
2234}
2235
2236static int
2237__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2238{
2239	struct timespec *ts, timeout;
2240	int error;
2241
2242	/* Allow a null timespec (wait forever). */
2243	if (uap->uaddr2 == NULL)
2244		ts = NULL;
2245	else {
2246		error = copyin(uap->uaddr2, &timeout,
2247		    sizeof(timeout));
2248		if (error != 0)
2249			return (error);
2250		if (timeout.tv_nsec >= 1000000000 ||
2251		    timeout.tv_nsec < 0) {
2252			return (EINVAL);
2253		}
2254		ts = &timeout;
2255	}
2256	return do_lock_umutex(td, uap->obj, ts, 0);
2257}
2258
2259static int
2260__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2261{
2262	return do_lock_umutex(td, uap->obj, NULL, 1);
2263}
2264
2265static int
2266__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2267{
2268	return do_unlock_umutex(td, uap->obj);
2269}
2270
2271static int
2272__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2273{
2274	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
2275}
2276
/* Handler signature shared by all _umtx_op operations. */
typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);

/* Dispatch table for _umtx_op(); indexed by the UMTX_OP_* code. */
static _umtx_op_func op_table[] = {
	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
	__umtx_op_wait,			/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
	__umtx_op_set_ceiling		/* UMTX_OP_SET_CEILING */
};
2289
2290int
2291_umtx_op(struct thread *td, struct _umtx_op_args *uap)
2292{
2293	if (uap->op >= 0 && uap->op < UMTX_OP_MAX)
2294		return (*op_table[uap->op])(td, uap);
2295	return (EINVAL);
2296}
2297
2298#ifdef COMPAT_IA32
2299
2300int
2301freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
2302    /* struct umtx *umtx */
2303{
2304	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
2305}
2306
2307int
2308freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
2309    /* struct umtx *umtx */
2310{
2311	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
2312}
2313
/*
 * 32-bit ABI layout of struct timespec, used by the COMPAT_IA32 entry
 * points when copying timeouts in from 32-bit userland.
 * NOTE(review): the fields are u_int32_t, so a negative tv_sec/tv_nsec
 * from userland arrives as a large positive value — confirm intended.
 */
struct timespec32 {
	u_int32_t tv_sec;
	u_int32_t tv_nsec;
};
2318
2319static inline int
2320copyin_timeout32(void *addr, struct timespec *tsp)
2321{
2322	struct timespec32 ts32;
2323	int error;
2324
2325	error = copyin(addr, &ts32, sizeof(struct timespec32));
2326	if (error == 0) {
2327		tsp->tv_sec = ts32.tv_sec;
2328		tsp->tv_nsec = ts32.tv_nsec;
2329	}
2330	return (error);
2331}
2332
2333static int
2334__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2335{
2336	struct timespec *ts, timeout;
2337	int error;
2338
2339	/* Allow a null timespec (wait forever). */
2340	if (uap->uaddr2 == NULL)
2341		ts = NULL;
2342	else {
2343		error = copyin_timeout32(uap->uaddr2, &timeout);
2344		if (error != 0)
2345			return (error);
2346		if (timeout.tv_nsec >= 1000000000 ||
2347		    timeout.tv_nsec < 0) {
2348			return (EINVAL);
2349		}
2350		ts = &timeout;
2351	}
2352	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
2353}
2354
2355static int
2356__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2357{
2358	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
2359}
2360
2361static int
2362__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2363{
2364	struct timespec *ts, timeout;
2365	int error;
2366
2367	if (uap->uaddr2 == NULL)
2368		ts = NULL;
2369	else {
2370		error = copyin_timeout32(uap->uaddr2, &timeout);
2371		if (error != 0)
2372			return (error);
2373		if (timeout.tv_nsec >= 1000000000 ||
2374		    timeout.tv_nsec < 0)
2375			return (EINVAL);
2376		ts = &timeout;
2377	}
2378	return do_wait(td, uap->obj, uap->val, ts, 1);
2379}
2380
2381static int
2382__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
2383{
2384	struct timespec *ts, timeout;
2385	int error;
2386
2387	/* Allow a null timespec (wait forever). */
2388	if (uap->uaddr2 == NULL)
2389		ts = NULL;
2390	else {
2391		error = copyin_timeout32(uap->uaddr2, &timeout);
2392		if (error != 0)
2393			return (error);
2394		if (timeout.tv_nsec >= 1000000000 ||
2395		    timeout.tv_nsec < 0)
2396			return (EINVAL);
2397		ts = &timeout;
2398	}
2399	return do_lock_umutex(td, uap->obj, ts, 0);
2400}
2401
/*
 * 32-bit compat dispatch table for freebsd32_umtx_op(); indexed by the
 * UMTX_OP_* code and kept in the same order as the native op_table.
 * (The comments on the MUTEX_TRYLOCK/MUTEX_LOCK slots were previously
 * swapped; the function order matches op_table: trylock at index 4,
 * lock at index 5.)
 */
static _umtx_op_func op_table_compat32[] = {
	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
	__umtx_op_set_ceiling		/* UMTX_OP_SET_CEILING */
};
2412
2413int
2414freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
2415{
2416	if (uap->op >= 0 && uap->op < UMTX_OP_MAX)
2417		return (*op_table_compat32[uap->op])(td,
2418			(struct _umtx_op_args *)uap);
2419	return (EINVAL);
2420}
2421#endif
2422
2423void
2424umtx_thread_init(struct thread *td)
2425{
2426	td->td_umtxq = umtxq_alloc();
2427	td->td_umtxq->uq_thread = td;
2428}
2429
2430void
2431umtx_thread_fini(struct thread *td)
2432{
2433	umtxq_free(td->td_umtxq);
2434}
2435
2436/*
2437 * It will be called when new thread is created, e.g fork().
2438 */
/*
 * It will be called when new thread is created, e.g fork().
 * Resets the inherited priority and sanity-checks that the recycled
 * umtx_q carries no stale state.
 */
void
umtx_thread_alloc(struct thread *td)
{
	struct umtx_q *uq;

	uq = td->td_umtxq;
	/* PRI_MAX means "no PP ceiling inherited". */
	uq->uq_inherited_pri = PRI_MAX;

	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
}
2452
2453/*
2454 * exec() hook.
2455 */
2456static void
2457umtx_exec_hook(void *arg __unused, struct proc *p __unused,
2458	struct image_params *imgp __unused)
2459{
2460	umtx_thread_cleanup(curthread);
2461}
2462
2463/*
2464 * thread_exit() hook.
2465 */
/*
 * thread_exit() hook: drop any umtx priority state the exiting
 * thread still holds.
 */
void
umtx_thread_exit(struct thread *td)
{
	umtx_thread_cleanup(td);
}
2471
2472/*
2473 * clean up umtx data.
2474 */
/*
 * clean up umtx data.  Disowns every PI mutex the thread still holds
 * and drops any borrowed/inherited user priority.
 */
static void
umtx_thread_cleanup(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	if ((uq = td->td_umtxq) == NULL)
		return;

	mtx_lock_spin(&sched_lock);
	/* Forget any PP ceiling boost. */
	uq->uq_inherited_pri = PRI_MAX;
	/* Orphan all contested PI mutexes still owned by this thread. */
	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
	}
	td->td_flags &= ~TDF_UBORROWING;
	mtx_unlock_spin(&sched_lock);
}
2493