kern_umtx.c revision 164839
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 164839 2006-12-03 01:49:22Z davidxu $");

#include "opt_compat.h"
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#ifdef COMPAT_IA32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

#define TYPE_SIMPLE_LOCK	0
#define TYPE_SIMPLE_WAIT	1
#define TYPE_NORMAL_UMUTEX	2
#define TYPE_PI_UMUTEX		3
#define TYPE_PP_UMUTEX		4
#define TYPE_CV			5

/* Key to represent a unique userland synchronization object */
struct umtx_key {
	int	hash;
	int	type;
	int	shared;
	union {
		struct {
			vm_object_t	object;
			uintptr_t	offset;
		} shared;
		struct {
			struct vmspace	*vs;
			uintptr_t	addr;
		} private;
		struct {
			void		*a;
			uintptr_t	b;
		} both;
	} info;
};

/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link umtx held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A userland synchronization object user. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The thread this queue entry belongs to. */
	struct thread		*uq_thread;

	/*
	 * The PI mutex this thread is blocked on.  Reads can use either
	 * the chain lock or sched_lock; writes must hold both.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes owned by this thread that other threads contend for */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_head	uc_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PI in the list */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)

/*
 * Don't propagate time-sharing priority.  There is a security reason:
 * a user could create a PI-mutex, let thread A lock it, and let another
 * thread B block on it.  Because B is sleeping, its priority would be
 * boosted, and priority propagation would boost A's priority as well.
 * A's priority would then never be lowered, even if it were using 100%
 * CPU, which is unfair to other processes.
 */

#ifdef KSE
#define UPRI(td)	(((td)->td_ksegrp->kg_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_ksegrp->kg_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_ksegrp->kg_user_pri)
#else
#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
#endif

#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		128
#define	UMTX_SHIFTS		(__WORD_BIT - 7)

#define THREAD_SHARE		0
#define PROCESS_SHARE		1
#define AUTO_SHARE		2

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
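
/*
 * A note on the share modes above: THREAD_SHARE keys are matched by
 * (vmspace, virtual address) and so never match across processes;
 * PROCESS_SHARE keys are matched by (vm_object, offset) and do;
 * AUTO_SHARE defers the choice to umtx_key_get(), which inspects the
 * backing map entry.  A sketch of the userland side this macro
 * assumes (illustrative, not verbatim library code):
 *
 *	struct umutex mu;
 *	mu.m_flags = USYNC_PROCESS_SHARED; // GET_SHARE() -> PROCESS_SHARE
 *	mu.m_flags = 0;			   // GET_SHARE() -> THREAD_SHARE
 */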

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert(struct umtx_q *uq);
static void umtxq_remove(struct umtx_q *uq);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
static int umtxq_count(struct umtx_key *key);
static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
static int umtx_key_get(void *addr, int type, int share,
	struct umtx_key *key);
static void umtx_key_release(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

static void
umtxq_sysinit(void *arg __unused)
{
	int i;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < UMTX_CHAINS; ++i) {
		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
			 MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&umtxq_chains[i].uc_queue);
		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
		umtxq_chains[i].uc_busy = 0;
		umtxq_chains[i].uc_waiters = 0;
	}
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}
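
/*
 * The hash above is multiplicative (Fibonacci-style) hashing:
 * GOLDEN_RATIO_PRIME is close to 2^32 divided by the golden ratio,
 * so the multiplication scrambles the sum of the key's two words
 * into the high bits.  With __WORD_BIT == 32, UMTX_SHIFTS is 25,
 * which keeps the top 7 bits and already yields a value in
 * [0, UMTX_CHAINS - 1]; the final modulo is a safety net.
 */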

static inline int
umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
{
	return (k1->type == k2->type &&
		k1->info.both.a == k2->info.both.a &&
		k1->info.both.b == k2->info.both.b);
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	return (&umtxq_chains[key->hash]);
}

/*
 * Set a chain to the busy state when a following operation
 * may block (a kernel mutex cannot be held across it).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	while (uc->uc_busy != 0) {
		uc->uc_waiters++;
		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
		uc->uc_waiters--;
	}
	uc->uc_busy = 1;
}
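
/*
 * The busy/unbusy pair is used wherever a chain must stay logically
 * locked across an operation that may sleep or fault, since the
 * chain mutex itself cannot be held then.  The recurring pattern in
 * this file (a summary, not an additional API):
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);		// may sleep until chain is idle
 *	umtxq_unlock(&key);
 *	...touch userland memory (casuword32(), suword32(), ...)...
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);		// wakes one "umtxqb" sleeper
 *	umtxq_unlock(&key);
 */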

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Insert a thread onto the umtx queue.
 */
static inline void
umtxq_insert(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
	uq->uq_flags |= UQF_UMTXQ;
}

/*
 * Remove a thread from the umtx queue.
 */
static inline void
umtxq_remove(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
		uq->uq_flags &= ~UQF_UMTXQ;
	}
}

/*
 * Check if there are multiple waiters.
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
		}
	}
	return (count);
}

/*
 * Check if there are multiple PI waiters and return the first
 * waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
			*first = uq;
		}
	}
	return (count);
}

/*
 * Wake up threads waiting on a userland object.
 */
static int
umtxq_signal(struct umtx_key *key, int n_wake)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq, *next;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
		if (umtx_key_match(&uq->uq_key, key)) {
			umtxq_remove(uq);
			wakeup(uq);
			if (++ret >= n_wake)
				break;
		}
	}
	return (ret);
}

/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

/*
 * Put the thread into a sleep state; before sleeping, check whether
 * the thread was already removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}
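
/*
 * Why umtxq_sleep() re-tests UQF_UMTXQ first: a waker always calls
 * umtxq_remove() before wakeup(), both under the chain lock, so a
 * clear flag here means the wakeup has already been delivered and
 * calling msleep() anyway would wait for a wakeup that will never
 * come.
 */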

/*
 * Convert a userspace address into a unique logical address.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
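
/*
 * In short: for AUTO_SHARE the map entry's inheritance decides the
 * scope, so memory shared across fork() (VM_INHERIT_SHARE) gets an
 * object-based key that parent and child compute identically, while
 * ordinary private memory falls back to a (vmspace, address) pair.
 * The vm_object_reference() taken for shared keys is paired with the
 * vm_object_deallocate() in umtx_key_release() below.
 */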

/*
 * Release key.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have already retried, so exit
		 * immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}
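
/*
 * For context, the u_owner word protocol that the loop above slots
 * into (a sketch of the userland fast path, illustrative rather than
 * verbatim libthr code):
 *
 *	if (atomic_cmpset_acq_long(&umtx->u_owner, UMTX_UNOWNED, id))
 *		return (0);		// locked without a syscall
 *	_umtx_lock(umtx);		// contested: enter the kernel
 *
 * Symmetrically, an unlocking owner that finds UMTX_CONTESTED set
 * must enter the kernel so that sleeping waiters are woken.
 */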

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
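
/*
 * A note on the timed loop above, which recurs throughout this file:
 * the absolute deadline is computed once from the uptime clock, and
 * after every ETIMEDOUT wakeup the remaining time (deadline - now) is
 * recomputed before sleeping again, so rounding in tvtohz() can only
 * cause extra iterations, never a short total wait.
 */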

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
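
/*
 * Only one waiter is woken per unlock on purpose: if more than one
 * thread is still queued, the word is republished as UMTX_CONTESTED,
 * so whichever thread wins the lock next will again unlock through
 * the kernel and wake the next waiter in turn, avoiding a thundering
 * herd.
 */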

#ifdef COMPAT_IA32

/*
 * Lock a umtx object.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have already retried, so exit
		 * immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value
 * has not changed.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = fuword32(addr);
	if (tmp != id) {
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	   &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}
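
/*
 * do_wait() and kern_umtx_wake() form a compare-and-sleep primitive.
 * The protocol they implement, sketched with illustrative pseudo-calls
 * rather than a specific libc wrapper:
 *
 *	// waiter: sleep only while *addr still holds `id'
 *	_umtx_op(addr, UMTX_OP_WAIT, id, ...);
 *	// waker: store a new value, then wake up to n sleepers
 *	*addr = newval;
 *	_umtx_op(addr, UMTX_OP_WAKE, n, ...);
 *
 * Because do_wait() queues the thread before re-reading the value,
 * a wake issued between the waiter's userland test and its syscall
 * is not lost.
 */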

/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (try != 0)
			return (EBUSY);

		/*
		 * If we caught a signal, we have already retried, so exit
		 * immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	return (0);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on the PI mutex's blocked list after
 * its priority has been changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&sched_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved on the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread or higher than the next thread.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		if (UPRI(td) <= pri)
			return;

		sched_lend_user_prio(td, pri);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		/* Resort td on the list if needed. */
		if (!umtx_pi_adjust_thread(pi, td))
			break;
	}
}
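
/*
 * A concrete propagation scenario for the loop above (illustrative):
 * T1 owns M1 and is itself blocked on M2, which T2 owns.  When a
 * higher-priority T3 blocks on M1, the loop lends T3's priority to
 * T1, follows T1's uq_pi_blocked link to M2, and lends the same
 * priority to T2.  It stops when an owner already runs at least that
 * urgently, when a mutex has no recorded owner, or when an owner is
 * not blocked on a further PI mutex.
 */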

/*
 * Unpropagate priority for a PI mutex when a thread blocked on
 * it is interrupted by a signal or resumed by others.
 */
static void
umtx_unpropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&sched_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		sched_unlend_user_prio(pi->pi_owner, pri);
		pi = uq_owner->uq_pi_blocked;
	}
}

/*
 * Insert a PI mutex into the owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&sched_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&sched_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&sched_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * Userland may have already messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&sched_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
	}
	mtx_unlock_spin(&sched_lock);
	return (0);
}

/*
 * Adjust a thread's position on the blocked list of the PI mutex it
 * is blocked on; this may trigger a new round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;

	mtx_assert(&sched_lock, MA_OWNED);
	MPASS(TD_ON_UPILOCK(td));

	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	MPASS(pi != NULL);

	/* Re-sort the thread on the blocked list. */
	if (!umtx_pi_adjust_thread(pi, td))
		return;

	/*
	 * If our priority was lowered and we are at the head of the
	 * turnstile, then propagate our new priority up the chain.
	 */
	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
		umtx_propagate_priority(td);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_insert(uq);
	if (pi->pi_owner == NULL) {
		/* XXX
		 * Currently, we only support process-private PI mutexes;
		 * non-contended PI mutexes are locked in userland.
		 * Process-shared PI mutexes should always be initialized
		 * and registered in the kernel, and locking should always
		 * be done by the kernel to avoid security problems.
		 * For a process-private PI mutex, we can find the owner
		 * thread and boost its priority safely.
		 */
		PROC_LOCK(curproc);
		td1 = thread_find(curproc, owner);
		mtx_lock_spin(&sched_lock);
		if (td1 != NULL && pi->pi_owner == NULL) {
			uq1 = td1->td_umtxq;
			umtx_pi_setowner(pi, td1);
		}
		PROC_UNLOCK(curproc);
	} else {
		mtx_lock_spin(&sched_lock);
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	td->td_flags |= TDF_UPIBLOCKED;
	mtx_unlock_spin(&sched_lock);
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&sched_lock);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);
	if (uq->uq_flags & UQF_UMTXQ) {
		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
		if (error == EWOULDBLOCK)
			error = ETIMEDOUT;
		if (uq->uq_flags & UQF_UMTXQ) {
			umtxq_busy(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
		}
	}
	umtxq_unlock(&uq->uq_key);

	mtx_lock_spin(&sched_lock);
	uq->uq_pi_blocked = NULL;
	td->td_flags &= ~TDF_UPIBLOCKED;
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_unpropagate_priority(pi);
	mtx_unlock_spin(&sched_lock);

	umtxq_lock(&uq->uq_key);

	return (error);
}

/*
 * Add a reference to a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Decrease the reference count of a PI mutex; when the count drops
 * to zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;
	int free = 0;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&sched_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&sched_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		free = 1;
	}
	if (free)
		umtx_pi_free(pi);
}

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			new_pi->pi_key = uq->uq_key;
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* If no one owns it but it is contested, try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			/* The address was invalid. */
			if (owner == -1) {
				error = EFAULT;
				break;
			}

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have already retried, so exit
		 * immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner)
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
				 "umtxpi", timo);
		umtxq_unlock(&uq->uq_key);
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

/*
 * Unlock a PI mutex.
 */
static int
do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq_first, *uq_first2, *uq_me;
	struct umtx_pi *pi, *pi2;
	uint32_t owner, old, id;
	int error;
	int count;
	int pri;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count_pi(&key, &uq_first);
	if (uq_first != NULL) {
		pi = uq_first->uq_pi_blocked;
		if (pi->pi_owner != curthread) {
			umtxq_unbusy(&key);
			umtxq_unlock(&key);
			/* userland messed up the mutex */
			return (EPERM);
		}
		uq_me = curthread->td_umtxq;
		mtx_lock_spin(&sched_lock);
		pi->pi_owner = NULL;
		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
		uq_first = TAILQ_FIRST(&pi->pi_blocked);
		pri = PRI_MAX;
		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq_first2 != NULL) {
				if (pri > UPRI(uq_first2->uq_thread))
					pri = UPRI(uq_first2->uq_thread);
			}
		}
		sched_unlend_user_prio(curthread, pri);
		mtx_unlock_spin(&sched_lock);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (uq_first != NULL)
		umtxq_signal_thread(uq_first);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * Lock a PP mutex.
 */
static int
_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t ceiling;
	uint32_t owner, id;
	int error, pri, old_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
	for (;;) {
		old_inherited_pri = uq->uq_inherited_pri;
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
		if (ceiling > RTP_PRIO_MAX) {
			error = EINVAL;
			goto out;
		}

		mtx_lock_spin(&sched_lock);
		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
			mtx_unlock_spin(&sched_lock);
			error = EINVAL;
			goto out;
		}
		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
			if (uq->uq_inherited_pri < UPRI(td))
				sched_lend_user_prio(td, uq->uq_inherited_pri);
		}
		mtx_unlock_spin(&sched_lock);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have already retried, so exit
		 * immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);

		mtx_lock_spin(&sched_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}

	if (error != 0) {
		mtx_lock_spin(&sched_lock);
		uq->uq_inherited_pri = old_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}

out:
	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
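
/*
 * The ceiling arithmetic above, worked through: m_ceilings[0] holds a
 * POSIX realtime priority in [0, RTP_PRIO_MAX], where larger means
 * more important, while kernel priority values grow the other way.
 * With RTP_PRIO_MAX == 31, a userland ceiling of 31 maps to
 * PRI_MIN_REALTIME + (31 - 31), the strongest realtime priority, and
 * a ceiling of 0 maps to PRI_MIN_REALTIME + 31, the weakest.  The
 * unsigned subtraction also makes any out-of-range user value (or a
 * fuword32() fault, which returns -1) wrap above RTP_PRIO_MAX and be
 * rejected with EINVAL.
 */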

/*
 * Unlock a PP mutex.
 */
static int
do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	struct umtx_q *uq, *uq2;
	struct umtx_pi *pi;
	uint32_t owner, id;
	uint32_t rceiling;
	int error, pri, new_inherited_pri, su;

	id = td->td_tid;
	uq = td->td_umtxq;
	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
	if (error != 0)
		return (error);

	if (rceiling == -1)
		new_inherited_pri = PRI_MAX;
	else {
		rceiling = RTP_PRIO_MAX - rceiling;
		if (rceiling > RTP_PRIO_MAX)
			return (EINVAL);
		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
	}

	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	/*
	 * For a priority-protected mutex, always set the unlocked state
	 * to UMUTEX_CONTESTED so that userland always enters the kernel
	 * to lock the mutex.  This is necessary because thread priority
	 * has to be adjusted for such a mutex.
	 */
	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
		UMUTEX_CONTESTED);

	umtxq_lock(&key);
	if (error == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	if (error == -1)
		error = EFAULT;
	else {
		mtx_lock_spin(&sched_lock);
		if (su != 0)
			uq->uq_inherited_pri = new_inherited_pri;
		pri = PRI_MAX;
		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
			uq2 = TAILQ_FIRST(&pi->pi_blocked);
			if (uq2 != NULL) {
				if (pri > UPRI(uq2->uq_thread))
					pri = UPRI(uq2->uq_thread);
			}
		}
		if (pri > uq->uq_inherited_pri)
			pri = uq->uq_inherited_pri;
		sched_unlend_user_prio(td, pri);
		mtx_unlock_spin(&sched_lock);
	}
	umtx_key_release(&key);
	return (error);
}

static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	   &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		save_ceiling = fuword32(&m->m_ceilings[0]);

		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			suword32(&m->m_ceilings[0], ceiling);
			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
				UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal, we have already retried, so exit
		 * immediately.
		 */
		if (error != 0)
			break;

		/*
		 * The mutex is held by someone else; queue up, sleep until
		 * it is released, and then retry.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}

static int
_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
	int try)
{
	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
	case 0:
		return (_do_lock_normal(td, m, flags, timo, try));
	case UMUTEX_PRIO_INHERIT:
		return (_do_lock_pi(td, m, flags, timo, try));
	case UMUTEX_PRIO_PROTECT:
		return (_do_lock_pp(td, m, flags, timo, try));
	}
	return (EINVAL);
}

/*
 * Lock a userland POSIX mutex.
 */
static int
do_lock_umutex(struct thread *td, struct umutex *m,
	struct timespec *timeout, int try)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if (flags == -1)
		return (EFAULT);

	if (timeout == NULL) {
		error = _do_lock_umutex(td, m, flags, 0, try);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a userland POSIX mutex.
 */
static int
do_unlock_umutex(struct thread *td, struct umutex *m)
{
	uint32_t flags;

	flags = fuword32(&m->m_flags);
	if (flags == -1)
		return (EFAULT);

	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
	case 0:
		return (do_unlock_normal(td, m, flags));
	case UMUTEX_PRIO_INHERIT:
		return (do_unlock_pi(td, m, flags));
	case UMUTEX_PRIO_PROTECT:
		return (do_unlock_pp(td, m, flags));
	}

	return (EINVAL);
}
2170
2171static int
2172do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2173	struct timespec *timeout)
2174{
2175	struct umtx_q *uq;
2176	struct timeval tv;
2177	struct timespec cts, ets, tts;
2178	uint32_t flags;
2179	int error;
2180
2181	uq = td->td_umtxq;
2182	flags = fuword32(&cv->c_flags);
2183	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2184	if (error != 0)
2185		return (error);
2186	umtxq_lock(&uq->uq_key);
2187	umtxq_busy(&uq->uq_key);
2188	umtxq_insert(uq);
2189	umtxq_unlock(&uq->uq_key);
2190
2191	/*
2192	 * Set c_has_waiters to 1 before releasing the user mutex, so
2193	 * that do_cv_signal() knows there is a waiter to be woken.
2194	 */
2195	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
2196
2197	umtxq_lock(&uq->uq_key);
2198	umtxq_unbusy(&uq->uq_key);
2199	umtxq_unlock(&uq->uq_key);
2200
2201	error = do_unlock_umutex(td, m);
2202
2203	umtxq_lock(&uq->uq_key);
2204	if (error == 0) {
2205		if (timeout == NULL) {
2206			error = umtxq_sleep(uq, "ucond", 0);
2207		} else {
2208			getnanouptime(&ets);
2209			timespecadd(&ets, timeout);
2210			TIMESPEC_TO_TIMEVAL(&tv, timeout);
2211			for (;;) {
2212				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
2213				if (error != ETIMEDOUT)
2214					break;
2215				getnanouptime(&cts);
2216				if (timespeccmp(&cts, &ets, >=)) {
2217					error = ETIMEDOUT;
2218					break;
2219				}
2220				tts = ets;
2221				timespecsub(&tts, &cts);
2222				TIMESPEC_TO_TIMEVAL(&tv, &tts);
2223			}
2224		}
2225	}
2226
2227	if (error != 0) {
2228		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
2229			/*
2230			 * If we were concurrently signalled by
2231			 * do_cv_signal() but are returning with an error
2232			 * (a UNIX signal or a timeout), perform another
2233			 * umtxq_signal() so the wakeup is not consumed.
2234			 * This may spuriously wake another thread that
2235			 * was just queued, but SUSv3 explicitly allows
2236			 * spurious wakeups, and a kernel-based
2237			 * implementation cannot avoid them.
2238			 */
2239			umtxq_signal(&uq->uq_key, 1);
2240		}
2241		if (error == ERESTART)
2242			error = EINTR;
2243	}
2244	umtxq_remove(uq);
2245	umtxq_unlock(&uq->uq_key);
2246	umtx_key_release(&uq->uq_key);
2247	return (error);
2248}
2249
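/*
 * Illustrative sketch (not taken from this file; it assumes the
 * _umtx_op(2) argument layout used by the wrappers below): a
 * userland condition wait passes both objects in one call,
 *
 *	struct timespec ts = { 1, 0 };
 *	_umtx_op(&cv, UMTX_OP_CV_WAIT, 0, &mtx, &ts);
 *
 * so the thread is queued and the mutex released while the queue is
 * busied, atomically with respect to do_cv_signal().
 */
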
2250/*
2251 * Signal a userland condition variable.
2252 */
2253static int
2254do_cv_signal(struct thread *td, struct ucond *cv)
2255{
2256	struct umtx_key key;
2257	int error, cnt, nwake;
2258	uint32_t flags;
2259
2260	flags = fuword32(&cv->c_flags);
2261	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2262		return (error);
2263	umtxq_lock(&key);
2264	umtxq_busy(&key);
2265	cnt = umtxq_count(&key);
2266	nwake = umtxq_signal(&key, 1);
2267	if (cnt <= nwake) {
2268		umtxq_unlock(&key);
2269		error = suword32(
2270		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2271		umtxq_lock(&key);
2272	}
2273	umtxq_unbusy(&key);
2274	umtxq_unlock(&key);
2275	umtx_key_release(&key);
2276	return (error);
2277}
2278
2279static int
2280do_cv_broadcast(struct thread *td, struct ucond *cv)
2281{
2282	struct umtx_key key;
2283	int error;
2284	uint32_t flags;
2285
2286	flags = fuword32(&cv->c_flags);
2287	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2288		return (error);
2289
2290	umtxq_lock(&key);
2291	umtxq_busy(&key);
2292	umtxq_signal(&key, INT_MAX);
2293	umtxq_unlock(&key);
2294
2295	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2296
2297	umtxq_lock(&key);
2298	umtxq_unbusy(&key);
2299	umtxq_unlock(&key);
2300
2301	umtx_key_release(&key);
2302	return (error);
2303}
2304
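/*
 * Note on the ordering above: the queue is kept busy across the
 * suword32() that clears c_has_waiters, so a concurrent do_cv_wait()
 * (which busies the queue before setting the flag) cannot interleave
 * and lose its "has waiters" indication.
 */
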
2305int
2306_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2307    /* struct umtx *umtx */
2308{
2309	return (_do_lock_umtx(td, uap->umtx, td->td_tid, 0));
2310}
2311
2312int
2313_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2314    /* struct umtx *umtx */
2315{
2316	return (do_unlock_umtx(td, uap->umtx, td->td_tid));
2317}
2318
2319static int
2320__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2321{
2322	struct timespec *ts, timeout;
2323	int error;
2324
2325	/* Allow a null timespec (wait forever). */
2326	if (uap->uaddr2 == NULL)
2327		ts = NULL;
2328	else {
2329		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2330		if (error != 0)
2331			return (error);
2332		if (timeout.tv_nsec >= 1000000000 ||
2333		    timeout.tv_nsec < 0) {
2334			return (EINVAL);
2335		}
2336		ts = &timeout;
2337	}
2338	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2339}
2340
2341static int
2342__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2343{
2344	return (do_unlock_umtx(td, uap->obj, uap->val));
2345}
2346
2347static int
2348__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2349{
2350	struct timespec *ts, timeout;
2351	int error;
2352
2353	if (uap->uaddr2 == NULL)
2354		ts = NULL;
2355	else {
2356		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2357		if (error != 0)
2358			return (error);
2359		if (timeout.tv_nsec >= 1000000000 ||
2360		    timeout.tv_nsec < 0)
2361			return (EINVAL);
2362		ts = &timeout;
2363	}
2364	return (do_wait(td, uap->obj, uap->val, ts, 0));
2365}
2366
2367static int
2368__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2369{
2370	return (kern_umtx_wake(td, uap->obj, uap->val));
2371}
2372
2373static int
2374__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2375{
2376	struct timespec *ts, timeout;
2377	int error;
2378
2379	/* Allow a null timespec (wait forever). */
2380	if (uap->uaddr2 == NULL)
2381		ts = NULL;
2382	else {
2383		error = copyin(uap->uaddr2, &timeout,
2384		    sizeof(timeout));
2385		if (error != 0)
2386			return (error);
2387		if (timeout.tv_nsec >= 1000000000 ||
2388		    timeout.tv_nsec < 0) {
2389			return (EINVAL);
2390		}
2391		ts = &timeout;
2392	}
2393	return (do_lock_umutex(td, uap->obj, ts, 0));
2394}
2395
2396static int
2397__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2398{
2399	return (do_lock_umutex(td, uap->obj, NULL, 1));
2400}
2401
2402static int
2403__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2404{
2405	return (do_unlock_umutex(td, uap->obj));
2406}
2407
2408static int
2409__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2410{
2411	return (do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1));
2412}
2413
2414static int
2415__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
2416{
2417	struct timespec *ts, timeout;
2418	int error;
2419
2420	/* Allow a null timespec (wait forever). */
2421	if (uap->uaddr2 == NULL)
2422		ts = NULL;
2423	else {
2424		error = copyin(uap->uaddr2, &timeout,
2425		    sizeof(timeout));
2426		if (error != 0)
2427			return (error);
2428		if (timeout.tv_nsec >= 1000000000 ||
2429		    timeout.tv_nsec < 0) {
2430			return (EINVAL);
2431		}
2432		ts = &timeout;
2433	}
2434	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts));
2435}
2436
2437static int
2438__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
2439{
2440	return (do_cv_signal(td, uap->obj));
2441}
2442
2443static int
2444__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
2445{
2446	return (do_cv_broadcast(td, uap->obj));
2447}
2448
2449typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
2450
2451static _umtx_op_func op_table[] = {
2452	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
2453	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
2454	__umtx_op_wait,			/* UMTX_OP_WAIT */
2455	__umtx_op_wake,			/* UMTX_OP_WAKE */
2456	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2457	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
2458	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
2459	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
2460	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
2461	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
2462	__umtx_op_cv_broadcast		/* UMTX_OP_CV_BROADCAST */
2463};
2464
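/*
 * The table index is the UMTX_OP_* code passed in from userland via
 * _umtx_op(2); e.g. (illustrative) a futex-style wait/wake pair:
 *
 *	_umtx_op(&word, UMTX_OP_WAIT, expected_value, NULL, NULL);
 *	_umtx_op(&word, UMTX_OP_WAKE, 1, NULL, NULL);
 */
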
2465int
2466_umtx_op(struct thread *td, struct _umtx_op_args *uap)
2467{
2468	if ((unsigned)uap->op < UMTX_OP_MAX)
2469		return (*op_table[uap->op])(td, uap);
2470	return (EINVAL);
2471}
2472
2473#ifdef COMPAT_IA32
2474
2475int
2476freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
2477    /* struct umtx *umtx */
2478{
2479	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
2480}
2481
2482int
2483freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
2484    /* struct umtx *umtx */
2485{
2486	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
2487}
2488
2489struct timespec32 {
2490	u_int32_t tv_sec;
2491	u_int32_t tv_nsec;
2492};
2493
2494static inline int
2495copyin_timeout32(void *addr, struct timespec *tsp)
2496{
2497	struct timespec32 ts32;
2498	int error;
2499
2500	error = copyin(addr, &ts32, sizeof(struct timespec32));
2501	if (error == 0) {
2502		tsp->tv_sec = ts32.tv_sec;
2503		tsp->tv_nsec = ts32.tv_nsec;
2504	}
2505	return (error);
2506}
2507
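/*
 * Note: struct timespec32 carries tv_nsec as u_int32_t, so a
 * negative 32-bit value arrives here as a large unsigned number
 * after the widening copy; the callers' range check
 * (tv_nsec >= 1000000000) therefore still rejects it.
 */
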
2508static int
2509__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2510{
2511	struct timespec *ts, timeout;
2512	int error;
2513
2514	/* Allow a null timespec (wait forever). */
2515	if (uap->uaddr2 == NULL)
2516		ts = NULL;
2517	else {
2518		error = copyin_timeout32(uap->uaddr2, &timeout);
2519		if (error != 0)
2520			return (error);
2521		if (timeout.tv_nsec >= 1000000000 ||
2522		    timeout.tv_nsec < 0) {
2523			return (EINVAL);
2524		}
2525		ts = &timeout;
2526	}
2527	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
2528}
2529
2530static int
2531__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2532{
2533	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
2534}
2535
2536static int
2537__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2538{
2539	struct timespec *ts, timeout;
2540	int error;
2541
2542	if (uap->uaddr2 == NULL)
2543		ts = NULL;
2544	else {
2545		error = copyin_timeout32(uap->uaddr2, &timeout);
2546		if (error != 0)
2547			return (error);
2548		if (timeout.tv_nsec >= 1000000000 ||
2549		    timeout.tv_nsec < 0)
2550			return (EINVAL);
2551		ts = &timeout;
2552	}
2553	return (do_wait(td, uap->obj, uap->val, ts, 1));
2554}
2555
2556static int
2557__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
2558{
2559	struct timespec *ts, timeout;
2560	int error;
2561
2562	/* Allow a null timespec (wait forever). */
2563	if (uap->uaddr2 == NULL)
2564		ts = NULL;
2565	else {
2566		error = copyin_timeout32(uap->uaddr2, &timeout);
2567		if (error != 0)
2568			return (error);
2569		if (timeout.tv_nsec >= 1000000000 ||
2570		    timeout.tv_nsec < 0)
2571			return (EINVAL);
2572		ts = &timeout;
2573	}
2574	return (do_lock_umutex(td, uap->obj, ts, 0));
2575}
2576
2577static int
2578__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2579{
2580	struct timespec *ts, timeout;
2581	int error;
2582
2583	/* Allow a null timespec (wait forever). */
2584	if (uap->uaddr2 == NULL)
2585		ts = NULL;
2586	else {
2587		error = copyin_timeout32(uap->uaddr2, &timeout);
2588		if (error != 0)
2589			return (error);
2590		if (timeout.tv_nsec >= 1000000000 ||
2591		    timeout.tv_nsec < 0)
2592			return (EINVAL);
2593		ts = &timeout;
2594	}
2595	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts));
2596}
2597
2598static _umtx_op_func op_table_compat32[] = {
2599	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
2600	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
2601	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
2602	__umtx_op_wake,			/* UMTX_OP_WAKE */
2603	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
2604	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
2605	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
2606	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
2607	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
2608	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
2609	__umtx_op_cv_broadcast		/* UMTX_OP_CV_BROADCAST */
2610};
2611
2612int
2613freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
2614{
2615	if ((unsigned)uap->op < UMTX_OP_MAX)
2616		return (*op_table_compat32[uap->op])(td,
2617			(struct _umtx_op_args *)uap);
2618	return (EINVAL);
2619}
2620#endif
2621
2622void
2623umtx_thread_init(struct thread *td)
2624{
2625	td->td_umtxq = umtxq_alloc();
2626	td->td_umtxq->uq_thread = td;
2627}
2628
2629void
2630umtx_thread_fini(struct thread *td)
2631{
2632	umtxq_free(td->td_umtxq);
2633}
2634
2635/*
2636 * Called when a new thread is created, e.g. by fork().
2637 */
2638void
2639umtx_thread_alloc(struct thread *td)
2640{
2641	struct umtx_q *uq;
2642
2643	uq = td->td_umtxq;
2644	uq->uq_inherited_pri = PRI_MAX;
2645
2646	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
2647	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
2648	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
2649	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
2650}
2651
2652/*
2653 * exec() hook.
2654 */
2655static void
2656umtx_exec_hook(void *arg __unused, struct proc *p __unused,
2657	struct image_params *imgp __unused)
2658{
2659	umtx_thread_cleanup(curthread);
2660}
2661
2662/*
2663 * thread_exit() hook.
2664 */
2665void
2666umtx_thread_exit(struct thread *td)
2667{
2668	umtx_thread_cleanup(td);
2669}
2670
2671/*
2672 * Clean up a thread's umtx data.
2673 */
2674static void
2675umtx_thread_cleanup(struct thread *td)
2676{
2677	struct umtx_q *uq;
2678	struct umtx_pi *pi;
2679
2680	if ((uq = td->td_umtxq) == NULL)
2681		return;
2682
2683	mtx_lock_spin(&sched_lock);
2684	uq->uq_inherited_pri = PRI_MAX;
2685	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
2686		pi->pi_owner = NULL;
2687		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
2688	}
2689	td->td_flags &= ~TDF_UBORROWING;
2690	mtx_unlock_spin(&sched_lock);
2691}
2692
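/*
 * Note: an exiting (or exec()ing) thread may still be recorded as
 * the owner of contested PI mutexes; the loop above disowns them
 * (pi_owner = NULL) and resets the thread's inherited-priority
 * state so it is not torn down while still marked as borrowing.
 */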