kern_umtx.c revision 167232
1/*-
2 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
3 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 *    notice unmodified, this list of conditions, and the following
11 *    disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include <sys/cdefs.h>
29__FBSDID("$FreeBSD: head/sys/kern/kern_umtx.c 167232 2007-03-05 13:10:58Z rwatson $");
30
31#include "opt_compat.h"
32#include <sys/param.h>
33#include <sys/kernel.h>
34#include <sys/limits.h>
35#include <sys/lock.h>
36#include <sys/malloc.h>
37#include <sys/mutex.h>
38#include <sys/priv.h>
39#include <sys/proc.h>
40#include <sys/sched.h>
41#include <sys/smp.h>
42#include <sys/sysctl.h>
43#include <sys/sysent.h>
44#include <sys/systm.h>
45#include <sys/sysproto.h>
46#include <sys/eventhandler.h>
47#include <sys/umtx.h>
48
49#include <vm/vm.h>
50#include <vm/vm_param.h>
51#include <vm/pmap.h>
52#include <vm/vm_map.h>
53#include <vm/vm_object.h>
54
55#include <machine/cpu.h>
56
57#ifdef COMPAT_IA32
58#include <compat/freebsd32/freebsd32_proto.h>
59#endif
60
61#define TYPE_SIMPLE_LOCK	0
62#define TYPE_SIMPLE_WAIT	1
63#define TYPE_NORMAL_UMUTEX	2
64#define TYPE_PI_UMUTEX		3
65#define TYPE_PP_UMUTEX		4
66#define TYPE_CV			5
67
/*
 * Key to represent a unique userland synchronous object.
 *
 * Two keys match (umtx_key_match()) when their type and the two words of
 * info.both agree; "both" overlays the shared/private variants so the
 * comparison works regardless of which one was filled in by umtx_key_get().
 */
struct umtx_key {
	int	hash;			/* Chain index, set by umtxq_hash(). */
	int	type;			/* One of the TYPE_* constants. */
	int	shared;			/* Non-zero: info.shared holds a
					   referenced VM object (released in
					   umtx_key_release()). */
	union {
		struct {
			vm_object_t	object;	/* Backing VM object. */
			uintptr_t	offset;	/* Offset within the object. */
		} shared;
		struct {
			struct vmspace	*vs;	/* Owning address space. */
			uintptr_t	addr;	/* User virtual address. */
		} private;
		struct {
			void		*a;	/* Overlays object / vs. */
			uintptr_t	b;	/* Overlays offset / addr. */
		} both;
	} info;
};
88
/*
 * Priority inheritance mutex info, allocated from umtx_pi_zone and
 * reference-counted via pi_refcount.
 */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link umtx held by thread (uq_pi_contested). */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash chain (uc_pi_list). */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List of waiters, kept sorted by priority. */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};
109
/*
 * A userland synchronous object user; one per thread, allocated by
 * umtxq_alloc() and pointed to by td_umtxq.
 */
struct umtx_q {
	/* Linked list for the hash chain (uc_queue). */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key of the object currently waited on. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001		/* Entry is on a chain's queue. */

	/* The thread this record belongs to. */
	struct thread		*uq_thread;

	/*
	 * Blocked on PI mutex. read can use chain lock
	 * or sched_lock, write must have both chain lock and
	 * sched_lock being hold.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On a pi_blocked list. */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes we own that other threads contend on. */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex; PRI_MAX when none. */
	u_char			uq_inherited_pri;
};
141
142TAILQ_HEAD(umtxq_head, umtx_q);
143
/*
 * Userland lock object's wait-queue chain: one hash bucket of the
 * umtxq_chains[] table, protected by uc_lock.
 */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleeping umtx_q entries hashed to this chain. */
	struct umtxq_head	uc_queue;

	/* Busy flag: set while an operation may block (see umtxq_busy()). */
	char			uc_busy;

	/* Number of threads sleeping for the busy flag to clear. */
	int			uc_waiters;

	/* All PI records hashed to this chain. */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
};
161
162#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
163
164/*
165 * Don't propagate time-sharing priority, there is a security reason,
166 * a user can simply introduce PI-mutex, let thread A lock the mutex,
167 * and let another thread B block on the mutex, because B is
168 * sleeping, its priority will be boosted, this causes A's priority to
169 * be boosted via priority propagating too and will never be lowered even
170 * if it is using 100%CPU, this is unfair to other processes.
171 */
172
173#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
174			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
175			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
176
177#define	GOLDEN_RATIO_PRIME	2654404609U
178#define	UMTX_CHAINS		128
179#define	UMTX_SHIFTS		(__WORD_BIT - 7)
180
181#define THREAD_SHARE		0
182#define PROCESS_SHARE		1
183#define AUTO_SHARE		2
184
185#define	GET_SHARE(flags)	\
186    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
187
188static uma_zone_t		umtx_pi_zone;
189static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
190static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
191static int			umtx_pi_allocated;
192
193SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
194SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
195    &umtx_pi_allocated, 0, "Allocated umtx_pi");
196SYSCTL_DECL(_kern_threads);
197static int			umtx_dflt_spins = 0;
198SYSCTL_INT(_kern_threads, OID_AUTO, umtx_dflt_spins, CTLFLAG_RW,
199    &umtx_dflt_spins, 0, "default umtx spin count");
200static int			umtx_max_spins = 3000;
201SYSCTL_INT(_kern_threads, OID_AUTO, umtx_max_spins, CTLFLAG_RW,
202    &umtx_max_spins, 0, "max umtx spin count");
203
204static void umtxq_sysinit(void *);
205static void umtxq_hash(struct umtx_key *key);
206static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
207static void umtxq_lock(struct umtx_key *key);
208static void umtxq_unlock(struct umtx_key *key);
209static void umtxq_busy(struct umtx_key *key);
210static void umtxq_unbusy(struct umtx_key *key);
211static void umtxq_insert(struct umtx_q *uq);
212static void umtxq_remove(struct umtx_q *uq);
213static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
214static int umtxq_count(struct umtx_key *key);
215static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
216static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
217static int umtx_key_get(void *addr, int type, int share,
218	struct umtx_key *key);
219static void umtx_key_release(struct umtx_key *key);
220static struct umtx_pi *umtx_pi_alloc(int);
221static void umtx_pi_free(struct umtx_pi *pi);
222static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
223static void umtx_thread_cleanup(struct thread *td);
224static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
225	struct image_params *imgp __unused);
226SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
227
/*
 * One-time subsystem initialization (run via SYSINIT): create the UMA
 * zone for struct umtx_pi, initialize every wait-queue chain, and
 * register umtx_exec_hook to run on process_exec events.
 */
static void
umtxq_sysinit(void *arg __unused)
{
	int i;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < UMTX_CHAINS; ++i) {
		/* MTX_DUPOK: two distinct chain locks may be held at once. */
		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
			 MTX_DEF | MTX_DUPOK);
		TAILQ_INIT(&umtxq_chains[i].uc_queue);
		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
		umtxq_chains[i].uc_busy = 0;
		umtxq_chains[i].uc_waiters = 0;
	}
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}
246
247struct umtx_q *
248umtxq_alloc(void)
249{
250	struct umtx_q *uq;
251
252	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
253	TAILQ_INIT(&uq->uq_pi_contested);
254	uq->uq_inherited_pri = PRI_MAX;
255	return (uq);
256}
257
/* Release a umtx queue entry obtained from umtxq_alloc(). */
void
umtxq_free(struct umtx_q *uq)
{
	free(uq, M_UMTX);
}
263
264static inline void
265umtxq_hash(struct umtx_key *key)
266{
267	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
268	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
269}
270
271static inline int
272umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
273{
274	return (k1->type == k2->type &&
275		k1->info.both.a == k2->info.both.a &&
276	        k1->info.both.b == k2->info.both.b);
277}
278
/* Map a key (already hashed by umtxq_hash()) to its wait-queue chain. */
static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	return (&umtxq_chains[key->hash]);
}
284
/*
 * Set chain to busy state when following operation
 * may be blocked (kernel mutex can not be used).
 * Sleeps on the chain itself until the current holder clears uc_busy;
 * caller must hold the chain lock (it is dropped/retaken by msleep).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	while (uc->uc_busy != 0) {
		uc->uc_waiters++;
		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
		uc->uc_waiters--;
	}
	uc->uc_busy = 1;
}
303
/*
 * Unbusy a chain and wake one thread waiting in umtxq_busy(), if any.
 * Caller must hold the chain lock and have it marked busy.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}
319
320/*
321 * Lock a chain.
322 */
323static inline void
324umtxq_lock(struct umtx_key *key)
325{
326	struct umtxq_chain *uc;
327
328	uc = umtxq_getchain(key);
329	mtx_lock(&uc->uc_lock);
330}
331
332/*
333 * Unlock a chain.
334 */
335static inline void
336umtxq_unlock(struct umtx_key *key)
337{
338	struct umtxq_chain *uc;
339
340	uc = umtxq_getchain(key);
341	mtx_unlock(&uc->uc_lock);
342}
343
/*
 * Insert a thread onto the umtx queue of its key's chain and mark it
 * queued (UQF_UMTXQ).  Chain lock must be held.
 */
static inline void
umtxq_insert(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
	uq->uq_flags |= UQF_UMTXQ;
}
357
/*
 * Remove thread from the umtx queue, if still queued.  Safe to call
 * after a waker already dequeued the entry (UQF_UMTXQ cleared).
 * Chain lock must be held.
 */
static inline void
umtxq_remove(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
		uq->uq_flags &= ~UQF_UMTXQ;
	}
}
373
374/*
375 * Check if there are multiple waiters
376 */
377static int
378umtxq_count(struct umtx_key *key)
379{
380	struct umtxq_chain *uc;
381	struct umtx_q *uq;
382	int count = 0;
383
384	uc = umtxq_getchain(key);
385	UMTXQ_LOCKED_ASSERT(uc);
386	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
387		if (umtx_key_match(&uq->uq_key, key)) {
388			if (++count > 1)
389				break;
390		}
391	}
392	return (count);
393}
394
/*
 * Check if there are multiple PI waiters and returns first
 * waiter through *first (NULL when there are none).  Like
 * umtxq_count(), stops counting at two.  Chain lock must be held.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq;
	int count = 0;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
		if (umtx_key_match(&uq->uq_key, key)) {
			if (++count > 1)
				break;
			/* Only reached for the first match. */
			*first = uq;
		}
	}
	return (count);
}
418
/*
 * Wake up to n_wake threads waiting on an userland object identified by
 * the key.  Each woken thread is dequeued first so umtxq_sleep() sees
 * UQF_UMTXQ cleared.  Returns the number actually woken.
 * Chain lock must be held.
 */
static int
umtxq_signal(struct umtx_key *key, int n_wake)
{
	struct umtxq_chain *uc;
	struct umtx_q *uq, *next;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	/* _SAFE: umtxq_remove() unlinks uq while we iterate. */
	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
		if (umtx_key_match(&uq->uq_key, key)) {
			umtxq_remove(uq);
			wakeup(uq);
			if (++ret >= n_wake)
				break;
		}
	}
	return (ret);
}
442
/*
 * Wake up one specific waiter: dequeue it and wake it.
 * Chain lock must be held.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}
456
/*
 * Put thread into sleep state, before sleeping, check if
 * thread was removed from umtx queue (a waker dequeued us already —
 * then return 0 immediately).  Sleeps interruptibly (PCATCH) for up to
 * "timo" ticks; msleep's EWOULDBLOCK is normalized to ETIMEDOUT.
 * Chain lock must be held; msleep drops and retakes it.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
{
	struct umtxq_chain *uc;
	int error;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (!(uq->uq_flags & UQF_UMTXQ))
		return (0);
	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
	if (error == EWOULDBLOCK)
		error = ETIMEDOUT;
	return (error);
}
476
/*
 * Convert userspace address into unique logical address (fill in *key).
 *
 * THREAD_SHARE keys are private: identified by (vmspace, address).
 * PROCESS_SHARE — and AUTO_SHARE when the mapping is inherit-shared —
 * keys are identified by (VM object, offset) so that all processes
 * mapping the same object agree; the object gains a reference that
 * umtx_key_release() drops.  Returns 0, or EFAULT if the address does
 * not resolve to a writable mapping.
 */
static int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return EFAULT;
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			/* Keep the object alive while the key is in use. */
			vm_object_reference(key->info.shared.object);
		} else {
			/* AUTO_SHARE on a private mapping: thread-local key. */
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}
522
/*
 * Release key: drop the VM object reference taken by umtx_key_get()
 * for shared keys; private keys hold no resources.
 */
static inline void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}
532
/*
 * Lock a umtx object.
 *
 * Acquire the simple umtx at "umtx" on behalf of thread id "id",
 * sleeping at most "timo" ticks per wait (0 = no timeout).  Returns 0
 * on success, EFAULT on a bad user address, or an error (ETIMEDOUT /
 * EINTR / ERESTART) from an interrupted or timed-out sleep.
 */
static int
_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
{
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			/* Preserve the contested bit for the next unlocker. */
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		/* Queue ourselves before publishing the contested bit. */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	/* NOTREACHED: the loop exits only via the returns above. */
	return (0);
}
629
/*
 * Lock a umtx object, optionally with an absolute-relative timeout.
 *
 * With no timeout, an interrupted lock is restarted transparently
 * (EINTR -> ERESTART).  With a timeout, the deadline is tracked on the
 * uptime clock and the remaining time recomputed after each ETIMEDOUT
 * wait; timed locks are never restarted (ERESTART -> EINTR).
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx(td, umtx, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		/* ts = absolute deadline on the uptime clock. */
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			/* Retry with the remaining time: ts3 = ts - ts2. */
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
669
/*
 * Unlock a umtx object.
 *
 * Returns EFAULT on a bad address, EPERM if the calling thread is not
 * the owner, EINVAL if the word changed underneath us, 0 on success.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		/* A waiter set the contested bit meanwhile; fall through. */
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	/* Busy the chain so the waiter count stays meaningful. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key,1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
730
731#ifdef COMPAT_IA32
732
/*
 * Lock a umtx object (32-bit compat variant of _do_lock_umtx).
 *
 * Same protocol as _do_lock_umtx but operates on a 32-bit lock word
 * with casuword32 and the UMUTEX_* owner encodings.
 */
static int
_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
{
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		/* Queue ourselves before publishing the contested bit. */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	/* NOTREACHED: the loop exits only via the returns above. */
	return (0);
}
828
/*
 * Lock a umtx object (32-bit compat), optionally with a timeout.
 * Mirrors do_lock_umtx: untimed locks restart on signal, timed locks
 * track a deadline on the uptime clock and never restart.
 */
static int
do_lock_umtx32(struct thread *td, void *m, uint32_t id,
	struct timespec *timeout)
{
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	int error;

	if (timeout == NULL) {
		error = _do_lock_umtx32(td, m, id, 0);
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		/* ts = absolute deadline on the uptime clock. */
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		for (;;) {
			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
			if (error != ETIMEDOUT)
				break;
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				break;
			}
			/* Retry with the remaining time: ts3 = ts - ts2. */
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
		}
		/* Timed-locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}
868
/*
 * Unlock a umtx object (32-bit compat variant of do_unlock_umtx).
 * Returns EFAULT / EPERM / EINVAL / 0 with the same meanings.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		/* A waiter set the contested bit meanwhile; fall through. */
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	/* Busy the chain so the waiter count stays meaningful. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key,1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
929#endif
930
/*
 * Fetch and compare value, sleep on the address if value is not changed.
 *
 * Queues the thread first, then re-reads the word (32- or long-sized
 * per "compat32"); if it no longer equals "id" the wait is abandoned,
 * avoiding a lost-wakeup race with kern_umtx_wake().  With a timeout,
 * the deadline is tracked on the uptime clock.  ERESTART is mapped to
 * EINTR: wait is never transparently restarted.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct timespec *timeout, int compat32)
{
	struct umtx_q *uq;
	struct timespec ts, ts2, ts3;
	struct timeval tv;
	u_long tmp;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	    &uq->uq_key)) != 0)
		return (error);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0)
		tmp = fuword(addr);
	else
		tmp = fuword32(addr);
	if (tmp != id) {
		/* Value already changed: do not sleep at all. */
		umtxq_lock(&uq->uq_key);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else if (timeout == NULL) {
		umtxq_lock(&uq->uq_key);
		error = umtxq_sleep(uq, "uwait", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	} else {
		/* ts = absolute deadline on the uptime clock. */
		getnanouptime(&ts);
		timespecadd(&ts, timeout);
		TIMESPEC_TO_TIMEVAL(&tv, timeout);
		umtxq_lock(&uq->uq_key);
		for (;;) {
			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
			if (!(uq->uq_flags & UQF_UMTXQ))
				break;	/* Woken by umtxq_signal(). */
			if (error != ETIMEDOUT)
				break;
			umtxq_unlock(&uq->uq_key);
			getnanouptime(&ts2);
			if (timespeccmp(&ts2, &ts, >=)) {
				error = ETIMEDOUT;
				umtxq_lock(&uq->uq_key);
				break;
			}
			/* Retry with the remaining time: ts3 = ts - ts2. */
			ts3 = ts;
			timespecsub(&ts3, &ts2);
			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
			umtxq_lock(&uq->uq_key);
		}
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}
996
/*
 * Wake up threads sleeping on the specified address.
 *
 * Wakes at most n_wake waiters.  Returns 0 on success (the number
 * actually woken is not reported to the caller) or the umtx_key_get()
 * error.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
	   &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}
1015
/*
 * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
 *
 * Acquire the umutex "m" for the calling thread (identified by its tid),
 * sleeping up to "timo" ticks per wait.  With "try" non-zero, returns
 * EBUSY instead of blocking.  UMUTEX_ERROR_CHECK mutexes return EDEADLK
 * on self-lock.  On SMP, before sleeping the thread may spin (bounded by
 * m_spincount / umtx_dflt_spins / umtx_max_spins) while the owner appears
 * to be running on another CPU.
 */
static int
_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
	int try)
{
	struct umtx_q *uq;
	uint32_t owner, old, id;
#ifdef SMP
	int spincount;
#endif
	int error = 0;

	id = td->td_tid;
	uq = td->td_umtxq;

#ifdef SMP
	/* Pick the spin budget: per-mutex hint, else the sysctl default,
	   clamped to umtx_max_spins.  No spinning on uniprocessors. */
	if (smp_cpus > 1) {
		spincount = fuword32(&m->m_spincount);
		if (spincount == 0)
			spincount = umtx_dflt_spins;
		if (spincount > umtx_max_spins)
			spincount = umtx_max_spins;
	} else
		spincount = 0;
#endif

	/*
	 * Care must be exercised when dealing with umtx structure. It
	 * can fault on any access.
	 */
	for (;;) {
#ifdef SMP
try_unowned:
#endif
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
#ifdef SMP
try_contested:
#endif
			owner = casuword32(&m->m_owner,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (try != 0)
			return (EBUSY);

#ifdef SMP
		if (spincount > 0 && (owner & ~UMUTEX_CONTESTED) != id) {
			int i, found = 0;
			struct pcpu *pcpu = NULL;

			/* Look for a cpu the owner is running on */
			for (i = 0; i < MAXCPU; i++) {
				if (CPU_ABSENT(i))
					continue;
				pcpu = pcpu_find(i);
				/* pc_curtid: tid of the thread current on
				   that CPU (local extension; see pcpu). */
				if ((owner & ~UMUTEX_CONTESTED) == pcpu->pc_curtid) {
					found = 1;
					break;
				}
			}

			/* Owner not on any CPU: spinning cannot help. */
			if (__predict_false(!found))
				goto end_spin;

			/* Spin while the owner keeps running on that CPU,
			   re-sampling the lock word each iteration. */
			while ((owner & ~UMUTEX_CONTESTED) == pcpu->pc_curtid &&
			       (owner & ~UMUTEX_CONTESTED) != id) {
				if (--spincount <= 0)
					break;
				/* Give up if we have pending work/signals. */
				if ((td->td_flags &
			    	    (TDF_NEEDRESCHED|TDF_ASTPENDING|TDF_NEEDSIGCHK)) ||
				     P_SHOULDSTOP(td->td_proc))
					break;
				owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
				if (owner == UMUTEX_UNOWNED)
					goto try_unowned;
 				if (owner == UMUTEX_CONTESTED)
					goto try_contested;
				cpu_spinwait();
			}
		}
end_spin:
		/* Spin only before the first sleep. */
		spincount = 0;

#endif

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		/* Queue ourselves before publishing the contested bit. */
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either some one else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);
	}

	/* NOTREACHED: the loop exits only via the returns above. */
	return (0);
}
1180
/*
 * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	/*
	 * Release the PRIO_NONE umutex "m" owned by the calling thread.
	 * Returns EFAULT on a bad address, EPERM if we are not the owner,
	 * EINVAL if the word changed underneath us, 0 on success.
	 */
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		/* A waiter set the contested bit meanwhile; fall through. */
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	/* Busy the chain so the waiter count stays meaningful. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	old = casuword32(&m->m_owner, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key,1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
1244
1245static inline struct umtx_pi *
1246umtx_pi_alloc(int flags)
1247{
1248	struct umtx_pi *pi;
1249
1250	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
1251	TAILQ_INIT(&pi->pi_blocked);
1252	atomic_add_int(&umtx_pi_allocated, 1);
1253	return (pi);
1254}
1255
1256static inline void
1257umtx_pi_free(struct umtx_pi *pi)
1258{
1259	uma_zfree(umtx_pi_zone, pi);
1260	atomic_add_int(&umtx_pi_allocated, -1);
1261}
1262
1263/*
1264 * Adjust the thread's position on a pi_state after its priority has been
1265 * changed.
1266 */
1267static int
1268umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
1269{
1270	struct umtx_q *uq, *uq1, *uq2;
1271	struct thread *td1;
1272
1273	mtx_assert(&sched_lock, MA_OWNED);
1274	if (pi == NULL)
1275		return (0);
1276
1277	uq = td->td_umtxq;
1278
1279	/*
1280	 * Check if the thread needs to be moved on the blocked chain.
1281	 * It needs to be moved if either its priority is lower than
1282	 * the previous thread or higher than the next thread.
1283	 */
1284	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
1285	uq2 = TAILQ_NEXT(uq, uq_lockq);
1286	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
1287	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
1288		/*
1289		 * Remove thread from blocked chain and determine where
1290		 * it should be moved to.
1291		 */
1292		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1293		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1294			td1 = uq1->uq_thread;
1295			MPASS(td1->td_proc->p_magic == P_MAGIC);
1296			if (UPRI(td1) > UPRI(td))
1297				break;
1298		}
1299
1300		if (uq1 == NULL)
1301			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1302		else
1303			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1304	}
1305	return (1);
1306}
1307
1308/*
1309 * Propagate priority when a thread is blocked on POSIX
1310 * PI mutex.
1311 */
1312static void
1313umtx_propagate_priority(struct thread *td)
1314{
1315	struct umtx_q *uq;
1316	struct umtx_pi *pi;
1317	int pri;
1318
1319	mtx_assert(&sched_lock, MA_OWNED);
1320	pri = UPRI(td);
1321	uq = td->td_umtxq;
1322	pi = uq->uq_pi_blocked;
1323	if (pi == NULL)
1324		return;
1325
1326	for (;;) {
1327		td = pi->pi_owner;
1328		if (td == NULL)
1329			return;
1330
1331		MPASS(td->td_proc != NULL);
1332		MPASS(td->td_proc->p_magic == P_MAGIC);
1333
1334		if (UPRI(td) <= pri)
1335			return;
1336
1337		sched_lend_user_prio(td, pri);
1338
1339		/*
1340		 * Pick up the lock that td is blocked on.
1341		 */
1342		uq = td->td_umtxq;
1343		pi = uq->uq_pi_blocked;
1344		/* Resort td on the list if needed. */
1345		if (!umtx_pi_adjust_thread(pi, td))
1346			break;
1347	}
1348}
1349
1350/*
1351 * Unpropagate priority for a PI mutex when a thread blocked on
1352 * it is interrupted by signal or resumed by others.
1353 */
1354static void
1355umtx_unpropagate_priority(struct umtx_pi *pi)
1356{
1357	struct umtx_q *uq, *uq_owner;
1358	struct umtx_pi *pi2;
1359	int pri;
1360
1361	mtx_assert(&sched_lock, MA_OWNED);
1362
1363	while (pi != NULL && pi->pi_owner != NULL) {
1364		pri = PRI_MAX;
1365		uq_owner = pi->pi_owner->td_umtxq;
1366
1367		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
1368			uq = TAILQ_FIRST(&pi2->pi_blocked);
1369			if (uq != NULL) {
1370				if (pri > UPRI(uq->uq_thread))
1371					pri = UPRI(uq->uq_thread);
1372			}
1373		}
1374
1375		if (pri > uq_owner->uq_inherited_pri)
1376			pri = uq_owner->uq_inherited_pri;
1377		sched_unlend_user_prio(pi->pi_owner, pri);
1378		pi = uq_owner->uq_pi_blocked;
1379	}
1380}
1381
1382/*
1383 * Insert a PI mutex into owned list.
1384 */
1385static void
1386umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
1387{
1388	struct umtx_q *uq_owner;
1389
1390	uq_owner = owner->td_umtxq;
1391	mtx_assert(&sched_lock, MA_OWNED);
1392	if (pi->pi_owner != NULL)
1393		panic("pi_ower != NULL");
1394	pi->pi_owner = owner;
1395	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
1396}
1397
1398/*
1399 * Claim ownership of a PI mutex.
1400 */
1401static int
1402umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
1403{
1404	struct umtx_q *uq, *uq_owner;
1405
1406	uq_owner = owner->td_umtxq;
1407	mtx_lock_spin(&sched_lock);
1408	if (pi->pi_owner == owner) {
1409		mtx_unlock_spin(&sched_lock);
1410		return (0);
1411	}
1412
1413	if (pi->pi_owner != NULL) {
1414		/*
1415		 * userland may have already messed the mutex, sigh.
1416		 */
1417		mtx_unlock_spin(&sched_lock);
1418		return (EPERM);
1419	}
1420	umtx_pi_setowner(pi, owner);
1421	uq = TAILQ_FIRST(&pi->pi_blocked);
1422	if (uq != NULL) {
1423		int pri;
1424
1425		pri = UPRI(uq->uq_thread);
1426		if (pri < UPRI(owner))
1427			sched_lend_user_prio(owner, pri);
1428	}
1429	mtx_unlock_spin(&sched_lock);
1430	return (0);
1431}
1432
1433/*
1434 * Adjust a thread's order position in its blocked PI mutex,
1435 * this may result new priority propagating process.
1436 */
1437void
1438umtx_pi_adjust(struct thread *td, u_char oldpri)
1439{
1440	struct umtx_q *uq;
1441	struct umtx_pi *pi;
1442
1443	uq = td->td_umtxq;
1444
1445	mtx_assert(&sched_lock, MA_OWNED);
1446	MPASS(TD_ON_UPILOCK(td));
1447
1448	/*
1449	 * Pick up the lock that td is blocked on.
1450	 */
1451	pi = uq->uq_pi_blocked;
1452	MPASS(pi != NULL);
1453
1454	/* Resort the turnstile on the list. */
1455	if (!umtx_pi_adjust_thread(pi, td))
1456		return;
1457
1458	/*
1459	 * If our priority was lowered and we are at the head of the
1460	 * turnstile, then propagate our new priority up the chain.
1461	 */
1462	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
1463		umtx_propagate_priority(td);
1464}
1465
1466/*
1467 * Sleep on a PI mutex.
1468 */
1469static int
1470umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
1471	uint32_t owner, const char *wmesg, int timo)
1472{
1473	struct umtxq_chain *uc;
1474	struct thread *td, *td1;
1475	struct umtx_q *uq1;
1476	int pri;
1477	int error = 0;
1478
1479	td = uq->uq_thread;
1480	KASSERT(td == curthread, ("inconsistent uq_thread"));
1481	uc = umtxq_getchain(&uq->uq_key);
1482	UMTXQ_LOCKED_ASSERT(uc);
1483	umtxq_insert(uq);
1484	if (pi->pi_owner == NULL) {
1485		/* XXX
1486		 * Current, We only support process private PI-mutex,
1487		 * non-contended PI-mutexes are locked in userland.
1488		 * Process shared PI-mutex should always be initialized
1489		 * by kernel and be registered in kernel, locking should
1490		 * always be done by kernel to avoid security problems.
1491		 * For process private PI-mutex, we can find owner
1492		 * thread and boost its priority safely.
1493		 */
1494		PROC_LOCK(curproc);
1495		td1 = thread_find(curproc, owner);
1496		mtx_lock_spin(&sched_lock);
1497		if (td1 != NULL && pi->pi_owner == NULL) {
1498			uq1 = td1->td_umtxq;
1499			umtx_pi_setowner(pi, td1);
1500		}
1501		PROC_UNLOCK(curproc);
1502	} else {
1503		mtx_lock_spin(&sched_lock);
1504	}
1505
1506	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
1507		pri = UPRI(uq1->uq_thread);
1508		if (pri > UPRI(td))
1509			break;
1510	}
1511
1512	if (uq1 != NULL)
1513		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
1514	else
1515		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
1516
1517	uq->uq_pi_blocked = pi;
1518	td->td_flags |= TDF_UPIBLOCKED;
1519	mtx_unlock_spin(&sched_lock);
1520	umtxq_unlock(&uq->uq_key);
1521
1522	mtx_lock_spin(&sched_lock);
1523	umtx_propagate_priority(td);
1524	mtx_unlock_spin(&sched_lock);
1525
1526	umtxq_lock(&uq->uq_key);
1527	if (uq->uq_flags & UQF_UMTXQ) {
1528		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
1529		if (error == EWOULDBLOCK)
1530			error = ETIMEDOUT;
1531		if (uq->uq_flags & UQF_UMTXQ) {
1532			umtxq_busy(&uq->uq_key);
1533			umtxq_remove(uq);
1534			umtxq_unbusy(&uq->uq_key);
1535		}
1536	}
1537	umtxq_unlock(&uq->uq_key);
1538
1539	mtx_lock_spin(&sched_lock);
1540	uq->uq_pi_blocked = NULL;
1541	td->td_flags &= ~TDF_UPIBLOCKED;
1542	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
1543	umtx_unpropagate_priority(pi);
1544	mtx_unlock_spin(&sched_lock);
1545
1546	umtxq_lock(&uq->uq_key);
1547
1548	return (error);
1549}
1550
1551/*
1552 * Add reference count for a PI mutex.
1553 */
1554static void
1555umtx_pi_ref(struct umtx_pi *pi)
1556{
1557	struct umtxq_chain *uc;
1558
1559	uc = umtxq_getchain(&pi->pi_key);
1560	UMTXQ_LOCKED_ASSERT(uc);
1561	pi->pi_refcount++;
1562}
1563
1564/*
1565 * Decrease reference count for a PI mutex, if the counter
1566 * is decreased to zero, its memory space is freed.
1567 */
1568static void
1569umtx_pi_unref(struct umtx_pi *pi)
1570{
1571	struct umtxq_chain *uc;
1572	int free = 0;
1573
1574	uc = umtxq_getchain(&pi->pi_key);
1575	UMTXQ_LOCKED_ASSERT(uc);
1576	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
1577	if (--pi->pi_refcount == 0) {
1578		mtx_lock_spin(&sched_lock);
1579		if (pi->pi_owner != NULL) {
1580			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
1581				pi, pi_link);
1582			pi->pi_owner = NULL;
1583		}
1584		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
1585			("blocked queue not empty"));
1586		mtx_unlock_spin(&sched_lock);
1587		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
1588		free = 1;
1589	}
1590	if (free)
1591		umtx_pi_free(pi);
1592}
1593
1594/*
1595 * Find a PI mutex in hash table.
1596 */
1597static struct umtx_pi *
1598umtx_pi_lookup(struct umtx_key *key)
1599{
1600	struct umtxq_chain *uc;
1601	struct umtx_pi *pi;
1602
1603	uc = umtxq_getchain(key);
1604	UMTXQ_LOCKED_ASSERT(uc);
1605
1606	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
1607		if (umtx_key_match(&pi->pi_key, key)) {
1608			return (pi);
1609		}
1610	}
1611	return (NULL);
1612}
1613
1614/*
1615 * Insert a PI mutex into hash table.
1616 */
1617static inline void
1618umtx_pi_insert(struct umtx_pi *pi)
1619{
1620	struct umtxq_chain *uc;
1621
1622	uc = umtxq_getchain(&pi->pi_key);
1623	UMTXQ_LOCKED_ASSERT(uc);
1624	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
1625}
1626
1627/*
1628 * Lock a PI mutex.
1629 */
1630static int
1631_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1632	int try)
1633{
1634	struct umtx_q *uq;
1635	struct umtx_pi *pi, *new_pi;
1636	uint32_t id, owner, old;
1637	int error;
1638
1639	id = td->td_tid;
1640	uq = td->td_umtxq;
1641
1642	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1643	    &uq->uq_key)) != 0)
1644		return (error);
1645	umtxq_lock(&uq->uq_key);
1646	pi = umtx_pi_lookup(&uq->uq_key);
1647	if (pi == NULL) {
1648		new_pi = umtx_pi_alloc(M_NOWAIT);
1649		if (new_pi == NULL) {
1650			umtxq_unlock(&uq->uq_key);
1651			new_pi = umtx_pi_alloc(M_WAITOK);
1652			new_pi->pi_key = uq->uq_key;
1653			umtxq_lock(&uq->uq_key);
1654			pi = umtx_pi_lookup(&uq->uq_key);
1655			if (pi != NULL) {
1656				umtx_pi_free(new_pi);
1657				new_pi = NULL;
1658			}
1659		}
1660		if (new_pi != NULL) {
1661			new_pi->pi_key = uq->uq_key;
1662			umtx_pi_insert(new_pi);
1663			pi = new_pi;
1664		}
1665	}
1666	umtx_pi_ref(pi);
1667	umtxq_unlock(&uq->uq_key);
1668
1669	/*
1670	 * Care must be exercised when dealing with umtx structure.  It
1671	 * can fault on any access.
1672	 */
1673	for (;;) {
1674		/*
1675		 * Try the uncontested case.  This should be done in userland.
1676		 */
1677		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
1678
1679		/* The acquire succeeded. */
1680		if (owner == UMUTEX_UNOWNED) {
1681			error = 0;
1682			break;
1683		}
1684
1685		/* The address was invalid. */
1686		if (owner == -1) {
1687			error = EFAULT;
1688			break;
1689		}
1690
1691		/* If no one owns it but it is contested try to acquire it. */
1692		if (owner == UMUTEX_CONTESTED) {
1693			owner = casuword32(&m->m_owner,
1694			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1695
1696			if (owner == UMUTEX_CONTESTED) {
1697				umtxq_lock(&uq->uq_key);
1698				error = umtx_pi_claim(pi, td);
1699				umtxq_unlock(&uq->uq_key);
1700				break;
1701			}
1702
1703			/* The address was invalid. */
1704			if (owner == -1) {
1705				error = EFAULT;
1706				break;
1707			}
1708
1709			/* If this failed the lock has changed, restart. */
1710			continue;
1711		}
1712
1713		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1714		    (owner & ~UMUTEX_CONTESTED) == id) {
1715			error = EDEADLK;
1716			break;
1717		}
1718
1719		if (try != 0) {
1720			error = EBUSY;
1721			break;
1722		}
1723
1724		/*
1725		 * If we caught a signal, we have retried and now
1726		 * exit immediately.
1727		 */
1728		if (error != 0)
1729			break;
1730
1731		umtxq_lock(&uq->uq_key);
1732		umtxq_busy(&uq->uq_key);
1733		umtxq_unlock(&uq->uq_key);
1734
1735		/*
1736		 * Set the contested bit so that a release in user space
1737		 * knows to use the system call for unlock.  If this fails
1738		 * either some one else has acquired the lock or it has been
1739		 * released.
1740		 */
1741		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
1742
1743		/* The address was invalid. */
1744		if (old == -1) {
1745			umtxq_lock(&uq->uq_key);
1746			umtxq_unbusy(&uq->uq_key);
1747			umtxq_unlock(&uq->uq_key);
1748			error = EFAULT;
1749			break;
1750		}
1751
1752		umtxq_lock(&uq->uq_key);
1753		umtxq_unbusy(&uq->uq_key);
1754		/*
1755		 * We set the contested bit, sleep. Otherwise the lock changed
1756		 * and we need to retry or we lost a race to the thread
1757		 * unlocking the umtx.
1758		 */
1759		if (old == owner)
1760			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
1761				 "umtxpi", timo);
1762		umtxq_unlock(&uq->uq_key);
1763	}
1764
1765	umtxq_lock(&uq->uq_key);
1766	umtx_pi_unref(pi);
1767	umtxq_unlock(&uq->uq_key);
1768
1769	umtx_key_release(&uq->uq_key);
1770	return (error);
1771}
1772
1773/*
1774 * Unlock a PI mutex.
1775 */
1776static int
1777do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
1778{
1779	struct umtx_key key;
1780	struct umtx_q *uq_first, *uq_first2, *uq_me;
1781	struct umtx_pi *pi, *pi2;
1782	uint32_t owner, old, id;
1783	int error;
1784	int count;
1785	int pri;
1786
1787	id = td->td_tid;
1788	/*
1789	 * Make sure we own this mtx.
1790	 */
1791	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
1792	if (owner == -1)
1793		return (EFAULT);
1794
1795	if ((owner & ~UMUTEX_CONTESTED) != id)
1796		return (EPERM);
1797
1798	/* This should be done in userland */
1799	if ((owner & UMUTEX_CONTESTED) == 0) {
1800		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
1801		if (old == -1)
1802			return (EFAULT);
1803		if (old == owner)
1804			return (0);
1805		owner = old;
1806	}
1807
1808	/* We should only ever be in here for contested locks */
1809	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
1810	    &key)) != 0)
1811		return (error);
1812
1813	umtxq_lock(&key);
1814	umtxq_busy(&key);
1815	count = umtxq_count_pi(&key, &uq_first);
1816	if (uq_first != NULL) {
1817		pi = uq_first->uq_pi_blocked;
1818		if (pi->pi_owner != curthread) {
1819			umtxq_unbusy(&key);
1820			umtxq_unlock(&key);
1821			/* userland messed the mutex */
1822			return (EPERM);
1823		}
1824		uq_me = curthread->td_umtxq;
1825		mtx_lock_spin(&sched_lock);
1826		pi->pi_owner = NULL;
1827		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
1828		uq_first = TAILQ_FIRST(&pi->pi_blocked);
1829		pri = PRI_MAX;
1830		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
1831			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
1832			if (uq_first2 != NULL) {
1833				if (pri > UPRI(uq_first2->uq_thread))
1834					pri = UPRI(uq_first2->uq_thread);
1835			}
1836		}
1837		sched_unlend_user_prio(curthread, pri);
1838		mtx_unlock_spin(&sched_lock);
1839	}
1840	umtxq_unlock(&key);
1841
1842	/*
1843	 * When unlocking the umtx, it must be marked as unowned if
1844	 * there is zero or one thread only waiting for it.
1845	 * Otherwise, it must be marked as contested.
1846	 */
1847	old = casuword32(&m->m_owner, owner,
1848		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
1849
1850	umtxq_lock(&key);
1851	if (uq_first != NULL)
1852		umtxq_signal_thread(uq_first);
1853	umtxq_unbusy(&key);
1854	umtxq_unlock(&key);
1855	umtx_key_release(&key);
1856	if (old == -1)
1857		return (EFAULT);
1858	if (old != owner)
1859		return (EINVAL);
1860	return (0);
1861}
1862
1863/*
1864 * Lock a PP mutex.
1865 */
1866static int
1867_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
1868	int try)
1869{
1870	struct umtx_q *uq, *uq2;
1871	struct umtx_pi *pi;
1872	uint32_t ceiling;
1873	uint32_t owner, id;
1874	int error, pri, old_inherited_pri, su;
1875
1876	id = td->td_tid;
1877	uq = td->td_umtxq;
1878	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
1879	    &uq->uq_key)) != 0)
1880		return (error);
1881	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
1882	for (;;) {
1883		old_inherited_pri = uq->uq_inherited_pri;
1884		umtxq_lock(&uq->uq_key);
1885		umtxq_busy(&uq->uq_key);
1886		umtxq_unlock(&uq->uq_key);
1887
1888		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
1889		if (ceiling > RTP_PRIO_MAX) {
1890			error = EINVAL;
1891			goto out;
1892		}
1893
1894		mtx_lock_spin(&sched_lock);
1895		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
1896			mtx_unlock_spin(&sched_lock);
1897			error = EINVAL;
1898			goto out;
1899		}
1900		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
1901			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
1902			if (uq->uq_inherited_pri < UPRI(td))
1903				sched_lend_user_prio(td, uq->uq_inherited_pri);
1904		}
1905		mtx_unlock_spin(&sched_lock);
1906
1907		owner = casuword32(&m->m_owner,
1908		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
1909
1910		if (owner == UMUTEX_CONTESTED) {
1911			error = 0;
1912			break;
1913		}
1914
1915		/* The address was invalid. */
1916		if (owner == -1) {
1917			error = EFAULT;
1918			break;
1919		}
1920
1921		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
1922		    (owner & ~UMUTEX_CONTESTED) == id) {
1923			error = EDEADLK;
1924			break;
1925		}
1926
1927		if (try != 0) {
1928			error = EBUSY;
1929			break;
1930		}
1931
1932		/*
1933		 * If we caught a signal, we have retried and now
1934		 * exit immediately.
1935		 */
1936		if (error != 0)
1937			break;
1938
1939		umtxq_lock(&uq->uq_key);
1940		umtxq_insert(uq);
1941		umtxq_unbusy(&uq->uq_key);
1942		error = umtxq_sleep(uq, "umtxpp", timo);
1943		umtxq_remove(uq);
1944		umtxq_unlock(&uq->uq_key);
1945
1946		mtx_lock_spin(&sched_lock);
1947		uq->uq_inherited_pri = old_inherited_pri;
1948		pri = PRI_MAX;
1949		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1950			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1951			if (uq2 != NULL) {
1952				if (pri > UPRI(uq2->uq_thread))
1953					pri = UPRI(uq2->uq_thread);
1954			}
1955		}
1956		if (pri > uq->uq_inherited_pri)
1957			pri = uq->uq_inherited_pri;
1958		sched_unlend_user_prio(td, pri);
1959		mtx_unlock_spin(&sched_lock);
1960	}
1961
1962	if (error != 0) {
1963		mtx_lock_spin(&sched_lock);
1964		uq->uq_inherited_pri = old_inherited_pri;
1965		pri = PRI_MAX;
1966		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
1967			uq2 = TAILQ_FIRST(&pi->pi_blocked);
1968			if (uq2 != NULL) {
1969				if (pri > UPRI(uq2->uq_thread))
1970					pri = UPRI(uq2->uq_thread);
1971			}
1972		}
1973		if (pri > uq->uq_inherited_pri)
1974			pri = uq->uq_inherited_pri;
1975		sched_unlend_user_prio(td, pri);
1976		mtx_unlock_spin(&sched_lock);
1977	}
1978
1979out:
1980	umtxq_lock(&uq->uq_key);
1981	umtxq_unbusy(&uq->uq_key);
1982	umtxq_unlock(&uq->uq_key);
1983	umtx_key_release(&uq->uq_key);
1984	return (error);
1985}
1986
1987/*
1988 * Unlock a PP mutex.
1989 */
1990static int
1991do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
1992{
1993	struct umtx_key key;
1994	struct umtx_q *uq, *uq2;
1995	struct umtx_pi *pi;
1996	uint32_t owner, id;
1997	uint32_t rceiling;
1998	int error, pri, new_inherited_pri, su;
1999
2000	id = td->td_tid;
2001	uq = td->td_umtxq;
2002	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2003
2004	/*
2005	 * Make sure we own this mtx.
2006	 */
2007	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
2008	if (owner == -1)
2009		return (EFAULT);
2010
2011	if ((owner & ~UMUTEX_CONTESTED) != id)
2012		return (EPERM);
2013
2014	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2015	if (error != 0)
2016		return (error);
2017
2018	if (rceiling == -1)
2019		new_inherited_pri = PRI_MAX;
2020	else {
2021		rceiling = RTP_PRIO_MAX - rceiling;
2022		if (rceiling > RTP_PRIO_MAX)
2023			return (EINVAL);
2024		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2025	}
2026
2027	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2028	    &key)) != 0)
2029		return (error);
2030	umtxq_lock(&key);
2031	umtxq_busy(&key);
2032	umtxq_unlock(&key);
2033	/*
2034	 * For priority protected mutex, always set unlocked state
2035	 * to UMUTEX_CONTESTED, so that userland always enters kernel
2036	 * to lock the mutex, it is necessary because thread priority
2037	 * has to be adjusted for such mutex.
2038	 */
2039	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
2040		UMUTEX_CONTESTED);
2041
2042	umtxq_lock(&key);
2043	if (error == 0)
2044		umtxq_signal(&key, 1);
2045	umtxq_unbusy(&key);
2046	umtxq_unlock(&key);
2047
2048	if (error == -1)
2049		error = EFAULT;
2050	else {
2051		mtx_lock_spin(&sched_lock);
2052		if (su != 0)
2053			uq->uq_inherited_pri = new_inherited_pri;
2054		pri = PRI_MAX;
2055		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2056			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2057			if (uq2 != NULL) {
2058				if (pri > UPRI(uq2->uq_thread))
2059					pri = UPRI(uq2->uq_thread);
2060			}
2061		}
2062		if (pri > uq->uq_inherited_pri)
2063			pri = uq->uq_inherited_pri;
2064		sched_unlend_user_prio(td, pri);
2065		mtx_unlock_spin(&sched_lock);
2066	}
2067	umtx_key_release(&key);
2068	return (error);
2069}
2070
/*
 * Change the priority ceiling of a PP mutex.
 *
 * Acquires the mutex (sleeping if necessary), stores the new ceiling
 * in m_ceilings[0], and writes the previous ceiling to *old_ceiling
 * when requested.  Only valid for UMUTEX_PRIO_PROTECT mutexes.
 */
static int
do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
	uint32_t *old_ceiling)
{
	struct umtx_q *uq;
	uint32_t save_ceiling;
	uint32_t owner, id;
	uint32_t flags;
	int error;

	flags = fuword32(&m->m_flags);
	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
		return (EINVAL);
	if (ceiling > RTP_PRIO_MAX)
		return (EINVAL);
	id = td->td_tid;
	uq = td->td_umtxq;
	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
	   &uq->uq_key)) != 0)
		return (error);
	for (;;) {
		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		save_ceiling = fuword32(&m->m_ceilings[0]);

		/* Try to take the (kernel-unlocked) PP mutex. */
		owner = casuword32(&m->m_owner,
		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);

		if (owner == UMUTEX_CONTESTED) {
			/* Got it: update the ceiling, then release. */
			suword32(&m->m_ceilings[0], ceiling);
			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
				UMUTEX_CONTESTED);
			error = 0;
			break;
		}

		/* The address was invalid. */
		if (owner == -1) {
			error = EFAULT;
			break;
		}

		/* We already own it; just update the ceiling. */
		if ((owner & ~UMUTEX_CONTESTED) == id) {
			suword32(&m->m_ceilings[0], ceiling);
			error = 0;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		/*
		 * We set the contested bit, sleep. Otherwise the lock changed
		 * and we need to retry or we lost a race to the thread
		 * unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		error = umtxq_sleep(uq, "umtxpp", 0);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0)
		umtxq_signal(&uq->uq_key, INT_MAX);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == 0 && old_ceiling != NULL)
		suword32(old_ceiling, save_ceiling);
	return (error);
}
2150
2151static int
2152_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
2153	int try)
2154{
2155	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2156	case 0:
2157		return (_do_lock_normal(td, m, flags, timo, try));
2158	case UMUTEX_PRIO_INHERIT:
2159		return (_do_lock_pi(td, m, flags, timo, try));
2160	case UMUTEX_PRIO_PROTECT:
2161		return (_do_lock_pp(td, m, flags, timo, try));
2162	}
2163	return (EINVAL);
2164}
2165
2166/*
2167 * Lock a userland POSIX mutex.
2168 */
2169static int
2170do_lock_umutex(struct thread *td, struct umutex *m,
2171	struct timespec *timeout, int try)
2172{
2173	struct timespec ts, ts2, ts3;
2174	struct timeval tv;
2175	uint32_t flags;
2176	int error;
2177
2178	flags = fuword32(&m->m_flags);
2179	if (flags == -1)
2180		return (EFAULT);
2181
2182	if (timeout == NULL) {
2183		error = _do_lock_umutex(td, m, flags, 0, try);
2184		/* Mutex locking is restarted if it is interrupted. */
2185		if (error == EINTR)
2186			error = ERESTART;
2187	} else {
2188		getnanouptime(&ts);
2189		timespecadd(&ts, timeout);
2190		TIMESPEC_TO_TIMEVAL(&tv, timeout);
2191		for (;;) {
2192			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
2193			if (error != ETIMEDOUT)
2194				break;
2195			getnanouptime(&ts2);
2196			if (timespeccmp(&ts2, &ts, >=)) {
2197				error = ETIMEDOUT;
2198				break;
2199			}
2200			ts3 = ts;
2201			timespecsub(&ts3, &ts2);
2202			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
2203		}
2204		/* Timed-locking is not restarted. */
2205		if (error == ERESTART)
2206			error = EINTR;
2207	}
2208	return (error);
2209}
2210
2211/*
2212 * Unlock a userland POSIX mutex.
2213 */
2214static int
2215do_unlock_umutex(struct thread *td, struct umutex *m)
2216{
2217	uint32_t flags;
2218
2219	flags = fuword32(&m->m_flags);
2220	if (flags == -1)
2221		return (EFAULT);
2222
2223	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2224	case 0:
2225		return (do_unlock_normal(td, m, flags));
2226	case UMUTEX_PRIO_INHERIT:
2227		return (do_unlock_pi(td, m, flags));
2228	case UMUTEX_PRIO_PROTECT:
2229		return (do_unlock_pp(td, m, flags));
2230	}
2231
2232	return (EINVAL);
2233}
2234
/*
 * Wait on a userland condition variable.
 *
 * Queue the thread, mark c_has_waiters, release the associated mutex,
 * then sleep (optionally with an absolute-deadline timeout).  On any
 * failed wait, re-signal once if we were concurrently signalled so
 * the wakeup is not lost.  The mutex is NOT re-acquired here; that is
 * done in userland.
 */
static int
do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
	struct timespec *timeout, u_long wflags)
{
	struct umtx_q *uq;
	struct timeval tv;
	struct timespec cts, ets, tts;
	uint32_t flags;
	int error;

	uq = td->td_umtxq;
	flags = fuword32(&cv->c_flags);
	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
	if (error != 0)
		return (error);
	umtxq_lock(&uq->uq_key);
	umtxq_busy(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);

	/*
	 * The magic thing is we should set c_has_waiters to 1 before
	 * releasing user mutex.
	 */
	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);

	umtxq_lock(&uq->uq_key);
	umtxq_unbusy(&uq->uq_key);
	umtxq_unlock(&uq->uq_key);

	/* Release the mutex; we are already on the cv sleep queue. */
	error = do_unlock_umutex(td, m);

	umtxq_lock(&uq->uq_key);
	if (error == 0) {
		if ((wflags & UMTX_CHECK_UNPARKING) &&
		    (td->td_pflags & TDP_WAKEUP)) {
			/* Pending unpark: consume it instead of sleeping. */
			td->td_pflags &= ~TDP_WAKEUP;
			error = EINTR;
		} else if (timeout == NULL) {
			error = umtxq_sleep(uq, "ucond", 0);
		} else {
			/* ets is the absolute deadline on the uptime clock. */
			getnanouptime(&ets);
			timespecadd(&ets, timeout);
			TIMESPEC_TO_TIMEVAL(&tv, timeout);
			for (;;) {
				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
				if (error != ETIMEDOUT)
					break;
				getnanouptime(&cts);
				if (timespeccmp(&cts, &ets, >=)) {
					error = ETIMEDOUT;
					break;
				}
				tts = ets;
				timespecsub(&tts, &cts);
				TIMESPEC_TO_TIMEVAL(&tv, &tts);
			}
		}
	}

	if (error != 0) {
		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
			/*
			 * If we concurrently got do_cv_signal()d
			 * and we got an error or UNIX signals or a timeout,
			 * then, perform another umtxq_signal to avoid
			 * consuming the wakeup. This may cause supurious
			 * wakeup for another thread which was just queued,
			 * but SUSV3 explicitly allows supurious wakeup to
			 * occur, and indeed a kernel based implementation
			 * can not avoid it.
			 */
			if (!umtxq_signal(&uq->uq_key, 1))
				error = 0;
		}
		if (error == ERESTART)
			error = EINTR;
	}
	umtxq_remove(uq);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
2318
2319/*
2320 * Signal a userland condition variable.
2321 */
2322static int
2323do_cv_signal(struct thread *td, struct ucond *cv)
2324{
2325	struct umtx_key key;
2326	int error, cnt, nwake;
2327	uint32_t flags;
2328
2329	flags = fuword32(&cv->c_flags);
2330	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2331		return (error);
2332	umtxq_lock(&key);
2333	umtxq_busy(&key);
2334	cnt = umtxq_count(&key);
2335	nwake = umtxq_signal(&key, 1);
2336	if (cnt <= nwake) {
2337		umtxq_unlock(&key);
2338		error = suword32(
2339		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
2340		umtxq_lock(&key);
2341	}
2342	umtxq_unbusy(&key);
2343	umtxq_unlock(&key);
2344	umtx_key_release(&key);
2345	return (error);
2346}
2347
/*
 * Broadcast a userland condition variable: wake every waiter and
 * clear the userland c_has_waiters hint while the chain is busied.
 */
static int
do_cv_broadcast(struct thread *td, struct ucond *cv)
{
	struct umtx_key key;
	int error;
	uint32_t flags;

	flags = fuword32(&cv->c_flags);
	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_signal(&key, INT_MAX);
	umtxq_unlock(&key);

	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);

	umtxq_lock(&key);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);

	umtx_key_release(&key);
	return (error);
}
2373
2374int
2375_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
2376    /* struct umtx *umtx */
2377{
2378	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
2379}
2380
2381int
2382_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
2383    /* struct umtx *umtx */
2384{
2385	return do_unlock_umtx(td, uap->umtx, td->td_tid);
2386}
2387
2388static int
2389__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
2390{
2391	struct timespec *ts, timeout;
2392	int error;
2393
2394	/* Allow a null timespec (wait forever). */
2395	if (uap->uaddr2 == NULL)
2396		ts = NULL;
2397	else {
2398		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2399		if (error != 0)
2400			return (error);
2401		if (timeout.tv_nsec >= 1000000000 ||
2402		    timeout.tv_nsec < 0) {
2403			return (EINVAL);
2404		}
2405		ts = &timeout;
2406	}
2407	return (do_lock_umtx(td, uap->obj, uap->val, ts));
2408}
2409
2410static int
2411__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
2412{
2413	return (do_unlock_umtx(td, uap->obj, uap->val));
2414}
2415
2416static int
2417__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
2418{
2419	struct timespec *ts, timeout;
2420	int error;
2421
2422	if (uap->uaddr2 == NULL)
2423		ts = NULL;
2424	else {
2425		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
2426		if (error != 0)
2427			return (error);
2428		if (timeout.tv_nsec >= 1000000000 ||
2429		    timeout.tv_nsec < 0)
2430			return (EINVAL);
2431		ts = &timeout;
2432	}
2433	return do_wait(td, uap->obj, uap->val, ts, 0);
2434}
2435
2436static int
2437__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
2438{
2439	return (kern_umtx_wake(td, uap->obj, uap->val));
2440}
2441
2442static int
2443__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
2444{
2445	struct timespec *ts, timeout;
2446	int error;
2447
2448	/* Allow a null timespec (wait forever). */
2449	if (uap->uaddr2 == NULL)
2450		ts = NULL;
2451	else {
2452		error = copyin(uap->uaddr2, &timeout,
2453		    sizeof(timeout));
2454		if (error != 0)
2455			return (error);
2456		if (timeout.tv_nsec >= 1000000000 ||
2457		    timeout.tv_nsec < 0) {
2458			return (EINVAL);
2459		}
2460		ts = &timeout;
2461	}
2462	return do_lock_umutex(td, uap->obj, ts, 0);
2463}
2464
2465static int
2466__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
2467{
2468	return do_lock_umutex(td, uap->obj, NULL, 1);
2469}
2470
2471static int
2472__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
2473{
2474	return do_unlock_umutex(td, uap->obj);
2475}
2476
2477static int
2478__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
2479{
2480	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
2481}
2482
2483static int
2484__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
2485{
2486	struct timespec *ts, timeout;
2487	int error;
2488
2489	/* Allow a null timespec (wait forever). */
2490	if (uap->uaddr2 == NULL)
2491		ts = NULL;
2492	else {
2493		error = copyin(uap->uaddr2, &timeout,
2494		    sizeof(timeout));
2495		if (error != 0)
2496			return (error);
2497		if (timeout.tv_nsec >= 1000000000 ||
2498		    timeout.tv_nsec < 0) {
2499			return (EINVAL);
2500		}
2501		ts = &timeout;
2502	}
2503	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
2504}
2505
2506static int
2507__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
2508{
2509	return do_cv_signal(td, uap->obj);
2510}
2511
2512static int
2513__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
2514{
2515	return do_cv_broadcast(td, uap->obj);
2516}
2517
/* Handler signature shared by every _umtx_op() service routine. */
typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);

/*
 * Dispatch table for _umtx_op(), indexed by the UMTX_OP_* operation code;
 * entry order must therefore match the UMTX_OP_* constant values.
 */
static _umtx_op_func op_table[] = {
	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
	__umtx_op_wait,			/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT */
	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
	__umtx_op_cv_broadcast		/* UMTX_OP_CV_BROADCAST */
};
2533
2534int
2535_umtx_op(struct thread *td, struct _umtx_op_args *uap)
2536{
2537	if ((unsigned)uap->op < UMTX_OP_MAX)
2538		return (*op_table[uap->op])(td, uap);
2539	return (EINVAL);
2540}
2541
2542#ifdef COMPAT_IA32
2543int
2544freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
2545    /* struct umtx *umtx */
2546{
2547	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
2548}
2549
2550int
2551freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
2552    /* struct umtx *umtx */
2553{
2554	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
2555}
2556
/*
 * 32-bit ABI layout of struct timespec.  Both fields must be signed:
 * with the previous u_int32_t fields, a negative tv_sec or tv_nsec
 * supplied by a 32-bit process was silently converted to a large
 * positive value when widened in copyin_timeout32(), so negative
 * seconds could not be detected downstream.
 */
struct timespec32 {
	int32_t tv_sec;
	int32_t tv_nsec;
};
2561
2562static inline int
2563copyin_timeout32(void *addr, struct timespec *tsp)
2564{
2565	struct timespec32 ts32;
2566	int error;
2567
2568	error = copyin(addr, &ts32, sizeof(struct timespec32));
2569	if (error == 0) {
2570		tsp->tv_sec = ts32.tv_sec;
2571		tsp->tv_nsec = ts32.tv_nsec;
2572	}
2573	return (error);
2574}
2575
2576static int
2577__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2578{
2579	struct timespec *ts, timeout;
2580	int error;
2581
2582	/* Allow a null timespec (wait forever). */
2583	if (uap->uaddr2 == NULL)
2584		ts = NULL;
2585	else {
2586		error = copyin_timeout32(uap->uaddr2, &timeout);
2587		if (error != 0)
2588			return (error);
2589		if (timeout.tv_nsec >= 1000000000 ||
2590		    timeout.tv_nsec < 0) {
2591			return (EINVAL);
2592		}
2593		ts = &timeout;
2594	}
2595	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
2596}
2597
2598static int
2599__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
2600{
2601	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
2602}
2603
2604static int
2605__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2606{
2607	struct timespec *ts, timeout;
2608	int error;
2609
2610	if (uap->uaddr2 == NULL)
2611		ts = NULL;
2612	else {
2613		error = copyin_timeout32(uap->uaddr2, &timeout);
2614		if (error != 0)
2615			return (error);
2616		if (timeout.tv_nsec >= 1000000000 ||
2617		    timeout.tv_nsec < 0)
2618			return (EINVAL);
2619		ts = &timeout;
2620	}
2621	return do_wait(td, uap->obj, uap->val, ts, 1);
2622}
2623
2624static int
2625__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
2626{
2627	struct timespec *ts, timeout;
2628	int error;
2629
2630	/* Allow a null timespec (wait forever). */
2631	if (uap->uaddr2 == NULL)
2632		ts = NULL;
2633	else {
2634		error = copyin_timeout32(uap->uaddr2, &timeout);
2635		if (error != 0)
2636			return (error);
2637		if (timeout.tv_nsec >= 1000000000 ||
2638		    timeout.tv_nsec < 0)
2639			return (EINVAL);
2640		ts = &timeout;
2641	}
2642	return do_lock_umutex(td, uap->obj, ts, 0);
2643}
2644
2645static int
2646__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
2647{
2648	struct timespec *ts, timeout;
2649	int error;
2650
2651	/* Allow a null timespec (wait forever). */
2652	if (uap->uaddr2 == NULL)
2653		ts = NULL;
2654	else {
2655		error = copyin_timeout32(uap->uaddr2, &timeout);
2656		if (error != 0)
2657			return (error);
2658		if (timeout.tv_nsec >= 1000000000 ||
2659		    timeout.tv_nsec < 0)
2660			return (EINVAL);
2661		ts = &timeout;
2662	}
2663	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
2664}
2665
/*
 * 32-bit compat dispatch table, indexed by the UMTX_OP_* operation code;
 * entry order must match both the UMTX_OP_* values and op_table above.
 * (The MUTEX_TRYLOCK/MUTEX_LOCK comments were previously swapped; the
 * handlers themselves have always been in the correct slots.)
 */
static _umtx_op_func op_table_compat32[] = {
	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
	__umtx_op_wake,			/* UMTX_OP_WAKE */
	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT */
	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
	__umtx_op_cv_broadcast		/* UMTX_OP_CV_BROADCAST */
};
2679
2680int
2681freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
2682{
2683	if ((unsigned)uap->op < UMTX_OP_MAX)
2684		return (*op_table_compat32[uap->op])(td,
2685			(struct _umtx_op_args *)uap);
2686	return (EINVAL);
2687}
2688#endif
2689
2690void
2691umtx_thread_init(struct thread *td)
2692{
2693	td->td_umtxq = umtxq_alloc();
2694	td->td_umtxq->uq_thread = td;
2695}
2696
2697void
2698umtx_thread_fini(struct thread *td)
2699{
2700	umtxq_free(td->td_umtxq);
2701}
2702
2703/*
2704 * It will be called when new thread is created, e.g fork().
2705 */
2706void
2707umtx_thread_alloc(struct thread *td)
2708{
2709	struct umtx_q *uq;
2710
2711	uq = td->td_umtxq;
2712	uq->uq_inherited_pri = PRI_MAX;
2713
2714	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
2715	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
2716	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
2717	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
2718}
2719
2720/*
2721 * exec() hook.
2722 */
2723static void
2724umtx_exec_hook(void *arg __unused, struct proc *p __unused,
2725	struct image_params *imgp __unused)
2726{
2727	umtx_thread_cleanup(curthread);
2728}
2729
2730/*
2731 * thread_exit() hook.
2732 */
2733void
2734umtx_thread_exit(struct thread *td)
2735{
2736	umtx_thread_cleanup(td);
2737}
2738
2739/*
2740 * clean up umtx data.
2741 */
2742static void
2743umtx_thread_cleanup(struct thread *td)
2744{
2745	struct umtx_q *uq;
2746	struct umtx_pi *pi;
2747
2748	if ((uq = td->td_umtxq) == NULL)
2749		return;
2750
2751	mtx_lock_spin(&sched_lock);
2752	uq->uq_inherited_pri = PRI_MAX;
2753	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
2754		pi->pi_owner = NULL;
2755		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
2756	}
2757	td->td_flags &= ~TDF_UBORROWING;
2758	mtx_unlock_spin(&sched_lock);
2759}
2760