kern_umtx.c revision 274648
/*-
 * Copyright (c) 2004, David Xu <davidxu@freebsd.org>
 * Copyright (c) 2002, Jeffrey Roberson <jeff@freebsd.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/10/sys/kern/kern_umtx.c 274648 2014-11-18 12:53:32Z kib $");

#include "opt_compat.h"
#include "opt_umtx_profiling.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/syscallsubr.h>
#include <sys/eventhandler.h>
#include <sys/umtx.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>

#include <machine/cpu.h>

#ifdef COMPAT_FREEBSD32
#include <compat/freebsd32/freebsd32_proto.h>
#endif

#define _UMUTEX_TRY		1
#define _UMUTEX_WAIT		2

#ifdef UMTX_PROFILING
#define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
#endif

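/*
 * Editor's note (illustrative, not in the original source): the profiling
 * code tracks percentages as a (whole, fraction) pair to avoid floating
 * point in the kernel.  UPROF_PERC_BIGGER() compares two such pairs
 * lexicographically: the whole parts decide first, and the fractional
 * parts break ties.  Conceptually, comparing 42.7% against 42.3%, the
 * whole parts tie at 42 and the fractions 7 vs. 3 settle the order.
 */
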
/* Priority inheritance mutex info. */
struct umtx_pi {
	/* Owner thread */
	struct thread		*pi_owner;

	/* Reference count */
	int			pi_refcount;

	/* List entry to link umtx objects held by a thread */
	TAILQ_ENTRY(umtx_pi)	pi_link;

	/* List entry in hash */
	TAILQ_ENTRY(umtx_pi)	pi_hashlink;

	/* List for waiters */
	TAILQ_HEAD(,umtx_q)	pi_blocked;

	/* Identify a userland lock object */
	struct umtx_key		pi_key;
};

/* A waiter on a userland synchronization object. */
struct umtx_q {
	/* Linked list for the hash. */
	TAILQ_ENTRY(umtx_q)	uq_link;

	/* Umtx key. */
	struct umtx_key		uq_key;

	/* Umtx flags. */
	int			uq_flags;
#define UQF_UMTXQ	0x0001

	/* The thread that is waiting. */
	struct thread		*uq_thread;

	/*
	 * Blocked-on PI mutex.  Readers may hold either the chain lock
	 * or umtx_lock; writers must hold both the chain lock and
	 * umtx_lock.
	 */
	struct umtx_pi		*uq_pi_blocked;

	/* On blocked list */
	TAILQ_ENTRY(umtx_q)	uq_lockq;

	/* PI mutexes owned by us that other threads contend for */
	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;

	/* Inherited priority from PP mutex */
	u_char			uq_inherited_pri;

	/* Spare queue ready to be reused */
	struct umtxq_queue	*uq_spare_queue;

	/* The queue we are on */
	struct umtxq_queue	*uq_cur_queue;
};

TAILQ_HEAD(umtxq_head, umtx_q);

/* Per-key wait-queue */
struct umtxq_queue {
	struct umtxq_head	head;
	struct umtx_key		key;
	LIST_ENTRY(umtxq_queue)	link;
	int			length;
};

LIST_HEAD(umtxq_list, umtxq_queue);

/* Userland lock object's wait-queue chain */
struct umtxq_chain {
	/* Lock for this chain. */
	struct mtx		uc_lock;

	/* List of sleep queues. */
	struct umtxq_list	uc_queue[2];
#define UMTX_SHARED_QUEUE	0
#define UMTX_EXCLUSIVE_QUEUE	1

	LIST_HEAD(, umtxq_queue) uc_spare_queue;

	/* Busy flag */
	char			uc_busy;

	/* Chain lock waiters */
	int			uc_waiters;

	/* All PIs in the list */
	TAILQ_HEAD(,umtx_pi)	uc_pi_list;

#ifdef UMTX_PROFILING
	u_int 			length;
	u_int			max_length;
#endif
};

#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT((uc)->uc_busy != 0, ("umtx chain is not busy"))

/*
 * Don't propagate time-sharing priority; there is a security reason.
 * A user could otherwise create a PI mutex, have thread A lock it, and
 * have another thread B block on it.  Because B is sleeping, its
 * priority would be boosted, which would boost A's priority through
 * priority propagation as well; A's priority would then never be
 * lowered, even if it used 100% CPU, which is unfair to other
 * processes.
 */

#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
			 PRI_MAX_TIMESHARE : (td)->td_user_pri)

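/*
 * Editor's note (illustrative, not in the original source): UPRI() is the
 * priority value used for all waiter comparisons below.  It clamps every
 * time-sharing thread to PRI_MAX_TIMESHARE, the weakest time-share
 * priority, so two time-sharing waiters always compare equal and only
 * threads outside the time-share range (e.g., real-time) are actually
 * ordered and propagated by their own priority.
 */
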
#define	GOLDEN_RATIO_PRIME	2654404609U
#define	UMTX_CHAINS		512
#define	UMTX_SHIFTS		(__WORD_BIT - 9)

#define	GET_SHARE(flags)	\
    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)

#define BUSY_SPINS		200

struct abs_timeout {
	int clockid;
	struct timespec cur;
	struct timespec end;
};

static uma_zone_t		umtx_pi_zone;
static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
static int			umtx_pi_allocated;

static SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
    &umtx_pi_allocated, 0, "Allocated umtx_pi");

#ifdef UMTX_PROFILING
static long max_length;
SYSCTL_LONG(_debug_umtx, OID_AUTO, max_length, CTLFLAG_RD, &max_length, 0, "max_length");
static SYSCTL_NODE(_debug_umtx, OID_AUTO, chains, CTLFLAG_RD, 0, "umtx chain stats");
#endif

static void umtxq_sysinit(void *);
static void umtxq_hash(struct umtx_key *key);
static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
static void umtxq_lock(struct umtx_key *key);
static void umtxq_unlock(struct umtx_key *key);
static void umtxq_busy(struct umtx_key *key);
static void umtxq_unbusy(struct umtx_key *key);
static void umtxq_insert_queue(struct umtx_q *uq, int q);
static void umtxq_remove_queue(struct umtx_q *uq, int q);
static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
static int umtxq_count(struct umtx_key *key);
static struct umtx_pi *umtx_pi_alloc(int);
static void umtx_pi_free(struct umtx_pi *pi);
static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
static void umtx_thread_cleanup(struct thread *td);
static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
	struct image_params *imgp __unused);
SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);

#define umtxq_signal(key, nwake)	umtxq_signal_queue((key), (nwake), UMTX_SHARED_QUEUE)
#define umtxq_insert(uq)	umtxq_insert_queue((uq), UMTX_SHARED_QUEUE)
#define umtxq_remove(uq)	umtxq_remove_queue((uq), UMTX_SHARED_QUEUE)

static struct mtx umtx_lock;

#ifdef UMTX_PROFILING
static void
umtx_init_profiling(void)
{
	struct sysctl_oid *chain_oid;
	char chain_name[10];
	int i;

	for (i = 0; i < UMTX_CHAINS; ++i) {
		snprintf(chain_name, sizeof(chain_name), "%d", i);
		chain_oid = SYSCTL_ADD_NODE(NULL,
		    SYSCTL_STATIC_CHILDREN(_debug_umtx_chains), OID_AUTO,
		    chain_name, CTLFLAG_RD, NULL, "umtx hash stats");
		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
		    "max_length0", CTLFLAG_RD, &umtxq_chains[0][i].max_length, 0, NULL);
		SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(chain_oid), OID_AUTO,
		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
	}
}

static int
sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
{
	char buf[512];
	struct sbuf sb;
	struct umtxq_chain *uc;
	u_int fract, i, j, tot, whole;
	u_int sf0, sf1, sf2, sf3, sf4;
	u_int si0, si1, si2, si3, si4;
	u_int sw0, sw1, sw2, sw3, sw4;

	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
	for (i = 0; i < 2; i++) {
		tot = 0;
		for (j = 0; j < UMTX_CHAINS; ++j) {
			uc = &umtxq_chains[i][j];
			mtx_lock(&uc->uc_lock);
			tot += uc->max_length;
			mtx_unlock(&uc->uc_lock);
		}
		if (tot == 0)
			sbuf_printf(&sb, "%u) Empty ", i);
		else {
			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
			si0 = si1 = si2 = si3 = si4 = 0;
			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
			for (j = 0; j < UMTX_CHAINS; j++) {
				uc = &umtxq_chains[i][j];
				mtx_lock(&uc->uc_lock);
				whole = uc->max_length * 100;
				mtx_unlock(&uc->uc_lock);
				fract = (whole % tot) * 100;
				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
					sf0 = fract;
					si0 = j;
					sw0 = whole;
				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
				    sf1)) {
					sf1 = fract;
					si1 = j;
					sw1 = whole;
				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
				    sf2)) {
					sf2 = fract;
					si2 = j;
					sw2 = whole;
				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
				    sf3)) {
					sf3 = fract;
					si3 = j;
					sw3 = whole;
				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
				    sf4)) {
					sf4 = fract;
					si4 = j;
					sw4 = whole;
				}
			}
			sbuf_printf(&sb, "queue %u:\n", i);
			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
			    sf0 / tot, si0);
			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
			    sf1 / tot, si1);
			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
			    sf2 / tot, si2);
			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
			    sf3 / tot, si3);
			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
			    sf4 / tot, si4);
		}
	}
	sbuf_trim(&sb);
	sbuf_finish(&sb);
	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
	sbuf_delete(&sb);
	return (0);
}

static int
sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
{
	struct umtxq_chain *uc;
	u_int i, j;
	int clear, error;

	clear = 0;
	error = sysctl_handle_int(oidp, &clear, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);

	if (clear != 0) {
		for (i = 0; i < 2; ++i) {
			for (j = 0; j < UMTX_CHAINS; ++j) {
				uc = &umtxq_chains[i][j];
				mtx_lock(&uc->uc_lock);
				uc->length = 0;
				uc->max_length = 0;
				mtx_unlock(&uc->uc_lock);
			}
		}
	}
	return (0);
}

SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
    sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
    sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
#endif

static void
umtxq_sysinit(void *arg __unused)
{
	int i, j;

	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	for (i = 0; i < 2; ++i) {
		for (j = 0; j < UMTX_CHAINS; ++j) {
			mtx_init(&umtxq_chains[i][j].uc_lock, "umtxql", NULL,
				 MTX_DEF | MTX_DUPOK);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[0]);
			LIST_INIT(&umtxq_chains[i][j].uc_queue[1]);
			LIST_INIT(&umtxq_chains[i][j].uc_spare_queue);
			TAILQ_INIT(&umtxq_chains[i][j].uc_pi_list);
			umtxq_chains[i][j].uc_busy = 0;
			umtxq_chains[i][j].uc_waiters = 0;
#ifdef UMTX_PROFILING
			umtxq_chains[i][j].length = 0;
			umtxq_chains[i][j].max_length = 0;
#endif
		}
	}
#ifdef UMTX_PROFILING
	umtx_init_profiling();
#endif
	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
	    EVENTHANDLER_PRI_ANY);
}

struct umtx_q *
umtxq_alloc(void)
{
	struct umtx_q *uq;

	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
	uq->uq_spare_queue = malloc(sizeof(struct umtxq_queue), M_UMTX, M_WAITOK | M_ZERO);
	TAILQ_INIT(&uq->uq_spare_queue->head);
	TAILQ_INIT(&uq->uq_pi_contested);
	uq->uq_inherited_pri = PRI_MAX;
	return (uq);
}

void
umtxq_free(struct umtx_q *uq)
{
	MPASS(uq->uq_spare_queue != NULL);
	free(uq->uq_spare_queue, M_UMTX);
	free(uq, M_UMTX);
}

static inline void
umtxq_hash(struct umtx_key *key)
{
	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
}

static inline struct umtxq_chain *
umtxq_getchain(struct umtx_key *key)
{
	if (key->type <= TYPE_SEM)
		return (&umtxq_chains[1][key->hash]);
	return (&umtxq_chains[0][key->hash]);
}

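/*
 * Editor's note (illustrative, not in the original source): umtxq_hash()
 * is Fibonacci-style multiplicative hashing.  Multiplying by
 * GOLDEN_RATIO_PRIME scrambles every bit of the key into the 32-bit
 * product, and shifting right by UMTX_SHIFTS (__WORD_BIT - 9) keeps the
 * 9 high-order bits, which index one of the UMTX_CHAINS (512 == 2^9)
 * chains; the final "% UMTX_CHAINS" is then a no-op safety net.  The
 * high bits are used because they depend on all bits of the input.
 */
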
/*
 * Lock a chain.
 */
static inline void
umtxq_lock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_lock(&uc->uc_lock);
}

/*
 * Unlock a chain.
 */
static inline void
umtxq_unlock(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_unlock(&uc->uc_lock);
}

/*
 * Set the chain to the busy state when a following operation
 * may block (a kernel mutex cannot be used).
 */
static inline void
umtxq_busy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	if (uc->uc_busy) {
#ifdef SMP
		if (smp_cpus > 1) {
			int count = BUSY_SPINS;
			if (count > 0) {
				umtxq_unlock(key);
				while (uc->uc_busy && --count > 0)
					cpu_spinwait();
				umtxq_lock(key);
			}
		}
#endif
		while (uc->uc_busy) {
			uc->uc_waiters++;
			msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
			uc->uc_waiters--;
		}
	}
	uc->uc_busy = 1;
}

/*
 * Unbusy a chain.
 */
static inline void
umtxq_unbusy(struct umtx_key *key)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	mtx_assert(&uc->uc_lock, MA_OWNED);
	KASSERT(uc->uc_busy != 0, ("not busy"));
	uc->uc_busy = 0;
	if (uc->uc_waiters)
		wakeup_one(uc);
}

static inline void
umtxq_unbusy_unlocked(struct umtx_key *key)
{

	umtxq_lock(key);
	umtxq_unbusy(key);
	umtxq_unlock(key);
}

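/*
 * Editor's note (illustrative, not in the original source): the busy flag
 * acts as a sleepable lock layered over the chain mutex, keeping a chain
 * stable across operations that may fault or sleep.  The typical pattern
 * in the lock routines below is:
 *
 *	umtxq_lock(&key);
 *	umtxq_busy(&key);	(may sleep; chain mutex dropped/retaken)
 *	umtxq_unlock(&key);
 *	... touch userland memory, which can fault or sleep ...
 *	umtxq_lock(&key);
 *	umtxq_unbusy(&key);
 *	umtxq_unlock(&key);
 */
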
static struct umtxq_queue *
umtxq_queue_lookup(struct umtx_key *key, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	LIST_FOREACH(uh, &uc->uc_queue[q], link) {
		if (umtx_key_match(&uh->key, key))
			return (uh);
	}

	return (NULL);
}

static inline void
umtxq_insert_queue(struct umtx_q *uq, int q)
{
	struct umtxq_queue *uh;
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT((uq->uq_flags & UQF_UMTXQ) == 0, ("umtx_q is already on queue"));
	uh = umtxq_queue_lookup(&uq->uq_key, q);
	if (uh != NULL) {
		LIST_INSERT_HEAD(&uc->uc_spare_queue, uq->uq_spare_queue, link);
	} else {
		uh = uq->uq_spare_queue;
		uh->key = uq->uq_key;
		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
#ifdef UMTX_PROFILING
		uc->length++;
		if (uc->length > uc->max_length) {
			uc->max_length = uc->length;
			if (uc->max_length > max_length)
				max_length = uc->max_length;
		}
#endif
	}
	uq->uq_spare_queue = NULL;

	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
	uh->length++;
	uq->uq_flags |= UQF_UMTXQ;
	uq->uq_cur_queue = uh;
	return;
}

static inline void
umtxq_remove_queue(struct umtx_q *uq, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	if (uq->uq_flags & UQF_UMTXQ) {
		uh = uq->uq_cur_queue;
		TAILQ_REMOVE(&uh->head, uq, uq_link);
		uh->length--;
		uq->uq_flags &= ~UQF_UMTXQ;
		if (TAILQ_EMPTY(&uh->head)) {
			KASSERT(uh->length == 0,
			    ("inconsistent umtxq_queue length"));
#ifdef UMTX_PROFILING
			uc->length--;
#endif
			LIST_REMOVE(uh, link);
		} else {
			uh = LIST_FIRST(&uc->uc_spare_queue);
			KASSERT(uh != NULL, ("uc_spare_queue is empty"));
			LIST_REMOVE(uh, link);
		}
		uq->uq_spare_queue = uh;
		uq->uq_cur_queue = NULL;
	}
}

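/*
 * Editor's note (illustrative, not in the original source): the spare-queue
 * dance above keeps queue management allocation-free on the hot path.
 * Every umtx_q arrives owning one umtxq_queue.  The first waiter on a key
 * donates its spare as the key's queue; later waiters park their spares on
 * the chain's uc_spare_queue list.  On removal, the last waiter leaves with
 * the now-empty key queue, while earlier leavers take one from the spare
 * list, so every thread always departs owning exactly one queue.
 */
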
/*
 * Count the number of waiters on a key.
 */
static int
umtxq_count(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL)
		return (uh->length);
	return (0);
}

/*
 * Count the number of PI waiters and return the first waiter.
 */
static int
umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;

	*first = NULL;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, UMTX_SHARED_QUEUE);
	if (uh != NULL) {
		*first = TAILQ_FIRST(&uh->head);
		return (uh->length);
	}
	return (0);
}

static int
umtxq_check_susp(struct thread *td)
{
	struct proc *p;
	int error;

	/*
	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
	 * eventually break the lockstep loop.
	 */
	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
		return (0);
	error = 0;
	p = td->td_proc;
	PROC_LOCK(p);
	if (P_SHOULDSTOP(p) ||
	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
		if (p->p_flag & P_SINGLE_EXIT)
			error = EINTR;
		else
			error = ERESTART;
	}
	PROC_UNLOCK(p);
	return (error);
}

/*
 * Wake up threads waiting on a userland object.
 */

static int
umtxq_signal_queue(struct umtx_key *key, int n_wake, int q)
{
	struct umtxq_chain *uc;
	struct umtxq_queue *uh;
	struct umtx_q *uq;
	int ret;

	ret = 0;
	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);
	uh = umtxq_queue_lookup(key, q);
	if (uh != NULL) {
		while ((uq = TAILQ_FIRST(&uh->head)) != NULL) {
			umtxq_remove_queue(uq, q);
			wakeup(uq);
			if (++ret >= n_wake)
				return (ret);
		}
	}
	return (ret);
}


/*
 * Wake up the specified thread.
 */
static inline void
umtxq_signal_thread(struct umtx_q *uq)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	umtxq_remove(uq);
	wakeup(uq);
}

static inline int
tstohz(const struct timespec *tsp)
{
	struct timeval tv;

	TIMESPEC_TO_TIMEVAL(&tv, tsp);
	return tvtohz(&tv);
}

static void
abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
	const struct timespec *timeout)
{

	timo->clockid = clockid;
	if (!absolute) {
		kern_clock_gettime(curthread, clockid, &timo->end);
		timo->cur = timo->end;
		timespecadd(&timo->end, timeout);
	} else {
		timo->end = *timeout;
		kern_clock_gettime(curthread, clockid, &timo->cur);
	}
}

static void
abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
{

	abs_timeout_init(timo, umtxtime->_clockid,
		(umtxtime->_flags & UMTX_ABSTIME) != 0,
		&umtxtime->_timeout);
}

static inline void
abs_timeout_update(struct abs_timeout *timo)
{
	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
}

static int
abs_timeout_gethz(struct abs_timeout *timo)
{
	struct timespec tts;

	if (timespeccmp(&timo->end, &timo->cur, <=))
		return (-1);
	tts = timo->end;
	timespecsub(&tts, &timo->cur);
	return (tstohz(&tts));
}

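/*
 * Editor's note (illustrative, not in the original source): struct
 * abs_timeout normalizes every user-supplied timeout to an absolute
 * deadline ("end") on the requested clock; a relative timeout of 50 ms,
 * for instance, becomes end = now + 50 ms at init time.  Sleep loops then
 * call abs_timeout_update() after each wakeup and abs_timeout_gethz() to
 * get the remaining time in ticks, where -1 means the deadline has
 * already passed, so a thread woken early never oversleeps its deadline.
 */
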
/*
 * Put the thread into a sleep state; before sleeping, check whether
 * the thread was removed from the umtx queue.
 */
static inline int
umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
{
	struct umtxq_chain *uc;
	int error, timo;

	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	for (;;) {
		if (!(uq->uq_flags & UQF_UMTXQ))
			return (0);
		if (abstime != NULL) {
			timo = abs_timeout_gethz(abstime);
			if (timo < 0)
				return (ETIMEDOUT);
		} else
			timo = 0;
		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
		if (error != EWOULDBLOCK) {
			umtxq_lock(&uq->uq_key);
			break;
		}
		if (abstime != NULL)
			abs_timeout_update(abstime);
		umtxq_lock(&uq->uq_key);
	}
	return (error);
}

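/*
 * Editor's note (illustrative, not in the original source): the msleep()
 * above uses PCATCH so signals abort the sleep (EINTR/ERESTART) and PDROP
 * so the chain mutex is left unlocked on return; the loop retakes the
 * chain lock itself.  EWOULDBLOCK only means the tick-granular timeout
 * fired, which can happen before the absolute deadline is truly past, so
 * the loop refreshes the current time and retries until
 * abs_timeout_gethz() confirms expiry with ETIMEDOUT.
 */
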
/*
 * Convert a userspace address into a unique (logical) key.
 */
int
umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
{
	struct thread *td = curthread;
	vm_map_t map;
	vm_map_entry_t entry;
	vm_pindex_t pindex;
	vm_prot_t prot;
	boolean_t wired;

	key->type = type;
	if (share == THREAD_SHARE) {
		key->shared = 0;
		key->info.private.vs = td->td_proc->p_vmspace;
		key->info.private.addr = (uintptr_t)addr;
	} else {
		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
		map = &td->td_proc->p_vmspace->vm_map;
		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
		    &entry, &key->info.shared.object, &pindex, &prot,
		    &wired) != KERN_SUCCESS) {
			return (EFAULT);
		}

		if ((share == PROCESS_SHARE) ||
		    (share == AUTO_SHARE &&
		     VM_INHERIT_SHARE == entry->inheritance)) {
			key->shared = 1;
			key->info.shared.offset = entry->offset + entry->start -
				(vm_offset_t)addr;
			vm_object_reference(key->info.shared.object);
		} else {
			key->shared = 0;
			key->info.private.vs = td->td_proc->p_vmspace;
			key->info.private.addr = (uintptr_t)addr;
		}
		vm_map_lookup_done(map, entry);
	}

	umtxq_hash(key);
	return (0);
}

/*
 * Release a key.
 */
void
umtx_key_release(struct umtx_key *key)
{
	if (key->shared)
		vm_object_deallocate(key->info.shared.object);
}

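/*
 * Editor's note (illustrative, not in the original source): a private key
 * identifies the lock by (vmspace, virtual address) and so only matches
 * waiters within one process, while a shared key identifies it by
 * (VM object, offset) and matches any process mapping the same backing
 * object.  AUTO_SHARE picks between the two based on whether the mapping
 * is inherited as shared.  The shared case takes a reference on the VM
 * object, which is why every successful umtx_key_get() must be paired
 * with umtx_key_release().
 */
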
/*
 * Lock a umtx object.
 */
static int
do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
	const struct timespec *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	u_long owner;
	u_long old;
	int error = 0;

	uq = td->td_umtxq;
	if (timeout != NULL)
		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMTX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMTX_CONTESTED) {
			owner = casuword(&umtx->u_owner,
			    UMTX_CONTESTED, id | UMTX_CONTESTED);

			if (owner == UMTX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			error = umtxq_check_susp(td);
			if (error != 0)
				break;

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
			    &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);

		if (error == 0)
			error = umtxq_check_susp(td);
	}

	if (timeout == NULL) {
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		/* Timed locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

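/*
 * Editor's note (illustrative, not in the original source): the loop above
 * is the classic three-state futex-style protocol.  The owner word is
 * UMTX_UNOWNED, <tid> (held, uncontested), or <tid>|UMTX_CONTESTED.  A
 * would-be sleeper first enqueues itself, then CASes the contested bit
 * into the word; only if that CAS succeeds does it sleep.  Any failed CAS
 * means the word changed underneath us, so the state machine restarts
 * from the top rather than trusting a stale value.
 */
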
/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
{
	struct umtx_key key;
	u_long owner;
	u_long old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMTX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMTX_CONTESTED) == 0) {
		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword(&umtx->u_owner, owner,
		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

#ifdef COMPAT_FREEBSD32

/*
 * Lock a umtx object.
 */
static int
do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
	const struct timespec *timeout)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t owner;
	uint32_t old;
	int error = 0;

	uq = td->td_umtxq;

	if (timeout != NULL)
		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		owner = casuword32(m, UMUTEX_UNOWNED, id);

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED)
			return (0);

		/* The address was invalid. */
		if (owner == -1)
			return (EFAULT);

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			owner = casuword32(m,
			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
			if (owner == UMUTEX_CONTESTED)
				return (0);

			/* The address was invalid. */
			if (owner == -1)
				return (EFAULT);

			error = umtxq_check_susp(td);
			if (error != 0)
				break;

			/* If this failed the lock has changed, restart. */
			continue;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
			AUTO_SHARE, &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unbusy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (old == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtx", timeout == NULL ?
			    NULL : &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);

		if (error == 0)
			error = umtxq_check_susp(td);
	}

	if (timeout == NULL) {
		/* Mutex locking is restarted if it is interrupted. */
		if (error == EINTR)
			error = ERESTART;
	} else {
		/* Timed locking is not restarted. */
		if (error == ERESTART)
			error = EINTR;
	}
	return (error);
}

/*
 * Unlock a umtx object.
 */
static int
do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t old;
	int error;
	int count;

	/*
	 * Make sure we own this mtx.
	 */
	owner = fuword32(m);
	if (owner == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	/* This should be done in userland */
	if ((owner & UMUTEX_CONTESTED) == 0) {
		old = casuword32(m, owner, UMUTEX_UNOWNED);
		if (old == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
		&key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	old = casuword32(m, owner,
		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (old == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}
#endif

/*
 * Fetch and compare a value; sleep on the address if the value
 * is unchanged.
 */
static int
do_wait(struct thread *td, void *addr, u_long id,
	struct _umtx_time *timeout, int compat32, int is_private)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	u_long tmp;
	uint32_t tmp32;
	int error = 0;

	uq = td->td_umtxq;
	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	umtxq_lock(&uq->uq_key);
	umtxq_insert(uq);
	umtxq_unlock(&uq->uq_key);
	if (compat32 == 0) {
		error = fueword(addr, &tmp);
		if (error != 0)
			error = EFAULT;
	} else {
		error = fueword32(addr, &tmp32);
		if (error == 0)
			tmp = tmp32;
		else
			error = EFAULT;
	}
	umtxq_lock(&uq->uq_key);
	if (error == 0) {
		if (tmp == id)
			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
			    NULL : &timo);
		if ((uq->uq_flags & UQF_UMTXQ) == 0)
			error = 0;
		else
			umtxq_remove(uq);
	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
		umtxq_remove(uq);
	}
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	if (error == ERESTART)
		error = EINTR;
	return (error);
}

/*
 * Wake up threads sleeping on the specified address.
 */
int
kern_umtx_wake(struct thread *td, void *uaddr, int n_wake, int is_private)
{
	struct umtx_key key;
	int ret;

	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT,
		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
		return (ret);
	umtxq_lock(&key);
	ret = umtxq_signal(&key, n_wake);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (0);
}

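/*
 * Editor's note (illustrative, not in the original source): do_wait()
 * above enqueues the thread *before* reading the user word.  That
 * ordering is what makes the wait/wake pair race-free: if another thread
 * changes the value and calls the wake path in between, the waiter is
 * already visible on the queue and gets woken, so the value re-check
 * ("tmp == id") can never miss an update.  This is the same lost-wakeup
 * discipline used by Linux futexes.
 */
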
/*
 * Lock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
	struct _umtx_time *timeout, int mode)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	uint32_t owner, old, id;
	int error, rv;

	id = td->td_tid;
	uq = td->td_umtxq;
	error = 0;
	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		rv = fueword32(&m->m_owner, &owner);
		if (rv == -1)
			return (EFAULT);
		if (mode == _UMUTEX_WAIT) {
			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
				return (0);
		} else {
			/*
			 * Try the uncontested case.  This should be done in userland.
			 */
			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
			    &owner, id);
			/* The address was invalid. */
			if (rv == -1)
				return (EFAULT);

			/* The acquire succeeded. */
			if (owner == UMUTEX_UNOWNED)
				return (0);

			/* If no one owns it but it is contested try to acquire it. */
			if (owner == UMUTEX_CONTESTED) {
				rv = casueword32(&m->m_owner,
				    UMUTEX_CONTESTED, &owner,
				    id | UMUTEX_CONTESTED);
				/* The address was invalid. */
				if (rv == -1)
					return (EFAULT);

				if (owner == UMUTEX_CONTESTED)
					return (0);

				rv = umtxq_check_susp(td);
				if (rv != 0)
					return (rv);

				/* If this failed the lock has changed, restart. */
				continue;
			}
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id)
			return (EDEADLK);

		if (mode == _UMUTEX_TRY)
			return (EBUSY);

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			return (error);

		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
		    GET_SHARE(flags), &uq->uq_key)) != 0)
			return (error);

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_insert(uq);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		rv = casueword32(&m->m_owner, owner, &old,
		    owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (rv == -1) {
			umtxq_lock(&uq->uq_key);
			umtxq_remove(uq);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			umtx_key_release(&uq->uq_key);
			return (EFAULT);
		}

		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		umtxq_lock(&uq->uq_key);
		umtxq_unbusy(&uq->uq_key);
		if (old == owner)
			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
			    NULL : &timo);
		umtxq_remove(uq);
		umtxq_unlock(&uq->uq_key);
		umtx_key_release(&uq->uq_key);

		if (error == 0)
			error = umtxq_check_susp(td);
	}

	return (0);
}

/*
 * Unlock a PTHREAD_PRIO_NONE protocol POSIX mutex.
 */
static int
do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old, id;
	int error;
	int count;

	id = td->td_tid;
	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(&m->m_owner, &owner);
	if (error == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != id)
		return (EPERM);

	if ((owner & UMUTEX_CONTESTED) == 0) {
		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
		if (error == -1)
			return (EFAULT);
		if (old == owner)
			return (0);
		owner = old;
	}

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	/*
	 * When unlocking the umtx, it must be marked as unowned if
	 * zero or one thread is waiting for it.  Otherwise, it must
	 * be marked as contested.
	 */
	error = casueword32(&m->m_owner, owner, &old,
	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
	umtxq_lock(&key);
	umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	if (error == -1)
		return (EFAULT);
	if (old != owner)
		return (EINVAL);
	return (0);
}

/*
 * If the mutex is available, wake up a waiter; for simple
 * (non-PI, non-PP) mutexes only.
 */
static int
do_wake_umutex(struct thread *td, struct umutex *m)
{
	struct umtx_key key;
	uint32_t owner;
	uint32_t flags;
	int error;
	int count;

	error = fueword32(&m->m_owner, &owner);
	if (error == -1)
		return (EFAULT);

	if ((owner & ~UMUTEX_CONTESTED) != 0)
		return (0);

	error = fueword32(&m->m_flags, &flags);
	if (error == -1)
		return (EFAULT);

	/* We should only ever be in here for contested locks */
	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);

	if (count <= 1) {
		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
		    UMUTEX_UNOWNED);
		if (error == -1)
			error = EFAULT;
	}

	umtxq_lock(&key);
	if (error == 0 && count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (error);
}

/*
 * Check if the mutex has waiters and try to fix the contested bit.
 */
static int
do_wake2_umutex(struct thread *td, struct umutex *m, uint32_t flags)
{
	struct umtx_key key;
	uint32_t owner, old;
	int type;
	int error;
	int count;

	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
	case 0:
		type = TYPE_NORMAL_UMUTEX;
		break;
	case UMUTEX_PRIO_INHERIT:
		type = TYPE_PI_UMUTEX;
		break;
	case UMUTEX_PRIO_PROTECT:
		type = TYPE_PP_UMUTEX;
		break;
	default:
		return (EINVAL);
	}
	if ((error = umtx_key_get(m, type, GET_SHARE(flags),
	    &key)) != 0)
		return (error);

	owner = 0;
	umtxq_lock(&key);
	umtxq_busy(&key);
	count = umtxq_count(&key);
	umtxq_unlock(&key);
	/*
	 * Only repair the contested bit if there is a waiter; that means
	 * the mutex is still being referenced by userland code.
	 * Otherwise, don't update any memory.
	 */
	if (count > 1) {
		error = fueword32(&m->m_owner, &owner);
		if (error == -1)
			error = EFAULT;
		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
			error = casueword32(&m->m_owner, owner, &old,
			    owner | UMUTEX_CONTESTED);
			if (error == -1) {
				error = EFAULT;
				break;
			}
			if (old == owner)
				break;
			owner = old;
			error = umtxq_check_susp(td);
			if (error != 0)
				break;
		}
	} else if (count == 1) {
		error = fueword32(&m->m_owner, &owner);
		if (error == -1)
			error = EFAULT;
		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
		       (owner & UMUTEX_CONTESTED) == 0) {
			error = casueword32(&m->m_owner, owner, &old,
			    owner | UMUTEX_CONTESTED);
			if (error == -1) {
				error = EFAULT;
				break;
			}
			if (old == owner)
				break;
			owner = old;
			error = umtxq_check_susp(td);
			if (error != 0)
				break;
		}
	}
	umtxq_lock(&key);
	if (error == EFAULT) {
		umtxq_signal(&key, INT_MAX);
	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
		umtxq_signal(&key, 1);
	umtxq_unbusy(&key);
	umtxq_unlock(&key);
	umtx_key_release(&key);
	return (error);
}

static inline struct umtx_pi *
umtx_pi_alloc(int flags)
{
	struct umtx_pi *pi;

	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
	TAILQ_INIT(&pi->pi_blocked);
	atomic_add_int(&umtx_pi_allocated, 1);
	return (pi);
}

static inline void
umtx_pi_free(struct umtx_pi *pi)
{
	uma_zfree(umtx_pi_zone, pi);
	atomic_add_int(&umtx_pi_allocated, -1);
}

/*
 * Adjust the thread's position on a pi_state after its priority has been
 * changed.
 */
static int
umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
{
	struct umtx_q *uq, *uq1, *uq2;
	struct thread *td1;

	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi == NULL)
		return (0);

	uq = td->td_umtxq;

	/*
	 * Check if the thread needs to be moved in the blocked chain.
	 * It needs to be moved if either its priority is lower than
	 * the previous thread's or higher than the next thread's.
	 */
	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
	uq2 = TAILQ_NEXT(uq, uq_lockq);
	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
		/*
		 * Remove thread from blocked chain and determine where
		 * it should be moved to.
		 */
		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
			td1 = uq1->uq_thread;
			MPASS(td1->td_proc->p_magic == P_MAGIC);
			if (UPRI(td1) > UPRI(td))
				break;
		}

		if (uq1 == NULL)
			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
		else
			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	}
	return (1);
}

/*
 * Propagate priority when a thread is blocked on a POSIX
 * PI mutex.
 */
static void
umtx_propagate_priority(struct thread *td)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);
	pri = UPRI(td);
	uq = td->td_umtxq;
	pi = uq->uq_pi_blocked;
	if (pi == NULL)
		return;

	for (;;) {
		td = pi->pi_owner;
		if (td == NULL || td == curthread)
			return;

		MPASS(td->td_proc != NULL);
		MPASS(td->td_proc->p_magic == P_MAGIC);

		thread_lock(td);
		if (td->td_lend_user_pri > pri)
			sched_lend_user_prio(td, pri);
		else {
			thread_unlock(td);
			break;
		}
		thread_unlock(td);

		/*
		 * Pick up the lock that td is blocked on.
		 */
		uq = td->td_umtxq;
		pi = uq->uq_pi_blocked;
		if (pi == NULL)
			break;
		/* Resort td on the list if needed. */
		umtx_pi_adjust_thread(pi, td);
	}
}

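/*
 * Editor's note (illustrative, not in the original source): the walk above
 * follows the blocking chain transitively.  Suppose real-time thread A
 * blocks on mutex M1 owned by B, while B is itself blocked on M2 owned by
 * C: the loop first lends A's priority to B, then moves to B's blocked-on
 * PI mutex and lends the same priority to C, and so on until it reaches a
 * running owner or one whose lent priority is already as strong.  Note
 * that priorities compare inverted here: numerically smaller values are
 * stronger.
 */
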
/*
 * Recompute the lent priority for a PI mutex owner when a thread
 * blocked on the mutex is interrupted by a signal or resumed by
 * another thread.
 */
static void
umtx_repropagate_priority(struct umtx_pi *pi)
{
	struct umtx_q *uq, *uq_owner;
	struct umtx_pi *pi2;
	int pri;

	mtx_assert(&umtx_lock, MA_OWNED);

	while (pi != NULL && pi->pi_owner != NULL) {
		pri = PRI_MAX;
		uq_owner = pi->pi_owner->td_umtxq;

		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
			uq = TAILQ_FIRST(&pi2->pi_blocked);
			if (uq != NULL) {
				if (pri > UPRI(uq->uq_thread))
					pri = UPRI(uq->uq_thread);
			}
		}

		if (pri > uq_owner->uq_inherited_pri)
			pri = uq_owner->uq_inherited_pri;
		thread_lock(pi->pi_owner);
		sched_lend_user_prio(pi->pi_owner, pri);
		thread_unlock(pi->pi_owner);
		if ((pi = uq_owner->uq_pi_blocked) != NULL)
			umtx_pi_adjust_thread(pi, uq_owner->uq_thread);
	}
}

/*
 * Insert a PI mutex into the owned list.
 */
static void
umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_assert(&umtx_lock, MA_OWNED);
	if (pi->pi_owner != NULL)
		panic("pi_owner != NULL");
	pi->pi_owner = owner;
	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
}

/*
 * Claim ownership of a PI mutex.
 */
static int
umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
{
	struct umtx_q *uq, *uq_owner;

	uq_owner = owner->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == owner) {
		mtx_unlock_spin(&umtx_lock);
		return (0);
	}

	if (pi->pi_owner != NULL) {
		/*
		 * userland may have already messed up the mutex, sigh.
		 */
		mtx_unlock_spin(&umtx_lock);
		return (EPERM);
	}
	umtx_pi_setowner(pi, owner);
	uq = TAILQ_FIRST(&pi->pi_blocked);
	if (uq != NULL) {
		int pri;

		pri = UPRI(uq->uq_thread);
		thread_lock(owner);
		if (pri < UPRI(owner))
			sched_lend_user_prio(owner, pri);
		thread_unlock(owner);
	}
	mtx_unlock_spin(&umtx_lock);
	return (0);
}

/*
 * Adjust a thread's position in the queue of the PI mutex it is
 * blocked on; this may trigger another round of priority propagation.
 */
void
umtx_pi_adjust(struct thread *td, u_char oldpri)
{
	struct umtx_q *uq;
	struct umtx_pi *pi;

	uq = td->td_umtxq;
	mtx_lock_spin(&umtx_lock);
	/*
	 * Pick up the lock that td is blocked on.
	 */
	pi = uq->uq_pi_blocked;
	if (pi != NULL) {
		umtx_pi_adjust_thread(pi, td);
		umtx_repropagate_priority(pi);
	}
	mtx_unlock_spin(&umtx_lock);
}

/*
 * Sleep on a PI mutex.
 */
static int
umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
{
	struct umtxq_chain *uc;
	struct thread *td, *td1;
	struct umtx_q *uq1;
	int pri;
	int error = 0;

	td = uq->uq_thread;
	KASSERT(td == curthread, ("inconsistent uq_thread"));
	uc = umtxq_getchain(&uq->uq_key);
	UMTXQ_LOCKED_ASSERT(uc);
	UMTXQ_BUSY_ASSERT(uc);
	umtxq_insert(uq);
	mtx_lock_spin(&umtx_lock);
	if (pi->pi_owner == NULL) {
		mtx_unlock_spin(&umtx_lock);
		/* XXX Only look up thread in current process. */
		td1 = tdfind(owner, curproc->p_pid);
		mtx_lock_spin(&umtx_lock);
		if (td1 != NULL) {
			if (pi->pi_owner == NULL)
				umtx_pi_setowner(pi, td1);
			PROC_UNLOCK(td1->td_proc);
		}
	}

	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
		pri = UPRI(uq1->uq_thread);
		if (pri > UPRI(td))
			break;
	}

	if (uq1 != NULL)
		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
	else
		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);

	uq->uq_pi_blocked = pi;
	thread_lock(td);
	td->td_flags |= TDF_UPIBLOCKED;
	thread_unlock(td);
	umtx_propagate_priority(td);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unbusy(&uq->uq_key);

	error = umtxq_sleep(uq, wmesg, timo);
	umtxq_remove(uq);

	mtx_lock_spin(&umtx_lock);
	uq->uq_pi_blocked = NULL;
	thread_lock(td);
	td->td_flags &= ~TDF_UPIBLOCKED;
	thread_unlock(td);
	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
	umtx_repropagate_priority(pi);
	mtx_unlock_spin(&umtx_lock);
	umtxq_unlock(&uq->uq_key);

	return (error);
}

/*
 * Add a reference to a PI mutex.
 */
static void
umtx_pi_ref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	pi->pi_refcount++;
}

/*
 * Drop a reference to a PI mutex; if the reference count reaches
 * zero, its memory is freed.
 */
static void
umtx_pi_unref(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
	if (--pi->pi_refcount == 0) {
		mtx_lock_spin(&umtx_lock);
		if (pi->pi_owner != NULL) {
			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
				pi, pi_link);
			pi->pi_owner = NULL;
		}
		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
			("blocked queue not empty"));
		mtx_unlock_spin(&umtx_lock);
		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
		umtx_pi_free(pi);
	}
}

/*
 * Find a PI mutex in the hash table.
 */
static struct umtx_pi *
umtx_pi_lookup(struct umtx_key *key)
{
	struct umtxq_chain *uc;
	struct umtx_pi *pi;

	uc = umtxq_getchain(key);
	UMTXQ_LOCKED_ASSERT(uc);

	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
		if (umtx_key_match(&pi->pi_key, key)) {
			return (pi);
		}
	}
	return (NULL);
}

/*
 * Insert a PI mutex into the hash table.
 */
static inline void
umtx_pi_insert(struct umtx_pi *pi)
{
	struct umtxq_chain *uc;

	uc = umtxq_getchain(&pi->pi_key);
	UMTXQ_LOCKED_ASSERT(uc);
	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
}

/*
 * Lock a PI mutex.
 */
static int
do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
    struct _umtx_time *timeout, int try)
{
	struct abs_timeout timo;
	struct umtx_q *uq;
	struct umtx_pi *pi, *new_pi;
	uint32_t id, owner, old;
	int error, rv;

	id = td->td_tid;
	uq = td->td_umtxq;

	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
	    &uq->uq_key)) != 0)
		return (error);

	if (timeout != NULL)
		abs_timeout_init2(&timo, timeout);

	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);

	/*
	 * Care must be exercised when dealing with the umtx structure.
	 * It can fault on any access.
	 */
	for (;;) {
		/*
		 * Try the uncontested case.  This should be done in userland.
		 */
		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
		/* The address was invalid. */
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/* The acquire succeeded. */
		if (owner == UMUTEX_UNOWNED) {
			error = 0;
			break;
		}

		/* If no one owns it but it is contested try to acquire it. */
		if (owner == UMUTEX_CONTESTED) {
			rv = casueword32(&m->m_owner,
			    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
			/* The address was invalid. */
			if (rv == -1) {
				error = EFAULT;
				break;
			}

			if (owner == UMUTEX_CONTESTED) {
				umtxq_lock(&uq->uq_key);
				umtxq_busy(&uq->uq_key);
				error = umtx_pi_claim(pi, td);
				umtxq_unbusy(&uq->uq_key);
				umtxq_unlock(&uq->uq_key);
				break;
			}

			error = umtxq_check_susp(td);
			if (error != 0)
				break;

			/* If this failed the lock has changed, restart. */
			continue;
		}

		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
		    (owner & ~UMUTEX_CONTESTED) == id) {
			error = EDEADLK;
			break;
		}

		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space
		 * knows to use the system call for unlock.  If this fails
		 * either someone else has acquired the lock or it has been
		 * released.
		 */
		rv = casueword32(&m->m_owner, owner, &old,
		    owner | UMUTEX_CONTESTED);

		/* The address was invalid. */
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}

		umtxq_lock(&uq->uq_key);
		/*
		 * If we set the contested bit, sleep.  Otherwise the lock
		 * changed and we need to retry, or we lost a race to the
		 * thread unlocking the umtx.
		 */
		if (old == owner) {
			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
			    "umtxpi", timeout == NULL ? NULL : &timo);
			if (error != 0)
				continue;
		} else {
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
		}

		error = umtxq_check_susp(td);
		if (error != 0)
			break;
	}

	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);

	umtx_key_release(&uq->uq_key);
	return (error);
}

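/*
 * Editor's note (illustrative, not in the original source): the umtx_pi
 * allocation above uses the common unlock-and-retry idiom: first try an
 * M_NOWAIT allocation under the chain lock; if that fails, drop the lock,
 * allocate with M_WAITOK (which may sleep), retake the lock, and redo the
 * lookup, since another thread may have inserted the umtx_pi while the
 * lock was dropped.
 */
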
2130/*
2131 * Unlock a PI mutex.
2132 */
2133static int
2134do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
2135{
2136	struct umtx_key key;
2137	struct umtx_q *uq_first, *uq_first2, *uq_me;
2138	struct umtx_pi *pi, *pi2;
2139	uint32_t owner, old, id;
2140	int error;
2141	int count;
2142	int pri;
2143
2144	id = td->td_tid;
2145	/*
2146	 * Make sure we own this mtx.
2147	 */
2148	error = fueword32(&m->m_owner, &owner);
2149	if (error == -1)
2150		return (EFAULT);
2151
2152	if ((owner & ~UMUTEX_CONTESTED) != id)
2153		return (EPERM);
2154
2155	/* This should be done in userland */
2156	if ((owner & UMUTEX_CONTESTED) == 0) {
2157		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
2158		if (error == -1)
2159			return (EFAULT);
2160		if (old == owner)
2161			return (0);
2162		owner = old;
2163	}
2164
2165	/* We should only ever be in here for contested locks */
2166	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
2167	    &key)) != 0)
2168		return (error);
2169
2170	umtxq_lock(&key);
2171	umtxq_busy(&key);
2172	count = umtxq_count_pi(&key, &uq_first);
2173	if (uq_first != NULL) {
2174		mtx_lock_spin(&umtx_lock);
2175		pi = uq_first->uq_pi_blocked;
2176		KASSERT(pi != NULL, ("pi == NULL?"));
2177		if (pi->pi_owner != curthread) {
2178			mtx_unlock_spin(&umtx_lock);
2179			umtxq_unbusy(&key);
2180			umtxq_unlock(&key);
2181			umtx_key_release(&key);
2182			/* Userland messed up the mutex. */
2183			return (EPERM);
2184		}
2185		uq_me = curthread->td_umtxq;
2186		pi->pi_owner = NULL;
2187		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
2188		/* Get the highest-priority thread that is still sleeping. */
2189		uq_first = TAILQ_FIRST(&pi->pi_blocked);
2190		while (uq_first != NULL &&
2191		       (uq_first->uq_flags & UQF_UMTXQ) == 0) {
2192			uq_first = TAILQ_NEXT(uq_first, uq_lockq);
2193		}
2194		pri = PRI_MAX;
2195		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
2196			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
2197			if (uq_first2 != NULL) {
2198				if (pri > UPRI(uq_first2->uq_thread))
2199					pri = UPRI(uq_first2->uq_thread);
2200			}
2201		}
2202		thread_lock(curthread);
2203		sched_lend_user_prio(curthread, pri);
2204		thread_unlock(curthread);
2205		mtx_unlock_spin(&umtx_lock);
2206		if (uq_first)
2207			umtxq_signal_thread(uq_first);
2208	}
2209	umtxq_unlock(&key);
2210
2211	/*
2212	 * When unlocking the umtx, it must be marked as unowned if
2213	 * no more than one thread is waiting for it.
2214	 * Otherwise, it must be marked as contested.
2215	 */
2216	error = casueword32(&m->m_owner, owner, &old,
2217	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
2218
2219	umtxq_unbusy_unlocked(&key);
2220	umtx_key_release(&key);
2221	if (error == -1)
2222		return (EFAULT);
2223	if (old != owner)
2224		return (EINVAL);
2225	return (0);
2226}
2227
2228/*
2229 * Lock a PP mutex.
2230 */
2231static int
2232do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
2233    struct _umtx_time *timeout, int try)
2234{
2235	struct abs_timeout timo;
2236	struct umtx_q *uq, *uq2;
2237	struct umtx_pi *pi;
2238	uint32_t ceiling;
2239	uint32_t owner, id;
2240	int error, pri, old_inherited_pri, su, rv;
2241
2242	id = td->td_tid;
2243	uq = td->td_umtxq;
2244	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2245	    &uq->uq_key)) != 0)
2246		return (error);
2247
2248	if (timeout != NULL)
2249		abs_timeout_init2(&timo, timeout);
2250
2251	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2252	for (;;) {
2253		old_inherited_pri = uq->uq_inherited_pri;
2254		umtxq_lock(&uq->uq_key);
2255		umtxq_busy(&uq->uq_key);
2256		umtxq_unlock(&uq->uq_key);
2257
2258		rv = fueword32(&m->m_ceilings[0], &ceiling);
2259		if (rv == -1) {
2260			error = EFAULT;
2261			goto out;
2262		}
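		/*
		 * Map the userland ceiling (an RTP priority) onto a kernel
		 * priority.  An out-of-range ceiling makes the unsigned
		 * subtraction wrap and is rejected by the check below.
		 */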
2263		ceiling = RTP_PRIO_MAX - ceiling;
2264		if (ceiling > RTP_PRIO_MAX) {
2265			error = EINVAL;
2266			goto out;
2267		}
2268
2269		mtx_lock_spin(&umtx_lock);
2270		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
2271			mtx_unlock_spin(&umtx_lock);
2272			error = EINVAL;
2273			goto out;
2274		}
2275		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
2276			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
2277			thread_lock(td);
2278			if (uq->uq_inherited_pri < UPRI(td))
2279				sched_lend_user_prio(td, uq->uq_inherited_pri);
2280			thread_unlock(td);
2281		}
2282		mtx_unlock_spin(&umtx_lock);
2283
2284		rv = casueword32(&m->m_owner,
2285		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2286		/* The address was invalid. */
2287		if (rv == -1) {
2288			error = EFAULT;
2289			break;
2290		}
2291
2292		if (owner == UMUTEX_CONTESTED) {
2293			error = 0;
2294			break;
2295		}
2296
2297		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
2298		    (owner & ~UMUTEX_CONTESTED) == id) {
2299			error = EDEADLK;
2300			break;
2301		}
2302
2303		if (try != 0) {
2304			error = EBUSY;
2305			break;
2306		}
2307
2308		/*
2309		 * If we caught a signal, we have already retried;
2310		 * exit immediately.
2311		 */
2312		if (error != 0)
2313			break;
2314
2315		umtxq_lock(&uq->uq_key);
2316		umtxq_insert(uq);
2317		umtxq_unbusy(&uq->uq_key);
2318		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
2319		    NULL : &timo);
2320		umtxq_remove(uq);
2321		umtxq_unlock(&uq->uq_key);
2322
2323		mtx_lock_spin(&umtx_lock);
2324		uq->uq_inherited_pri = old_inherited_pri;
2325		pri = PRI_MAX;
2326		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2327			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2328			if (uq2 != NULL) {
2329				if (pri > UPRI(uq2->uq_thread))
2330					pri = UPRI(uq2->uq_thread);
2331			}
2332		}
2333		if (pri > uq->uq_inherited_pri)
2334			pri = uq->uq_inherited_pri;
2335		thread_lock(td);
2336		sched_lend_user_prio(td, pri);
2337		thread_unlock(td);
2338		mtx_unlock_spin(&umtx_lock);
2339	}
2340
2341	if (error != 0) {
2342		mtx_lock_spin(&umtx_lock);
2343		uq->uq_inherited_pri = old_inherited_pri;
2344		pri = PRI_MAX;
2345		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2346			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2347			if (uq2 != NULL) {
2348				if (pri > UPRI(uq2->uq_thread))
2349					pri = UPRI(uq2->uq_thread);
2350			}
2351		}
2352		if (pri > uq->uq_inherited_pri)
2353			pri = uq->uq_inherited_pri;
2354		thread_lock(td);
2355		sched_lend_user_prio(td, pri);
2356		thread_unlock(td);
2357		mtx_unlock_spin(&umtx_lock);
2358	}
2359
2360out:
2361	umtxq_unbusy_unlocked(&uq->uq_key);
2362	umtx_key_release(&uq->uq_key);
2363	return (error);
2364}
2365
2366/*
2367 * Unlock a PP mutex.
2368 */
2369static int
2370do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
2371{
2372	struct umtx_key key;
2373	struct umtx_q *uq, *uq2;
2374	struct umtx_pi *pi;
2375	uint32_t owner, id;
2376	uint32_t rceiling;
2377	int error, pri, new_inherited_pri, su;
2378
2379	id = td->td_tid;
2380	uq = td->td_umtxq;
2381	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
2382
2383	/*
2384	 * Make sure we own this mtx.
2385	 */
2386	error = fueword32(&m->m_owner, &owner);
2387	if (error == -1)
2388		return (EFAULT);
2389
2390	if ((owner & ~UMUTEX_CONTESTED) != id)
2391		return (EPERM);
2392
2393	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
2394	if (error != 0)
2395		return (error);
2396
2397	if (rceiling == -1)
2398		new_inherited_pri = PRI_MAX;
2399	else {
2400		rceiling = RTP_PRIO_MAX - rceiling;
2401		if (rceiling > RTP_PRIO_MAX)
2402			return (EINVAL);
2403		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
2404	}
2405
2406	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2407	    &key)) != 0)
2408		return (error);
2409	umtxq_lock(&key);
2410	umtxq_busy(&key);
2411	umtxq_unlock(&key);
2412	/*
2413	 * For a priority-protected mutex, always set the unlocked state
2414	 * to UMUTEX_CONTESTED so that userland always enters the kernel
2415	 * to lock the mutex.  This is necessary because the thread
2416	 * priority has to be adjusted for such a mutex.
2417	 */
2418	error = suword32(&m->m_owner, UMUTEX_CONTESTED);
2419
2420	umtxq_lock(&key);
2421	if (error == 0)
2422		umtxq_signal(&key, 1);
2423	umtxq_unbusy(&key);
2424	umtxq_unlock(&key);
2425
2426	if (error == -1)
2427		error = EFAULT;
2428	else {
2429		mtx_lock_spin(&umtx_lock);
2430		if (su != 0)
2431			uq->uq_inherited_pri = new_inherited_pri;
2432		pri = PRI_MAX;
2433		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
2434			uq2 = TAILQ_FIRST(&pi->pi_blocked);
2435			if (uq2 != NULL) {
2436				if (pri > UPRI(uq2->uq_thread))
2437					pri = UPRI(uq2->uq_thread);
2438			}
2439		}
2440		if (pri > uq->uq_inherited_pri)
2441			pri = uq->uq_inherited_pri;
2442		thread_lock(td);
2443		sched_lend_user_prio(td, pri);
2444		thread_unlock(td);
2445		mtx_unlock_spin(&umtx_lock);
2446	}
2447	umtx_key_release(&key);
2448	return (error);
2449}
2450
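/*
 * Set the priority ceiling of a PP mutex, returning the previous
 * ceiling through old_ceiling when requested.
 */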
2451static int
2452do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
2453	uint32_t *old_ceiling)
2454{
2455	struct umtx_q *uq;
2456	uint32_t save_ceiling;
2457	uint32_t owner, id;
2458	uint32_t flags;
2459	int error, rv;
2460
2461	error = fueword32(&m->m_flags, &flags);
2462	if (error == -1)
2463		return (EFAULT);
2464	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
2465		return (EINVAL);
2466	if (ceiling > RTP_PRIO_MAX)
2467		return (EINVAL);
2468	id = td->td_tid;
2469	uq = td->td_umtxq;
2470	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
2471	   &uq->uq_key)) != 0)
2472		return (error);
2473	for (;;) {
2474		umtxq_lock(&uq->uq_key);
2475		umtxq_busy(&uq->uq_key);
2476		umtxq_unlock(&uq->uq_key);
2477
2478		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
2479		if (rv == -1) {
2480			error = EFAULT;
2481			break;
2482		}
2483
2484		rv = casueword32(&m->m_owner,
2485		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
2486		if (rv == -1) {
2487			error = EFAULT;
2488			break;
2489		}
2490
2491		if (owner == UMUTEX_CONTESTED) {
2492			suword32(&m->m_ceilings[0], ceiling);
2493			suword32(&m->m_owner, UMUTEX_CONTESTED);
2494			error = 0;
2495			break;
2496		}
2497
2498		if ((owner & ~UMUTEX_CONTESTED) == id) {
2499			suword32(&m->m_ceilings[0], ceiling);
2500			error = 0;
2501			break;
2502		}
2503
2504		/*
2505		 * If we caught a signal, we have already retried;
2506		 * exit immediately.
2507		 */
2508		if (error != 0)
2509			break;
2510
2511		/*
2512		 * The mutex is owned by another thread; sleep on the
2513		 * umtx queue until the owner wakes us up, then retry
2514		 * the operation.
2515		 */
2516		umtxq_lock(&uq->uq_key);
2517		umtxq_insert(uq);
2518		umtxq_unbusy(&uq->uq_key);
2519		error = umtxq_sleep(uq, "umtxpp", NULL);
2520		umtxq_remove(uq);
2521		umtxq_unlock(&uq->uq_key);
2522	}
2523	umtxq_lock(&uq->uq_key);
2524	if (error == 0)
2525		umtxq_signal(&uq->uq_key, INT_MAX);
2526	umtxq_unbusy(&uq->uq_key);
2527	umtxq_unlock(&uq->uq_key);
2528	umtx_key_release(&uq->uq_key);
2529	if (error == 0 && old_ceiling != NULL)
2530		suword32(old_ceiling, save_ceiling);
2531	return (error);
2532}
2533
2534/*
2535 * Lock a userland POSIX mutex.
2536 */
2537static int
2538do_lock_umutex(struct thread *td, struct umutex *m,
2539    struct _umtx_time *timeout, int mode)
2540{
2541	uint32_t flags;
2542	int error;
2543
2544	error = fueword32(&m->m_flags, &flags);
2545	if (error == -1)
2546		return (EFAULT);
2547
2548	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2549	case 0:
2550		error = do_lock_normal(td, m, flags, timeout, mode);
2551		break;
2552	case UMUTEX_PRIO_INHERIT:
2553		error = do_lock_pi(td, m, flags, timeout, mode);
2554		break;
2555	case UMUTEX_PRIO_PROTECT:
2556		error = do_lock_pp(td, m, flags, timeout, mode);
2557		break;
2558	default:
2559		return (EINVAL);
2560	}
2561	if (timeout == NULL) {
2562		if (error == EINTR && mode != _UMUTEX_WAIT)
2563			error = ERESTART;
2564	} else {
2565		/* Timed-locking is not restarted. */
2566		if (error == ERESTART)
2567			error = EINTR;
2568	}
2569	return (error);
2570}
2571
2572/*
2573 * Unlock a userland POSIX mutex.
2574 */
2575static int
2576do_unlock_umutex(struct thread *td, struct umutex *m)
2577{
2578	uint32_t flags;
2579	int error;
2580
2581	error = fueword32(&m->m_flags, &flags);
2582	if (error == -1)
2583		return (EFAULT);
2584
2585	switch (flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
2586	case 0:
2587		return (do_unlock_normal(td, m, flags));
2588	case UMUTEX_PRIO_INHERIT:
2589		return (do_unlock_pi(td, m, flags));
2590	case UMUTEX_PRIO_PROTECT:
2591		return (do_unlock_pp(td, m, flags));
2592	}
2593
2594	return (EINVAL);
2595}
2596
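/*
 * Wait on a userland condition variable, releasing the given mutex
 * before sleeping.
 */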
2597static int
2598do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
2599	struct timespec *timeout, u_long wflags)
2600{
2601	struct abs_timeout timo;
2602	struct umtx_q *uq;
2603	uint32_t flags, clockid, hasw;
2604	int error;
2605
2606	uq = td->td_umtxq;
2607	error = fueword32(&cv->c_flags, &flags);
2608	if (error == -1)
2609		return (EFAULT);
2610	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
2611	if (error != 0)
2612		return (error);
2613
2614	if ((wflags & CVWAIT_CLOCKID) != 0) {
2615		error = fueword32(&cv->c_clockid, &clockid);
2616		if (error == -1) {
2617			umtx_key_release(&uq->uq_key);
2618			return (EFAULT);
2619		}
2620		if (clockid < CLOCK_REALTIME ||
2621		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
2622			/* Only well-known clock ids are accepted here. */
2623			umtx_key_release(&uq->uq_key);
2624			return (EINVAL);
2625		}
2626	} else {
2627		clockid = CLOCK_REALTIME;
2628	}
2629
2630	umtxq_lock(&uq->uq_key);
2631	umtxq_busy(&uq->uq_key);
2632	umtxq_insert(uq);
2633	umtxq_unlock(&uq->uq_key);
2634
2635	/*
2636	 * Set c_has_waiters to 1 before releasing the user mutex, but
2637	 * avoid dirtying the cache line when it is already set.
2638	 */
2639	error = fueword32(&cv->c_has_waiters, &hasw);
2640	if (error == 0 && hasw == 0)
2641		suword32(&cv->c_has_waiters, 1);
2642
2643	umtxq_unbusy_unlocked(&uq->uq_key);
2644
2645	error = do_unlock_umutex(td, m);
2646
2647	if (timeout != NULL)
2648		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
2649			timeout);
2650
2651	umtxq_lock(&uq->uq_key);
2652	if (error == 0) {
2653		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
2654		    NULL : &timo);
2655	}
2656
2657	if ((uq->uq_flags & UQF_UMTXQ) == 0)
2658		error = 0;
2659	else {
2660		/*
2661		 * This must be a timeout, an interruption by a signal,
2662		 * or a spurious wakeup; clear the c_has_waiters flag
2663		 * when necessary.
2664		 */
2665		umtxq_busy(&uq->uq_key);
2666		if ((uq->uq_flags & UQF_UMTXQ) != 0) {
2667			int oldlen = uq->uq_cur_queue->length;
2668			umtxq_remove(uq);
2669			if (oldlen == 1) {
2670				umtxq_unlock(&uq->uq_key);
2671				suword32(&cv->c_has_waiters, 0);
2672				umtxq_lock(&uq->uq_key);
2673			}
2674		}
2675		umtxq_unbusy(&uq->uq_key);
2676		if (error == ERESTART)
2677			error = EINTR;
2678	}
2679
2680	umtxq_unlock(&uq->uq_key);
2681	umtx_key_release(&uq->uq_key);
2682	return (error);
2683}
2684
2685/*
2686 * Signal a userland condition variable.
2687 */
2688static int
2689do_cv_signal(struct thread *td, struct ucond *cv)
2690{
2691	struct umtx_key key;
2692	int error, cnt, nwake;
2693	uint32_t flags;
2694
2695	error = fueword32(&cv->c_flags, &flags);
2696	if (error == -1)
2697		return (EFAULT);
2698	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2699		return (error);
2700	umtxq_lock(&key);
2701	umtxq_busy(&key);
2702	cnt = umtxq_count(&key);
2703	nwake = umtxq_signal(&key, 1);
2704	if (cnt <= nwake) {
2705		umtxq_unlock(&key);
2706		error = suword32(&cv->c_has_waiters, 0);
2707		if (error == -1)
2708			error = EFAULT;
2709		umtxq_lock(&key);
2710	}
2711	umtxq_unbusy(&key);
2712	umtxq_unlock(&key);
2713	umtx_key_release(&key);
2714	return (error);
2715}
2716
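/*
 * Broadcast a userland condition variable, waking all waiters.
 */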
2717static int
2718do_cv_broadcast(struct thread *td, struct ucond *cv)
2719{
2720	struct umtx_key key;
2721	int error;
2722	uint32_t flags;
2723
2724	error = fueword32(&cv->c_flags, &flags);
2725	if (error == -1)
2726		return (EFAULT);
2727	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
2728		return (error);
2729
2730	umtxq_lock(&key);
2731	umtxq_busy(&key);
2732	umtxq_signal(&key, INT_MAX);
2733	umtxq_unlock(&key);
2734
2735	error = suword32(&cv->c_has_waiters, 0);
2736	if (error == -1)
2737		error = EFAULT;
2738
2739	umtxq_unbusy_unlocked(&key);
2740
2741	umtx_key_release(&key);
2742	return (error);
2743}
2744
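/*
 * Acquire a userland rwlock for reading.
 */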
2745static int
2746do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
2747{
2748	struct abs_timeout timo;
2749	struct umtx_q *uq;
2750	uint32_t flags, wrflags;
2751	int32_t state, oldstate;
2752	int32_t blocked_readers;
2753	int error, rv;
2754
2755	uq = td->td_umtxq;
2756	error = fueword32(&rwlock->rw_flags, &flags);
2757	if (error == -1)
2758		return (EFAULT);
2759	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2760	if (error != 0)
2761		return (error);
2762
2763	if (timeout != NULL)
2764		abs_timeout_init2(&timo, timeout);
2765
2766	wrflags = URWLOCK_WRITE_OWNER;
2767	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
2768		wrflags |= URWLOCK_WRITE_WAITERS;
2769
2770	for (;;) {
2771		rv = fueword32(&rwlock->rw_state, &state);
2772		if (rv == -1) {
2773			umtx_key_release(&uq->uq_key);
2774			return (EFAULT);
2775		}
2776
2777		/* try to lock it */
2778		while (!(state & wrflags)) {
2779			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
2780				umtx_key_release(&uq->uq_key);
2781				return (EAGAIN);
2782			}
2783			rv = casueword32(&rwlock->rw_state, state,
2784			    &oldstate, state + 1);
2785			if (rv == -1) {
2786				umtx_key_release(&uq->uq_key);
2787				return (EFAULT);
2788			}
2789			if (oldstate == state) {
2790				umtx_key_release(&uq->uq_key);
2791				return (0);
2792			}
2793			error = umtxq_check_susp(td);
2794			if (error != 0)
2795				break;
2796			state = oldstate;
2797		}
2798
2799		if (error)
2800			break;
2801
2802		/* grab monitor lock */
2803		umtxq_lock(&uq->uq_key);
2804		umtxq_busy(&uq->uq_key);
2805		umtxq_unlock(&uq->uq_key);
2806
2807		/*
2808		 * Re-read the state in case it changed between the
2809		 * try-lock above and the check below.
2810		 */
2811		rv = fueword32(&rwlock->rw_state, &state);
2812		if (rv == -1)
2813			error = EFAULT;
2814
2815		/* set read contention bit */
2816		while (error == 0 && (state & wrflags) &&
2817		    !(state & URWLOCK_READ_WAITERS)) {
2818			rv = casueword32(&rwlock->rw_state, state,
2819			    &oldstate, state | URWLOCK_READ_WAITERS);
2820			if (rv == -1) {
2821				error = EFAULT;
2822				break;
2823			}
2824			if (oldstate == state)
2825				goto sleep;
2826			state = oldstate;
2827			error = umtxq_check_susp(td);
2828			if (error != 0)
2829				break;
2830		}
2831		if (error != 0) {
2832			umtxq_unbusy_unlocked(&uq->uq_key);
2833			break;
2834		}
2835
2836		/* The state changed while we set the flags; restart. */
2837		if (!(state & wrflags)) {
2838			umtxq_unbusy_unlocked(&uq->uq_key);
2839			error = umtxq_check_susp(td);
2840			if (error != 0)
2841				break;
2842			continue;
2843		}
2844
2845sleep:
2846		/* Contention bit is set; increase the read-waiter count before sleeping. */
2847		rv = fueword32(&rwlock->rw_blocked_readers,
2848		    &blocked_readers);
2849		if (rv == -1) {
2850			umtxq_unbusy_unlocked(&uq->uq_key);
2851			error = EFAULT;
2852			break;
2853		}
2854		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
2855
2856		while (state & wrflags) {
2857			umtxq_lock(&uq->uq_key);
2858			umtxq_insert(uq);
2859			umtxq_unbusy(&uq->uq_key);
2860
2861			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
2862			    NULL : &timo);
2863
2864			umtxq_busy(&uq->uq_key);
2865			umtxq_remove(uq);
2866			umtxq_unlock(&uq->uq_key);
2867			if (error)
2868				break;
2869			rv = fueword32(&rwlock->rw_state, &state);
2870			if (rv == -1) {
2871				error = EFAULT;
2872				break;
2873			}
2874		}
2875
2876		/* Decrease the read-waiter count, possibly clearing the read-contention bit. */
2877		rv = fueword32(&rwlock->rw_blocked_readers,
2878		    &blocked_readers);
2879		if (rv == -1) {
2880			umtxq_unbusy_unlocked(&uq->uq_key);
2881			error = EFAULT;
2882			break;
2883		}
2884		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
2885		if (blocked_readers == 1) {
2886			rv = fueword32(&rwlock->rw_state, &state);
2887			if (rv == -1)
2888				error = EFAULT;
2889			while (error == 0) {
2890				rv = casueword32(&rwlock->rw_state, state,
2891				    &oldstate, state & ~URWLOCK_READ_WAITERS);
2892				if (rv == -1) {
2893					error = EFAULT;
2894					break;
2895				}
2896				if (oldstate == state)
2897					break;
2898				state = oldstate;
2899				error = umtxq_check_susp(td);
2900			}
2901		}
2902
2903		umtxq_unbusy_unlocked(&uq->uq_key);
2904		if (error != 0)
2905			break;
2906	}
2907	umtx_key_release(&uq->uq_key);
2908	if (error == ERESTART)
2909		error = EINTR;
2910	return (error);
2911}
2912
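/*
 * Acquire a userland rwlock for writing.
 */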
2913static int
2914do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
2915{
2916	struct abs_timeout timo;
2917	struct umtx_q *uq;
2918	uint32_t flags;
2919	int32_t state, oldstate;
2920	int32_t blocked_writers;
2921	int32_t blocked_readers;
2922	int error, rv;
2923
2924	uq = td->td_umtxq;
2925	error = fueword32(&rwlock->rw_flags, &flags);
2926	if (error == -1)
2927		return (EFAULT);
2928	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
2929	if (error != 0)
2930		return (error);
2931
2932	if (timeout != NULL)
2933		abs_timeout_init2(&timo, timeout);
2934
2935	blocked_readers = 0;
2936	for (;;) {
2937		rv = fueword32(&rwlock->rw_state, &state);
2938		if (rv == -1) {
2939			umtx_key_release(&uq->uq_key);
2940			return (EFAULT);
2941		}
2942		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
2943			rv = casueword32(&rwlock->rw_state, state,
2944			    &oldstate, state | URWLOCK_WRITE_OWNER);
2945			if (rv == -1) {
2946				umtx_key_release(&uq->uq_key);
2947				return (EFAULT);
2948			}
2949			if (oldstate == state) {
2950				umtx_key_release(&uq->uq_key);
2951				return (0);
2952			}
2953			state = oldstate;
2954			error = umtxq_check_susp(td);
2955			if (error != 0)
2956				break;
2957		}
2958
2959		if (error) {
2960			if (!(state & (URWLOCK_WRITE_OWNER|URWLOCK_WRITE_WAITERS)) &&
2961			    blocked_readers != 0) {
2962				umtxq_lock(&uq->uq_key);
2963				umtxq_busy(&uq->uq_key);
2964				umtxq_signal_queue(&uq->uq_key, INT_MAX, UMTX_SHARED_QUEUE);
2965				umtxq_unbusy(&uq->uq_key);
2966				umtxq_unlock(&uq->uq_key);
2967			}
2968
2969			break;
2970		}
2971
2972		/* grab monitor lock */
2973		umtxq_lock(&uq->uq_key);
2974		umtxq_busy(&uq->uq_key);
2975		umtxq_unlock(&uq->uq_key);
2976
2977		/*
2978		 * Re-read the state in case it changed between the
2979		 * try-lock above and the check below.
2980		 */
2981		rv = fueword32(&rwlock->rw_state, &state);
2982		if (rv == -1)
2983			error = EFAULT;
2984
2985		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
2986		    URWLOCK_READER_COUNT(state) != 0) &&
2987		    (state & URWLOCK_WRITE_WAITERS) == 0) {
2988			rv = casueword32(&rwlock->rw_state, state,
2989			    &oldstate, state | URWLOCK_WRITE_WAITERS);
2990			if (rv == -1) {
2991				error = EFAULT;
2992				break;
2993			}
2994			if (oldstate == state)
2995				goto sleep;
2996			state = oldstate;
2997			error = umtxq_check_susp(td);
2998			if (error != 0)
2999				break;
3000		}
3001		if (error != 0) {
3002			umtxq_unbusy_unlocked(&uq->uq_key);
3003			break;
3004		}
3005
3006		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
3007			umtxq_unbusy_unlocked(&uq->uq_key);
3008			error = umtxq_check_susp(td);
3009			if (error != 0)
3010				break;
3011			continue;
3012		}
3013sleep:
3014		rv = fueword32(&rwlock->rw_blocked_writers,
3015		    &blocked_writers);
3016		if (rv == -1) {
3017			umtxq_unbusy_unlocked(&uq->uq_key);
3018			error = EFAULT;
3019			break;
3020		}
3021		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
3022
3023		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
3024			umtxq_lock(&uq->uq_key);
3025			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
3026			umtxq_unbusy(&uq->uq_key);
3027
3028			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
3029			    NULL : &timo);
3030
3031			umtxq_busy(&uq->uq_key);
3032			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
3033			umtxq_unlock(&uq->uq_key);
3034			if (error)
3035				break;
3036			rv = fueword32(&rwlock->rw_state, &state);
3037			if (rv == -1) {
3038				error = EFAULT;
3039				break;
3040			}
3041		}
3042
3043		rv = fueword32(&rwlock->rw_blocked_writers,
3044		    &blocked_writers);
3045		if (rv == -1) {
3046			umtxq_unbusy_unlocked(&uq->uq_key);
3047			error = EFAULT;
3048			break;
3049		}
3050		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
3051		if (blocked_writers == 1) {
3052			rv = fueword32(&rwlock->rw_state, &state);
3053			if (rv == -1) {
3054				umtxq_unbusy_unlocked(&uq->uq_key);
3055				error = EFAULT;
3056				break;
3057			}
3058			for (;;) {
3059				rv = casueword32(&rwlock->rw_state, state,
3060				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
3061				if (rv == -1) {
3062					error = EFAULT;
3063					break;
3064				}
3065				if (oldstate == state)
3066					break;
3067				state = oldstate;
3068				error = umtxq_check_susp(td);
3069				/*
3070				 * We may leave URWLOCK_WRITE_WAITERS set
3071				 * behind, but this does not harm
3072				 * correctness.
3073				 */
3074				if (error != 0)
3075					break;
3076			}
3077			rv = fueword32(&rwlock->rw_blocked_readers,
3078			    &blocked_readers);
3079			if (rv == -1) {
3080				umtxq_unbusy_unlocked(&uq->uq_key);
3081				error = EFAULT;
3082				break;
3083			}
3084		} else
3085			blocked_readers = 0;
3086
3087		umtxq_unbusy_unlocked(&uq->uq_key);
3088	}
3089
3090	umtx_key_release(&uq->uq_key);
3091	if (error == ERESTART)
3092		error = EINTR;
3093	return (error);
3094}
3095
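/*
 * Release a userland rwlock and wake the appropriate queue of waiters.
 */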
3096static int
3097do_rw_unlock(struct thread *td, struct urwlock *rwlock)
3098{
3099	struct umtx_q *uq;
3100	uint32_t flags;
3101	int32_t state, oldstate;
3102	int error, rv, q, count;
3103
3104	uq = td->td_umtxq;
3105	error = fueword32(&rwlock->rw_flags, &flags);
3106	if (error == -1)
3107		return (EFAULT);
3108	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
3109	if (error != 0)
3110		return (error);
3111
3112	error = fueword32(&rwlock->rw_state, &state);
3113	if (error == -1) {
3114		error = EFAULT;
3115		goto out;
3116	}
3117	if (state & URWLOCK_WRITE_OWNER) {
3118		for (;;) {
3119			rv = casueword32(&rwlock->rw_state, state,
3120			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
3121			if (rv == -1) {
3122				error = EFAULT;
3123				goto out;
3124			}
3125			if (oldstate != state) {
3126				state = oldstate;
3127				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
3128					error = EPERM;
3129					goto out;
3130				}
3131				error = umtxq_check_susp(td);
3132				if (error != 0)
3133					goto out;
3134			} else
3135				break;
3136		}
3137	} else if (URWLOCK_READER_COUNT(state) != 0) {
3138		for (;;) {
3139			rv = casueword32(&rwlock->rw_state, state,
3140			    &oldstate, state - 1);
3141			if (rv == -1) {
3142				error = EFAULT;
3143				goto out;
3144			}
3145			if (oldstate != state) {
3146				state = oldstate;
3147				if (URWLOCK_READER_COUNT(oldstate) == 0) {
3148					error = EPERM;
3149					goto out;
3150				}
3151				error = umtxq_check_susp(td);
3152				if (error != 0)
3153					goto out;
3154			} else
3155				break;
3156		}
3157	} else {
3158		error = EPERM;
3159		goto out;
3160	}
3161
3162	count = 0;
3163
3164	if (!(flags & URWLOCK_PREFER_READER)) {
3165		if (state & URWLOCK_WRITE_WAITERS) {
3166			count = 1;
3167			q = UMTX_EXCLUSIVE_QUEUE;
3168		} else if (state & URWLOCK_READ_WAITERS) {
3169			count = INT_MAX;
3170			q = UMTX_SHARED_QUEUE;
3171		}
3172	} else {
3173		if (state & URWLOCK_READ_WAITERS) {
3174			count = INT_MAX;
3175			q = UMTX_SHARED_QUEUE;
3176		} else if (state & URWLOCK_WRITE_WAITERS) {
3177			count = 1;
3178			q = UMTX_EXCLUSIVE_QUEUE;
3179		}
3180	}
3181
3182	if (count) {
3183		umtxq_lock(&uq->uq_key);
3184		umtxq_busy(&uq->uq_key);
3185		umtxq_signal_queue(&uq->uq_key, count, q);
3186		umtxq_unbusy(&uq->uq_key);
3187		umtxq_unlock(&uq->uq_key);
3188	}
3189out:
3190	umtx_key_release(&uq->uq_key);
3191	return (error);
3192}
3193
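/*
 * Wait on a userland semaphore.
 */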
3194static int
3195do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
3196{
3197	struct abs_timeout timo;
3198	struct umtx_q *uq;
3199	uint32_t flags, count, count1;
3200	int error, rv;
3201
3202	uq = td->td_umtxq;
3203	error = fueword32(&sem->_flags, &flags);
3204	if (error == -1)
3205		return (EFAULT);
3206	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
3207	if (error != 0)
3208		return (error);
3209
3210	if (timeout != NULL)
3211		abs_timeout_init2(&timo, timeout);
3212
3213	umtxq_lock(&uq->uq_key);
3214	umtxq_busy(&uq->uq_key);
3215	umtxq_insert(uq);
3216	umtxq_unlock(&uq->uq_key);
3217	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
3218	if (rv == 0)
3219		rv = fueword32(&sem->_count, &count);
3220	if (rv == -1 || count != 0) {
3221		umtxq_lock(&uq->uq_key);
3222		umtxq_unbusy(&uq->uq_key);
3223		umtxq_remove(uq);
3224		umtxq_unlock(&uq->uq_key);
3225		umtx_key_release(&uq->uq_key);
3226		return (rv == -1 ? EFAULT : 0);
3227	}
3228	umtxq_lock(&uq->uq_key);
3229	umtxq_unbusy(&uq->uq_key);
3230
3231	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
3232
3233	if ((uq->uq_flags & UQF_UMTXQ) == 0)
3234		error = 0;
3235	else {
3236		umtxq_remove(uq);
3237		/* A relative timeout cannot be restarted. */
3238		if (error == ERESTART && timeout != NULL &&
3239		    (timeout->_flags & UMTX_ABSTIME) == 0)
3240			error = EINTR;
3241	}
3242	umtxq_unlock(&uq->uq_key);
3243	umtx_key_release(&uq->uq_key);
3244	return (error);
3245}
3246
3247/*
3248 * Wake up a waiter on a userland semaphore.
3249 */
3250static int
3251do_sem_wake(struct thread *td, struct _usem *sem)
3252{
3253	struct umtx_key key;
3254	int error, cnt;
3255	uint32_t flags;
3256
3257	error = fueword32(&sem->_flags, &flags);
3258	if (error == -1)
3259		return (EFAULT);
3260	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
3261		return (error);
3262	umtxq_lock(&key);
3263	umtxq_busy(&key);
3264	cnt = umtxq_count(&key);
3265	if (cnt > 0) {
3266		umtxq_signal(&key, 1);
3267		/*
3268		 * Since the count was greater than 0, the memory is still
3269		 * being referenced by user code, so the _has_waiters flag
3270		 * can safely be updated.
3271		 */
3272		if (cnt == 1) {
3273			umtxq_unlock(&key);
3274			error = suword32(&sem->_has_waiters, 0);
3275			umtxq_lock(&key);
3276			if (error == -1)
3277				error = EFAULT;
3278		}
3279	}
3280	umtxq_unbusy(&key);
3281	umtxq_unlock(&key);
3282	umtx_key_release(&key);
3283	return (error);
3284}
3285
3286int
3287sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
3288    /* struct umtx *umtx */
3289{
3290	return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
3291}
3292
3293int
3294sys__umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
3295    /* struct umtx *umtx */
3296{
3297	return do_unlock_umtx(td, uap->umtx, td->td_tid);
3298}
3299
3300inline int
3301umtx_copyin_timeout(const void *addr, struct timespec *tsp)
3302{
3303	int error;
3304
3305	error = copyin(addr, tsp, sizeof(struct timespec));
3306	if (error == 0) {
3307		if (tsp->tv_sec < 0 ||
3308		    tsp->tv_nsec >= 1000000000 ||
3309		    tsp->tv_nsec < 0)
3310			error = EINVAL;
3311	}
3312	return (error);
3313}
3314
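/*
 * Copy in a timeout for a umtx operation.  Older callers pass a bare
 * struct timespec; the size argument distinguishes that layout from a
 * full struct _umtx_time.
 */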
3315static inline int
3316umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
3317{
3318	int error;
3319
3320	if (size <= sizeof(struct timespec)) {
3321		tp->_clockid = CLOCK_REALTIME;
3322		tp->_flags = 0;
3323		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
3324	} else
3325		error = copyin(addr, tp, sizeof(struct _umtx_time));
3326	if (error != 0)
3327		return (error);
3328	if (tp->_timeout.tv_sec < 0 ||
3329	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
3330		return (EINVAL);
3331	return (0);
3332}
3333
3334static int
3335__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
3336{
3337	struct timespec *ts, timeout;
3338	int error;
3339
3340	/* Allow a null timespec (wait forever). */
3341	if (uap->uaddr2 == NULL)
3342		ts = NULL;
3343	else {
3344		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3345		if (error != 0)
3346			return (error);
3347		ts = &timeout;
3348	}
3349	return (do_lock_umtx(td, uap->obj, uap->val, ts));
3350}
3351
3352static int
3353__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
3354{
3355	return (do_unlock_umtx(td, uap->obj, uap->val));
3356}
3357
3358static int
3359__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
3360{
3361	struct _umtx_time timeout, *tm_p;
3362	int error;
3363
3364	if (uap->uaddr2 == NULL)
3365		tm_p = NULL;
3366	else {
3367		error = umtx_copyin_umtx_time(
3368		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3369		if (error != 0)
3370			return (error);
3371		tm_p = &timeout;
3372	}
3373	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
3374}
3375
3376static int
3377__umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
3378{
3379	struct _umtx_time timeout, *tm_p;
3380	int error;
3381
3382	if (uap->uaddr2 == NULL)
3383		tm_p = NULL;
3384	else {
3385		error = umtx_copyin_umtx_time(
3386		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3387		if (error != 0)
3388			return (error);
3389		tm_p = &timeout;
3390	}
3391	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3392}
3393
3394static int
3395__umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
3396{
3397	struct _umtx_time *tm_p, timeout;
3398	int error;
3399
3400	if (uap->uaddr2 == NULL)
3401		tm_p = NULL;
3402	else {
3403		error = umtx_copyin_umtx_time(
3404		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3405		if (error != 0)
3406			return (error);
3407		tm_p = &timeout;
3408	}
3409	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3410}
3411
3412static int
3413__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
3414{
3415	return (kern_umtx_wake(td, uap->obj, uap->val, 0));
3416}
3417
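/*
 * Wake addresses in batches of BATCH_SIZE pointers so that the
 * on-stack copyin buffer stays small.
 */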
3418#define BATCH_SIZE	128
3419static int
3420__umtx_op_nwake_private(struct thread *td, struct _umtx_op_args *uap)
3421{
3422	int count = uap->val;
3423	void *uaddrs[BATCH_SIZE];
3424	char **upp = (char **)uap->obj;
3425	int tocopy;
3426	int error = 0;
3427	int i, pos = 0;
3428
3429	while (count > 0) {
3430		tocopy = count;
3431		if (tocopy > BATCH_SIZE)
3432			tocopy = BATCH_SIZE;
3433		error = copyin(upp+pos, uaddrs, tocopy * sizeof(char *));
3434		if (error != 0)
3435			break;
3436		for (i = 0; i < tocopy; ++i)
3437			kern_umtx_wake(td, uaddrs[i], INT_MAX, 1);
3438		count -= tocopy;
3439		pos += tocopy;
3440	}
3441	return (error);
3442}
3443
3444static int
3445__umtx_op_wake_private(struct thread *td, struct _umtx_op_args *uap)
3446{
3447	return (kern_umtx_wake(td, uap->obj, uap->val, 1));
3448}
3449
3450static int
3451__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
3452{
3453	struct _umtx_time *tm_p, timeout;
3454	int error;
3455
3456	/* Allow a null timespec (wait forever). */
3457	if (uap->uaddr2 == NULL)
3458		tm_p = NULL;
3459	else {
3460		error = umtx_copyin_umtx_time(
3461		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3462		if (error != 0)
3463			return (error);
3464		tm_p = &timeout;
3465	}
3466	return do_lock_umutex(td, uap->obj, tm_p, 0);
3467}
3468
3469static int
3470__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
3471{
3472	return do_lock_umutex(td, uap->obj, NULL, _UMUTEX_TRY);
3473}
3474
3475static int
3476__umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
3477{
3478	struct _umtx_time *tm_p, timeout;
3479	int error;
3480
3481	/* Allow a null timespec (wait forever). */
3482	if (uap->uaddr2 == NULL)
3483		tm_p = NULL;
3484	else {
3485		error = umtx_copyin_umtx_time(
3486		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3487		if (error != 0)
3488			return (error);
3489		tm_p = &timeout;
3490	}
3491	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3492}
3493
3494static int
3495__umtx_op_wake_umutex(struct thread *td, struct _umtx_op_args *uap)
3496{
3497	return do_wake_umutex(td, uap->obj);
3498}
3499
3500static int
3501__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
3502{
3503	return do_unlock_umutex(td, uap->obj);
3504}
3505
3506static int
3507__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
3508{
3509	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
3510}
3511
3512static int
3513__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
3514{
3515	struct timespec *ts, timeout;
3516	int error;
3517
3518	/* Allow a null timespec (wait forever). */
3519	if (uap->uaddr2 == NULL)
3520		ts = NULL;
3521	else {
3522		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
3523		if (error != 0)
3524			return (error);
3525		ts = &timeout;
3526	}
3527	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3528}
3529
3530static int
3531__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
3532{
3533	return do_cv_signal(td, uap->obj);
3534}
3535
3536static int
3537__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
3538{
3539	return do_cv_broadcast(td, uap->obj);
3540}
3541
3542static int
3543__umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
3544{
3545	struct _umtx_time timeout;
3546	int error;
3547
3548	/* Allow a null timespec (wait forever). */
3549	if (uap->uaddr2 == NULL) {
3550		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3551	} else {
3552		error = umtx_copyin_umtx_time(uap->uaddr2,
3553		   (size_t)uap->uaddr1, &timeout);
3554		if (error != 0)
3555			return (error);
3556		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3557	}
3558	return (error);
3559}
3560
3561static int
3562__umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
3563{
3564	struct _umtx_time timeout;
3565	int error;
3566
3567	/* Allow a null timespec (wait forever). */
3568	if (uap->uaddr2 == NULL) {
3569		error = do_rw_wrlock(td, uap->obj, 0);
3570	} else {
3571		error = umtx_copyin_umtx_time(uap->uaddr2,
3572		   (size_t)uap->uaddr1, &timeout);
3573		if (error != 0)
3574			return (error);
3575
3576		error = do_rw_wrlock(td, uap->obj, &timeout);
3577	}
3578	return (error);
3579}
3580
3581static int
3582__umtx_op_rw_unlock(struct thread *td, struct _umtx_op_args *uap)
3583{
3584	return do_rw_unlock(td, uap->obj);
3585}
3586
3587static int
3588__umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
3589{
3590	struct _umtx_time *tm_p, timeout;
3591	int error;
3592
3593	/* Allow a null timespec (wait forever). */
3594	if (uap->uaddr2 == NULL)
3595		tm_p = NULL;
3596	else {
3597		error = umtx_copyin_umtx_time(
3598		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
3599		if (error != 0)
3600			return (error);
3601		tm_p = &timeout;
3602	}
3603	return (do_sem_wait(td, uap->obj, tm_p));
3604}
3605
3606static int
3607__umtx_op_sem_wake(struct thread *td, struct _umtx_op_args *uap)
3608{
3609	return do_sem_wake(td, uap->obj);
3610}
3611
3612static int
3613__umtx_op_wake2_umutex(struct thread *td, struct _umtx_op_args *uap)
3614{
3615	return do_wake2_umutex(td, uap->obj, uap->val);
3616}
3617
3618typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
3619
3620static _umtx_op_func op_table[] = {
3621	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
3622	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
3623	__umtx_op_wait,			/* UMTX_OP_WAIT */
3624	__umtx_op_wake,			/* UMTX_OP_WAKE */
3625	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3626	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
3627	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
3628	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3629	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT*/
3630	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3631	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3632	__umtx_op_wait_uint,		/* UMTX_OP_WAIT_UINT */
3633	__umtx_op_rw_rdlock,		/* UMTX_OP_RW_RDLOCK */
3634	__umtx_op_rw_wrlock,		/* UMTX_OP_RW_WRLOCK */
3635	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3636	__umtx_op_wait_uint_private,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3637	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3638	__umtx_op_wait_umutex,		/* UMTX_OP_UMUTEX_WAIT */
3639	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3640	__umtx_op_sem_wait,		/* UMTX_OP_SEM_WAIT */
3641	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3642	__umtx_op_nwake_private,	/* UMTX_OP_NWAKE_PRIVATE */
3643	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3644};
3645
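/*
 * System call entry point; uap->op selects a handler from the table
 * above.  An illustrative userland call (a sketch, not part of this
 * file): waiting on a 32-bit word with a timeout passes the timeout
 * size in uaddr1 and its address in uaddr2, e.g.
 *
 *	struct _umtx_time t = { ._timeout = ts, ._flags = 0,
 *	    ._clockid = CLOCK_REALTIME };
 *	_umtx_op(&word, UMTX_OP_WAIT_UINT, expected,
 *	    (void *)sizeof(t), &t);
 */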
3646int
3647sys__umtx_op(struct thread *td, struct _umtx_op_args *uap)
3648{
3649	if ((unsigned)uap->op < UMTX_OP_MAX)
3650		return (*op_table[uap->op])(td, uap);
3651	return (EINVAL);
3652}
3653
3654#ifdef COMPAT_FREEBSD32
3655int
3656freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
3657    /* struct umtx *umtx */
3658{
3659	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
3660}
3661
3662int
3663freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
3664    /* struct umtx *umtx */
3665{
3666	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
3667}
3668
3669struct timespec32 {
3670	int32_t tv_sec;
3671	int32_t tv_nsec;
3672};
3673
3674struct umtx_time32 {
3675	struct	timespec32	timeout;
3676	uint32_t		flags;
3677	uint32_t		clockid;
3678};
3679
3680static inline int
3681umtx_copyin_timeout32(void *addr, struct timespec *tsp)
3682{
3683	struct timespec32 ts32;
3684	int error;
3685
3686	error = copyin(addr, &ts32, sizeof(struct timespec32));
3687	if (error == 0) {
3688		if (ts32.tv_sec < 0 ||
3689		    ts32.tv_nsec >= 1000000000 ||
3690		    ts32.tv_nsec < 0)
3691			error = EINVAL;
3692		else {
3693			tsp->tv_sec = ts32.tv_sec;
3694			tsp->tv_nsec = ts32.tv_nsec;
3695		}
3696	}
3697	return (error);
3698}
3699
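/*
 * Copy in a 32-bit timeout and convert it to the native _umtx_time
 * layout; as above, a bare timespec32 is accepted from older callers.
 */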
3700static inline int
3701umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
3702{
3703	struct umtx_time32 t32;
3704	int error;
3705
3706	t32.clockid = CLOCK_REALTIME;
3707	t32.flags   = 0;
3708	if (size <= sizeof(struct timespec32))
3709		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
3710	else
3711		error = copyin(addr, &t32, sizeof(struct umtx_time32));
3712	if (error != 0)
3713		return (error);
3714	if (t32.timeout.tv_sec < 0 ||
3715	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
3716		return (EINVAL);
3717	tp->_timeout.tv_sec = t32.timeout.tv_sec;
3718	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
3719	tp->_flags = t32.flags;
3720	tp->_clockid = t32.clockid;
3721	return (0);
3722}
3723
3724static int
3725__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3726{
3727	struct timespec *ts, timeout;
3728	int error;
3729
3730	/* Allow a null timespec (wait forever). */
3731	if (uap->uaddr2 == NULL)
3732		ts = NULL;
3733	else {
3734		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3735		if (error != 0)
3736			return (error);
3737		ts = &timeout;
3738	}
3739	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
3740}
3741
3742static int
3743__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
3744{
3745	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
3746}
3747
3748static int
3749__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3750{
3751	struct _umtx_time *tm_p, timeout;
3752	int error;
3753
3754	if (uap->uaddr2 == NULL)
3755		tm_p = NULL;
3756	else {
3757		error = umtx_copyin_umtx_time32(uap->uaddr2,
3758			(size_t)uap->uaddr1, &timeout);
3759		if (error != 0)
3760			return (error);
3761		tm_p = &timeout;
3762	}
3763	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
3764}
3765
3766static int
3767__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3768{
3769	struct _umtx_time *tm_p, timeout;
3770	int error;
3771
3772	/* Allow a null timespec (wait forever). */
3773	if (uap->uaddr2 == NULL)
3774		tm_p = NULL;
3775	else {
3776		error = umtx_copyin_umtx_time32(uap->uaddr2,
3777			    (size_t)uap->uaddr1, &timeout);
3778		if (error != 0)
3779			return (error);
3780		tm_p = &timeout;
3781	}
3782	return do_lock_umutex(td, uap->obj, tm_p, 0);
3783}
3784
3785static int
3786__umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
3787{
3788	struct _umtx_time *tm_p, timeout;
3789	int error;
3790
3791	/* Allow a null timespec (wait forever). */
3792	if (uap->uaddr2 == NULL)
3793		tm_p = NULL;
3794	else {
3795		error = umtx_copyin_umtx_time32(uap->uaddr2,
3796		    (size_t)uap->uaddr1, &timeout);
3797		if (error != 0)
3798			return (error);
3799		tm_p = &timeout;
3800	}
3801	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
3802}
3803
3804static int
3805__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3806{
3807	struct timespec *ts, timeout;
3808	int error;
3809
3810	/* Allow a null timespec (wait forever). */
3811	if (uap->uaddr2 == NULL)
3812		ts = NULL;
3813	else {
3814		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
3815		if (error != 0)
3816			return (error);
3817		ts = &timeout;
3818	}
3819	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
3820}
3821
3822static int
3823__umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3824{
3825	struct _umtx_time timeout;
3826	int error;
3827
3828	/* Allow a null timespec (wait forever). */
3829	if (uap->uaddr2 == NULL) {
3830		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
3831	} else {
3832		error = umtx_copyin_umtx_time32(uap->uaddr2,
3833		    (size_t)uap->uaddr1, &timeout);
3834		if (error != 0)
3835			return (error);
3836		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
3837	}
3838	return (error);
3839}
3840
3841static int
3842__umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
3843{
3844	struct _umtx_time timeout;
3845	int error;
3846
3847	/* Allow a null timespec (wait forever). */
3848	if (uap->uaddr2 == NULL) {
3849		error = do_rw_wrlock(td, uap->obj, 0);
3850	} else {
3851		error = umtx_copyin_umtx_time32(uap->uaddr2,
3852		    (size_t)uap->uaddr1, &timeout);
3853		if (error != 0)
3854			return (error);
3855		error = do_rw_wrlock(td, uap->obj, &timeout);
3856	}
3857	return (error);
3858}
3859
3860static int
3861__umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
3862{
3863	struct _umtx_time *tm_p, timeout;
3864	int error;
3865
3866	if (uap->uaddr2 == NULL)
3867		tm_p = NULL;
3868	else {
3869		error = umtx_copyin_umtx_time32(
3870		    uap->uaddr2, (size_t)uap->uaddr1,&timeout);
3871		if (error != 0)
3872			return (error);
3873		tm_p = &timeout;
3874	}
3875	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
3876}
3877
3878static int
3879__umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
3880{
3881	struct _umtx_time *tm_p, timeout;
3882	int error;
3883
3884	/* Allow a null timespec (wait forever). */
3885	if (uap->uaddr2 == NULL)
3886		tm_p = NULL;
3887	else {
3888		error = umtx_copyin_umtx_time32(uap->uaddr2,
3889		    (size_t)uap->uaddr1, &timeout);
3890		if (error != 0)
3891			return (error);
3892		tm_p = &timeout;
3893	}
3894	return (do_sem_wait(td, uap->obj, tm_p));
3895}
3896
3897static int
3898__umtx_op_nwake_private32(struct thread *td, struct _umtx_op_args *uap)
3899{
3900	int count = uap->val;
3901	uint32_t uaddrs[BATCH_SIZE];
3902	uint32_t **upp = (uint32_t **)uap->obj;
3903	int tocopy;
3904	int error = 0;
3905	int i, pos = 0;
3906
3907	while (count > 0) {
3908		tocopy = count;
3909		if (tocopy > BATCH_SIZE)
3910			tocopy = BATCH_SIZE;
3911		error = copyin(upp+pos, uaddrs, tocopy * sizeof(uint32_t));
3912		if (error != 0)
3913			break;
3914		for (i = 0; i < tocopy; ++i)
3915			kern_umtx_wake(td, (void *)(intptr_t)uaddrs[i],
3916				INT_MAX, 1);
3917		count -= tocopy;
3918		pos += tocopy;
3919	}
3920	return (error);
3921}
3922
3923static _umtx_op_func op_table_compat32[] = {
3924	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
3925	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
3926	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
3927	__umtx_op_wake,			/* UMTX_OP_WAKE */
3928	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
3929	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
3930	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
3931	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
3932	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT*/
3933	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
3934	__umtx_op_cv_broadcast,		/* UMTX_OP_CV_BROADCAST */
3935	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT_UINT */
3936	__umtx_op_rw_rdlock_compat32,	/* UMTX_OP_RW_RDLOCK */
3937	__umtx_op_rw_wrlock_compat32,	/* UMTX_OP_RW_WRLOCK */
3938	__umtx_op_rw_unlock,		/* UMTX_OP_RW_UNLOCK */
3939	__umtx_op_wait_uint_private_compat32,	/* UMTX_OP_WAIT_UINT_PRIVATE */
3940	__umtx_op_wake_private,		/* UMTX_OP_WAKE_PRIVATE */
3941	__umtx_op_wait_umutex_compat32, /* UMTX_OP_UMUTEX_WAIT */
3942	__umtx_op_wake_umutex,		/* UMTX_OP_UMUTEX_WAKE */
3943	__umtx_op_sem_wait_compat32,	/* UMTX_OP_SEM_WAIT */
3944	__umtx_op_sem_wake,		/* UMTX_OP_SEM_WAKE */
3945	__umtx_op_nwake_private32,	/* UMTX_OP_NWAKE_PRIVATE */
3946	__umtx_op_wake2_umutex		/* UMTX_OP_UMUTEX_WAKE2 */
3947};
3948
3949int
3950freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
3951{
3952	if ((unsigned)uap->op < UMTX_OP_MAX)
3953		return (*op_table_compat32[uap->op])(td,
3954			(struct _umtx_op_args *)uap);
3955	return (EINVAL);
3956}
3957#endif
3958
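/*
 * Allocate the per-thread umtx queue structure at thread creation.
 */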
3959void
3960umtx_thread_init(struct thread *td)
3961{
3962	td->td_umtxq = umtxq_alloc();
3963	td->td_umtxq->uq_thread = td;
3964}
3965
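/*
 * Release the per-thread umtx queue structure when the thread is
 * destroyed.
 */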
3966void
3967umtx_thread_fini(struct thread *td)
3968{
3969	umtxq_free(td->td_umtxq);
3970}
3971
3972/*
3973 * Called when a new thread is created, e.g. by fork().
3974 */
3975void
3976umtx_thread_alloc(struct thread *td)
3977{
3978	struct umtx_q *uq;
3979
3980	uq = td->td_umtxq;
3981	uq->uq_inherited_pri = PRI_MAX;
3982
3983	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
3984	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
3985	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
3986	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
3987}
3988
3989/*
3990 * exec() hook.
3991 */
3992static void
3993umtx_exec_hook(void *arg __unused, struct proc *p __unused,
3994	struct image_params *imgp __unused)
3995{
3996	umtx_thread_cleanup(curthread);
3997}
3998
3999/*
4000 * thread_exit() hook.
4001 */
4002void
4003umtx_thread_exit(struct thread *td)
4004{
4005	umtx_thread_cleanup(td);
4006}
4007
4008/*
4009 * Clean up umtx data.
4010 */
4011static void
4012umtx_thread_cleanup(struct thread *td)
4013{
4014	struct umtx_q *uq;
4015	struct umtx_pi *pi;
4016
4017	if ((uq = td->td_umtxq) == NULL)
4018		return;
4019
4020	mtx_lock_spin(&umtx_lock);
4021	uq->uq_inherited_pri = PRI_MAX;
4022	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
4023		pi->pi_owner = NULL;
4024		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
4025	}
4026	mtx_unlock_spin(&umtx_lock);
4027	thread_lock(td);
4028	sched_lend_user_prio(td, PRI_MAX);
4029	thread_unlock(td);
4030}
4031