1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2009-2021 Dmitry Chagin <dchagin@FreeBSD.org>
5 * Copyright (c) 2008 Roman Divacky
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29#include <sys/param.h>
30#include <sys/imgact.h>
31#include <sys/imgact_elf.h>
32#include <sys/ktr.h>
33#include <sys/lock.h>
34#include <sys/mutex.h>
35#include <sys/priv.h>
36#include <sys/proc.h>
37#include <sys/sched.h>
38#include <sys/sysent.h>
39#include <sys/vnode.h>
40#include <sys/umtxvar.h>
41
42#ifdef COMPAT_LINUX32
43#include <machine/../linux32/linux.h>
44#include <machine/../linux32/linux32_proto.h>
45#else
46#include <machine/../linux/linux.h>
47#include <machine/../linux/linux_proto.h>
48#endif
49#include <compat/linux/linux_emul.h>
50#include <compat/linux/linux_futex.h>
51#include <compat/linux/linux_misc.h>
52#include <compat/linux/linux_time.h>
53#include <compat/linux/linux_util.h>
54
55#define	FUTEX_SHARED	0x8     /* shared futex */
56#define	FUTEX_UNOWNED	0
57
58#define	GET_SHARED(a)	(a->flags & FUTEX_SHARED) ? AUTO_SHARE : THREAD_SHARE
59
60static int futex_atomic_op(struct thread *, int, uint32_t *, int *);
61static int handle_futex_death(struct thread *td, struct linux_emuldata *,
62    uint32_t *, unsigned int, bool);
63static int fetch_robust_entry(struct linux_robust_list **,
64    struct linux_robust_list **, unsigned int *);
65
66struct linux_futex_args {
67	uint32_t	*uaddr;
68	int32_t		op;
69	uint32_t	flags;
70	bool		clockrt;
71	uint32_t	val;
72	struct timespec	*ts;
73	uint32_t	*uaddr2;
74	uint32_t	val3;
75	bool		val3_compare;
76	struct timespec	kts;
77};
78
79static inline int futex_key_get(const void *, int, int, struct umtx_key *);
80static void linux_umtx_abs_timeout_init(struct umtx_abs_timeout *,
81	    struct linux_futex_args *);
82static int linux_futex(struct thread *, struct linux_futex_args *);
83static int linux_futex_wait(struct thread *, struct linux_futex_args *);
84static int linux_futex_wake(struct thread *, struct linux_futex_args *);
85static int linux_futex_requeue(struct thread *, struct linux_futex_args *);
86static int linux_futex_wakeop(struct thread *, struct linux_futex_args *);
87static int linux_futex_lock_pi(struct thread *, bool, struct linux_futex_args *);
88static int linux_futex_unlock_pi(struct thread *, bool,
89	    struct linux_futex_args *);
90static int futex_wake_pi(struct thread *, uint32_t *, bool);
91
92static int
93futex_key_get(const void *uaddr, int type, int share, struct umtx_key *key)
94{
95
96	/* Check that futex address is a 32bit aligned. */
97	if (!__is_aligned(uaddr, sizeof(uint32_t)))
98		return (EINVAL);
99	return (umtx_key_get(uaddr, type, share, key));
100}
101
102int
103futex_wake(struct thread *td, uint32_t *uaddr, int val, bool shared)
104{
105	struct linux_futex_args args;
106
107	bzero(&args, sizeof(args));
108	args.op = LINUX_FUTEX_WAKE;
109	args.uaddr = uaddr;
110	args.flags = shared == true ? FUTEX_SHARED : 0;
111	args.val = val;
112	args.val3 = FUTEX_BITSET_MATCH_ANY;
113
114	return (linux_futex_wake(td, &args));
115}
116
117static int
118futex_wake_pi(struct thread *td, uint32_t *uaddr, bool shared)
119{
120	struct linux_futex_args args;
121
122	bzero(&args, sizeof(args));
123	args.op = LINUX_FUTEX_UNLOCK_PI;
124	args.uaddr = uaddr;
125	args.flags = shared == true ? FUTEX_SHARED : 0;
126
127	return (linux_futex_unlock_pi(td, true, &args));
128}
129
130static int
131futex_atomic_op(struct thread *td, int encoded_op, uint32_t *uaddr,
132    int *res)
133{
134	int op = (encoded_op >> 28) & 7;
135	int cmp = (encoded_op >> 24) & 15;
136	int oparg = (encoded_op << 8) >> 20;
137	int cmparg = (encoded_op << 20) >> 20;
138	int oldval = 0, ret;
139
140	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
141		oparg = 1 << oparg;
142
143	switch (op) {
144	case FUTEX_OP_SET:
145		ret = futex_xchgl(oparg, uaddr, &oldval);
146		break;
147	case FUTEX_OP_ADD:
148		ret = futex_addl(oparg, uaddr, &oldval);
149		break;
150	case FUTEX_OP_OR:
151		ret = futex_orl(oparg, uaddr, &oldval);
152		break;
153	case FUTEX_OP_ANDN:
154		ret = futex_andl(~oparg, uaddr, &oldval);
155		break;
156	case FUTEX_OP_XOR:
157		ret = futex_xorl(oparg, uaddr, &oldval);
158		break;
159	default:
160		ret = ENOSYS;
161		break;
162	}
163
164	if (ret != 0)
165		return (ret);
166
167	switch (cmp) {
168	case FUTEX_OP_CMP_EQ:
169		*res = (oldval == cmparg);
170		break;
171	case FUTEX_OP_CMP_NE:
172		*res = (oldval != cmparg);
173		break;
174	case FUTEX_OP_CMP_LT:
175		*res = (oldval < cmparg);
176		break;
177	case FUTEX_OP_CMP_GE:
178		*res = (oldval >= cmparg);
179		break;
180	case FUTEX_OP_CMP_LE:
181		*res = (oldval <= cmparg);
182		break;
183	case FUTEX_OP_CMP_GT:
184		*res = (oldval > cmparg);
185		break;
186	default:
187		ret = ENOSYS;
188	}
189
190	return (ret);
191}
192
/*
 * Top-level futex dispatcher.  Decodes the modifier bits of the op word
 * (FUTEX_PRIVATE_FLAG, FUTEX_CLOCK_REALTIME) into args->flags and
 * args->clockrt, then routes the remaining command to the per-operation
 * implementation.  Returns an errno.
 */
static int
linux_futex(struct thread *td, struct linux_futex_args *args)
{
	struct linux_pemuldata *pem;
	struct proc *p;

	if (args->op & LINUX_FUTEX_PRIVATE_FLAG) {
		/* Process-private futex: cheaper, keyed by address only. */
		args->flags = 0;
		args->op &= ~LINUX_FUTEX_PRIVATE_FLAG;
	} else
		args->flags = FUTEX_SHARED;

	args->clockrt = args->op & LINUX_FUTEX_CLOCK_REALTIME;
	args->op = args->op & ~LINUX_FUTEX_CLOCK_REALTIME;

	/*
	 * Linux accepts FUTEX_CLOCK_REALTIME only with these three
	 * operations; reject it elsewhere, as Linux does.
	 */
	if (args->clockrt &&
	    args->op != LINUX_FUTEX_WAIT_BITSET &&
	    args->op != LINUX_FUTEX_WAIT_REQUEUE_PI &&
	    args->op != LINUX_FUTEX_LOCK_PI2)
		return (ENOSYS);

	switch (args->op) {
	case LINUX_FUTEX_WAIT:
		/* Plain WAIT is WAIT_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAIT_BITSET:
		LINUX_CTR3(sys_futex, "WAIT uaddr %p val 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wait(td, args));

	case LINUX_FUTEX_WAKE:
		/* Plain WAKE is WAKE_BITSET with an all-ones bitset. */
		args->val3 = FUTEX_BITSET_MATCH_ANY;
		/* FALLTHROUGH */

	case LINUX_FUTEX_WAKE_BITSET:
		LINUX_CTR3(sys_futex, "WAKE uaddr %p nrwake 0x%x bitset 0x%x",
		    args->uaddr, args->val, args->val3);

		return (linux_futex_wake(td, args));

	case LINUX_FUTEX_REQUEUE:
		/*
		 * Glibc does not use this operation since version 2.3.3,
		 * as it is racy and replaced by FUTEX_CMP_REQUEUE operation.
		 * Glibc versions prior to 2.3.3 fall back to FUTEX_WAKE when
		 * FUTEX_REQUEUE returned EINVAL.
		 */
		pem = pem_find(td->td_proc);
		/* Log the deprecated-op complaint only once per process. */
		if ((pem->flags & LINUX_XDEPR_REQUEUEOP) == 0) {
			linux_msg(td, "unsupported FUTEX_REQUEUE");
			pem->flags |= LINUX_XDEPR_REQUEUEOP;
		}

		/*
		 * The above is true, however musl libc does make use of the
		 * futex requeue operation, allow operation for brands which
		 * set LINUX_BI_FUTEX_REQUEUE bit of Brandinfo flags.
		 */
		p = td->td_proc;
		Elf_Brandinfo *bi = p->p_elf_brandinfo;
		if (bi == NULL || ((bi->flags & LINUX_BI_FUTEX_REQUEUE)) == 0)
			return (EINVAL);
		/* FUTEX_REQUEUE skips the uval == val3 comparison. */
		args->val3_compare = false;
		/* FALLTHROUGH */

	case LINUX_FUTEX_CMP_REQUEUE:
		LINUX_CTR5(sys_futex, "CMP_REQUEUE uaddr %p "
		    "nrwake 0x%x uval 0x%x uaddr2 %p nrequeue 0x%x",
		    args->uaddr, args->val, args->val3, args->uaddr2,
		    args->ts);

		return (linux_futex_requeue(td, args));

	case LINUX_FUTEX_WAKE_OP:
		LINUX_CTR5(sys_futex, "WAKE_OP "
		    "uaddr %p nrwake 0x%x uaddr2 %p op 0x%x nrwake2 0x%x",
		    args->uaddr, args->val, args->uaddr2, args->val3,
		    args->ts);

		return (linux_futex_wakeop(td, args));

	case LINUX_FUTEX_LOCK_PI:
		/* LOCK_PI always measures its timeout against CLOCK_REALTIME. */
		args->clockrt = true;
		/* FALLTHROUGH */

	case LINUX_FUTEX_LOCK_PI2:
		LINUX_CTR2(sys_futex, "LOCKPI uaddr %p val 0x%x",
		    args->uaddr, args->val);

		return (linux_futex_lock_pi(td, false, args));

	case LINUX_FUTEX_UNLOCK_PI:
		LINUX_CTR1(sys_futex, "UNLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_unlock_pi(td, false, args));

	case LINUX_FUTEX_TRYLOCK_PI:
		LINUX_CTR1(sys_futex, "TRYLOCKPI uaddr %p",
		    args->uaddr);

		return (linux_futex_lock_pi(td, true, args));

	/*
	 * Current implementation of FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI
	 * can't be used anymore to implement conditional variables.
	 * A detailed explanation can be found here:
	 *
	 * https://sourceware.org/bugzilla/show_bug.cgi?id=13165
	 * and here http://austingroupbugs.net/view.php?id=609
	 *
	 * And since commit
	 * https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=ed19993b5b0d05d62cc883571519a67dae481a14
	 * glibc does not use them.
	 */
	case LINUX_FUTEX_WAIT_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_WAIT_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	case LINUX_FUTEX_CMP_REQUEUE_PI:
		/* not yet implemented */
		pem = pem_find(td->td_proc);
		if ((pem->flags & LINUX_XUNSUP_FUTEXPIOP) == 0) {
			linux_msg(td, "unsupported FUTEX_CMP_REQUEUE_PI");
			pem->flags |= LINUX_XUNSUP_FUTEXPIOP;
		}
		return (ENOSYS);

	default:
		linux_msg(td, "unsupported futex op %d", args->op);
		return (ENOSYS);
	}
}
333
334/*
335 * pi protocol:
336 * - 0 futex word value means unlocked.
337 * - TID futex word value means locked.
338 * Userspace uses atomic ops to lock/unlock these futexes without entering the
339 * kernel. If the lock-acquire fastpath fails, (transition from 0 to TID fails),
340 * then FUTEX_LOCK_PI is called.
341 * The kernel atomically set FUTEX_WAITERS bit in the futex word value, if no
342 * other waiters exists looks up the thread that owns the futex (it has put its
343 * own TID into the futex value) and made this thread the owner of the internal
344 * pi-aware lock object (mutex). Then the kernel tries to lock the internal lock
345 * object, on which it blocks. Once it returns, it has the mutex acquired, and it
346 * sets the futex value to its own TID and returns (futex value contains
347 * FUTEX_WAITERS|TID).
348 * The unlock fastpath would fail (because the FUTEX_WAITERS bit is set) and
349 * FUTEX_UNLOCK_PI will be called.
350 * If a futex is found to be held at exit time, the kernel sets the OWNER_DIED
351 * bit of the futex word and wakes up the next futex waiter (if any), WAITERS
352 * bit is preserved (if any).
353 * If OWNER_DIED bit is set the kernel sanity checks the futex word value against
354 * the internal futex state and if correct, acquire futex.
355 */
/*
 * Implement FUTEX_LOCK_PI / FUTEX_LOCK_PI2 / FUTEX_TRYLOCK_PI ('try')
 * following the pi protocol described in the comment above: attempt the
 * uncontested 0 -> TID transition, handle an OWNER_DIED word, otherwise
 * set FUTEX_WAITERS and sleep on the internal pi-aware mutex.
 */
static int
linux_futex_lock_pi(struct thread *td, bool try, struct linux_futex_args *args)
{
	struct umtx_abs_timeout timo;
	struct linux_emuldata *em;
	struct umtx_pi *pi, *new_pi;
	struct thread *td1;
	struct umtx_q *uq;
	int error, rv;
	uint32_t owner, old_owner;

	em = em_find(td);
	uq = td->td_umtxq;
	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args),
	    &uq->uq_key);
	if (error != 0)
		return (error);
	if (args->ts != NULL)
		linux_umtx_abs_timeout_init(&timo, args);

	/*
	 * Find or create the in-kernel pi object for this key.  The
	 * M_NOWAIT attempt avoids dropping the queue lock; if it fails
	 * we must unlock to sleep in M_WAITOK, and then re-lookup in
	 * case another thread inserted a pi object meanwhile.
	 */
	umtxq_lock(&uq->uq_key);
	pi = umtx_pi_lookup(&uq->uq_key);
	if (pi == NULL) {
		new_pi = umtx_pi_alloc(M_NOWAIT);
		if (new_pi == NULL) {
			umtxq_unlock(&uq->uq_key);
			new_pi = umtx_pi_alloc(M_WAITOK);
			umtxq_lock(&uq->uq_key);
			pi = umtx_pi_lookup(&uq->uq_key);
			if (pi != NULL) {
				umtx_pi_free(new_pi);
				new_pi = NULL;
			}
		}
		if (new_pi != NULL) {
			new_pi->pi_key = uq->uq_key;
			umtx_pi_insert(new_pi);
			pi = new_pi;
		}
	}
	umtx_pi_ref(pi);
	umtxq_unlock(&uq->uq_key);
	for (;;) {
		/* Try uncontested case first. */
		rv = casueword32(args->uaddr, FUTEX_UNOWNED, &owner, em->em_tid);
		/* The acquire succeeded. */
		if (rv == 0) {
			error = 0;
			break;
		}
		if (rv == -1) {
			error = EFAULT;
			break;
		}

		/*
		 * Nobody owns it, but the acquire failed. This can happen
		 * with ll/sc atomic.
		 */
		if (owner == FUTEX_UNOWNED) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
			continue;
		}

		/*
		 * Avoid overwriting a possible error from sleep due
		 * to the pending signal with suspension check result.
		 */
		if (error == 0) {
			error = thread_check_susp(td, true);
			if (error != 0)
				break;
		}

		/* The futex word at *uaddr is already locked by the caller. */
		if ((owner & FUTEX_TID_MASK) == em->em_tid) {
			error = EDEADLK;
			break;
		}

		/*
		 * Futex owner died, handle_futex_death() set the OWNER_DIED bit
		 * and clear tid. Try to acquire it.
		 */
		if ((owner & FUTEX_TID_MASK) == FUTEX_UNOWNED) {
			/* Keep WAITERS/OWNER_DIED bits, install our tid. */
			old_owner = owner;
			owner = owner & (FUTEX_WAITERS | FUTEX_OWNER_DIED);
			owner |= em->em_tid;
			rv = casueword32(args->uaddr, old_owner, &owner, owner);
			if (rv == -1) {
				error = EFAULT;
				break;
			}
			if (rv == 1) {
				if (error == 0) {
					error = thread_check_susp(td, true);
					if (error != 0)
						break;
				}

				/*
				 * If this failed the lock could
				 * changed, restart.
				 */
				continue;
			}

			/* We hold the word; claim the in-kernel pi object. */
			umtxq_lock(&uq->uq_key);
			umtxq_busy(&uq->uq_key);
			error = umtx_pi_claim(pi, td);
			umtxq_unbusy(&uq->uq_key);
			umtxq_unlock(&uq->uq_key);
			if (error != 0) {
				/*
				 * Since we're going to return an
				 * error, restore the futex to its
				 * previous, unowned state to avoid
				 * compounding the problem.
				 */
				(void)casuword32(args->uaddr, owner, old_owner);
			}
			break;
		}

		/*
		 * Inconsistent state: OWNER_DIED is set and tid is not 0.
		 * Linux does some checks of futex state, we return EINVAL,
		 * as the user space can take care of this.
		 */
		if ((owner & FUTEX_OWNER_DIED) != FUTEX_UNOWNED) {
			error = EINVAL;
			break;
		}

		/* Trylock never blocks: the futex is held, report EBUSY. */
		if (try != 0) {
			error = EBUSY;
			break;
		}

		/*
		 * If we caught a signal, we have retried and now
		 * exit immediately.
		 */
		if (error != 0)
			break;

		umtxq_lock(&uq->uq_key);
		umtxq_busy(&uq->uq_key);
		umtxq_unlock(&uq->uq_key);

		/*
		 * Set the contested bit so that a release in user space knows
		 * to use the system call for unlock. If this fails either some
		 * one else has acquired the lock or it has been released.
		 */
		rv = casueword32(args->uaddr, owner, &owner,
		    owner | FUTEX_WAITERS);
		if (rv == -1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EFAULT;
			break;
		}
		if (rv == 1) {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = thread_check_susp(td, true);
			if (error != 0)
				break;

			/*
			 * The lock changed and we need to retry or we
			 * lost a race to the thread unlocking the umtx.
			 */
			continue;
		}

		/*
		 * Substitute Linux thread id by native thread id to
		 * avoid refactoring code of umtxq_sleep_pi().
		 */
		td1 = linux_tdfind(td, owner & FUTEX_TID_MASK, -1);
		if (td1 != NULL) {
			/* linux_tdfind() returns with the proc locked. */
			owner = td1->td_tid;
			PROC_UNLOCK(td1->td_proc);
		} else {
			umtxq_unbusy_unlocked(&uq->uq_key);
			error = EINVAL;
			break;
		}

		umtxq_lock(&uq->uq_key);

		/* We set the contested bit, sleep. */
		error = umtxq_sleep_pi(uq, pi, owner, "futexp",
		    args->ts == NULL ? NULL : &timo,
		    (args->flags & FUTEX_SHARED) != 0);
		if (error != 0)
			continue;

		error = thread_check_susp(td, false);
		if (error != 0)
			break;
	}

	/* Drop the pi reference taken above and release the key. */
	umtxq_lock(&uq->uq_key);
	umtx_pi_unref(pi);
	umtxq_unlock(&uq->uq_key);
	umtx_key_release(&uq->uq_key);
	return (error);
}
567
/*
 * Implement FUTEX_UNLOCK_PI.  With rb == true this is the robust-list
 * cleanup path (owner died): the TID ownership check is skipped and
 * umtx_pi_drop() is told to mark the next owner accordingly.
 */
static int
linux_futex_unlock_pi(struct thread *td, bool rb, struct linux_futex_args *args)
{
	struct linux_emuldata *em;
	struct umtx_key key;
	uint32_t old, owner, new_owner;
	int count, error;

	em = em_find(td);

	/*
	 * Make sure we own this mtx.
	 */
	error = fueword32(args->uaddr, &owner);
	if (error == -1)
		return (EFAULT);
	if (!rb && (owner & FUTEX_TID_MASK) != em->em_tid)
		return (EPERM);

	error = futex_key_get(args->uaddr, TYPE_PI_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	umtxq_lock(&key);
	umtxq_busy(&key);
	/* Hand the in-kernel pi lock to the next waiter (count returned). */
	error = umtx_pi_drop(td, &key, rb, &count);
	if (error != 0 || rb) {
		/* The robust path leaves the futex word to the caller. */
		umtxq_unbusy(&key);
		umtxq_unlock(&key);
		umtx_key_release(&key);
		return (error);
	}
	umtxq_unlock(&key);

	/*
	 * When unlocking the futex, it must be marked as unowned if
	 * there is zero or one thread only waiting for it.
	 * Otherwise, it must be marked as contested.
	 */
	if (count > 1)
		new_owner = FUTEX_WAITERS;
	else
		new_owner = FUTEX_UNOWNED;

again:
	/* Publish the new owner word; retry if userspace raced us. */
	error = casueword32(args->uaddr, owner, &old, new_owner);
	if (error == 1) {
		error = thread_check_susp(td, false);
		if (error == 0)
			goto again;
	}
	umtxq_unbusy_unlocked(&key);
	umtx_key_release(&key);
	if (error == -1)
		return (EFAULT);
	/* The word changed under us: userspace broke the protocol. */
	if (error == 0 && old != owner)
		return (EINVAL);
	return (error);
}
626
/*
 * Implement FUTEX_WAKE_OP: atomically perform the encoded operation on
 * *uaddr2, wake up to args->val waiters on uaddr, and, if the encoded
 * comparison on uaddr2's old value succeeded, wake up to 'nrwake'
 * additional waiters (nrwake is smuggled through the timeout argument).
 */
static int
linux_futex_wakeop(struct thread *td, struct linux_futex_args *args)
{
	struct umtx_key key, key2;
	int nrwake, op_ret, ret;
	int error, count;

	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	/* Busy the first queue so wakeups and the atomic op are ordered. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	error = futex_atomic_op(td, args->val3, args->uaddr2, &op_ret);
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error != 0)
		goto out;
	ret = umtxq_signal_mask(&key, args->val, args->val3);
	if (op_ret > 0) {
		/* nrwake2 arrives via the reinterpreted timeout pointer. */
		nrwake = (int)(unsigned long)args->ts;
		umtxq_lock(&key2);
		count = umtxq_count(&key2);
		if (count > 0)
			ret += umtxq_signal_mask(&key2, nrwake, args->val3);
		else
			/*
			 * NOTE(review): with no waiters on uaddr2 the second
			 * wakeup is applied to uaddr instead — confirm this
			 * fallback matches Linux FUTEX_WAKE_OP semantics.
			 */
			ret += umtxq_signal_mask(&key, nrwake, args->val3);
		umtxq_unlock(&key2);
	}
	td->td_retval[0] = ret;
out:
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}
671
/*
 * Implement FUTEX_REQUEUE / FUTEX_CMP_REQUEUE: wake up to 'nrwake'
 * waiters on uaddr and move up to 'nrrequeue' of the remaining ones to
 * uaddr2.  For CMP_REQUEUE (val3_compare set) the operation aborts with
 * EWOULDBLOCK if *uaddr no longer equals val3.  nrrequeue arrives via
 * the reinterpreted timeout argument.
 */
static int
linux_futex_requeue(struct thread *td, struct linux_futex_args *args)
{
	int nrwake, nrrequeue;
	struct umtx_key key, key2;
	int error;
	uint32_t uval;

	/*
	 * Linux allows this, we would not, it is an incorrect
	 * usage of declared ABI, so return EINVAL.
	 */
	if (args->uaddr == args->uaddr2)
		return (EINVAL);

	nrrequeue = (int)(unsigned long)args->ts;
	nrwake = args->val;
	/*
	 * Sanity check to prevent signed integer overflow,
	 * see Linux CVE-2018-6927
	 */
	if (nrwake < 0 || nrrequeue < 0)
		return (EINVAL);

	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
	if (error != 0)
		return (error);
	error = futex_key_get(args->uaddr2, TYPE_FUTEX, GET_SHARED(args), &key2);
	if (error != 0) {
		umtx_key_release(&key);
		return (error);
	}
	/* Busy the source queue while we examine the futex word. */
	umtxq_lock(&key);
	umtxq_busy(&key);
	umtxq_unlock(&key);
	error = fueword32(args->uaddr, &uval);
	if (error != 0)
		error = EFAULT;
	else if (args->val3_compare == true && uval != args->val3)
		error = EWOULDBLOCK;
	umtxq_lock(&key);
	umtxq_unbusy(&key);
	if (error == 0) {
		/* Wake nrwake waiters, requeue up to nrrequeue more. */
		umtxq_lock(&key2);
		td->td_retval[0] = umtxq_requeue(&key, nrwake, &key2, nrrequeue);
		umtxq_unlock(&key2);
	}
	umtxq_unlock(&key);
	umtx_key_release(&key2);
	umtx_key_release(&key);
	return (error);
}
724
725static int
726linux_futex_wake(struct thread *td, struct linux_futex_args *args)
727{
728	struct umtx_key key;
729	int error;
730
731	if (args->val3 == 0)
732		return (EINVAL);
733
734	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args), &key);
735	if (error != 0)
736		return (error);
737	umtxq_lock(&key);
738	td->td_retval[0] = umtxq_signal_mask(&key, args->val, args->val3);
739	umtxq_unlock(&key);
740	umtx_key_release(&key);
741	return (0);
742}
743
744static int
745linux_futex_wait(struct thread *td, struct linux_futex_args *args)
746{
747	struct umtx_abs_timeout timo;
748	struct umtx_q *uq;
749	uint32_t uval;
750	int error;
751
752	if (args->val3 == 0)
753		error = EINVAL;
754
755	uq = td->td_umtxq;
756	error = futex_key_get(args->uaddr, TYPE_FUTEX, GET_SHARED(args),
757	    &uq->uq_key);
758	if (error != 0)
759		return (error);
760	if (args->ts != NULL)
761		linux_umtx_abs_timeout_init(&timo, args);
762	umtxq_lock(&uq->uq_key);
763	umtxq_busy(&uq->uq_key);
764	uq->uq_bitset = args->val3;
765	umtxq_insert(uq);
766	umtxq_unlock(&uq->uq_key);
767	error = fueword32(args->uaddr, &uval);
768	if (error != 0)
769		error = EFAULT;
770	else if (uval != args->val)
771		error = EWOULDBLOCK;
772	umtxq_lock(&uq->uq_key);
773	umtxq_unbusy(&uq->uq_key);
774	if (error == 0) {
775		error = umtxq_sleep(uq, "futex",
776		    args->ts == NULL ? NULL : &timo);
777		if ((uq->uq_flags & UQF_UMTXQ) == 0)
778			error = 0;
779		else
780			umtxq_remove(uq);
781	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
782		umtxq_remove(uq);
783	}
784	umtxq_unlock(&uq->uq_key);
785	umtx_key_release(&uq->uq_key);
786	if (error == ERESTART)
787		error = EINTR;
788	return (error);
789}
790
791static void
792linux_umtx_abs_timeout_init(struct umtx_abs_timeout *timo,
793    struct linux_futex_args *args)
794{
795	int clockid, absolute;
796
797	/*
798	 * The FUTEX_CLOCK_REALTIME option bit can be employed only with the
799	 * FUTEX_WAIT_BITSET, FUTEX_WAIT_REQUEUE_PI, FUTEX_LOCK_PI2.
800	 * For FUTEX_WAIT, timeout is interpreted as a relative value, for other
801	 * futex operations timeout is interpreted as an absolute value.
802	 * If FUTEX_CLOCK_REALTIME option bit is set, the Linux kernel measures
803	 * the timeout against the CLOCK_REALTIME clock, otherwise the kernel
804	 * measures the timeout against the CLOCK_MONOTONIC clock.
805	 */
806	clockid = args->clockrt ? CLOCK_REALTIME : CLOCK_MONOTONIC;
807	absolute = args->op == LINUX_FUTEX_WAIT ? false : true;
808	umtx_abs_timeout_init(timo, clockid, absolute, args->ts);
809}
810
811int
812linux_sys_futex(struct thread *td, struct linux_sys_futex_args *args)
813{
814	struct linux_futex_args fargs = {
815		.uaddr = args->uaddr,
816		.op = args->op,
817		.val = args->val,
818		.ts = NULL,
819		.uaddr2 = args->uaddr2,
820		.val3 = args->val3,
821		.val3_compare = true,
822	};
823	int error;
824
825	switch (args->op & LINUX_FUTEX_CMD_MASK) {
826	case LINUX_FUTEX_WAIT:
827	case LINUX_FUTEX_WAIT_BITSET:
828	case LINUX_FUTEX_LOCK_PI:
829	case LINUX_FUTEX_LOCK_PI2:
830		if (args->timeout != NULL) {
831			error = linux_get_timespec(&fargs.kts, args->timeout);
832			if (error != 0)
833				return (error);
834			fargs.ts = &fargs.kts;
835		}
836		break;
837	default:
838		fargs.ts = PTRIN(args->timeout);
839	}
840	return (linux_futex(td, &fargs));
841}
842
843#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
844int
845linux_sys_futex_time64(struct thread *td,
846    struct linux_sys_futex_time64_args *args)
847{
848	struct linux_futex_args fargs = {
849		.uaddr = args->uaddr,
850		.op = args->op,
851		.val = args->val,
852		.ts = NULL,
853		.uaddr2 = args->uaddr2,
854		.val3 = args->val3,
855		.val3_compare = true,
856	};
857	int error;
858
859	switch (args->op & LINUX_FUTEX_CMD_MASK) {
860	case LINUX_FUTEX_WAIT:
861	case LINUX_FUTEX_WAIT_BITSET:
862	case LINUX_FUTEX_LOCK_PI:
863	case LINUX_FUTEX_LOCK_PI2:
864		if (args->timeout != NULL) {
865			error = linux_get_timespec64(&fargs.kts, args->timeout);
866			if (error != 0)
867				return (error);
868			fargs.ts = &fargs.kts;
869		}
870		break;
871	default:
872		fargs.ts = PTRIN(args->timeout);
873	}
874	return (linux_futex(td, &fargs));
875}
876#endif
877
878int
879linux_set_robust_list(struct thread *td, struct linux_set_robust_list_args *args)
880{
881	struct linux_emuldata *em;
882
883	if (args->len != sizeof(struct linux_robust_list_head))
884		return (EINVAL);
885
886	em = em_find(td);
887	em->robust_futexes = args->head;
888
889	return (0);
890}
891
/*
 * get_robust_list(2): return the robust futex list head pointer (and
 * its fixed size) of the calling thread, or of the thread identified by
 * args->pid, subject to a credential/debug check.
 */
int
linux_get_robust_list(struct thread *td, struct linux_get_robust_list_args *args)
{
	struct linux_emuldata *em;
	struct linux_robust_list_head *head;
	l_size_t len;
	struct thread *td2;
	int error;

	if (!args->pid) {
		em = em_find(td);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		head = em->robust_futexes;
	} else {
		/* linux_tdfind() returns with the target proc locked. */
		td2 = linux_tdfind(td, args->pid, -1);
		if (td2 == NULL)
			return (ESRCH);
		if (SV_PROC_ABI(td2->td_proc) != SV_ABI_LINUX) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}

		em = em_find(td2);
		KASSERT(em != NULL, ("get_robust_list: emuldata notfound.\n"));
		/* XXX: ptrace? */
		if (priv_check(td, PRIV_CRED_SETUID) ||
		    priv_check(td, PRIV_CRED_SETEUID) ||
		    p_candebug(td, td2->td_proc)) {
			PROC_UNLOCK(td2->td_proc);
			return (EPERM);
		}
		head = em->robust_futexes;

		PROC_UNLOCK(td2->td_proc);
	}

	len = sizeof(struct linux_robust_list_head);
	error = copyout(&len, args->len, sizeof(l_size_t));
	if (error != 0)
		return (EFAULT);

	/*
	 * NOTE(review): copies sizeof(l_uintptr_t) bytes from the start of
	 * the kernel pointer variable; for COMPAT_LINUX32 this yields the
	 * low 32 bits only on little-endian layouts — confirm intended.
	 */
	return (copyout(&head, args->head, sizeof(l_uintptr_t)));
}
935
/*
 * Process one robust-list entry for an exiting thread: if the futex at
 * uaddr is owned by the dead thread, mark it FUTEX_OWNER_DIED (keeping
 * the WAITERS bit) and wake one waiter.  'pi' is nonzero for PI
 * futexes; 'pending_op' marks the entry from the list's pending slot.
 */
static int
handle_futex_death(struct thread *td, struct linux_emuldata *em, uint32_t *uaddr,
    unsigned int pi, bool pending_op)
{
	uint32_t uval, nval, mval;
	int error;

retry:
	error = fueword32(uaddr, &uval);
	if (error != 0)
		return (EFAULT);

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases it is safe to attempt waking up a potential
	 * waiter without touching the user space futex value and trying
	 * to set the OWNER_DIED bit.
	 */
	if (pending_op && !pi && !uval) {
		(void)futex_wake(td, uaddr, 1, true);
		return (0);
	}

	if ((uval & FUTEX_TID_MASK) == em->em_tid) {
		/* Preserve WAITERS, clear the tid, flag the dead owner. */
		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
		error = casueword32(uaddr, uval, &nval, mval);
		if (error == -1)
			return (EFAULT);
		if (error == 1) {
			/* The word changed under us; re-read and retry. */
			error = thread_check_susp(td, false);
			if (error != 0)
				return (error);
			goto retry;
		}

		if (!pi && (uval & FUTEX_WAITERS)) {
			error = futex_wake(td, uaddr, 1, true);
			if (error != 0)
				return (error);
		} else if (pi && (uval & FUTEX_WAITERS)) {
			/* PI futexes hand off via the unlock path. */
			error = futex_wake_pi(td, uaddr, true);
			if (error != 0)
				return (error);
		}
	}

	return (0);
}
997
998static int
999fetch_robust_entry(struct linux_robust_list **entry,
1000    struct linux_robust_list **head, unsigned int *pi)
1001{
1002	l_ulong uentry;
1003	int error;
1004
1005	error = copyin((const void *)head, &uentry, sizeof(uentry));
1006	if (error != 0)
1007		return (EFAULT);
1008
1009	*entry = (void *)(uentry & ~1UL);
1010	*pi = uentry & 1;
1011
1012	return (0);
1013}
1014
1015#define	LINUX_HANDLE_DEATH_PENDING	true
1016#define	LINUX_HANDLE_DEATH_LIST		false
1017
1018/* This walks the list of robust futexes releasing them. */
/* This walks the list of robust futexes releasing them. */
void
release_futexes(struct thread *td, struct linux_emuldata *em)
{
	struct linux_robust_list_head *head;
	struct linux_robust_list *entry, *next_entry, *pending;
	unsigned int limit = 2048, pi, next_pi, pip;
	uint32_t *uaddr;
	l_long futex_offset;
	int error;

	head = em->robust_futexes;
	if (head == NULL)
		return;

	/* All list pointers and offsets come from (untrusted) user space. */
	if (fetch_robust_entry(&entry, PTRIN(&head->list.next), &pi))
		return;

	error = copyin(&head->futex_offset, &futex_offset,
	    sizeof(futex_offset));
	if (error != 0)
		return;

	if (fetch_robust_entry(&pending, PTRIN(&head->pending_list), &pip))
		return;

	while (entry != &head->list) {
		/* Fetch the next link before this entry is released. */
		error = fetch_robust_entry(&next_entry, PTRIN(&entry->next),
		    &next_pi);

		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice.
		 */
		if (entry != pending) {
			/* The futex word lives at entry + futex_offset. */
			uaddr = (uint32_t *)((caddr_t)entry + futex_offset);
			if (handle_futex_death(td, em, uaddr, pi,
			    LINUX_HANDLE_DEATH_LIST))
				return;
		}
		if (error != 0)
			return;

		entry = next_entry;
		pi = next_pi;

		/* Bound the walk: userspace could construct a cycle. */
		if (!--limit)
			break;

		/* Be kind to the rest of the system during long walks. */
		sched_relinquish(curthread);
	}

	if (pending) {
		uaddr = (uint32_t *)((caddr_t)pending + futex_offset);
		(void)handle_futex_death(td, em, uaddr, pip,
		    LINUX_HANDLE_DEATH_PENDING);
	}
}
1076