1/*-
2 * SPDX-License-Identifier: BSD-3-Clause
3 *
4 * Copyright (c) 2002 Doug Rabson
 * Copyright (c) 1994-1995 Søren Schmidt
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer
13 *    in this position and unchanged.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 * 3. The name of the author may not be used to endorse or promote products
18 *    derived from this software without specific prior written permission
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
23 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
24 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
25 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
29 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/param.h>
33#include <sys/fcntl.h>
34#include <sys/jail.h>
35#include <sys/imgact.h>
36#include <sys/limits.h>
37#include <sys/lock.h>
38#include <sys/msgbuf.h>
39#include <sys/mqueue.h>
40#include <sys/mutex.h>
41#include <sys/poll.h>
42#include <sys/priv.h>
43#include <sys/proc.h>
44#include <sys/procctl.h>
45#include <sys/reboot.h>
46#include <sys/random.h>
47#include <sys/resourcevar.h>
48#include <sys/rtprio.h>
49#include <sys/sched.h>
50#include <sys/smp.h>
51#include <sys/stat.h>
52#include <sys/syscallsubr.h>
53#include <sys/sysctl.h>
54#include <sys/sysent.h>
55#include <sys/sysproto.h>
56#include <sys/time.h>
57#include <sys/vmmeter.h>
58#include <sys/vnode.h>
59
60#include <security/audit/audit.h>
61#include <security/mac/mac_framework.h>
62
63#include <vm/pmap.h>
64#include <vm/vm_map.h>
65#include <vm/swap_pager.h>
66
67#ifdef COMPAT_LINUX32
68#include <machine/../linux32/linux.h>
69#include <machine/../linux32/linux32_proto.h>
70#else
71#include <machine/../linux/linux.h>
72#include <machine/../linux/linux_proto.h>
73#endif
74
75#include <compat/linux/linux_common.h>
76#include <compat/linux/linux_dtrace.h>
77#include <compat/linux/linux_file.h>
78#include <compat/linux/linux_mib.h>
79#include <compat/linux/linux_mmap.h>
80#include <compat/linux/linux_signal.h>
81#include <compat/linux/linux_time.h>
82#include <compat/linux/linux_util.h>
83#include <compat/linux/linux_emul.h>
84#include <compat/linux/linux_misc.h>
85
86int stclohz;				/* Statistics clock frequency */
87
88static unsigned int linux_to_bsd_resource[LINUX_RLIM_NLIMITS] = {
89	RLIMIT_CPU, RLIMIT_FSIZE, RLIMIT_DATA, RLIMIT_STACK,
90	RLIMIT_CORE, RLIMIT_RSS, RLIMIT_NPROC, RLIMIT_NOFILE,
91	RLIMIT_MEMLOCK, RLIMIT_AS
92};
93
94struct l_sysinfo {
95	l_long		uptime;		/* Seconds since boot */
96	l_ulong		loads[3];	/* 1, 5, and 15 minute load averages */
97#define LINUX_SYSINFO_LOADS_SCALE 65536
98	l_ulong		totalram;	/* Total usable main memory size */
99	l_ulong		freeram;	/* Available memory size */
100	l_ulong		sharedram;	/* Amount of shared memory */
101	l_ulong		bufferram;	/* Memory used by buffers */
102	l_ulong		totalswap;	/* Total swap space size */
103	l_ulong		freeswap;	/* swap space still available */
104	l_ushort	procs;		/* Number of current processes */
105	l_ushort	pads;
106	l_ulong		totalhigh;
107	l_ulong		freehigh;
108	l_uint		mem_unit;
109	char		_f[20-2*sizeof(l_long)-sizeof(l_int)];	/* padding */
110};
111
112struct l_pselect6arg {
113	l_uintptr_t	ss;
114	l_size_t	ss_len;
115};
116
117static int	linux_utimensat_lts_to_ts(struct l_timespec *,
118			struct timespec *);
119#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
120static int	linux_utimensat_lts64_to_ts(struct l_timespec64 *,
121			struct timespec *);
122#endif
123static int	linux_common_utimensat(struct thread *, int,
124			const char *, struct timespec *, int);
125static int	linux_common_pselect6(struct thread *, l_int,
126			l_fd_set *, l_fd_set *, l_fd_set *,
127			struct timespec *, l_uintptr_t *);
128static int	linux_common_ppoll(struct thread *, struct pollfd *,
129			uint32_t, struct timespec *, l_sigset_t *,
130			l_size_t);
131static int	linux_pollin(struct thread *, struct pollfd *,
132			struct pollfd *, u_int);
133static int	linux_pollout(struct thread *, struct pollfd *,
134			struct pollfd *, u_int);
135
136int
137linux_sysinfo(struct thread *td, struct linux_sysinfo_args *args)
138{
139	struct l_sysinfo sysinfo;
140	int i, j;
141	struct timespec ts;
142
143	bzero(&sysinfo, sizeof(sysinfo));
144	getnanouptime(&ts);
145	if (ts.tv_nsec != 0)
146		ts.tv_sec++;
147	sysinfo.uptime = ts.tv_sec;
148
149	/* Use the information from the mib to get our load averages */
150	for (i = 0; i < 3; i++)
151		sysinfo.loads[i] = averunnable.ldavg[i] *
152		    LINUX_SYSINFO_LOADS_SCALE / averunnable.fscale;
153
154	sysinfo.totalram = physmem * PAGE_SIZE;
155	sysinfo.freeram = (u_long)vm_free_count() * PAGE_SIZE;
156
157	/*
158	 * sharedram counts pages allocated to named, swap-backed objects such
159	 * as shared memory segments and tmpfs files.  There is no cheap way to
160	 * compute this, so just leave the field unpopulated.  Linux itself only
161	 * started setting this field in the 3.x timeframe.
162	 */
163	sysinfo.sharedram = 0;
164	sysinfo.bufferram = 0;
165
166	swap_pager_status(&i, &j);
167	sysinfo.totalswap = i * PAGE_SIZE;
168	sysinfo.freeswap = (i - j) * PAGE_SIZE;
169
170	sysinfo.procs = nprocs;
171
172	/*
173	 * Platforms supported by the emulation layer do not have a notion of
174	 * high memory.
175	 */
176	sysinfo.totalhigh = 0;
177	sysinfo.freehigh = 0;
178
179	sysinfo.mem_unit = 1;
180
181	return (copyout(&sysinfo, args->info, sizeof(sysinfo)));
182}
183
184#ifdef LINUX_LEGACY_SYSCALLS
185int
186linux_alarm(struct thread *td, struct linux_alarm_args *args)
187{
188	struct itimerval it, old_it;
189	u_int secs;
190	int error __diagused;
191
192	secs = args->secs;
193	/*
194	 * Linux alarm() is always successful. Limit secs to INT32_MAX / 2
195	 * to match kern_setitimer()'s limit to avoid error from it.
196	 *
197	 * XXX. Linux limit secs to INT_MAX on 32 and does not limit on 64-bit
198	 * platforms.
199	 */
200	if (secs > INT32_MAX / 2)
201		secs = INT32_MAX / 2;
202
203	it.it_value.tv_sec = secs;
204	it.it_value.tv_usec = 0;
205	timevalclear(&it.it_interval);
206	error = kern_setitimer(td, ITIMER_REAL, &it, &old_it);
207	KASSERT(error == 0, ("kern_setitimer returns %d", error));
208
209	if ((old_it.it_value.tv_sec == 0 && old_it.it_value.tv_usec > 0) ||
210	    old_it.it_value.tv_usec >= 500000)
211		old_it.it_value.tv_sec++;
212	td->td_retval[0] = old_it.it_value.tv_sec;
213	return (0);
214}
215#endif
216
217int
218linux_brk(struct thread *td, struct linux_brk_args *args)
219{
220	struct vmspace *vm = td->td_proc->p_vmspace;
221	uintptr_t new, old;
222
223	old = (uintptr_t)vm->vm_daddr + ctob(vm->vm_dsize);
224	new = (uintptr_t)args->dsend;
225	if ((caddr_t)new > vm->vm_daddr && !kern_break(td, &new))
226		td->td_retval[0] = (register_t)new;
227	else
228		td->td_retval[0] = (register_t)old;
229
230	return (0);
231}
232
233#ifdef LINUX_LEGACY_SYSCALLS
234int
235linux_select(struct thread *td, struct linux_select_args *args)
236{
237	l_timeval ltv;
238	struct timeval tv0, tv1, utv, *tvp;
239	int error;
240
241	/*
242	 * Store current time for computation of the amount of
243	 * time left.
244	 */
245	if (args->timeout) {
246		if ((error = copyin(args->timeout, &ltv, sizeof(ltv))))
247			goto select_out;
248		utv.tv_sec = ltv.tv_sec;
249		utv.tv_usec = ltv.tv_usec;
250
251		if (itimerfix(&utv)) {
252			/*
253			 * The timeval was invalid.  Convert it to something
254			 * valid that will act as it does under Linux.
255			 */
256			utv.tv_sec += utv.tv_usec / 1000000;
257			utv.tv_usec %= 1000000;
258			if (utv.tv_usec < 0) {
259				utv.tv_sec -= 1;
260				utv.tv_usec += 1000000;
261			}
262			if (utv.tv_sec < 0)
263				timevalclear(&utv);
264		}
265		microtime(&tv0);
266		tvp = &utv;
267	} else
268		tvp = NULL;
269
270	error = kern_select(td, args->nfds, args->readfds, args->writefds,
271	    args->exceptfds, tvp, LINUX_NFDBITS);
272	if (error)
273		goto select_out;
274
275	if (args->timeout) {
276		if (td->td_retval[0]) {
277			/*
278			 * Compute how much time was left of the timeout,
279			 * by subtracting the current time and the time
280			 * before we started the call, and subtracting
281			 * that result from the user-supplied value.
282			 */
283			microtime(&tv1);
284			timevalsub(&tv1, &tv0);
285			timevalsub(&utv, &tv1);
286			if (utv.tv_sec < 0)
287				timevalclear(&utv);
288		} else
289			timevalclear(&utv);
290		ltv.tv_sec = utv.tv_sec;
291		ltv.tv_usec = utv.tv_usec;
292		if ((error = copyout(&ltv, args->timeout, sizeof(ltv))))
293			goto select_out;
294	}
295
296select_out:
297	return (error);
298}
299#endif
300
301int
302linux_mremap(struct thread *td, struct linux_mremap_args *args)
303{
304	uintptr_t addr;
305	size_t len;
306	int error = 0;
307
308	if (args->flags & ~(LINUX_MREMAP_FIXED | LINUX_MREMAP_MAYMOVE)) {
309		td->td_retval[0] = 0;
310		return (EINVAL);
311	}
312
313	/*
314	 * Check for the page alignment.
315	 * Linux defines PAGE_MASK to be FreeBSD ~PAGE_MASK.
316	 */
317	if (args->addr & PAGE_MASK) {
318		td->td_retval[0] = 0;
319		return (EINVAL);
320	}
321
322	args->new_len = round_page(args->new_len);
323	args->old_len = round_page(args->old_len);
324
325	if (args->new_len > args->old_len) {
326		td->td_retval[0] = 0;
327		return (ENOMEM);
328	}
329
330	if (args->new_len < args->old_len) {
331		addr = args->addr + args->new_len;
332		len = args->old_len - args->new_len;
333		error = kern_munmap(td, addr, len);
334	}
335
336	td->td_retval[0] = error ? 0 : (uintptr_t)args->addr;
337	return (error);
338}
339
340#define LINUX_MS_ASYNC       0x0001
341#define LINUX_MS_INVALIDATE  0x0002
342#define LINUX_MS_SYNC        0x0004
343
344int
345linux_msync(struct thread *td, struct linux_msync_args *args)
346{
347
348	return (kern_msync(td, args->addr, args->len,
349	    args->fl & ~LINUX_MS_SYNC));
350}
351
352int
353linux_mprotect(struct thread *td, struct linux_mprotect_args *uap)
354{
355
356	return (linux_mprotect_common(td, PTROUT(uap->addr), uap->len,
357	    uap->prot));
358}
359
360int
361linux_madvise(struct thread *td, struct linux_madvise_args *uap)
362{
363
364	return (linux_madvise_common(td, PTROUT(uap->addr), uap->len,
365	    uap->behav));
366}
367
368int
369linux_mmap2(struct thread *td, struct linux_mmap2_args *uap)
370{
371#if defined(LINUX_ARCHWANT_MMAP2PGOFF)
372	/*
373	 * For architectures with sizeof (off_t) < sizeof (loff_t) mmap is
374	 * implemented with mmap2 syscall and the offset is represented in
375	 * multiples of page size.
376	 */
377	return (linux_mmap_common(td, PTROUT(uap->addr), uap->len, uap->prot,
378	    uap->flags, uap->fd, (uint64_t)(uint32_t)uap->pgoff * PAGE_SIZE));
379#else
380	return (linux_mmap_common(td, PTROUT(uap->addr), uap->len, uap->prot,
381	    uap->flags, uap->fd, uap->pgoff));
382#endif
383}
384
385#ifdef LINUX_LEGACY_SYSCALLS
386int
387linux_time(struct thread *td, struct linux_time_args *args)
388{
389	struct timeval tv;
390	l_time_t tm;
391	int error;
392
393	microtime(&tv);
394	tm = tv.tv_sec;
395	if (args->tm && (error = copyout(&tm, args->tm, sizeof(tm))))
396		return (error);
397	td->td_retval[0] = tm;
398	return (0);
399}
400#endif
401
402struct l_times_argv {
403	l_clock_t	tms_utime;
404	l_clock_t	tms_stime;
405	l_clock_t	tms_cutime;
406	l_clock_t	tms_cstime;
407};
408
/*
 * Glibc versions prior to 2.2.1 always use hard-coded CLK_TCK value.
 * Since 2.2.1 Glibc uses value exported from kernel via AT_CLKTCK
 * auxiliary vector entry.
 */
#define	CLK_TCK		100

/*
 * Convert a timeval-like object `r' (anything with tv_sec and tv_usec
 * members) into clock ticks.  CONVOTCK uses the fixed CLK_TCK rate,
 * CONVNTCK uses stclohz, and CONVTCK picks between them based on the
 * emulated kernel version of the thread `td' in the caller's scope.
 *
 * The argument is parenthesized so that any lvalue expression (for
 * example `*p' or `a[i]') can be passed, not just a plain identifier.
 */
#define	CONVOTCK(r)	((r).tv_sec * CLK_TCK +				\
			    (r).tv_usec / (1000000 / CLK_TCK))
#define	CONVNTCK(r)	((r).tv_sec * stclohz +				\
			    (r).tv_usec / (1000000 / stclohz))

#define	CONVTCK(r)	(linux_kernver(td) >= LINUX_KERNVER(2,4,0) ?	\
			    CONVNTCK(r) : CONVOTCK(r))
421
422int
423linux_times(struct thread *td, struct linux_times_args *args)
424{
425	struct timeval tv, utime, stime, cutime, cstime;
426	struct l_times_argv tms;
427	struct proc *p;
428	int error;
429
430	if (args->buf != NULL) {
431		p = td->td_proc;
432		PROC_LOCK(p);
433		PROC_STATLOCK(p);
434		calcru(p, &utime, &stime);
435		PROC_STATUNLOCK(p);
436		calccru(p, &cutime, &cstime);
437		PROC_UNLOCK(p);
438
439		tms.tms_utime = CONVTCK(utime);
440		tms.tms_stime = CONVTCK(stime);
441
442		tms.tms_cutime = CONVTCK(cutime);
443		tms.tms_cstime = CONVTCK(cstime);
444
445		if ((error = copyout(&tms, args->buf, sizeof(tms))))
446			return (error);
447	}
448
449	microuptime(&tv);
450	td->td_retval[0] = (int)CONVTCK(tv);
451	return (0);
452}
453
454int
455linux_newuname(struct thread *td, struct linux_newuname_args *args)
456{
457	struct l_new_utsname utsname;
458	char osname[LINUX_MAX_UTSNAME];
459	char osrelease[LINUX_MAX_UTSNAME];
460	char *p;
461
462	linux_get_osname(td, osname);
463	linux_get_osrelease(td, osrelease);
464
465	bzero(&utsname, sizeof(utsname));
466	strlcpy(utsname.sysname, osname, LINUX_MAX_UTSNAME);
467	getcredhostname(td->td_ucred, utsname.nodename, LINUX_MAX_UTSNAME);
468	getcreddomainname(td->td_ucred, utsname.domainname, LINUX_MAX_UTSNAME);
469	strlcpy(utsname.release, osrelease, LINUX_MAX_UTSNAME);
470	strlcpy(utsname.version, version, LINUX_MAX_UTSNAME);
471	for (p = utsname.version; *p != '\0'; ++p)
472		if (*p == '\n') {
473			*p = '\0';
474			break;
475		}
476#if defined(__amd64__)
477	/*
478	 * On amd64, Linux uname(2) needs to return "x86_64"
479	 * for both 64-bit and 32-bit applications.  On 32-bit,
480	 * the string returned by getauxval(AT_PLATFORM) needs
481	 * to remain "i686", though.
482	 */
483#if defined(COMPAT_LINUX32)
484	if (linux32_emulate_i386)
485		strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME);
486	else
487#endif
488	strlcpy(utsname.machine, "x86_64", LINUX_MAX_UTSNAME);
489#elif defined(__aarch64__)
490	strlcpy(utsname.machine, "aarch64", LINUX_MAX_UTSNAME);
491#elif defined(__i386__)
492	strlcpy(utsname.machine, "i686", LINUX_MAX_UTSNAME);
493#endif
494
495	return (copyout(&utsname, args->buf, sizeof(utsname)));
496}
497
498struct l_utimbuf {
499	l_time_t l_actime;
500	l_time_t l_modtime;
501};
502
503#ifdef LINUX_LEGACY_SYSCALLS
504int
505linux_utime(struct thread *td, struct linux_utime_args *args)
506{
507	struct timeval tv[2], *tvp;
508	struct l_utimbuf lut;
509	int error;
510
511	if (args->times) {
512		if ((error = copyin(args->times, &lut, sizeof lut)) != 0)
513			return (error);
514		tv[0].tv_sec = lut.l_actime;
515		tv[0].tv_usec = 0;
516		tv[1].tv_sec = lut.l_modtime;
517		tv[1].tv_usec = 0;
518		tvp = tv;
519	} else
520		tvp = NULL;
521
522	return (kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE,
523	    tvp, UIO_SYSSPACE));
524}
525#endif
526
527#ifdef LINUX_LEGACY_SYSCALLS
528int
529linux_utimes(struct thread *td, struct linux_utimes_args *args)
530{
531	l_timeval ltv[2];
532	struct timeval tv[2], *tvp = NULL;
533	int error;
534
535	if (args->tptr != NULL) {
536		if ((error = copyin(args->tptr, ltv, sizeof ltv)) != 0)
537			return (error);
538		tv[0].tv_sec = ltv[0].tv_sec;
539		tv[0].tv_usec = ltv[0].tv_usec;
540		tv[1].tv_sec = ltv[1].tv_sec;
541		tv[1].tv_usec = ltv[1].tv_usec;
542		tvp = tv;
543	}
544
545	return (kern_utimesat(td, AT_FDCWD, args->fname, UIO_USERSPACE,
546	    tvp, UIO_SYSSPACE));
547}
548#endif
549
550static int
551linux_utimensat_lts_to_ts(struct l_timespec *l_times, struct timespec *times)
552{
553
554	if (l_times->tv_nsec != LINUX_UTIME_OMIT &&
555	    l_times->tv_nsec != LINUX_UTIME_NOW &&
556	    (l_times->tv_nsec < 0 || l_times->tv_nsec > 999999999))
557		return (EINVAL);
558
559	times->tv_sec = l_times->tv_sec;
560	switch (l_times->tv_nsec)
561	{
562	case LINUX_UTIME_OMIT:
563		times->tv_nsec = UTIME_OMIT;
564		break;
565	case LINUX_UTIME_NOW:
566		times->tv_nsec = UTIME_NOW;
567		break;
568	default:
569		times->tv_nsec = l_times->tv_nsec;
570	}
571
572	return (0);
573}
574
575static int
576linux_common_utimensat(struct thread *td, int ldfd, const char *pathname,
577    struct timespec *timesp, int lflags)
578{
579	int dfd, flags = 0;
580
581	dfd = (ldfd == LINUX_AT_FDCWD) ? AT_FDCWD : ldfd;
582
583	if (lflags & ~(LINUX_AT_SYMLINK_NOFOLLOW | LINUX_AT_EMPTY_PATH))
584		return (EINVAL);
585
586	if (timesp != NULL) {
587		/* This breaks POSIX, but is what the Linux kernel does
588		 * _on purpose_ (documented in the man page for utimensat(2)),
589		 * so we must follow that behaviour. */
590		if (timesp[0].tv_nsec == UTIME_OMIT &&
591		    timesp[1].tv_nsec == UTIME_OMIT)
592			return (0);
593	}
594
595	if (lflags & LINUX_AT_SYMLINK_NOFOLLOW)
596		flags |= AT_SYMLINK_NOFOLLOW;
597	if (lflags & LINUX_AT_EMPTY_PATH)
598		flags |= AT_EMPTY_PATH;
599
600	if (pathname != NULL)
601		return (kern_utimensat(td, dfd, pathname,
602		    UIO_USERSPACE, timesp, UIO_SYSSPACE, flags));
603
604	if (lflags != 0)
605		return (EINVAL);
606
607	return (kern_futimens(td, dfd, timesp, UIO_SYSSPACE));
608}
609
610int
611linux_utimensat(struct thread *td, struct linux_utimensat_args *args)
612{
613	struct l_timespec l_times[2];
614	struct timespec times[2], *timesp;
615	int error;
616
617	if (args->times != NULL) {
618		error = copyin(args->times, l_times, sizeof(l_times));
619		if (error != 0)
620			return (error);
621
622		error = linux_utimensat_lts_to_ts(&l_times[0], &times[0]);
623		if (error != 0)
624			return (error);
625		error = linux_utimensat_lts_to_ts(&l_times[1], &times[1]);
626		if (error != 0)
627			return (error);
628		timesp = times;
629	} else
630		timesp = NULL;
631
632	return (linux_common_utimensat(td, args->dfd, args->pathname,
633	    timesp, args->flags));
634}
635
636#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
637static int
638linux_utimensat_lts64_to_ts(struct l_timespec64 *l_times, struct timespec *times)
639{
640
641	/* Zero out the padding in compat mode. */
642	l_times->tv_nsec &= 0xFFFFFFFFUL;
643
644	if (l_times->tv_nsec != LINUX_UTIME_OMIT &&
645	    l_times->tv_nsec != LINUX_UTIME_NOW &&
646	    (l_times->tv_nsec < 0 || l_times->tv_nsec > 999999999))
647		return (EINVAL);
648
649	times->tv_sec = l_times->tv_sec;
650	switch (l_times->tv_nsec)
651	{
652	case LINUX_UTIME_OMIT:
653		times->tv_nsec = UTIME_OMIT;
654		break;
655	case LINUX_UTIME_NOW:
656		times->tv_nsec = UTIME_NOW;
657		break;
658	default:
659		times->tv_nsec = l_times->tv_nsec;
660	}
661
662	return (0);
663}
664
665int
666linux_utimensat_time64(struct thread *td, struct linux_utimensat_time64_args *args)
667{
668	struct l_timespec64 l_times[2];
669	struct timespec times[2], *timesp;
670	int error;
671
672	if (args->times64 != NULL) {
673		error = copyin(args->times64, l_times, sizeof(l_times));
674		if (error != 0)
675			return (error);
676
677		error = linux_utimensat_lts64_to_ts(&l_times[0], &times[0]);
678		if (error != 0)
679			return (error);
680		error = linux_utimensat_lts64_to_ts(&l_times[1], &times[1]);
681		if (error != 0)
682			return (error);
683		timesp = times;
684	} else
685		timesp = NULL;
686
687	return (linux_common_utimensat(td, args->dfd, args->pathname,
688	    timesp, args->flags));
689}
690#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
691
692#ifdef LINUX_LEGACY_SYSCALLS
693int
694linux_futimesat(struct thread *td, struct linux_futimesat_args *args)
695{
696	l_timeval ltv[2];
697	struct timeval tv[2], *tvp = NULL;
698	int error, dfd;
699
700	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
701
702	if (args->utimes != NULL) {
703		if ((error = copyin(args->utimes, ltv, sizeof ltv)) != 0)
704			return (error);
705		tv[0].tv_sec = ltv[0].tv_sec;
706		tv[0].tv_usec = ltv[0].tv_usec;
707		tv[1].tv_sec = ltv[1].tv_sec;
708		tv[1].tv_usec = ltv[1].tv_usec;
709		tvp = tv;
710	}
711
712	return (kern_utimesat(td, dfd, args->filename, UIO_USERSPACE,
713	    tvp, UIO_SYSSPACE));
714}
715#endif
716
717static int
718linux_common_wait(struct thread *td, idtype_t idtype, int id, int *statusp,
719    int options, void *rup, l_siginfo_t *infop)
720{
721	l_siginfo_t lsi;
722	siginfo_t siginfo;
723	struct __wrusage wru;
724	int error, status, tmpstat, sig;
725
726	error = kern_wait6(td, idtype, id, &status, options,
727	    rup != NULL ? &wru : NULL, &siginfo);
728
729	if (error == 0 && statusp) {
730		tmpstat = status & 0xffff;
731		if (WIFSIGNALED(tmpstat)) {
732			tmpstat = (tmpstat & 0xffffff80) |
733			    bsd_to_linux_signal(WTERMSIG(tmpstat));
734		} else if (WIFSTOPPED(tmpstat)) {
735			tmpstat = (tmpstat & 0xffff00ff) |
736			    (bsd_to_linux_signal(WSTOPSIG(tmpstat)) << 8);
737#if defined(__aarch64__) || (defined(__amd64__) && !defined(COMPAT_LINUX32))
738			if (WSTOPSIG(status) == SIGTRAP) {
739				tmpstat = linux_ptrace_status(td,
740				    siginfo.si_pid, tmpstat);
741			}
742#endif
743		} else if (WIFCONTINUED(tmpstat)) {
744			tmpstat = 0xffff;
745		}
746		error = copyout(&tmpstat, statusp, sizeof(int));
747	}
748	if (error == 0 && rup != NULL)
749		error = linux_copyout_rusage(&wru.wru_self, rup);
750	if (error == 0 && infop != NULL && td->td_retval[0] != 0) {
751		sig = bsd_to_linux_signal(siginfo.si_signo);
752		siginfo_to_lsiginfo(&siginfo, &lsi, sig);
753		error = copyout(&lsi, infop, sizeof(lsi));
754	}
755
756	return (error);
757}
758
759#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
760int
761linux_waitpid(struct thread *td, struct linux_waitpid_args *args)
762{
763	struct linux_wait4_args wait4_args = {
764		.pid = args->pid,
765		.status = args->status,
766		.options = args->options,
767		.rusage = NULL,
768	};
769
770	return (linux_wait4(td, &wait4_args));
771}
772#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
773
774int
775linux_wait4(struct thread *td, struct linux_wait4_args *args)
776{
777	struct proc *p;
778	int options, id, idtype;
779
780	if (args->options & ~(LINUX_WUNTRACED | LINUX_WNOHANG |
781	    LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
782		return (EINVAL);
783
784	/* -INT_MIN is not defined. */
785	if (args->pid == INT_MIN)
786		return (ESRCH);
787
788	options = 0;
789	linux_to_bsd_waitopts(args->options, &options);
790
791	/*
792	 * For backward compatibility we implicitly add flags WEXITED
793	 * and WTRAPPED here.
794	 */
795	options |= WEXITED | WTRAPPED;
796
797	if (args->pid == WAIT_ANY) {
798		idtype = P_ALL;
799		id = 0;
800	} else if (args->pid < 0) {
801		idtype = P_PGID;
802		id = (id_t)-args->pid;
803	} else if (args->pid == 0) {
804		idtype = P_PGID;
805		p = td->td_proc;
806		PROC_LOCK(p);
807		id = p->p_pgid;
808		PROC_UNLOCK(p);
809	} else {
810		idtype = P_PID;
811		id = (id_t)args->pid;
812	}
813
814	return (linux_common_wait(td, idtype, id, args->status, options,
815	    args->rusage, NULL));
816}
817
818int
819linux_waitid(struct thread *td, struct linux_waitid_args *args)
820{
821	idtype_t idtype;
822	int error, options;
823	struct proc *p;
824	pid_t id;
825
826	if (args->options & ~(LINUX_WNOHANG | LINUX_WNOWAIT | LINUX_WEXITED |
827	    LINUX_WSTOPPED | LINUX_WCONTINUED | __WCLONE | __WNOTHREAD | __WALL))
828		return (EINVAL);
829
830	options = 0;
831	linux_to_bsd_waitopts(args->options, &options);
832
833	id = args->id;
834	switch (args->idtype) {
835	case LINUX_P_ALL:
836		idtype = P_ALL;
837		break;
838	case LINUX_P_PID:
839		if (args->id <= 0)
840			return (EINVAL);
841		idtype = P_PID;
842		break;
843	case LINUX_P_PGID:
844		if (linux_kernver(td) >= LINUX_KERNVER(5,4,0) && args->id == 0) {
845			p = td->td_proc;
846			PROC_LOCK(p);
847			id = p->p_pgid;
848			PROC_UNLOCK(p);
849		} else if (args->id <= 0)
850			return (EINVAL);
851		idtype = P_PGID;
852		break;
853	case LINUX_P_PIDFD:
854		LINUX_RATELIMIT_MSG("unsupported waitid P_PIDFD idtype");
855		return (ENOSYS);
856	default:
857		return (EINVAL);
858	}
859
860	error = linux_common_wait(td, idtype, id, NULL, options,
861	    args->rusage, args->info);
862	td->td_retval[0] = 0;
863
864	return (error);
865}
866
867#ifdef LINUX_LEGACY_SYSCALLS
868int
869linux_mknod(struct thread *td, struct linux_mknod_args *args)
870{
871	int error;
872
873	switch (args->mode & S_IFMT) {
874	case S_IFIFO:
875	case S_IFSOCK:
876		error = kern_mkfifoat(td, AT_FDCWD, args->path, UIO_USERSPACE,
877		    args->mode);
878		break;
879
880	case S_IFCHR:
881	case S_IFBLK:
882		error = kern_mknodat(td, AT_FDCWD, args->path, UIO_USERSPACE,
883		    args->mode, linux_decode_dev(args->dev));
884		break;
885
886	case S_IFDIR:
887		error = EPERM;
888		break;
889
890	case 0:
891		args->mode |= S_IFREG;
892		/* FALLTHROUGH */
893	case S_IFREG:
894		error = kern_openat(td, AT_FDCWD, args->path, UIO_USERSPACE,
895		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
896		if (error == 0)
897			kern_close(td, td->td_retval[0]);
898		break;
899
900	default:
901		error = EINVAL;
902		break;
903	}
904	return (error);
905}
906#endif
907
908int
909linux_mknodat(struct thread *td, struct linux_mknodat_args *args)
910{
911	int error, dfd;
912
913	dfd = (args->dfd == LINUX_AT_FDCWD) ? AT_FDCWD : args->dfd;
914
915	switch (args->mode & S_IFMT) {
916	case S_IFIFO:
917	case S_IFSOCK:
918		error = kern_mkfifoat(td, dfd, args->filename, UIO_USERSPACE,
919		    args->mode);
920		break;
921
922	case S_IFCHR:
923	case S_IFBLK:
924		error = kern_mknodat(td, dfd, args->filename, UIO_USERSPACE,
925		    args->mode, linux_decode_dev(args->dev));
926		break;
927
928	case S_IFDIR:
929		error = EPERM;
930		break;
931
932	case 0:
933		args->mode |= S_IFREG;
934		/* FALLTHROUGH */
935	case S_IFREG:
936		error = kern_openat(td, dfd, args->filename, UIO_USERSPACE,
937		    O_WRONLY | O_CREAT | O_TRUNC, args->mode);
938		if (error == 0)
939			kern_close(td, td->td_retval[0]);
940		break;
941
942	default:
943		error = EINVAL;
944		break;
945	}
946	return (error);
947}
948
949/*
950 * UGH! This is just about the dumbest idea I've ever heard!!
951 */
952int
953linux_personality(struct thread *td, struct linux_personality_args *args)
954{
955	struct linux_pemuldata *pem;
956	struct proc *p = td->td_proc;
957	uint32_t old;
958
959	PROC_LOCK(p);
960	pem = pem_find(p);
961	old = pem->persona;
962	if (args->per != 0xffffffff)
963		pem->persona = args->per;
964	PROC_UNLOCK(p);
965
966	td->td_retval[0] = old;
967	return (0);
968}
969
970struct l_itimerval {
971	l_timeval it_interval;
972	l_timeval it_value;
973};
974
/*
 * Copy an interval timer value member by member from *lip to *bip.  Both
 * arguments are pointers to objects with it_interval and it_value members
 * that in turn have tv_sec and tv_usec members, so the macro works in
 * either direction between the native and the Linux layouts.
 *
 * The body is wrapped in do { } while (0) so that the macro expands to a
 * single statement and is safe in unbraced if/else bodies.
 */
#define	B2L_ITIMERVAL(bip, lip) do {					\
	(bip)->it_interval.tv_sec = (lip)->it_interval.tv_sec;		\
	(bip)->it_interval.tv_usec = (lip)->it_interval.tv_usec;	\
	(bip)->it_value.tv_sec = (lip)->it_value.tv_sec;		\
	(bip)->it_value.tv_usec = (lip)->it_value.tv_usec;		\
} while (0)
980
981int
982linux_setitimer(struct thread *td, struct linux_setitimer_args *uap)
983{
984	int error;
985	struct l_itimerval ls;
986	struct itimerval aitv, oitv;
987
988	if (uap->itv == NULL) {
989		uap->itv = uap->oitv;
990		return (linux_getitimer(td, (struct linux_getitimer_args *)uap));
991	}
992
993	error = copyin(uap->itv, &ls, sizeof(ls));
994	if (error != 0)
995		return (error);
996	B2L_ITIMERVAL(&aitv, &ls);
997	error = kern_setitimer(td, uap->which, &aitv, &oitv);
998	if (error != 0 || uap->oitv == NULL)
999		return (error);
1000	B2L_ITIMERVAL(&ls, &oitv);
1001
1002	return (copyout(&ls, uap->oitv, sizeof(ls)));
1003}
1004
1005int
1006linux_getitimer(struct thread *td, struct linux_getitimer_args *uap)
1007{
1008	int error;
1009	struct l_itimerval ls;
1010	struct itimerval aitv;
1011
1012	error = kern_getitimer(td, uap->which, &aitv);
1013	if (error != 0)
1014		return (error);
1015	B2L_ITIMERVAL(&ls, &aitv);
1016	return (copyout(&ls, uap->itv, sizeof(ls)));
1017}
1018
1019#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1020int
1021linux_nice(struct thread *td, struct linux_nice_args *args)
1022{
1023
1024	return (kern_setpriority(td, PRIO_PROCESS, 0, args->inc));
1025}
1026#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1027
1028int
1029linux_setgroups(struct thread *td, struct linux_setgroups_args *args)
1030{
1031	struct ucred *newcred, *oldcred;
1032	l_gid_t *linux_gidset;
1033	gid_t *bsd_gidset;
1034	int ngrp, error;
1035	struct proc *p;
1036
1037	ngrp = args->gidsetsize;
1038	if (ngrp < 0 || ngrp >= ngroups_max + 1)
1039		return (EINVAL);
1040	linux_gidset = malloc(ngrp * sizeof(*linux_gidset), M_LINUX, M_WAITOK);
1041	error = copyin(args->grouplist, linux_gidset, ngrp * sizeof(l_gid_t));
1042	if (error)
1043		goto out;
1044	newcred = crget();
1045	crextend(newcred, ngrp + 1);
1046	p = td->td_proc;
1047	PROC_LOCK(p);
1048	oldcred = p->p_ucred;
1049	crcopy(newcred, oldcred);
1050
1051	/*
1052	 * cr_groups[0] holds egid. Setting the whole set from
1053	 * the supplied set will cause egid to be changed too.
1054	 * Keep cr_groups[0] unchanged to prevent that.
1055	 */
1056
1057	if ((error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS)) != 0) {
1058		PROC_UNLOCK(p);
1059		crfree(newcred);
1060		goto out;
1061	}
1062
1063	if (ngrp > 0) {
1064		newcred->cr_ngroups = ngrp + 1;
1065
1066		bsd_gidset = newcred->cr_groups;
1067		ngrp--;
1068		while (ngrp >= 0) {
1069			bsd_gidset[ngrp + 1] = linux_gidset[ngrp];
1070			ngrp--;
1071		}
1072	} else
1073		newcred->cr_ngroups = 1;
1074
1075	setsugid(p);
1076	proc_set_cred(p, newcred);
1077	PROC_UNLOCK(p);
1078	crfree(oldcred);
1079	error = 0;
1080out:
1081	free(linux_gidset, M_LINUX);
1082	return (error);
1083}
1084
1085int
1086linux_getgroups(struct thread *td, struct linux_getgroups_args *args)
1087{
1088	struct ucred *cred;
1089	l_gid_t *linux_gidset;
1090	gid_t *bsd_gidset;
1091	int bsd_gidsetsz, ngrp, error;
1092
1093	cred = td->td_ucred;
1094	bsd_gidset = cred->cr_groups;
1095	bsd_gidsetsz = cred->cr_ngroups - 1;
1096
1097	/*
1098	 * cr_groups[0] holds egid. Returning the whole set
1099	 * here will cause a duplicate. Exclude cr_groups[0]
1100	 * to prevent that.
1101	 */
1102
1103	if ((ngrp = args->gidsetsize) == 0) {
1104		td->td_retval[0] = bsd_gidsetsz;
1105		return (0);
1106	}
1107
1108	if (ngrp < bsd_gidsetsz)
1109		return (EINVAL);
1110
1111	ngrp = 0;
1112	linux_gidset = malloc(bsd_gidsetsz * sizeof(*linux_gidset),
1113	    M_LINUX, M_WAITOK);
1114	while (ngrp < bsd_gidsetsz) {
1115		linux_gidset[ngrp] = bsd_gidset[ngrp + 1];
1116		ngrp++;
1117	}
1118
1119	error = copyout(linux_gidset, args->grouplist, ngrp * sizeof(l_gid_t));
1120	free(linux_gidset, M_LINUX);
1121	if (error)
1122		return (error);
1123
1124	td->td_retval[0] = ngrp;
1125	return (0);
1126}
1127
1128static bool
1129linux_get_dummy_limit(struct thread *td, l_uint resource, struct rlimit *rlim)
1130{
1131	ssize_t size;
1132	int res, error;
1133
1134	if (linux_dummy_rlimits == 0)
1135		return (false);
1136
1137	switch (resource) {
1138	case LINUX_RLIMIT_LOCKS:
1139	case LINUX_RLIMIT_RTTIME:
1140		rlim->rlim_cur = LINUX_RLIM_INFINITY;
1141		rlim->rlim_max = LINUX_RLIM_INFINITY;
1142		return (true);
1143	case LINUX_RLIMIT_NICE:
1144	case LINUX_RLIMIT_RTPRIO:
1145		rlim->rlim_cur = 0;
1146		rlim->rlim_max = 0;
1147		return (true);
1148	case LINUX_RLIMIT_SIGPENDING:
1149		error = kernel_sysctlbyname(td,
1150		    "kern.sigqueue.max_pending_per_proc",
1151		    &res, &size, 0, 0, 0, 0);
1152		if (error != 0)
1153			return (false);
1154		rlim->rlim_cur = res;
1155		rlim->rlim_max = res;
1156		return (true);
1157	case LINUX_RLIMIT_MSGQUEUE:
1158		error = kernel_sysctlbyname(td,
1159		    "kern.ipc.msgmnb", &res, &size, 0, 0, 0, 0);
1160		if (error != 0)
1161			return (false);
1162		rlim->rlim_cur = res;
1163		rlim->rlim_max = res;
1164		return (true);
1165	default:
1166		return (false);
1167	}
1168}
1169
1170int
1171linux_setrlimit(struct thread *td, struct linux_setrlimit_args *args)
1172{
1173	struct rlimit bsd_rlim;
1174	struct l_rlimit rlim;
1175	u_int which;
1176	int error;
1177
1178	if (args->resource >= LINUX_RLIM_NLIMITS)
1179		return (EINVAL);
1180
1181	which = linux_to_bsd_resource[args->resource];
1182	if (which == -1)
1183		return (EINVAL);
1184
1185	error = copyin(args->rlim, &rlim, sizeof(rlim));
1186	if (error)
1187		return (error);
1188
1189	bsd_rlim.rlim_cur = (rlim_t)rlim.rlim_cur;
1190	bsd_rlim.rlim_max = (rlim_t)rlim.rlim_max;
1191	return (kern_setrlimit(td, which, &bsd_rlim));
1192}
1193
1194#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
1195int
1196linux_old_getrlimit(struct thread *td, struct linux_old_getrlimit_args *args)
1197{
1198	struct l_rlimit rlim;
1199	struct rlimit bsd_rlim;
1200	u_int which;
1201
1202	if (linux_get_dummy_limit(td, args->resource, &bsd_rlim)) {
1203		rlim.rlim_cur = bsd_rlim.rlim_cur;
1204		rlim.rlim_max = bsd_rlim.rlim_max;
1205		return (copyout(&rlim, args->rlim, sizeof(rlim)));
1206	}
1207
1208	if (args->resource >= LINUX_RLIM_NLIMITS)
1209		return (EINVAL);
1210
1211	which = linux_to_bsd_resource[args->resource];
1212	if (which == -1)
1213		return (EINVAL);
1214
1215	lim_rlimit(td, which, &bsd_rlim);
1216
1217#ifdef COMPAT_LINUX32
1218	rlim.rlim_cur = (unsigned int)bsd_rlim.rlim_cur;
1219	if (rlim.rlim_cur == UINT_MAX)
1220		rlim.rlim_cur = INT_MAX;
1221	rlim.rlim_max = (unsigned int)bsd_rlim.rlim_max;
1222	if (rlim.rlim_max == UINT_MAX)
1223		rlim.rlim_max = INT_MAX;
1224#else
1225	rlim.rlim_cur = (unsigned long)bsd_rlim.rlim_cur;
1226	if (rlim.rlim_cur == ULONG_MAX)
1227		rlim.rlim_cur = LONG_MAX;
1228	rlim.rlim_max = (unsigned long)bsd_rlim.rlim_max;
1229	if (rlim.rlim_max == ULONG_MAX)
1230		rlim.rlim_max = LONG_MAX;
1231#endif
1232	return (copyout(&rlim, args->rlim, sizeof(rlim)));
1233}
1234#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
1235
1236int
1237linux_getrlimit(struct thread *td, struct linux_getrlimit_args *args)
1238{
1239	struct l_rlimit rlim;
1240	struct rlimit bsd_rlim;
1241	u_int which;
1242
1243	if (linux_get_dummy_limit(td, args->resource, &bsd_rlim)) {
1244		rlim.rlim_cur = bsd_rlim.rlim_cur;
1245		rlim.rlim_max = bsd_rlim.rlim_max;
1246		return (copyout(&rlim, args->rlim, sizeof(rlim)));
1247	}
1248
1249	if (args->resource >= LINUX_RLIM_NLIMITS)
1250		return (EINVAL);
1251
1252	which = linux_to_bsd_resource[args->resource];
1253	if (which == -1)
1254		return (EINVAL);
1255
1256	lim_rlimit(td, which, &bsd_rlim);
1257
1258	rlim.rlim_cur = (l_ulong)bsd_rlim.rlim_cur;
1259	rlim.rlim_max = (l_ulong)bsd_rlim.rlim_max;
1260	return (copyout(&rlim, args->rlim, sizeof(rlim)));
1261}
1262
1263int
1264linux_sched_setscheduler(struct thread *td,
1265    struct linux_sched_setscheduler_args *args)
1266{
1267	struct sched_param sched_param;
1268	struct thread *tdt;
1269	int error, policy;
1270
1271	switch (args->policy) {
1272	case LINUX_SCHED_OTHER:
1273		policy = SCHED_OTHER;
1274		break;
1275	case LINUX_SCHED_FIFO:
1276		policy = SCHED_FIFO;
1277		break;
1278	case LINUX_SCHED_RR:
1279		policy = SCHED_RR;
1280		break;
1281	default:
1282		return (EINVAL);
1283	}
1284
1285	error = copyin(args->param, &sched_param, sizeof(sched_param));
1286	if (error)
1287		return (error);
1288
1289	if (linux_map_sched_prio) {
1290		switch (policy) {
1291		case SCHED_OTHER:
1292			if (sched_param.sched_priority != 0)
1293				return (EINVAL);
1294
1295			sched_param.sched_priority =
1296			    PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
1297			break;
1298		case SCHED_FIFO:
1299		case SCHED_RR:
1300			if (sched_param.sched_priority < 1 ||
1301			    sched_param.sched_priority >= LINUX_MAX_RT_PRIO)
1302				return (EINVAL);
1303
1304			/*
1305			 * Map [1, LINUX_MAX_RT_PRIO - 1] to
1306			 * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
1307			 */
1308			sched_param.sched_priority =
1309			    (sched_param.sched_priority - 1) *
1310			    (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
1311			    (LINUX_MAX_RT_PRIO - 1);
1312			break;
1313		}
1314	}
1315
1316	tdt = linux_tdfind(td, args->pid, -1);
1317	if (tdt == NULL)
1318		return (ESRCH);
1319
1320	error = kern_sched_setscheduler(td, tdt, policy, &sched_param);
1321	PROC_UNLOCK(tdt->td_proc);
1322	return (error);
1323}
1324
1325int
1326linux_sched_getscheduler(struct thread *td,
1327    struct linux_sched_getscheduler_args *args)
1328{
1329	struct thread *tdt;
1330	int error, policy;
1331
1332	tdt = linux_tdfind(td, args->pid, -1);
1333	if (tdt == NULL)
1334		return (ESRCH);
1335
1336	error = kern_sched_getscheduler(td, tdt, &policy);
1337	PROC_UNLOCK(tdt->td_proc);
1338
1339	switch (policy) {
1340	case SCHED_OTHER:
1341		td->td_retval[0] = LINUX_SCHED_OTHER;
1342		break;
1343	case SCHED_FIFO:
1344		td->td_retval[0] = LINUX_SCHED_FIFO;
1345		break;
1346	case SCHED_RR:
1347		td->td_retval[0] = LINUX_SCHED_RR;
1348		break;
1349	}
1350	return (error);
1351}
1352
1353int
1354linux_sched_get_priority_max(struct thread *td,
1355    struct linux_sched_get_priority_max_args *args)
1356{
1357	struct sched_get_priority_max_args bsd;
1358
1359	if (linux_map_sched_prio) {
1360		switch (args->policy) {
1361		case LINUX_SCHED_OTHER:
1362			td->td_retval[0] = 0;
1363			return (0);
1364		case LINUX_SCHED_FIFO:
1365		case LINUX_SCHED_RR:
1366			td->td_retval[0] = LINUX_MAX_RT_PRIO - 1;
1367			return (0);
1368		default:
1369			return (EINVAL);
1370		}
1371	}
1372
1373	switch (args->policy) {
1374	case LINUX_SCHED_OTHER:
1375		bsd.policy = SCHED_OTHER;
1376		break;
1377	case LINUX_SCHED_FIFO:
1378		bsd.policy = SCHED_FIFO;
1379		break;
1380	case LINUX_SCHED_RR:
1381		bsd.policy = SCHED_RR;
1382		break;
1383	default:
1384		return (EINVAL);
1385	}
1386	return (sys_sched_get_priority_max(td, &bsd));
1387}
1388
1389int
1390linux_sched_get_priority_min(struct thread *td,
1391    struct linux_sched_get_priority_min_args *args)
1392{
1393	struct sched_get_priority_min_args bsd;
1394
1395	if (linux_map_sched_prio) {
1396		switch (args->policy) {
1397		case LINUX_SCHED_OTHER:
1398			td->td_retval[0] = 0;
1399			return (0);
1400		case LINUX_SCHED_FIFO:
1401		case LINUX_SCHED_RR:
1402			td->td_retval[0] = 1;
1403			return (0);
1404		default:
1405			return (EINVAL);
1406		}
1407	}
1408
1409	switch (args->policy) {
1410	case LINUX_SCHED_OTHER:
1411		bsd.policy = SCHED_OTHER;
1412		break;
1413	case LINUX_SCHED_FIFO:
1414		bsd.policy = SCHED_FIFO;
1415		break;
1416	case LINUX_SCHED_RR:
1417		bsd.policy = SCHED_RR;
1418		break;
1419	default:
1420		return (EINVAL);
1421	}
1422	return (sys_sched_get_priority_min(td, &bsd));
1423}
1424
/*
 * Constants checked by linux_reboot(): the values accepted for the 'cmd'
 * argument, followed by the magic numbers that 'magic1' and 'magic2'
 * must carry (magic2 may be any of the three MAGIC2* values).
 */
#define REBOOT_CAD_ON	0x89abcdef
#define REBOOT_CAD_OFF	0
#define REBOOT_HALT	0xcdef0123
#define REBOOT_RESTART	0x01234567
#define REBOOT_RESTART2	0xA1B2C3D4
#define REBOOT_POWEROFF	0x4321FEDC
#define REBOOT_MAGIC1	0xfee1dead
#define REBOOT_MAGIC2	0x28121969
#define REBOOT_MAGIC2A	0x05121996
#define REBOOT_MAGIC2B	0x16041998
1435
1436int
1437linux_reboot(struct thread *td, struct linux_reboot_args *args)
1438{
1439	struct reboot_args bsd_args;
1440
1441	if (args->magic1 != REBOOT_MAGIC1)
1442		return (EINVAL);
1443
1444	switch (args->magic2) {
1445	case REBOOT_MAGIC2:
1446	case REBOOT_MAGIC2A:
1447	case REBOOT_MAGIC2B:
1448		break;
1449	default:
1450		return (EINVAL);
1451	}
1452
1453	switch (args->cmd) {
1454	case REBOOT_CAD_ON:
1455	case REBOOT_CAD_OFF:
1456		return (priv_check(td, PRIV_REBOOT));
1457	case REBOOT_HALT:
1458		bsd_args.opt = RB_HALT;
1459		break;
1460	case REBOOT_RESTART:
1461	case REBOOT_RESTART2:
1462		bsd_args.opt = 0;
1463		break;
1464	case REBOOT_POWEROFF:
1465		bsd_args.opt = RB_POWEROFF;
1466		break;
1467	default:
1468		return (EINVAL);
1469	}
1470	return (sys_reboot(td, &bsd_args));
1471}
1472
1473int
1474linux_getpid(struct thread *td, struct linux_getpid_args *args)
1475{
1476
1477	td->td_retval[0] = td->td_proc->p_pid;
1478
1479	return (0);
1480}
1481
1482int
1483linux_gettid(struct thread *td, struct linux_gettid_args *args)
1484{
1485	struct linux_emuldata *em;
1486
1487	em = em_find(td);
1488	KASSERT(em != NULL, ("gettid: emuldata not found.\n"));
1489
1490	td->td_retval[0] = em->em_tid;
1491
1492	return (0);
1493}
1494
1495int
1496linux_getppid(struct thread *td, struct linux_getppid_args *args)
1497{
1498
1499	td->td_retval[0] = kern_getppid(td);
1500	return (0);
1501}
1502
1503int
1504linux_getgid(struct thread *td, struct linux_getgid_args *args)
1505{
1506
1507	td->td_retval[0] = td->td_ucred->cr_rgid;
1508	return (0);
1509}
1510
1511int
1512linux_getuid(struct thread *td, struct linux_getuid_args *args)
1513{
1514
1515	td->td_retval[0] = td->td_ucred->cr_ruid;
1516	return (0);
1517}
1518
1519int
1520linux_getsid(struct thread *td, struct linux_getsid_args *args)
1521{
1522
1523	return (kern_getsid(td, args->pid));
1524}
1525
1526int
1527linux_getpriority(struct thread *td, struct linux_getpriority_args *args)
1528{
1529	int error;
1530
1531	error = kern_getpriority(td, args->which, args->who);
1532	td->td_retval[0] = 20 - td->td_retval[0];
1533	return (error);
1534}
1535
1536int
1537linux_sethostname(struct thread *td, struct linux_sethostname_args *args)
1538{
1539	int name[2];
1540
1541	name[0] = CTL_KERN;
1542	name[1] = KERN_HOSTNAME;
1543	return (userland_sysctl(td, name, 2, 0, 0, 0, args->hostname,
1544	    args->len, 0, 0));
1545}
1546
1547int
1548linux_setdomainname(struct thread *td, struct linux_setdomainname_args *args)
1549{
1550	int name[2];
1551
1552	name[0] = CTL_KERN;
1553	name[1] = KERN_NISDOMAINNAME;
1554	return (userland_sysctl(td, name, 2, 0, 0, 0, args->name,
1555	    args->len, 0, 0));
1556}
1557
1558int
1559linux_exit_group(struct thread *td, struct linux_exit_group_args *args)
1560{
1561
1562	LINUX_CTR2(exit_group, "thread(%d) (%d)", td->td_tid,
1563	    args->error_code);
1564
1565	/*
1566	 * XXX: we should send a signal to the parent if
1567	 * SIGNAL_EXIT_GROUP is set. We ignore that (temporarily?)
1568	 * as it doesnt occur often.
1569	 */
1570	exit1(td, args->error_code, 0);
1571		/* NOTREACHED */
1572}
1573
/*
 * Capability ABI versions recognized by linux_capget()/linux_capset().
 * Version 1 uses a single l_user_cap_data element; versions 2 and 3 use
 * two.
 */
#define _LINUX_CAPABILITY_VERSION_1  0x19980330
#define _LINUX_CAPABILITY_VERSION_2  0x20071026
#define _LINUX_CAPABILITY_VERSION_3  0x20080522

/*
 * Header copied in from userspace: 'version' selects one of the values
 * above, and a non-zero 'pid' is rejected with EPERM by both syscalls.
 */
struct l_user_cap_header {
	l_int	version;
	l_int	pid;
};

/* One element of the capability data array exchanged with userspace. */
struct l_user_cap_data {
	l_int	effective;
	l_int	permitted;
	l_int	inheritable;
};
1588
1589int
1590linux_capget(struct thread *td, struct linux_capget_args *uap)
1591{
1592	struct l_user_cap_header luch;
1593	struct l_user_cap_data lucd[2];
1594	int error, u32s;
1595
1596	if (uap->hdrp == NULL)
1597		return (EFAULT);
1598
1599	error = copyin(uap->hdrp, &luch, sizeof(luch));
1600	if (error != 0)
1601		return (error);
1602
1603	switch (luch.version) {
1604	case _LINUX_CAPABILITY_VERSION_1:
1605		u32s = 1;
1606		break;
1607	case _LINUX_CAPABILITY_VERSION_2:
1608	case _LINUX_CAPABILITY_VERSION_3:
1609		u32s = 2;
1610		break;
1611	default:
1612		luch.version = _LINUX_CAPABILITY_VERSION_1;
1613		error = copyout(&luch, uap->hdrp, sizeof(luch));
1614		if (error)
1615			return (error);
1616		return (EINVAL);
1617	}
1618
1619	if (luch.pid)
1620		return (EPERM);
1621
1622	if (uap->datap) {
1623		/*
1624		 * The current implementation doesn't support setting
1625		 * a capability (it's essentially a stub) so indicate
1626		 * that no capabilities are currently set or available
1627		 * to request.
1628		 */
1629		memset(&lucd, 0, u32s * sizeof(lucd[0]));
1630		error = copyout(&lucd, uap->datap, u32s * sizeof(lucd[0]));
1631	}
1632
1633	return (error);
1634}
1635
1636int
1637linux_capset(struct thread *td, struct linux_capset_args *uap)
1638{
1639	struct l_user_cap_header luch;
1640	struct l_user_cap_data lucd[2];
1641	int error, i, u32s;
1642
1643	if (uap->hdrp == NULL || uap->datap == NULL)
1644		return (EFAULT);
1645
1646	error = copyin(uap->hdrp, &luch, sizeof(luch));
1647	if (error != 0)
1648		return (error);
1649
1650	switch (luch.version) {
1651	case _LINUX_CAPABILITY_VERSION_1:
1652		u32s = 1;
1653		break;
1654	case _LINUX_CAPABILITY_VERSION_2:
1655	case _LINUX_CAPABILITY_VERSION_3:
1656		u32s = 2;
1657		break;
1658	default:
1659		luch.version = _LINUX_CAPABILITY_VERSION_1;
1660		error = copyout(&luch, uap->hdrp, sizeof(luch));
1661		if (error)
1662			return (error);
1663		return (EINVAL);
1664	}
1665
1666	if (luch.pid)
1667		return (EPERM);
1668
1669	error = copyin(uap->datap, &lucd, u32s * sizeof(lucd[0]));
1670	if (error != 0)
1671		return (error);
1672
1673	/* We currently don't support setting any capabilities. */
1674	for (i = 0; i < u32s; i++) {
1675		if (lucd[i].effective || lucd[i].permitted ||
1676		    lucd[i].inheritable) {
1677			linux_msg(td,
1678			    "capset[%d] effective=0x%x, permitted=0x%x, "
1679			    "inheritable=0x%x is not implemented", i,
1680			    (int)lucd[i].effective, (int)lucd[i].permitted,
1681			    (int)lucd[i].inheritable);
1682			return (EPERM);
1683		}
1684	}
1685
1686	return (0);
1687}
1688
/*
 * prctl(2): dispatch on args->option.
 *
 * Several options are proxied to kern_procctl(), the name options read or
 * write p->p_comm, and the remaining recognized options are either
 * silently accepted or rejected with EINVAL.  Unrecognized options are
 * logged through linux_msg() and fail with EINVAL.
 */
int
linux_prctl(struct thread *td, struct linux_prctl_args *args)
{
	int error = 0, max_size, arg;
	struct proc *p = td->td_proc;
	char comm[LINUX_MAX_COMM_LEN];
	int pdeath_signal, trace_state;

	switch (args->option) {
	case LINUX_PR_SET_PDEATHSIG:
		if (!LINUX_SIG_VALID(args->arg2))
			return (EINVAL);
		pdeath_signal = linux_to_bsd_signal(args->arg2);
		return (kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_CTL,
		    &pdeath_signal));
	case LINUX_PR_GET_PDEATHSIG:
		error = kern_procctl(td, P_PID, 0, PROC_PDEATHSIG_STATUS,
		    &pdeath_signal);
		if (error != 0)
			return (error);
		/* arg2 is the user address that receives the Linux signal. */
		pdeath_signal = bsd_to_linux_signal(pdeath_signal);
		return (copyout(&pdeath_signal,
		    (void *)(register_t)args->arg2,
		    sizeof(pdeath_signal)));
	/*
	 * In Linux, this flag controls if set[gu]id processes can coredump.
	 * There are additional semantics imposed on processes that cannot
	 * coredump:
	 * - Such processes can not be ptraced.
	 * - There are some semantics around ownership of process-related files
	 *   in the /proc namespace.
	 *
	 * In FreeBSD, we can (and by default, do) disable setuid coredump
	 * system-wide with 'sugid_coredump.'  We control tracability on a
	 * per-process basis with the procctl PROC_TRACE (=> P2_NOTRACE flag).
	 * By happy coincidence, P2_NOTRACE also prevents coredumping.  So the
	 * procctl is roughly analogous to Linux's DUMPABLE.
	 *
	 * So, proxy these knobs to the corresponding PROC_TRACE setting.
	 */
	case LINUX_PR_GET_DUMPABLE:
		error = kern_procctl(td, P_PID, p->p_pid, PROC_TRACE_STATUS,
		    &trace_state);
		if (error != 0)
			return (error);
		/* Any status other than -1 is reported as dumpable. */
		td->td_retval[0] = (trace_state != -1);
		return (0);
	case LINUX_PR_SET_DUMPABLE:
		/*
		 * It is only valid for userspace to set one of these two
		 * flags, and only one at a time.
		 */
		switch (args->arg2) {
		case LINUX_SUID_DUMP_DISABLE:
			trace_state = PROC_TRACE_CTL_DISABLE_EXEC;
			break;
		case LINUX_SUID_DUMP_USER:
			trace_state = PROC_TRACE_CTL_ENABLE;
			break;
		default:
			return (EINVAL);
		}
		return (kern_procctl(td, P_PID, p->p_pid, PROC_TRACE_CTL,
		    &trace_state));
	case LINUX_PR_GET_KEEPCAPS:
		/*
		 * Indicate that we always clear the effective and
		 * permitted capability sets when the user id becomes
		 * non-zero (actually the capability sets are simply
		 * always zero in the current implementation).
		 */
		td->td_retval[0] = 0;
		break;
	case LINUX_PR_SET_KEEPCAPS:
		/*
		 * Ignore requests to keep the effective and permitted
		 * capability sets when the user id becomes non-zero.
		 */
		break;
	case LINUX_PR_SET_NAME:
		/*
		 * To be on the safe side we need to make sure to not
		 * overflow the size a Linux program expects. We already
		 * do this here in the copyin, so that we don't need to
		 * check on copyout.
		 */
		max_size = MIN(sizeof(comm), sizeof(p->p_comm));
		error = copyinstr((void *)(register_t)args->arg2, comm,
		    max_size, NULL);

		/* Linux silently truncates the name if it is too long. */
		if (error == ENAMETOOLONG) {
			/*
			 * XXX: copyinstr() isn't documented to populate the
			 * array completely, so do a copyin() to be on the
			 * safe side. This should be changed in case
			 * copyinstr() is changed to guarantee this.
			 */
			error = copyin((void *)(register_t)args->arg2, comm,
			    max_size - 1);
			comm[max_size - 1] = '\0';
		}
		if (error)
			return (error);

		/* p_comm is updated with the process lock held. */
		PROC_LOCK(p);
		strlcpy(p->p_comm, comm, sizeof(p->p_comm));
		PROC_UNLOCK(p);
		break;
	case LINUX_PR_GET_NAME:
		/* Snapshot p_comm under the lock, copy out after unlocking. */
		PROC_LOCK(p);
		strlcpy(comm, p->p_comm, sizeof(comm));
		PROC_UNLOCK(p);
		error = copyout(comm, (void *)(register_t)args->arg2,
		    strlen(comm) + 1);
		break;
	case LINUX_PR_GET_SECCOMP:
	case LINUX_PR_SET_SECCOMP:
		/*
		 * Same as returned by Linux without CONFIG_SECCOMP enabled.
		 */
		error = EINVAL;
		break;
	case LINUX_PR_CAPBSET_READ:
#if 0
		/*
		 * This makes too much noise with Ubuntu Focal.
		 */
		linux_msg(td, "unsupported prctl PR_CAPBSET_READ %d",
		    (int)args->arg2);
#endif
		error = EINVAL;
		break;
	case LINUX_PR_SET_CHILD_SUBREAPER:
		/* Zero releases reaper status; anything else acquires it. */
		if (args->arg2 == 0) {
			return (kern_procctl(td, P_PID, 0, PROC_REAP_RELEASE,
			    NULL));
		}

		return (kern_procctl(td, P_PID, 0, PROC_REAP_ACQUIRE,
		    NULL));
	case LINUX_PR_SET_NO_NEW_PRIVS:
		/* Only arg2 == 1 requests enabling; all else maps to disable. */
		arg = args->arg2 == 1 ?
		    PROC_NO_NEW_PRIVS_ENABLE : PROC_NO_NEW_PRIVS_DISABLE;
		error = kern_procctl(td, P_PID, p->p_pid,
		    PROC_NO_NEW_PRIVS_CTL, &arg);
		break;
	case LINUX_PR_SET_PTRACER:
		linux_msg(td, "unsupported prctl PR_SET_PTRACER");
		error = EINVAL;
		break;
	default:
		linux_msg(td, "unsupported prctl option %d", args->option);
		error = EINVAL;
		break;
	}

	return (error);
}
1848
1849int
1850linux_sched_setparam(struct thread *td,
1851    struct linux_sched_setparam_args *uap)
1852{
1853	struct sched_param sched_param;
1854	struct thread *tdt;
1855	int error, policy;
1856
1857	error = copyin(uap->param, &sched_param, sizeof(sched_param));
1858	if (error)
1859		return (error);
1860
1861	tdt = linux_tdfind(td, uap->pid, -1);
1862	if (tdt == NULL)
1863		return (ESRCH);
1864
1865	if (linux_map_sched_prio) {
1866		error = kern_sched_getscheduler(td, tdt, &policy);
1867		if (error)
1868			goto out;
1869
1870		switch (policy) {
1871		case SCHED_OTHER:
1872			if (sched_param.sched_priority != 0) {
1873				error = EINVAL;
1874				goto out;
1875			}
1876			sched_param.sched_priority =
1877			    PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
1878			break;
1879		case SCHED_FIFO:
1880		case SCHED_RR:
1881			if (sched_param.sched_priority < 1 ||
1882			    sched_param.sched_priority >= LINUX_MAX_RT_PRIO) {
1883				error = EINVAL;
1884				goto out;
1885			}
1886			/*
1887			 * Map [1, LINUX_MAX_RT_PRIO - 1] to
1888			 * [0, RTP_PRIO_MAX - RTP_PRIO_MIN] (rounding down).
1889			 */
1890			sched_param.sched_priority =
1891			    (sched_param.sched_priority - 1) *
1892			    (RTP_PRIO_MAX - RTP_PRIO_MIN + 1) /
1893			    (LINUX_MAX_RT_PRIO - 1);
1894			break;
1895		}
1896	}
1897
1898	error = kern_sched_setparam(td, tdt, &sched_param);
1899out:	PROC_UNLOCK(tdt->td_proc);
1900	return (error);
1901}
1902
1903int
1904linux_sched_getparam(struct thread *td,
1905    struct linux_sched_getparam_args *uap)
1906{
1907	struct sched_param sched_param;
1908	struct thread *tdt;
1909	int error, policy;
1910
1911	tdt = linux_tdfind(td, uap->pid, -1);
1912	if (tdt == NULL)
1913		return (ESRCH);
1914
1915	error = kern_sched_getparam(td, tdt, &sched_param);
1916	if (error) {
1917		PROC_UNLOCK(tdt->td_proc);
1918		return (error);
1919	}
1920
1921	if (linux_map_sched_prio) {
1922		error = kern_sched_getscheduler(td, tdt, &policy);
1923		PROC_UNLOCK(tdt->td_proc);
1924		if (error)
1925			return (error);
1926
1927		switch (policy) {
1928		case SCHED_OTHER:
1929			sched_param.sched_priority = 0;
1930			break;
1931		case SCHED_FIFO:
1932		case SCHED_RR:
1933			/*
1934			 * Map [0, RTP_PRIO_MAX - RTP_PRIO_MIN] to
1935			 * [1, LINUX_MAX_RT_PRIO - 1] (rounding up).
1936			 */
1937			sched_param.sched_priority =
1938			    (sched_param.sched_priority *
1939			    (LINUX_MAX_RT_PRIO - 1) +
1940			    (RTP_PRIO_MAX - RTP_PRIO_MIN - 1)) /
1941			    (RTP_PRIO_MAX - RTP_PRIO_MIN) + 1;
1942			break;
1943		}
1944	} else
1945		PROC_UNLOCK(tdt->td_proc);
1946
1947	error = copyout(&sched_param, uap->param, sizeof(sched_param));
1948	return (error);
1949}
1950
1951/*
1952 * Get affinity of a process.
1953 */
1954int
1955linux_sched_getaffinity(struct thread *td,
1956    struct linux_sched_getaffinity_args *args)
1957{
1958	struct thread *tdt;
1959	cpuset_t *mask;
1960	size_t size;
1961	int error;
1962	id_t tid;
1963
1964	tdt = linux_tdfind(td, args->pid, -1);
1965	if (tdt == NULL)
1966		return (ESRCH);
1967	tid = tdt->td_tid;
1968	PROC_UNLOCK(tdt->td_proc);
1969
1970	mask = malloc(sizeof(cpuset_t), M_LINUX, M_WAITOK | M_ZERO);
1971	size = min(args->len, sizeof(cpuset_t));
1972	error = kern_cpuset_getaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
1973	    tid, size, mask);
1974	if (error == ERANGE)
1975		error = EINVAL;
1976 	if (error == 0)
1977		error = copyout(mask, args->user_mask_ptr, size);
1978	if (error == 0)
1979		td->td_retval[0] = size;
1980	free(mask, M_LINUX);
1981	return (error);
1982}
1983
1984/*
1985 *  Set affinity of a process.
1986 */
1987int
1988linux_sched_setaffinity(struct thread *td,
1989    struct linux_sched_setaffinity_args *args)
1990{
1991	struct thread *tdt;
1992	cpuset_t *mask;
1993	int cpu, error;
1994	size_t len;
1995	id_t tid;
1996
1997	tdt = linux_tdfind(td, args->pid, -1);
1998	if (tdt == NULL)
1999		return (ESRCH);
2000	tid = tdt->td_tid;
2001	PROC_UNLOCK(tdt->td_proc);
2002
2003	len = min(args->len, sizeof(cpuset_t));
2004	mask = malloc(sizeof(cpuset_t), M_TEMP, M_WAITOK | M_ZERO);
2005	error = copyin(args->user_mask_ptr, mask, len);
2006	if (error != 0)
2007		goto out;
2008	/* Linux ignore high bits */
2009	CPU_FOREACH_ISSET(cpu, mask)
2010		if (cpu > mp_maxid)
2011			CPU_CLR(cpu, mask);
2012
2013	error = kern_cpuset_setaffinity(td, CPU_LEVEL_WHICH, CPU_WHICH_TID,
2014	    tid, mask);
2015	if (error == EDEADLK)
2016		error = EINVAL;
2017out:
2018	free(mask, M_TEMP);
2019	return (error);
2020}
2021
/*
 * Resource limit pair as copied out to userspace by linux_prlimit64();
 * both fields are unsigned 64-bit values.
 */
struct linux_rlimit64 {
	uint64_t	rlim_cur;
	uint64_t	rlim_max;
};
2026
2027int
2028linux_prlimit64(struct thread *td, struct linux_prlimit64_args *args)
2029{
2030	struct rlimit rlim, nrlim;
2031	struct linux_rlimit64 lrlim;
2032	struct proc *p;
2033	u_int which;
2034	int flags;
2035	int error;
2036
2037	if (args->new == NULL && args->old != NULL) {
2038		if (linux_get_dummy_limit(td, args->resource, &rlim)) {
2039			lrlim.rlim_cur = rlim.rlim_cur;
2040			lrlim.rlim_max = rlim.rlim_max;
2041			return (copyout(&lrlim, args->old, sizeof(lrlim)));
2042		}
2043	}
2044
2045	if (args->resource >= LINUX_RLIM_NLIMITS)
2046		return (EINVAL);
2047
2048	which = linux_to_bsd_resource[args->resource];
2049	if (which == -1)
2050		return (EINVAL);
2051
2052	if (args->new != NULL) {
2053		/*
2054		 * Note. Unlike FreeBSD where rlim is signed 64-bit Linux
2055		 * rlim is unsigned 64-bit. FreeBSD treats negative limits
2056		 * as INFINITY so we do not need a conversion even.
2057		 */
2058		error = copyin(args->new, &nrlim, sizeof(nrlim));
2059		if (error != 0)
2060			return (error);
2061	}
2062
2063	flags = PGET_HOLD | PGET_NOTWEXIT;
2064	if (args->new != NULL)
2065		flags |= PGET_CANDEBUG;
2066	else
2067		flags |= PGET_CANSEE;
2068	if (args->pid == 0) {
2069		p = td->td_proc;
2070		PHOLD(p);
2071	} else {
2072		error = pget(args->pid, flags, &p);
2073		if (error != 0)
2074			return (error);
2075	}
2076	if (args->old != NULL) {
2077		PROC_LOCK(p);
2078		lim_rlimit_proc(p, which, &rlim);
2079		PROC_UNLOCK(p);
2080		if (rlim.rlim_cur == RLIM_INFINITY)
2081			lrlim.rlim_cur = LINUX_RLIM_INFINITY;
2082		else
2083			lrlim.rlim_cur = rlim.rlim_cur;
2084		if (rlim.rlim_max == RLIM_INFINITY)
2085			lrlim.rlim_max = LINUX_RLIM_INFINITY;
2086		else
2087			lrlim.rlim_max = rlim.rlim_max;
2088		error = copyout(&lrlim, args->old, sizeof(lrlim));
2089		if (error != 0)
2090			goto out;
2091	}
2092
2093	if (args->new != NULL)
2094		error = kern_proc_setrlimit(td, p, which, &nrlim);
2095
2096 out:
2097	PRELE(p);
2098	return (error);
2099}
2100
2101int
2102linux_pselect6(struct thread *td, struct linux_pselect6_args *args)
2103{
2104	struct timespec ts, *tsp;
2105	int error;
2106
2107	if (args->tsp != NULL) {
2108		error = linux_get_timespec(&ts, args->tsp);
2109		if (error != 0)
2110			return (error);
2111		tsp = &ts;
2112	} else
2113		tsp = NULL;
2114
2115	error = linux_common_pselect6(td, args->nfds, args->readfds,
2116	    args->writefds, args->exceptfds, tsp, args->sig);
2117
2118	if (args->tsp != NULL)
2119		linux_put_timespec(&ts, args->tsp);
2120	return (error);
2121}
2122
2123static int
2124linux_common_pselect6(struct thread *td, l_int nfds, l_fd_set *readfds,
2125    l_fd_set *writefds, l_fd_set *exceptfds, struct timespec *tsp,
2126    l_uintptr_t *sig)
2127{
2128	struct timeval utv, tv0, tv1, *tvp;
2129	struct l_pselect6arg lpse6;
2130	sigset_t *ssp;
2131	sigset_t ss;
2132	int error;
2133
2134	ssp = NULL;
2135	if (sig != NULL) {
2136		error = copyin(sig, &lpse6, sizeof(lpse6));
2137		if (error != 0)
2138			return (error);
2139		error = linux_copyin_sigset(td, PTRIN(lpse6.ss),
2140		    lpse6.ss_len, &ss, &ssp);
2141		if (error != 0)
2142		    return (error);
2143	} else
2144		ssp = NULL;
2145
2146	/*
2147	 * Currently glibc changes nanosecond number to microsecond.
2148	 * This mean losing precision but for now it is hardly seen.
2149	 */
2150	if (tsp != NULL) {
2151		TIMESPEC_TO_TIMEVAL(&utv, tsp);
2152		if (itimerfix(&utv))
2153			return (EINVAL);
2154
2155		microtime(&tv0);
2156		tvp = &utv;
2157	} else
2158		tvp = NULL;
2159
2160	error = kern_pselect(td, nfds, readfds, writefds,
2161	    exceptfds, tvp, ssp, LINUX_NFDBITS);
2162
2163	if (tsp != NULL) {
2164		/*
2165		 * Compute how much time was left of the timeout,
2166		 * by subtracting the current time and the time
2167		 * before we started the call, and subtracting
2168		 * that result from the user-supplied value.
2169		 */
2170		microtime(&tv1);
2171		timevalsub(&tv1, &tv0);
2172		timevalsub(&utv, &tv1);
2173		if (utv.tv_sec < 0)
2174			timevalclear(&utv);
2175		TIMEVAL_TO_TIMESPEC(&utv, tsp);
2176	}
2177	return (error);
2178}
2179
2180#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
2181int
2182linux_pselect6_time64(struct thread *td,
2183    struct linux_pselect6_time64_args *args)
2184{
2185	struct timespec ts, *tsp;
2186	int error;
2187
2188	if (args->tsp != NULL) {
2189		error = linux_get_timespec64(&ts, args->tsp);
2190		if (error != 0)
2191			return (error);
2192		tsp = &ts;
2193	} else
2194		tsp = NULL;
2195
2196	error = linux_common_pselect6(td, args->nfds, args->readfds,
2197	    args->writefds, args->exceptfds, tsp, args->sig);
2198
2199	if (args->tsp != NULL)
2200		linux_put_timespec64(&ts, args->tsp);
2201	return (error);
2202}
2203#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
2204
2205int
2206linux_ppoll(struct thread *td, struct linux_ppoll_args *args)
2207{
2208	struct timespec uts, *tsp;
2209	int error;
2210
2211	if (args->tsp != NULL) {
2212		error = linux_get_timespec(&uts, args->tsp);
2213		if (error != 0)
2214			return (error);
2215		tsp = &uts;
2216	} else
2217		tsp = NULL;
2218
2219	error = linux_common_ppoll(td, args->fds, args->nfds, tsp,
2220	    args->sset, args->ssize);
2221	if (error == 0 && args->tsp != NULL)
2222		error = linux_put_timespec(&uts, args->tsp);
2223	return (error);
2224}
2225
2226static int
2227linux_common_ppoll(struct thread *td, struct pollfd *fds, uint32_t nfds,
2228    struct timespec *tsp, l_sigset_t *sset, l_size_t ssize)
2229{
2230	struct timespec ts0, ts1;
2231	struct pollfd stackfds[32];
2232	struct pollfd *kfds;
2233 	sigset_t *ssp;
2234 	sigset_t ss;
2235 	int error;
2236
2237	if (kern_poll_maxfds(nfds))
2238		return (EINVAL);
2239	if (sset != NULL) {
2240		error = linux_copyin_sigset(td, sset, ssize, &ss, &ssp);
2241		if (error != 0)
2242		    return (error);
2243	} else
2244		ssp = NULL;
2245	if (tsp != NULL)
2246		nanotime(&ts0);
2247
2248	if (nfds > nitems(stackfds))
2249		kfds = mallocarray(nfds, sizeof(*kfds), M_TEMP, M_WAITOK);
2250	else
2251		kfds = stackfds;
2252	error = linux_pollin(td, kfds, fds, nfds);
2253	if (error != 0)
2254		goto out;
2255
2256	error = kern_poll_kfds(td, kfds, nfds, tsp, ssp);
2257	if (error == 0)
2258		error = linux_pollout(td, kfds, fds, nfds);
2259
2260	if (error == 0 && tsp != NULL) {
2261		if (td->td_retval[0]) {
2262			nanotime(&ts1);
2263			timespecsub(&ts1, &ts0, &ts1);
2264			timespecsub(tsp, &ts1, tsp);
2265			if (tsp->tv_sec < 0)
2266				timespecclear(tsp);
2267		} else
2268			timespecclear(tsp);
2269	}
2270
2271out:
2272	if (nfds > nitems(stackfds))
2273		free(kfds, M_TEMP);
2274	return (error);
2275}
2276
2277#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
2278int
2279linux_ppoll_time64(struct thread *td, struct linux_ppoll_time64_args *args)
2280{
2281	struct timespec uts, *tsp;
2282	int error;
2283
2284	if (args->tsp != NULL) {
2285		error = linux_get_timespec64(&uts, args->tsp);
2286		if (error != 0)
2287			return (error);
2288		tsp = &uts;
2289	} else
2290 		tsp = NULL;
2291	error = linux_common_ppoll(td, args->fds, args->nfds, tsp,
2292	    args->sset, args->ssize);
2293	if (error == 0 && args->tsp != NULL)
2294		error = linux_put_timespec64(&uts, args->tsp);
2295	return (error);
2296}
2297#endif /* __i386__ || (__amd64__ && COMPAT_LINUX32) */
2298
2299static int
2300linux_pollin(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
2301{
2302	int error;
2303	u_int i;
2304
2305	error = copyin(ufds, fds, nfd * sizeof(*fds));
2306	if (error != 0)
2307		return (error);
2308
2309	for (i = 0; i < nfd; i++) {
2310		if (fds->events != 0)
2311			linux_to_bsd_poll_events(td, fds->fd,
2312			    fds->events, &fds->events);
2313		fds++;
2314	}
2315	return (0);
2316}
2317
2318static int
2319linux_pollout(struct thread *td, struct pollfd *fds, struct pollfd *ufds, u_int nfd)
2320{
2321	int error = 0;
2322	u_int i, n = 0;
2323
2324	for (i = 0; i < nfd; i++) {
2325		if (fds->revents != 0) {
2326			bsd_to_linux_poll_events(fds->revents,
2327			    &fds->revents);
2328			n++;
2329		}
2330		error = copyout(&fds->revents, &ufds->revents,
2331		    sizeof(ufds->revents));
2332		if (error)
2333			return (error);
2334		fds++;
2335		ufds++;
2336	}
2337	td->td_retval[0] = n;
2338	return (0);
2339}
2340
2341static int
2342linux_sched_rr_get_interval_common(struct thread *td, pid_t pid,
2343    struct timespec *ts)
2344{
2345	struct thread *tdt;
2346	int error;
2347
2348	/*
2349	 * According to man in case the invalid pid specified
2350	 * EINVAL should be returned.
2351	 */
2352	if (pid < 0)
2353		return (EINVAL);
2354
2355	tdt = linux_tdfind(td, pid, -1);
2356	if (tdt == NULL)
2357		return (ESRCH);
2358
2359	error = kern_sched_rr_get_interval_td(td, tdt, ts);
2360	PROC_UNLOCK(tdt->td_proc);
2361	return (error);
2362}
2363
2364int
2365linux_sched_rr_get_interval(struct thread *td,
2366    struct linux_sched_rr_get_interval_args *uap)
2367{
2368	struct timespec ts;
2369	int error;
2370
2371	error = linux_sched_rr_get_interval_common(td, uap->pid, &ts);
2372	if (error != 0)
2373		return (error);
2374	return (linux_put_timespec(&ts, uap->interval));
2375}
2376
#if defined(__i386__) || (defined(__amd64__) && defined(COMPAT_LINUX32))
/*
 * Same as linux_sched_rr_get_interval(), except that the result is stored
 * at uap->interval with linux_put_timespec64().
 */
int
linux_sched_rr_get_interval_time64(struct thread *td,
    struct linux_sched_rr_get_interval_time64_args *uap)
{
	struct timespec interval;
	int error;

	error = linux_sched_rr_get_interval_common(td, uap->pid, &interval);
	if (error == 0)
		error = linux_put_timespec64(&interval, uap->interval);
	return (error);
}
#endif
2391
2392/*
2393 * In case when the Linux thread is the initial thread in
2394 * the thread group thread id is equal to the process id.
2395 * Glibc depends on this magic (assert in pthread_getattr_np.c).
2396 */
2397struct thread *
2398linux_tdfind(struct thread *td, lwpid_t tid, pid_t pid)
2399{
2400	struct linux_emuldata *em;
2401	struct thread *tdt;
2402	struct proc *p;
2403
2404	tdt = NULL;
2405	if (tid == 0 || tid == td->td_tid) {
2406		if (pid != -1 && td->td_proc->p_pid != pid)
2407			return (NULL);
2408		PROC_LOCK(td->td_proc);
2409		return (td);
2410	} else if (tid > PID_MAX)
2411		return (tdfind(tid, pid));
2412
2413	/*
2414	 * Initial thread where the tid equal to the pid.
2415	 */
2416	p = pfind(tid);
2417	if (p != NULL) {
2418		if (SV_PROC_ABI(p) != SV_ABI_LINUX ||
2419		    (pid != -1 && tid != pid)) {
2420			/*
2421			 * p is not a Linuxulator process.
2422			 */
2423			PROC_UNLOCK(p);
2424			return (NULL);
2425		}
2426		FOREACH_THREAD_IN_PROC(p, tdt) {
2427			em = em_find(tdt);
2428			if (tid == em->em_tid)
2429				return (tdt);
2430		}
2431		PROC_UNLOCK(p);
2432	}
2433	return (NULL);
2434}
2435
2436void
2437linux_to_bsd_waitopts(int options, int *bsdopts)
2438{
2439
2440	if (options & LINUX_WNOHANG)
2441		*bsdopts |= WNOHANG;
2442	if (options & LINUX_WUNTRACED)
2443		*bsdopts |= WUNTRACED;
2444	if (options & LINUX_WEXITED)
2445		*bsdopts |= WEXITED;
2446	if (options & LINUX_WCONTINUED)
2447		*bsdopts |= WCONTINUED;
2448	if (options & LINUX_WNOWAIT)
2449		*bsdopts |= WNOWAIT;
2450
2451	if (options & __WCLONE)
2452		*bsdopts |= WLINUXCLONE;
2453}
2454
2455int
2456linux_getrandom(struct thread *td, struct linux_getrandom_args *args)
2457{
2458	struct uio uio;
2459	struct iovec iov;
2460	int error;
2461
2462	if (args->flags & ~(LINUX_GRND_NONBLOCK|LINUX_GRND_RANDOM))
2463		return (EINVAL);
2464	if (args->count > INT_MAX)
2465		args->count = INT_MAX;
2466
2467	iov.iov_base = args->buf;
2468	iov.iov_len = args->count;
2469
2470	uio.uio_iov = &iov;
2471	uio.uio_iovcnt = 1;
2472	uio.uio_resid = iov.iov_len;
2473	uio.uio_segflg = UIO_USERSPACE;
2474	uio.uio_rw = UIO_READ;
2475	uio.uio_td = td;
2476
2477	error = read_random_uio(&uio, args->flags & LINUX_GRND_NONBLOCK);
2478	if (error == 0)
2479		td->td_retval[0] = args->count - uio.uio_resid;
2480	return (error);
2481}
2482
2483int
2484linux_mincore(struct thread *td, struct linux_mincore_args *args)
2485{
2486
2487	/* Needs to be page-aligned */
2488	if (args->start & PAGE_MASK)
2489		return (EINVAL);
2490	return (kern_mincore(td, args->start, args->len, args->vec));
2491}
2492
2493#define	SYSLOG_TAG	"<6>"
2494
2495int
2496linux_syslog(struct thread *td, struct linux_syslog_args *args)
2497{
2498	char buf[128], *src, *dst;
2499	u_int seq;
2500	int buflen, error;
2501
2502	if (args->type != LINUX_SYSLOG_ACTION_READ_ALL) {
2503		linux_msg(td, "syslog unsupported type 0x%x", args->type);
2504		return (EINVAL);
2505	}
2506
2507	if (args->len < 6) {
2508		td->td_retval[0] = 0;
2509		return (0);
2510	}
2511
2512	error = priv_check(td, PRIV_MSGBUF);
2513	if (error)
2514		return (error);
2515
2516	mtx_lock(&msgbuf_lock);
2517	msgbuf_peekbytes(msgbufp, NULL, 0, &seq);
2518	mtx_unlock(&msgbuf_lock);
2519
2520	dst = args->buf;
2521	error = copyout(&SYSLOG_TAG, dst, sizeof(SYSLOG_TAG));
2522	/* The -1 is to skip the trailing '\0'. */
2523	dst += sizeof(SYSLOG_TAG) - 1;
2524
2525	while (error == 0) {
2526		mtx_lock(&msgbuf_lock);
2527		buflen = msgbuf_peekbytes(msgbufp, buf, sizeof(buf), &seq);
2528		mtx_unlock(&msgbuf_lock);
2529
2530		if (buflen == 0)
2531			break;
2532
2533		for (src = buf; src < buf + buflen && error == 0; src++) {
2534			if (*src == '\0')
2535				continue;
2536
2537			if (dst >= args->buf + args->len)
2538				goto out;
2539
2540			error = copyout(src, dst, 1);
2541			dst++;
2542
2543			if (*src == '\n' && *(src + 1) != '<' &&
2544			    dst + sizeof(SYSLOG_TAG) < args->buf + args->len) {
2545				error = copyout(&SYSLOG_TAG,
2546				    dst, sizeof(SYSLOG_TAG));
2547				dst += sizeof(SYSLOG_TAG) - 1;
2548			}
2549		}
2550	}
2551out:
2552	td->td_retval[0] = dst - args->buf;
2553	return (error);
2554}
2555
2556int
2557linux_getcpu(struct thread *td, struct linux_getcpu_args *args)
2558{
2559	int cpu, error, node;
2560
2561	cpu = td->td_oncpu; /* Make sure it doesn't change during copyout(9) */
2562	error = 0;
2563	node = cpuid_to_pcpu[cpu]->pc_domain;
2564
2565	if (args->cpu != NULL)
2566		error = copyout(&cpu, args->cpu, sizeof(l_int));
2567	if (args->node != NULL)
2568		error = copyout(&node, args->node, sizeof(l_int));
2569	return (error);
2570}
2571
2572#if defined(__i386__) || defined(__amd64__)
2573int
2574linux_poll(struct thread *td, struct linux_poll_args *args)
2575{
2576	struct timespec ts, *tsp;
2577
2578	if (args->timeout != INFTIM) {
2579		if (args->timeout < 0)
2580			return (EINVAL);
2581		ts.tv_sec = args->timeout / 1000;
2582		ts.tv_nsec = (args->timeout % 1000) * 1000000;
2583		tsp = &ts;
2584	} else
2585		tsp = NULL;
2586
2587	return (linux_common_ppoll(td, args->fds, args->nfds,
2588	    tsp, NULL, 0));
2589}
2590#endif /* __i386__ || __amd64__ */
2591
2592int
2593linux_seccomp(struct thread *td, struct linux_seccomp_args *args)
2594{
2595
2596	switch (args->op) {
2597	case LINUX_SECCOMP_GET_ACTION_AVAIL:
2598		return (EOPNOTSUPP);
2599	default:
2600		/*
2601		 * Ignore unknown operations, just like Linux kernel built
2602		 * without CONFIG_SECCOMP.
2603		 */
2604		return (EINVAL);
2605	}
2606}
2607
2608/*
2609 * Custom version of exec_copyin_args(), to copy out argument and environment
2610 * strings from the old process address space into the temporary string buffer.
2611 * Based on freebsd32_exec_copyin_args.
2612 */
2613static int
2614linux_exec_copyin_args(struct image_args *args, const char *fname,
2615    enum uio_seg segflg, l_uintptr_t *argv, l_uintptr_t *envv)
2616{
2617	char *argp, *envp;
2618	l_uintptr_t *ptr, arg;
2619	int error;
2620
2621	bzero(args, sizeof(*args));
2622	if (argv == NULL)
2623		return (EFAULT);
2624
2625	/*
2626	 * Allocate demand-paged memory for the file name, argument, and
2627	 * environment strings.
2628	 */
2629	error = exec_alloc_args(args);
2630	if (error != 0)
2631		return (error);
2632
2633	/*
2634	 * Copy the file name.
2635	 */
2636	error = exec_args_add_fname(args, fname, segflg);
2637	if (error != 0)
2638		goto err_exit;
2639
2640	/*
2641	 * extract arguments first
2642	 */
2643	ptr = argv;
2644	for (;;) {
2645		error = copyin(ptr++, &arg, sizeof(arg));
2646		if (error)
2647			goto err_exit;
2648		if (arg == 0)
2649			break;
2650		argp = PTRIN(arg);
2651		error = exec_args_add_arg(args, argp, UIO_USERSPACE);
2652		if (error != 0)
2653			goto err_exit;
2654	}
2655
2656	/*
2657	 * This comment is from Linux do_execveat_common:
2658	 * When argv is empty, add an empty string ("") as argv[0] to
2659	 * ensure confused userspace programs that start processing
2660	 * from argv[1] won't end up walking envp.
2661	 */
2662	if (args->argc == 0 &&
2663	    (error = exec_args_add_arg(args, "", UIO_SYSSPACE) != 0))
2664		goto err_exit;
2665
2666	/*
2667	 * extract environment strings
2668	 */
2669	if (envv) {
2670		ptr = envv;
2671		for (;;) {
2672			error = copyin(ptr++, &arg, sizeof(arg));
2673			if (error)
2674				goto err_exit;
2675			if (arg == 0)
2676				break;
2677			envp = PTRIN(arg);
2678			error = exec_args_add_env(args, envp, UIO_USERSPACE);
2679			if (error != 0)
2680				goto err_exit;
2681		}
2682	}
2683
2684	return (0);
2685
2686err_exit:
2687	exec_free_args(args);
2688	return (error);
2689}
2690
2691int
2692linux_execve(struct thread *td, struct linux_execve_args *args)
2693{
2694	struct image_args eargs;
2695	int error;
2696
2697	LINUX_CTR(execve);
2698
2699	error = linux_exec_copyin_args(&eargs, args->path, UIO_USERSPACE,
2700	    args->argp, args->envp);
2701	if (error == 0)
2702		error = linux_common_execve(td, &eargs);
2703	AUDIT_SYSCALL_EXIT(error == EJUSTRETURN ? 0 : error, td);
2704	return (error);
2705}
2706
2707static void
2708linux_up_rtprio_if(struct thread *td1, struct rtprio *rtp)
2709{
2710	struct rtprio rtp2;
2711
2712	pri_to_rtp(td1, &rtp2);
2713	if (rtp2.type <  rtp->type ||
2714	    (rtp2.type == rtp->type &&
2715	    rtp2.prio < rtp->prio)) {
2716		rtp->type = rtp2.type;
2717		rtp->prio = rtp2.prio;
2718	}
2719}
2720
2721#define	LINUX_PRIO_DIVIDER	RTP_PRIO_MAX / LINUX_IOPRIO_MAX
2722
2723static int
2724linux_rtprio2ioprio(struct rtprio *rtp)
2725{
2726	int ioprio, prio;
2727
2728	switch (rtp->type) {
2729	case RTP_PRIO_IDLE:
2730		prio = RTP_PRIO_MIN;
2731		ioprio = LINUX_IOPRIO_PRIO(LINUX_IOPRIO_CLASS_IDLE, prio);
2732		break;
2733	case RTP_PRIO_NORMAL:
2734		prio = rtp->prio / LINUX_PRIO_DIVIDER;
2735		ioprio = LINUX_IOPRIO_PRIO(LINUX_IOPRIO_CLASS_BE, prio);
2736		break;
2737	case RTP_PRIO_REALTIME:
2738		prio = rtp->prio / LINUX_PRIO_DIVIDER;
2739		ioprio = LINUX_IOPRIO_PRIO(LINUX_IOPRIO_CLASS_RT, prio);
2740		break;
2741	default:
2742		prio = RTP_PRIO_MIN;
2743		ioprio = LINUX_IOPRIO_PRIO(LINUX_IOPRIO_CLASS_NONE, prio);
2744		break;
2745	}
2746	return (ioprio);
2747}
2748
2749static int
2750linux_ioprio2rtprio(int ioprio, struct rtprio *rtp)
2751{
2752
2753	switch (LINUX_IOPRIO_PRIO_CLASS(ioprio)) {
2754	case LINUX_IOPRIO_CLASS_IDLE:
2755		rtp->prio = RTP_PRIO_MIN;
2756		rtp->type = RTP_PRIO_IDLE;
2757		break;
2758	case LINUX_IOPRIO_CLASS_BE:
2759		rtp->prio = LINUX_IOPRIO_PRIO_DATA(ioprio) * LINUX_PRIO_DIVIDER;
2760		rtp->type = RTP_PRIO_NORMAL;
2761		break;
2762	case LINUX_IOPRIO_CLASS_RT:
2763		rtp->prio = LINUX_IOPRIO_PRIO_DATA(ioprio) * LINUX_PRIO_DIVIDER;
2764		rtp->type = RTP_PRIO_REALTIME;
2765		break;
2766	default:
2767		return (EINVAL);
2768	}
2769	return (0);
2770}
2771#undef LINUX_PRIO_DIVIDER
2772
2773int
2774linux_ioprio_get(struct thread *td, struct linux_ioprio_get_args *args)
2775{
2776	struct thread *td1;
2777	struct rtprio rtp;
2778	struct pgrp *pg;
2779	struct proc *p;
2780	int error, found;
2781
2782	p = NULL;
2783	td1 = NULL;
2784	error = 0;
2785	found = 0;
2786	rtp.type = RTP_PRIO_IDLE;
2787	rtp.prio = RTP_PRIO_MAX;
2788	switch (args->which) {
2789	case LINUX_IOPRIO_WHO_PROCESS:
2790		if (args->who == 0) {
2791			td1 = td;
2792			p = td1->td_proc;
2793			PROC_LOCK(p);
2794		} else if (args->who > PID_MAX) {
2795			td1 = linux_tdfind(td, args->who, -1);
2796			if (td1 != NULL)
2797				p = td1->td_proc;
2798		} else
2799			p = pfind(args->who);
2800		if (p == NULL)
2801			return (ESRCH);
2802		if ((error = p_cansee(td, p))) {
2803			PROC_UNLOCK(p);
2804			break;
2805		}
2806		if (td1 != NULL) {
2807			pri_to_rtp(td1, &rtp);
2808		} else {
2809			FOREACH_THREAD_IN_PROC(p, td1) {
2810				linux_up_rtprio_if(td1, &rtp);
2811			}
2812		}
2813		found++;
2814		PROC_UNLOCK(p);
2815		break;
2816	case LINUX_IOPRIO_WHO_PGRP:
2817		sx_slock(&proctree_lock);
2818		if (args->who == 0) {
2819			pg = td->td_proc->p_pgrp;
2820			PGRP_LOCK(pg);
2821		} else {
2822			pg = pgfind(args->who);
2823			if (pg == NULL) {
2824				sx_sunlock(&proctree_lock);
2825				error = ESRCH;
2826				break;
2827			}
2828		}
2829		sx_sunlock(&proctree_lock);
2830		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
2831			PROC_LOCK(p);
2832			if (p->p_state == PRS_NORMAL &&
2833			    p_cansee(td, p) == 0) {
2834				FOREACH_THREAD_IN_PROC(p, td1) {
2835					linux_up_rtprio_if(td1, &rtp);
2836					found++;
2837				}
2838			}
2839			PROC_UNLOCK(p);
2840		}
2841		PGRP_UNLOCK(pg);
2842		break;
2843	case LINUX_IOPRIO_WHO_USER:
2844		if (args->who == 0)
2845			args->who = td->td_ucred->cr_uid;
2846		sx_slock(&allproc_lock);
2847		FOREACH_PROC_IN_SYSTEM(p) {
2848			PROC_LOCK(p);
2849			if (p->p_state == PRS_NORMAL &&
2850			    p->p_ucred->cr_uid == args->who &&
2851			    p_cansee(td, p) == 0) {
2852				FOREACH_THREAD_IN_PROC(p, td1) {
2853					linux_up_rtprio_if(td1, &rtp);
2854					found++;
2855				}
2856			}
2857			PROC_UNLOCK(p);
2858		}
2859		sx_sunlock(&allproc_lock);
2860		break;
2861	default:
2862		error = EINVAL;
2863		break;
2864	}
2865	if (error == 0) {
2866		if (found != 0)
2867			td->td_retval[0] = linux_rtprio2ioprio(&rtp);
2868		else
2869			error = ESRCH;
2870	}
2871	return (error);
2872}
2873
2874int
2875linux_ioprio_set(struct thread *td, struct linux_ioprio_set_args *args)
2876{
2877	struct thread *td1;
2878	struct rtprio rtp;
2879	struct pgrp *pg;
2880	struct proc *p;
2881	int error;
2882
2883	if ((error = linux_ioprio2rtprio(args->ioprio, &rtp)) != 0)
2884		return (error);
2885	/* Attempts to set high priorities (REALTIME) require su privileges. */
2886	if (RTP_PRIO_BASE(rtp.type) == RTP_PRIO_REALTIME &&
2887	    (error = priv_check(td, PRIV_SCHED_RTPRIO)) != 0)
2888		return (error);
2889
2890	p = NULL;
2891	td1 = NULL;
2892	switch (args->which) {
2893	case LINUX_IOPRIO_WHO_PROCESS:
2894		if (args->who == 0) {
2895			td1 = td;
2896			p = td1->td_proc;
2897			PROC_LOCK(p);
2898		} else if (args->who > PID_MAX) {
2899			td1 = linux_tdfind(td, args->who, -1);
2900			if (td1 != NULL)
2901				p = td1->td_proc;
2902		} else
2903			p = pfind(args->who);
2904		if (p == NULL)
2905			return (ESRCH);
2906		if ((error = p_cansched(td, p))) {
2907			PROC_UNLOCK(p);
2908			break;
2909		}
2910		if (td1 != NULL) {
2911			error = rtp_to_pri(&rtp, td1);
2912		} else {
2913			FOREACH_THREAD_IN_PROC(p, td1) {
2914				if ((error = rtp_to_pri(&rtp, td1)) != 0)
2915					break;
2916			}
2917		}
2918		PROC_UNLOCK(p);
2919		break;
2920	case LINUX_IOPRIO_WHO_PGRP:
2921		sx_slock(&proctree_lock);
2922		if (args->who == 0) {
2923			pg = td->td_proc->p_pgrp;
2924			PGRP_LOCK(pg);
2925		} else {
2926			pg = pgfind(args->who);
2927			if (pg == NULL) {
2928				sx_sunlock(&proctree_lock);
2929				error = ESRCH;
2930				break;
2931			}
2932		}
2933		sx_sunlock(&proctree_lock);
2934		LIST_FOREACH(p, &pg->pg_members, p_pglist) {
2935			PROC_LOCK(p);
2936			if (p->p_state == PRS_NORMAL &&
2937			    p_cansched(td, p) == 0) {
2938				FOREACH_THREAD_IN_PROC(p, td1) {
2939					if ((error = rtp_to_pri(&rtp, td1)) != 0)
2940						break;
2941				}
2942			}
2943			PROC_UNLOCK(p);
2944			if (error != 0)
2945				break;
2946		}
2947		PGRP_UNLOCK(pg);
2948		break;
2949	case LINUX_IOPRIO_WHO_USER:
2950		if (args->who == 0)
2951			args->who = td->td_ucred->cr_uid;
2952		sx_slock(&allproc_lock);
2953		FOREACH_PROC_IN_SYSTEM(p) {
2954			PROC_LOCK(p);
2955			if (p->p_state == PRS_NORMAL &&
2956			    p->p_ucred->cr_uid == args->who &&
2957			    p_cansched(td, p) == 0) {
2958				FOREACH_THREAD_IN_PROC(p, td1) {
2959					if ((error = rtp_to_pri(&rtp, td1)) != 0)
2960						break;
2961				}
2962			}
2963			PROC_UNLOCK(p);
2964			if (error != 0)
2965				break;
2966		}
2967		sx_sunlock(&allproc_lock);
2968		break;
2969	default:
2970		error = EINVAL;
2971		break;
2972	}
2973	return (error);
2974}
2975
2976/* The only flag is O_NONBLOCK */
2977#define B2L_MQ_FLAGS(bflags)	((bflags) != 0 ? LINUX_O_NONBLOCK : 0)
2978#define L2B_MQ_FLAGS(lflags)	((lflags) != 0 ? O_NONBLOCK : 0)
2979
2980int
2981linux_mq_open(struct thread *td, struct linux_mq_open_args *args)
2982{
2983	struct mq_attr attr;
2984	int error, flags;
2985
2986	flags = linux_common_openflags(args->oflag);
2987	if ((flags & O_ACCMODE) == O_ACCMODE || (flags & O_EXEC) != 0)
2988		return (EINVAL);
2989	flags = FFLAGS(flags);
2990	if ((flags & O_CREAT) != 0 && args->attr != NULL) {
2991		error = copyin(args->attr, &attr, sizeof(attr));
2992		if (error != 0)
2993			return (error);
2994		attr.mq_flags = L2B_MQ_FLAGS(attr.mq_flags);
2995	}
2996
2997	return (kern_kmq_open(td, args->name, flags, args->mode,
2998	    args->attr != NULL ? &attr : NULL));
2999}
3000
3001int
3002linux_mq_unlink(struct thread *td, struct linux_mq_unlink_args *args)
3003{
3004	struct kmq_unlink_args bsd_args = {
3005		.path = PTRIN(args->name)
3006	};
3007
3008	return (sys_kmq_unlink(td, &bsd_args));
3009}
3010
3011int
3012linux_mq_timedsend(struct thread *td, struct linux_mq_timedsend_args *args)
3013{
3014	struct timespec ts, *abs_timeout;
3015	int error;
3016
3017	if (args->abs_timeout == NULL)
3018		abs_timeout = NULL;
3019	else {
3020		error = linux_get_timespec(&ts, args->abs_timeout);
3021		if (error != 0)
3022			return (error);
3023		abs_timeout = &ts;
3024	}
3025
3026	return (kern_kmq_timedsend(td, args->mqd, PTRIN(args->msg_ptr),
3027		args->msg_len, args->msg_prio, abs_timeout));
3028}
3029
3030int
3031linux_mq_timedreceive(struct thread *td, struct linux_mq_timedreceive_args *args)
3032{
3033	struct timespec ts, *abs_timeout;
3034	int error;
3035
3036	if (args->abs_timeout == NULL)
3037		abs_timeout = NULL;
3038	else {
3039		error = linux_get_timespec(&ts, args->abs_timeout);
3040		if (error != 0)
3041			return (error);
3042		abs_timeout = &ts;
3043	}
3044
3045	return (kern_kmq_timedreceive(td, args->mqd, PTRIN(args->msg_ptr),
3046		args->msg_len, args->msg_prio, abs_timeout));
3047}
3048
3049int
3050linux_mq_notify(struct thread *td, struct linux_mq_notify_args *args)
3051{
3052	struct sigevent ev, *evp;
3053	struct l_sigevent l_ev;
3054	int error;
3055
3056	if (args->sevp == NULL)
3057		evp = NULL;
3058	else {
3059		error = copyin(args->sevp, &l_ev, sizeof(l_ev));
3060		if (error != 0)
3061			return (error);
3062		error = linux_convert_l_sigevent(&l_ev, &ev);
3063		if (error != 0)
3064			return (error);
3065		evp = &ev;
3066	}
3067
3068	return (kern_kmq_notify(td, args->mqd, evp));
3069}
3070
3071int
3072linux_mq_getsetattr(struct thread *td, struct linux_mq_getsetattr_args *args)
3073{
3074	struct mq_attr attr, oattr;
3075	int error;
3076
3077	if (args->attr != NULL) {
3078		error = copyin(args->attr, &attr, sizeof(attr));
3079		if (error != 0)
3080			return (error);
3081		attr.mq_flags = L2B_MQ_FLAGS(attr.mq_flags);
3082	}
3083
3084	error = kern_kmq_setattr(td, args->mqd, args->attr != NULL ? &attr : NULL,
3085	    &oattr);
3086	if (error == 0 && args->oattr != NULL) {
3087		oattr.mq_flags = B2L_MQ_FLAGS(oattr.mq_flags);
3088		bzero(oattr.__reserved, sizeof(oattr.__reserved));
3089		error = copyout(&oattr, args->oattr, sizeof(oattr));
3090	}
3091
3092	return (error);
3093}
3094
3095MODULE_DEPEND(linux, mqueuefs, 1, 1, 1);
3096