1/*
2 * Minimal portability layer for system call differences between
3 * Capsicum OSes.
4 */
5#ifndef __SYSCALLS_H__
6#define __SYSCALLS_H__
7
8/************************************************************
9 * FreeBSD
10 ************************************************************/
11#ifdef __FreeBSD__
12
13/* Map umount2 (Linux) syscall to unmount (FreeBSD) syscall */
14#define umount2(T, F) unmount(T, F)
15
/* Map sighandler_t (Linux) to sig_t (FreeBSD) */
17#define sighandler_t sig_t
18
19/* profil(2) has a first argument of char* */
20#define profil_arg1_t char
21
22/* FreeBSD has getdents(2) available */
23#include <sys/types.h>
24#include <dirent.h>
25inline int getdents_(unsigned int fd, void *dirp, unsigned int count) {
26  return getdents(fd, (char*)dirp, count);
27}
28#include <sys/mman.h>
29inline int mincore_(void *addr, size_t length, unsigned char *vec) {
30  return mincore(addr, length, (char*)vec);
31}
/* Alias the getpid_ wrapper name directly to getpid. */
#define getpid_ getpid
33
34/* Map Linux-style sendfile to FreeBSD sendfile */
35#include <sys/socket.h>
36#include <sys/uio.h>
37inline ssize_t sendfile_(int out_fd, int in_fd, off_t *offset, size_t count) {
38  return sendfile(in_fd, out_fd, *offset, count, NULL, offset, 0);
39}
40
41/* A sample mount(2) call */
42#include <sys/param.h>
43#include <sys/mount.h>
44inline int bogus_mount_() {
45  return mount("procfs", "/not_mounted", 0, NULL);
46}
47
48/* Mappings for extended attribute functions */
49#include <sys/extattr.h>
50#include <errno.h>
/*
 * Strip a leading "user." from an attribute name.
 *
 * Returns a pointer to the first character after the prefix.  If the name
 * does not start with "user.", sets errno to EINVAL and returns NULL.
 * Comparison stops at the first mismatching character, so a name shorter
 * than the prefix is never read past its terminator.
 */
static const char *fbsd_extattr_skip_prefix(const char *p) {
  static const char user_ns[] = "user.";
  const char *expected = user_ns;
  while (*expected != '\0') {
    if (*p != *expected) {
      errno = EINVAL;
      return NULL;
    }
    ++p;
    ++expected;
  }
  return p;
}
57inline ssize_t flistxattr_(int fd, char *list, size_t size) {
58  return extattr_list_fd(fd, EXTATTR_NAMESPACE_USER, list, size);
59}
60inline ssize_t fgetxattr_(int fd, const char *name, void *value, size_t size) {
61  if (!(name = fbsd_extattr_skip_prefix(name)))
62    return -1;
63  return extattr_get_fd(fd, EXTATTR_NAMESPACE_USER, name, value, size);
64}
65inline int fsetxattr_(int fd, const char *name, const void *value, size_t size, int) {
66  if (!(name = fbsd_extattr_skip_prefix(name)))
67    return -1;
68  return extattr_set_fd(fd, EXTATTR_NAMESPACE_USER, name, value, size);
69}
70inline int fremovexattr_(int fd, const char *name) {
71  if (!(name = fbsd_extattr_skip_prefix(name)))
72    return -1;
73  return extattr_delete_fd(fd, EXTATTR_NAMESPACE_USER, name);
74}
75
76/* mq_* functions are wrappers in FreeBSD so go through to underlying syscalls */
77#include <sys/syscall.h>
/*
 * C-linkage prototypes for the kmq_* entry points that the mq_*_ wrapper
 * macros resolve to.  Parameter names are descriptive only.
 */
extern "C" {
int __sys_kmq_notify(int mqd, const struct sigevent *sigev);
int __sys_kmq_open(const char *path, int flags, mode_t mode,
                   const struct mq_attr *attr);
int __sys_kmq_setattr(int mqd,
                      const struct mq_attr *__restrict new_attr,
                      struct mq_attr *__restrict old_attr);
ssize_t __sys_kmq_timedreceive(int mqd,
                               char *__restrict msg,
                               size_t msg_len,
                               unsigned *__restrict msg_prio,
                               const struct timespec *__restrict abs_timeout);
int __sys_kmq_timedsend(int mqd,
                        const char *msg,
                        size_t msg_len,
                        unsigned msg_prio,
                        const struct timespec *abs_timeout);
int __sys_kmq_unlink(const char *path);
}
/* Point each mq_*_ wrapper name at the corresponding __sys_kmq_* entry point. */
#define mq_notify_ __sys_kmq_notify
#define mq_open_ __sys_kmq_open
#define mq_setattr_ __sys_kmq_setattr
/* getattr is expressed as setattr with a NULL new-attributes pointer and B as the output. */
#define mq_getattr_(A, B) __sys_kmq_setattr(A, NULL, B)
#define mq_timedreceive_ __sys_kmq_timedreceive
#define mq_timedsend_ __sys_kmq_timedsend
#define mq_unlink_ __sys_kmq_unlink
/* Closing a queue descriptor is mapped to plain close. */
#define mq_close_ close
96#include <sys/ptrace.h>
97inline long ptrace_(int request, pid_t pid, void *addr, void *data) {
98  return ptrace(request, pid, (caddr_t)addr, static_cast<int>((long)data));
99}
/* The PTRACE_PEEKDATA_ request name maps to PT_READ_D here. */
#define PTRACE_PEEKDATA_ PT_READ_D
/* The remaining wrapper names are plain aliases for the same-named functions. */
#define getegid_ getegid
#define getgid_ getgid
#define geteuid_ geteuid
#define getuid_ getuid
#define getgroups_ getgroups
#define getrlimit_ getrlimit
#define bind_ bind
#define connect_ connect
109
110/* Features available */
/* These three flags are only defined when __FreeBSD_version is at least 1000000. */
#if __FreeBSD_version >= 1000000
#define HAVE_CHFLAGSAT
#define HAVE_BINDAT
#define HAVE_CONNECTAT
#endif
/* The flags below are defined for every FreeBSD build, with no version check. */
#define HAVE_CHFLAGS
#define HAVE_GETFSSTAT
#define HAVE_REVOKE
#define HAVE_GETLOGIN
#define HAVE_MKFIFOAT
#define HAVE_SYSARCH
#include <machine/sysarch.h>
#define HAVE_STAT_BIRTHTIME
#define HAVE_SYSCTL
#define HAVE_FPATHCONF
#define HAVE_F_DUP2FD
#define HAVE_PSELECT
#define HAVE_SCTP
129
130/* FreeBSD only allows root to call mlock[all]/munlock[all] */
131#define MLOCK_REQUIRES_ROOT 1
132/* FreeBSD effectively only allows root to call sched_setscheduler */
133#define SCHED_SETSCHEDULER_REQUIRES_ROOT 1
134
135#endif  /* FreeBSD */
136
137/************************************************************
138 * Linux
139 ************************************************************/
140#ifdef __linux__
141#include <fcntl.h>
142#include <unistd.h>
143#include <sys/prctl.h>
144#include <sys/syscall.h>
145#include <sys/types.h>
146#include <sys/time.h>
147#include <sys/resource.h>
148#include <sys/wait.h>
149#include <sys/sendfile.h>
150#include <sys/statfs.h>
151#include <sys/xattr.h>
152#include <sys/mount.h>
153#include <linux/net.h>
154
155/* profil(2) has a first argument of unsigned short* */
156#define profil_arg1_t unsigned short
157
/* Read directory entries by invoking the getdents system call directly. */
static inline int getdents_(unsigned int fd, void *dirp, unsigned int count) {
  long rc = syscall(__NR_getdents, fd, dirp, count);
  return (int)rc;
}
161/* A sample mount(2) call */
/*
 * Issue a sample mount(2) request: source "/dev/bogus", target "/bogus",
 * filesystem type "debugfs", read-only, with an empty data string.
 */
static inline int bogus_mount_() {
  const char *source = "/dev/bogus";
  const char *target = "/bogus";
  const char *fs_type = "debugfs";
  return mount(source, target, fs_type, MS_RDONLY, "");
}
165
166/* libc's getpid() wrapper caches the pid value, and doesn't invalidate
167 * the cached value on pdfork(), so directly syscall. */
/* Ask the kernel for the process ID directly instead of going through libc. */
static inline pid_t getpid_() {
  long raw_pid = syscall(__NR_getpid);
  return (pid_t)raw_pid;
}
/*
 * execveat(2) wrapper that invokes the system call directly.
 *
 * glibc 2.34 and later declares its own non-static execveat() in <unistd.h>
 * when GNU extensions are enabled; defining a static function with the same
 * name after that declaration is a compile error.  The local definition is
 * therefore only provided when libc does not already declare one, and
 * callers otherwise pick up the libc function with the same signature.
 */
#if defined(__GLIBC__) && defined(__GLIBC_PREREQ) && defined(__USE_GNU)
#if __GLIBC_PREREQ(2, 34)
#define SYSCALLS_LIBC_DECLARES_EXECVEAT 1
#endif
#endif
#ifndef SYSCALLS_LIBC_DECLARES_EXECVEAT
static inline int execveat(int fd, const char *path,
                           char *const argv[], char *const envp[], int flags) {
  return syscall(__NR_execveat, fd, path, argv, envp, flags);
}
#endif
175
176/*
177 * Linux glibc includes an fexecve() function, implemented via the /proc
178 * filesystem.  Bypass this and go directly to the execveat(2) syscall.
179 */
static inline int fexecve_(int fd, char *const argv[], char *const envp[]) {
  /* Empty path plus AT_EMPTY_PATH: execute the file that fd itself refers to. */
  const char *no_path = "";
  return execveat(fd, no_path, argv, envp, AT_EMPTY_PATH);
}
183/*
184 * Linux glibc attempts to be clever and intercepts various uid/gid functions.
185 * Bypass by calling the syscalls directly.
186 */
/* Effective group ID, fetched with a direct system call. */
static inline gid_t getegid_(void) {
  long raw = syscall(__NR_getegid);
  return (gid_t)raw;
}
/* Real group ID, fetched with a direct system call. */
static inline gid_t getgid_(void) {
  long raw = syscall(__NR_getgid);
  return (gid_t)raw;
}
/* Effective user ID, fetched with a direct system call. */
static inline uid_t geteuid_(void) {
  long raw = syscall(__NR_geteuid);
  return (uid_t)raw;
}
/* Real user ID, fetched with a direct system call. */
static inline uid_t getuid_(void) {
  long raw = syscall(__NR_getuid);
  return (uid_t)raw;
}
/* Supplementary group list, fetched with a direct system call. */
static inline int getgroups_(int size, gid_t list[]) {
  long rc = syscall(__NR_getgroups, size, list);
  return (int)rc;
}
/* Resource limit lookup, performed with a direct system call. */
static inline int getrlimit_(int resource, struct rlimit *rlim) {
  long rc = syscall(__NR_getrlimit, resource, rlim);
  return (int)rc;
}
195
196/*
197 * Linux glibc for i386 consumes the errno returned from the raw socketcall(2) operation,
198 * so use the raw syscall for those operations that are disallowed in capability mode.
199 */
/*
 * bind_ : when there is no dedicated bind system call number, go through
 * socketcall with the SYS_BIND operation; otherwise alias libc bind.
 */
#ifndef __NR_bind
static inline int bind_(int sockfd, const struct sockaddr *addr, socklen_t addrlen) {
  unsigned long packed[3];
  packed[0] = (unsigned long)sockfd;
  packed[1] = (unsigned long)(intptr_t)addr;
  packed[2] = (unsigned long)addrlen;
  return syscall(__NR_socketcall, SYS_BIND, packed);
}
#else
#define bind_ bind
#endif
/*
 * connect_ : when there is no dedicated connect system call number, go
 * through socketcall with the SYS_CONNECT operation; otherwise alias libc
 * connect.
 */
#ifndef __NR_connect
static inline int connect_(int sockfd, const struct sockaddr *addr, socklen_t addrlen) {
  unsigned long packed[3];
  packed[0] = (unsigned long)sockfd;
  packed[1] = (unsigned long)(intptr_t)addr;
  packed[2] = (unsigned long)addrlen;
  return syscall(__NR_socketcall, SYS_CONNECT, packed);
}
#else
#define connect_ connect
#endif
216
/* On Linux each of these wrapper names is a plain alias for the same-named function. */
#define mincore_ mincore
#define sendfile_ sendfile
#define flistxattr_ flistxattr
#define fgetxattr_ fgetxattr
#define fsetxattr_ fsetxattr
#define fremovexattr_ fremovexattr
#define mq_notify_ mq_notify
#define mq_open_ mq_open
#define mq_setattr_ mq_setattr
#define mq_getattr_ mq_getattr
#define mq_timedreceive_ mq_timedreceive
#define mq_timedsend_ mq_timedsend
#define mq_unlink_ mq_unlink
#define mq_close_ mq_close
#define ptrace_ ptrace
/* The PTRACE_PEEKDATA_ request name maps to the native PTRACE_PEEKDATA. */
#define PTRACE_PEEKDATA_ PTRACE_PEEKDATA
233
234/* Features available */
/* All of the flags below are defined unconditionally for Linux builds. */
#define HAVE_DUP3
#define HAVE_PIPE2
#include <sys/fsuid.h>  /* for setfsgid()/setfsuid() */
#define HAVE_SETFSUID
#define HAVE_SETFSGID
#define HAVE_READAHEAD
#define HAVE_SEND_RECV_MMSG
#define HAVE_SYNCFS
#define HAVE_SYNC_FILE_RANGE
#include <sys/uio.h>  /* for vmsplice */
#define HAVE_TEE
#define HAVE_SPLICE
#define HAVE_VMSPLICE
#define HAVE_PSELECT
#define HAVE_PPOLL
#define HAVE_EXECVEAT
#define HAVE_SYSCALL
#define HAVE_MKNOD_REG
#define HAVE_MKNOD_SOCKET
254/*
255 * O_BENEATH is arch-specific, via <asm/fcntl.h>; however we cannot include both that file
256 * and the normal <fcntl.h> as they have some clashing definitions.  Bypass by directly
257 * defining O_BENEATH, using the current proposed x86 value.  (This will therefore not
258 * work for non-x86, and may need changing in future if a different value gets merged.)
259 */
260#ifndef O_BENEATH
261#define O_BENEATH	040000000	/* no / or .. in openat path */
262#endif
263
264
265/* Linux allows anyone to call mlock[all]/munlock[all] */
266#define MLOCK_REQUIRES_ROOT 0
/* Treat sched_setscheduler as requiring root on Linux: unprivileged callers
 * cannot select real-time policies unless RLIMIT_RTPRIO permits it */
268#define SCHED_SETSCHEDULER_REQUIRES_ROOT 1
269
270#endif  /* Linux */
271
272#endif /*__SYSCALLS_H__*/
273