1// Tests of Linux-specific functionality
2#ifdef __linux__
3
4#include <sys/types.h>
5#include <sys/stat.h>
6#include <sys/socket.h>
7#include <sys/timerfd.h>
8#include <sys/signalfd.h>
9#include <sys/eventfd.h>
10#include <sys/epoll.h>
11#include <sys/inotify.h>
12#include <sys/fanotify.h>
13#include <sys/mman.h>
14#include <sys/capability.h>  // Requires e.g. libcap-dev package for POSIX.1e capabilities headers
15#include <linux/aio_abi.h>
16#include <linux/filter.h>
17#include <linux/seccomp.h>
18#include <linux/version.h>
19#include <poll.h>
20#include <sched.h>
21#include <signal.h>
22#include <fcntl.h>
23#include <unistd.h>
24
25#include <string>
26
27#include "capsicum.h"
28#include "syscalls.h"
29#include "capsicum-test.h"
30
31TEST(Linux, TimerFD) {
32  int fd = timerfd_create(CLOCK_MONOTONIC, 0);
33
34  cap_rights_t r_ro;
35  cap_rights_init(&r_ro, CAP_READ);
36  cap_rights_t r_wo;
37  cap_rights_init(&r_wo, CAP_WRITE);
38  cap_rights_t r_rw;
39  cap_rights_init(&r_rw, CAP_READ, CAP_WRITE);
40  cap_rights_t r_rwpoll;
41  cap_rights_init(&r_rwpoll, CAP_READ, CAP_WRITE, CAP_EVENT);
42
43  int cap_fd_ro = dup(fd);
44  EXPECT_OK(cap_fd_ro);
45  EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_ro));
46  int cap_fd_wo = dup(fd);
47  EXPECT_OK(cap_fd_wo);
48  EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_wo));
49  int cap_fd_rw = dup(fd);
50  EXPECT_OK(cap_fd_rw);
51  EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rw));
52  int cap_fd_all = dup(fd);
53  EXPECT_OK(cap_fd_all);
54  EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rwpoll));
55
56  struct itimerspec old_ispec;
57  struct itimerspec ispec;
58  ispec.it_interval.tv_sec = 0;
59  ispec.it_interval.tv_nsec = 0;
60  ispec.it_value.tv_sec = 0;
61  ispec.it_value.tv_nsec = 100000000;  // 100ms
62  EXPECT_NOTCAPABLE(timerfd_settime(cap_fd_ro, 0, &ispec, NULL));
63  EXPECT_NOTCAPABLE(timerfd_settime(cap_fd_wo, 0, &ispec, &old_ispec));
64  EXPECT_OK(timerfd_settime(cap_fd_wo, 0, &ispec, NULL));
65  EXPECT_OK(timerfd_settime(cap_fd_rw, 0, &ispec, NULL));
66  EXPECT_OK(timerfd_settime(cap_fd_all, 0, &ispec, NULL));
67
68  EXPECT_NOTCAPABLE(timerfd_gettime(cap_fd_wo, &old_ispec));
69  EXPECT_OK(timerfd_gettime(cap_fd_ro, &old_ispec));
70  EXPECT_OK(timerfd_gettime(cap_fd_rw, &old_ispec));
71  EXPECT_OK(timerfd_gettime(cap_fd_all, &old_ispec));
72
73  // To be able to poll() for the timer pop, still need CAP_EVENT.
74  struct pollfd poll_fd;
75  for (int ii = 0; ii < 3; ii++) {
76    poll_fd.revents = 0;
77    poll_fd.events = POLLIN;
78    switch (ii) {
79    case 0: poll_fd.fd = cap_fd_ro; break;
80    case 1: poll_fd.fd = cap_fd_wo; break;
81    case 2: poll_fd.fd = cap_fd_rw; break;
82    }
83    // Poll immediately returns with POLLNVAL
84    EXPECT_OK(poll(&poll_fd, 1, 400));
85    EXPECT_EQ(0, (poll_fd.revents & POLLIN));
86    EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
87  }
88
89  poll_fd.fd = cap_fd_all;
90  EXPECT_OK(poll(&poll_fd, 1, 400));
91  EXPECT_NE(0, (poll_fd.revents & POLLIN));
92  EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
93
94  EXPECT_OK(timerfd_gettime(cap_fd_all, &old_ispec));
95  EXPECT_EQ(0, old_ispec.it_value.tv_sec);
96  EXPECT_EQ(0, old_ispec.it_value.tv_nsec);
97  EXPECT_EQ(0, old_ispec.it_interval.tv_sec);
98  EXPECT_EQ(0, old_ispec.it_interval.tv_nsec);
99
100  close(cap_fd_all);
101  close(cap_fd_rw);
102  close(cap_fd_wo);
103  close(cap_fd_ro);
104  close(fd);
105}
106
107FORK_TEST(Linux, SignalFDIfSingleThreaded) {
108  if (force_mt) {
109    GTEST_SKIP() << "multi-threaded run clashes with signals";
110  }
111  pid_t me = getpid();
112  sigset_t mask;
113  sigemptyset(&mask);
114  sigaddset(&mask, SIGUSR1);
115
116  // Block signals before registering against a new signal FD.
117  EXPECT_OK(sigprocmask(SIG_BLOCK, &mask, NULL));
118  int fd = signalfd(-1, &mask, 0);
119  EXPECT_OK(fd);
120
121  cap_rights_t r_rs;
122  cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
123  cap_rights_t r_ws;
124  cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
125  cap_rights_t r_sig;
126  cap_rights_init(&r_sig, CAP_FSIGNAL);
127  cap_rights_t r_rssig;
128  cap_rights_init(&r_rssig, CAP_FSIGNAL, CAP_READ, CAP_SEEK);
129  cap_rights_t r_rssig_poll;
130  cap_rights_init(&r_rssig_poll, CAP_FSIGNAL, CAP_READ, CAP_SEEK, CAP_EVENT);
131
132  // Various capability variants.
133  int cap_fd_none = dup(fd);
134  EXPECT_OK(cap_fd_none);
135  EXPECT_OK(cap_rights_limit(cap_fd_none, &r_ws));
136  int cap_fd_read = dup(fd);
137  EXPECT_OK(cap_fd_read);
138  EXPECT_OK(cap_rights_limit(cap_fd_read, &r_rs));
139  int cap_fd_sig = dup(fd);
140  EXPECT_OK(cap_fd_sig);
141  EXPECT_OK(cap_rights_limit(cap_fd_sig, &r_sig));
142  int cap_fd_sig_read = dup(fd);
143  EXPECT_OK(cap_fd_sig_read);
144  EXPECT_OK(cap_rights_limit(cap_fd_sig_read, &r_rssig));
145  int cap_fd_all = dup(fd);
146  EXPECT_OK(cap_fd_all);
147  EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rssig_poll));
148
149  struct signalfd_siginfo fdsi;
150
151  // Need CAP_READ to read the signal information
152  kill(me, SIGUSR1);
153  EXPECT_NOTCAPABLE(read(cap_fd_none, &fdsi, sizeof(struct signalfd_siginfo)));
154  EXPECT_NOTCAPABLE(read(cap_fd_sig, &fdsi, sizeof(struct signalfd_siginfo)));
155  int len = read(cap_fd_read, &fdsi, sizeof(struct signalfd_siginfo));
156  EXPECT_OK(len);
157  EXPECT_EQ(sizeof(struct signalfd_siginfo), (size_t)len);
158  EXPECT_EQ(SIGUSR1, (int)fdsi.ssi_signo);
159
160  // Need CAP_FSIGNAL to modify the signal mask.
161  sigemptyset(&mask);
162  sigaddset(&mask, SIGUSR1);
163  sigaddset(&mask, SIGUSR2);
164  EXPECT_OK(sigprocmask(SIG_BLOCK, &mask, NULL));
165  EXPECT_NOTCAPABLE(signalfd(cap_fd_none, &mask, 0));
166  EXPECT_NOTCAPABLE(signalfd(cap_fd_read, &mask, 0));
167  EXPECT_EQ(cap_fd_sig, signalfd(cap_fd_sig, &mask, 0));
168
169  // Need CAP_EVENT to get notification of a signal in poll(2).
170  kill(me, SIGUSR2);
171
172  struct pollfd poll_fd;
173  poll_fd.revents = 0;
174  poll_fd.events = POLLIN;
175  poll_fd.fd = cap_fd_sig_read;
176  EXPECT_OK(poll(&poll_fd, 1, 400));
177  EXPECT_EQ(0, (poll_fd.revents & POLLIN));
178  EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
179
180  poll_fd.fd = cap_fd_all;
181  EXPECT_OK(poll(&poll_fd, 1, 400));
182  EXPECT_NE(0, (poll_fd.revents & POLLIN));
183  EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
184}
185
186TEST(Linux, EventFD) {
187  int fd = eventfd(0, 0);
188  EXPECT_OK(fd);
189
190  cap_rights_t r_rs;
191  cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
192  cap_rights_t r_ws;
193  cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
194  cap_rights_t r_rws;
195  cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
196  cap_rights_t r_rwspoll;
197  cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT);
198
199  int cap_ro = dup(fd);
200  EXPECT_OK(cap_ro);
201  EXPECT_OK(cap_rights_limit(cap_ro, &r_rs));
202  int cap_wo = dup(fd);
203  EXPECT_OK(cap_wo);
204  EXPECT_OK(cap_rights_limit(cap_wo, &r_ws));
205  int cap_rw = dup(fd);
206  EXPECT_OK(cap_rw);
207  EXPECT_OK(cap_rights_limit(cap_rw, &r_rws));
208  int cap_all = dup(fd);
209  EXPECT_OK(cap_all);
210  EXPECT_OK(cap_rights_limit(cap_all, &r_rwspoll));
211
212  pid_t child = fork();
213  if (child == 0) {
214    // Child: write counter to eventfd
215    uint64_t u = 42;
216    EXPECT_NOTCAPABLE(write(cap_ro, &u, sizeof(u)));
217    EXPECT_OK(write(cap_wo, &u, sizeof(u)));
218    exit(HasFailure());
219  }
220
221  sleep(1);  // Allow child to write
222
223  struct pollfd poll_fd;
224  poll_fd.revents = 0;
225  poll_fd.events = POLLIN;
226  poll_fd.fd = cap_rw;
227  EXPECT_OK(poll(&poll_fd, 1, 400));
228  EXPECT_EQ(0, (poll_fd.revents & POLLIN));
229  EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
230
231  poll_fd.fd = cap_all;
232  EXPECT_OK(poll(&poll_fd, 1, 400));
233  EXPECT_NE(0, (poll_fd.revents & POLLIN));
234  EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
235
236  uint64_t u;
237  EXPECT_NOTCAPABLE(read(cap_wo, &u, sizeof(u)));
238  EXPECT_OK(read(cap_ro, &u, sizeof(u)));
239  EXPECT_EQ(42, (int)u);
240
241  // Wait for the child.
242  int status;
243  EXPECT_EQ(child, waitpid(child, &status, 0));
244  int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
245  EXPECT_EQ(0, rc);
246
247  close(cap_all);
248  close(cap_rw);
249  close(cap_wo);
250  close(cap_ro);
251  close(fd);
252}
253
254FORK_TEST(Linux, epoll) {
255  int sock_fds[2];
256  EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, sock_fds));
257  // Queue some data.
258  char buffer[4] = {1, 2, 3, 4};
259  EXPECT_OK(write(sock_fds[1], buffer, sizeof(buffer)));
260
261  EXPECT_OK(cap_enter());  // Enter capability mode.
262
263  int epoll_fd = epoll_create(1);
264  EXPECT_OK(epoll_fd);
265
266  cap_rights_t r_rs;
267  cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
268  cap_rights_t r_ws;
269  cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
270  cap_rights_t r_rws;
271  cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
272  cap_rights_t r_rwspoll;
273  cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT);
274  cap_rights_t r_epoll;
275  cap_rights_init(&r_epoll, CAP_EPOLL_CTL);
276
277  int cap_epoll_wo = dup(epoll_fd);
278  EXPECT_OK(cap_epoll_wo);
279  EXPECT_OK(cap_rights_limit(cap_epoll_wo, &r_ws));
280  int cap_epoll_ro = dup(epoll_fd);
281  EXPECT_OK(cap_epoll_ro);
282  EXPECT_OK(cap_rights_limit(cap_epoll_ro, &r_rs));
283  int cap_epoll_rw = dup(epoll_fd);
284  EXPECT_OK(cap_epoll_rw);
285  EXPECT_OK(cap_rights_limit(cap_epoll_rw, &r_rws));
286  int cap_epoll_poll = dup(epoll_fd);
287  EXPECT_OK(cap_epoll_poll);
288  EXPECT_OK(cap_rights_limit(cap_epoll_poll, &r_rwspoll));
289  int cap_epoll_ctl = dup(epoll_fd);
290  EXPECT_OK(cap_epoll_ctl);
291  EXPECT_OK(cap_rights_limit(cap_epoll_ctl, &r_epoll));
292
293  // Can only modify the FDs being monitored if the CAP_EPOLL_CTL right is present.
294  struct epoll_event eev;
295  memset(&eev, 0, sizeof(eev));
296  eev.events = EPOLLIN|EPOLLOUT|EPOLLPRI;
297  EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_ADD, sock_fds[0], &eev));
298  EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_ADD, sock_fds[0], &eev));
299  EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_ADD, sock_fds[0], &eev));
300  EXPECT_OK(epoll_ctl(cap_epoll_ctl, EPOLL_CTL_ADD, sock_fds[0], &eev));
301  eev.events = EPOLLIN|EPOLLOUT;
302  EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_MOD, sock_fds[0], &eev));
303  EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_MOD, sock_fds[0], &eev));
304  EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_MOD, sock_fds[0], &eev));
305  EXPECT_OK(epoll_ctl(cap_epoll_ctl, EPOLL_CTL_MOD, sock_fds[0], &eev));
306
307  // Running epoll_pwait(2) requires CAP_EVENT.
308  eev.events = 0;
309  EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_ro, &eev, 1, 100, NULL));
310  EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_wo, &eev, 1, 100, NULL));
311  EXPECT_NOTCAPABLE(epoll_pwait(cap_epoll_rw, &eev, 1, 100, NULL));
312  EXPECT_OK(epoll_pwait(cap_epoll_poll, &eev, 1, 100, NULL));
313  EXPECT_EQ(EPOLLIN, eev.events & EPOLLIN);
314
315  EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_ro, EPOLL_CTL_DEL, sock_fds[0], &eev));
316  EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_wo, EPOLL_CTL_DEL, sock_fds[0], &eev));
317  EXPECT_NOTCAPABLE(epoll_ctl(cap_epoll_rw, EPOLL_CTL_DEL, sock_fds[0], &eev));
318  EXPECT_OK(epoll_ctl(epoll_fd, EPOLL_CTL_DEL, sock_fds[0], &eev));
319
320  close(cap_epoll_ctl);
321  close(cap_epoll_poll);
322  close(cap_epoll_rw);
323  close(cap_epoll_ro);
324  close(cap_epoll_wo);
325  close(epoll_fd);
326  close(sock_fds[1]);
327  close(sock_fds[0]);
328}
329
330TEST(Linux, fstatat) {
331  int fd = open(TmpFile("cap_fstatat"), O_CREAT|O_RDWR, 0644);
332  EXPECT_OK(fd);
333  unsigned char buffer[] = {1, 2, 3, 4};
334  EXPECT_OK(write(fd, buffer, sizeof(buffer)));
335  cap_rights_t rights;
336  int cap_rf = dup(fd);
337  EXPECT_OK(cap_rf);
338  EXPECT_OK(cap_rights_limit(cap_rf, cap_rights_init(&rights, CAP_READ, CAP_FSTAT)));
339  int cap_ro = dup(fd);
340  EXPECT_OK(cap_ro);
341  EXPECT_OK(cap_rights_limit(cap_ro, cap_rights_init(&rights, CAP_READ)));
342
343  struct stat info;
344  EXPECT_OK(fstatat(fd, "", &info, AT_EMPTY_PATH));
345  EXPECT_NOTCAPABLE(fstatat(cap_ro, "", &info, AT_EMPTY_PATH));
346  EXPECT_OK(fstatat(cap_rf, "", &info, AT_EMPTY_PATH));
347
348  close(cap_ro);
349  close(cap_rf);
350  close(fd);
351
352  int dir = open(tmpdir.c_str(), O_RDONLY);
353  EXPECT_OK(dir);
354  int dir_rf = dup(dir);
355  EXPECT_OK(dir_rf);
356  EXPECT_OK(cap_rights_limit(dir_rf, cap_rights_init(&rights, CAP_READ, CAP_FSTAT)));
357  int dir_ro = dup(fd);
358  EXPECT_OK(dir_ro);
359  EXPECT_OK(cap_rights_limit(dir_ro, cap_rights_init(&rights, CAP_READ)));
360
361  EXPECT_OK(fstatat(dir, "cap_fstatat", &info, AT_EMPTY_PATH));
362  EXPECT_NOTCAPABLE(fstatat(dir_ro, "cap_fstatat", &info, AT_EMPTY_PATH));
363  EXPECT_OK(fstatat(dir_rf, "cap_fstatat", &info, AT_EMPTY_PATH));
364
365  close(dir_ro);
366  close(dir_rf);
367  close(dir);
368
369  unlink(TmpFile("cap_fstatat"));
370}
371
372// fanotify support may not be available at compile-time
373#ifdef __NR_fanotify_init
374TEST(Linux, FanotifyIfRoot) {
375  GTEST_SKIP_IF_NOT_ROOT();
376  int fa_fd = fanotify_init(FAN_CLASS_NOTIF, O_RDWR);
377  EXPECT_OK(fa_fd);
378  if (fa_fd < 0) return;  // May not be enabled
379
380  cap_rights_t r_rs;
381  cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
382  cap_rights_t r_ws;
383  cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
384  cap_rights_t r_rws;
385  cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
386  cap_rights_t r_rwspoll;
387  cap_rights_init(&r_rwspoll, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_EVENT);
388  cap_rights_t r_rwsnotify;
389  cap_rights_init(&r_rwsnotify, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_NOTIFY);
390  cap_rights_t r_rsl;
391  cap_rights_init(&r_rsl, CAP_READ, CAP_SEEK, CAP_LOOKUP);
392  cap_rights_t r_rslstat;
393  cap_rights_init(&r_rslstat, CAP_READ, CAP_SEEK, CAP_LOOKUP, CAP_FSTAT);
394  cap_rights_t r_rsstat;
395  cap_rights_init(&r_rsstat, CAP_READ, CAP_SEEK, CAP_FSTAT);
396
397  int cap_fd_ro = dup(fa_fd);
398  EXPECT_OK(cap_fd_ro);
399  EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_rs));
400  int cap_fd_wo = dup(fa_fd);
401  EXPECT_OK(cap_fd_wo);
402  EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_ws));
403  int cap_fd_rw = dup(fa_fd);
404  EXPECT_OK(cap_fd_rw);
405  EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rws));
406  int cap_fd_poll = dup(fa_fd);
407  EXPECT_OK(cap_fd_poll);
408  EXPECT_OK(cap_rights_limit(cap_fd_poll, &r_rwspoll));
409  int cap_fd_not = dup(fa_fd);
410  EXPECT_OK(cap_fd_not);
411  EXPECT_OK(cap_rights_limit(cap_fd_not, &r_rwsnotify));
412
413  int rc = mkdir(TmpFile("cap_notify"), 0755);
414  EXPECT_TRUE(rc == 0 || errno == EEXIST);
415  int dfd = open(TmpFile("cap_notify"), O_RDONLY);
416  EXPECT_OK(dfd);
417  int fd = open(TmpFile("cap_notify/file"), O_CREAT|O_RDWR, 0644);
418  close(fd);
419  int cap_dfd = dup(dfd);
420  EXPECT_OK(cap_dfd);
421  EXPECT_OK(cap_rights_limit(cap_dfd, &r_rslstat));
422  EXPECT_OK(cap_dfd);
423  int cap_dfd_rs = dup(dfd);
424  EXPECT_OK(cap_dfd_rs);
425  EXPECT_OK(cap_rights_limit(cap_dfd_rs, &r_rs));
426  EXPECT_OK(cap_dfd_rs);
427  int cap_dfd_rsstat = dup(dfd);
428  EXPECT_OK(cap_dfd_rsstat);
429  EXPECT_OK(cap_rights_limit(cap_dfd_rsstat, &r_rsstat));
430  EXPECT_OK(cap_dfd_rsstat);
431  int cap_dfd_rsl = dup(dfd);
432  EXPECT_OK(cap_dfd_rsl);
433  EXPECT_OK(cap_rights_limit(cap_dfd_rsl, &r_rsl));
434  EXPECT_OK(cap_dfd_rsl);
435
436  // Need CAP_NOTIFY to change what's monitored.
437  EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_ro, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
438  EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_wo, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
439  EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_rw, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
440  EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd, NULL));
441
442  // Need CAP_FSTAT on the thing monitored.
443  EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd_rs, NULL));
444  EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY|FAN_EVENT_ON_CHILD, cap_dfd_rsstat, NULL));
445
446  // Too add monitoring of a file under a dfd, need CAP_LOOKUP|CAP_FSTAT on the dfd.
447  EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd_rsstat, "file"));
448  EXPECT_NOTCAPABLE(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd_rsl, "file"));
449  EXPECT_OK(fanotify_mark(cap_fd_not, FAN_MARK_ADD, FAN_OPEN|FAN_MODIFY, cap_dfd, "file"));
450
451  pid_t child = fork();
452  if (child == 0) {
453    // Child: Perform activity in the directory under notify.
454    sleep(1);
455    unlink(TmpFile("cap_notify/temp"));
456    int fd = open(TmpFile("cap_notify/temp"), O_CREAT|O_RDWR, 0644);
457    close(fd);
458    exit(0);
459  }
460
461  // Need CAP_EVENT to poll.
462  struct pollfd poll_fd;
463  poll_fd.revents = 0;
464  poll_fd.events = POLLIN;
465  poll_fd.fd = cap_fd_rw;
466  EXPECT_OK(poll(&poll_fd, 1, 1400));
467  EXPECT_EQ(0, (poll_fd.revents & POLLIN));
468  EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
469
470  poll_fd.fd = cap_fd_not;
471  EXPECT_OK(poll(&poll_fd, 1, 1400));
472  EXPECT_EQ(0, (poll_fd.revents & POLLIN));
473  EXPECT_NE(0, (poll_fd.revents & POLLNVAL));
474
475  poll_fd.fd = cap_fd_poll;
476  EXPECT_OK(poll(&poll_fd, 1, 1400));
477  EXPECT_NE(0, (poll_fd.revents & POLLIN));
478  EXPECT_EQ(0, (poll_fd.revents & POLLNVAL));
479
480  // Need CAP_READ to read.
481  struct fanotify_event_metadata ev;
482  memset(&ev, 0, sizeof(ev));
483  EXPECT_NOTCAPABLE(read(cap_fd_wo, &ev, sizeof(ev)));
484  rc = read(fa_fd, &ev, sizeof(ev));
485  EXPECT_OK(rc);
486  EXPECT_EQ((int)sizeof(struct fanotify_event_metadata), rc);
487  EXPECT_EQ(child, ev.pid);
488  EXPECT_NE(0, ev.fd);
489
490  // TODO(drysdale): reinstate if/when capsicum-linux propagates rights
491  // to fanotify-generated FDs.
492#ifdef OMIT
493  // fanotify(7) gives us a FD for the changed file.  This should
494  // only have rights that are a subset of those for the original
495  // monitored directory file descriptor.
496  cap_rights_t rights;
497  CAP_SET_ALL(&rights);
498  EXPECT_OK(cap_rights_get(ev.fd, &rights));
499  EXPECT_RIGHTS_IN(&rights, &r_rslstat);
500#endif
501
502  // Wait for the child.
503  int status;
504  EXPECT_EQ(child, waitpid(child, &status, 0));
505  rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
506  EXPECT_EQ(0, rc);
507
508  close(cap_dfd_rsstat);
509  close(cap_dfd_rsl);
510  close(cap_dfd_rs);
511  close(cap_dfd);
512  close(dfd);
513  unlink(TmpFile("cap_notify/file"));
514  unlink(TmpFile("cap_notify/temp"));
515  rmdir(TmpFile("cap_notify"));
516  close(cap_fd_not);
517  close(cap_fd_poll);
518  close(cap_fd_rw);
519  close(cap_fd_wo);
520  close(cap_fd_ro);
521  close(fa_fd);
522}
523#endif
524
525TEST(Linux, inotify) {
526  int i_fd = inotify_init();
527  EXPECT_OK(i_fd);
528
529  cap_rights_t r_rs;
530  cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
531  cap_rights_t r_ws;
532  cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
533  cap_rights_t r_rws;
534  cap_rights_init(&r_rws, CAP_READ, CAP_WRITE, CAP_SEEK);
535  cap_rights_t r_rwsnotify;
536  cap_rights_init(&r_rwsnotify, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_NOTIFY);
537
538  int cap_fd_ro = dup(i_fd);
539  EXPECT_OK(cap_fd_ro);
540  EXPECT_OK(cap_rights_limit(cap_fd_ro, &r_rs));
541  int cap_fd_wo = dup(i_fd);
542  EXPECT_OK(cap_fd_wo);
543  EXPECT_OK(cap_rights_limit(cap_fd_wo, &r_ws));
544  int cap_fd_rw = dup(i_fd);
545  EXPECT_OK(cap_fd_rw);
546  EXPECT_OK(cap_rights_limit(cap_fd_rw, &r_rws));
547  int cap_fd_all = dup(i_fd);
548  EXPECT_OK(cap_fd_all);
549  EXPECT_OK(cap_rights_limit(cap_fd_all, &r_rwsnotify));
550
551  int fd = open(TmpFile("cap_inotify"), O_CREAT|O_RDWR, 0644);
552  EXPECT_NOTCAPABLE(inotify_add_watch(cap_fd_rw, TmpFile("cap_inotify"), IN_ACCESS|IN_MODIFY));
553  int wd = inotify_add_watch(i_fd, TmpFile("cap_inotify"), IN_ACCESS|IN_MODIFY);
554  EXPECT_OK(wd);
555
556  unsigned char buffer[] = {1, 2, 3, 4};
557  EXPECT_OK(write(fd, buffer, sizeof(buffer)));
558
559  struct inotify_event iev;
560  memset(&iev, 0, sizeof(iev));
561  EXPECT_NOTCAPABLE(read(cap_fd_wo, &iev, sizeof(iev)));
562  int rc = read(cap_fd_ro, &iev, sizeof(iev));
563  EXPECT_OK(rc);
564  EXPECT_EQ((int)sizeof(iev), rc);
565  EXPECT_EQ(wd, iev.wd);
566
567  EXPECT_NOTCAPABLE(inotify_rm_watch(cap_fd_wo, wd));
568  EXPECT_OK(inotify_rm_watch(cap_fd_all, wd));
569
570  close(fd);
571  close(cap_fd_all);
572  close(cap_fd_rw);
573  close(cap_fd_wo);
574  close(cap_fd_ro);
575  close(i_fd);
576  unlink(TmpFile("cap_inotify"));
577}
578
579TEST(Linux, ArchChangeIfAvailable) {
580  const char* prog_candidates[] = {"./mini-me.32", "./mini-me.x32", "./mini-me.64"};
581  const char* progs[] = {NULL, NULL, NULL};
582  char* argv_pass[] = {(char*)"to-come", (char*)"--capmode", NULL};
583  char* null_envp[] = {NULL};
584  int fds[3];
585  int count = 0;
586
587  for (int ii = 0; ii < 3; ii++) {
588    fds[count] = open(prog_candidates[ii], O_RDONLY);
589    if (fds[count] >= 0) {
590      progs[count] = prog_candidates[ii];
591      count++;
592    }
593  }
594  if (count == 0) {
595    GTEST_SKIP() << "no different-architecture programs available";
596  }
597
598  for (int ii = 0; ii < count; ii++) {
599    // Fork-and-exec a binary of this architecture.
600    pid_t child = fork();
601    if (child == 0) {
602      EXPECT_OK(cap_enter());  // Enter capability mode
603      if (verbose) fprintf(stderr, "[%d] call fexecve(%s, %s)\n",
604                           getpid_(), progs[ii], argv_pass[1]);
605      argv_pass[0] = (char *)progs[ii];
606      int rc = fexecve_(fds[ii], argv_pass, null_envp);
607      fprintf(stderr, "fexecve(%s) returned %d errno %d\n", progs[ii], rc, errno);
608      exit(99);  // Should not reach here.
609    }
610    int status;
611    EXPECT_EQ(child, waitpid(child, &status, 0));
612    int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
613    EXPECT_EQ(0, rc);
614    close(fds[ii]);
615  }
616}
617
618FORK_TEST(Linux, NamespaceIfRoot) {
619  GTEST_SKIP_IF_NOT_ROOT();
620  pid_t me = getpid_();
621
622  // Create a new UTS namespace.
623  EXPECT_OK(unshare(CLONE_NEWUTS));
624  // Open an FD to its symlink.
625  char buffer[256];
626  sprintf(buffer, "/proc/%d/ns/uts", me);
627  int ns_fd = open(buffer, O_RDONLY);
628
629  cap_rights_t r_rwlstat;
630  cap_rights_init(&r_rwlstat, CAP_READ, CAP_WRITE, CAP_LOOKUP, CAP_FSTAT);
631  cap_rights_t r_rwlstatns;
632  cap_rights_init(&r_rwlstatns, CAP_READ, CAP_WRITE, CAP_LOOKUP, CAP_FSTAT, CAP_SETNS);
633
634  int cap_fd = dup(ns_fd);
635  EXPECT_OK(cap_fd);
636  EXPECT_OK(cap_rights_limit(cap_fd, &r_rwlstat));
637  int cap_fd_setns = dup(ns_fd);
638  EXPECT_OK(cap_fd_setns);
639  EXPECT_OK(cap_rights_limit(cap_fd_setns, &r_rwlstatns));
640  EXPECT_NOTCAPABLE(setns(cap_fd, CLONE_NEWUTS));
641  EXPECT_OK(setns(cap_fd_setns, CLONE_NEWUTS));
642
643  EXPECT_OK(cap_enter());  // Enter capability mode.
644
645  // No setns(2) but unshare(2) is allowed.
646  EXPECT_CAPMODE(setns(ns_fd, CLONE_NEWUTS));
647  EXPECT_OK(unshare(CLONE_NEWUTS));
648}
649
650static void SendFD(int fd, int over) {
651  struct msghdr mh;
652  mh.msg_name = NULL;  // No address needed
653  mh.msg_namelen = 0;
654  char buffer1[1024];
655  struct iovec iov[1];
656  iov[0].iov_base = buffer1;
657  iov[0].iov_len = sizeof(buffer1);
658  mh.msg_iov = iov;
659  mh.msg_iovlen = 1;
660  char buffer2[1024];
661  mh.msg_control = buffer2;
662  mh.msg_controllen = CMSG_LEN(sizeof(int));
663  struct cmsghdr *cmptr = CMSG_FIRSTHDR(&mh);
664  cmptr->cmsg_level = SOL_SOCKET;
665  cmptr->cmsg_type = SCM_RIGHTS;
666  cmptr->cmsg_len = CMSG_LEN(sizeof(int));
667  *(int *)CMSG_DATA(cmptr) = fd;
668  buffer1[0] = 0;
669  iov[0].iov_len = 1;
670  int rc = sendmsg(over, &mh, 0);
671  EXPECT_OK(rc);
672}
673
674static int ReceiveFD(int over) {
675  struct msghdr mh;
676  mh.msg_name = NULL;  // No address needed
677  mh.msg_namelen = 0;
678  char buffer1[1024];
679  struct iovec iov[1];
680  iov[0].iov_base = buffer1;
681  iov[0].iov_len = sizeof(buffer1);
682  mh.msg_iov = iov;
683  mh.msg_iovlen = 1;
684  char buffer2[1024];
685  mh.msg_control = buffer2;
686  mh.msg_controllen = sizeof(buffer2);
687  int rc = recvmsg(over, &mh, 0);
688  EXPECT_OK(rc);
689  EXPECT_LE(CMSG_LEN(sizeof(int)), mh.msg_controllen);
690  struct cmsghdr *cmptr = CMSG_FIRSTHDR(&mh);
691  int fd = *(int*)CMSG_DATA(cmptr);
692  EXPECT_EQ(CMSG_LEN(sizeof(int)), cmptr->cmsg_len);
693  cmptr = CMSG_NXTHDR(&mh, cmptr);
694  EXPECT_TRUE(cmptr == NULL);
695  return fd;
696}
697
// State shared between the PID-namespace tests and the functions they clone().
// shared_pd: process descriptor filled in by pdfork() in PidNamespacePdForkIfRoot
// and used by ChildFunc.
static int shared_pd = -1;
// shared_sock_fds: socketpair created by the tests; the tests use [0] and the
// cloned functions (ChildFunc, NSInit) use [1].
static int shared_sock_fds[2];
700
// clone() entry point used by PidNamespacePdForkIfRoot.  It checks its own
// pid/ppid, signals the process behind shared_pd, pdfork()s a child of its
// own, sends that child's process descriptor over shared_sock_fds[1] and then
// waits for the child to die.  |arg| is unused.  Always returns 0.
static int ChildFunc(void *arg) {
  // This function is running in a new PID namespace, and so is pid 1.
  if (verbose) fprintf(stderr, "    ChildFunc: pid=%d, ppid=%d\n", getpid_(), getppid());
  EXPECT_EQ(1, getpid_());
  EXPECT_EQ(0, getppid());

  // The shared process descriptor is outside our namespace, so we cannot
  // get its pid.
  if (verbose) fprintf(stderr, "    ChildFunc: shared_pd=%d\n", shared_pd);
  pid_t shared_child = -1;
  EXPECT_OK(pdgetpid(shared_pd, &shared_child));
  if (verbose) fprintf(stderr, "    ChildFunc: corresponding pid=%d\n", shared_child);
  EXPECT_EQ(0, shared_child);

  // But we can pdkill() it even so.
  if (verbose) fprintf(stderr, "    ChildFunc: call pdkill(pd=%d)\n", shared_pd);
  EXPECT_OK(pdkill(shared_pd, SIGINT));

  int pd;
  pid_t child = pdfork(&pd, 0);
  EXPECT_OK(child);
  if (child == 0) {
    // Child: expect pid 2.
    if (verbose) fprintf(stderr, "      child of ChildFunc: pid=%d, ppid=%d\n", getpid_(), getppid());
    EXPECT_EQ(2, getpid_());
    EXPECT_EQ(1, getppid());
    // Loop until killed from outside; the exit(0) below is never reached.
    while (true) {
      if (verbose) fprintf(stderr, "      child of ChildFunc: \"I aten't dead\"\n");
      sleep(1);
    }
    exit(0);
  }
  EXPECT_EQ(2, child);
  EXPECT_PID_ALIVE(child);
  if (verbose) fprintf(stderr, "    ChildFunc: pdfork() -> pd=%d, corresponding pid=%d state='%c'\n",
                       pd, child, ProcessState(child));

  // The process descriptor should map back to the pid that pdfork() returned.
  pid_t pid;
  EXPECT_OK(pdgetpid(pd, &pid));
  EXPECT_EQ(child, pid);

  sleep(2);

  // Send the process descriptor over UNIX domain socket back to parent.
  SendFD(pd, shared_sock_fds[1]);

  // Wait for death of (grand)child, killed by our parent.
  if (verbose) fprintf(stderr, "    ChildFunc: wait on pid=%d\n", child);
  int status;
  EXPECT_EQ(child, wait4(child, &status, __WALL, NULL));

  if (verbose) fprintf(stderr, "    ChildFunc: return 0\n");
  return 0;
}
755
// Stack buffer for the clone()d functions in the tests below; callers pass
// child_stack + STACK_SIZE (the end of the buffer) as clone()'s stack argument.
#define STACK_SIZE (1024 * 1024)
static char child_stack[STACK_SIZE];
758
759// TODO(drysdale): fork into a user namespace first so GTEST_SKIP_IF_NOT_ROOT can be removed.
// Root-only test that process descriptors work across a PID namespace
// boundary in both directions: ChildFunc (cloned into a new PID namespace)
// kills a process via a descriptor created out here, and this test kills
// ChildFunc's own child via a descriptor received over a socket.
TEST(Linux, PidNamespacePdForkIfRoot) {
  GTEST_SKIP_IF_NOT_ROOT();
  // Pass process descriptors in both directions across a PID namespace boundary.
  // pdfork() off a child before we start, holding its process descriptor in a global
  // variable that's accessible to children.
  pid_t firstborn = pdfork(&shared_pd, 0);
  EXPECT_OK(firstborn);
  if (firstborn == 0) {
    // Loop until killed from outside; the exit(0) below is never reached.
    while (true) {
      if (verbose) fprintf(stderr, "  Firstborn: \"I aten't dead\"\n");
      sleep(1);
    }
    exit(0);
  }
  EXPECT_PID_ALIVE(firstborn);
  if (verbose) fprintf(stderr, "Parent: pre-pdfork()ed pd=%d, pid=%d state='%c'\n",
                       shared_pd, firstborn, ProcessState(firstborn));
  sleep(2);

  // Prepare sockets to communicate with child process.
  EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds));

  // Clone into a child process with a new pid namespace.
  // CLONE_FILES is passed, so the child works with the same descriptor numbers
  // (shared_pd, shared_sock_fds).
  pid_t child = clone(ChildFunc, child_stack + STACK_SIZE,
                      CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL);
  EXPECT_OK(child);
  EXPECT_PID_ALIVE(child);
  if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child));

  // Ensure the child runs.  First thing it does is to kill our firstborn, using shared_pd.
  sleep(1);
  EXPECT_PID_DEAD(firstborn);

  // But we can still retrieve firstborn's PID, as it's not been reaped yet.
  pid_t child0;
  EXPECT_OK(pdgetpid(shared_pd, &child0));
  EXPECT_EQ(firstborn, child0);
  if (verbose) fprintf(stderr, "Parent: check on firstborn: pdgetpid(pd=%d) -> child=%d state='%c'\n",
                       shared_pd, child0, ProcessState(child0));

  // Now reap it.
  int status;
  EXPECT_EQ(firstborn, waitpid(firstborn, &status, __WALL));

  // Get the process descriptor of the child-of-child via socket transfer.
  int grandchild_pd = ReceiveFD(shared_sock_fds[0]);

  // Our notion of the pid associated with the grandchild is in the main PID namespace.
  pid_t grandchild;
  EXPECT_OK(pdgetpid(grandchild_pd, &grandchild));
  EXPECT_NE(2, grandchild);
  if (verbose) fprintf(stderr, "Parent: pre-pdkill:  pdgetpid(grandchild_pd=%d) -> grandchild=%d state='%c'\n",
                       grandchild_pd, grandchild, ProcessState(grandchild));
  EXPECT_PID_ALIVE(grandchild);

  // Kill the grandchild via the process descriptor.
  EXPECT_OK(pdkill(grandchild_pd, SIGINT));
  usleep(10000);
  if (verbose) fprintf(stderr, "Parent: post-pdkill: pdgetpid(grandchild_pd=%d) -> grandchild=%d state='%c'\n",
                       grandchild_pd, grandchild, ProcessState(grandchild));
  EXPECT_PID_DEAD(grandchild);

  sleep(2);

  // Wait for the child.
  // WNOHANG: the child is expected to have exited already during the sleep above.
  EXPECT_EQ(child, waitpid(child, &status, WNOHANG));
  int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
  EXPECT_EQ(0, rc);

  close(shared_sock_fds[0]);
  close(shared_sock_fds[1]);
  close(shared_pd);
  close(grandchild_pd);
}
834
// clone() entry point used by the DeadNSInit tests.  It pdfork()s a child
// that loops forever, sends the child's process descriptor over
// shared_sock_fds[1], then blocks reading from that socket and returns 0
// once the read completes.  |data| is unused.
int NSInit(void *data) {
  // This function is running in a new PID namespace, and so is pid 1.
  if (verbose) fprintf(stderr, "  NSInit: pid=%d, ppid=%d\n", getpid_(), getppid());
  EXPECT_EQ(1, getpid_());
  EXPECT_EQ(0, getppid());

  int pd;
  pid_t child = pdfork(&pd, 0);
  EXPECT_OK(child);
  if (child == 0) {
    // Child: loop forever until terminated.
    if (verbose) fprintf(stderr, "    child of NSInit: pid=%d, ppid=%d\n", getpid_(), getppid());
    while (true) {
      if (verbose) fprintf(stderr, "    child of NSInit: \"I aten't dead\"\n");
      usleep(100000);
    }
    exit(0);
  }
  EXPECT_EQ(2, child);
  EXPECT_PID_ALIVE(child);
  if (verbose) fprintf(stderr, "  NSInit: pdfork() -> pd=%d, corresponding pid=%d state='%c'\n",
                       pd, child, ProcessState(child));
  sleep(1);

  // Send the process descriptor over UNIX domain socket back to parent.
  SendFD(pd, shared_sock_fds[1]);
  close(pd);

  // Wait for a byte back in the other direction.
  // The value read (and read()'s return code) is not inspected; the read only
  // serves to block until the other end of the socket sends something.
  int value;
  if (verbose) fprintf(stderr, "  NSInit: block waiting for value\n");
  read(shared_sock_fds[1], &value, sizeof(value));

  if (verbose) fprintf(stderr, "  NSInit: return 0\n");
  return 0;
}
871
// Root-only test: clone NSInit into a new PID namespace, receive the process
// descriptor of its child, then let NSInit terminate and check that the
// grandchild goes away along with it.
TEST(Linux, DeadNSInitIfRoot) {
  GTEST_SKIP_IF_NOT_ROOT();

  // Prepare sockets to communicate with child process.
  EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds));

  // Clone into a child process with a new pid namespace.
  pid_t child = clone(NSInit, child_stack + STACK_SIZE,
                      CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL);
  usleep(10000);
  EXPECT_OK(child);
  EXPECT_PID_ALIVE(child);
  if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child));

  // Get the process descriptor of the child-of-child via socket transfer.
  int grandchild_pd = ReceiveFD(shared_sock_fds[0]);
  pid_t grandchild;
  EXPECT_OK(pdgetpid(grandchild_pd, &grandchild));
  if (verbose) fprintf(stderr, "Parent: grandchild is %d state='%c'\n", grandchild, ProcessState(grandchild));

  // Send an int to the child to trigger its termination.  Grandchild should also
  // go, as its init process is gone.
  int zero = 0;
  if (verbose) fprintf(stderr, "Parent: write 0 to pipe\n");
  // The return value of write() is not checked here.
  write(shared_sock_fds[0], &zero, sizeof(zero));
  // NOTE(review): these macros are defined elsewhere; presumably they allow
  // some time for the process to reach the expected state -- confirm.
  EXPECT_PID_ZOMBIE(child);
  EXPECT_PID_GONE(grandchild);

  // Wait for the child.
  // WNOHANG: the child is expected to have exited already at this point.
  int status;
  EXPECT_EQ(child, waitpid(child, &status, WNOHANG));
  int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
  EXPECT_EQ(0, rc);
  EXPECT_PID_GONE(child);

  close(shared_sock_fds[0]);
  close(shared_sock_fds[1]);
  close(grandchild_pd);

  if (verbose) {
    fprintf(stderr, "Parent: child %d in state='%c'\n", child, ProcessState(child));
    fprintf(stderr, "Parent: grandchild %d in state='%c'\n", grandchild, ProcessState(grandchild));
  }
}
916
// Variant of DeadNSInitIfRoot: the grandchild is first killed through its
// process descriptor and that descriptor is closed, and only then is the
// namespace's init process (NSInit) told to exit.  Skipped unless running as
// root.
TEST(Linux, DeadNSInit2IfRoot) {
  GTEST_SKIP_IF_NOT_ROOT();

  // Prepare sockets to communicate with child process.
  EXPECT_OK(socketpair(AF_UNIX, SOCK_STREAM, 0, shared_sock_fds));

  // Clone into a child process with a new pid namespace.
  pid_t child = clone(NSInit, child_stack + STACK_SIZE,
                      CLONE_FILES|CLONE_NEWPID|SIGCHLD, NULL);
  // NOTE(review): fixed delay, presumably to let the child start running
  // before its state is inspected.
  usleep(10000);
  EXPECT_OK(child);
  EXPECT_PID_ALIVE(child);
  if (verbose) fprintf(stderr, "Parent: child is %d state='%c'\n", child, ProcessState(child));

  // Get the process descriptor of the child-of-child via socket transfer.
  int grandchild_pd = ReceiveFD(shared_sock_fds[0]);
  pid_t grandchild;
  EXPECT_OK(pdgetpid(grandchild_pd, &grandchild));
  if (verbose) fprintf(stderr, "Parent: grandchild is %d state='%c'\n", grandchild, ProcessState(grandchild));

  // Kill the grandchild
  EXPECT_OK(pdkill(grandchild_pd, SIGINT));
  // Allow a moment for the signal to take effect before checking the state.
  usleep(10000);
  EXPECT_PID_ZOMBIE(grandchild);
  // Close the process descriptor, so there are now no procdesc references to grandchild.
  close(grandchild_pd);

  // Send an int to the child to trigger its termination.  Grandchild should also
  // go, as its init process is gone.
  int zero = 0;
  if (verbose) fprintf(stderr, "Parent: write 0 to pipe\n");
  write(shared_sock_fds[0], &zero, sizeof(zero));
  // The child has not been waited for yet, so it should show up as a zombie.
  EXPECT_PID_ZOMBIE(child);
  EXPECT_PID_GONE(grandchild);

  // Wait for the child.
  // WNOHANG: the child is expected to have exited already, so this should
  // reap it immediately rather than block.
  int status;
  EXPECT_EQ(child, waitpid(child, &status, WNOHANG));
  int rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
  EXPECT_EQ(0, rc);

  close(shared_sock_fds[0]);
  close(shared_sock_fds[1]);

  if (verbose) {
    fprintf(stderr, "Parent: child %d in state='%c'\n", child, ProcessState(child));
    fprintf(stderr, "Parent: grandchild %d in state='%c'\n", grandchild, ProcessState(grandchild));
  }
}
966
967#ifdef __x86_64__
968FORK_TEST(Linux, CheckHighWord) {
969  EXPECT_OK(cap_enter());  // Enter capability mode.
970
971  int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
972  EXPECT_OK(rc);
973  EXPECT_EQ(1, rc);  // no_new_privs = 1
974
975  // Set some of the high 32-bits of argument zero.
976  uint64_t big_cmd = PR_GET_NO_NEW_PRIVS | 0x100000000LL;
977  EXPECT_CAPMODE(syscall(__NR_prctl, big_cmd, 0, 0, 0, 0));
978}
979#endif
980
981FORK_TEST(Linux, PrctlOpenatBeneath) {
982  // Set no_new_privs = 1
983  EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
984  int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
985  EXPECT_OK(rc);
986  EXPECT_EQ(1, rc);  // no_new_privs = 1
987
988  // Set openat-beneath mode
989  EXPECT_OK(prctl(PR_SET_OPENAT_BENEATH, 1, 0, 0, 0));
990  rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
991  EXPECT_OK(rc);
992  EXPECT_EQ(1, rc);  // openat_beneath = 1
993
994  // Clear openat-beneath mode
995  EXPECT_OK(prctl(PR_SET_OPENAT_BENEATH, 0, 0, 0, 0));
996  rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
997  EXPECT_OK(rc);
998  EXPECT_EQ(0, rc);  // openat_beneath = 0
999
1000  EXPECT_OK(cap_enter());  // Enter capability mode
1001
1002  // Expect to be in openat_beneath mode
1003  rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
1004  EXPECT_OK(rc);
1005  EXPECT_EQ(1, rc);  // openat_beneath = 1
1006
1007  // Expect this to be immutable.
1008  EXPECT_CAPMODE(prctl(PR_SET_OPENAT_BENEATH, 0, 0, 0, 0));
1009  rc = prctl(PR_GET_OPENAT_BENEATH, 0, 0, 0, 0);
1010  EXPECT_OK(rc);
1011  EXPECT_EQ(1, rc);  // openat_beneath = 1
1012
1013}
1014
1015FORK_TEST(Linux, NoNewPrivs) {
1016  if (getuid() == 0) {
1017    // If root, drop CAP_SYS_ADMIN POSIX.1e capability.
1018    struct __user_cap_header_struct hdr;
1019    hdr.version = _LINUX_CAPABILITY_VERSION_3;
1020    hdr.pid = getpid_();
1021    struct __user_cap_data_struct data[3];
1022    EXPECT_OK(capget(&hdr, &data[0]));
1023    data[0].effective &= ~(1 << CAP_SYS_ADMIN);
1024    data[0].permitted &= ~(1 << CAP_SYS_ADMIN);
1025    data[0].inheritable &= ~(1 << CAP_SYS_ADMIN);
1026    EXPECT_OK(capset(&hdr, &data[0]));
1027  }
1028  int rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
1029  EXPECT_OK(rc);
1030  EXPECT_EQ(0, rc);  // no_new_privs == 0
1031
1032  // Can't enter seccomp-bpf mode with no_new_privs == 0
1033  struct sock_filter filter[] = {
1034    BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
1035  };
1036  struct sock_fprog bpf;
1037  bpf.len = (sizeof(filter) / sizeof(filter[0]));
1038  bpf.filter = filter;
1039  rc = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0);
1040  EXPECT_EQ(-1, rc);
1041  EXPECT_EQ(EACCES, errno);
1042
1043  // Set no_new_privs = 1
1044  EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
1045  rc = prctl(PR_GET_NO_NEW_PRIVS, 0, 0, 0, 0);
1046  EXPECT_OK(rc);
1047  EXPECT_EQ(1, rc);  // no_new_privs = 1
1048
1049  // Can now turn on seccomp mode
1050  EXPECT_OK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0));
1051}
1052
/*
 * Macros for BPF generation.
 *
 * Each macro expands to one or more `struct sock_filter` initializers (built
 * with BPF_STMT / BPF_JUMP) for use inside a filter array.  The *_SYSCALL
 * macros compare against the value loaded by EXAMINE_SYSCALL, so they must
 * come after it in the array.
 */
/* Return: fail the syscall with errno `err`.  Only the low 16 bits are kept;
 * the argument is parenthesized so that expressions such as `a | b` are
 * masked as a whole. */
#define BPF_RETURN_ERRNO(err) \
  BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ERRNO | ((err) & 0xFFFF))
/* Return: SECCOMP_RET_KILL. */
#define BPF_KILL_PROCESS \
  BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
/* Return: let the syscall proceed. */
#define BPF_ALLOW \
  BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
/* Load the syscall number from struct seccomp_data. */
#define EXAMINE_SYSCALL \
  BPF_STMT(BPF_LD+BPF_W+BPF_ABS, offsetof(struct seccomp_data, nr))
/* The three macros below share one shape: if the loaded number equals
 * __NR_<name>, continue with the return instruction that follows (jump-true
 * offset 0); otherwise skip over it (jump-false offset 1). */
#define ALLOW_SYSCALL(name) \
  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
  BPF_ALLOW
#define KILL_SYSCALL(name) \
  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
  BPF_KILL_PROCESS
#define FAIL_SYSCALL(name, err) \
  BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, __NR_##name, 0, 1), \
  BPF_RETURN_ERRNO(err)
1071
1072TEST(Linux, CapModeWithBPF) {
1073  pid_t child = fork();
1074  EXPECT_OK(child);
1075  if (child == 0) {
1076    int fd = open(TmpFile("cap_bpf_capmode"), O_CREAT|O_RDWR, 0644);
1077    cap_rights_t rights;
1078    cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSYNC);
1079    EXPECT_OK(cap_rights_limit(fd, &rights));
1080
1081    struct sock_filter filter[] = { EXAMINE_SYSCALL,
1082                                    FAIL_SYSCALL(fchmod, ENOMEM),
1083                                    FAIL_SYSCALL(fstat, ENOEXEC),
1084                                    ALLOW_SYSCALL(close),
1085                                    KILL_SYSCALL(fsync),
1086                                    BPF_ALLOW };
1087    struct sock_fprog bpf = {.len = (sizeof(filter) / sizeof(filter[0])),
1088                             .filter = filter};
1089    // Set up seccomp-bpf first.
1090    EXPECT_OK(prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0));
1091    EXPECT_OK(prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &bpf, 0, 0));
1092
1093    EXPECT_OK(cap_enter());  // Enter capability mode.
1094
1095    // fchmod is allowed by Capsicum, but failed by BPF.
1096    EXPECT_SYSCALL_FAIL(ENOMEM, fchmod(fd, 0644));
1097    // open is allowed by BPF, but failed by Capsicum
1098    EXPECT_SYSCALL_FAIL(ECAPMODE, open(TmpFile("cap_bpf_capmode"), O_RDONLY));
1099    // fstat is failed by both BPF and Capsicum; tie-break is on errno
1100    struct stat buf;
1101    EXPECT_SYSCALL_FAIL(ENOEXEC, fstat(fd, &buf));
1102    // fsync is allowed by Capsicum, but BPF's SIGSYS generation take precedence
1103    fsync(fd);  // terminate with unhandled SIGSYS
1104    exit(0);
1105  }
1106  int status;
1107  EXPECT_EQ(child, waitpid(child, &status, 0));
1108  EXPECT_TRUE(WIFSIGNALED(status));
1109  EXPECT_EQ(SIGSYS, WTERMSIG(status));
1110  unlink(TmpFile("cap_bpf_capmode"));
1111}
1112
1113TEST(Linux, AIO) {
1114  int fd = open(TmpFile("cap_aio"), O_CREAT|O_RDWR, 0644);
1115  EXPECT_OK(fd);
1116
1117  cap_rights_t r_rs;
1118  cap_rights_init(&r_rs, CAP_READ, CAP_SEEK);
1119  cap_rights_t r_ws;
1120  cap_rights_init(&r_ws, CAP_WRITE, CAP_SEEK);
1121  cap_rights_t r_rwssync;
1122  cap_rights_init(&r_rwssync, CAP_READ, CAP_WRITE, CAP_SEEK, CAP_FSYNC);
1123
1124  int cap_ro = dup(fd);
1125  EXPECT_OK(cap_ro);
1126  EXPECT_OK(cap_rights_limit(cap_ro, &r_rs));
1127  EXPECT_OK(cap_ro);
1128  int cap_wo = dup(fd);
1129  EXPECT_OK(cap_wo);
1130  EXPECT_OK(cap_rights_limit(cap_wo, &r_ws));
1131  EXPECT_OK(cap_wo);
1132  int cap_all = dup(fd);
1133  EXPECT_OK(cap_all);
1134  EXPECT_OK(cap_rights_limit(cap_all, &r_rwssync));
1135  EXPECT_OK(cap_all);
1136
1137  // Linux: io_setup, io_submit, io_getevents, io_cancel, io_destroy
1138  aio_context_t ctx = 0;
1139  EXPECT_OK(syscall(__NR_io_setup, 10, &ctx));
1140
1141  unsigned char buffer[32] = {1, 2, 3, 4};
1142  struct iocb req;
1143  memset(&req, 0, sizeof(req));
1144  req.aio_reqprio = 0;
1145  req.aio_fildes = fd;
1146  uintptr_t bufaddr = (uintptr_t)buffer;
1147  req.aio_buf = (__u64)bufaddr;
1148  req.aio_nbytes = 4;
1149  req.aio_offset = 0;
1150  struct iocb* reqs[1] = {&req};
1151
1152  // Write operation
1153  req.aio_lio_opcode = IOCB_CMD_PWRITE;
1154  req.aio_fildes = cap_ro;
1155  EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1,  reqs));
1156  req.aio_fildes = cap_wo;
1157  EXPECT_OK(syscall(__NR_io_submit, ctx, 1,  reqs));
1158
1159  // Sync operation
1160  req.aio_lio_opcode = IOCB_CMD_FSYNC;
1161  EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1162  req.aio_lio_opcode = IOCB_CMD_FDSYNC;
1163  EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1164  // Even with CAP_FSYNC, turns out fsync/fdsync aren't implemented
1165  req.aio_fildes = cap_all;
1166  EXPECT_FAIL_NOT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1167  req.aio_lio_opcode = IOCB_CMD_FSYNC;
1168  EXPECT_FAIL_NOT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1, reqs));
1169
1170  // Read operation
1171  req.aio_lio_opcode = IOCB_CMD_PREAD;
1172  req.aio_fildes = cap_wo;
1173  EXPECT_NOTCAPABLE(syscall(__NR_io_submit, ctx, 1,  reqs));
1174  req.aio_fildes = cap_ro;
1175  EXPECT_OK(syscall(__NR_io_submit, ctx, 1,  reqs));
1176
1177  EXPECT_OK(syscall(__NR_io_destroy, ctx));
1178
1179  close(cap_all);
1180  close(cap_wo);
1181  close(cap_ro);
1182  close(fd);
1183  unlink(TmpFile("cap_aio"));
1184}
1185
1186#ifndef KCMP_FILE
1187#define KCMP_FILE 0
1188#endif
1189TEST(Linux, KcmpIfAvailable) {
1190  // This requires CONFIG_CHECKPOINT_RESTORE in kernel config.
1191  int fd = open("/etc/passwd", O_RDONLY);
1192  EXPECT_OK(fd);
1193  pid_t parent = getpid_();
1194
1195  errno = 0;
1196  int rc = syscall(__NR_kcmp, parent, parent, KCMP_FILE, fd, fd);
1197  if (rc == -1 && errno == ENOSYS) {
1198    GTEST_SKIP() << "kcmp(2) gives -ENOSYS";
1199  }
1200
1201  pid_t child = fork();
1202  if (child == 0) {
1203    // Child: limit rights on FD.
1204    child = getpid_();
1205    EXPECT_OK(syscall(__NR_kcmp, parent, child, KCMP_FILE, fd, fd));
1206    cap_rights_t rights;
1207    cap_rights_init(&rights, CAP_READ, CAP_WRITE);
1208    EXPECT_OK(cap_rights_limit(fd, &rights));
1209    // A capability wrapping a normal FD is different (from a kcmp(2) perspective)
1210    // than the original file.
1211    EXPECT_NE(0, syscall(__NR_kcmp, parent, child, KCMP_FILE, fd, fd));
1212    exit(HasFailure());
1213  }
1214  // Wait for the child.
1215  int status;
1216  EXPECT_EQ(child, waitpid(child, &status, 0));
1217  rc = WIFEXITED(status) ? WEXITSTATUS(status) : -1;
1218  EXPECT_EQ(0, rc);
1219
1220  close(fd);
1221}
1222
1223TEST(Linux, ProcFS) {
1224  cap_rights_t rights;
1225  cap_rights_init(&rights, CAP_READ, CAP_SEEK);
1226  int fd = open("/etc/passwd", O_RDONLY);
1227  EXPECT_OK(fd);
1228  lseek(fd, 4, SEEK_SET);
1229  int cap = dup(fd);
1230  EXPECT_OK(cap);
1231  EXPECT_OK(cap_rights_limit(cap, &rights));
1232  pid_t me = getpid_();
1233
1234  char buffer[1024];
1235  sprintf(buffer, "/proc/%d/fdinfo/%d", me, cap);
1236  int procfd = open(buffer, O_RDONLY);
1237  EXPECT_OK(procfd) << " failed to open " << buffer;
1238  if (procfd < 0) return;
1239  int proccap = dup(procfd);
1240  EXPECT_OK(proccap);
1241  EXPECT_OK(cap_rights_limit(proccap, &rights));
1242
1243  EXPECT_OK(read(proccap, buffer, sizeof(buffer)));
1244  // The fdinfo should include the file pos of the underlying file
1245  EXPECT_NE((char*)NULL, strstr(buffer, "pos:\t4"));
1246  // ...and the rights of the Capsicum capability.
1247  EXPECT_NE((char*)NULL, strstr(buffer, "rights:\t0x"));
1248
1249  close(procfd);
1250  close(proccap);
1251  close(cap);
1252  close(fd);
1253}
1254
1255FORK_TEST(Linux, ProcessClocks) {
1256  pid_t self = getpid_();
1257  pid_t child = fork();
1258  EXPECT_OK(child);
1259  if (child == 0) {
1260    child = getpid_();
1261    usleep(100000);
1262    exit(0);
1263  }
1264
1265  EXPECT_OK(cap_enter());  // Enter capability mode.
1266
1267  // Nefariously build a clock ID for the child's CPU time.
1268  // This relies on knowledge of the internal layout of clock IDs.
1269  clockid_t child_clock;
1270  child_clock = ((~child) << 3) | 0x0;
1271  struct timespec ts;
1272  memset(&ts, 0, sizeof(ts));
1273
1274  // TODO(drysdale): Should not be possible to retrieve info about a
1275  // different process, as the PID global namespace should be locked
1276  // down.
1277  EXPECT_OK(clock_gettime(child_clock, &ts));
1278  if (verbose) fprintf(stderr, "[parent: %d] clock_gettime(child=%d->0x%08x) is %ld.%09ld \n",
1279                       self, child, child_clock, (long)ts.tv_sec, (long)ts.tv_nsec);
1280
1281  child_clock = ((~1) << 3) | 0x0;
1282  memset(&ts, 0, sizeof(ts));
1283  EXPECT_OK(clock_gettime(child_clock, &ts));
1284  if (verbose) fprintf(stderr, "[parent: %d] clock_gettime(init=1->0x%08x) is %ld.%09ld \n",
1285                       self, child_clock, (long)ts.tv_sec, (long)ts.tv_nsec);
1286
1287  // Orphan the child.
1288}
1289
1290TEST(Linux, SetLease) {
1291  int fd_all = open(TmpFile("cap_lease"), O_CREAT|O_RDWR, 0644);
1292  EXPECT_OK(fd_all);
1293  int fd_rw = dup(fd_all);
1294  EXPECT_OK(fd_rw);
1295
1296  cap_rights_t r_all;
1297  cap_rights_init(&r_all, CAP_READ, CAP_WRITE, CAP_FLOCK, CAP_FSIGNAL);
1298  EXPECT_OK(cap_rights_limit(fd_all, &r_all));
1299
1300  cap_rights_t r_rw;
1301  cap_rights_init(&r_rw, CAP_READ, CAP_WRITE);
1302  EXPECT_OK(cap_rights_limit(fd_rw, &r_rw));
1303
1304  EXPECT_NOTCAPABLE(fcntl(fd_rw, F_SETLEASE, F_WRLCK));
1305  EXPECT_NOTCAPABLE(fcntl(fd_rw, F_GETLEASE));
1306
1307  if (!tmpdir_on_tmpfs) {  // tmpfs doesn't support leases
1308    EXPECT_OK(fcntl(fd_all, F_SETLEASE, F_WRLCK));
1309    EXPECT_EQ(F_WRLCK, fcntl(fd_all, F_GETLEASE));
1310
1311    EXPECT_OK(fcntl(fd_all, F_SETLEASE, F_UNLCK, 0));
1312    EXPECT_EQ(F_UNLCK, fcntl(fd_all, F_GETLEASE));
1313  }
1314  close(fd_all);
1315  close(fd_rw);
1316  unlink(TmpFile("cap_lease"));
1317}
1318
1319TEST(Linux, InvalidRightsSyscall) {
1320  int fd = open(TmpFile("cap_invalid_rights"), O_RDONLY|O_CREAT, 0644);
1321  EXPECT_OK(fd);
1322
1323  cap_rights_t rights;
1324  cap_rights_init(&rights, CAP_READ, CAP_WRITE, CAP_FCHMOD, CAP_FSTAT);
1325
1326  // Use the raw syscall throughout.
1327  EXPECT_EQ(0, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0));
1328
1329  // Directly access the syscall, and find all unseemly manner of use for it.
1330  //  - Invalid flags
1331  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 1));
1332  EXPECT_EQ(EINVAL, errno);
1333  //  - Specify an fcntl subright, but no CAP_FCNTL set
1334  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, CAP_FCNTL_GETFL, 0, NULL, 0));
1335  EXPECT_EQ(EINVAL, errno);
1336  //  - Specify an ioctl subright, but no CAP_IOCTL set
1337  unsigned int ioctl1 = 1;
1338  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 1, &ioctl1, 0));
1339  EXPECT_EQ(EINVAL, errno);
1340  //  - N ioctls, but null pointer passed
1341  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 1, NULL, 0));
1342  EXPECT_EQ(EINVAL, errno);
1343  //  - Invalid nioctls
1344  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, -2, NULL, 0));
1345  EXPECT_EQ(EINVAL, errno);
1346  //  - Null primary rights
1347  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, NULL, 0, 0, NULL, 0));
1348  EXPECT_EQ(EFAULT, errno);
1349  //  - Invalid index bitmask
1350  rights.cr_rights[0] |= 3ULL << 57;
1351  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0));
1352  EXPECT_EQ(EINVAL, errno);
1353  //  - Invalid version
1354  rights.cr_rights[0] |= 2ULL << 62;
1355  EXPECT_EQ(-1, syscall(__NR_cap_rights_limit, fd, &rights, 0, 0, NULL, 0));
1356  EXPECT_EQ(EINVAL, errno);
1357
1358  close(fd);
1359  unlink(TmpFile("cap_invalid_rights"));
1360}
1361
1362FORK_TEST_ON(Linux, OpenByHandleAtIfRoot, TmpFile("cap_openbyhandle_testfile")) {
1363  GTEST_SKIP_IF_NOT_ROOT();
1364  int dir = open(tmpdir.c_str(), O_RDONLY);
1365  EXPECT_OK(dir);
1366  int fd = openat(dir, "cap_openbyhandle_testfile", O_RDWR|O_CREAT, 0644);
1367  EXPECT_OK(fd);
1368  const char* message = "Saved text";
1369  EXPECT_OK(write(fd, message, strlen(message)));
1370  close(fd);
1371
1372  struct file_handle* fhandle = (struct file_handle*)malloc(sizeof(struct file_handle) + MAX_HANDLE_SZ);
1373  fhandle->handle_bytes = MAX_HANDLE_SZ;
1374  int mount_id;
1375  EXPECT_OK(name_to_handle_at(dir, "cap_openbyhandle_testfile", fhandle,  &mount_id, 0));
1376
1377  fd = open_by_handle_at(dir, fhandle, O_RDONLY);
1378  EXPECT_OK(fd);
1379  char buffer[200];
1380  ssize_t len = read(fd, buffer, 199);
1381  EXPECT_OK(len);
1382  EXPECT_EQ(std::string(message), std::string(buffer, len));
1383  close(fd);
1384
1385  // Cannot issue open_by_handle_at after entering capability mode.
1386  cap_enter();
1387  EXPECT_CAPMODE(open_by_handle_at(dir, fhandle, O_RDONLY));
1388
1389  close(dir);
1390}
1391
// Thin wrapper around the getrandom system call, invoked by number.
//
// buf/buflen: destination buffer and its size in bytes.
// flags:      passed through unchanged.
// Returns the raw syscall result narrowed to int.  When __NR_getrandom is not
// defined at build time, sets errno to ENOSYS and returns -1 instead.
int getrandom_(void *buf, size_t buflen, unsigned int flags) {
#ifndef __NR_getrandom
  // No syscall number available at build time: report "not implemented".
  (void)buf;
  (void)buflen;
  (void)flags;
  errno = ENOSYS;
  return -1;
#else
  long result = syscall(__NR_getrandom, buf, buflen, flags);
  return (int)result;
#endif
}
1400
1401#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0)
1402#include <linux/random.h>  // Requires 3.17 kernel
1403FORK_TEST(Linux, GetRandom) {
1404  EXPECT_OK(cap_enter());
1405  unsigned char buffer[1024];
1406  unsigned char buffer2[1024];
1407  EXPECT_OK(getrandom_(buffer, sizeof(buffer), GRND_NONBLOCK));
1408  EXPECT_OK(getrandom_(buffer2, sizeof(buffer2), GRND_NONBLOCK));
1409  EXPECT_NE(0, memcmp(buffer, buffer2, sizeof(buffer)));
1410}
1411#endif
1412
// Thin wrapper around the memfd_create system call, invoked by number.
//
// name/flags: passed through unchanged.
// Returns the raw syscall result narrowed to int.  When __NR_memfd_create is
// not defined at build time, sets errno to ENOSYS and returns -1 instead.
int memfd_create_(const char *name, unsigned int flags) {
#ifndef __NR_memfd_create
  // No syscall number available at build time: report "not implemented".
  (void)name;
  (void)flags;
  errno = ENOSYS;
  return -1;
#else
  long result = syscall(__NR_memfd_create, name, flags);
  return (int)result;
#endif
}
1421
1422#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0)
1423#include <linux/memfd.h>  // Requires 3.17 kernel
// Exercises a sealable memfd through two rights-limited duplicates:
//  - memfd_ro (CAP_MMAP_R, CAP_FSTAT) can be mapped read-only and can query
//    seals, but cannot be mapped writable;
//  - memfd_rw (CAP_MMAP_RW, CAP_FCHMOD) can be mapped writable and can add
//    seals, but cannot query them.
// Skipped when memfd_create gives ENOSYS.
TEST(Linux, MemFDDeathTestIfAvailable) {
  int memfd = memfd_create_("capsicum-test", MFD_ALLOW_SEALING);
  if (memfd == -1 && errno == ENOSYS) {
    GTEST_SKIP() << "memfd_create(2) gives -ENOSYS";
  }
  // Size the memfd so that it can be mapped.
  const int LEN = 16;
  EXPECT_OK(ftruncate(memfd, LEN));
  int memfd_ro = dup(memfd);
  int memfd_rw = dup(memfd);
  EXPECT_OK(memfd_ro);
  EXPECT_OK(memfd_rw);
  // `rights` is scratch storage, re-initialized for each cap_rights_limit() call.
  cap_rights_t rights;
  EXPECT_OK(cap_rights_limit(memfd_ro, cap_rights_init(&rights, CAP_MMAP_R, CAP_FSTAT)));
  EXPECT_OK(cap_rights_limit(memfd_rw, cap_rights_init(&rights, CAP_MMAP_RW, CAP_FCHMOD)));

  // Read-only and read-write mappings succeed on the matching descriptors;
  // a writable mapping of the read-only descriptor must fail.
  unsigned char *p_ro = (unsigned char *)mmap(NULL, LEN, PROT_READ, MAP_SHARED, memfd_ro, 0);
  EXPECT_NE((unsigned char *)MAP_FAILED, p_ro);
  unsigned char *p_rw = (unsigned char *)mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_SHARED, memfd_rw, 0);
  EXPECT_NE((unsigned char *)MAP_FAILED, p_rw);
  EXPECT_EQ(MAP_FAILED,
            mmap(NULL, LEN, PROT_READ|PROT_WRITE, MAP_SHARED, memfd_ro, 0));

  // Both mappings are MAP_SHARED views of the same memfd, so a write through
  // one is visible through the other; writing through the read-only mapping
  // is expected to kill the process.
  *p_rw = 42;
  EXPECT_EQ(42, *p_ro);
  EXPECT_DEATH(*p_ro = 42, "");

#ifndef F_ADD_SEALS
  // Hack for when libc6 does not yet include the updated linux/fcntl.h from kernel 3.17
#define _F_LINUX_SPECIFIC_BASE F_SETLEASE
#define F_ADD_SEALS	(_F_LINUX_SPECIFIC_BASE + 9)
#define F_GET_SEALS	(_F_LINUX_SPECIFIC_BASE + 10)
#define F_SEAL_SEAL	0x0001	/* prevent further seals from being set */
#define F_SEAL_SHRINK	0x0002	/* prevent file from shrinking */
#define F_SEAL_GROW	0x0004	/* prevent file from growing */
#define F_SEAL_WRITE	0x0008	/* prevent writes */
#endif

  // Reading the seal information requires CAP_FSTAT.
  int seals = fcntl(memfd, F_GET_SEALS);
  EXPECT_OK(seals);
  if (verbose) fprintf(stderr, "seals are %08x on base fd\n", seals);
  int seals_ro = fcntl(memfd_ro, F_GET_SEALS);
  EXPECT_EQ(seals, seals_ro);
  if (verbose) fprintf(stderr, "seals are %08x on read-only fd\n", seals_ro);
  // memfd_rw was limited without CAP_FSTAT, so the query is refused there.
  int seals_rw = fcntl(memfd_rw, F_GET_SEALS);
  EXPECT_NOTCAPABLE(seals_rw);

  // Fail to seal as a writable mapping exists.
  EXPECT_EQ(-1, fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE));
  EXPECT_EQ(EBUSY, errno);
  // The failed seal attempt must not have affected the writable mapping.
  *p_rw = 42;

  // Seal the rw version; need to unmap first.
  munmap(p_rw, LEN);
  munmap(p_ro, LEN);
  EXPECT_OK(fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE));

  // The unrestricted and read-only descriptors must report the same seals
  // after the seal was added.
  seals = fcntl(memfd, F_GET_SEALS);
  EXPECT_OK(seals);
  if (verbose) fprintf(stderr, "seals are %08x on base fd\n", seals);
  seals_ro = fcntl(memfd_ro, F_GET_SEALS);
  EXPECT_EQ(seals, seals_ro);
  if (verbose) fprintf(stderr, "seals are %08x on read-only fd\n", seals_ro);

  // Remove the CAP_FCHMOD right, can no longer add seals.
  EXPECT_OK(cap_rights_limit(memfd_rw, cap_rights_init(&rights, CAP_MMAP_RW)));
  EXPECT_NOTCAPABLE(fcntl(memfd_rw, F_ADD_SEALS, F_SEAL_WRITE));

  close(memfd);
  close(memfd_ro);
  close(memfd_rw);
}
1496#endif
1497
1498#else
// Placeholder definition for builds where __linux__ is not defined (this is
// the #else branch of the guard that wraps the rest of the file).
void noop() {
}
1500#endif
1501