1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2010 The FreeBSD Foundation
5 *
6 * This software was developed by Edward Tomasz Napierala under sponsorship
7 * from the FreeBSD Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#ifdef RCTL
32
33#include <sys/param.h>
34#include <sys/devctl.h>
35#include <sys/malloc.h>
36#include <sys/queue.h>
37#include <sys/refcount.h>
38#include <sys/jail.h>
39#include <sys/kernel.h>
40#include <sys/limits.h>
41#include <sys/loginclass.h>
42#include <sys/malloc.h>
43#include <sys/priv.h>
44#include <sys/proc.h>
45#include <sys/racct.h>
46#include <sys/rctl.h>
47#include <sys/resourcevar.h>
48#include <sys/sx.h>
49#include <sys/sysproto.h>
50#include <sys/systm.h>
51#include <sys/types.h>
52#include <sys/eventhandler.h>
53#include <sys/lock.h>
54#include <sys/mutex.h>
55#include <sys/rwlock.h>
56#include <sys/sbuf.h>
57#include <sys/taskqueue.h>
58#include <sys/tree.h>
59#include <vm/uma.h>
60
61#ifndef RACCT
62#error "The RCTL option requires the RACCT option"
63#endif
64
65FEATURE(rctl, "Resource Limits");
66
/*
 * HRF_* flag values.
 */
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer size limits, in bytes.  The products are parenthesized so that
 * the macros expand to a single operand; without the parentheses an
 * expression such as "x / RCTL_MAX_INBUFSIZE" would parse as
 * "(x / 4) * 1024".
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/*
 * Safety margin subtracted from the available %CPU in
 * rctl_pcpu_available().
 */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
76
/*
 * Maximum output buffer size; exported as kern.racct.rctl.maxbufsize.
 */
static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
/*
 * Rate limits passed to ppsratecheck() in rctl_enforce() for the "log"
 * and "devctl" actions respectively.
 * NOTE(review): both are declared int but exported with SYSCTL_UINT --
 * confirm the signedness mismatch is intentional.
 */
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;

/*
 * Values below are initialized in rctl_init().  Until then they hold the
 * placeholder -1.  At run time they are changed through the
 * rctl_throttle_*_sysctl() handlers, which store the new value while
 * holding the racct lock.
 */
static int rctl_throttle_min = -1;
static int rctl_throttle_max = -1;
static int rctl_throttle_pct = -1;
static int rctl_throttle_pct2 = -1;
88
/*
 * Handlers backing the throttle_* sysctls declared below; each validates
 * the requested value before storing it.
 */
static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);

/*
 * The kern.racct.rctl sysctl subtree.
 */
SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Resource Limits");
/* Plain integer knobs, backed directly by the variables. */
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
    &rctl_maxbufsize, 0, "Maximum output buffer size");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
/*
 * Throttling knobs.  Each is served by a handler procedure and is
 * additionally registered with TUNABLE_INT() under the same name,
 * pointing at the backing variable.
 */
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_min_sysctl, "IU",
    "Shortest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_max_sysctl, "IU",
    "Longest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_pct_sysctl, "IU",
    "Throttling penalty for process consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
    CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
    &rctl_throttle_pct2_sysctl, "IU",
    "Throttling penalty for container consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
122
/*
 * 'rctl_rule_link' connects a rule with every racct it's related to.
 * For example, rule 'user:X:openfiles:deny=N/process' is linked
 * with uidinfo for user X, and to each process of that user.
 */
struct rctl_rule_link {
	/* Entry in the owning racct's r_rule_links list. */
	LIST_ENTRY(rctl_rule_link)	rrl_next;
	/* The rule this link refers to. */
	struct rctl_rule		*rrl_rule;
	/*
	 * Set to 1 by rctl_enforce() once a log, devctl or signal action
	 * has fired for this link; cleared again when the rule is no
	 * longer exceeded.  Prevents repeating the action on every check.
	 */
	int				rrl_exceeded;
};
133
/*
 * One entry of a name <-> value lookup table.  Tables built from this
 * type end with an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* textual name */
	int		d_value;	/* corresponding numeric constant */
};
138
/*
 * Names of the rule subject types, mapped to RCTL_SUBJECT_TYPE_*.
 * Terminated by a { NULL, -1 } entry.
 */
static struct dict subjectnames[] = {
	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
	{ "user", RCTL_SUBJECT_TYPE_USER },
	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
	{ NULL, -1 }};
145
/*
 * Names of the resources, mapped to RACCT_* resource identifiers.
 * Terminated by a { NULL, -1 } entry.
 */
static struct dict resourcenames[] = {
	{ "cputime", RACCT_CPU },
	{ "datasize", RACCT_DATA },
	{ "stacksize", RACCT_STACK },
	{ "coredumpsize", RACCT_CORE },
	{ "memoryuse", RACCT_RSS },
	{ "memorylocked", RACCT_MEMLOCK },
	{ "maxproc", RACCT_NPROC },
	{ "openfiles", RACCT_NOFILE },
	{ "vmemoryuse", RACCT_VMEM },
	{ "pseudoterminals", RACCT_NPTS },
	{ "swapuse", RACCT_SWAP },
	{ "nthr", RACCT_NTHR },
	{ "msgqqueued", RACCT_MSGQQUEUED },
	{ "msgqsize", RACCT_MSGQSIZE },
	{ "nmsgq", RACCT_NMSGQ },
	{ "nsem", RACCT_NSEM },
	{ "nsemop", RACCT_NSEMOP },
	{ "nshm", RACCT_NSHM },
	{ "shmsize", RACCT_SHMSIZE },
	{ "wallclock", RACCT_WALLCLOCK },
	{ "pcpu", RACCT_PCTCPU },
	{ "readbps", RACCT_READBPS },
	{ "writebps", RACCT_WRITEBPS },
	{ "readiops", RACCT_READIOPS },
	{ "writeiops", RACCT_WRITEIOPS },
	{ NULL, -1 }};
173
/*
 * Names of the rule actions, mapped to RCTL_ACTION_*.  The "sig*"
 * entries request delivery of the corresponding signal; the remaining
 * ones are the deny, log, devctl and throttle actions.
 * Terminated by a { NULL, -1 } entry.
 */
static struct dict actionnames[] = {
	{ "sighup", RCTL_ACTION_SIGHUP },
	{ "sigint", RCTL_ACTION_SIGINT },
	{ "sigquit", RCTL_ACTION_SIGQUIT },
	{ "sigill", RCTL_ACTION_SIGILL },
	{ "sigtrap", RCTL_ACTION_SIGTRAP },
	{ "sigabrt", RCTL_ACTION_SIGABRT },
	{ "sigemt", RCTL_ACTION_SIGEMT },
	{ "sigfpe", RCTL_ACTION_SIGFPE },
	{ "sigkill", RCTL_ACTION_SIGKILL },
	{ "sigbus", RCTL_ACTION_SIGBUS },
	{ "sigsegv", RCTL_ACTION_SIGSEGV },
	{ "sigsys", RCTL_ACTION_SIGSYS },
	{ "sigpipe", RCTL_ACTION_SIGPIPE },
	{ "sigalrm", RCTL_ACTION_SIGALRM },
	{ "sigterm", RCTL_ACTION_SIGTERM },
	{ "sigurg", RCTL_ACTION_SIGURG },
	{ "sigstop", RCTL_ACTION_SIGSTOP },
	{ "sigtstp", RCTL_ACTION_SIGTSTP },
	{ "sigchld", RCTL_ACTION_SIGCHLD },
	{ "sigttin", RCTL_ACTION_SIGTTIN },
	{ "sigttou", RCTL_ACTION_SIGTTOU },
	{ "sigio", RCTL_ACTION_SIGIO },
	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
	{ "sigprof", RCTL_ACTION_SIGPROF },
	{ "sigwinch", RCTL_ACTION_SIGWINCH },
	{ "siginfo", RCTL_ACTION_SIGINFO },
	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
	{ "sigthr", RCTL_ACTION_SIGTHR },
	{ "deny", RCTL_ACTION_DENY },
	{ "log", RCTL_ACTION_LOG },
	{ "devctl", RCTL_ACTION_DEVCTL },
	{ "throttle", RCTL_ACTION_THROTTLE },
	{ NULL, -1 }};
211
/*
 * rctl_init() is registered to run via SYSINIT at SI_SUB_RACCT,
 * SI_ORDER_FIRST.
 */
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);

/*
 * UMA zones from which struct rctl_rule and struct rctl_rule_link
 * objects are allocated.
 */
static uma_zone_t rctl_rule_zone;
static uma_zone_t rctl_rule_link_zone;

/* Forward declarations for helpers used before their definition. */
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);

/* Malloc type used for the temporary buffers in rctl_enforce(). */
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
222
223static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
224{
225	int error, val = rctl_throttle_min;
226
227	error = sysctl_handle_int(oidp, &val, 0, req);
228	if (error || !req->newptr)
229		return (error);
230	if (val < 1 || val > rctl_throttle_max)
231		return (EINVAL);
232
233	RACCT_LOCK();
234	rctl_throttle_min = val;
235	RACCT_UNLOCK();
236
237	return (0);
238}
239
240static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
241{
242	int error, val = rctl_throttle_max;
243
244	error = sysctl_handle_int(oidp, &val, 0, req);
245	if (error || !req->newptr)
246		return (error);
247	if (val < rctl_throttle_min)
248		return (EINVAL);
249
250	RACCT_LOCK();
251	rctl_throttle_max = val;
252	RACCT_UNLOCK();
253
254	return (0);
255}
256
257static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
258{
259	int error, val = rctl_throttle_pct;
260
261	error = sysctl_handle_int(oidp, &val, 0, req);
262	if (error || !req->newptr)
263		return (error);
264	if (val < 0)
265		return (EINVAL);
266
267	RACCT_LOCK();
268	rctl_throttle_pct = val;
269	RACCT_UNLOCK();
270
271	return (0);
272}
273
274static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
275{
276	int error, val = rctl_throttle_pct2;
277
278	error = sysctl_handle_int(oidp, &val, 0, req);
279	if (error || !req->newptr)
280		return (error);
281	if (val < 0)
282		return (EINVAL);
283
284	RACCT_LOCK();
285	rctl_throttle_pct2 = val;
286	RACCT_UNLOCK();
287
288	return (0);
289}
290
291static const char *
292rctl_subject_type_name(int subject)
293{
294	int i;
295
296	for (i = 0; subjectnames[i].d_name != NULL; i++) {
297		if (subjectnames[i].d_value == subject)
298			return (subjectnames[i].d_name);
299	}
300
301	panic("rctl_subject_type_name: unknown subject type %d", subject);
302}
303
304static const char *
305rctl_action_name(int action)
306{
307	int i;
308
309	for (i = 0; actionnames[i].d_name != NULL; i++) {
310		if (actionnames[i].d_value == action)
311			return (actionnames[i].d_name);
312	}
313
314	panic("rctl_action_name: unknown action %d", action);
315}
316
317const char *
318rctl_resource_name(int resource)
319{
320	int i;
321
322	for (i = 0; resourcenames[i].d_name != NULL; i++) {
323		if (resourcenames[i].d_value == resource)
324			return (resourcenames[i].d_name);
325	}
326
327	panic("rctl_resource_name: unknown resource %d", resource);
328}
329
330static struct racct *
331rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
332{
333	struct ucred *cred = p->p_ucred;
334
335	ASSERT_RACCT_ENABLED();
336	RACCT_LOCK_ASSERT();
337
338	switch (rule->rr_per) {
339	case RCTL_SUBJECT_TYPE_PROCESS:
340		return (p->p_racct);
341	case RCTL_SUBJECT_TYPE_USER:
342		return (cred->cr_ruidinfo->ui_racct);
343	case RCTL_SUBJECT_TYPE_LOGINCLASS:
344		return (cred->cr_loginclass->lc_racct);
345	case RCTL_SUBJECT_TYPE_JAIL:
346		return (cred->cr_prison->pr_prison_racct->prr_racct);
347	default:
348		panic("%s: unknown per %d", __func__, rule->rr_per);
349	}
350}
351
352/*
353 * Return the amount of resource that can be allocated by 'p' before
354 * hitting 'rule'.
355 */
356static int64_t
357rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
358{
359	const struct racct *racct;
360	int64_t available;
361
362	ASSERT_RACCT_ENABLED();
363	RACCT_LOCK_ASSERT();
364
365	racct = rctl_proc_rule_to_racct(p, rule);
366	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
367
368	return (available);
369}
370
371/*
372 * Called every second for proc, uidinfo, loginclass, and jail containers.
373 * If the limit isn't exceeded, it decreases the usage amount to zero.
374 * Otherwise, it decreases it by the value of the limit.  This way
375 * resource consumption exceeding the limit "carries over" to the next
376 * period.
377 */
378void
379rctl_throttle_decay(struct racct *racct, int resource)
380{
381	struct rctl_rule *rule;
382	struct rctl_rule_link *link;
383	int64_t minavailable;
384
385	ASSERT_RACCT_ENABLED();
386	RACCT_LOCK_ASSERT();
387
388	minavailable = INT64_MAX;
389
390	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
391		rule = link->rrl_rule;
392
393		if (rule->rr_resource != resource)
394			continue;
395		if (rule->rr_action != RCTL_ACTION_THROTTLE)
396			continue;
397
398		if (rule->rr_amount < minavailable)
399			minavailable = rule->rr_amount;
400	}
401
402	if (racct->r_resources[resource] < minavailable) {
403		racct->r_resources[resource] = 0;
404	} else {
405		/*
406		 * Cap utilization counter at ten times the limit.  Otherwise,
407		 * if we changed the rule lowering the allowed amount, it could
408		 * take unreasonably long time for the accumulated resource
409		 * usage to drop.
410		 */
411		if (racct->r_resources[resource] > minavailable * 10)
412			racct->r_resources[resource] = minavailable * 10;
413
414		racct->r_resources[resource] -= minavailable;
415	}
416}
417
418/*
419 * Special version of rctl_get_available() for the %CPU resource.
420 * We slightly cheat here and return less than we normally would.
421 */
422int64_t
423rctl_pcpu_available(const struct proc *p) {
424	struct rctl_rule *rule;
425	struct rctl_rule_link *link;
426	int64_t available, minavailable, limit;
427
428	ASSERT_RACCT_ENABLED();
429	RACCT_LOCK_ASSERT();
430
431	minavailable = INT64_MAX;
432	limit = 0;
433
434	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
435		rule = link->rrl_rule;
436		if (rule->rr_resource != RACCT_PCTCPU)
437			continue;
438		if (rule->rr_action != RCTL_ACTION_DENY)
439			continue;
440		available = rctl_available_resource(p, rule);
441		if (available < minavailable) {
442			minavailable = available;
443			limit = rule->rr_amount;
444		}
445	}
446
447	/*
448	 * Return slightly less than actual value of the available
449	 * %cpu resource.  This makes %cpu throttling more aggressive
450	 * and lets us act sooner than the limits are already exceeded.
451	 */
452	if (limit != 0) {
453		if (limit > 2 * RCTL_PCPU_SHIFT)
454			minavailable -= RCTL_PCPU_SHIFT;
455		else
456			minavailable -= (limit / 2);
457	}
458
459	return (minavailable);
460}
461
/*
 * Saturating unsigned addition: returns a + b, or UINT64_MAX when the sum
 * does not fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

	/* The sum overflows exactly when b exceeds the room left above a. */
	if (b > UINT64_MAX - a)
		return (UINT64_MAX);

	return (a + b);
}
477
/*
 * Saturating unsigned multiplication: returns a * b, or UINT64_MAX when
 * the product does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	if (a == 0 || b == 0)
		return (0);
	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
487
/*
 * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
 * to what it keeps allocated now.  Returns non-zero if the allocation should
 * be denied, 0 otherwise.
 *
 * Every rule linked to p->p_racct for 'resource' is examined.  For each rule
 * whose remaining headroom is smaller than 'amount', the rule's action is
 * carried out: remembering that the request must be denied, printing a log
 * message, sending a devctl notification, throttling the process, or
 * delivering a signal.  The racct lock must be held by the caller.
 */
int
rctl_enforce(struct proc *p, int resource, uint64_t amount)
{
	/* State for ppsratecheck(); shared by all callers. */
	static struct timeval log_lasttime, devctl_lasttime;
	static int log_curtime = 0, devctl_curtime = 0;
	struct rctl_rule *rule;
	struct rctl_rule_link *link;
	struct sbuf sb;
	char *buf;
	int64_t available;
	uint64_t sleep_ms, sleep_ratio;
	int should_deny = 0;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	/*
	 * There may be more than one matching rule; go through all of them.
	 * Denial should be done last, after logging and sending signals.
	 */
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		rule = link->rrl_rule;
		if (rule->rr_resource != resource)
			continue;

		/*
		 * The request fits under this rule; re-arm the one-shot
		 * actions (log, devctl, signal) for the next time it is
		 * exceeded.
		 */
		available = rctl_available_resource(p, rule);
		if (available >= (int64_t)amount) {
			link->rrl_exceeded = 0;
			continue;
		}

		switch (rule->rr_action) {
		case RCTL_ACTION_DENY:
			should_deny = 1;
			continue;
		case RCTL_ACTION_LOG:
			/*
			 * If rrl_exceeded != 0, it means we've already
			 * logged a warning for this process.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			/*
			 * If the process state is not fully initialized yet,
			 * we can't access most of the required fields, e.g.
			 * p->p_comm.  This happens when called from fork1().
			 * Ignore this rule for now; it will be processed just
			 * after fork, when called from racct_proc_fork_done().
			 */
			if (p->p_state != PRS_NORMAL)
				continue;

			/* Honour the log_rate_limit sysctl. */
			if (!ppsratecheck(&log_lasttime, &log_curtime,
			    rctl_log_rate_limit))
				continue;

			/*
			 * The rule is rendered into a fixed-size temporary
			 * buffer allocated with M_NOWAIT; on allocation
			 * failure the message is skipped.
			 */
			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
			if (buf == NULL) {
				printf("rctl_enforce: out of memory\n");
				continue;
			}
			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
			rctl_rule_to_sbuf(&sb, rule);
			sbuf_finish(&sb);
			printf("rctl: rule \"%s\" matched by pid %d "
			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
			sbuf_delete(&sb);
			free(buf, M_RCTL);
			link->rrl_exceeded = 1;
			continue;
		case RCTL_ACTION_DEVCTL:
			/*
			 * Same gating as for RCTL_ACTION_LOG: once per
			 * excess, only for fully set up processes, and
			 * subject to the devctl_rate_limit sysctl.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			if (p->p_state != PRS_NORMAL)
				continue;

			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
			    rctl_devctl_rate_limit))
				continue;

			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
			if (buf == NULL) {
				printf("rctl_enforce: out of memory\n");
				continue;
			}
			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
			sbuf_cat(&sb, "rule=");
			rctl_rule_to_sbuf(&sb, rule);
			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
			    p->p_pid, p->p_ucred->cr_ruid,
			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
			sbuf_finish(&sb);
			devctl_notify("RCTL", "rule", "matched",
			    sbuf_data(&sb));
			sbuf_delete(&sb);
			free(buf, M_RCTL);
			link->rrl_exceeded = 1;
			continue;
		case RCTL_ACTION_THROTTLE:
			if (p->p_state != PRS_NORMAL)
				continue;

			/*
			 * A zero limit would make the division below
			 * impossible; throttle for the longest duration.
			 */
			if (rule->rr_amount == 0) {
				racct_proc_throttle(p, rctl_throttle_max);
				continue;
			}

			/*
			 * Make the process sleep for a fraction of second
			 * proportional to the ratio of process' resource
			 * utilization compared to the limit.  The point is
			 * to penalize resource hogs: processes that consume
			 * more of the available resources sleep for longer.
			 *
			 * We're trying to defer division until the very end,
			 * to minimize the rounding effects.  The following
			 * calculation could have been written in a clearer
			 * way like this:
			 *
			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
			 *     rule->rr_amount;
			 * sleep_ms *= rctl_throttle_pct / 100;
			 * if (sleep_ms < rctl_throttle_min)
			 *         sleep_ms = rctl_throttle_min;
			 *
			 * NOTE(review): despite its name, sleep_ms is derived
			 * from hz and clamped by rctl_throttle_min/max, whose
			 * sysctl descriptions say "in hz" -- presumably the
			 * unit is ticks; confirm against
			 * racct_proc_throttle().
			 */
			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
				sleep_ms = rctl_throttle_min * rule->rr_amount;

			/*
			 * Multiply that by the ratio of the resource
			 * consumption for the container compared to the limit,
			 * squared.  In other words, a process in a container
			 * that is two times over the limit will be throttled
			 * four times as much for hitting the same rule.  The
			 * point is to penalize processes more if the container
			 * itself (eg certain UID or jail) is above the limit.
			 */
			if (available < 0)
				sleep_ratio = -available / rule->rr_amount;
			else
				sleep_ratio = 0;
			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));

			/*
			 * Finally the division.
			 */
			sleep_ms /= rule->rr_amount;

			if (sleep_ms > rctl_throttle_max)
				sleep_ms = rctl_throttle_max;
#if 0
			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
			   __func__, p->p_pid, p->p_comm,
			   p->p_racct->r_resources[resource],
			   rule->rr_amount, (uintmax_t)sleep_ms,
			   (uintmax_t)sleep_ratio, (intmax_t)available);
#endif

			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
			racct_proc_throttle(p, sleep_ms);
			continue;
		default:
			/*
			 * Every remaining action is a signal to deliver;
			 * send it once per excess.
			 */
			if (link->rrl_exceeded != 0)
				continue;

			if (p->p_state != PRS_NORMAL)
				continue;

			KASSERT(rule->rr_action > 0 &&
			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
			    ("rctl_enforce: unknown action %d",
			     rule->rr_action));

			/*
			 * We're using the fact that RCTL_ACTION_SIG* values
			 * are equal to their counterparts from sys/signal.h.
			 */
			kern_psignal(p, rule->rr_action);
			link->rrl_exceeded = 1;
			continue;
		}
	}

	if (should_deny) {
		/*
		 * Return fake error code; the caller should change it
		 * into one proper for the situation - EFSIZ, ENOMEM etc.
		 */
		return (EDOOFUS);
	}

	return (0);
}
696
697uint64_t
698rctl_get_limit(struct proc *p, int resource)
699{
700	struct rctl_rule *rule;
701	struct rctl_rule_link *link;
702	uint64_t amount = UINT64_MAX;
703
704	ASSERT_RACCT_ENABLED();
705	RACCT_LOCK_ASSERT();
706
707	/*
708	 * There may be more than one matching rule; go through all of them.
709	 * Denial should be done last, after logging and sending signals.
710	 */
711	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
712		rule = link->rrl_rule;
713		if (rule->rr_resource != resource)
714			continue;
715		if (rule->rr_action != RCTL_ACTION_DENY)
716			continue;
717		if (rule->rr_amount < amount)
718			amount = rule->rr_amount;
719	}
720
721	return (amount);
722}
723
724uint64_t
725rctl_get_available(struct proc *p, int resource)
726{
727	struct rctl_rule *rule;
728	struct rctl_rule_link *link;
729	int64_t available, minavailable, allocated;
730
731	minavailable = INT64_MAX;
732
733	ASSERT_RACCT_ENABLED();
734	RACCT_LOCK_ASSERT();
735
736	/*
737	 * There may be more than one matching rule; go through all of them.
738	 * Denial should be done last, after logging and sending signals.
739	 */
740	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
741		rule = link->rrl_rule;
742		if (rule->rr_resource != resource)
743			continue;
744		if (rule->rr_action != RCTL_ACTION_DENY)
745			continue;
746		available = rctl_available_resource(p, rule);
747		if (available < minavailable)
748			minavailable = available;
749	}
750
751	/*
752	 * XXX: Think about this _hard_.
753	 */
754	allocated = p->p_racct->r_resources[resource];
755	if (minavailable < INT64_MAX - allocated)
756		minavailable += allocated;
757	if (minavailable < 0)
758		minavailable = 0;
759
760	return (minavailable);
761}
762
763static int
764rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
765{
766
767	ASSERT_RACCT_ENABLED();
768
769	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
770		if (rule->rr_subject_type != filter->rr_subject_type)
771			return (0);
772
773		switch (filter->rr_subject_type) {
774		case RCTL_SUBJECT_TYPE_PROCESS:
775			if (filter->rr_subject.rs_proc != NULL &&
776			    rule->rr_subject.rs_proc !=
777			    filter->rr_subject.rs_proc)
778				return (0);
779			break;
780		case RCTL_SUBJECT_TYPE_USER:
781			if (filter->rr_subject.rs_uip != NULL &&
782			    rule->rr_subject.rs_uip !=
783			    filter->rr_subject.rs_uip)
784				return (0);
785			break;
786		case RCTL_SUBJECT_TYPE_LOGINCLASS:
787			if (filter->rr_subject.rs_loginclass != NULL &&
788			    rule->rr_subject.rs_loginclass !=
789			    filter->rr_subject.rs_loginclass)
790				return (0);
791			break;
792		case RCTL_SUBJECT_TYPE_JAIL:
793			if (filter->rr_subject.rs_prison_racct != NULL &&
794			    rule->rr_subject.rs_prison_racct !=
795			    filter->rr_subject.rs_prison_racct)
796				return (0);
797			break;
798		default:
799			panic("rctl_rule_matches: unknown subject type %d",
800			    filter->rr_subject_type);
801		}
802	}
803
804	if (filter->rr_resource != RACCT_UNDEFINED) {
805		if (rule->rr_resource != filter->rr_resource)
806			return (0);
807	}
808
809	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
810		if (rule->rr_action != filter->rr_action)
811			return (0);
812	}
813
814	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
815		if (rule->rr_amount != filter->rr_amount)
816			return (0);
817	}
818
819	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
820		if (rule->rr_per != filter->rr_per)
821			return (0);
822	}
823
824	return (1);
825}
826
827static int
828str2value(const char *str, int *value, struct dict *table)
829{
830	int i;
831
832	if (value == NULL)
833		return (EINVAL);
834
835	for (i = 0; table[i].d_name != NULL; i++) {
836		if (strcasecmp(table[i].d_name, str) == 0) {
837			*value =  table[i].d_value;
838			return (0);
839		}
840	}
841
842	return (EINVAL);
843}
844
/*
 * Convert the decimal string 'str' to an id and store it in '*value'.
 * Returns 0 on success, or EINVAL when 'str' is NULL or is not consumed
 * entirely by strtoul().  Note that '*value' is written before the
 * trailing-characters check, so it is modified even on failure.
 */
static int
str2id(const char *str, id_t *value)
{
	char *rest;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &rest, 10);

	/* Anything left over after the number makes the string invalid. */
	if (*rest != '\0')
		return (EINVAL);

	return (0);
}
859
/*
 * Convert the decimal string 'str' to a non-negative 64-bit amount stored
 * in '*value'.  Returns 0 on success, EINVAL when 'str' is NULL or is not
 * consumed entirely by strtoul(), and ERANGE when the converted value is
 * negative once stored in the signed result.
 *
 * NOTE(review): strtoul() yields an unsigned long, which may be narrower
 * than int64_t on some platforms -- confirm this is acceptable there.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *rest;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &rest, 10);

	/* Anything left over after the number makes the string invalid. */
	if (*rest != '\0')
		return (EINVAL);

	return (*value < 0 ? ERANGE : 0);
}
877
878/*
879 * Connect the rule to the racct, increasing refcount for the rule.
880 */
881static void
882rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
883{
884	struct rctl_rule_link *link;
885
886	ASSERT_RACCT_ENABLED();
887	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
888
889	rctl_rule_acquire(rule);
890	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
891	link->rrl_rule = rule;
892	link->rrl_exceeded = 0;
893
894	RACCT_LOCK();
895	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
896	RACCT_UNLOCK();
897}
898
899static int
900rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
901{
902	struct rctl_rule_link *link;
903
904	ASSERT_RACCT_ENABLED();
905	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
906	RACCT_LOCK_ASSERT();
907
908	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
909	if (link == NULL)
910		return (ENOMEM);
911	rctl_rule_acquire(rule);
912	link->rrl_rule = rule;
913	link->rrl_exceeded = 0;
914
915	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
916
917	return (0);
918}
919
920/*
921 * Remove limits for a rules matching the filter and release
922 * the refcounts for the rules, possibly freeing them.  Returns
923 * the number of limit structures removed.
924 */
925static int
926rctl_racct_remove_rules(struct racct *racct,
927    const struct rctl_rule *filter)
928{
929	struct rctl_rule_link *link, *linktmp;
930	int removed = 0;
931
932	ASSERT_RACCT_ENABLED();
933	RACCT_LOCK_ASSERT();
934
935	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
936		if (!rctl_rule_matches(link->rrl_rule, filter))
937			continue;
938
939		LIST_REMOVE(link, rrl_next);
940		rctl_rule_release(link->rrl_rule);
941		uma_zfree(rctl_rule_link_zone, link);
942		removed++;
943	}
944	return (removed);
945}
946
947static void
948rctl_rule_acquire_subject(struct rctl_rule *rule)
949{
950
951	ASSERT_RACCT_ENABLED();
952
953	switch (rule->rr_subject_type) {
954	case RCTL_SUBJECT_TYPE_UNDEFINED:
955	case RCTL_SUBJECT_TYPE_PROCESS:
956		break;
957	case RCTL_SUBJECT_TYPE_JAIL:
958		if (rule->rr_subject.rs_prison_racct != NULL)
959			prison_racct_hold(rule->rr_subject.rs_prison_racct);
960		break;
961	case RCTL_SUBJECT_TYPE_USER:
962		if (rule->rr_subject.rs_uip != NULL)
963			uihold(rule->rr_subject.rs_uip);
964		break;
965	case RCTL_SUBJECT_TYPE_LOGINCLASS:
966		if (rule->rr_subject.rs_loginclass != NULL)
967			loginclass_hold(rule->rr_subject.rs_loginclass);
968		break;
969	default:
970		panic("rctl_rule_acquire_subject: unknown subject type %d",
971		    rule->rr_subject_type);
972	}
973}
974
975static void
976rctl_rule_release_subject(struct rctl_rule *rule)
977{
978
979	ASSERT_RACCT_ENABLED();
980
981	switch (rule->rr_subject_type) {
982	case RCTL_SUBJECT_TYPE_UNDEFINED:
983	case RCTL_SUBJECT_TYPE_PROCESS:
984		break;
985	case RCTL_SUBJECT_TYPE_JAIL:
986		if (rule->rr_subject.rs_prison_racct != NULL)
987			prison_racct_free(rule->rr_subject.rs_prison_racct);
988		break;
989	case RCTL_SUBJECT_TYPE_USER:
990		if (rule->rr_subject.rs_uip != NULL)
991			uifree(rule->rr_subject.rs_uip);
992		break;
993	case RCTL_SUBJECT_TYPE_LOGINCLASS:
994		if (rule->rr_subject.rs_loginclass != NULL)
995			loginclass_free(rule->rr_subject.rs_loginclass);
996		break;
997	default:
998		panic("rctl_rule_release_subject: unknown subject type %d",
999		    rule->rr_subject_type);
1000	}
1001}
1002
1003struct rctl_rule *
1004rctl_rule_alloc(int flags)
1005{
1006	struct rctl_rule *rule;
1007
1008	ASSERT_RACCT_ENABLED();
1009
1010	rule = uma_zalloc(rctl_rule_zone, flags);
1011	if (rule == NULL)
1012		return (NULL);
1013	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1014	rule->rr_subject.rs_proc = NULL;
1015	rule->rr_subject.rs_uip = NULL;
1016	rule->rr_subject.rs_loginclass = NULL;
1017	rule->rr_subject.rs_prison_racct = NULL;
1018	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1019	rule->rr_resource = RACCT_UNDEFINED;
1020	rule->rr_action = RCTL_ACTION_UNDEFINED;
1021	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1022	refcount_init(&rule->rr_refcount, 1);
1023
1024	return (rule);
1025}
1026
1027struct rctl_rule *
1028rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1029{
1030	struct rctl_rule *copy;
1031
1032	ASSERT_RACCT_ENABLED();
1033
1034	copy = uma_zalloc(rctl_rule_zone, flags);
1035	if (copy == NULL)
1036		return (NULL);
1037	copy->rr_subject_type = rule->rr_subject_type;
1038	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1039	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1040	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1041	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1042	copy->rr_per = rule->rr_per;
1043	copy->rr_resource = rule->rr_resource;
1044	copy->rr_action = rule->rr_action;
1045	copy->rr_amount = rule->rr_amount;
1046	refcount_init(&copy->rr_refcount, 1);
1047	rctl_rule_acquire_subject(copy);
1048
1049	return (copy);
1050}
1051
/*
 * Take an additional reference on 'rule'.  The caller must already hold
 * a reference: the count is asserted to be positive.
 */
void
rctl_rule_acquire(struct rctl_rule *rule)
{

	ASSERT_RACCT_ENABLED();
	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));

	refcount_acquire(&rule->rr_refcount);
}
1061
1062static void
1063rctl_rule_free(void *context, int pending)
1064{
1065	struct rctl_rule *rule;
1066
1067	rule = (struct rctl_rule *)context;
1068
1069	ASSERT_RACCT_ENABLED();
1070	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1071
1072	/*
1073	 * We don't need locking here; rule is guaranteed to be inaccessible.
1074	 */
1075
1076	rctl_rule_release_subject(rule);
1077	uma_zfree(rctl_rule_zone, rule);
1078}
1079
1080void
1081rctl_rule_release(struct rctl_rule *rule)
1082{
1083
1084	ASSERT_RACCT_ENABLED();
1085	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1086
1087	if (refcount_release(&rule->rr_refcount)) {
1088		/*
1089		 * rctl_rule_release() is often called when iterating
1090		 * over all the uidinfo structures in the system,
1091		 * holding uihashtbl_lock.  Since rctl_rule_free()
1092		 * might end up calling uifree(), this would lead
1093		 * to lock recursion.  Use taskqueue to avoid this.
1094		 */
1095		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1096		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1097	}
1098}
1099
1100static int
1101rctl_rule_fully_specified(const struct rctl_rule *rule)
1102{
1103
1104	ASSERT_RACCT_ENABLED();
1105
1106	switch (rule->rr_subject_type) {
1107	case RCTL_SUBJECT_TYPE_UNDEFINED:
1108		return (0);
1109	case RCTL_SUBJECT_TYPE_PROCESS:
1110		if (rule->rr_subject.rs_proc == NULL)
1111			return (0);
1112		break;
1113	case RCTL_SUBJECT_TYPE_USER:
1114		if (rule->rr_subject.rs_uip == NULL)
1115			return (0);
1116		break;
1117	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1118		if (rule->rr_subject.rs_loginclass == NULL)
1119			return (0);
1120		break;
1121	case RCTL_SUBJECT_TYPE_JAIL:
1122		if (rule->rr_subject.rs_prison_racct == NULL)
1123			return (0);
1124		break;
1125	default:
1126		panic("rctl_rule_fully_specified: unknown subject type %d",
1127		    rule->rr_subject_type);
1128	}
1129	if (rule->rr_resource == RACCT_UNDEFINED)
1130		return (0);
1131	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1132		return (0);
1133	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1134		return (0);
1135	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1136		return (0);
1137
1138	return (1);
1139}
1140
/*
 * Parse a textual rule of the form
 *
 *	subject:subject-id:resource:action=amount/per
 *
 * into a newly allocated rule.
 *
 * 'rulestr' is split in place with strsep(), so the caller's buffer is
 * modified.  Any field that is missing or empty is recorded as undefined
 * (or NULL for the subject), which lets the result serve as a filter.
 *
 * When the subject is a process, allproc_lock must be held by the caller
 * (see the sx_assert() below).
 *
 * Returns 0 and stores the rule in '*rulep' on success.  On failure an
 * errno value is returned, the partially built rule is released and
 * '*rulep' is left untouched.
 */
static int
rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
{
	struct rctl_rule *rule;
	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
	     *amountstr, *perstr;
	id_t id;
	int error = 0;

	ASSERT_RACCT_ENABLED();

	rule = rctl_rule_alloc(M_WAITOK);

	/*
	 * Chop the string into fields.  The action ends at either '=' or
	 * '/', the amount ends at '/', and whatever remains is the "per"
	 * part.
	 */
	subjectstr = strsep(&rulestr, ":");
	subject_idstr = strsep(&rulestr, ":");
	resourcestr = strsep(&rulestr, ":");
	actionstr = strsep(&rulestr, "=/");
	amountstr = strsep(&rulestr, "/");
	perstr = rulestr;

	if (subjectstr == NULL || subjectstr[0] == '\0')
		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
	else {
		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
		if (error != 0)
			goto out;
	}

	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
		rule->rr_subject.rs_proc = NULL;
		rule->rr_subject.rs_uip = NULL;
		rule->rr_subject.rs_loginclass = NULL;
		rule->rr_subject.rs_prison_racct = NULL;
	} else {
		/* A subject id only makes sense together with a subject type. */
		switch (rule->rr_subject_type) {
		case RCTL_SUBJECT_TYPE_UNDEFINED:
			error = EINVAL;
			goto out;
		case RCTL_SUBJECT_TYPE_PROCESS:
			error = str2id(subject_idstr, &id);
			if (error != 0)
				goto out;
			sx_assert(&allproc_lock, SA_LOCKED);
			rule->rr_subject.rs_proc = pfind(id);
			if (rule->rr_subject.rs_proc == NULL) {
				error = ESRCH;
				goto out;
			}
			/* Only the pointer is kept; drop the process lock. */
			PROC_UNLOCK(rule->rr_subject.rs_proc);
			break;
		case RCTL_SUBJECT_TYPE_USER:
			error = str2id(subject_idstr, &id);
			if (error != 0)
				goto out;
			rule->rr_subject.rs_uip = uifind(id);
			break;
		case RCTL_SUBJECT_TYPE_LOGINCLASS:
			rule->rr_subject.rs_loginclass =
			    loginclass_find(subject_idstr);
			if (rule->rr_subject.rs_loginclass == NULL) {
				error = ENAMETOOLONG;
				goto out;
			}
			break;
		case RCTL_SUBJECT_TYPE_JAIL:
			rule->rr_subject.rs_prison_racct =
			    prison_racct_find(subject_idstr);
			if (rule->rr_subject.rs_prison_racct == NULL) {
				error = ENAMETOOLONG;
				goto out;
			}
			break;
               default:
                       panic("rctl_string_to_rule: unknown subject type %d",
                           rule->rr_subject_type);
               }
	}

	if (resourcestr == NULL || resourcestr[0] == '\0')
		rule->rr_resource = RACCT_UNDEFINED;
	else {
		error = str2value(resourcestr, &rule->rr_resource,
		    resourcenames);
		if (error != 0)
			goto out;
	}

	if (actionstr == NULL || actionstr[0] == '\0')
		rule->rr_action = RCTL_ACTION_UNDEFINED;
	else {
		error = str2value(actionstr, &rule->rr_action, actionnames);
		if (error != 0)
			goto out;
	}

	if (amountstr == NULL || amountstr[0] == '\0')
		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
	else {
		error = str2int64(amountstr, &rule->rr_amount);
		if (error != 0)
			goto out;
		/*
		 * Resources flagged RACCT_IS_IN_MILLIONS are stored scaled
		 * by 1000000; refuse amounts that would overflow once
		 * scaled.
		 */
		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
			if (rule->rr_amount > INT64_MAX / 1000000) {
				error = ERANGE;
				goto out;
			}
			rule->rr_amount *= 1000000;
		}
	}

	if (perstr == NULL || perstr[0] == '\0')
		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
	else {
		error = str2value(perstr, &rule->rr_per, subjectnames);
		if (error != 0)
			goto out;
	}

out:
	/* Hand the rule to the caller on success, drop it otherwise. */
	if (error == 0)
		*rulep = rule;
	else
		rctl_rule_release(rule);

	return (error);
}
1267
/*
 * Link a rule with all the subjects it applies to.
 *
 * 'rule' must be fully specified (asserted below).  Returns EOPNOTSUPP
 * for action/resource/"per" combinations that are rejected here, and 0
 * once the rule has been linked.
 *
 * For subjects other than a single process, every process in the system
 * is visited, so allproc_lock must be held by the caller (see the
 * sx_assert() before the loop).
 */
int
rctl_rule_add(struct rctl_rule *rule)
{
	struct proc *p;
	struct ucred *cred;
	struct uidinfo *uip;
	struct prison *pr;
	struct prison_racct *prr;
	struct loginclass *lc;
	struct rctl_rule *rule2;
	int match;

	ASSERT_RACCT_ENABLED();
	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));

	/*
	 * Some rules just don't make sense, like "deny" rule for an undeniable
	 * resource.  The exception are the RSS and %CPU resources - they are
	 * not deniable in the racct sense, but the limit is enforced in
	 * a different way.
	 */
	if (rule->rr_action == RCTL_ACTION_DENY &&
	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
	    rule->rr_resource != RACCT_RSS &&
	    rule->rr_resource != RACCT_PCTCPU) {
		return (EOPNOTSUPP);
	}

	/* "throttle" is only accepted for decaying resources... */
	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
	    !RACCT_IS_DECAYING(rule->rr_resource)) {
		return (EOPNOTSUPP);
	}

	/* ...and never for %CPU. */
	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
	    rule->rr_resource == RACCT_PCTCPU) {
		return (EOPNOTSUPP);
	}

	/* Per-process limits on sloppy resources are rejected. */
	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
	    RACCT_IS_SLOPPY(rule->rr_resource)) {
		return (EOPNOTSUPP);
	}

	/*
	 * Make sure there are no duplicated rules.  Also, for the "deny"
	 * rules, remove ones differing only by "amount".
	 *
	 * The result of rctl_rule_remove() is deliberately not checked
	 * here; finding nothing to remove is not an error.
	 */
	if (rule->rr_action == RCTL_ACTION_DENY) {
		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
		rctl_rule_remove(rule2);
		rctl_rule_release(rule2);
	} else
		rctl_rule_remove(rule);

	/* First attach the rule to the racct of the subject itself. */
	switch (rule->rr_subject_type) {
	case RCTL_SUBJECT_TYPE_PROCESS:
		p = rule->rr_subject.rs_proc;
		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));

		rctl_racct_add_rule(p->p_racct, rule);
		/*
		 * In case of per-process rule, we don't have anything more
		 * to do.
		 */
		return (0);

	case RCTL_SUBJECT_TYPE_USER:
		uip = rule->rr_subject.rs_uip;
		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
		rctl_racct_add_rule(uip->ui_racct, rule);
		break;

	case RCTL_SUBJECT_TYPE_LOGINCLASS:
		lc = rule->rr_subject.rs_loginclass;
		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
		rctl_racct_add_rule(lc->lc_racct, rule);
		break;

	case RCTL_SUBJECT_TYPE_JAIL:
		prr = rule->rr_subject.rs_prison_racct;
		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
		rctl_racct_add_rule(prr->prr_racct, rule);
		break;

	default:
		panic("rctl_rule_add: unknown subject type %d",
		    rule->rr_subject_type);
	}

	/*
	 * Now go through all the processes and add the new rule to the ones
	 * it applies to.
	 */
	sx_assert(&allproc_lock, SA_LOCKED);
	FOREACH_PROC_IN_SYSTEM(p) {
		cred = p->p_ucred;
		/*
		 * In each case below, "break" means the process matches and
		 * gets the rule; "continue" skips to the next process.
		 */
		switch (rule->rr_subject_type) {
		case RCTL_SUBJECT_TYPE_USER:
			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
				break;
			continue;
		case RCTL_SUBJECT_TYPE_LOGINCLASS:
			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
				break;
			continue;
		case RCTL_SUBJECT_TYPE_JAIL:
			/* A jail rule also covers processes in child jails. */
			match = 0;
			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
					match = 1;
					break;
				}
			}
			if (match)
				break;
			continue;
		default:
			panic("rctl_rule_add: unknown subject type %d",
			    rule->rr_subject_type);
		}

		rctl_racct_add_rule(p->p_racct, rule);
	}

	return (0);
}
1399
/*
 * "Pre" hook handed to the *_racct_foreach() iterators: takes the racct
 * lock.  Presumably it runs before the per-racct callbacks, which assert
 * that the lock is held -- confirm against the foreach implementations.
 */
static void
rctl_rule_pre_callback(void)
{

	RACCT_LOCK();
}
1406
/*
 * "Post" hook handed to the *_racct_foreach() iterators: drops the racct
 * lock taken by rctl_rule_pre_callback().
 */
static void
rctl_rule_post_callback(void)
{

	RACCT_UNLOCK();
}
1413
/*
 * Per-racct callback for rctl_rule_remove().
 *
 * 'arg2' is the filter rule and 'arg3' points to an int that accumulates
 * the value returned by rctl_racct_remove_rules() for each racct visited.
 * Must be called with the racct lock held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *foundp = arg3;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	*foundp += rctl_racct_remove_rules(racct, filter);
}
1427
1428/*
1429 * Remove all rules that match the filter.
1430 */
1431int
1432rctl_rule_remove(struct rctl_rule *filter)
1433{
1434	struct proc *p;
1435	int found = 0;
1436
1437	ASSERT_RACCT_ENABLED();
1438
1439	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1440	    filter->rr_subject.rs_proc != NULL) {
1441		p = filter->rr_subject.rs_proc;
1442		RACCT_LOCK();
1443		found = rctl_racct_remove_rules(p->p_racct, filter);
1444		RACCT_UNLOCK();
1445		if (found)
1446			return (0);
1447		return (ESRCH);
1448	}
1449
1450	loginclass_racct_foreach(rctl_rule_remove_callback,
1451	    rctl_rule_pre_callback, rctl_rule_post_callback,
1452	    filter, (void *)&found);
1453	ui_racct_foreach(rctl_rule_remove_callback,
1454	    rctl_rule_pre_callback, rctl_rule_post_callback,
1455	    filter, (void *)&found);
1456	prison_racct_foreach(rctl_rule_remove_callback,
1457	    rctl_rule_pre_callback, rctl_rule_post_callback,
1458	    filter, (void *)&found);
1459
1460	sx_assert(&allproc_lock, SA_LOCKED);
1461	RACCT_LOCK();
1462	FOREACH_PROC_IN_SYSTEM(p) {
1463		found += rctl_racct_remove_rules(p->p_racct, filter);
1464	}
1465	RACCT_UNLOCK();
1466
1467	if (found)
1468		return (0);
1469	return (ESRCH);
1470}
1471
1472/*
1473 * Appends a rule to the sbuf.
1474 */
1475static void
1476rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1477{
1478	int64_t amount;
1479
1480	ASSERT_RACCT_ENABLED();
1481
1482	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1483
1484	switch (rule->rr_subject_type) {
1485	case RCTL_SUBJECT_TYPE_PROCESS:
1486		if (rule->rr_subject.rs_proc == NULL)
1487			sbuf_putc(sb, ':');
1488		else
1489			sbuf_printf(sb, "%d:",
1490			    rule->rr_subject.rs_proc->p_pid);
1491		break;
1492	case RCTL_SUBJECT_TYPE_USER:
1493		if (rule->rr_subject.rs_uip == NULL)
1494			sbuf_putc(sb, ':');
1495		else
1496			sbuf_printf(sb, "%d:",
1497			    rule->rr_subject.rs_uip->ui_uid);
1498		break;
1499	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1500		if (rule->rr_subject.rs_loginclass == NULL)
1501			sbuf_putc(sb, ':');
1502		else
1503			sbuf_printf(sb, "%s:",
1504			    rule->rr_subject.rs_loginclass->lc_name);
1505		break;
1506	case RCTL_SUBJECT_TYPE_JAIL:
1507		if (rule->rr_subject.rs_prison_racct == NULL)
1508			sbuf_putc(sb, ':');
1509		else
1510			sbuf_printf(sb, "%s:",
1511			    rule->rr_subject.rs_prison_racct->prr_name);
1512		break;
1513	default:
1514		panic("rctl_rule_to_sbuf: unknown subject type %d",
1515		    rule->rr_subject_type);
1516	}
1517
1518	amount = rule->rr_amount;
1519	if (amount != RCTL_AMOUNT_UNDEFINED &&
1520	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1521		amount /= 1000000;
1522
1523	sbuf_printf(sb, "%s:%s=%jd",
1524	    rctl_resource_name(rule->rr_resource),
1525	    rctl_action_name(rule->rr_action),
1526	    amount);
1527
1528	if (rule->rr_per != rule->rr_subject_type)
1529		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1530}
1531
1532/*
1533 * Routine used by RCTL syscalls to read in input string.
1534 */
1535static int
1536rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1537{
1538	char *str;
1539	int error;
1540
1541	ASSERT_RACCT_ENABLED();
1542
1543	if (inbuflen <= 0)
1544		return (EINVAL);
1545	if (inbuflen > RCTL_MAX_INBUFSIZE)
1546		return (E2BIG);
1547
1548	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1549	error = copyinstr(inbufp, str, inbuflen, NULL);
1550	if (error != 0) {
1551		free(str, M_RCTL);
1552		return (error);
1553	}
1554
1555	*inputstr = str;
1556
1557	return (0);
1558}
1559
1560/*
1561 * Routine used by RCTL syscalls to write out output string.
1562 */
1563static int
1564rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1565{
1566	int error;
1567
1568	ASSERT_RACCT_ENABLED();
1569
1570	if (outputsbuf == NULL)
1571		return (0);
1572
1573	sbuf_finish(outputsbuf);
1574	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1575		sbuf_delete(outputsbuf);
1576		return (ERANGE);
1577	}
1578	error = copyout(sbuf_data(outputsbuf), outbufp,
1579	    sbuf_len(outputsbuf) + 1);
1580	sbuf_delete(outputsbuf);
1581	return (error);
1582}
1583
1584static struct sbuf *
1585rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1586{
1587	struct sbuf *sb;
1588	int64_t amount;
1589	int i;
1590
1591	ASSERT_RACCT_ENABLED();
1592
1593	sb = sbuf_new_auto();
1594	for (i = 0; i <= RACCT_MAX; i++) {
1595		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1596			continue;
1597		RACCT_LOCK();
1598		amount = racct->r_resources[i];
1599		RACCT_UNLOCK();
1600		if (RACCT_IS_IN_MILLIONS(i))
1601			amount /= 1000000;
1602		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1603	}
1604	sbuf_setpos(sb, sbuf_len(sb) - 1);
1605	return (sb);
1606}
1607
1608int
1609sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1610{
1611	struct rctl_rule *filter;
1612	struct sbuf *outputsbuf = NULL;
1613	struct proc *p;
1614	struct uidinfo *uip;
1615	struct loginclass *lc;
1616	struct prison_racct *prr;
1617	char *inputstr;
1618	int error;
1619
1620	if (!racct_enable)
1621		return (ENOSYS);
1622
1623	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1624	if (error != 0)
1625		return (error);
1626
1627	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1628	if (error != 0)
1629		return (error);
1630
1631	sx_slock(&allproc_lock);
1632	error = rctl_string_to_rule(inputstr, &filter);
1633	free(inputstr, M_RCTL);
1634	if (error != 0) {
1635		sx_sunlock(&allproc_lock);
1636		return (error);
1637	}
1638
1639	switch (filter->rr_subject_type) {
1640	case RCTL_SUBJECT_TYPE_PROCESS:
1641		p = filter->rr_subject.rs_proc;
1642		if (p == NULL) {
1643			error = EINVAL;
1644			goto out;
1645		}
1646		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1647		break;
1648	case RCTL_SUBJECT_TYPE_USER:
1649		uip = filter->rr_subject.rs_uip;
1650		if (uip == NULL) {
1651			error = EINVAL;
1652			goto out;
1653		}
1654		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1655		break;
1656	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1657		lc = filter->rr_subject.rs_loginclass;
1658		if (lc == NULL) {
1659			error = EINVAL;
1660			goto out;
1661		}
1662		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1663		break;
1664	case RCTL_SUBJECT_TYPE_JAIL:
1665		prr = filter->rr_subject.rs_prison_racct;
1666		if (prr == NULL) {
1667			error = EINVAL;
1668			goto out;
1669		}
1670		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1671		break;
1672	default:
1673		error = EINVAL;
1674	}
1675out:
1676	rctl_rule_release(filter);
1677	sx_sunlock(&allproc_lock);
1678	if (error != 0)
1679		return (error);
1680
1681	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1682
1683	return (error);
1684}
1685
1686static void
1687rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1688{
1689	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1690	struct rctl_rule_link *link;
1691	struct sbuf *sb = (struct sbuf *)arg3;
1692
1693	ASSERT_RACCT_ENABLED();
1694	RACCT_LOCK_ASSERT();
1695
1696	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1697		if (!rctl_rule_matches(link->rrl_rule, filter))
1698			continue;
1699		rctl_rule_to_sbuf(sb, link->rrl_rule);
1700		sbuf_putc(sb, ',');
1701	}
1702}
1703
1704int
1705sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1706{
1707	struct sbuf *sb;
1708	struct rctl_rule *filter;
1709	struct rctl_rule_link *link;
1710	struct proc *p;
1711	char *inputstr, *buf;
1712	size_t bufsize;
1713	int error;
1714
1715	if (!racct_enable)
1716		return (ENOSYS);
1717
1718	error = priv_check(td, PRIV_RCTL_GET_RULES);
1719	if (error != 0)
1720		return (error);
1721
1722	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1723	if (error != 0)
1724		return (error);
1725
1726	sx_slock(&allproc_lock);
1727	error = rctl_string_to_rule(inputstr, &filter);
1728	free(inputstr, M_RCTL);
1729	if (error != 0) {
1730		sx_sunlock(&allproc_lock);
1731		return (error);
1732	}
1733
1734	bufsize = uap->outbuflen;
1735	if (bufsize > rctl_maxbufsize) {
1736		sx_sunlock(&allproc_lock);
1737		return (E2BIG);
1738	}
1739
1740	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1741	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1742	KASSERT(sb != NULL, ("sbuf_new failed"));
1743
1744	FOREACH_PROC_IN_SYSTEM(p) {
1745		RACCT_LOCK();
1746		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1747			/*
1748			 * Non-process rules will be added to the buffer later.
1749			 * Adding them here would result in duplicated output.
1750			 */
1751			if (link->rrl_rule->rr_subject_type !=
1752			    RCTL_SUBJECT_TYPE_PROCESS)
1753				continue;
1754			if (!rctl_rule_matches(link->rrl_rule, filter))
1755				continue;
1756			rctl_rule_to_sbuf(sb, link->rrl_rule);
1757			sbuf_putc(sb, ',');
1758		}
1759		RACCT_UNLOCK();
1760	}
1761
1762	loginclass_racct_foreach(rctl_get_rules_callback,
1763	    rctl_rule_pre_callback, rctl_rule_post_callback,
1764	    filter, sb);
1765	ui_racct_foreach(rctl_get_rules_callback,
1766	    rctl_rule_pre_callback, rctl_rule_post_callback,
1767	    filter, sb);
1768	prison_racct_foreach(rctl_get_rules_callback,
1769	    rctl_rule_pre_callback, rctl_rule_post_callback,
1770	    filter, sb);
1771	if (sbuf_error(sb) == ENOMEM) {
1772		error = ERANGE;
1773		goto out;
1774	}
1775
1776	/*
1777	 * Remove trailing ",".
1778	 */
1779	if (sbuf_len(sb) > 0)
1780		sbuf_setpos(sb, sbuf_len(sb) - 1);
1781
1782	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1783out:
1784	rctl_rule_release(filter);
1785	sx_sunlock(&allproc_lock);
1786	free(buf, M_RCTL);
1787	return (error);
1788}
1789
1790int
1791sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1792{
1793	struct sbuf *sb;
1794	struct rctl_rule *filter;
1795	struct rctl_rule_link *link;
1796	char *inputstr, *buf;
1797	size_t bufsize;
1798	int error;
1799
1800	if (!racct_enable)
1801		return (ENOSYS);
1802
1803	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1804	if (error != 0)
1805		return (error);
1806
1807	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1808	if (error != 0)
1809		return (error);
1810
1811	sx_slock(&allproc_lock);
1812	error = rctl_string_to_rule(inputstr, &filter);
1813	free(inputstr, M_RCTL);
1814	if (error != 0) {
1815		sx_sunlock(&allproc_lock);
1816		return (error);
1817	}
1818
1819	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1820		rctl_rule_release(filter);
1821		sx_sunlock(&allproc_lock);
1822		return (EINVAL);
1823	}
1824	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1825		rctl_rule_release(filter);
1826		sx_sunlock(&allproc_lock);
1827		return (EOPNOTSUPP);
1828	}
1829	if (filter->rr_subject.rs_proc == NULL) {
1830		rctl_rule_release(filter);
1831		sx_sunlock(&allproc_lock);
1832		return (EINVAL);
1833	}
1834
1835	bufsize = uap->outbuflen;
1836	if (bufsize > rctl_maxbufsize) {
1837		rctl_rule_release(filter);
1838		sx_sunlock(&allproc_lock);
1839		return (E2BIG);
1840	}
1841
1842	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1843	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1844	KASSERT(sb != NULL, ("sbuf_new failed"));
1845
1846	RACCT_LOCK();
1847	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1848	    rrl_next) {
1849		rctl_rule_to_sbuf(sb, link->rrl_rule);
1850		sbuf_putc(sb, ',');
1851	}
1852	RACCT_UNLOCK();
1853	if (sbuf_error(sb) == ENOMEM) {
1854		error = ERANGE;
1855		sbuf_delete(sb);
1856		goto out;
1857	}
1858
1859	/*
1860	 * Remove trailing ",".
1861	 */
1862	if (sbuf_len(sb) > 0)
1863		sbuf_setpos(sb, sbuf_len(sb) - 1);
1864
1865	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1866out:
1867	rctl_rule_release(filter);
1868	sx_sunlock(&allproc_lock);
1869	free(buf, M_RCTL);
1870	return (error);
1871}
1872
1873int
1874sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1875{
1876	struct rctl_rule *rule;
1877	char *inputstr;
1878	int error;
1879
1880	if (!racct_enable)
1881		return (ENOSYS);
1882
1883	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1884	if (error != 0)
1885		return (error);
1886
1887	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1888	if (error != 0)
1889		return (error);
1890
1891	sx_slock(&allproc_lock);
1892	error = rctl_string_to_rule(inputstr, &rule);
1893	free(inputstr, M_RCTL);
1894	if (error != 0) {
1895		sx_sunlock(&allproc_lock);
1896		return (error);
1897	}
1898	/*
1899	 * The 'per' part of a rule is optional.
1900	 */
1901	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1902	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1903		rule->rr_per = rule->rr_subject_type;
1904
1905	if (!rctl_rule_fully_specified(rule)) {
1906		error = EINVAL;
1907		goto out;
1908	}
1909
1910	error = rctl_rule_add(rule);
1911
1912out:
1913	rctl_rule_release(rule);
1914	sx_sunlock(&allproc_lock);
1915	return (error);
1916}
1917
1918int
1919sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1920{
1921	struct rctl_rule *filter;
1922	char *inputstr;
1923	int error;
1924
1925	if (!racct_enable)
1926		return (ENOSYS);
1927
1928	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1929	if (error != 0)
1930		return (error);
1931
1932	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1933	if (error != 0)
1934		return (error);
1935
1936	sx_slock(&allproc_lock);
1937	error = rctl_string_to_rule(inputstr, &filter);
1938	free(inputstr, M_RCTL);
1939	if (error != 0) {
1940		sx_sunlock(&allproc_lock);
1941		return (error);
1942	}
1943
1944	error = rctl_rule_remove(filter);
1945	rctl_rule_release(filter);
1946	sx_sunlock(&allproc_lock);
1947
1948	return (error);
1949}
1950
/*
 * Update RCTL rule list after credential change.
 *
 * Rebuilds the list of rules linked to process 'p' so that it consists of
 * the process's own per-process rules plus the rules currently linked to
 * the user, login class and jail named by 'newcred'.
 *
 * Link entries are allocated with M_WAITOK, which cannot be done with the
 * racct lock held.  The function therefore counts the rules under the
 * lock, drops it to allocate, retakes it to fill the entries in, and
 * starts over if the rule lists changed size in between.
 *
 * Does nothing when racct is disabled.  The caller must not hold the
 * process lock of 'p'.
 *
 * NOTE(review): only newcred->cr_ruidinfo is consulted here, whereas
 * rctl_rule_add() links user rules to processes matching either
 * cr_uidinfo or cr_ruidinfo -- confirm the difference is intended.
 */
void
rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
{
	LIST_HEAD(, rctl_rule_link) newrules;
	struct rctl_rule_link *link, *newlink;
	struct uidinfo *newuip;
	struct loginclass *newlc;
	struct prison_racct *newprr;
	int rulecnt, i;

	if (!racct_enable)
		return;

	PROC_LOCK_ASSERT(p, MA_NOTOWNED);

	newuip = newcred->cr_ruidinfo;
	newlc = newcred->cr_loginclass;
	newprr = newcred->cr_prison->pr_prison_racct;

	LIST_INIT(&newrules);

again:
	/*
	 * First, count the rules that apply to the process with new
	 * credentials.
	 */
	rulecnt = 0;
	RACCT_LOCK();
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		if (link->rrl_rule->rr_subject_type ==
		    RCTL_SUBJECT_TYPE_PROCESS)
			rulecnt++;
	}
	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
		rulecnt++;
	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
		rulecnt++;
	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
		rulecnt++;
	RACCT_UNLOCK();

	/*
	 * Create temporary list.  We've dropped the rctl_lock in order
	 * to use M_WAITOK.
	 */
	for (i = 0; i < rulecnt; i++) {
		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
		newlink->rrl_rule = NULL;
		newlink->rrl_exceeded = 0;
		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
	}

	newlink = LIST_FIRST(&newrules);

	/*
	 * Assign rules to the newly allocated list entries.
	 *
	 * 'rulecnt' is decremented for every entry filled in.  Running out
	 * of entries (newlink == NULL) means the lists grew while the lock
	 * was dropped; finishing with rulecnt != 0 means they shrank.
	 * Either way the attempt is abandoned via 'goaround'.
	 */
	RACCT_LOCK();
	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
		if (link->rrl_rule->rr_subject_type ==
		    RCTL_SUBJECT_TYPE_PROCESS) {
			if (newlink == NULL)
				goto goaround;
			rctl_rule_acquire(link->rrl_rule);
			newlink->rrl_rule = link->rrl_rule;
			newlink->rrl_exceeded = link->rrl_exceeded;
			newlink = LIST_NEXT(newlink, rrl_next);
			rulecnt--;
		}
	}

	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
		if (newlink == NULL)
			goto goaround;
		rctl_rule_acquire(link->rrl_rule);
		newlink->rrl_rule = link->rrl_rule;
		newlink->rrl_exceeded = link->rrl_exceeded;
		newlink = LIST_NEXT(newlink, rrl_next);
		rulecnt--;
	}

	if (rulecnt == 0) {
		/*
		 * Free the old rule list.
		 */
		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
			link = LIST_FIRST(&p->p_racct->r_rule_links);
			LIST_REMOVE(link, rrl_next);
			rctl_rule_release(link->rrl_rule);
			uma_zfree(rctl_rule_link_zone, link);
		}

		/*
		 * Replace lists and we're done.
		 *
		 * XXX: Is there any way to switch list heads instead
		 *      of iterating here?
		 */
		while (!LIST_EMPTY(&newrules)) {
			newlink = LIST_FIRST(&newrules);
			LIST_REMOVE(newlink, rrl_next);
			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
			    newlink, rrl_next);
		}

		RACCT_UNLOCK();

		return;
	}

goaround:
	RACCT_UNLOCK();

	/*
	 * Rule list changed while we were not holding the rctl_lock.
	 * Free the new list and try again.
	 */
	while (!LIST_EMPTY(&newrules)) {
		newlink = LIST_FIRST(&newrules);
		LIST_REMOVE(newlink, rrl_next);
		/* Entries that were never filled in hold no reference. */
		if (newlink->rrl_rule != NULL)
			rctl_rule_release(newlink->rrl_rule);
		uma_zfree(rctl_rule_link_zone, newlink);
	}

	goto again;
}
2101
2102/*
2103 * Assign RCTL rules to the newly created process.
2104 */
2105int
2106rctl_proc_fork(struct proc *parent, struct proc *child)
2107{
2108	struct rctl_rule *rule;
2109	struct rctl_rule_link *link;
2110	int error;
2111
2112	ASSERT_RACCT_ENABLED();
2113	RACCT_LOCK_ASSERT();
2114	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2115
2116	LIST_INIT(&child->p_racct->r_rule_links);
2117
2118	/*
2119	 * Go through limits applicable to the parent and assign them
2120	 * to the child.  Rules with 'process' subject have to be duplicated
2121	 * in order to make their rr_subject point to the new process.
2122	 */
2123	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2124		if (link->rrl_rule->rr_subject_type ==
2125		    RCTL_SUBJECT_TYPE_PROCESS) {
2126			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2127			if (rule == NULL)
2128				goto fail;
2129			KASSERT(rule->rr_subject.rs_proc == parent,
2130			    ("rule->rr_subject.rs_proc != parent"));
2131			rule->rr_subject.rs_proc = child;
2132			error = rctl_racct_add_rule_locked(child->p_racct,
2133			    rule);
2134			rctl_rule_release(rule);
2135			if (error != 0)
2136				goto fail;
2137		} else {
2138			error = rctl_racct_add_rule_locked(child->p_racct,
2139			    link->rrl_rule);
2140			if (error != 0)
2141				goto fail;
2142		}
2143	}
2144
2145	return (0);
2146
2147fail:
2148	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2149		link = LIST_FIRST(&child->p_racct->r_rule_links);
2150		LIST_REMOVE(link, rrl_next);
2151		rctl_rule_release(link->rrl_rule);
2152		uma_zfree(rctl_rule_link_zone, link);
2153	}
2154
2155	return (EAGAIN);
2156}
2157
2158/*
2159 * Release rules attached to the racct.
2160 */
2161void
2162rctl_racct_release(struct racct *racct)
2163{
2164	struct rctl_rule_link *link;
2165
2166	ASSERT_RACCT_ENABLED();
2167	RACCT_LOCK_ASSERT();
2168
2169	while (!LIST_EMPTY(&racct->r_rule_links)) {
2170		link = LIST_FIRST(&racct->r_rule_links);
2171		LIST_REMOVE(link, rrl_next);
2172		rctl_rule_release(link->rrl_rule);
2173		uma_zfree(rctl_rule_link_zone, link);
2174	}
2175}
2176
2177static void
2178rctl_init(void)
2179{
2180
2181	if (!racct_enable)
2182		return;
2183
2184	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2185	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2186	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2187	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2188	    UMA_ALIGN_PTR, 0);
2189
2190	/*
2191	 * Set default values, making sure not to overwrite the ones
2192	 * fetched from tunables.  Most of those could be set at the
2193	 * declaration, except for the rctl_throttle_max - we cannot
2194	 * set it there due to hz not being compile time constant.
2195	 */
2196	if (rctl_throttle_min < 1)
2197		rctl_throttle_min = 1;
2198	if (rctl_throttle_max < rctl_throttle_min)
2199		rctl_throttle_max = 2 * hz;
2200	if (rctl_throttle_pct < 0)
2201		rctl_throttle_pct = 100;
2202	if (rctl_throttle_pct2 < 0)
2203		rctl_throttle_pct2 = 100;
2204}
2205
2206#else /* !RCTL */
2207
2208#include <sys/types.h>
2209#include <sys/errno.h>
2210#include <sys/sysproto.h>
2211
/*
 * Stub for kernels built without RCTL: the system call always fails with
 * ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

	return (ENOSYS);
}
2218
/*
 * Stub for kernels built without RCTL: the system call always fails with
 * ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

	return (ENOSYS);
}
2225
/*
 * Stub for kernels built without RCTL: the system call always fails with
 * ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

	return (ENOSYS);
}
2232
/*
 * Stub for kernels built without RCTL: the system call always fails with
 * ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

	return (ENOSYS);
}
2239
/*
 * Stub for kernels built without RCTL: the system call always fails with
 * ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

	return (ENOSYS);
}
2246
2247#endif /* RCTL */
2248