1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2010 The FreeBSD Foundation
5 *
6 * This software was developed by Edward Tomasz Napierala under sponsorship
7 * from the FreeBSD Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in the
16 *    documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <sys/cdefs.h>
32#include "opt_sched.h"
33
34#include <sys/param.h>
35#include <sys/buf.h>
36#include <sys/systm.h>
37#include <sys/eventhandler.h>
38#include <sys/jail.h>
39#include <sys/kernel.h>
40#include <sys/kthread.h>
41#include <sys/lock.h>
42#include <sys/loginclass.h>
43#include <sys/malloc.h>
44#include <sys/mutex.h>
45#include <sys/proc.h>
46#include <sys/racct.h>
47#include <sys/resourcevar.h>
48#include <sys/sbuf.h>
49#include <sys/sched.h>
50#include <sys/sdt.h>
51#include <sys/smp.h>
52#include <sys/sx.h>
53#include <sys/sysctl.h>
54#include <sys/sysproto.h>
55#include <sys/umtxvar.h>
56#include <machine/smp.h>
57
58#ifdef RCTL
59#include <sys/rctl.h>
60#endif
61
62FEATURE(racct, "Resource Accounting");
63
64/*
65 * Do not block processes that have their %cpu usage <= pcpu_threshold.
66 */
67static int pcpu_threshold = 1;
68#ifdef RACCT_DEFAULT_TO_DISABLED
69bool __read_frequently racct_enable = false;
70#else
71bool __read_frequently racct_enable = true;
72#endif
73
74SYSCTL_NODE(_kern, OID_AUTO, racct, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
75    "Resource Accounting");
76SYSCTL_BOOL(_kern_racct, OID_AUTO, enable, CTLFLAG_RDTUN, &racct_enable,
77    0, "Enable RACCT/RCTL");
78SYSCTL_UINT(_kern_racct, OID_AUTO, pcpu_threshold, CTLFLAG_RW, &pcpu_threshold,
79    0, "Processes with higher %cpu usage than this value can be throttled.");
80
81/*
82 * How many seconds it takes to use the scheduler %cpu calculations.  When a
83 * process starts, we compute its %cpu usage by dividing its runtime by the
84 * process wall clock time.  After RACCT_PCPU_SECS pass, we use the value
85 * provided by the scheduler.
86 */
87#define RACCT_PCPU_SECS		3
88
89struct mtx racct_lock;
90MTX_SYSINIT(racct_lock, &racct_lock, "racct lock", MTX_DEF);
91
92static uma_zone_t racct_zone;
93
94static void racct_sub_racct(struct racct *dest, const struct racct *src);
95static void racct_sub_cred_locked(struct ucred *cred, int resource,
96		uint64_t amount);
97static void racct_add_cred_locked(struct ucred *cred, int resource,
98		uint64_t amount);
99
100SDT_PROVIDER_DEFINE(racct);
101SDT_PROBE_DEFINE3(racct, , rusage, add,
102    "struct proc *", "int", "uint64_t");
103SDT_PROBE_DEFINE3(racct, , rusage, add__failure,
104    "struct proc *", "int", "uint64_t");
105SDT_PROBE_DEFINE3(racct, , rusage, add__buf,
106    "struct proc *", "const struct buf *", "int");
107SDT_PROBE_DEFINE3(racct, , rusage, add__cred,
108    "struct ucred *", "int", "uint64_t");
109SDT_PROBE_DEFINE3(racct, , rusage, add__force,
110    "struct proc *", "int", "uint64_t");
111SDT_PROBE_DEFINE3(racct, , rusage, set,
112    "struct proc *", "int", "uint64_t");
113SDT_PROBE_DEFINE3(racct, , rusage, set__failure,
114    "struct proc *", "int", "uint64_t");
115SDT_PROBE_DEFINE3(racct, , rusage, set__force,
116    "struct proc *", "int", "uint64_t");
117SDT_PROBE_DEFINE3(racct, , rusage, sub,
118    "struct proc *", "int", "uint64_t");
119SDT_PROBE_DEFINE3(racct, , rusage, sub__cred,
120    "struct ucred *", "int", "uint64_t");
121SDT_PROBE_DEFINE1(racct, , racct, create,
122    "struct racct *");
123SDT_PROBE_DEFINE1(racct, , racct, destroy,
124    "struct racct *");
125SDT_PROBE_DEFINE2(racct, , racct, join,
126    "struct racct *", "struct racct *");
127SDT_PROBE_DEFINE2(racct, , racct, join__failure,
128    "struct racct *", "struct racct *");
129SDT_PROBE_DEFINE2(racct, , racct, leave,
130    "struct racct *", "struct racct *");
131
132int racct_types[] = {
133	[RACCT_CPU] =
134		RACCT_IN_MILLIONS,
135	[RACCT_DATA] =
136		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
137	[RACCT_STACK] =
138		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
139	[RACCT_CORE] =
140		RACCT_DENIABLE,
141	[RACCT_RSS] =
142		RACCT_RECLAIMABLE,
143	[RACCT_MEMLOCK] =
144		RACCT_RECLAIMABLE | RACCT_DENIABLE,
145	[RACCT_NPROC] =
146		RACCT_RECLAIMABLE | RACCT_DENIABLE,
147	[RACCT_NOFILE] =
148		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
149	[RACCT_VMEM] =
150		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
151	[RACCT_NPTS] =
152		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
153	[RACCT_SWAP] =
154		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
155	[RACCT_NTHR] =
156		RACCT_RECLAIMABLE | RACCT_DENIABLE,
157	[RACCT_MSGQQUEUED] =
158		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
159	[RACCT_MSGQSIZE] =
160		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
161	[RACCT_NMSGQ] =
162		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
163	[RACCT_NSEM] =
164		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
165	[RACCT_NSEMOP] =
166		RACCT_RECLAIMABLE | RACCT_INHERITABLE | RACCT_DENIABLE,
167	[RACCT_NSHM] =
168		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
169	[RACCT_SHMSIZE] =
170		RACCT_RECLAIMABLE | RACCT_DENIABLE | RACCT_SLOPPY,
171	[RACCT_WALLCLOCK] =
172		RACCT_IN_MILLIONS,
173	[RACCT_PCTCPU] =
174		RACCT_DECAYING | RACCT_DENIABLE | RACCT_IN_MILLIONS,
175	[RACCT_READBPS] =
176		RACCT_DECAYING,
177	[RACCT_WRITEBPS] =
178		RACCT_DECAYING,
179	[RACCT_READIOPS] =
180		RACCT_DECAYING,
181	[RACCT_WRITEIOPS] =
182		RACCT_DECAYING };
183
184static const fixpt_t RACCT_DECAY_FACTOR = 0.3 * FSCALE;
185
#ifdef SCHED_4BSD
/*
 * Precomputed intermediate values for the %cpu calculations, so that no
 * floating point arithmetic is needed in the kernel at run time:
 *
 *	ccpu_exp[k] = FSCALE * (ccpu/FSCALE)^k = FSCALE * exp(-k/20)
 *
 * Only the 4BSD scheduler needs the table; in ULE ccpu equals zero, which
 * makes the calculations more straightforward.
 */
fixpt_t ccpu_exp[] = {
	[0] = 1 * FSCALE,
	[1] = 0.95122942450071400909 * FSCALE,
	[2] = 0.90483741803595957316 * FSCALE,
	[3] = 0.86070797642505780722 * FSCALE,
	[4] = 0.81873075307798185866 * FSCALE,
	[5] = 0.77880078307140486824 * FSCALE,
	[6] = 0.74081822068171786606 * FSCALE,
	[7] = 0.70468808971871343435 * FSCALE,
	[8] = 0.67032004603563930074 * FSCALE,
	[9] = 0.63762815162177329314 * FSCALE,
	[10] = 0.60653065971263342360 * FSCALE,
	[11] = 0.57694981038048669531 * FSCALE,
	[12] = 0.54881163609402643262 * FSCALE,
	[13] = 0.52204577676101604789 * FSCALE,
	[14] = 0.49658530379140951470 * FSCALE,
	[15] = 0.47236655274101470713 * FSCALE,
	[16] = 0.44932896411722159143 * FSCALE,
	[17] = 0.42741493194872666992 * FSCALE,
	[18] = 0.40656965974059911188 * FSCALE,
	[19] = 0.38674102345450120691 * FSCALE,
	[20] = 0.36787944117144232159 * FSCALE,
	[21] = 0.34993774911115535467 * FSCALE,
	[22] = 0.33287108369807955328 * FSCALE,
	[23] = 0.31663676937905321821 * FSCALE,
	[24] = 0.30119421191220209664 * FSCALE,
	[25] = 0.28650479686019010032 * FSCALE,
	[26] = 0.27253179303401260312 * FSCALE,
	[27] = 0.25924026064589150757 * FSCALE,
	[28] = 0.24659696394160647693 * FSCALE,
	[29] = 0.23457028809379765313 * FSCALE,
	[30] = 0.22313016014842982893 * FSCALE,
	[31] = 0.21224797382674305771 * FSCALE,
	[32] = 0.20189651799465540848 * FSCALE,
	[33] = 0.19204990862075411423 * FSCALE,
	[34] = 0.18268352405273465022 * FSCALE,
	[35] = 0.17377394345044512668 * FSCALE,
	[36] = 0.16529888822158653829 * FSCALE,
	[37] = 0.15723716631362761621 * FSCALE,
	[38] = 0.14956861922263505264 * FSCALE,
	[39] = 0.14227407158651357185 * FSCALE,
	[40] = 0.13533528323661269189 * FSCALE,
	[41] = 0.12873490358780421886 * FSCALE,
	[42] = 0.12245642825298191021 * FSCALE,
	[43] = 0.11648415777349695786 * FSCALE,
	[44] = 0.11080315836233388333 * FSCALE,
	[45] = 0.10539922456186433678 * FSCALE,
	[46] = 0.10025884372280373372 * FSCALE,
	[47] = 0.09536916221554961888 * FSCALE,
	[48] = 0.09071795328941250337 * FSCALE,
	[49] = 0.08629358649937051097 * FSCALE,
	[50] = 0.08208499862389879516 * FSCALE,
	[51] = 0.07808166600115315231 * FSCALE,
	[52] = 0.07427357821433388042 * FSCALE,
	[53] = 0.07065121306042958674 * FSCALE,
	[54] = 0.06720551273974976512 * FSCALE,
	[55] = 0.06392786120670757270 * FSCALE,
	[56] = 0.06081006262521796499 * FSCALE,
	[57] = 0.05784432087483846296 * FSCALE,
	[58] = 0.05502322005640722902 * FSCALE,
	[59] = 0.05233970594843239308 * FSCALE,
	[60] = 0.04978706836786394297 * FSCALE,
	[61] = 0.04735892439114092119 * FSCALE,
	[62] = 0.04504920239355780606 * FSCALE,
	[63] = 0.04285212686704017991 * FSCALE,
	[64] = 0.04076220397836621516 * FSCALE,
	[65] = 0.03877420783172200988 * FSCALE,
	[66] = 0.03688316740124000544 * FSCALE,
	[67] = 0.03508435410084502588 * FSCALE,
	[68] = 0.03337326996032607948 * FSCALE,
	[69] = 0.03174563637806794323 * FSCALE,
	[70] = 0.03019738342231850073 * FSCALE,
	[71] = 0.02872463965423942912 * FSCALE,
	[72] = 0.02732372244729256080 * FSCALE,
	[73] = 0.02599112877875534358 * FSCALE,
	[74] = 0.02472352647033939120 * FSCALE,
	[75] = 0.02351774585600910823 * FSCALE,
	[76] = 0.02237077185616559577 * FSCALE,
	[77] = 0.02127973643837716938 * FSCALE,
	[78] = 0.02024191144580438847 * FSCALE,
	[79] = 0.01925470177538692429 * FSCALE,
	[80] = 0.01831563888873418029 * FSCALE,
	[81] = 0.01742237463949351138 * FSCALE,
	[82] = 0.01657267540176124754 * FSCALE,
	[83] = 0.01576441648485449082 * FSCALE,
	[84] = 0.01499557682047770621 * FSCALE,
	[85] = 0.01426423390899925527 * FSCALE,
	[86] = 0.01356855901220093175 * FSCALE,
	[87] = 0.01290681258047986886 * FSCALE,
	[88] = 0.01227733990306844117 * FSCALE,
	[89] = 0.01167856697039544521 * FSCALE,
	[90] = 0.01110899653824230649 * FSCALE,
	[91] = 0.01056720438385265337 * FSCALE,
	[92] = 0.01005183574463358164 * FSCALE,
	[93] = 0.00956160193054350793 * FSCALE,
	[94] = 0.00909527710169581709 * FSCALE,
	[95] = 0.00865169520312063417 * FSCALE,
	[96] = 0.00822974704902002884 * FSCALE,
	[97] = 0.00782837754922577143 * FSCALE,
	[98] = 0.00744658307092434051 * FSCALE,
	[99] = 0.00708340892905212004 * FSCALE,
	[100] = 0.00673794699908546709 * FSCALE,
	[101] = 0.00640933344625638184 * FSCALE,
	[102] = 0.00609674656551563610 * FSCALE,
	[103] = 0.00579940472684214321 * FSCALE,
	[104] = 0.00551656442076077241 * FSCALE,
	[105] = 0.00524751839918138427 * FSCALE,
	[106] = 0.00499159390691021621 * FSCALE,
	[107] = 0.00474815099941147558 * FSCALE,
	[108] = 0.00451658094261266798 * FSCALE,
	[109] = 0.00429630469075234057 * FSCALE,
	[110] = 0.00408677143846406699 * FSCALE,
};
#endif

/* Highest valid index into ccpu_exp[]. */
#define	CCPU_EXP_MAX	110
310
311/*
312 * This function is analogical to the getpcpu() function in the ps(1) command.
313 * They should both calculate in the same way so that the racct %cpu
314 * calculations are consistent with the values showed by the ps(1) tool.
315 * The calculations are more complex in the 4BSD scheduler because of the value
316 * of the ccpu variable.  In ULE it is defined to be zero which saves us some
317 * work.
318 */
319static uint64_t
320racct_getpcpu(struct proc *p, u_int pcpu)
321{
322	u_int swtime;
323#ifdef SCHED_4BSD
324	fixpt_t pctcpu, pctcpu_next;
325#endif
326#ifdef SMP
327	struct pcpu *pc;
328	int found;
329#endif
330	fixpt_t p_pctcpu;
331	struct thread *td;
332
333	ASSERT_RACCT_ENABLED();
334
335	/*
336	 * If the process is swapped out, we count its %cpu usage as zero.
337	 * This behaviour is consistent with the userland ps(1) tool.
338	 */
339	if ((p->p_flag & P_INMEM) == 0)
340		return (0);
341	swtime = (ticks - p->p_swtick) / hz;
342
343	/*
344	 * For short-lived processes, the sched_pctcpu() returns small
345	 * values even for cpu intensive processes.  Therefore we use
346	 * our own estimate in this case.
347	 */
348	if (swtime < RACCT_PCPU_SECS)
349		return (pcpu);
350
351	p_pctcpu = 0;
352	FOREACH_THREAD_IN_PROC(p, td) {
353		if (td == PCPU_GET(idlethread))
354			continue;
355#ifdef SMP
356		found = 0;
357		STAILQ_FOREACH(pc, &cpuhead, pc_allcpu) {
358			if (td == pc->pc_idlethread) {
359				found = 1;
360				break;
361			}
362		}
363		if (found)
364			continue;
365#endif
366		thread_lock(td);
367#ifdef SCHED_4BSD
368		pctcpu = sched_pctcpu(td);
369		/* Count also the yet unfinished second. */
370		pctcpu_next = (pctcpu * ccpu_exp[1]) >> FSHIFT;
371		pctcpu_next += sched_pctcpu_delta(td);
372		p_pctcpu += max(pctcpu, pctcpu_next);
373#else
374		/*
375		 * In ULE the %cpu statistics are updated on every
376		 * sched_pctcpu() call.  So special calculations to
377		 * account for the latest (unfinished) second are
378		 * not needed.
379		 */
380		p_pctcpu += sched_pctcpu(td);
381#endif
382		thread_unlock(td);
383	}
384
385#ifdef SCHED_4BSD
386	if (swtime <= CCPU_EXP_MAX)
387		return ((100 * (uint64_t)p_pctcpu * 1000000) /
388		    (FSCALE - ccpu_exp[swtime]));
389#endif
390
391	return ((100 * (uint64_t)p_pctcpu * 1000000) / FSCALE);
392}
393
394static void
395racct_add_racct(struct racct *dest, const struct racct *src)
396{
397	int i;
398
399	ASSERT_RACCT_ENABLED();
400	RACCT_LOCK_ASSERT();
401
402	/*
403	 * Update resource usage in dest.
404	 */
405	for (i = 0; i <= RACCT_MAX; i++) {
406		KASSERT(dest->r_resources[i] >= 0,
407		    ("%s: resource %d propagation meltdown: dest < 0",
408		    __func__, i));
409		KASSERT(src->r_resources[i] >= 0,
410		    ("%s: resource %d propagation meltdown: src < 0",
411		    __func__, i));
412		dest->r_resources[i] += src->r_resources[i];
413	}
414}
415
416static void
417racct_sub_racct(struct racct *dest, const struct racct *src)
418{
419	int i;
420
421	ASSERT_RACCT_ENABLED();
422	RACCT_LOCK_ASSERT();
423
424	/*
425	 * Update resource usage in dest.
426	 */
427	for (i = 0; i <= RACCT_MAX; i++) {
428		if (!RACCT_IS_SLOPPY(i) && !RACCT_IS_DECAYING(i)) {
429			KASSERT(dest->r_resources[i] >= 0,
430			    ("%s: resource %d propagation meltdown: dest < 0",
431			    __func__, i));
432			KASSERT(src->r_resources[i] >= 0,
433			    ("%s: resource %d propagation meltdown: src < 0",
434			    __func__, i));
435			KASSERT(src->r_resources[i] <= dest->r_resources[i],
436			    ("%s: resource %d propagation meltdown: src > dest",
437			    __func__, i));
438		}
439		if (RACCT_CAN_DROP(i)) {
440			dest->r_resources[i] -= src->r_resources[i];
441			if (dest->r_resources[i] < 0)
442				dest->r_resources[i] = 0;
443		}
444	}
445}
446
447void
448racct_create(struct racct **racctp)
449{
450
451	if (!racct_enable)
452		return;
453
454	SDT_PROBE1(racct, , racct, create, racctp);
455
456	KASSERT(*racctp == NULL, ("racct already allocated"));
457
458	*racctp = uma_zalloc(racct_zone, M_WAITOK | M_ZERO);
459}
460
461static void
462racct_destroy_locked(struct racct **racctp)
463{
464	struct racct *racct;
465	int i;
466
467	ASSERT_RACCT_ENABLED();
468
469	SDT_PROBE1(racct, , racct, destroy, racctp);
470
471	RACCT_LOCK_ASSERT();
472	KASSERT(racctp != NULL, ("NULL racctp"));
473	KASSERT(*racctp != NULL, ("NULL racct"));
474
475	racct = *racctp;
476
477	for (i = 0; i <= RACCT_MAX; i++) {
478		if (RACCT_IS_SLOPPY(i))
479			continue;
480		if (!RACCT_IS_RECLAIMABLE(i))
481			continue;
482		KASSERT(racct->r_resources[i] == 0,
483		    ("destroying non-empty racct: "
484		    "%ju allocated for resource %d\n",
485		    racct->r_resources[i], i));
486	}
487	uma_zfree(racct_zone, racct);
488	*racctp = NULL;
489}
490
491void
492racct_destroy(struct racct **racct)
493{
494
495	if (!racct_enable)
496		return;
497
498	RACCT_LOCK();
499	racct_destroy_locked(racct);
500	RACCT_UNLOCK();
501}
502
503/*
504 * Increase consumption of 'resource' by 'amount' for 'racct',
505 * but not its parents.  Differently from other cases, 'amount' here
506 * may be less than zero.
507 */
508static void
509racct_adjust_resource(struct racct *racct, int resource,
510    int64_t amount)
511{
512
513	ASSERT_RACCT_ENABLED();
514	RACCT_LOCK_ASSERT();
515	KASSERT(racct != NULL, ("NULL racct"));
516
517	racct->r_resources[resource] += amount;
518	if (racct->r_resources[resource] < 0) {
519		KASSERT(RACCT_IS_SLOPPY(resource) || RACCT_IS_DECAYING(resource),
520		    ("%s: resource %d usage < 0", __func__, resource));
521		racct->r_resources[resource] = 0;
522	}
523
524	/*
525	 * There are some cases where the racct %cpu resource would grow
526	 * beyond 100% per core.  For example in racct_proc_exit() we add
527	 * the process %cpu usage to the ucred racct containers.  If too
528	 * many processes terminated in a short time span, the ucred %cpu
529	 * resource could grow too much.  Also, the 4BSD scheduler sometimes
530	 * returns for a thread more than 100% cpu usage. So we set a sane
531	 * boundary here to 100% * the maximum number of CPUs.
532	 */
533	if ((resource == RACCT_PCTCPU) &&
534	    (racct->r_resources[RACCT_PCTCPU] > 100 * 1000000 * (int64_t)MAXCPU))
535		racct->r_resources[RACCT_PCTCPU] = 100 * 1000000 * (int64_t)MAXCPU;
536}
537
538static int
539racct_add_locked(struct proc *p, int resource, uint64_t amount, int force)
540{
541#ifdef RCTL
542	int error;
543#endif
544
545	ASSERT_RACCT_ENABLED();
546
547	/*
548	 * We need proc lock to dereference p->p_ucred.
549	 */
550	PROC_LOCK_ASSERT(p, MA_OWNED);
551
552#ifdef RCTL
553	error = rctl_enforce(p, resource, amount);
554	if (error && !force && RACCT_IS_DENIABLE(resource)) {
555		SDT_PROBE3(racct, , rusage, add__failure, p, resource, amount);
556		return (error);
557	}
558#endif
559	racct_adjust_resource(p->p_racct, resource, amount);
560	racct_add_cred_locked(p->p_ucred, resource, amount);
561
562	return (0);
563}
564
565/*
566 * Increase allocation of 'resource' by 'amount' for process 'p'.
567 * Return 0 if it's below limits, or errno, if it's not.
568 */
569int
570racct_add(struct proc *p, int resource, uint64_t amount)
571{
572	int error;
573
574	if (!racct_enable)
575		return (0);
576
577	SDT_PROBE3(racct, , rusage, add, p, resource, amount);
578
579	RACCT_LOCK();
580	error = racct_add_locked(p, resource, amount, 0);
581	RACCT_UNLOCK();
582	return (error);
583}
584
585/*
586 * Increase allocation of 'resource' by 'amount' for process 'p'.
587 * Doesn't check for limits and never fails.
588 */
589void
590racct_add_force(struct proc *p, int resource, uint64_t amount)
591{
592
593	if (!racct_enable)
594		return;
595
596	SDT_PROBE3(racct, , rusage, add__force, p, resource, amount);
597
598	RACCT_LOCK();
599	racct_add_locked(p, resource, amount, 1);
600	RACCT_UNLOCK();
601}
602
603static void
604racct_add_cred_locked(struct ucred *cred, int resource, uint64_t amount)
605{
606	struct prison *pr;
607
608	ASSERT_RACCT_ENABLED();
609
610	racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, amount);
611	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
612		racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource,
613		    amount);
614	racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, amount);
615}
616
617/*
618 * Increase allocation of 'resource' by 'amount' for credential 'cred'.
619 * Doesn't check for limits and never fails.
620 */
621void
622racct_add_cred(struct ucred *cred, int resource, uint64_t amount)
623{
624
625	if (!racct_enable)
626		return;
627
628	SDT_PROBE3(racct, , rusage, add__cred, cred, resource, amount);
629
630	RACCT_LOCK();
631	racct_add_cred_locked(cred, resource, amount);
632	RACCT_UNLOCK();
633}
634
635/*
636 * Account for disk IO resource consumption.  Checks for limits,
637 * but never fails, due to disk limits being undeniable.
638 */
639void
640racct_add_buf(struct proc *p, const struct buf *bp, int is_write)
641{
642
643	ASSERT_RACCT_ENABLED();
644	PROC_LOCK_ASSERT(p, MA_OWNED);
645
646	SDT_PROBE3(racct, , rusage, add__buf, p, bp, is_write);
647
648	RACCT_LOCK();
649	if (is_write) {
650		racct_add_locked(curproc, RACCT_WRITEBPS, bp->b_bcount, 1);
651		racct_add_locked(curproc, RACCT_WRITEIOPS, 1, 1);
652	} else {
653		racct_add_locked(curproc, RACCT_READBPS, bp->b_bcount, 1);
654		racct_add_locked(curproc, RACCT_READIOPS, 1, 1);
655	}
656	RACCT_UNLOCK();
657}
658
659static int
660racct_set_locked(struct proc *p, int resource, uint64_t amount, int force)
661{
662	int64_t old_amount, decayed_amount, diff_proc, diff_cred;
663#ifdef RCTL
664	int error;
665#endif
666
667	ASSERT_RACCT_ENABLED();
668
669	/*
670	 * We need proc lock to dereference p->p_ucred.
671	 */
672	PROC_LOCK_ASSERT(p, MA_OWNED);
673
674	old_amount = p->p_racct->r_resources[resource];
675	/*
676	 * The diffs may be negative.
677	 */
678	diff_proc = amount - old_amount;
679	if (resource == RACCT_PCTCPU) {
680		/*
681		 * Resources in per-credential racct containers may decay.
682		 * If this is the case, we need to calculate the difference
683		 * between the new amount and the proportional value of the
684		 * old amount that has decayed in the ucred racct containers.
685		 */
686		decayed_amount = old_amount * RACCT_DECAY_FACTOR / FSCALE;
687		diff_cred = amount - decayed_amount;
688	} else
689		diff_cred = diff_proc;
690#ifdef notyet
691	KASSERT(diff_proc >= 0 || RACCT_CAN_DROP(resource),
692	    ("%s: usage of non-droppable resource %d dropping", __func__,
693	     resource));
694#endif
695#ifdef RCTL
696	if (diff_proc > 0) {
697		error = rctl_enforce(p, resource, diff_proc);
698		if (error && !force && RACCT_IS_DENIABLE(resource)) {
699			SDT_PROBE3(racct, , rusage, set__failure, p, resource,
700			    amount);
701			return (error);
702		}
703	}
704#endif
705	racct_adjust_resource(p->p_racct, resource, diff_proc);
706	if (diff_cred > 0)
707		racct_add_cred_locked(p->p_ucred, resource, diff_cred);
708	else if (diff_cred < 0)
709		racct_sub_cred_locked(p->p_ucred, resource, -diff_cred);
710
711	return (0);
712}
713
/*
 * Set allocation of 'resource' to 'amount' for process 'p', taking the
 * process lock around the call to racct_set().
 * Return 0 if it's below limits, or errno, if it's not.
 *
 * Note that decreasing the allocation always returns 0,
 * even if it's above the limit.
 */
int
racct_set_unlocked(struct proc *p, int resource, uint64_t amount)
{
	int rv;

	ASSERT_RACCT_ENABLED();

	PROC_LOCK(p);
	rv = racct_set(p, resource, amount);
	PROC_UNLOCK(p);

	return (rv);
}
732
733int
734racct_set(struct proc *p, int resource, uint64_t amount)
735{
736	int error;
737
738	if (!racct_enable)
739		return (0);
740
741	SDT_PROBE3(racct, , rusage, set__force, p, resource, amount);
742
743	RACCT_LOCK();
744	error = racct_set_locked(p, resource, amount, 0);
745	RACCT_UNLOCK();
746	return (error);
747}
748
749void
750racct_set_force(struct proc *p, int resource, uint64_t amount)
751{
752
753	if (!racct_enable)
754		return;
755
756	SDT_PROBE3(racct, , rusage, set, p, resource, amount);
757
758	RACCT_LOCK();
759	racct_set_locked(p, resource, amount, 1);
760	RACCT_UNLOCK();
761}
762
/*
 * Report how much of 'resource' process 'p' may keep allocated.  Going
 * beyond that would be denied unless the resource is marked undeniable.
 * What is already allocated is not taken into account.  Without RCTL, or
 * with accounting disabled, the answer is UINT64_MAX.
 */
uint64_t
racct_get_limit(struct proc *p, int resource)
{
	uint64_t limit;

	limit = UINT64_MAX;
#ifdef RCTL
	if (racct_enable) {
		RACCT_LOCK();
		limit = rctl_get_limit(p, resource);
		RACCT_UNLOCK();
	}
#endif

	return (limit);
}
788
/*
 * Report how much of 'resource' process 'p' may keep allocated.  Going
 * beyond that would be denied unless the resource is marked undeniable.
 * What is already allocated is taken into account.  Without RCTL, or with
 * accounting disabled, the answer is UINT64_MAX.
 */
uint64_t
racct_get_available(struct proc *p, int resource)
{
	uint64_t avail;

	avail = UINT64_MAX;
#ifdef RCTL
	if (racct_enable) {
		RACCT_LOCK();
		avail = rctl_get_available(p, resource);
		RACCT_UNLOCK();
	}
#endif

	return (avail);
}
814
/*
 * Returns amount of the %cpu resource that process 'p' can add to its %cpu
 * utilization.  Adding more than that would lead to the process being
 * throttled.  Without RCTL the result is INT64_MAX.
 */
static int64_t
racct_pcpu_available(struct proc *p)
{
#ifdef RCTL
	/*
	 * Keep the value signed, matching this function's return type, so
	 * it is not bounced through an unsigned temporary on the way out.
	 */
	int64_t available;

	ASSERT_RACCT_ENABLED();

	RACCT_LOCK();
	available = rctl_pcpu_available(p);
	RACCT_UNLOCK();

	return (available);
#else

	return (INT64_MAX);
#endif
}
838
839/*
840 * Decrease allocation of 'resource' by 'amount' for process 'p'.
841 */
842void
843racct_sub(struct proc *p, int resource, uint64_t amount)
844{
845
846	if (!racct_enable)
847		return;
848
849	SDT_PROBE3(racct, , rusage, sub, p, resource, amount);
850
851	/*
852	 * We need proc lock to dereference p->p_ucred.
853	 */
854	PROC_LOCK_ASSERT(p, MA_OWNED);
855	KASSERT(RACCT_CAN_DROP(resource),
856	    ("%s: called for non-droppable resource %d", __func__, resource));
857
858	RACCT_LOCK();
859	KASSERT(amount <= p->p_racct->r_resources[resource],
860	    ("%s: freeing %ju of resource %d, which is more "
861	     "than allocated %jd for %s (pid %d)", __func__, amount, resource,
862	    (intmax_t)p->p_racct->r_resources[resource], p->p_comm, p->p_pid));
863
864	racct_adjust_resource(p->p_racct, resource, -amount);
865	racct_sub_cred_locked(p->p_ucred, resource, amount);
866	RACCT_UNLOCK();
867}
868
869static void
870racct_sub_cred_locked(struct ucred *cred, int resource, uint64_t amount)
871{
872	struct prison *pr;
873
874	ASSERT_RACCT_ENABLED();
875
876	racct_adjust_resource(cred->cr_ruidinfo->ui_racct, resource, -amount);
877	for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent)
878		racct_adjust_resource(pr->pr_prison_racct->prr_racct, resource,
879		    -amount);
880	racct_adjust_resource(cred->cr_loginclass->lc_racct, resource, -amount);
881}
882
883/*
884 * Decrease allocation of 'resource' by 'amount' for credential 'cred'.
885 */
886void
887racct_sub_cred(struct ucred *cred, int resource, uint64_t amount)
888{
889
890	if (!racct_enable)
891		return;
892
893	SDT_PROBE3(racct, , rusage, sub__cred, cred, resource, amount);
894
895#ifdef notyet
896	KASSERT(RACCT_CAN_DROP(resource),
897	    ("%s: called for resource %d which can not drop", __func__,
898	     resource));
899#endif
900
901	RACCT_LOCK();
902	racct_sub_cred_locked(cred, resource, amount);
903	RACCT_UNLOCK();
904}
905
906/*
907 * Inherit resource usage information from the parent process.
908 */
909int
910racct_proc_fork(struct proc *parent, struct proc *child)
911{
912	int i, error = 0;
913
914	if (!racct_enable)
915		return (0);
916
917	/*
918	 * Create racct for the child process.
919	 */
920	racct_create(&child->p_racct);
921
922	PROC_LOCK(parent);
923	PROC_LOCK(child);
924	RACCT_LOCK();
925
926#ifdef RCTL
927	error = rctl_proc_fork(parent, child);
928	if (error != 0)
929		goto out;
930#endif
931
932	/* Init process cpu time. */
933	child->p_prev_runtime = 0;
934	child->p_throttled = 0;
935
936	/*
937	 * Inherit resource usage.
938	 */
939	for (i = 0; i <= RACCT_MAX; i++) {
940		if (parent->p_racct->r_resources[i] == 0 ||
941		    !RACCT_IS_INHERITABLE(i))
942			continue;
943
944		error = racct_set_locked(child, i,
945		    parent->p_racct->r_resources[i], 0);
946		if (error != 0)
947			goto out;
948	}
949
950	error = racct_add_locked(child, RACCT_NPROC, 1, 0);
951	error += racct_add_locked(child, RACCT_NTHR, 1, 0);
952
953out:
954	RACCT_UNLOCK();
955	PROC_UNLOCK(child);
956	PROC_UNLOCK(parent);
957
958	if (error != 0)
959		racct_proc_exit(child);
960
961	return (error);
962}
963
/*
 * Called at the end of fork1() to handle rules that need a fully
 * initialized process: runs rctl_enforce() for RACCT_NPROC and RACCT_NTHR
 * with a zero amount.  Without RCTL this is a no-op.
 */
void
racct_proc_fork_done(struct proc *child)
{

#ifdef RCTL
	if (!racct_enable)
		return;

	PROC_LOCK(child);
	RACCT_LOCK();
	rctl_enforce(child, RACCT_NPROC, 0);
	rctl_enforce(child, RACCT_NTHR, 0);
	RACCT_UNLOCK();
	PROC_UNLOCK(child);
#endif
}
984
985void
986racct_proc_exit(struct proc *p)
987{
988	struct timeval wallclock;
989	uint64_t pct_estimate, pct, runtime;
990	int i;
991
992	if (!racct_enable)
993		return;
994
995	PROC_LOCK(p);
996	/*
997	 * We don't need to calculate rux, proc_reap() has already done this.
998	 */
999	runtime = cputick2usec(p->p_rux.rux_runtime);
1000#ifdef notyet
1001	KASSERT(runtime >= p->p_prev_runtime, ("runtime < p_prev_runtime"));
1002#else
1003	if (runtime < p->p_prev_runtime)
1004		runtime = p->p_prev_runtime;
1005#endif
1006	microuptime(&wallclock);
1007	timevalsub(&wallclock, &p->p_stats->p_start);
1008	if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
1009		pct_estimate = (1000000 * runtime * 100) /
1010		    ((uint64_t)wallclock.tv_sec * 1000000 +
1011		    wallclock.tv_usec);
1012	} else
1013		pct_estimate = 0;
1014	pct = racct_getpcpu(p, pct_estimate);
1015
1016	RACCT_LOCK();
1017	racct_set_locked(p, RACCT_CPU, runtime, 0);
1018	racct_add_cred_locked(p->p_ucred, RACCT_PCTCPU, pct);
1019
1020	KASSERT(p->p_racct->r_resources[RACCT_RSS] == 0,
1021	    ("process reaped with %ju allocated for RSS\n",
1022	    p->p_racct->r_resources[RACCT_RSS]));
1023	for (i = 0; i <= RACCT_MAX; i++) {
1024		if (p->p_racct->r_resources[i] == 0)
1025			continue;
1026		if (!RACCT_IS_RECLAIMABLE(i))
1027			continue;
1028		racct_set_locked(p, i, 0, 0);
1029	}
1030
1031#ifdef RCTL
1032	rctl_racct_release(p->p_racct);
1033#endif
1034	racct_destroy_locked(&p->p_racct);
1035	RACCT_UNLOCK();
1036	PROC_UNLOCK(p);
1037}
1038
1039/*
1040 * Called after credentials change, to move resource utilisation
1041 * between raccts.
1042 */
1043void
1044racct_proc_ucred_changed(struct proc *p, struct ucred *oldcred,
1045    struct ucred *newcred)
1046{
1047	struct uidinfo *olduip, *newuip;
1048	struct loginclass *oldlc, *newlc;
1049	struct prison *oldpr, *newpr, *pr;
1050
1051	if (!racct_enable)
1052		return;
1053
1054	PROC_LOCK_ASSERT(p, MA_OWNED);
1055
1056	newuip = newcred->cr_ruidinfo;
1057	olduip = oldcred->cr_ruidinfo;
1058	newlc = newcred->cr_loginclass;
1059	oldlc = oldcred->cr_loginclass;
1060	newpr = newcred->cr_prison;
1061	oldpr = oldcred->cr_prison;
1062
1063	RACCT_LOCK();
1064	if (newuip != olduip) {
1065		racct_sub_racct(olduip->ui_racct, p->p_racct);
1066		racct_add_racct(newuip->ui_racct, p->p_racct);
1067	}
1068	if (newlc != oldlc) {
1069		racct_sub_racct(oldlc->lc_racct, p->p_racct);
1070		racct_add_racct(newlc->lc_racct, p->p_racct);
1071	}
1072	if (newpr != oldpr) {
1073		for (pr = oldpr; pr != NULL; pr = pr->pr_parent)
1074			racct_sub_racct(pr->pr_prison_racct->prr_racct,
1075			    p->p_racct);
1076		for (pr = newpr; pr != NULL; pr = pr->pr_parent)
1077			racct_add_racct(pr->pr_prison_racct->prr_racct,
1078			    p->p_racct);
1079	}
1080	RACCT_UNLOCK();
1081}
1082
/*
 * Move the usage recorded in 'src' over to 'dest'.
 *
 * The amounts in 'src' are first added to 'dest' and then subtracted from
 * 'src' itself; both steps happen under one hold of the racct lock.  The
 * order matters: 'src' must still hold its amounts when they are added.
 */
void
racct_move(struct racct *dest, struct racct *src)
{

	ASSERT_RACCT_ENABLED();

	RACCT_LOCK();
	/* Credit the destination before the source is drained. */
	racct_add_racct(dest, src);
	racct_sub_racct(src, src);
	RACCT_UNLOCK();
}
1094
1095static void
1096ast_racct(struct thread *td, int tda __unused)
1097{
1098	struct proc *p;
1099
1100	ASSERT_RACCT_ENABLED();
1101
1102	p = td->td_proc;
1103	if (p->p_throttled == 0)
1104		return;
1105
1106	PROC_LOCK(p);
1107	while (p->p_throttled != 0) {
1108		msleep(p->p_racct, &p->p_mtx, 0, "racct",
1109		    p->p_throttled < 0 ? 0 : p->p_throttled);
1110		if (p->p_throttled > 0)
1111			p->p_throttled = 0;
1112	}
1113	PROC_UNLOCK(p);
1114}
1115
1116/*
1117 * Make the process sleep in userret() for 'timeout' ticks.  Setting
1118 * timeout to -1 makes it sleep until woken up by racct_proc_wakeup().
1119 */
1120void
1121racct_proc_throttle(struct proc *p, int timeout)
1122{
1123	struct thread *td;
1124#ifdef SMP
1125	int cpuid;
1126#endif
1127
1128	KASSERT(timeout != 0, ("timeout %d", timeout));
1129	ASSERT_RACCT_ENABLED();
1130	PROC_LOCK_ASSERT(p, MA_OWNED);
1131
1132	/*
1133	 * Do not block kernel processes.  Also do not block processes with
1134	 * low %cpu utilization to improve interactivity.
1135	 */
1136	if ((p->p_flag & (P_SYSTEM | P_KPROC)) != 0)
1137		return;
1138
1139	if (p->p_throttled < 0 || (timeout > 0 && p->p_throttled > timeout))
1140		return;
1141
1142	p->p_throttled = timeout;
1143
1144	FOREACH_THREAD_IN_PROC(p, td) {
1145		thread_lock(td);
1146		ast_sched_locked(td, TDA_RACCT);
1147
1148		switch (TD_GET_STATE(td)) {
1149		case TDS_RUNQ:
1150			/*
1151			 * If the thread is on the scheduler run-queue, we can
1152			 * not just remove it from there.  So we set the flag
1153			 * TDA_SCHED for the thread, so that once it is
1154			 * running, it is taken off the cpu as soon as possible.
1155			 */
1156			ast_sched_locked(td, TDA_SCHED);
1157			break;
1158		case TDS_RUNNING:
1159			/*
1160			 * If the thread is running, we request a context
1161			 * switch for it by setting the TDA_SCHED flag.
1162			 */
1163			ast_sched_locked(td, TDA_SCHED);
1164#ifdef SMP
1165			cpuid = td->td_oncpu;
1166			if ((cpuid != NOCPU) && (td != curthread))
1167				ipi_cpu(cpuid, IPI_AST);
1168#endif
1169			break;
1170		default:
1171			break;
1172		}
1173		thread_unlock(td);
1174	}
1175}
1176
1177static void
1178racct_proc_wakeup(struct proc *p)
1179{
1180
1181	ASSERT_RACCT_ENABLED();
1182
1183	PROC_LOCK_ASSERT(p, MA_OWNED);
1184
1185	if (p->p_throttled != 0) {
1186		p->p_throttled = 0;
1187		wakeup(p->p_racct);
1188	}
1189}
1190
1191static void
1192racct_decay_callback(struct racct *racct, void *dummy1, void *dummy2)
1193{
1194	int64_t r_old, r_new;
1195
1196	ASSERT_RACCT_ENABLED();
1197	RACCT_LOCK_ASSERT();
1198
1199#ifdef RCTL
1200	rctl_throttle_decay(racct, RACCT_READBPS);
1201	rctl_throttle_decay(racct, RACCT_WRITEBPS);
1202	rctl_throttle_decay(racct, RACCT_READIOPS);
1203	rctl_throttle_decay(racct, RACCT_WRITEIOPS);
1204#endif
1205
1206	r_old = racct->r_resources[RACCT_PCTCPU];
1207
1208	/* If there is nothing to decay, just exit. */
1209	if (r_old <= 0)
1210		return;
1211
1212	r_new = r_old * RACCT_DECAY_FACTOR / FSCALE;
1213	racct->r_resources[RACCT_PCTCPU] = r_new;
1214}
1215
/*
 * "Pre" hook handed to the racct walkers by racct_decay(): acquires the
 * racct lock that racct_decay_callback() asserts.
 */
static void
racct_decay_pre(void)
{

	RACCT_LOCK();
}
1222
/*
 * "Post" hook handed to the racct walkers by racct_decay(): releases the
 * racct lock taken by racct_decay_pre().
 */
static void
racct_decay_post(void)
{

	RACCT_UNLOCK();
}
1229
1230static void
1231racct_decay(void)
1232{
1233
1234	ASSERT_RACCT_ENABLED();
1235
1236	ui_racct_foreach(racct_decay_callback, racct_decay_pre,
1237	    racct_decay_post, NULL, NULL);
1238	loginclass_racct_foreach(racct_decay_callback, racct_decay_pre,
1239	    racct_decay_post, NULL, NULL);
1240	prison_racct_foreach(racct_decay_callback, racct_decay_pre,
1241	    racct_decay_post, NULL, NULL);
1242}
1243
/*
 * Body of the racctd kernel process; loops forever and never returns.
 *
 * Every iteration:
 *  1. racct_decay() decays the per-credential containers;
 *  2. with allproc_lock held shared, each process in PRS_NORMAL state has
 *     its RACCT_PCTCPU, RACCT_CPU and RACCT_WALLCLOCK usage refreshed
 *     (zombies just get RACCT_PCTCPU reset to 0);
 *  3. still under allproc_lock, each PRS_NORMAL process is visited again and
 *     either throttled or woken up depending on racct_pcpu_available();
 *  4. the thread pauses for hz ticks.
 */
static void
racctd(void)
{
	struct thread *td;
	struct proc *p;
	struct timeval wallclock;
	uint64_t pct, pct_estimate, runtime;

	ASSERT_RACCT_ENABLED();

	for (;;) {
		racct_decay();

		sx_slock(&allproc_lock);

		/* First pass: refresh per-process usage figures. */
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_state != PRS_NORMAL) {
				/* Zombies get their %cpu usage zeroed. */
				if (p->p_state == PRS_ZOMBIE)
					racct_set(p, RACCT_PCTCPU, 0);
				PROC_UNLOCK(p);
				continue;
			}

			/* wallclock = current uptime minus the process start time. */
			microuptime(&wallclock);
			timevalsub(&wallclock, &p->p_stats->p_start);
			/*
			 * ruxagg() is run for every thread before the process
			 * total in p_rux.rux_runtime is read and converted to
			 * microseconds.
			 */
			PROC_STATLOCK(p);
			FOREACH_THREAD_IN_PROC(p, td)
				ruxagg(p, td);
			runtime = cputick2usec(p->p_rux.rux_runtime);
			PROC_STATUNLOCK(p);
#ifdef notyet
			KASSERT(runtime >= p->p_prev_runtime,
			    ("runtime < p_prev_runtime"));
#else
			/* Never let the recorded runtime go backwards. */
			if (runtime < p->p_prev_runtime)
				runtime = p->p_prev_runtime;
#endif
			p->p_prev_runtime = runtime;
			/*
			 * Estimate %cpu as runtime / wallclock, expressed as a
			 * percentage multiplied by 1000000; 0 when no wall
			 * clock time has elapsed.
			 *
			 * NOTE(review): the numerator 1000000 * runtime * 100
			 * wraps a uint64_t once runtime exceeds roughly
			 * 1.8e11 us (about 51 hours of CPU time) -- confirm
			 * that racct_getpcpu() does not rely on the estimate
			 * for such long-running processes.
			 */
			if (wallclock.tv_sec > 0 || wallclock.tv_usec > 0) {
				pct_estimate = (1000000 * runtime * 100) /
				    ((uint64_t)wallclock.tv_sec * 1000000 +
				    wallclock.tv_usec);
			} else
				pct_estimate = 0;
			pct = racct_getpcpu(p, pct_estimate);
			RACCT_LOCK();
#ifdef RCTL
			/* Decay the process's own I/O throttling resources. */
			rctl_throttle_decay(p->p_racct, RACCT_READBPS);
			rctl_throttle_decay(p->p_racct, RACCT_WRITEBPS);
			rctl_throttle_decay(p->p_racct, RACCT_READIOPS);
			rctl_throttle_decay(p->p_racct, RACCT_WRITEIOPS);
#endif
			/* Wall clock time is stored in microseconds. */
			racct_set_locked(p, RACCT_PCTCPU, pct, 1);
			racct_set_locked(p, RACCT_CPU, runtime, 0);
			racct_set_locked(p, RACCT_WALLCLOCK,
			    (uint64_t)wallclock.tv_sec * 1000000 +
			    wallclock.tv_usec, 0);
			RACCT_UNLOCK();
			PROC_UNLOCK(p);
		}

		/*
		 * To ensure that processes are throttled in a fair way, we need
		 * to iterate over all processes again and check the limits
		 * for %cpu resource only after ucred racct containers have been
		 * properly filled.
		 */
		FOREACH_PROC_IN_SYSTEM(p) {
			PROC_LOCK(p);
			if (p->p_state != PRS_NORMAL) {
				PROC_UNLOCK(p);
				continue;
			}

			/*
			 * No %cpu left: throttle indefinitely, but only if the
			 * process itself is above pcpu_threshold.  Otherwise
			 * release a process that was throttled indefinitely.
			 */
			if (racct_pcpu_available(p) <= 0) {
				if (p->p_racct->r_resources[RACCT_PCTCPU] >
				    pcpu_threshold)
					racct_proc_throttle(p, -1);
			} else if (p->p_throttled == -1) {
				racct_proc_wakeup(p);
			}
			PROC_UNLOCK(p);
		}
		sx_sunlock(&allproc_lock);
		/* Wait hz ticks before the next pass. */
		pause("-", hz);
	}
}
1332
/*
 * Descriptor passed to kproc_start() by racctd_init(); it refers to racctd()
 * as the function associated with the new kernel process.
 */
static struct kproc_desc racctd_kp = {
	"racctd",	/* NOTE(review): presumably the process name -- confirm */
	racctd,		/* racctd(), the daemon's main loop */
	NULL		/* NOTE(review): third field left unset -- confirm its meaning */
};
1338
1339static void
1340racctd_init(void)
1341{
1342	if (!racct_enable)
1343		return;
1344
1345	kproc_start(&racctd_kp);
1346}
1347SYSINIT(racctd, SI_SUB_RACCTD, SI_ORDER_FIRST, racctd_init, NULL);
1348
1349static void
1350racct_init(void)
1351{
1352	if (!racct_enable)
1353		return;
1354
1355	racct_zone = uma_zcreate("racct", sizeof(struct racct),
1356	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
1357	ast_register(TDA_RACCT, ASTR_ASTF_REQUIRED, 0, ast_racct);
1358
1359	/*
1360	 * XXX: Move this somewhere.
1361	 */
1362	prison0.pr_prison_racct = prison_racct_find("0");
1363}
1364SYSINIT(racct, SI_SUB_RACCT, SI_ORDER_FIRST, racct_init, NULL);
1365