1/*
2 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 */
29
30#include <kern/sched_prim.h>
31#include <kern/kalloc.h>
32#include <kern/assert.h>
33#include <kern/debug.h>
34#include <kern/lock.h>
35#include <kern/task.h>
36#include <kern/thread.h>
37#include <kern/host.h>
38#include <libkern/libkern.h>
39#include <mach/mach_time.h>
40#include <mach/task.h>
41#include <mach/host_priv.h>
42#include <mach/mach_host.h>
43#include <pexpert/pexpert.h>
44#include <sys/kern_event.h>
45#include <sys/proc.h>
46#include <sys/proc_info.h>
47#include <sys/signal.h>
48#include <sys/signalvar.h>
49#include <sys/sysctl.h>
50#include <sys/sysproto.h>
51#include <sys/wait.h>
52#include <sys/tree.h>
53#include <sys/priv.h>
54#include <vm/vm_pageout.h>
55#include <vm/vm_protos.h>
56
57#if CONFIG_FREEZE
58#include <vm/vm_map.h>
59#endif /* CONFIG_FREEZE */
60
61#include <sys/kern_memorystatus.h>
62
63/* These are very verbose printfs(), enable with
64 * MEMORYSTATUS_DEBUG_LOG
65 */
66#if MEMORYSTATUS_DEBUG_LOG
67#define MEMORYSTATUS_DEBUG(cond, format, ...)      \
68do {                                              \
69	if (cond) { printf(format, ##__VA_ARGS__); } \
70} while(0)
71#else
72#define MEMORYSTATUS_DEBUG(cond, format, ...)
73#endif
74
75/* General tunables */
76
77unsigned long delta_percentage = 5;
78unsigned long critical_threshold_percentage = 5;
79unsigned long idle_offset_percentage = 5;
80unsigned long pressure_threshold_percentage = 15;
81unsigned long freeze_threshold_percentage = 50;
82
83/* General memorystatus stuff */
84
85struct klist memorystatus_klist;
86static lck_mtx_t memorystatus_klist_mutex;
87
88static void memorystatus_klist_lock(void);
89static void memorystatus_klist_unlock(void);
90
91static uint64_t memorystatus_idle_delay_time = 0;
92
93/*
94 * Memorystatus kevents
95 */
96
97static int filt_memorystatusattach(struct knote *kn);
98static void filt_memorystatusdetach(struct knote *kn);
99static int filt_memorystatus(struct knote *kn, long hint);
100
101struct filterops memorystatus_filtops = {
102	.f_attach = filt_memorystatusattach,
103	.f_detach = filt_memorystatusdetach,
104	.f_event = filt_memorystatus,
105};
106
107enum {
108	kMemorystatusNoPressure = 1,
109	kMemorystatusPressure = 2
110};
111
112/* Idle guard handling */
113
114static int32_t memorystatus_scheduled_idle_demotions = 0;
115
116static thread_call_t memorystatus_idle_demotion_call;
117
118static void memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2);
119static void memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state);
120static void memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clean_state);
121static void memorystatus_reschedule_idle_demotion_locked(void);
122
123static void memorystatus_update_priority_locked(proc_t p, int priority);
124
125int memorystatus_wakeup = 0;
126
127unsigned int memorystatus_level = 0;
128
129static int memorystatus_list_count = 0;
130
131#define MEMSTAT_BUCKET_COUNT (JETSAM_PRIORITY_MAX + 1)
132
133typedef struct memstat_bucket {
134    TAILQ_HEAD(, proc) list;
135    int count;
136} memstat_bucket_t;
137
138memstat_bucket_t memstat_bucket[MEMSTAT_BUCKET_COUNT];
139
140uint64_t memstat_idle_demotion_deadline = 0;
141
142static unsigned int memorystatus_dirty_count = 0;
143
144#if !CONFIG_JETSAM
145static boolean_t kill_idle_exit = FALSE;
146#endif
147
148
149int
150memorystatus_get_level(__unused struct proc *p, struct memorystatus_get_level_args *args, __unused int *ret)
151{
152	user_addr_t	level = 0;
153
154	level = args->level;
155
156	if (copyout(&memorystatus_level, level, sizeof(memorystatus_level)) != 0) {
157		return EFAULT;
158	}
159
160	return 0;
161}
162
163static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search);
164static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search);
165
166static void memorystatus_thread(void *param __unused, wait_result_t wr __unused);
167
168/* Jetsam */
169
170#if CONFIG_JETSAM
171
172/* Kill processes exceeding their limit either under memory pressure (1), or as soon as possible (0) */
173#define LEGACY_HIWATER 1
174
175static int memorystatus_highwater_enabled = 1;
176
177extern unsigned int    vm_page_free_count;
178extern unsigned int    vm_page_active_count;
179extern unsigned int    vm_page_inactive_count;
180extern unsigned int    vm_page_throttled_count;
181extern unsigned int    vm_page_purgeable_count;
182extern unsigned int    vm_page_wire_count;
183
184unsigned int memorystatus_delta = 0;
185
186static unsigned int memorystatus_available_pages = (unsigned int)-1;
187static unsigned int memorystatus_available_pages_pressure = 0;
188static unsigned int memorystatus_available_pages_critical = 0;
189static unsigned int memorystatus_available_pages_critical_base = 0;
190static unsigned int memorystatus_last_foreground_pressure_pages = (unsigned int)-1;
191#if !LATENCY_JETSAM
192static unsigned int memorystatus_available_pages_critical_idle_offset = 0;
193#endif
194
195#if DEVELOPMENT || DEBUG
196static unsigned int memorystatus_jetsam_panic_debug = 0;
197
198static unsigned int memorystatus_jetsam_policy = kPolicyDefault;
199static unsigned int memorystatus_jetsam_policy_offset_pages_diagnostic = 0;
200#endif
201
202static boolean_t kill_under_pressure = FALSE;
203
204static memorystatus_jetsam_snapshot_t *memorystatus_jetsam_snapshot;
205#define memorystatus_jetsam_snapshot_list memorystatus_jetsam_snapshot->entries
206
207static unsigned int memorystatus_jetsam_snapshot_count = 0;
208static unsigned int memorystatus_jetsam_snapshot_max = 0;
209
210static void memorystatus_clear_errors(void);
211static void memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint);
212static int memorystatus_send_note(int event_code, void *data, size_t data_length);
213static uint32_t memorystatus_build_state(proc_t p);
214static void memorystatus_update_levels_locked(boolean_t critical_only);
215static boolean_t memorystatus_issue_pressure_kevent(boolean_t pressured);
216
217static boolean_t memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause);
218static boolean_t memorystatus_kill_top_process(boolean_t any, uint32_t cause, int32_t *priority, uint32_t *errors);
219#if LEGACY_HIWATER
220static boolean_t memorystatus_kill_hiwat_proc(uint32_t *errors);
221#endif
222
223static boolean_t memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause);
224static boolean_t memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause);
225
226#endif /* CONFIG_JETSAM */
227
228/* VM pressure */
229
230#if VM_PRESSURE_EVENTS
231
232#include "vm_pressure.h"
233
234extern boolean_t memorystatus_warn_process(pid_t pid);
235
236vm_pressure_level_t memorystatus_vm_pressure_level = kVMPressureNormal;
237
238#endif /* VM_PRESSURE_EVENTS */
239
240/* Freeze */
241
242#if CONFIG_FREEZE
243
244boolean_t memorystatus_freeze_enabled = FALSE;
245int memorystatus_freeze_wakeup = 0;
246
247static inline boolean_t memorystatus_can_freeze_processes(void);
248static boolean_t memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low);
249
250static void memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused);
251
252/* Thresholds */
253static unsigned int memorystatus_freeze_threshold = 0;
254
255static unsigned int memorystatus_freeze_pages_min = FREEZE_PAGES_MIN;
256static unsigned int memorystatus_freeze_pages_max = FREEZE_PAGES_MAX;
257
258static unsigned int memorystatus_frozen_count = 0;
259
260static unsigned int memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
261
262/* Stats */
263static uint64_t memorystatus_freeze_count = 0;
264static uint64_t memorystatus_freeze_pageouts = 0;
265
266/* Throttling */
267static throttle_interval_t throttle_intervals[] = {
268	{      60,  8, 0, 0, { 0, 0 }, FALSE }, /* 1 hour intermediate interval, 8x burst */
269	{ 24 * 60,  1, 0, 0, { 0, 0 }, FALSE }, /* 24 hour long interval, no burst */
270};
271
272static uint64_t memorystatus_freeze_throttle_count = 0;
273
274static unsigned int memorystatus_suspended_count = 0;
275static unsigned int memorystatus_suspended_footprint_total = 0;
276
277#endif /* CONFIG_FREEZE */
278
279/* Debug */
280
281#if DEVELOPMENT || DEBUG
282
283#if CONFIG_JETSAM
284
285/* Debug aid to aid determination of limit */
286
287static int
288sysctl_memorystatus_highwater_enable SYSCTL_HANDLER_ARGS
289{
290#pragma unused(oidp, arg2)
291	proc_t p;
292	unsigned int b = 0;
293	int error, enable = 0;
294	int32_t memlimit;
295
296	error = SYSCTL_OUT(req, arg1, sizeof(int));
297	if (error || !req->newptr) {
298		return (error);
299	}
300
301	error = SYSCTL_IN(req, &enable, sizeof(int));
302	if (error || !req->newptr) {
303		return (error);
304	}
305
306	if (!(enable == 0 || enable == 1)) {
307		return EINVAL;
308	}
309
310	proc_list_lock();
311
312	p = memorystatus_get_first_proc_locked(&b, TRUE);
313	while (p) {
314		if (enable) {
315			if ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
316				memlimit = -1;
317			} else {
318				memlimit = p->p_memstat_memlimit;
319			}
320		} else {
321			memlimit = -1;
322		}
323		task_set_phys_footprint_limit_internal(p->task, (memlimit  > 0) ? memlimit : -1, NULL, TRUE);
324
325		p = memorystatus_get_next_proc_locked(&b, p, TRUE);
326	}
327
328	memorystatus_highwater_enabled = enable;
329
330	proc_list_unlock();
331
332	return 0;
333}
334
335SYSCTL_PROC(_kern, OID_AUTO, memorystatus_highwater_enabled, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_highwater_enabled, 0, sysctl_memorystatus_highwater_enable, "I", "");
336
337SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages, 0, "");
338SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_available_pages_critical, 0, "");
339SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_base, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_base, 0, "");
340#if !LATENCY_JETSAM
341SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_critical_idle_offset, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_critical_idle_offset, 0, "");
342#endif
343
344/* Diagnostic code */
345
346enum {
347	kJetsamDiagnosticModeNone =              0,
348	kJetsamDiagnosticModeAll  =              1,
349	kJetsamDiagnosticModeStopAtFirstActive = 2,
350	kJetsamDiagnosticModeCount
351} jetsam_diagnostic_mode = kJetsamDiagnosticModeNone;
352
353static int jetsam_diagnostic_suspended_one_active_proc = 0;
354
355static int
356sysctl_jetsam_diagnostic_mode SYSCTL_HANDLER_ARGS
357{
358#pragma unused(arg1, arg2)
359
360	const char *diagnosticStrings[] = {
361		"jetsam: diagnostic mode: resetting critical level.",
362		"jetsam: diagnostic mode: will examine all processes",
363		"jetsam: diagnostic mode: will stop at first active process"
364	};
365
366	int error, val = jetsam_diagnostic_mode;
367	boolean_t changed = FALSE;
368
369	error = sysctl_handle_int(oidp, &val, 0, req);
370	if (error || !req->newptr)
371 		return (error);
372	if ((val < 0) || (val >= kJetsamDiagnosticModeCount)) {
373		printf("jetsam: diagnostic mode: invalid value - %d\n", val);
374		return EINVAL;
375	}
376
377	proc_list_lock();
378
379	if ((unsigned int) val != jetsam_diagnostic_mode) {
380		jetsam_diagnostic_mode = val;
381
382		memorystatus_jetsam_policy &= ~kPolicyDiagnoseActive;
383
384		switch (jetsam_diagnostic_mode) {
385		case kJetsamDiagnosticModeNone:
386			/* Already cleared */
387			break;
388		case kJetsamDiagnosticModeAll:
389			memorystatus_jetsam_policy |= kPolicyDiagnoseAll;
390			break;
391		case kJetsamDiagnosticModeStopAtFirstActive:
392			memorystatus_jetsam_policy |= kPolicyDiagnoseFirst;
393			break;
394		default:
395			/* Already validated */
396			break;
397		}
398
399		memorystatus_update_levels_locked(FALSE);
400		changed = TRUE;
401	}
402
403	proc_list_unlock();
404
405	if (changed) {
406		printf("%s\n", diagnosticStrings[val]);
407	}
408
409	return (0);
410}
411
412SYSCTL_PROC(_debug, OID_AUTO, jetsam_diagnostic_mode, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED|CTLFLAG_ANYBODY,
413  		&jetsam_diagnostic_mode, 0, sysctl_jetsam_diagnostic_mode, "I", "Jetsam Diagnostic Mode");
414
415SYSCTL_UINT(_kern, OID_AUTO, memorystatus_jetsam_policy_offset_pages_diagnostic, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_jetsam_policy_offset_pages_diagnostic, 0, "");
416
417#if VM_PRESSURE_EVENTS
418
419SYSCTL_UINT(_kern, OID_AUTO, memorystatus_available_pages_pressure, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_available_pages_pressure, 0, "");
420
421static int
422sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
423{
424#pragma unused(arg1, arg2, oidp)
425	int error = 0;
426
427	error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
428	if (error)
429		return (error);
430
431	return SYSCTL_OUT(req, &memorystatus_vm_pressure_level, sizeof(memorystatus_vm_pressure_level));
432}
433
434SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED|CTLFLAG_MASKED,
435    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
436
437static int
438sysctl_memorystatus_vm_pressure_send SYSCTL_HANDLER_ARGS
439{
440#pragma unused(arg1, arg2)
441
442	int error, pid = 0;
443
444	error = sysctl_handle_int(oidp, &pid, 0, req);
445	if (error || !req->newptr)
446		return (error);
447
448	return vm_dispatch_pressure_note_to_pid(pid, FALSE);
449}
450
451SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_send, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
452    0, 0, &sysctl_memorystatus_vm_pressure_send, "I", "");
453
454#endif /* VM_PRESSURE_EVENTS */
455
456#endif /* CONFIG_JETSAM */
457
458#endif /* DEVELOPMENT || DEBUG */
459
460#if CONFIG_FREEZE
461
462SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_threshold, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_threshold, 0, "");
463
464SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_min, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_min, 0, "");
465SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_pages_max, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_pages_max, 0, "");
466
467SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_count, "");
468SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_pageouts, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_pageouts, "");
469SYSCTL_QUAD(_kern, OID_AUTO, memorystatus_freeze_throttle_count, CTLFLAG_RD|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_count, "");
470SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_min_processes, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_suspended_threshold, 0, "");
471
472boolean_t memorystatus_freeze_throttle_enabled = TRUE;
473SYSCTL_UINT(_kern, OID_AUTO, memorystatus_freeze_throttle_enabled, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_freeze_throttle_enabled, 0, "");
474
475/*
476 * Enabled via: <rdar://problem/13248767> Enable the sysctl_memorystatus_freeze/thaw sysctls on Release KC
477 *
478 * TODO: Manual trigger of freeze and thaw for dev / debug kernels only.
479 * <rdar://problem/13248795> Disable/restrict the sysctl_memorystatus_freeze/thaw sysctls on Release KC
480 */
481static int
482sysctl_memorystatus_freeze SYSCTL_HANDLER_ARGS
483{
484#pragma unused(arg1, arg2)
485
486	int error, pid = 0;
487	proc_t p;
488
489	error = sysctl_handle_int(oidp, &pid, 0, req);
490	if (error || !req->newptr)
491		return (error);
492
493	p = proc_find(pid);
494	if (p != NULL) {
495		uint32_t purgeable, wired, clean, dirty;
496		boolean_t shared;
497		uint32_t max_pages = 0;
498
499		if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
500			max_pages = MIN(default_pager_swap_pages_free(), memorystatus_freeze_pages_max);
501		} else {
502			max_pages = UINT32_MAX - 1;
503		}
504		error = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
505		proc_rele(p);
506
507		if (error)
508			error = EIO;
509		return error;
510	}
511	return EINVAL;
512}
513
514SYSCTL_PROC(_kern, OID_AUTO, memorystatus_freeze, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
515    0, 0, &sysctl_memorystatus_freeze, "I", "");
516
517static int
518sysctl_memorystatus_available_pages_thaw SYSCTL_HANDLER_ARGS
519{
520#pragma unused(arg1, arg2)
521
522	int error, pid = 0;
523	proc_t p;
524
525	error = sysctl_handle_int(oidp, &pid, 0, req);
526	if (error || !req->newptr)
527		return (error);
528
529	p = proc_find(pid);
530	if (p != NULL) {
531		error = task_thaw(p->task);
532		proc_rele(p);
533
534		if (error)
535			error = EIO;
536		return error;
537	}
538
539	return EINVAL;
540}
541
542SYSCTL_PROC(_kern, OID_AUTO, memorystatus_thaw, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
543    0, 0, &sysctl_memorystatus_available_pages_thaw, "I", "");
544
545#endif /* CONFIG_FREEZE */
546
547extern kern_return_t kernel_thread_start_priority(thread_continue_t continuation,
548                                                  void *parameter,
549                                                  integer_t priority,
550                                                  thread_t *new_thread);
551
552static proc_t memorystatus_get_first_proc_locked(unsigned int *bucket_index, boolean_t search) {
553	memstat_bucket_t *current_bucket;
554	proc_t next_p;
555
556	if ((*bucket_index) >= MEMSTAT_BUCKET_COUNT) {
557		return NULL;
558	}
559
560	current_bucket = &memstat_bucket[*bucket_index];
561	next_p = TAILQ_FIRST(&current_bucket->list);
562	if (!next_p && search) {
563		while (!next_p && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
564			current_bucket = &memstat_bucket[*bucket_index];
565			next_p = TAILQ_FIRST(&current_bucket->list);
566		}
567	}
568
569	return next_p;
570}
571
572static proc_t memorystatus_get_next_proc_locked(unsigned int *bucket_index, proc_t p, boolean_t search) {
573	memstat_bucket_t *current_bucket;
574	proc_t next_p;
575
576	if (!p || ((*bucket_index) >= MEMSTAT_BUCKET_COUNT)) {
577		return NULL;
578	}
579
580	next_p = TAILQ_NEXT(p, p_memstat_list);
581	while (!next_p && search && (++(*bucket_index) < MEMSTAT_BUCKET_COUNT)) {
582		current_bucket = &memstat_bucket[*bucket_index];
583		next_p = TAILQ_FIRST(&current_bucket->list);
584	}
585
586	return next_p;
587}
588
589__private_extern__ void
590memorystatus_init(void)
591{
592	thread_t thread = THREAD_NULL;
593	kern_return_t result;
594	int i;
595
596	nanoseconds_to_absolutetime((uint64_t)DEFERRED_IDLE_EXIT_TIME_SECS * NSEC_PER_SEC, &memorystatus_idle_delay_time);
597
598	/* Init buckets */
599	for (i = 0; i < MEMSTAT_BUCKET_COUNT; i++) {
600		TAILQ_INIT(&memstat_bucket[i].list);
601		memstat_bucket[i].count = 0;
602	}
603
604	memorystatus_idle_demotion_call = thread_call_allocate((thread_call_func_t)memorystatus_perform_idle_demotion, NULL);
605
606	/* Apply overrides */
607	PE_get_default("kern.jetsam_delta", &delta_percentage, sizeof(delta_percentage));
608	assert(delta_percentage < 100);
609	PE_get_default("kern.jetsam_critical_threshold", &critical_threshold_percentage, sizeof(critical_threshold_percentage));
610	assert(critical_threshold_percentage < 100);
611	PE_get_default("kern.jetsam_idle_offset", &idle_offset_percentage, sizeof(idle_offset_percentage));
612	assert(idle_offset_percentage < 100);
613	PE_get_default("kern.jetsam_pressure_threshold", &pressure_threshold_percentage, sizeof(pressure_threshold_percentage));
614	assert(pressure_threshold_percentage < 100);
615	PE_get_default("kern.jetsam_freeze_threshold", &freeze_threshold_percentage, sizeof(freeze_threshold_percentage));
616	assert(freeze_threshold_percentage < 100);
617
618#if CONFIG_JETSAM
619	memorystatus_delta = delta_percentage * atop_64(max_mem) / 100;
620#if !LATENCY_JETSAM
621	memorystatus_available_pages_critical_idle_offset = idle_offset_percentage * atop_64(max_mem) / 100;
622#endif
623
624	memorystatus_available_pages_critical_base = (critical_threshold_percentage / delta_percentage) * memorystatus_delta;
625
626	memorystatus_jetsam_snapshot_max = maxproc;
627	memorystatus_jetsam_snapshot =
628		(memorystatus_jetsam_snapshot_t*)kalloc(sizeof(memorystatus_jetsam_snapshot_t) +
629		sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_max);
630	if (!memorystatus_jetsam_snapshot) {
631		panic("Could not allocate memorystatus_jetsam_snapshot");
632	}
633
634	/* No contention at this point */
635	memorystatus_update_levels_locked(FALSE);
636#endif
637
638#if CONFIG_FREEZE
639	memorystatus_freeze_threshold = (freeze_threshold_percentage / delta_percentage) * memorystatus_delta;
640#endif
641
642	result = kernel_thread_start_priority(memorystatus_thread, NULL, 95 /* MAXPRI_KERNEL */, &thread);
643	if (result == KERN_SUCCESS) {
644		thread_deallocate(thread);
645	} else {
646		panic("Could not create memorystatus_thread");
647	}
648}
649
650/* Centralised for the purposes of allowing panic-on-jetsam */
651extern void
652vm_wake_compactor_swapper(void);
653
654static boolean_t
655memorystatus_do_kill(proc_t p, uint32_t cause) {
656
657	int retval = 0;
658
659#if CONFIG_JETSAM && (DEVELOPMENT || DEBUG)
660	if (memorystatus_jetsam_panic_debug & (1 << cause)) {
661		panic("memorystatus_do_kill(): jetsam debug panic (cause: %d)", cause);
662	}
663#else
664#pragma unused(cause)
665#endif
666	int jetsam_flags = P_LTERM_JETSAM;
667	switch (cause) {
668		case kMemorystatusKilledHiwat:			jetsam_flags |= P_JETSAM_HIWAT; break;
669		case kMemorystatusKilledVnodes:			jetsam_flags |= P_JETSAM_VNODE; break;
670		case kMemorystatusKilledVMPageShortage:		jetsam_flags |= P_JETSAM_VMPAGESHORTAGE; break;
671		case kMemorystatusKilledVMThrashing:		jetsam_flags |= P_JETSAM_VMTHRASHING; break;
672		case kMemorystatusKilledPerProcessLimit:	jetsam_flags |= P_JETSAM_PID; break;
673		case kMemorystatusKilledIdleExit:		jetsam_flags |= P_JETSAM_IDLEEXIT; break;
674	}
675	retval = exit1_internal(p, W_EXITCODE(0, SIGKILL), (int *)NULL, FALSE, FALSE, jetsam_flags);
676
677	if (COMPRESSED_PAGER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
678 		vm_wake_compactor_swapper();
679 	}
680
681	return (retval == 0);
682}
683
684/*
685 * Node manipulation
686 */
687
688static void
689memorystatus_check_levels_locked(void) {
690#if CONFIG_JETSAM
691	/* Update levels */
692	memorystatus_update_levels_locked(TRUE);
693#endif
694}
695
696static void
697memorystatus_perform_idle_demotion(__unused void *spare1, __unused void *spare2)
698{
699	proc_t p;
700	uint64_t current_time;
701	memstat_bucket_t *demotion_bucket;
702
703	MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion()\n");
704
705	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_START, 0, 0, 0, 0, 0);
706
707 	current_time = mach_absolute_time();
708
709	proc_list_lock();
710
711	demotion_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED];
712	p = TAILQ_FIRST(&demotion_bucket->list);
713
714	while (p) {
715		MEMORYSTATUS_DEBUG(1, "memorystatus_perform_idle_demotion() found %d\n", p->p_pid);
716
717		assert(p->p_memstat_idledeadline);
718		assert(p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS);
719		assert((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED);
720
721		if (current_time >= p->p_memstat_idledeadline) {
722#if DEBUG || DEVELOPMENT
723			if (!(p->p_memstat_dirty & P_DIRTY_MARKED)) {
724				printf("memorystatus_perform_idle_demotion: moving process %d to idle band, but never dirtied (0x%x)!\n", p->p_pid, p->p_memstat_dirty);
725			}
726#endif
727			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
728			memorystatus_update_priority_locked(p, JETSAM_PRIORITY_IDLE);
729
730			// The prior process has moved out of the demotion bucket, so grab the new head and continue
731			p = TAILQ_FIRST(&demotion_bucket->list);
732			continue;
733		}
734
735		// No further candidates
736		break;
737	}
738
739	memorystatus_reschedule_idle_demotion_locked();
740
741	proc_list_unlock();
742
743	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_IDLE_DEMOTE) | DBG_FUNC_END, 0, 0, 0, 0, 0);
744}
745
746static void
747memorystatus_schedule_idle_demotion_locked(proc_t p, boolean_t set_state)
748{
749	MEMORYSTATUS_DEBUG(1, "memorystatus_schedule_idle_demotion_locked: scheduling demotion to idle band for process %d (dirty:0x%x, set_state %d, demotions %d).\n",
750	    p->p_pid, p->p_memstat_dirty, set_state, memorystatus_scheduled_idle_demotions);
751
752	assert((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) == (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS));
753
754	if (set_state) {
755		assert(p->p_memstat_idledeadline == 0);
756		p->p_memstat_idledeadline = mach_absolute_time() + memorystatus_idle_delay_time;
757	}
758
759 	assert(p->p_memstat_idledeadline);
760
761 	memorystatus_scheduled_idle_demotions++;
762}
763
764static void
765memorystatus_invalidate_idle_demotion_locked(proc_t p, boolean_t clear_state)
766{
767	MEMORYSTATUS_DEBUG(1, "memorystatus_invalidate_idle_demotion(): invalidating demotion to idle band for process %d (clear_state %d, demotions %d).\n",
768	    p->p_pid, clear_state, memorystatus_scheduled_idle_demotions);
769
770	assert(p->p_memstat_idledeadline);
771
772	if (clear_state) {
773 		p->p_memstat_idledeadline = 0;
774 		p->p_memstat_dirty &= ~P_DIRTY_DEFER_IN_PROGRESS;
775	}
776
777 	memorystatus_scheduled_idle_demotions--;
778 	assert(memorystatus_scheduled_idle_demotions >= 0);
779}
780
781static void
782memorystatus_reschedule_idle_demotion_locked(void) {
783 	if (0 == memorystatus_scheduled_idle_demotions) {
784 	 	if (memstat_idle_demotion_deadline) {
785 	 	 	/* Transitioned 1->0, so cancel next call */
786 	 	 	thread_call_cancel(memorystatus_idle_demotion_call);
787 	 	 	memstat_idle_demotion_deadline = 0;
788 		}
789 	} else {
790 		memstat_bucket_t *demotion_bucket;
791 		proc_t p;
792 		demotion_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE_DEFERRED];
793 		p = TAILQ_FIRST(&demotion_bucket->list);
794 		assert(p && p->p_memstat_idledeadline);
795
796 		if (memstat_idle_demotion_deadline != p->p_memstat_idledeadline){
797 	 	 	thread_call_enter_delayed(memorystatus_idle_demotion_call, p->p_memstat_idledeadline);
798 	 	 	memstat_idle_demotion_deadline = p->p_memstat_idledeadline;
799		}
800 	}
801}
802
803/*
804 * List manipulation
805 */
806
807int
808memorystatus_add(proc_t p, boolean_t locked)
809{
810	memstat_bucket_t *bucket;
811
812	MEMORYSTATUS_DEBUG(1, "memorystatus_list_add(): adding process %d with priority %d.\n", p->pid, priority);
813
814	if (!locked) {
815   	   	proc_list_lock();
816   	}
817
818	/* Processes marked internal do not have priority tracked */
819	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
820                goto exit;
821	}
822
823	bucket = &memstat_bucket[p->p_memstat_effectivepriority];
824
825	TAILQ_INSERT_TAIL(&bucket->list, p, p_memstat_list);
826	bucket->count++;
827
828	memorystatus_list_count++;
829
830	memorystatus_check_levels_locked();
831
832exit:
833   	if (!locked) {
834   	   	proc_list_unlock();
835   	}
836
837	return 0;
838}
839
840static void
841memorystatus_update_priority_locked(proc_t p, int priority)
842{
843	memstat_bucket_t *old_bucket, *new_bucket;
844
845	assert(priority < MEMSTAT_BUCKET_COUNT);
846
847	/* Ensure that exit isn't underway, leaving the proc retained but removed from its bucket */
848	if ((p->p_listflag & P_LIST_EXITED) != 0) {
849		return;
850	}
851
852	MEMORYSTATUS_DEBUG(1, "memorystatus_update_priority_locked(): setting process %d to priority %d\n", p->p_pid, priority);
853
854	old_bucket = &memstat_bucket[p->p_memstat_effectivepriority];
855	TAILQ_REMOVE(&old_bucket->list, p, p_memstat_list);
856	old_bucket->count--;
857
858	new_bucket = &memstat_bucket[priority];
859	TAILQ_INSERT_TAIL(&new_bucket->list, p, p_memstat_list);
860	new_bucket->count++;
861
862#if CONFIG_JETSAM
863	if (memorystatus_highwater_enabled && (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND)) {
864		if (((priority >= JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority < JETSAM_PRIORITY_FOREGROUND)) ||
865			((priority < JETSAM_PRIORITY_FOREGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND))) {
866			int32_t memlimit = (priority >= JETSAM_PRIORITY_FOREGROUND) ? -1 : p->p_memstat_memlimit;
867			task_set_phys_footprint_limit_internal(p->task, (memlimit  > 0) ? memlimit : -1, NULL, TRUE);
868		}
869	}
870#endif
871
872	p->p_memstat_effectivepriority = priority;
873
874	memorystatus_check_levels_locked();
875}
876
877int
878memorystatus_update(proc_t p, int priority, uint64_t user_data, boolean_t effective, boolean_t update_memlimit, int32_t memlimit, boolean_t memlimit_background)
879{
880	int ret;
881
882#if !CONFIG_JETSAM
883#pragma unused(update_memlimit, memlimit, memlimit_background)
884#endif
885
886	MEMORYSTATUS_DEBUG(1, "memorystatus_update: changing process %d: priority %d, user_data 0x%llx\n", p->p_pid, priority, user_data);
887
888	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_START, p->p_pid, priority, user_data, effective, 0);
889
890	if (priority == -1) {
891		/* Use as shorthand for default priority */
892		priority = JETSAM_PRIORITY_DEFAULT;
893	} else if (priority == JETSAM_PRIORITY_IDLE_DEFERRED) {
894		/* JETSAM_PRIORITY_IDLE_DEFERRED is reserved for internal use; if requested, adjust to JETSAM_PRIORITY_IDLE. */
895		priority = JETSAM_PRIORITY_IDLE;
896	} else if ((priority < 0) || (priority >= MEMSTAT_BUCKET_COUNT)) {
897		/* Sanity check */
898		ret = EINVAL;
899		goto out;
900	}
901
902	proc_list_lock();
903
904	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
905
906	if (effective && (p->p_memstat_state & P_MEMSTAT_PRIORITYUPDATED)) {
907		ret = EALREADY;
908		proc_list_unlock();
909		MEMORYSTATUS_DEBUG(1, "memorystatus_update: effective change specified for pid %d, but change already occurred.\n", pid);
910		goto out;
911	}
912
913	p->p_memstat_state |= P_MEMSTAT_PRIORITYUPDATED;
914	p->p_memstat_userdata = user_data;
915	p->p_memstat_requestedpriority = priority;
916
917#if CONFIG_JETSAM
918	if (update_memlimit) {
919		p->p_memstat_memlimit = memlimit;
920		if (memlimit_background) {
921			/* Will be set as priority is updated */
922			p->p_memstat_state |= P_MEMSTAT_MEMLIMIT_BACKGROUND;
923		} else {
924			/* Otherwise, apply now */
925			if (memorystatus_highwater_enabled) {
926				task_set_phys_footprint_limit_internal(p->task, (memlimit  > 0) ? memlimit : -1, NULL, TRUE);
927			}
928		}
929	}
930#endif
931
932	memorystatus_update_priority_locked(p, priority);
933
934	proc_list_unlock();
935	ret = 0;
936
937out:
938	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_UPDATE) | DBG_FUNC_END, ret, 0, 0, 0, 0);
939
940	return ret;
941}
942
943int
944memorystatus_remove(proc_t p, boolean_t locked)
945{
946	int ret;
947	memstat_bucket_t *bucket;
948
949	MEMORYSTATUS_DEBUG(1, "memorystatus_list_remove: removing process %d\n", pid);
950
951   	if (!locked) {
952   	   	proc_list_lock();
953   	}
954
955	assert(!(p->p_memstat_state & P_MEMSTAT_INTERNAL));
956
957	bucket = &memstat_bucket[p->p_memstat_effectivepriority];
958	TAILQ_REMOVE(&bucket->list, p, p_memstat_list);
959	bucket->count--;
960
961	memorystatus_list_count--;
962
963	/* If awaiting demotion to the idle band, clean up */
964	if (p->p_memstat_effectivepriority == JETSAM_PRIORITY_IDLE_DEFERRED) {
965		memorystatus_invalidate_idle_demotion_locked(p, TRUE);
966 		memorystatus_reschedule_idle_demotion_locked();
967	}
968
969	memorystatus_check_levels_locked();
970
971#if CONFIG_FREEZE
972	if (p->p_memstat_state & (P_MEMSTAT_FROZEN)) {
973		memorystatus_frozen_count--;
974	}
975
976	if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
977		memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
978		memorystatus_suspended_count--;
979	}
980#endif
981
982   	if (!locked) {
983   	   	proc_list_unlock();
984   	}
985
986	if (p) {
987		ret = 0;
988	} else {
989		ret = ESRCH;
990	}
991
992	return ret;
993}
994
995static boolean_t
996memorystatus_validate_track_flags(struct proc *target_p, uint32_t pcontrol) {
997	/* See that the process isn't marked for termination */
998	if (target_p->p_memstat_dirty & P_DIRTY_TERMINATED) {
999		return FALSE;
1000	}
1001
1002	/* Idle exit requires that process be tracked */
1003	if ((pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) &&
1004	   !(pcontrol & PROC_DIRTY_TRACK)) {
1005		return FALSE;
1006	}
1007
1008	/* Deferral is only relevant if idle exit is specified */
1009	if ((pcontrol & PROC_DIRTY_DEFER) &&
1010	   !(pcontrol & PROC_DIRTY_ALLOWS_IDLE_EXIT)) {
1011		return FALSE;
1012	}
1013
1014	return TRUE;
1015}
1016
1017static void
1018memorystatus_update_idle_priority_locked(proc_t p) {
1019	int32_t priority;
1020
1021	MEMORYSTATUS_DEBUG(1, "memorystatus_update_idle_priority_locked(): pid %d dirty 0x%X\n", p->p_pid, p->p_memstat_dirty);
1022
1023	if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_IS_DIRTY)) == P_DIRTY_IDLE_EXIT_ENABLED) {
1024		priority = (p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS) ? JETSAM_PRIORITY_IDLE_DEFERRED : JETSAM_PRIORITY_IDLE;
1025	} else {
1026		priority = p->p_memstat_requestedpriority;
1027	}
1028
1029	memorystatus_update_priority_locked(p, priority);
1030}
1031
1032/*
1033 * Processes can opt to have their state tracked by the kernel, indicating  when they are busy (dirty) or idle
1034 * (clean). They may also indicate that they support termination when idle, with the result that they are promoted
1035 * to their desired, higher, jetsam priority when dirty (and are therefore killed later), and demoted to the low
1036 * priority idle band when clean (and killed earlier, protecting higher priority procesess).
1037 *
1038 * If the deferral flag is set, then newly tracked processes will be protected for an initial period (as determined by
1039 * memorystatus_idle_delay_time); if they go clean during this time, then they will be moved to a deferred-idle band
1040 * with a slightly higher priority, guarding against immediate termination under memory pressure and being unable to
1041 * make forward progress. Finally, when the guard expires, they will be moved to the standard, lowest-priority, idle
1042 * band. The deferral can be cleared early by clearing the appropriate flag.
1043 *
1044 * The deferral timer is active only for the duration that the process is marked as guarded and clean; if the process
1045 * is marked dirty, the timer will be cancelled. Upon being subsequently marked clean, the deferment will either be
1046 * re-enabled or the guard state cleared, depending on whether the guard deadline has passed.
1047 */
1048
1049int
1050memorystatus_dirty_track(proc_t p, uint32_t pcontrol) {
1051	unsigned int old_dirty;
1052	boolean_t reschedule = FALSE;
1053	int ret;
1054
1055	proc_list_lock();
1056
1057	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
1058		ret = EPERM;
1059		goto exit;
1060	}
1061
1062	if (!memorystatus_validate_track_flags(p, pcontrol)) {
1063		ret = EINVAL;
1064		goto exit;
1065        }
1066
1067        old_dirty = p->p_memstat_dirty;
1068
1069	/* These bits are cumulative, as per <rdar://problem/11159924> */
1070	if (pcontrol & PROC_DIRTY_TRACK) {
1071		p->p_memstat_dirty |= P_DIRTY_TRACK;
1072	}
1073
1074	if (pcontrol & PROC_DIRTY_ALLOW_IDLE_EXIT) {
1075		p->p_memstat_dirty |= P_DIRTY_ALLOW_IDLE_EXIT;
1076	}
1077
1078	/* This can be set and cleared exactly once. */
1079	if ((pcontrol & PROC_DIRTY_DEFER) && !(old_dirty & P_DIRTY_DEFER)) {
1080		p->p_memstat_dirty |= (P_DIRTY_DEFER|P_DIRTY_DEFER_IN_PROGRESS);
1081	} else {
1082		p->p_memstat_dirty &= ~P_DIRTY_DEFER_IN_PROGRESS;
1083	}
1084
1085	MEMORYSTATUS_DEBUG(1, "memorystatus_on_track_dirty(): set idle-exit %s / deferred %s / dirty %s for process %d\n",
1086		((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) ? "Y" : "N",
1087		p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS ? "Y" : "N",
1088		p->p_memstat_dirty & P_DIRTY ? "Y" : "N",
1089		p->p_pid);
1090
1091	/* Kick off or invalidate the idle exit deferment if there's a state transition. */
1092	if (!(p->p_memstat_dirty & P_DIRTY_IS_DIRTY)) {
1093		if (((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) &&
1094			(p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS) && !(old_dirty & P_DIRTY_DEFER_IN_PROGRESS)) {
1095			memorystatus_schedule_idle_demotion_locked(p, TRUE);
1096			reschedule = TRUE;
1097		} else if (!(p->p_memstat_dirty & P_DIRTY_DEFER_IN_PROGRESS) && (old_dirty & P_DIRTY_DEFER_IN_PROGRESS)) {
1098			memorystatus_invalidate_idle_demotion_locked(p, TRUE);
1099			reschedule = TRUE;
1100		}
1101	}
1102
1103	memorystatus_update_idle_priority_locked(p);
1104
1105	if (reschedule) {
1106		memorystatus_reschedule_idle_demotion_locked();
1107	}
1108
1109	ret = 0;
1110
1111exit:
1112	proc_list_unlock();
1113
1114	return ret;
1115}
1116
1117int
1118memorystatus_dirty_set(proc_t p, boolean_t self, uint32_t pcontrol) {
1119	int ret;
1120	boolean_t kill = false;
1121	boolean_t reschedule = FALSE;
1122	boolean_t was_dirty = FALSE;
1123	boolean_t now_dirty = FALSE;
1124
1125	MEMORYSTATUS_DEBUG(1, "memorystatus_dirty_set(): %d %d 0x%x 0x%x\n", self, p->p_pid, pcontrol, p->p_memstat_dirty);
1126
1127	proc_list_lock();
1128
1129	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
1130		ret = EPERM;
1131		goto exit;
1132	}
1133
1134	if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
1135		was_dirty = TRUE;
1136
1137	if (!(p->p_memstat_dirty & P_DIRTY_TRACK)) {
1138		/* Dirty tracking not enabled */
1139		ret = EINVAL;
1140	} else if (pcontrol && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
1141		/*
1142		 * Process is set to be terminated and we're attempting to mark it dirty.
1143		 * Set for termination and marking as clean is OK - see <rdar://problem/10594349>.
1144		 */
1145		ret = EBUSY;
1146	} else {
1147		int flag = (self == TRUE) ? P_DIRTY : P_DIRTY_SHUTDOWN;
1148		if (pcontrol && !(p->p_memstat_dirty & flag)) {
1149			/* Mark the process as having been dirtied at some point */
1150			p->p_memstat_dirty |= (flag | P_DIRTY_MARKED);
1151			memorystatus_dirty_count++;
1152			ret = 0;
1153		} else if ((pcontrol == 0) && (p->p_memstat_dirty & flag)) {
1154			if ((flag == P_DIRTY_SHUTDOWN) && (!p->p_memstat_dirty & P_DIRTY)) {
1155				/* Clearing the dirty shutdown flag, and the process is otherwise clean - kill */
1156				p->p_memstat_dirty |= P_DIRTY_TERMINATED;
1157				kill = true;
1158			} else if ((flag == P_DIRTY) && (p->p_memstat_dirty & P_DIRTY_TERMINATED)) {
1159				/* Kill previously terminated processes if set clean */
1160				kill = true;
1161			}
1162			p->p_memstat_dirty &= ~flag;
1163			memorystatus_dirty_count--;
1164			ret = 0;
1165		} else {
1166			/* Already set */
1167			ret = EALREADY;
1168		}
1169	}
1170
1171	if (ret != 0) {
1172		goto exit;
1173	}
1174
1175	if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY)
1176		now_dirty = TRUE;
1177
1178	if ((was_dirty == TRUE && now_dirty == FALSE) ||
1179	    (was_dirty == FALSE && now_dirty == TRUE)) {
1180
1181		/* Manage idle exit deferral, if applied */
1182		if ((p->p_memstat_dirty & (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) ==
1183		    (P_DIRTY_IDLE_EXIT_ENABLED|P_DIRTY_DEFER_IN_PROGRESS)) {
1184			if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
1185				memorystatus_invalidate_idle_demotion_locked(p, FALSE);
1186				reschedule = TRUE;
1187			} else {
1188				/* We evaluate lazily, so reset the idle-deadline if it's expired by the time the process becomes clean. */
1189				if (mach_absolute_time() >= p->p_memstat_idledeadline) {
1190					p->p_memstat_idledeadline = 0;
1191					p->p_memstat_dirty &= ~P_DIRTY_DEFER_IN_PROGRESS;
1192				} else {
1193					memorystatus_schedule_idle_demotion_locked(p, FALSE);
1194					reschedule = TRUE;
1195				}
1196			}
1197		}
1198
1199		memorystatus_update_idle_priority_locked(p);
1200
1201		/* If the deferral state changed, reschedule the demotion timer */
1202		if (reschedule) {
1203			memorystatus_reschedule_idle_demotion_locked();
1204		}
1205	}
1206
1207	if (kill) {
1208		psignal(p, SIGKILL);
1209	}
1210
1211exit:
1212	proc_list_unlock();
1213
1214	return ret;
1215}
1216
1217int
1218memorystatus_dirty_get(proc_t p) {
1219	int ret = 0;
1220
1221	proc_list_lock();
1222
1223	if (p->p_memstat_dirty & P_DIRTY_TRACK) {
1224		ret |= PROC_DIRTY_TRACKED;
1225		if (p->p_memstat_dirty & P_DIRTY_ALLOW_IDLE_EXIT) {
1226			ret |= PROC_DIRTY_ALLOWS_IDLE_EXIT;
1227		}
1228		if (p->p_memstat_dirty & P_DIRTY) {
1229			ret |= PROC_DIRTY_IS_DIRTY;
1230		}
1231	}
1232
1233	proc_list_unlock();
1234
1235	return ret;
1236}
1237
1238int
1239memorystatus_on_terminate(proc_t p) {
1240	int sig;
1241
1242	proc_list_lock();
1243
1244	p->p_memstat_dirty |= P_DIRTY_TERMINATED;
1245
1246	if ((p->p_memstat_dirty & (P_DIRTY_TRACK|P_DIRTY_IS_DIRTY)) == P_DIRTY_TRACK) {
1247		/* Clean; mark as terminated and issue SIGKILL */
1248		sig = SIGKILL;
1249	} else {
1250		/* Dirty, terminated, or state tracking is unsupported; issue SIGTERM to allow cleanup */
1251		sig = SIGTERM;
1252	}
1253
1254	proc_list_unlock();
1255
1256	return sig;
1257}
1258
1259void
1260memorystatus_on_suspend(proc_t p)
1261{
1262#if CONFIG_FREEZE
1263	uint32_t pages;
1264	memorystatus_get_task_page_counts(p->task, &pages, NULL);
1265#endif
1266	proc_list_lock();
1267#if CONFIG_FREEZE
1268	p->p_memstat_suspendedfootprint = pages;
1269	memorystatus_suspended_footprint_total += pages;
1270	memorystatus_suspended_count++;
1271#endif
1272	p->p_memstat_state |= P_MEMSTAT_SUSPENDED;
1273	proc_list_unlock();
1274}
1275
1276void
1277memorystatus_on_resume(proc_t p)
1278{
1279#if CONFIG_FREEZE
1280	boolean_t frozen;
1281	pid_t pid;
1282#endif
1283
1284	proc_list_lock();
1285
1286#if CONFIG_FREEZE
1287	frozen = (p->p_memstat_state & P_MEMSTAT_FROZEN);
1288	if (frozen) {
1289		memorystatus_frozen_count--;
1290		p->p_memstat_state |= P_MEMSTAT_PRIOR_THAW;
1291	}
1292
1293	memorystatus_suspended_footprint_total -= p->p_memstat_suspendedfootprint;
1294	memorystatus_suspended_count--;
1295
1296	pid = p->p_pid;
1297#endif
1298
1299	p->p_memstat_state &= ~(P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN);
1300
1301	proc_list_unlock();
1302
1303#if CONFIG_FREEZE
1304	if (frozen) {
1305		memorystatus_freeze_entry_t data = { pid, FALSE, 0 };
1306		memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
1307	}
1308#endif
1309}
1310
1311void
1312memorystatus_on_inactivity(proc_t p)
1313{
1314#pragma unused(p)
1315#if CONFIG_FREEZE
1316	/* Wake the freeze thread */
1317	thread_wakeup((event_t)&memorystatus_freeze_wakeup);
1318#endif
1319}
1320
1321static uint32_t
1322memorystatus_build_state(proc_t p) {
1323	uint32_t snapshot_state = 0;
1324
1325	/* General */
1326	if (p->p_memstat_state & P_MEMSTAT_SUSPENDED) {
1327		snapshot_state |= kMemorystatusSuspended;
1328	}
1329	if (p->p_memstat_state & P_MEMSTAT_FROZEN) {
1330		snapshot_state |= kMemorystatusFrozen;
1331	}
1332	if (p->p_memstat_state & P_MEMSTAT_PRIOR_THAW) {
1333 		snapshot_state |= kMemorystatusWasThawed;
1334	}
1335
1336	/* Tracking */
1337	if (p->p_memstat_dirty & P_DIRTY_TRACK) {
1338		snapshot_state |= kMemorystatusTracked;
1339	}
1340	if ((p->p_memstat_dirty & P_DIRTY_IDLE_EXIT_ENABLED) == P_DIRTY_IDLE_EXIT_ENABLED) {
1341		snapshot_state |= kMemorystatusSupportsIdleExit;
1342	}
1343	if (p->p_memstat_dirty & P_DIRTY_IS_DIRTY) {
1344		snapshot_state |= kMemorystatusDirty;
1345	}
1346
1347	return snapshot_state;
1348}
1349
1350#if !CONFIG_JETSAM
1351
1352static boolean_t
1353kill_idle_exit_proc(void)
1354{
1355	proc_t p, victim_p = PROC_NULL;
1356	uint64_t current_time;
1357	boolean_t killed = FALSE;
1358	unsigned int i = 0;
1359
1360	/* Pick next idle exit victim. */
1361	current_time = mach_absolute_time();
1362
1363	proc_list_lock();
1364
1365	p = memorystatus_get_first_proc_locked(&i, FALSE);
1366	while (p) {
1367		/* No need to look beyond the idle band */
1368		if (p->p_memstat_effectivepriority != JETSAM_PRIORITY_IDLE) {
1369			break;
1370		}
1371
1372		if ((p->p_memstat_dirty & (P_DIRTY_ALLOW_IDLE_EXIT|P_DIRTY_IS_DIRTY|P_DIRTY_TERMINATED)) == (P_DIRTY_ALLOW_IDLE_EXIT)) {
1373			if (current_time >= p->p_memstat_idledeadline) {
1374				p->p_memstat_dirty |= P_DIRTY_TERMINATED;
1375				victim_p = proc_ref_locked(p);
1376				break;
1377			}
1378		}
1379
1380		p = memorystatus_get_next_proc_locked(&i, p, FALSE);
1381	}
1382
1383	proc_list_unlock();
1384
1385	if (victim_p) {
1386		printf("memorystatus_thread: idle exiting pid %d [%s]\n", victim_p->p_pid, (victim_p->p_comm ? victim_p->p_comm : "(unknown)"));
1387		killed = memorystatus_do_kill(victim_p, kMemorystatusKilledIdleExit);
1388		proc_rele(victim_p);
1389	}
1390
1391	return killed;
1392}
1393#endif
1394
1395static void
1396memorystatus_thread_wake(void) {
1397	thread_wakeup((event_t)&memorystatus_wakeup);
1398}
1399
1400static int
1401memorystatus_thread_block(uint32_t interval_ms, thread_continue_t continuation)
1402{
1403	if (interval_ms) {
1404		assert_wait_timeout(&memorystatus_wakeup, THREAD_UNINT, interval_ms, 1000 * NSEC_PER_USEC);
1405	} else {
1406		assert_wait(&memorystatus_wakeup, THREAD_UNINT);
1407	}
1408
1409	return thread_block(continuation);
1410}
1411
1412extern boolean_t vm_compressor_thrashing_detected;
1413extern uint64_t vm_compressor_total_compressions(void);
1414
1415static void
1416memorystatus_thread(void *param __unused, wait_result_t wr __unused)
1417{
1418	static boolean_t is_vm_privileged = FALSE;
1419#if CONFIG_JETSAM
1420	boolean_t post_snapshot = FALSE;
1421	uint32_t errors = 0;
1422#endif
1423
1424	if (is_vm_privileged == FALSE) {
1425		/*
1426		 * It's the first time the thread has run, so just mark the thread as privileged and block.
1427		 * This avoids a spurious pass with unset variables, as set out in <rdar://problem/9609402>.
1428		 */
1429		thread_wire(host_priv_self(), current_thread(), TRUE);
1430		is_vm_privileged = TRUE;
1431
1432		memorystatus_thread_block(0, memorystatus_thread);
1433	}
1434
1435#if CONFIG_JETSAM
1436
1437	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_START,
1438		memorystatus_available_pages, 0, 0, 0, 0);
1439
1440	uint32_t cause = vm_compressor_thrashing_detected ? kMemorystatusKilledVMThrashing : kMemorystatusKilledVMPageShortage;
1441
1442	/* Jetsam aware version.
1443	 *
1444	 * If woken under pressure, go down the path of killing:
1445	 *
1446	 * - processes exceeding their highwater mark if no clean victims available
1447	 * - the least recently used process if no highwater mark victims available
1448	 */
1449#if !LATENCY_JETSAM
1450	while (vm_compressor_thrashing_detected || memorystatus_available_pages <= memorystatus_available_pages_critical) {
1451#else
1452	while (kill_under_pressure) {
1453		const uint32_t SNAPSHOT_WAIT_TIMEOUT_MS = 100;
1454		wait_result_t wait_result;
1455#endif
1456		boolean_t killed;
1457		int32_t priority;
1458
1459#if LEGACY_HIWATER
1460		/* Highwater */
1461		killed = memorystatus_kill_hiwat_proc(&errors);
1462		if (killed) {
1463			post_snapshot = TRUE;
1464			goto done;
1465		}
1466#endif
1467
1468		/* LRU */
1469		killed = memorystatus_kill_top_process(TRUE, cause, &priority, &errors);
1470		if (killed) {
1471			if (!kill_under_pressure && (priority != JETSAM_PRIORITY_IDLE)) {
1472			        /* Don't generate logs for steady-state idle-exit kills */
1473        			post_snapshot = TRUE;
1474			}
1475			goto done;
1476		}
1477
1478		/* Under pressure and unable to kill a process - panic */
1479		panic("memorystatus_jetsam_thread: no victim! available pages:%d\n", memorystatus_available_pages);
1480
1481done:
1482		kill_under_pressure = FALSE;
1483		vm_compressor_thrashing_detected = FALSE;
1484
1485#if LATENCY_JETSAM
1486		KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_LATENCY_COALESCE) | DBG_FUNC_START,
1487			memorystatus_available_pages, 0, 0, 0, 0);
1488		thread_wakeup((event_t)&latency_jetsam_wakeup);
1489		/*
1490		 * Coalesce snapshot reports in the face of repeated jetsams by blocking here with a timeout.
1491		 * If the wait expires, issue the note.
1492		 */
1493		wait_result = memorystatus_thread_block(SNAPSHOT_WAIT_TIMEOUT_MS, THREAD_CONTINUE_NULL);
1494		KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_LATENCY_COALESCE) | DBG_FUNC_END,
1495			memorystatus_available_pages, 0, 0, 0, 0);
1496		if (wait_result != THREAD_AWAKENED) {
1497			/* Catch-all */
1498			break;
1499		}
1500#endif
1501	}
1502
1503	if (errors) {
1504		memorystatus_clear_errors();
1505	}
1506
1507#if VM_PRESSURE_EVENTS
1508	memorystatus_update_vm_pressure(TRUE);
1509#endif
1510
1511	if (post_snapshot) {
1512		size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
1513			sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count);
1514		memorystatus_jetsam_snapshot->notification_time = mach_absolute_time();
1515		memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
1516	}
1517
1518	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_SCAN) | DBG_FUNC_END,
1519		memorystatus_available_pages, 0, 0, 0, 0);
1520
1521#else /* CONFIG_JETSAM */
1522
1523	/* Simple version.
1524	 *
1525	 * Jetsam not enabled, so just kill the first suitable clean process
1526	 * and sleep.
1527	 */
1528
1529	if (kill_idle_exit) {
1530		kill_idle_exit_proc();
1531		kill_idle_exit = FALSE;
1532	}
1533
1534#endif /* CONFIG_JETSAM */
1535
1536	memorystatus_thread_block(0, memorystatus_thread);
1537}
1538
1539#if !CONFIG_JETSAM
1540boolean_t memorystatus_idle_exit_from_VM(void) {
1541	kill_idle_exit = TRUE;
1542	memorystatus_thread_wake();
1543	return TRUE;
1544}
1545#endif
1546
1547#if CONFIG_JETSAM
1548
1549/*
1550 * Callback invoked when allowable physical memory footprint exceeded
1551 * (dirty pages + IOKit mappings)
1552 *
1553 * This is invoked for both advisory, non-fatal per-task high watermarks,
1554 * as well as the fatal system-wide task memory limit.
1555 */
1556void
1557memorystatus_on_ledger_footprint_exceeded(boolean_t warning, const int max_footprint_mb)
1558{
1559	proc_t p = current_proc();
1560
1561	printf("process %d (%s) %s physical memory footprint limit of %d MB\n",
1562		p->p_pid, p->p_comm,
1563		warning ? "approaching" : "exceeded",
1564		max_footprint_mb);
1565
1566#if VM_PRESSURE_EVENTS
1567	if (warning == TRUE) {
1568		if (memorystatus_warn_process(p->p_pid) != TRUE) {
1569			/* Print warning, since it's possible that task has not registered for pressure notifications */
1570			printf("task_exceeded_footprint: failed to warn the current task (exiting?).\n");
1571		}
1572		return;
1573	}
1574#endif /* VM_PRESSURE_EVENTS */
1575
1576	if (p->p_memstat_memlimit <= 0) {
1577		/*
1578		 * If this process has no high watermark, then we have been invoked because the task
1579		 * has violated the system-wide per-task memory limit.
1580		 */
1581		if (memorystatus_kill_process_sync(p->p_pid, kMemorystatusKilledPerProcessLimit) != TRUE) {
1582			printf("task_exceeded_footprint: failed to kill the current task (exiting?).\n");
1583		}
1584	}
1585}
1586
1587static void
1588memorystatus_get_task_page_counts(task_t task, uint32_t *footprint, uint32_t *max_footprint)
1589{
1590	assert(task);
1591	assert(footprint);
1592
1593	*footprint = (uint32_t)(get_task_phys_footprint(task) / PAGE_SIZE_64);
1594	if (max_footprint) {
1595		*max_footprint = (uint32_t)(get_task_phys_footprint_max(task) / PAGE_SIZE_64);
1596	}
1597}
1598
1599static int
1600memorystatus_send_note(int event_code, void *data, size_t data_length) {
1601	int ret;
1602	struct kev_msg ev_msg;
1603
1604	ev_msg.vendor_code    = KEV_VENDOR_APPLE;
1605	ev_msg.kev_class      = KEV_SYSTEM_CLASS;
1606	ev_msg.kev_subclass   = KEV_MEMORYSTATUS_SUBCLASS;
1607
1608	ev_msg.event_code     = event_code;
1609
1610	ev_msg.dv[0].data_length = data_length;
1611	ev_msg.dv[0].data_ptr = data;
1612	ev_msg.dv[1].data_length = 0;
1613
1614	ret = kev_post_msg(&ev_msg);
1615	if (ret) {
1616		printf("%s: kev_post_msg() failed, err %d\n", __func__, ret);
1617	}
1618
1619	return ret;
1620}
1621
1622static void
1623memorystatus_update_snapshot_locked(proc_t p, uint32_t kill_cause)
1624{
1625	unsigned int i;
1626
1627	for (i = 0; i < memorystatus_jetsam_snapshot_count; i++) {
1628		if (memorystatus_jetsam_snapshot_list[i].pid == p->p_pid) {
1629			/* Update if the priority has changed since the snapshot was taken */
1630			if (memorystatus_jetsam_snapshot_list[i].priority != p->p_memstat_effectivepriority) {
1631				memorystatus_jetsam_snapshot_list[i].priority = p->p_memstat_effectivepriority;
1632				strlcpy(memorystatus_jetsam_snapshot_list[i].name, p->p_comm, MAXCOMLEN+1);
1633				memorystatus_jetsam_snapshot_list[i].state = memorystatus_build_state(p);
1634				memorystatus_jetsam_snapshot_list[i].user_data = p->p_memstat_userdata;
1635				memorystatus_jetsam_snapshot_list[i].fds = p->p_fd->fd_nfiles;
1636			}
1637			memorystatus_jetsam_snapshot_list[i].killed = kill_cause;
1638			return;
1639		}
1640	}
1641}
1642
1643void memorystatus_pages_update(unsigned int pages_avail)
1644{
1645	boolean_t critical, delta;
1646
1647	if (!memorystatus_delta) {
1648	    return;
1649	}
1650
1651	critical = (pages_avail < memorystatus_available_pages_critical) ? TRUE : FALSE;
1652	delta = ((pages_avail >= (memorystatus_available_pages + memorystatus_delta))
1653                || (memorystatus_available_pages >= (pages_avail + memorystatus_delta))) ? TRUE : FALSE;
1654
1655	if (critical || delta) {
1656		memorystatus_available_pages = pages_avail;
1657  		memorystatus_level = memorystatus_available_pages * 100 / atop_64(max_mem);
1658
1659#if LATENCY_JETSAM
1660		/* Bail early to avoid excessive wake-ups */
1661		if (critical) {
1662			return;
1663		}
1664#endif
1665
1666		memorystatus_thread_wake();
1667	}
1668}
1669
1670static boolean_t
1671memorystatus_get_snapshot_properties_for_proc_locked(proc_t p, memorystatus_jetsam_snapshot_entry_t *entry)
1672{
1673	memset(entry, 0, sizeof(memorystatus_jetsam_snapshot_entry_t));
1674
1675	entry->pid = p->p_pid;
1676	strlcpy(&entry->name[0], p->p_comm, MAXCOMLEN+1);
1677	entry->priority = p->p_memstat_effectivepriority;
1678	memorystatus_get_task_page_counts(p->task, &entry->pages, &entry->max_pages);
1679	entry->state = memorystatus_build_state(p);
1680	entry->user_data = p->p_memstat_userdata;
1681	memcpy(&entry->uuid[0], &p->p_uuid[0], sizeof(p->p_uuid));
1682
1683	return TRUE;
1684}
1685
1686static void
1687memorystatus_jetsam_snapshot_procs_locked(void)
1688{
1689	proc_t p, next_p;
1690	unsigned int b = 0, i = 0;
1691	kern_return_t kr = KERN_SUCCESS;
1692
1693	mach_msg_type_number_t	count = HOST_VM_INFO64_COUNT;
1694	vm_statistics64_data_t	vm_stat;
1695
1696	if ((kr = host_statistics64(host_self(), HOST_VM_INFO64, (host_info64_t)&vm_stat, &count) != KERN_SUCCESS)) {
1697		printf("memorystatus_jetsam_snapshot_procs_locked: host_statistics64 failed with %d\n", kr);
1698		memset(&memorystatus_jetsam_snapshot->stats, 0, sizeof(memorystatus_jetsam_snapshot->stats));
1699	} else {
1700		memorystatus_jetsam_snapshot->stats.free_pages = vm_stat.free_count;
1701		memorystatus_jetsam_snapshot->stats.active_pages = vm_stat.active_count;
1702		memorystatus_jetsam_snapshot->stats.inactive_pages = vm_stat.inactive_count;
1703		memorystatus_jetsam_snapshot->stats.throttled_pages = vm_stat.throttled_count;
1704		memorystatus_jetsam_snapshot->stats.purgeable_pages = vm_stat.purgeable_count;
1705		memorystatus_jetsam_snapshot->stats.wired_pages = vm_stat.wire_count;
1706
1707		memorystatus_jetsam_snapshot->stats.speculative_pages = vm_stat.speculative_count;
1708		memorystatus_jetsam_snapshot->stats.filebacked_pages = vm_stat.external_page_count;
1709		memorystatus_jetsam_snapshot->stats.anonymous_pages = vm_stat.internal_page_count;
1710		memorystatus_jetsam_snapshot->stats.compressions = vm_stat.compressions;
1711		memorystatus_jetsam_snapshot->stats.decompressions = vm_stat.decompressions;
1712		memorystatus_jetsam_snapshot->stats.compressor_pages = vm_stat.compressor_page_count;
1713		memorystatus_jetsam_snapshot->stats.total_uncompressed_pages_in_compressor = vm_stat.total_uncompressed_pages_in_compressor;
1714	}
1715
1716	next_p = memorystatus_get_first_proc_locked(&b, TRUE);
1717	while (next_p) {
1718		p = next_p;
1719		next_p = memorystatus_get_next_proc_locked(&b, p, TRUE);
1720
1721		if (FALSE == memorystatus_get_snapshot_properties_for_proc_locked(p, &memorystatus_jetsam_snapshot_list[i])) {
1722			continue;
1723		}
1724
1725		MEMORYSTATUS_DEBUG(0, "jetsam snapshot pid = %d, uuid = %02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n",
1726			p->p_pid,
1727			p->p_uuid[0], p->p_uuid[1], p->p_uuid[2], p->p_uuid[3], p->p_uuid[4], p->p_uuid[5], p->p_uuid[6], p->p_uuid[7],
1728			p->p_uuid[8], p->p_uuid[9], p->p_uuid[10], p->p_uuid[11], p->p_uuid[12], p->p_uuid[13], p->p_uuid[14], p->p_uuid[15]);
1729
1730		if (++i == memorystatus_jetsam_snapshot_max) {
1731			break;
1732		}
1733	}
1734
1735	memorystatus_jetsam_snapshot->snapshot_time = mach_absolute_time();
1736	memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = i;
1737}
1738
1739#if DEVELOPMENT || DEBUG
1740
1741static int
1742memorystatus_cmd_set_panic_bits(user_addr_t buffer, uint32_t buffer_size) {
1743	int ret;
1744	memorystatus_jetsam_panic_options_t debug;
1745
1746	if (buffer_size != sizeof(memorystatus_jetsam_panic_options_t)) {
1747		return EINVAL;
1748	}
1749
1750	ret = copyin(buffer, &debug, buffer_size);
1751	if (ret) {
1752		return ret;
1753	}
1754
1755	/* Panic bits match kMemorystatusKilled* enum */
1756	memorystatus_jetsam_panic_debug = (memorystatus_jetsam_panic_debug & ~debug.mask) | (debug.data & debug.mask);
1757
1758	/* Copyout new value */
1759	debug.data = memorystatus_jetsam_panic_debug;
1760	ret = copyout(&debug, buffer, sizeof(memorystatus_jetsam_panic_options_t));
1761
1762	return ret;
1763}
1764
1765#endif
1766
1767/*
1768 * Jetsam a specific process.
1769 */
1770static boolean_t
1771memorystatus_kill_specific_process(pid_t victim_pid, uint32_t cause) {
1772	boolean_t killed;
1773	proc_t p;
1774
1775	/* TODO - add a victim queue and push this into the main jetsam thread */
1776
1777	p = proc_find(victim_pid);
1778	if (!p) {
1779		return FALSE;
1780	}
1781
1782	printf("memorystatus: specifically killing pid %d [%s] - memorystatus_available_pages: %d\n",
1783		victim_pid, (p->p_comm ? p->p_comm : "(unknown)"), memorystatus_available_pages);
1784
1785	proc_list_lock();
1786
1787	if (memorystatus_jetsam_snapshot_count == 0) {
1788		memorystatus_jetsam_snapshot_procs_locked();
1789	}
1790
1791	memorystatus_update_snapshot_locked(p, cause);
1792	proc_list_unlock();
1793
1794	killed = memorystatus_do_kill(p, cause);
1795	proc_rele(p);
1796
1797	return killed;
1798}
1799
1800/*
1801 * Jetsam the first process in the queue.
1802 */
1803static boolean_t
1804memorystatus_kill_top_process(boolean_t any, uint32_t cause, int32_t *priority, uint32_t *errors)
1805{
1806	pid_t aPid;
1807	proc_t p = PROC_NULL, next_p = PROC_NULL;
1808	boolean_t new_snapshot = FALSE, killed = FALSE;
1809	unsigned int i = 0;
1810
1811#ifndef CONFIG_FREEZE
1812#pragma unused(any)
1813#endif
1814
1815	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_START,
1816		memorystatus_available_pages, 0, 0, 0, 0);
1817
1818	proc_list_lock();
1819
1820	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
1821	while (next_p) {
1822#if DEVELOPMENT || DEBUG
1823		int activeProcess;
1824		int procSuspendedForDiagnosis;
1825#endif /* DEVELOPMENT || DEBUG */
1826
1827		p = next_p;
1828		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
1829
1830#if DEVELOPMENT || DEBUG
1831		activeProcess = p->p_memstat_state & P_MEMSTAT_FOREGROUND;
1832		procSuspendedForDiagnosis = p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED;
1833#endif /* DEVELOPMENT || DEBUG */
1834
1835		aPid = p->p_pid;
1836
1837		if (p->p_memstat_state & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
1838			continue;
1839		}
1840
1841#if DEVELOPMENT || DEBUG
1842		if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && procSuspendedForDiagnosis) {
1843			printf("jetsam: continuing after ignoring proc suspended already for diagnosis - %d\n", aPid);
1844			continue;
1845		}
1846#endif /* DEVELOPMENT || DEBUG */
1847
1848#if CONFIG_FREEZE
1849		boolean_t skip;
1850		boolean_t reclaim_proc = !(p->p_memstat_state & (P_MEMSTAT_LOCKED | P_MEMSTAT_NORECLAIM));
1851		if (any || reclaim_proc) {
1852			skip = FALSE;
1853		} else {
1854			skip = TRUE;
1855		}
1856
1857		if (skip) {
1858			continue;
1859		} else
1860#endif
1861		{
1862			if (priority) {
1863				*priority = p->p_memstat_effectivepriority;
1864			}
1865
1866		        /*
1867		         * Capture a snapshot if none exists and:
1868		         * - priority was not requested (this is something other than an ambient kill)
1869		         * - the priority was requested *and* the targeted process is not at idle priority
1870		         */
1871                	if ((memorystatus_jetsam_snapshot_count == 0) &&
1872                	    ((!priority) || (priority && (*priority != JETSAM_PRIORITY_IDLE)))) {
1873                		memorystatus_jetsam_snapshot_procs_locked();
1874                		new_snapshot = TRUE;
1875                	}
1876
1877			/*
1878			 * Mark as terminated so that if exit1() indicates success, but the process (for example)
1879			 * is blocked in task_exception_notify(), it'll be skipped if encountered again - see
1880			 * <rdar://problem/13553476>. This is cheaper than examining P_LEXIT, which requires the
1881			 * acquisition of the proc lock.
1882			 */
1883			p->p_memstat_state |= P_MEMSTAT_TERMINATED;
1884
1885#if DEVELOPMENT || DEBUG
1886			if ((memorystatus_jetsam_policy & kPolicyDiagnoseActive) && activeProcess) {
1887				MEMORYSTATUS_DEBUG(1, "jetsam: suspending pid %d [%s] (active) for diagnosis - memory_status_level: %d\n",
1888					aPid, (p->p_comm ? p->p_comm: "(unknown)"), memorystatus_level);
1889				memorystatus_update_snapshot_locked(p, kMemorystatusKilledDiagnostic);
1890				p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
1891				if (memorystatus_jetsam_policy & kPolicyDiagnoseFirst) {
1892					jetsam_diagnostic_suspended_one_active_proc = 1;
1893					printf("jetsam: returning after suspending first active proc - %d\n", aPid);
1894				}
1895
1896				p = proc_ref_locked(p);
1897				proc_list_unlock();
1898				if (p) {
1899					task_suspend(p->task);
1900					proc_rele(p);
1901					killed = TRUE;
1902				}
1903
1904				goto exit;
1905			} else
1906#endif /* DEVELOPMENT || DEBUG */
1907			{
1908				/* Shift queue, update stats */
1909				memorystatus_update_snapshot_locked(p, cause);
1910
1911				p = proc_ref_locked(p);
1912				proc_list_unlock();
1913				if (p) {
1914					printf("memorystatus: jetsam killing pid %d [%s] - memorystatus_available_pages: %d\n",
1915    						aPid, (p->p_comm ? p->p_comm : "(unknown)"), memorystatus_available_pages);
1916					killed = memorystatus_do_kill(p, cause);
1917				}
1918
1919				/* Success? */
1920				if (killed) {
1921					proc_rele(p);
1922					goto exit;
1923				}
1924
1925				/* Failure - unwind and restart. */
1926				proc_list_lock();
1927				proc_rele_locked(p);
1928				p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
1929				p->p_memstat_state |= P_MEMSTAT_ERROR;
1930				*errors += 1;
1931				i = 0;
1932				next_p = memorystatus_get_first_proc_locked(&i, TRUE);
1933			}
1934		}
1935	}
1936
1937	proc_list_unlock();
1938
1939exit:
1940	/* Clear snapshot if freshly captured and no target was found */
1941	if (new_snapshot && !killed) {
1942	    memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
1943	}
1944
1945	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM) | DBG_FUNC_END,
1946	    memorystatus_available_pages, killed ? aPid : 0, 0, 0, 0);
1947
1948	return killed;
1949}
1950
1951#if LEGACY_HIWATER
1952
1953static boolean_t
1954memorystatus_kill_hiwat_proc(uint32_t *errors)
1955{
1956	pid_t aPid = 0;
1957	proc_t p = PROC_NULL, next_p = PROC_NULL;
1958	boolean_t new_snapshot = FALSE, killed = FALSE;
1959	unsigned int i = 0;
1960
1961	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_START,
1962		memorystatus_available_pages, 0, 0, 0, 0);
1963
1964	proc_list_lock();
1965
1966	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
1967	while (next_p) {
1968		uint32_t footprint;
1969		boolean_t skip;
1970
1971		p = next_p;
1972		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
1973
1974		aPid = p->p_pid;
1975
1976		if (p->p_memstat_state  & (P_MEMSTAT_ERROR | P_MEMSTAT_TERMINATED)) {
1977			continue;
1978		}
1979
1980		/* skip if no limit set */
1981		if (p->p_memstat_memlimit <= 0) {
1982			continue;
1983		}
1984
1985		/* skip if a currently inapplicable limit is encountered */
1986		if ((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) {
1987			continue;
1988		}
1989
1990		footprint = (uint32_t)(get_task_phys_footprint(p->task) / (1024 * 1024));
1991		skip = (((int32_t)footprint) <= p->p_memstat_memlimit);
1992#if DEVELOPMENT || DEBUG
1993		if (!skip && (memorystatus_jetsam_policy & kPolicyDiagnoseActive)) {
1994			if (p->p_memstat_state & P_MEMSTAT_DIAG_SUSPENDED) {
1995				continue;
1996			}
1997		}
1998#endif /* DEVELOPMENT || DEBUG */
1999
2000#if CONFIG_FREEZE
2001		if (!skip) {
2002			if (p->p_memstat_state & P_MEMSTAT_LOCKED) {
2003				skip = TRUE;
2004			} else {
2005				skip = FALSE;
2006			}
2007		}
2008#endif
2009
2010		if (skip) {
2011			continue;
2012		} else {
2013			MEMORYSTATUS_DEBUG(1, "jetsam: %s pid %d [%s] - %d pages > 1 (%d)\n",
2014				(memorystatus_jetsam_policy & kPolicyDiagnoseActive) ? "suspending": "killing", aPid, p->p_comm, pages, hiwat);
2015
2016			if (memorystatus_jetsam_snapshot_count == 0) {
2017                		memorystatus_jetsam_snapshot_procs_locked();
2018                		new_snapshot = TRUE;
2019                	}
2020
2021			p->p_memstat_state |= P_MEMSTAT_TERMINATED;
2022
2023#if DEVELOPMENT || DEBUG
2024			if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
2025			        MEMORYSTATUS_DEBUG(1, "jetsam: pid %d suspended for diagnosis - memorystatus_available_pages: %d\n", aPid, memorystatus_available_pages);
2026				memorystatus_update_snapshot_locked(p, kMemorystatusKilledDiagnostic);
2027				p->p_memstat_state |= P_MEMSTAT_DIAG_SUSPENDED;
2028
2029				p = proc_ref_locked(p);
2030				proc_list_unlock();
2031				if (p) {
2032					task_suspend(p->task);
2033					proc_rele(p);
2034					killed = TRUE;
2035				}
2036
2037				goto exit;
2038			} else
2039#endif /* DEVELOPMENT || DEBUG */
2040			{
2041				memorystatus_update_snapshot_locked(p, kMemorystatusKilledHiwat);
2042
2043				p = proc_ref_locked(p);
2044				proc_list_unlock();
2045				if (p) {
2046				    printf("memorystatus: jetsam killing pid %d [%s] (highwater) - memorystatus_available_pages: %d\n",
2047        					aPid, (p->p_comm ? p->p_comm : "(unknown)"), memorystatus_available_pages);
2048				    killed = memorystatus_do_kill(p, kMemorystatusKilledHiwat);
2049				}
2050
2051				/* Success? */
2052				if (killed) {
2053					proc_rele(p);
2054					goto exit;
2055				}
2056
2057				/* Failure - unwind and restart. */
2058				proc_list_lock();
2059				proc_rele_locked(p);
2060				p->p_memstat_state &= ~P_MEMSTAT_TERMINATED;
2061				p->p_memstat_state |= P_MEMSTAT_ERROR;
2062				*errors += 1;
2063				i = 0;
2064				next_p = memorystatus_get_first_proc_locked(&i, TRUE);
2065			}
2066		}
2067	}
2068
2069	proc_list_unlock();
2070
2071exit:
2072	/* Clear snapshot if freshly captured and no target was found */
2073	if (new_snapshot && !killed) {
2074		memorystatus_jetsam_snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
2075	}
2076
2077	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_JETSAM_HIWAT) | DBG_FUNC_END,
2078	    memorystatus_available_pages, killed ? aPid : 0, 0, 0, 0);
2079
2080	return killed;
2081}
2082
2083#endif /* LEGACY_HIWATER */
2084
2085static boolean_t
2086memorystatus_kill_process_async(pid_t victim_pid, uint32_t cause) {
2087	/* TODO: allow a general async path */
2088	if ((victim_pid != -1) || (cause != kMemorystatusKilledVMPageShortage || cause != kMemorystatusKilledVMThrashing)) {
2089		return FALSE;
2090	}
2091
2092	kill_under_pressure = TRUE;
2093	memorystatus_thread_wake();
2094	return TRUE;
2095}
2096
2097static boolean_t
2098memorystatus_kill_process_sync(pid_t victim_pid, uint32_t cause) {
2099	boolean_t res;
2100	uint32_t errors = 0;
2101
2102	if (victim_pid == -1) {
2103		/* No pid, so kill first process */
2104		res = memorystatus_kill_top_process(TRUE, cause, NULL, &errors);
2105	} else {
2106		res = memorystatus_kill_specific_process(victim_pid, cause);
2107	}
2108
2109	if (errors) {
2110		memorystatus_clear_errors();
2111	}
2112
2113	if (res == TRUE) {
2114		/* Fire off snapshot notification */
2115		size_t snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) +
2116			sizeof(memorystatus_jetsam_snapshot_entry_t) * memorystatus_jetsam_snapshot_count;
2117		memorystatus_jetsam_snapshot->notification_time = mach_absolute_time();
2118		memorystatus_send_note(kMemorystatusSnapshotNote, &snapshot_size, sizeof(snapshot_size));
2119	}
2120
2121	return res;
2122}
2123
2124boolean_t
2125memorystatus_kill_on_VM_page_shortage(boolean_t async) {
2126	if (async) {
2127		return memorystatus_kill_process_async(-1, kMemorystatusKilledVMPageShortage);
2128	} else {
2129		return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMPageShortage);
2130	}
2131}
2132
2133boolean_t
2134memorystatus_kill_on_VM_thrashing(boolean_t async) {
2135	if (async) {
2136		return memorystatus_kill_process_async(-1, kMemorystatusKilledVMThrashing);
2137	} else {
2138		return memorystatus_kill_process_sync(-1, kMemorystatusKilledVMThrashing);
2139	}
2140}
2141
2142boolean_t
2143memorystatus_kill_on_vnode_limit(void) {
2144	return memorystatus_kill_process_sync(-1, kMemorystatusKilledVnodes);
2145}
2146
2147#endif /* CONFIG_JETSAM */
2148
2149#if CONFIG_FREEZE
2150
2151__private_extern__ void
2152memorystatus_freeze_init(void)
2153{
2154	kern_return_t result;
2155	thread_t thread;
2156
2157	result = kernel_thread_start(memorystatus_freeze_thread, NULL, &thread);
2158	if (result == KERN_SUCCESS) {
2159		thread_deallocate(thread);
2160	} else {
2161		panic("Could not create memorystatus_freeze_thread");
2162	}
2163}
2164
2165static int
2166memorystatus_freeze_top_process(boolean_t *memorystatus_freeze_swap_low)
2167{
2168	pid_t aPid = 0;
2169	int ret = -1;
2170	proc_t p = PROC_NULL, next_p = PROC_NULL;
2171	unsigned int i = 0;
2172
2173	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_START,
2174		memorystatus_available_pages, 0, 0, 0, 0);
2175
2176	proc_list_lock();
2177
2178	next_p = memorystatus_get_first_proc_locked(&i, TRUE);
2179	while (next_p) {
2180		kern_return_t kr;
2181		uint32_t purgeable, wired, clean, dirty;
2182		boolean_t shared;
2183		uint32_t pages;
2184		uint32_t max_pages = 0;
2185		uint32_t state;
2186
2187		p = next_p;
2188		next_p = memorystatus_get_next_proc_locked(&i, p, TRUE);
2189
2190		aPid = p->p_pid;
2191		state = p->p_memstat_state;
2192
2193		/* Ensure the process is eligible for freezing */
2194		if ((state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_FROZEN)) || !(state & P_MEMSTAT_SUSPENDED)) {
2195			continue; // with lock held
2196		}
2197
2198		/* Only freeze processes meeting our minimum resident page criteria */
2199		memorystatus_get_task_page_counts(p->task, &pages, NULL);
2200		if (pages < memorystatus_freeze_pages_min) {
2201			continue; // with lock held
2202		}
2203
2204		if (DEFAULT_FREEZER_IS_ACTIVE || DEFAULT_FREEZER_COMPRESSED_PAGER_IS_ACTIVE) {
2205			/* Ensure there's enough free space to freeze this process. */
2206			max_pages = MIN(default_pager_swap_pages_free(), memorystatus_freeze_pages_max);
2207			if (max_pages < memorystatus_freeze_pages_min) {
2208				*memorystatus_freeze_swap_low = TRUE;
2209				proc_list_unlock();
2210				goto exit;
2211			}
2212		} else {
2213			max_pages = UINT32_MAX - 1;
2214		}
2215
2216		/* Mark as locked temporarily to avoid kill */
2217		p->p_memstat_state |= P_MEMSTAT_LOCKED;
2218
2219		p = proc_ref_locked(p);
2220		proc_list_unlock();
2221		if (!p) {
2222			goto exit;
2223		}
2224
2225		kr = task_freeze(p->task, &purgeable, &wired, &clean, &dirty, max_pages, &shared, FALSE);
2226
2227		MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_top_process: task_freeze %s for pid %d [%s] - "
2228    			"memorystatus_pages: %d, purgeable: %d, wired: %d, clean: %d, dirty: %d, shared %d, free swap: %d\n",
2229       		(kr == KERN_SUCCESS) ? "SUCCEEDED" : "FAILED", aPid, (p->p_comm ? p->p_comm : "(unknown)"),
2230       		memorystatus_available_pages, purgeable, wired, clean, dirty, shared, default_pager_swap_pages_free());
2231
2232		proc_list_lock();
2233		p->p_memstat_state &= ~P_MEMSTAT_LOCKED;
2234
2235		/* Success? */
2236		if (KERN_SUCCESS == kr) {
2237			memorystatus_freeze_entry_t data = { aPid, TRUE, dirty };
2238
2239			memorystatus_frozen_count++;
2240
2241			p->p_memstat_state |= (P_MEMSTAT_FROZEN | (shared ? 0: P_MEMSTAT_NORECLAIM));
2242
2243			/* Update stats */
2244			for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
2245       				throttle_intervals[i].pageouts += dirty;
2246			}
2247
2248			memorystatus_freeze_pageouts += dirty;
2249			memorystatus_freeze_count++;
2250
2251			proc_list_unlock();
2252
2253			memorystatus_send_note(kMemorystatusFreezeNote, &data, sizeof(data));
2254
2255			/* Return the number of reclaimed pages */
2256			ret = dirty;
2257
2258		} else {
2259			proc_list_unlock();
2260		}
2261
2262		proc_rele(p);
2263		goto exit;
2264	}
2265
2266	proc_list_unlock();
2267
2268exit:
2269	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_FREEZE) | DBG_FUNC_END,
2270		memorystatus_available_pages, aPid, 0, 0, 0);
2271
2272	return ret;
2273}
2274
2275static inline boolean_t
2276memorystatus_can_freeze_processes(void)
2277{
2278	boolean_t ret;
2279
2280	proc_list_lock();
2281
2282	if (memorystatus_suspended_count) {
2283		uint32_t average_resident_pages, estimated_processes;
2284
2285		/* Estimate the number of suspended processes we can fit */
2286		average_resident_pages = memorystatus_suspended_footprint_total / memorystatus_suspended_count;
2287		estimated_processes = memorystatus_suspended_count +
2288			((memorystatus_available_pages - memorystatus_available_pages_critical) / average_resident_pages);
2289
2290		/* If it's predicted that no freeze will occur, lower the threshold temporarily */
2291		if (estimated_processes <= FREEZE_SUSPENDED_THRESHOLD_DEFAULT) {
2292			memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_LOW;
2293		} else {
2294			memorystatus_freeze_suspended_threshold = FREEZE_SUSPENDED_THRESHOLD_DEFAULT;
2295		}
2296
2297		MEMORYSTATUS_DEBUG(1, "memorystatus_can_freeze_processes: %d suspended processes, %d average resident pages / process, %d suspended processes estimated\n",
2298			memorystatus_suspended_count, average_resident_pages, estimated_processes);
2299
2300		if ((memorystatus_suspended_count - memorystatus_frozen_count) > memorystatus_freeze_suspended_threshold) {
2301			ret = TRUE;
2302		} else {
2303			ret = FALSE;
2304		}
2305	} else {
2306		ret = FALSE;
2307	}
2308
2309	proc_list_unlock();
2310
2311	return ret;
2312}
2313
2314static boolean_t
2315memorystatus_can_freeze(boolean_t *memorystatus_freeze_swap_low)
2316{
2317	/* Only freeze if we're sufficiently low on memory; this holds off freeze right
2318	   after boot,  and is generally is a no-op once we've reached steady state. */
2319	if (memorystatus_available_pages > memorystatus_freeze_threshold) {
2320		return FALSE;
2321	}
2322
2323	/* Check minimum suspended process threshold. */
2324	if (!memorystatus_can_freeze_processes()) {
2325		return FALSE;
2326	}
2327
2328	/* Is swap running low? */
2329	if (*memorystatus_freeze_swap_low) {
2330		/* If there's been no movement in free swap pages since we last attempted freeze, return. */
2331		if (default_pager_swap_pages_free() < memorystatus_freeze_pages_min) {
2332			return FALSE;
2333		}
2334
2335		/* Pages have been freed - we can retry. */
2336		*memorystatus_freeze_swap_low = FALSE;
2337	}
2338
2339	/* OK */
2340	return TRUE;
2341}
2342
2343static void
2344memorystatus_freeze_update_throttle_interval(mach_timespec_t *ts, struct throttle_interval_t *interval)
2345{
2346	if (CMP_MACH_TIMESPEC(ts, &interval->ts) >= 0) {
2347		if (!interval->max_pageouts) {
2348			interval->max_pageouts = (interval->burst_multiple * (((uint64_t)interval->mins * FREEZE_DAILY_PAGEOUTS_MAX) / (24 * 60)));
2349		} else {
2350			printf("memorystatus_freeze_update_throttle_interval: %d minute throttle timeout, resetting\n", interval->mins);
2351		}
2352		interval->ts.tv_sec = interval->mins * 60;
2353		interval->ts.tv_nsec = 0;
2354		ADD_MACH_TIMESPEC(&interval->ts, ts);
2355		/* Since we update the throttle stats pre-freeze, adjust for overshoot here */
2356		if (interval->pageouts > interval->max_pageouts) {
2357			interval->pageouts -= interval->max_pageouts;
2358		} else {
2359			interval->pageouts = 0;
2360		}
2361		interval->throttle = FALSE;
2362	} else if (!interval->throttle && interval->pageouts >= interval->max_pageouts) {
2363		printf("memorystatus_freeze_update_throttle_interval: %d minute pageout limit exceeded; enabling throttle\n", interval->mins);
2364		interval->throttle = TRUE;
2365	}
2366
2367	MEMORYSTATUS_DEBUG(1, "memorystatus_freeze_update_throttle_interval: throttle updated - %d frozen (%d max) within %dm; %dm remaining; throttle %s\n",
2368		interval->pageouts, interval->max_pageouts, interval->mins, (interval->ts.tv_sec - ts->tv_sec) / 60,
2369		interval->throttle ? "on" : "off");
2370}
2371
2372static boolean_t
2373memorystatus_freeze_update_throttle(void)
2374{
2375	clock_sec_t sec;
2376	clock_nsec_t nsec;
2377	mach_timespec_t ts;
2378	uint32_t i;
2379	boolean_t throttled = FALSE;
2380
2381#if DEVELOPMENT || DEBUG
2382	if (!memorystatus_freeze_throttle_enabled)
2383		return FALSE;
2384#endif
2385
2386	clock_get_system_nanotime(&sec, &nsec);
2387	ts.tv_sec = sec;
2388	ts.tv_nsec = nsec;
2389
2390	/* Check freeze pageouts over multiple intervals and throttle if we've exceeded our budget.
2391	 *
2392	 * This ensures that periods of inactivity can't be used as 'credit' towards freeze if the device has
2393	 * remained dormant for a long period. We do, however, allow increased thresholds for shorter intervals in
2394	 * order to allow for bursts of activity.
2395	 */
2396	for (i = 0; i < sizeof(throttle_intervals) / sizeof(struct throttle_interval_t); i++) {
2397		memorystatus_freeze_update_throttle_interval(&ts, &throttle_intervals[i]);
2398		if (throttle_intervals[i].throttle == TRUE)
2399			throttled = TRUE;
2400	}
2401
2402	return throttled;
2403}
2404
2405static void
2406memorystatus_freeze_thread(void *param __unused, wait_result_t wr __unused)
2407{
2408	static boolean_t memorystatus_freeze_swap_low = FALSE;
2409
2410	if (memorystatus_freeze_enabled) {
2411		if (memorystatus_can_freeze(&memorystatus_freeze_swap_low)) {
2412			/* Only freeze if we've not exceeded our pageout budgets */
2413			if (!memorystatus_freeze_update_throttle()) {
2414				memorystatus_freeze_top_process(&memorystatus_freeze_swap_low);
2415			} else {
2416				printf("memorystatus_freeze_thread: in throttle, ignoring freeze\n");
2417				memorystatus_freeze_throttle_count++; /* Throttled, update stats */
2418			}
2419		}
2420	}
2421
2422	assert_wait((event_t) &memorystatus_freeze_wakeup, THREAD_UNINT);
2423	thread_block((thread_continue_t) memorystatus_freeze_thread);
2424}
2425
2426#endif /* CONFIG_FREEZE */
2427
2428#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
2429
2430boolean_t
2431memorystatus_warn_process(pid_t pid) {
2432	return (vm_dispatch_pressure_note_to_pid(pid, FALSE) == 0);
2433}
2434
2435static inline boolean_t
2436memorystatus_update_pressure_locked(boolean_t *pressured) {
2437	vm_pressure_level_t old_level, new_level;
2438
2439	old_level = memorystatus_vm_pressure_level;
2440
2441	if (memorystatus_available_pages > memorystatus_available_pages_pressure) {
2442		/* Too many free pages */
2443		new_level = kVMPressureNormal;
2444	}
2445#if CONFIG_FREEZE
2446	else if (memorystatus_frozen_count > 0) {
2447		/* Frozen processes exist */
2448		new_level = kVMPressureNormal;
2449	}
2450#endif
2451	else if (memorystatus_suspended_count > MEMORYSTATUS_SUSPENDED_THRESHOLD) {
2452		/* Too many supended processes */
2453		new_level = kVMPressureNormal;
2454	}
2455	else if (memorystatus_suspended_count > 0) {
2456		/* Some suspended processes - warn */
2457		new_level = kVMPressureWarning;
2458	}
2459    else {
2460		/* Otherwise, pressure level is urgent */
2461		new_level = kVMPressureUrgent;
2462	}
2463
2464	*pressured = (new_level != kVMPressureNormal);
2465
2466	/* Did the pressure level change? */
2467	if (old_level != new_level) {
2468		MEMORYSTATUS_DEBUG(1, "memorystatus_update_pressure_locked(): memory pressure changed %d -> %d; memorystatus_available_pages: %d\n ",
2469		    old_level, new_level, memorystatus_available_pages);
2470		memorystatus_vm_pressure_level = new_level;
2471		return TRUE;
2472	}
2473
2474	return FALSE;
2475}
2476
2477kern_return_t
2478memorystatus_update_vm_pressure(boolean_t target_foreground) {
2479	boolean_t pressure_changed, pressured;
2480	boolean_t warn = FALSE;
2481
2482	/*
2483	 * Centralised pressure handling routine. Called from:
2484	 * - The main jetsam thread. In this case, we update the pressure level and dispatch warnings to the foreground
2485	 *   process *only*, each time the available page % drops.
2486	 * - The pageout scan path. In this scenario, every other registered process is targeted in footprint order.
2487	 *
2488	 * This scheme guarantees delivery to the foreground app, while providing for warnings to the remaining processes
2489	 * driven by the pageout scan.
2490	 */
2491
2492	MEMORYSTATUS_DEBUG(1, "memorystatus_update_vm_pressure(): foreground %d; available %d, critical %d, pressure %d\n",
2493        	target_foreground, memorystatus_available_pages, memorystatus_available_pages_critical, memorystatus_available_pages_pressure);
2494
2495	proc_list_lock();
2496
2497	pressure_changed = memorystatus_update_pressure_locked(&pressured);
2498
2499	if (pressured) {
2500		if (target_foreground) {
2501			if (memorystatus_available_pages != memorystatus_last_foreground_pressure_pages) {
2502				if (memorystatus_available_pages < memorystatus_last_foreground_pressure_pages) {
2503					warn = TRUE;
2504				}
2505				memorystatus_last_foreground_pressure_pages = memorystatus_available_pages;
2506			}
2507		} else {
2508			warn = TRUE;
2509		}
2510	} else if (pressure_changed) {
2511		memorystatus_last_foreground_pressure_pages =  (unsigned int)-1;
2512	}
2513
2514	proc_list_unlock();
2515
2516	/* Target foreground processes if specified */
2517	if (warn) {
2518		if (target_foreground) {
2519			MEMORYSTATUS_DEBUG(1, "memorystatus_update_vm_pressure(): invoking vm_find_pressure_foreground_candidates()\n");
2520			vm_find_pressure_foreground_candidates();
2521		} else {
2522			MEMORYSTATUS_DEBUG(1, "memorystatus_update_vm_pressure(): invoking vm_find_pressure_candidate()\n");
2523			/* Defer to VM code. This can race with the foreground priority, but
2524			 * it's preferable to holding onto locks for an extended period. */
2525			vm_find_pressure_candidate();
2526		}
2527	}
2528
2529	/* Dispatch the global kevent to privileged listeners */
2530	if (pressure_changed) {
2531		memorystatus_issue_pressure_kevent(pressured);
2532	}
2533
2534 	return KERN_SUCCESS;
2535}
2536
2537int
2538memorystatus_send_pressure_note(pid_t pid) {
2539 	MEMORYSTATUS_DEBUG(1, "memorystatus_send_pressure_note(): pid %d\n", pid);
2540 	return memorystatus_send_note(kMemorystatusPressureNote, &pid, sizeof(pid));
2541}
2542
2543boolean_t
2544memorystatus_bg_pressure_eligible(proc_t p) {
2545 	boolean_t eligible = FALSE;
2546
2547	proc_list_lock();
2548
2549	MEMORYSTATUS_DEBUG(1, "memorystatus_bg_pressure_eligible: pid %d, state 0x%x\n", p->p_pid, p->p_memstat_state);
2550
2551 	/* Foreground processes have already been dealt with at this point, so just test for eligibility */
2552 	if (!(p->p_memstat_state & (P_MEMSTAT_TERMINATED | P_MEMSTAT_LOCKED | P_MEMSTAT_SUSPENDED | P_MEMSTAT_FROZEN))) {
2553                eligible = TRUE;
2554	}
2555
2556	proc_list_unlock();
2557
2558 	return eligible;
2559}
2560
2561boolean_t
2562memorystatus_is_foreground_locked(proc_t p) {
2563        return ((p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND) ||
2564                (p->p_memstat_effectivepriority == JETSAM_PRIORITY_FOREGROUND_SUPPORT));
2565}
2566
2567#else /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
2568
2569/*
2570 * Trigger levels to test the mechanism.
2571 * Can be used via a sysctl.
2572 */
2573#define TEST_LOW_MEMORY_TRIGGER_ONE		1
2574#define TEST_LOW_MEMORY_TRIGGER_ALL		2
2575#define TEST_PURGEABLE_TRIGGER_ONE		3
2576#define TEST_PURGEABLE_TRIGGER_ALL		4
2577#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE	5
2578#define TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL	6
2579
2580boolean_t		memorystatus_manual_testing_on = FALSE;
2581vm_pressure_level_t	memorystatus_manual_testing_level = kVMPressureNormal;
2582
2583extern struct knote *
2584vm_pressure_select_optimal_candidate_to_notify(struct klist *, int);
2585
2586extern
2587kern_return_t vm_pressure_notification_without_levels(void);
2588
2589extern void vm_pressure_klist_lock(void);
2590extern void vm_pressure_klist_unlock(void);
2591
2592extern void vm_reset_active_list(void);
2593
2594extern void delay(int);
2595
2596#define INTER_NOTIFICATION_DELAY	(250000)	/* .25 second */
2597
2598void memorystatus_on_pageout_scan_end(void) {
2599	/* No-op */
2600}
2601
2602/*
2603 * kn_max - knote
2604 *
2605 * knote_pressure_level - to check if the knote is registered for this notification level.
2606 *
2607 * task	- task whose bits we'll be modifying
2608 *
2609 * pressure_level_to_clear - if the task has been notified of this past level, clear that notification bit so that if/when we revert to that level, the task will be notified again.
2610 *
2611 * pressure_level_to_set - the task is about to be notified of this new level. Update the task's bit notification information appropriately.
2612 *
2613 */
2614boolean_t
2615is_knote_registered_modify_task_pressure_bits(struct knote*, int, task_t, vm_pressure_level_t, vm_pressure_level_t);
2616
2617boolean_t
2618is_knote_registered_modify_task_pressure_bits(struct knote *kn_max, int knote_pressure_level, task_t task, vm_pressure_level_t pressure_level_to_clear, vm_pressure_level_t pressure_level_to_set)
2619{
2620	if (kn_max->kn_sfflags & knote_pressure_level) {
2621
2622		if (task_has_been_notified(task, pressure_level_to_clear) == TRUE) {
2623
2624			task_clear_has_been_notified(task, pressure_level_to_clear);
2625		}
2626
2627		task_mark_has_been_notified(task, pressure_level_to_set);
2628		return TRUE;
2629	}
2630
2631	return FALSE;
2632}
2633
2634extern kern_return_t vm_pressure_notify_dispatch_vm_clients(void);
2635
2636kern_return_t
2637memorystatus_update_vm_pressure(boolean_t target_best_process)
2638{
2639	struct knote			*kn_max = NULL;
2640        pid_t				target_pid = -1;
2641        struct klist			dispatch_klist = { NULL };
2642	proc_t				target_proc = PROC_NULL;
2643	static vm_pressure_level_t 	level_snapshot = kVMPressureNormal;
2644	struct task			*task = NULL;
2645	boolean_t			found_candidate = FALSE;
2646
2647	while (1) {
2648
2649		/*
2650		 * There is a race window here. But it's not clear
2651		 * how much we benefit from having extra synchronization.
2652		 */
2653		level_snapshot = memorystatus_vm_pressure_level;
2654
2655		memorystatus_klist_lock();
2656		kn_max = vm_pressure_select_optimal_candidate_to_notify(&memorystatus_klist, level_snapshot);
2657
2658        	if (kn_max == NULL) {
2659			memorystatus_klist_unlock();
2660
2661			/*
2662			 * No more level-based clients to notify.
2663			 * Try the non-level based notification clients.
2664			 *
2665			 * However, these non-level clients don't understand
2666			 * the "return-to-normal" notification.
2667			 *
2668			 * So don't consider them for those notifications. Just
2669			 * return instead.
2670			 *
2671			 */
2672
2673			if (level_snapshot != kVMPressureNormal) {
2674				goto try_dispatch_vm_clients;
2675			} else {
2676				return KERN_FAILURE;
2677			}
2678		}
2679
2680		target_proc = kn_max->kn_kq->kq_p;
2681
2682		proc_list_lock();
2683		if (target_proc != proc_ref_locked(target_proc)) {
2684			target_proc = PROC_NULL;
2685			proc_list_unlock();
2686			memorystatus_klist_unlock();
2687			continue;
2688		}
2689		proc_list_unlock();
2690		memorystatus_klist_unlock();
2691
2692		target_pid = target_proc->p_pid;
2693
2694		task = (struct task *)(target_proc->task);
2695
2696		if (level_snapshot != kVMPressureNormal) {
2697
2698			if (level_snapshot == kVMPressureWarning || level_snapshot == kVMPressureUrgent) {
2699
2700				if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_WARN, task, kVMPressureCritical, kVMPressureWarning) == TRUE) {
2701					found_candidate = TRUE;
2702				}
2703			} else {
2704				if (level_snapshot == kVMPressureCritical) {
2705
2706					if (is_knote_registered_modify_task_pressure_bits(kn_max, NOTE_MEMORYSTATUS_PRESSURE_CRITICAL, task, kVMPressureWarning, kVMPressureCritical) == TRUE) {
2707						found_candidate = TRUE;
2708					}
2709				}
2710			}
2711		} else {
2712			if (kn_max->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
2713
2714				task_clear_has_been_notified(task, kVMPressureWarning);
2715				task_clear_has_been_notified(task, kVMPressureCritical);
2716
2717				found_candidate = TRUE;
2718			}
2719		}
2720
2721		if (found_candidate == FALSE) {
2722			continue;
2723		}
2724
2725		memorystatus_klist_lock();
2726		KNOTE_DETACH(&memorystatus_klist, kn_max);
2727		KNOTE_ATTACH(&dispatch_klist, kn_max);
2728		memorystatus_klist_unlock();
2729
2730		KNOTE(&dispatch_klist, (level_snapshot != kVMPressureNormal) ? kMemorystatusPressure : kMemorystatusNoPressure);
2731
2732		memorystatus_klist_lock();
2733		KNOTE_DETACH(&dispatch_klist, kn_max);
2734		KNOTE_ATTACH(&memorystatus_klist, kn_max);
2735		memorystatus_klist_unlock();
2736
2737		microuptime(&target_proc->vm_pressure_last_notify_tstamp);
2738		proc_rele(target_proc);
2739
2740		if (target_best_process == TRUE) {
2741			break;
2742		}
2743
2744try_dispatch_vm_clients:
2745		if (level_snapshot != kVMPressureNormal) {
2746			/*
2747			 * Wake up idle-exit thread.
2748			 * Targets one process per invocation.
2749			 *
2750			 * TODO: memorystatus_idle_exit_from_VM should return FALSE once it's
2751			 * done with all idle-exitable processes. Currently, we will exit this
2752			 * loop when we are done with notification clients (level and non-level based)
2753			 * but we may still have some idle-exitable processes around.
2754			 *
2755			 */
2756			memorystatus_idle_exit_from_VM();
2757
2758			if ((vm_pressure_notify_dispatch_vm_clients() == KERN_FAILURE) && (kn_max == NULL)) {
2759				/*
2760				 * kn_max == NULL i.e. we didn't find any eligible clients for the level-based notifications
2761				 * AND
2762				 * we have failed to find any eligible clients for the non-level based notifications too.
2763				 * So, we are done.
2764				 */
2765
2766				return KERN_FAILURE;
2767			}
2768		}
2769
2770		if (memorystatus_manual_testing_on == FALSE) {
2771			delay(INTER_NOTIFICATION_DELAY);
2772		}
2773	}
2774
2775	return KERN_SUCCESS;
2776}
2777
2778vm_pressure_level_t
2779convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t);
2780
2781vm_pressure_level_t
2782convert_internal_pressure_level_to_dispatch_level(vm_pressure_level_t internal_pressure_level)
2783{
2784	vm_pressure_level_t	dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
2785
2786	switch (internal_pressure_level) {
2787
2788		case kVMPressureNormal:
2789		{
2790			dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
2791			break;
2792		}
2793
2794		case kVMPressureWarning:
2795		case kVMPressureUrgent:
2796		{
2797			dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_WARN;
2798			break;
2799		}
2800
2801		case kVMPressureCritical:
2802		{
2803			dispatch_level = NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
2804			break;
2805		}
2806
2807		default:
2808			break;
2809	}
2810
2811	return dispatch_level;
2812}
2813
2814static int
2815sysctl_memorystatus_vm_pressure_level SYSCTL_HANDLER_ARGS
2816{
2817#pragma unused(arg1, arg2, oidp)
2818
2819	vm_pressure_level_t dispatch_level = convert_internal_pressure_level_to_dispatch_level(memorystatus_vm_pressure_level);
2820
2821	return SYSCTL_OUT(req, &dispatch_level, sizeof(dispatch_level));
2822}
2823
2824SYSCTL_PROC(_kern, OID_AUTO, memorystatus_vm_pressure_level, CTLTYPE_INT|CTLFLAG_RD|CTLFLAG_LOCKED,
2825    0, 0, &sysctl_memorystatus_vm_pressure_level, "I", "");
2826
2827
2828extern int memorystatus_purge_on_warning;
2829extern int memorystatus_purge_on_critical;
2830
2831static int
2832sysctl_memorypressure_manual_trigger SYSCTL_HANDLER_ARGS
2833{
2834#pragma unused(arg1, arg2)
2835
2836	int level = 0;
2837	int error = 0;
2838	int pressure_level = 0;
2839	int trigger_request = 0;
2840	int force_purge;
2841
2842	error = sysctl_handle_int(oidp, &level, 0, req);
2843	if (error || !req->newptr) {
2844		return (error);
2845	}
2846
2847	memorystatus_manual_testing_on = TRUE;
2848
2849	trigger_request = (level >> 16) & 0xFFFF;
2850	pressure_level = (level & 0xFFFF);
2851
2852	if (trigger_request < TEST_LOW_MEMORY_TRIGGER_ONE ||
2853	    trigger_request > TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL) {
2854		return EINVAL;
2855	}
2856	switch (pressure_level) {
2857	case NOTE_MEMORYSTATUS_PRESSURE_NORMAL:
2858	case NOTE_MEMORYSTATUS_PRESSURE_WARN:
2859	case NOTE_MEMORYSTATUS_PRESSURE_CRITICAL:
2860		break;
2861	default:
2862		return EINVAL;
2863	}
2864
2865	/*
2866	 * The pressure level is being set from user-space.
2867	 * And user-space uses the constants in sys/event.h
2868	 * So we translate those events to our internal levels here.
2869	 */
2870	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
2871
2872		memorystatus_manual_testing_level = kVMPressureNormal;
2873		force_purge = 0;
2874
2875	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_WARN) {
2876
2877		memorystatus_manual_testing_level = kVMPressureWarning;
2878		force_purge = memorystatus_purge_on_warning;
2879
2880	} else if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
2881
2882		memorystatus_manual_testing_level = kVMPressureCritical;
2883		force_purge = memorystatus_purge_on_critical;
2884	}
2885
2886	memorystatus_vm_pressure_level = memorystatus_manual_testing_level;
2887
2888	/* purge according to the new pressure level */
2889	switch (trigger_request) {
2890	case TEST_PURGEABLE_TRIGGER_ONE:
2891	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE:
2892		if (force_purge == 0) {
2893			/* no purging requested */
2894			break;
2895		}
2896		vm_purgeable_object_purge_one_unlocked(force_purge);
2897		break;
2898	case TEST_PURGEABLE_TRIGGER_ALL:
2899	case TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL:
2900		if (force_purge == 0) {
2901			/* no purging requested */
2902			break;
2903		}
2904		while (vm_purgeable_object_purge_one_unlocked(force_purge));
2905		break;
2906	}
2907
2908	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ONE) ||
2909	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ONE)) {
2910
2911		memorystatus_update_vm_pressure(TRUE);
2912	}
2913
2914	if ((trigger_request == TEST_LOW_MEMORY_TRIGGER_ALL) ||
2915	    (trigger_request == TEST_LOW_MEMORY_PURGEABLE_TRIGGER_ALL)) {
2916
2917		while (memorystatus_update_vm_pressure(FALSE) == KERN_SUCCESS) {
2918			continue;
2919		}
2920	}
2921
2922	if (pressure_level == NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
2923		memorystatus_manual_testing_on = FALSE;
2924
2925		vm_pressure_klist_lock();
2926		vm_reset_active_list();
2927		vm_pressure_klist_unlock();
2928	} else {
2929
2930		vm_pressure_klist_lock();
2931		vm_pressure_notification_without_levels();
2932		vm_pressure_klist_unlock();
2933	}
2934
2935	return 0;
2936}
2937
2938SYSCTL_PROC(_kern, OID_AUTO, memorypressure_manual_trigger, CTLTYPE_INT|CTLFLAG_WR|CTLFLAG_LOCKED|CTLFLAG_MASKED,
2939    0, 0, &sysctl_memorypressure_manual_trigger, "I", "");
2940
2941
2942extern int memorystatus_purge_on_warning;
2943extern int memorystatus_purge_on_urgent;
2944extern int memorystatus_purge_on_critical;
2945
2946SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_warning, CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_warning, 0, "");
2947SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_urgent, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_urgent, 0, "");
2948SYSCTL_INT(_kern, OID_AUTO, memorystatus_purge_on_critical, CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_LOCKED, &memorystatus_purge_on_critical, 0, "");
2949
2950
2951#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
2952
2953/* Return both allocated and actual size, since there's a race between allocation and list compilation */
2954static int
2955memorystatus_get_priority_list(memorystatus_priority_entry_t **list_ptr, size_t *buffer_size, size_t *list_size, boolean_t size_only)
2956{
2957 	uint32_t list_count, i = 0;
2958	memorystatus_priority_entry_t *list_entry;
2959	proc_t p;
2960
2961 	list_count = memorystatus_list_count;
2962	*list_size = sizeof(memorystatus_priority_entry_t) * list_count;
2963
2964	/* Just a size check? */
2965	if (size_only) {
2966		return 0;
2967	}
2968
2969	/* Otherwise, validate the size of the buffer */
2970	if (*buffer_size < *list_size) {
2971		return EINVAL;
2972	}
2973
2974 	*list_ptr = (memorystatus_priority_entry_t*)kalloc(*list_size);
2975	if (!list_ptr) {
2976		return ENOMEM;
2977	}
2978
2979	memset(*list_ptr, 0, *list_size);
2980
2981	*buffer_size = *list_size;
2982	*list_size = 0;
2983
2984	list_entry = *list_ptr;
2985
2986	proc_list_lock();
2987
2988	p = memorystatus_get_first_proc_locked(&i, TRUE);
2989	while (p && (*list_size < *buffer_size)) {
2990		list_entry->pid = p->p_pid;
2991		list_entry->priority = p->p_memstat_effectivepriority;
2992		list_entry->user_data = p->p_memstat_userdata;
2993#if LEGACY_HIWATER
2994		if (((p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) && (p->p_memstat_effectivepriority >= JETSAM_PRIORITY_FOREGROUND)) ||
2995		     (p->p_memstat_memlimit <= 0)) {
2996			task_get_phys_footprint_limit(p->task, &list_entry->limit);
2997		} else {
2998			list_entry->limit = p->p_memstat_memlimit;
2999		}
3000#else
3001		task_get_phys_footprint_limit(p->task, &list_entry->limit);
3002#endif
3003		list_entry->state = memorystatus_build_state(p);
3004		list_entry++;
3005
3006		*list_size += sizeof(memorystatus_priority_entry_t);
3007
3008		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
3009	}
3010
3011	proc_list_unlock();
3012
3013	MEMORYSTATUS_DEBUG(1, "memorystatus_get_priority_list: returning %lu for size\n", (unsigned long)*list_size);
3014
3015	return 0;
3016}
3017
3018static int
3019memorystatus_cmd_get_priority_list(user_addr_t buffer, size_t buffer_size, int32_t *retval) {
3020	int error = EINVAL;
3021	boolean_t size_only;
3022	memorystatus_priority_entry_t *list = NULL;
3023	size_t list_size;
3024
3025	size_only = ((buffer == USER_ADDR_NULL) ? TRUE: FALSE);
3026
3027	error = memorystatus_get_priority_list(&list, &buffer_size, &list_size, size_only);
3028	if (error) {
3029		goto out;
3030	}
3031
3032	if (!size_only) {
3033		error = copyout(list, buffer, list_size);
3034	}
3035
3036	if (error == 0) {
3037		*retval = list_size;
3038	}
3039out:
3040
3041	if (list) {
3042		kfree(list, buffer_size);
3043	}
3044
3045	return error;
3046}
3047
3048#if CONFIG_JETSAM
3049
3050static void
3051memorystatus_clear_errors(void)
3052{
3053	proc_t p;
3054	unsigned int i = 0;
3055
3056	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_START, 0, 0, 0, 0, 0);
3057
3058	proc_list_lock();
3059
3060	p = memorystatus_get_first_proc_locked(&i, TRUE);
3061	while (p) {
3062		if (p->p_memstat_state & P_MEMSTAT_ERROR) {
3063			p->p_memstat_state &= ~P_MEMSTAT_ERROR;
3064		}
3065		p = memorystatus_get_next_proc_locked(&i, p, TRUE);
3066	}
3067
3068	proc_list_unlock();
3069
3070	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_MEMSTAT, BSD_MEMSTAT_CLEAR_ERRORS) | DBG_FUNC_END, 0, 0, 0, 0, 0);
3071}
3072
3073static void
3074memorystatus_update_levels_locked(boolean_t critical_only) {
3075	memorystatus_available_pages_critical = memorystatus_available_pages_critical_base;
3076#if !LATENCY_JETSAM
3077	{
3078		// If there's an entry in the first bucket, we have idle processes
3079		memstat_bucket_t *first_bucket = &memstat_bucket[JETSAM_PRIORITY_IDLE];
3080		if (first_bucket->count) {
3081			memorystatus_available_pages_critical += memorystatus_available_pages_critical_idle_offset;
3082		}
3083	}
3084#endif
3085#if DEBUG || DEVELOPMENT
3086	if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
3087		memorystatus_available_pages_critical += memorystatus_jetsam_policy_offset_pages_diagnostic;
3088	}
3089#endif
3090
3091	if (critical_only) {
3092		return;
3093	}
3094
3095#if VM_PRESSURE_EVENTS
3096	memorystatus_available_pages_pressure = (pressure_threshold_percentage / delta_percentage) * memorystatus_delta;
3097#if DEBUG || DEVELOPMENT
3098	if (memorystatus_jetsam_policy & kPolicyDiagnoseActive) {
3099		memorystatus_available_pages_pressure += memorystatus_jetsam_policy_offset_pages_diagnostic;
3100	}
3101#endif
3102#endif
3103}
3104
3105static int
3106memorystatus_get_snapshot(memorystatus_jetsam_snapshot_t **snapshot, size_t *snapshot_size, boolean_t size_only) {
3107	size_t input_size = *snapshot_size;
3108
3109	if (memorystatus_jetsam_snapshot_count > 0) {
3110		*snapshot_size = sizeof(memorystatus_jetsam_snapshot_t) + (sizeof(memorystatus_jetsam_snapshot_entry_t) * (memorystatus_jetsam_snapshot_count));
3111	} else {
3112		*snapshot_size = 0;
3113	}
3114
3115	if (size_only) {
3116		return 0;
3117	}
3118
3119	if (input_size < *snapshot_size) {
3120		return EINVAL;
3121	}
3122
3123	*snapshot = memorystatus_jetsam_snapshot;
3124
3125	MEMORYSTATUS_DEBUG(1, "memorystatus_snapshot: returning %ld for size\n", (long)*snapshot_size);
3126
3127	return 0;
3128}
3129
3130static int
3131memorystatus_cmd_get_jetsam_snapshot(user_addr_t buffer, size_t buffer_size, int32_t *retval) {
3132	int error = EINVAL;
3133	boolean_t size_only;
3134	memorystatus_jetsam_snapshot_t *snapshot;
3135
3136	size_only = ((buffer == USER_ADDR_NULL) ? TRUE : FALSE);
3137
3138	error = memorystatus_get_snapshot(&snapshot, &buffer_size, size_only);
3139	if (error) {
3140		goto out;
3141	}
3142
3143	/* Copy out and reset */
3144	if (!size_only) {
3145		if ((error = copyout(snapshot, buffer, buffer_size)) == 0) {
3146			snapshot->entry_count = memorystatus_jetsam_snapshot_count = 0;
3147		}
3148	}
3149
3150	if (error == 0) {
3151		*retval = buffer_size;
3152	}
3153out:
3154	return error;
3155}
3156
3157static int
3158memorystatus_cmd_set_priority_properties(pid_t pid, user_addr_t buffer, size_t buffer_size, __unused int32_t *retval) {
3159	const uint32_t MAX_ENTRY_COUNT = 2; /* Cap the entry count */
3160
3161	int error;
3162	uint32_t i;
3163	uint32_t entry_count;
3164	memorystatus_priority_properties_t *entries;
3165
3166	/* Validate inputs */
3167	if ((pid == 0) || (buffer == USER_ADDR_NULL) || (buffer_size == 0)) {
3168		return EINVAL;
3169	}
3170
3171	/* Make sure the buffer is a multiple of the entry size, and that an excessive size isn't specified */
3172	entry_count = (buffer_size / sizeof(memorystatus_priority_properties_t));
3173	if (((buffer_size % sizeof(memorystatus_priority_properties_t)) != 0) || (entry_count > MAX_ENTRY_COUNT)) {
3174		return EINVAL;
3175	}
3176
3177	entries = (memorystatus_priority_properties_t *)kalloc(buffer_size);
3178
3179	error = copyin(buffer, entries, buffer_size);
3180
3181	for (i = 0; i < entry_count; i++) {
3182		proc_t p;
3183
3184		if (error) {
3185			break;
3186		}
3187
3188		p = proc_find(pid);
3189		if (!p) {
3190			error = ESRCH;
3191			break;
3192		}
3193
3194		if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3195			error = EPERM;
3196			proc_rele(p);
3197			break;
3198		}
3199
3200		error = memorystatus_update(p, entries[i].priority, entries[i].user_data, FALSE, FALSE, 0, 0);
3201		proc_rele(p);
3202	}
3203
3204	kfree(entries, buffer_size);
3205
3206	return error;
3207}
3208
3209static int
3210memorystatus_cmd_get_pressure_status(int32_t *retval) {
3211	int error;
3212
3213	/* Need privilege for check */
3214	error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
3215	if (error) {
3216		return (error);
3217	}
3218
3219	/* Inherently racy, so it's not worth taking a lock here */
3220	*retval = (kVMPressureNormal != memorystatus_vm_pressure_level) ? 1 : 0;
3221
3222	return error;
3223}
3224
3225static int
3226memorystatus_cmd_set_jetsam_high_water_mark(pid_t pid, int32_t high_water_mark, __unused int32_t *retval) {
3227	int error = 0;
3228
3229	proc_t p = proc_find(pid);
3230	if (!p) {
3231		return ESRCH;
3232	}
3233
3234	if (high_water_mark <= 0) {
3235		high_water_mark = -1; /* Disable */
3236	}
3237
3238	proc_list_lock();
3239
3240	if (p->p_memstat_state & P_MEMSTAT_INTERNAL) {
3241		error = EPERM;
3242		goto exit;
3243	}
3244
3245	p->p_memstat_memlimit = high_water_mark;
3246	if (memorystatus_highwater_enabled) {
3247        	if (p->p_memstat_state & P_MEMSTAT_MEMLIMIT_BACKGROUND) {
3248        		memorystatus_update_priority_locked(p, p->p_memstat_effectivepriority);
3249        	} else {
3250        		error = (task_set_phys_footprint_limit_internal(p->task, high_water_mark, NULL, TRUE) == 0) ? 0 : EINVAL;
3251        	}
3252	}
3253
3254exit:
3255	proc_list_unlock();
3256	proc_rele(p);
3257
3258	return error;
3259}
3260
3261#endif /* CONFIG_JETSAM */
3262
3263int
3264memorystatus_control(struct proc *p __unused, struct memorystatus_control_args *args, int *ret) {
3265	int error = EINVAL;
3266
3267#if !CONFIG_JETSAM
3268	#pragma unused(ret)
3269#endif
3270
3271	/* Root only for now */
3272	if (!kauth_cred_issuser(kauth_cred_get())) {
3273		error = EPERM;
3274		goto out;
3275	}
3276
3277	/* Sanity check */
3278	if (args->buffersize > MEMORYSTATUS_BUFFERSIZE_MAX) {
3279		error = EINVAL;
3280		goto out;
3281	}
3282
3283	switch (args->command) {
3284	case MEMORYSTATUS_CMD_GET_PRIORITY_LIST:
3285		error = memorystatus_cmd_get_priority_list(args->buffer, args->buffersize, ret);
3286		break;
3287#if CONFIG_JETSAM
3288	case MEMORYSTATUS_CMD_SET_PRIORITY_PROPERTIES:
3289		error = memorystatus_cmd_set_priority_properties(args->pid, args->buffer, args->buffersize, ret);
3290		break;
3291	case MEMORYSTATUS_CMD_GET_JETSAM_SNAPSHOT:
3292		error = memorystatus_cmd_get_jetsam_snapshot(args->buffer, args->buffersize, ret);
3293		break;
3294	case MEMORYSTATUS_CMD_GET_PRESSURE_STATUS:
3295		error = memorystatus_cmd_get_pressure_status(ret);
3296		break;
3297	case MEMORYSTATUS_CMD_SET_JETSAM_HIGH_WATER_MARK:
3298		/* TODO: deprecate. Keeping it in as there's no pid based way to set the ledger limit right now. */
3299		error = memorystatus_cmd_set_jetsam_high_water_mark(args->pid, (int32_t)args->flags, ret);
3300		break;
3301	/* Test commands */
3302#if DEVELOPMENT || DEBUG
3303	case MEMORYSTATUS_CMD_TEST_JETSAM:
3304		error = memorystatus_kill_process_sync(args->pid, kMemorystatusKilled) ? 0 : EINVAL;
3305		break;
3306	case MEMORYSTATUS_CMD_SET_JETSAM_PANIC_BITS:
3307		error = memorystatus_cmd_set_panic_bits(args->buffer, args->buffersize);
3308		break;
3309#endif /* DEVELOPMENT || DEBUG */
3310#endif /* CONFIG_JETSAM */
3311	default:
3312		break;
3313	}
3314
3315out:
3316	return error;
3317}
3318
3319
3320static int
3321filt_memorystatusattach(struct knote *kn)
3322{
3323	kn->kn_flags |= EV_CLEAR;
3324	return memorystatus_knote_register(kn);
3325}
3326
3327static void
3328filt_memorystatusdetach(struct knote *kn)
3329{
3330	memorystatus_knote_unregister(kn);
3331}
3332
3333static int
3334filt_memorystatus(struct knote *kn __unused, long hint)
3335{
3336	if (hint) {
3337		switch (hint) {
3338		case kMemorystatusNoPressure:
3339			if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_NORMAL) {
3340				kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_NORMAL;
3341			}
3342			break;
3343		case kMemorystatusPressure:
3344			if (memorystatus_vm_pressure_level == kVMPressureWarning || memorystatus_vm_pressure_level == kVMPressureUrgent) {
3345				if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_WARN) {
3346					kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_WARN;
3347				}
3348			} else if (memorystatus_vm_pressure_level == kVMPressureCritical) {
3349
3350				if (kn->kn_sfflags & NOTE_MEMORYSTATUS_PRESSURE_CRITICAL) {
3351					kn->kn_fflags |= NOTE_MEMORYSTATUS_PRESSURE_CRITICAL;
3352				}
3353			}
3354			break;
3355		default:
3356			break;
3357		}
3358	}
3359
3360	return (kn->kn_fflags != 0);
3361}
3362
3363static void
3364memorystatus_klist_lock(void) {
3365	lck_mtx_lock(&memorystatus_klist_mutex);
3366}
3367
3368static void
3369memorystatus_klist_unlock(void) {
3370	lck_mtx_unlock(&memorystatus_klist_mutex);
3371}
3372
3373void
3374memorystatus_kevent_init(lck_grp_t *grp, lck_attr_t *attr) {
3375	lck_mtx_init(&memorystatus_klist_mutex, grp, attr);
3376	klist_init(&memorystatus_klist);
3377}
3378
3379int
3380memorystatus_knote_register(struct knote *kn) {
3381	int error = 0;
3382
3383	memorystatus_klist_lock();
3384
3385	if (kn->kn_sfflags & (NOTE_MEMORYSTATUS_PRESSURE_NORMAL | NOTE_MEMORYSTATUS_PRESSURE_WARN | NOTE_MEMORYSTATUS_PRESSURE_CRITICAL)) {
3386
3387#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
3388		/* Need a privilege to register */
3389		error = priv_check_cred(kauth_cred_get(), PRIV_VM_PRESSURE, 0);
3390#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
3391
3392		if (!error) {
3393			KNOTE_ATTACH(&memorystatus_klist, kn);
3394		}
3395	} else {
3396		error = ENOTSUP;
3397	}
3398
3399	memorystatus_klist_unlock();
3400
3401	return error;
3402}
3403
3404void
3405memorystatus_knote_unregister(struct knote *kn __unused) {
3406	memorystatus_klist_lock();
3407	KNOTE_DETACH(&memorystatus_klist, kn);
3408	memorystatus_klist_unlock();
3409}
3410
3411#if CONFIG_JETSAM && VM_PRESSURE_EVENTS
3412static boolean_t
3413memorystatus_issue_pressure_kevent(boolean_t pressured) {
3414	memorystatus_klist_lock();
3415	KNOTE(&memorystatus_klist, pressured ? kMemorystatusPressure : kMemorystatusNoPressure);
3416	memorystatus_klist_unlock();
3417	return TRUE;
3418}
3419
3420#endif /* CONFIG_JETSAM && VM_PRESSURE_EVENTS */
3421