/*
 * Copyright (c) 2006-2011 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Memory allocator with per-CPU caching, derived from the kmem magazine
 * concept and implementation as described in the following paper:
 * http://www.usenix.org/events/usenix01/full_papers/bonwick/bonwick.pdf
 * That implementation is Copyright 2006 Sun Microsystems, Inc.  All rights
 * reserved.  Use is subject to license terms.
 *
 * There are several major differences between this and the original kmem
 * magazine: this derivative implementation allows multiple objects to be
 * allocated and freed from/to the object cache in a single call; in
 * addition, it provides greater flexibility by letting the user supply a
 * custom backend slab allocator (instead of the default zone allocator).
 * Finally, no object construction/destruction takes place at the moment,
 * although this could be added in the future to improve efficiency.
 */
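
/*
 * Typical usage, as an illustrative sketch only (foo_cache and foo_t are
 * hypothetical names, not part of this file): a client creates a cache
 * for its object type once and then allocates and frees through it.
 *
 *	static mcache_t *foo_cache;
 *	foo_t *fp;
 *
 *	foo_cache = mcache_create("foo", sizeof (foo_t), 0, 0, MCR_SLEEP);
 *	fp = mcache_alloc(foo_cache, MCR_SLEEP);
 *	...
 *	mcache_free(foo_cache, fp);
 *
 * Batched transfers go through mcache_alloc_ext() and mcache_free_ext(),
 * which link objects together via their embedded obj_next pointers.
 */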

#include <sys/param.h>
#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/queue.h>
#include <sys/kernel.h>
#include <sys/systm.h>

#include <kern/debug.h>
#include <kern/zalloc.h>
#include <kern/cpu_number.h>
#include <kern/locks.h>

#include <libkern/libkern.h>
#include <libkern/OSAtomic.h>
#include <libkern/OSDebug.h>

#include <mach/vm_param.h>
#include <machine/limits.h>
#include <machine/machine_routines.h>

#include <string.h>

#include <sys/mcache.h>

#define	MCACHE_SIZE(n) \
	((size_t)(&((mcache_t *)0)->mc_cpu[n]))

/* Allocate extra in case we need to manually align the pointer */
#define	MCACHE_ALLOC_SIZE \
	(sizeof (void *) + MCACHE_SIZE(ncpu) + CPU_CACHE_SIZE)

#define	MCACHE_CPU(c) \
	(mcache_cpu_t *)((void *)((char *)(c) + MCACHE_SIZE(cpu_number())))
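
/*
 * To restate the macros above: MCACHE_SIZE(n) is the byte offset of
 * mc_cpu[n] within mcache_t, so a cache whose per-CPU array covers ncpu
 * CPUs occupies MCACHE_SIZE(ncpu) bytes, and MCACHE_CPU(c) locates the
 * calling CPU's slot by adding the offset of mc_cpu[cpu_number()] to the
 * cache pointer.
 */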

/*
 * MCACHE_LIST_LOCK() and MCACHE_LIST_UNLOCK() are macros used
 * to serialize accesses to the global list of caches in the system.
 * They also record the thread currently running in the critical
 * section, so that we can avoid recursive requests to reap the
 * caches when memory runs low.
 */
#define	MCACHE_LIST_LOCK() {				\
	lck_mtx_lock(mcache_llock);			\
	mcache_llock_owner = current_thread();		\
}

#define	MCACHE_LIST_UNLOCK() {				\
	mcache_llock_owner = NULL;			\
	lck_mtx_unlock(mcache_llock);			\
}

#define	MCACHE_LOCK(l)		lck_mtx_lock(l)
#define	MCACHE_UNLOCK(l)	lck_mtx_unlock(l)
#define	MCACHE_LOCK_TRY(l)	lck_mtx_try_lock(l)

static int ncpu;
static lck_mtx_t *mcache_llock;
static struct thread *mcache_llock_owner;
static lck_attr_t *mcache_llock_attr;
static lck_grp_t *mcache_llock_grp;
static lck_grp_attr_t *mcache_llock_grp_attr;
static struct zone *mcache_zone;
static unsigned int mcache_reap_interval;
static UInt32 mcache_reaping;
static int mcache_ready;
static int mcache_updating;

static int mcache_bkt_contention = 3;
#if DEBUG
static unsigned int mcache_flags = MCF_DEBUG;
#else
static unsigned int mcache_flags = 0;
#endif

#define	DUMP_MCA_BUF_SIZE	512
static char *mca_dump_buf;

static mcache_bkttype_t mcache_bkttype[] = {
	{ 1,	4096,	32768,	NULL },
	{ 3,	2048,	16384,	NULL },
	{ 7,	1024,	12288,	NULL },
	{ 15,	256,	8192,	NULL },
	{ 31,	64,	4096,	NULL },
	{ 47,	0,	2048,	NULL },
	{ 63,	0,	1024,	NULL },
	{ 95,	0,	512,	NULL },
	{ 143,	0,	256,	NULL },
	{ 165,	0,	0,	NULL },
};

static mcache_t *mcache_create_common(const char *, size_t, size_t,
    mcache_allocfn_t, mcache_freefn_t, mcache_auditfn_t, mcache_logfn_t,
    mcache_notifyfn_t, void *, u_int32_t, int, int);
static unsigned int mcache_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mcache_slab_free(void *, mcache_obj_t *, boolean_t);
static void mcache_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mcache_cpu_refill(mcache_cpu_t *, mcache_bkt_t *, int);
static mcache_bkt_t *mcache_bkt_alloc(mcache_t *, mcache_bktlist_t *,
    mcache_bkttype_t **);
static void mcache_bkt_free(mcache_t *, mcache_bktlist_t *, mcache_bkt_t *);
static void mcache_cache_bkt_enable(mcache_t *);
static void mcache_bkt_purge(mcache_t *);
static void mcache_bkt_destroy(mcache_t *, mcache_bkttype_t *,
    mcache_bkt_t *, int);
static void mcache_bkt_ws_update(mcache_t *);
static void mcache_bkt_ws_reap(mcache_t *);
static void mcache_dispatch(void (*)(void *), void *);
static void mcache_cache_reap(mcache_t *);
static void mcache_cache_update(mcache_t *);
static void mcache_cache_bkt_resize(void *);
static void mcache_cache_enable(void *);
static void mcache_update(void *);
static void mcache_update_timeout(void *);
static void mcache_applyall(void (*)(mcache_t *));
static void mcache_reap_start(void *);
static void mcache_reap_done(void *);
static void mcache_reap_timeout(void *);
static void mcache_notify(mcache_t *, u_int32_t);
static void mcache_purge(void *);

static LIST_HEAD(, mcache) mcache_head;
mcache_t *mcache_audit_cache;

/*
 * Initialize the framework; this is currently called as part of BSD init.
 */
__private_extern__ void
mcache_init(void)
{
	mcache_bkttype_t *btp;
	unsigned int i;
	char name[32];

	ncpu = ml_get_max_cpus();

	mcache_llock_grp_attr = lck_grp_attr_alloc_init();
	mcache_llock_grp = lck_grp_alloc_init("mcache.list",
	    mcache_llock_grp_attr);
	mcache_llock_attr = lck_attr_alloc_init();
	mcache_llock = lck_mtx_alloc_init(mcache_llock_grp, mcache_llock_attr);

	mcache_zone = zinit(MCACHE_ALLOC_SIZE, 256 * MCACHE_ALLOC_SIZE,
	    PAGE_SIZE, "mcache");
	if (mcache_zone == NULL)
		panic("mcache_init: failed to allocate mcache zone\n");
	zone_change(mcache_zone, Z_CALLERACCT, FALSE);

	LIST_INIT(&mcache_head);

	for (i = 0; i < sizeof (mcache_bkttype) / sizeof (*btp); i++) {
		btp = &mcache_bkttype[i];
		(void) snprintf(name, sizeof (name), "bkt_%d",
		    btp->bt_bktsize);
		btp->bt_cache = mcache_create(name,
		    (btp->bt_bktsize + 1) * sizeof (void *), 0, 0, MCR_SLEEP);
	}

	PE_parse_boot_argn("mcache_flags", &mcache_flags, sizeof (mcache_flags));
	mcache_flags &= MCF_FLAGS_MASK;

	mcache_audit_cache = mcache_create("audit", sizeof (mcache_audit_t),
	    0, 0, MCR_SLEEP);

	mcache_reap_interval = 15 * hz;
	mcache_applyall(mcache_cache_bkt_enable);
	mcache_ready = 1;
}

/*
 * Return the global mcache flags.
 */
__private_extern__ unsigned int
mcache_getflags(void)
{
	return (mcache_flags);
}

/*
 * Create a cache using the zone allocator as the backend slab allocator.
 * The caller may specify any alignment for the object; if it specifies 0
 * the default alignment (MCACHE_ALIGN) will be used.
 */
__private_extern__ mcache_t *
mcache_create(const char *name, size_t bufsize, size_t align,
    u_int32_t flags, int wait)
{
	return (mcache_create_common(name, bufsize, align, mcache_slab_alloc,
	    mcache_slab_free, mcache_slab_audit, NULL, NULL, NULL, flags, 1,
	    wait));
}

/*
 * Create a cache using a custom backend slab allocator.  Since the caller
 * is responsible for allocation, no alignment guarantee will be provided
 * by this framework.
 */
__private_extern__ mcache_t *
mcache_create_ext(const char *name, size_t bufsize,
    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
    mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
    u_int32_t flags, int wait)
{
	return (mcache_create_common(name, bufsize, 0, allocfn,
	    freefn, auditfn, logfn, notifyfn, arg, flags, 0, wait));
}
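
/*
 * A minimal sketch of a custom backend, for illustration only (the foo_*
 * names and foo_arg are hypothetical).  The alloc callback returns how
 * many of the requested objects it produced and chains them through
 * *plist, mirroring mcache_slab_alloc() below; the free callback releases
 * a linked list of objects, mirroring mcache_slab_free():
 *
 *	static unsigned int
 *	foo_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num,
 *	    int wait);
 *	static void
 *	foo_slab_free(void *arg, mcache_obj_t *list, boolean_t purged);
 *
 *	foo_cache = mcache_create_ext("foo", sizeof (foo_t),
 *	    foo_slab_alloc, foo_slab_free, NULL, NULL, NULL, foo_arg,
 *	    0, MCR_SLEEP);
 *
 * Since the framework does not control such a backend's buffer layout,
 * no alignment guarantee is made for caches created this way.
 */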

/*
 * Common cache creation routine.
 */
static mcache_t *
mcache_create_common(const char *name, size_t bufsize, size_t align,
    mcache_allocfn_t allocfn, mcache_freefn_t freefn, mcache_auditfn_t auditfn,
    mcache_logfn_t logfn, mcache_notifyfn_t notifyfn, void *arg,
    u_int32_t flags, int need_zone, int wait)
{
	mcache_bkttype_t *btp;
	mcache_t *cp = NULL;
	size_t chunksize;
	void *buf, **pbuf;
	int c;
	char lck_name[64];

	/* If auditing is on and print buffer is NULL, allocate it now */
	if ((flags & MCF_DEBUG) && mca_dump_buf == NULL) {
		int malloc_wait = (wait & MCR_NOSLEEP) ? M_NOWAIT : M_WAITOK;
		MALLOC(mca_dump_buf, char *, DUMP_MCA_BUF_SIZE, M_TEMP,
		    malloc_wait | M_ZERO);
		if (mca_dump_buf == NULL)
			return (NULL);
	}

	if (!(wait & MCR_NOSLEEP))
		buf = zalloc(mcache_zone);
	else
		buf = zalloc_noblock(mcache_zone);

	if (buf == NULL)
		goto fail;

	bzero(buf, MCACHE_ALLOC_SIZE);

	/*
	 * In case we didn't get cache-aligned memory, round the address
	 * up accordingly.  This is needed in order to get the rest of
	 * the structure members aligned properly.  It also means that
	 * the memory span gets shifted due to the round-up, but that
	 * is okay since we've allocated extra space for this.
	 */
	cp = (mcache_t *)
	    P2ROUNDUP((intptr_t)buf + sizeof (void *), CPU_CACHE_SIZE);
	pbuf = (void **)((intptr_t)cp - sizeof (void *));
	*pbuf = buf;

	/*
	 * Guaranteed alignment is valid only when we use the internal
	 * slab allocator (currently set to use the zone allocator).
	 */
	if (!need_zone)
		align = 1;
	else if (align == 0)
		align = MCACHE_ALIGN;

	if ((align & (align - 1)) != 0)
		panic("mcache_create: bad alignment %lu", align);

	cp->mc_align = align;
	cp->mc_slab_alloc = allocfn;
	cp->mc_slab_free = freefn;
	cp->mc_slab_audit = auditfn;
	cp->mc_slab_log = logfn;
	cp->mc_slab_notify = notifyfn;
	cp->mc_private = need_zone ? cp : arg;
	cp->mc_bufsize = bufsize;
	cp->mc_flags = (flags & MCF_FLAGS_MASK) | mcache_flags;

	(void) snprintf(cp->mc_name, sizeof (cp->mc_name), "mcache.%s", name);

	(void) snprintf(lck_name, sizeof (lck_name), "%s.cpu", cp->mc_name);
	cp->mc_cpu_lock_grp_attr = lck_grp_attr_alloc_init();
	cp->mc_cpu_lock_grp = lck_grp_alloc_init(lck_name,
	    cp->mc_cpu_lock_grp_attr);
	cp->mc_cpu_lock_attr = lck_attr_alloc_init();

	/*
	 * Allocation chunk size is the object's size plus any extra size
	 * needed to satisfy the object's alignment.  It is enforced to be
	 * at least the size of an LP64 pointer to simplify auditing and to
	 * handle multiple-element allocation requests, where the elements
	 * returned are linked together in a list.
	 */
	chunksize = MAX(bufsize, sizeof (u_int64_t));
	if (need_zone) {
		/* Enforce 64-bit minimum alignment for zone-based buffers */
		align = MAX(align, sizeof (u_int64_t));
		chunksize += sizeof (void *) + align;
		chunksize = P2ROUNDUP(chunksize, align);
		if ((cp->mc_slab_zone = zinit(chunksize, 64 * 1024 * ncpu,
		    PAGE_SIZE, cp->mc_name)) == NULL)
			goto fail;
		zone_change(cp->mc_slab_zone, Z_EXPAND, TRUE);
	}
	cp->mc_chunksize = chunksize;

	/*
	 * Initialize the bucket layer.
	 */
	(void) snprintf(lck_name, sizeof (lck_name), "%s.bkt", cp->mc_name);
	cp->mc_bkt_lock_grp_attr = lck_grp_attr_alloc_init();
	cp->mc_bkt_lock_grp = lck_grp_alloc_init(lck_name,
	    cp->mc_bkt_lock_grp_attr);
	cp->mc_bkt_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&cp->mc_bkt_lock, cp->mc_bkt_lock_grp,
	    cp->mc_bkt_lock_attr);

	(void) snprintf(lck_name, sizeof (lck_name), "%s.sync", cp->mc_name);
	cp->mc_sync_lock_grp_attr = lck_grp_attr_alloc_init();
	cp->mc_sync_lock_grp = lck_grp_alloc_init(lck_name,
	    cp->mc_sync_lock_grp_attr);
	cp->mc_sync_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&cp->mc_sync_lock, cp->mc_sync_lock_grp,
	    cp->mc_sync_lock_attr);

	for (btp = mcache_bkttype; chunksize <= btp->bt_minbuf; btp++)
		continue;

	cp->cache_bkttype = btp;

	/*
	 * Initialize the CPU layer.  Each per-CPU structure is aligned
	 * on the CPU cache line boundary to prevent false sharing.
	 */
	for (c = 0; c < ncpu; c++) {
		mcache_cpu_t *ccp = &cp->mc_cpu[c];

		VERIFY(IS_P2ALIGNED(ccp, CPU_CACHE_SIZE));
		lck_mtx_init(&ccp->cc_lock, cp->mc_cpu_lock_grp,
		    cp->mc_cpu_lock_attr);
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
	}

	if (mcache_ready)
		mcache_cache_bkt_enable(cp);

	/* TODO: dynamically create sysctl for stats */

	MCACHE_LIST_LOCK();
	LIST_INSERT_HEAD(&mcache_head, cp, mc_list);
	MCACHE_LIST_UNLOCK();

	/*
	 * If cache buckets are enabled and this is the first cache
	 * created, start the periodic cache update.
	 */
	if (!(mcache_flags & MCF_NOCPUCACHE) && !mcache_updating) {
		mcache_updating = 1;
		mcache_update_timeout(NULL);
	}
	if (cp->mc_flags & MCF_DEBUG) {
		printf("mcache_create: %s (%s) arg %p bufsize %lu align %lu "
		    "chunksize %lu bktsize %d\n", name, need_zone ? "i" : "e",
		    arg, bufsize, cp->mc_align, chunksize, btp->bt_bktsize);
	}
	return (cp);

fail:
	if (buf != NULL)
		zfree(mcache_zone, buf);
	return (NULL);
}

/*
 * Allocate one or more objects from a cache.
 */
__private_extern__ unsigned int
mcache_alloc_ext(mcache_t *cp, mcache_obj_t **list, unsigned int num, int wait)
{
	mcache_cpu_t *ccp;
	mcache_obj_t **top = &(*list);
	mcache_bkt_t *bkt;
	unsigned int need = num;
	boolean_t nwretry = FALSE;

	/* MCR_NOSLEEP and MCR_FAILOK are mutually exclusive */
	VERIFY((wait & (MCR_NOSLEEP|MCR_FAILOK)) != (MCR_NOSLEEP|MCR_FAILOK));

	ASSERT(list != NULL);
	*list = NULL;

	if (num == 0)
		return (0);

retry_alloc:
	/* We may not always be running in the same CPU in case of retries */
	ccp = MCACHE_CPU(cp);

	MCACHE_LOCK(&ccp->cc_lock);
	for (;;) {
		/*
		 * If we have an object in the current CPU's filled bucket,
		 * chain the object to any previous objects and return if
		 * we've satisfied the number of requested objects.
		 */
		if (ccp->cc_objs > 0) {
			mcache_obj_t *tail;
			int objs;

			/*
			 * Objects in the bucket are already linked together
			 * with the most recently freed object at the head of
			 * the list; grab as many objects as we can.
			 */
			objs = MIN((unsigned int)ccp->cc_objs, need);
			*list = ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];
			ccp->cc_objs -= objs;
			ccp->cc_alloc += objs;

			tail = ccp->cc_filled->bkt_obj[ccp->cc_objs];
			list = &tail->obj_next;
			*list = NULL;

			/* If we got them all, return to caller */
			if ((need -= objs) == 0) {
				MCACHE_UNLOCK(&ccp->cc_lock);

				if (!(cp->mc_flags & MCF_NOLEAKLOG) &&
				    cp->mc_slab_log != NULL)
					(*cp->mc_slab_log)(num, *top, TRUE);

				if (cp->mc_flags & MCF_DEBUG)
					goto debug_alloc;

				return (num);
			}
		}

		/*
		 * The CPU's filled bucket is empty.  If the previous filled
		 * bucket was full, exchange and try again.
		 */
		if (ccp->cc_pobjs > 0) {
			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
			continue;
		}

		/*
		 * If the bucket layer is disabled, allocate from slab.  This
		 * can happen either because MCF_NOCPUCACHE is set, or because
		 * the bucket layer is currently being resized.
		 */
		if (ccp->cc_bktsize == 0)
			break;

		/*
		 * Both of the CPU's buckets are empty; try to get a full
		 * bucket from the bucket layer.  Upon success, refill this
		 * CPU and place any empty bucket into the empty list.
		 */
		bkt = mcache_bkt_alloc(cp, &cp->mc_full, NULL);
		if (bkt != NULL) {
			if (ccp->cc_pfilled != NULL)
				mcache_bkt_free(cp, &cp->mc_empty,
				    ccp->cc_pfilled);
			mcache_cpu_refill(ccp, bkt, ccp->cc_bktsize);
			continue;
		}

		/*
		 * The bucket layer has no full buckets; allocate the
		 * object(s) directly from the slab layer.
		 */
		break;
	}
	MCACHE_UNLOCK(&ccp->cc_lock);

	need -= (*cp->mc_slab_alloc)(cp->mc_private, &list, need, wait);

	/*
	 * If this is a blocking allocation, or if it is non-blocking and
	 * the cache's full bucket is non-empty, then retry the allocation.
	 */
	if (need > 0) {
		if (!(wait & MCR_NONBLOCKING)) {
			atomic_add_32(&cp->mc_wretry_cnt, 1);
			goto retry_alloc;
		} else if ((wait & (MCR_NOSLEEP | MCR_TRYHARD)) &&
		    !mcache_bkt_isempty(cp)) {
			if (!nwretry)
				nwretry = TRUE;
			atomic_add_32(&cp->mc_nwretry_cnt, 1);
			goto retry_alloc;
		} else if (nwretry) {
			atomic_add_32(&cp->mc_nwfail_cnt, 1);
		}
	}

	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
		(*cp->mc_slab_log)((num - need), *top, TRUE);

	if (!(cp->mc_flags & MCF_DEBUG))
		return (num - need);

debug_alloc:
	if (cp->mc_flags & MCF_DEBUG) {
		mcache_obj_t **o = top;
		unsigned int n;

		n = 0;
		/*
		 * Verify that the chain of objects has the same count as
		 * what we are about to report to the caller.  Any mismatch
		 * here means that the object list is insanely broken and
		 * therefore we must panic.
		 */
		while (*o != NULL) {
			o = &(*o)->obj_next;
			++n;
		}
		if (n != (num - need)) {
			panic("mcache_alloc_ext: %s cp %p corrupted list "
			    "(got %d actual %d)\n", cp->mc_name,
			    (void *)cp, num - need, n);
		}
	}

	/* Invoke the slab layer audit callback if auditing is enabled */
	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
		(*cp->mc_slab_audit)(cp->mc_private, *top, TRUE);

	return (num - need);
}

/*
 * Allocate a single object from a cache.
 */
__private_extern__ void *
mcache_alloc(mcache_t *cp, int wait)
{
	mcache_obj_t *buf;

	(void) mcache_alloc_ext(cp, &buf, 1, wait);
	return (buf);
}
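
/*
 * Usage sketch for batched allocation (illustrative only; "n" and the
 * shortfall handling are hypothetical).  Under MCR_NOSLEEP or MCR_TRYHARD
 * the call may return fewer objects than requested, so callers check the
 * count and walk the chain through obj_next:
 *
 *	mcache_obj_t *list, *o;
 *	unsigned int got;
 *
 *	got = mcache_alloc_ext(cp, &list, n, MCR_NOSLEEP);
 *	for (o = list; o != NULL; o = o->obj_next)
 *		... use the object ...
 *	if (got < n)
 *		... handle the shortfall ...
 */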

__private_extern__ void
mcache_waiter_inc(mcache_t *cp)
{
	atomic_add_32(&cp->mc_waiter_cnt, 1);
}

__private_extern__ void
mcache_waiter_dec(mcache_t *cp)
{
	atomic_add_32(&cp->mc_waiter_cnt, -1);
}

__private_extern__ boolean_t
mcache_bkt_isempty(mcache_t *cp)
{
	/*
	 * This isn't meant to accurately tell whether there are
	 * any full buckets in the cache; it is simply a way to
	 * obtain "hints" about the state of the cache.
	 */
	return (cp->mc_full.bl_total == 0);
}

/*
 * Notify the slab layer about an event.
 */
static void
mcache_notify(mcache_t *cp, u_int32_t event)
{
	if (cp->mc_slab_notify != NULL)
		(*cp->mc_slab_notify)(cp->mc_private, event);
}

/*
 * Purge the cache and disable its buckets.
 */
static void
mcache_purge(void *arg)
{
	mcache_t *cp = arg;

	mcache_bkt_purge(cp);
	/*
	 * We cannot simply call mcache_cache_bkt_enable() from here as
	 * a bucket resize may be in flight and we would cause the CPU
	 * layers of the cache to point to different sizes.  Therefore,
	 * we simply increment the enable count so that during the next
	 * periodic cache update the buckets can be reenabled.
	 */
	lck_mtx_lock_spin(&cp->mc_sync_lock);
	cp->mc_enable_cnt++;
	lck_mtx_unlock(&cp->mc_sync_lock);
}

__private_extern__ boolean_t
mcache_purge_cache(mcache_t *cp)
{
	/*
	 * Purging a cache that has no per-CPU caches or is already
	 * in the process of being purged is rather pointless.
	 */
	if (cp->mc_flags & MCF_NOCPUCACHE)
		return (FALSE);

	lck_mtx_lock_spin(&cp->mc_sync_lock);
	if (cp->mc_purge_cnt > 0) {
		lck_mtx_unlock(&cp->mc_sync_lock);
		return (FALSE);
	}
	cp->mc_purge_cnt++;
	lck_mtx_unlock(&cp->mc_sync_lock);

	mcache_dispatch(mcache_purge, cp);

	return (TRUE);
}

/*
 * Free a single object to a cache.
 */
__private_extern__ void
mcache_free(mcache_t *cp, void *buf)
{
	((mcache_obj_t *)buf)->obj_next = NULL;
	mcache_free_ext(cp, (mcache_obj_t *)buf);
}

/*
 * Free one or more objects to a cache.
 */
__private_extern__ void
mcache_free_ext(mcache_t *cp, mcache_obj_t *list)
{
	mcache_cpu_t *ccp = MCACHE_CPU(cp);
	mcache_bkttype_t *btp;
	mcache_obj_t *nlist;
	mcache_bkt_t *bkt;

	if (!(cp->mc_flags & MCF_NOLEAKLOG) && cp->mc_slab_log != NULL)
		(*cp->mc_slab_log)(0, list, FALSE);

	/* Invoke the slab layer audit callback if auditing is enabled */
	if ((cp->mc_flags & MCF_DEBUG) && cp->mc_slab_audit != NULL)
		(*cp->mc_slab_audit)(cp->mc_private, list, FALSE);

	MCACHE_LOCK(&ccp->cc_lock);
	for (;;) {
		/*
		 * If there is space in the current CPU's filled bucket, put
		 * the object there and return once all objects are freed.
		 * Note the cast to unsigned integer takes care of the case
		 * where the bucket layer is disabled (when cc_objs is -1).
		 */
		if ((unsigned int)ccp->cc_objs <
		    (unsigned int)ccp->cc_bktsize) {
			/*
			 * Reverse the list while we place the object into the
			 * bucket; this effectively causes the most recently
			 * freed object(s) to be reused during allocation.
			 */
			nlist = list->obj_next;
			list->obj_next = (ccp->cc_objs == 0) ? NULL :
			    ccp->cc_filled->bkt_obj[ccp->cc_objs - 1];

			ccp->cc_filled->bkt_obj[ccp->cc_objs++] = list;
			ccp->cc_free++;

			if ((list = nlist) != NULL)
				continue;

			/* We are done; return to caller */
			MCACHE_UNLOCK(&ccp->cc_lock);

			/* If there is a waiter below, notify it */
			if (cp->mc_waiter_cnt > 0)
				mcache_notify(cp, MCN_RETRYALLOC);
			return;
		}

		/*
		 * The CPU's filled bucket is full.  If the previous filled
		 * bucket was empty, exchange and try again.
		 */
		if (ccp->cc_pobjs == 0) {
			mcache_cpu_refill(ccp, ccp->cc_pfilled, ccp->cc_pobjs);
			continue;
		}

		/*
		 * If the bucket layer is disabled, free to slab.  This can
		 * happen either because MCF_NOCPUCACHE is set, or because
		 * the bucket layer is currently being resized.
		 */
		if (ccp->cc_bktsize == 0)
			break;

		/*
		 * Both of the CPU's buckets are full; try to get an empty
		 * bucket from the bucket layer.  Upon success, empty this
		 * CPU and place any full bucket into the full list.
		 */
		bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp);
		if (bkt != NULL) {
			if (ccp->cc_pfilled != NULL)
				mcache_bkt_free(cp, &cp->mc_full,
				    ccp->cc_pfilled);
			mcache_cpu_refill(ccp, bkt, 0);
			continue;
		}

		/*
		 * We need an empty bucket to put our freed objects into
		 * but couldn't get an empty bucket from the bucket layer;
		 * attempt to allocate one.  We do not want to block for
		 * allocation here, and if the bucket allocation fails
		 * we will simply fall through to the slab layer.
		 */
		MCACHE_UNLOCK(&ccp->cc_lock);
		bkt = mcache_alloc(btp->bt_cache, MCR_NOSLEEP);
		MCACHE_LOCK(&ccp->cc_lock);

		if (bkt != NULL) {
			/*
			 * We have an empty bucket, but since we drop the
			 * CPU lock above, the cache's bucket size may have
			 * changed.  If so, free the bucket and try again.
			 */
			if (ccp->cc_bktsize != btp->bt_bktsize) {
				MCACHE_UNLOCK(&ccp->cc_lock);
				mcache_free(btp->bt_cache, bkt);
				MCACHE_LOCK(&ccp->cc_lock);
				continue;
			}

			/*
			 * We have an empty bucket of the right size;
			 * add it to the bucket layer and try again.
			 */
			mcache_bkt_free(cp, &cp->mc_empty, bkt);
			continue;
		}

		/*
		 * The bucket layer has no empty buckets; free the
		 * object(s) directly to the slab layer.
		 */
		break;
	}
	MCACHE_UNLOCK(&ccp->cc_lock);

	/* If there is a waiter below, notify it */
	if (cp->mc_waiter_cnt > 0)
		mcache_notify(cp, MCN_RETRYALLOC);

	/* Advise the slab layer to purge the object(s) */
	(*cp->mc_slab_free)(cp->mc_private, list,
	    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
}
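
/*
 * A note on the waiter/notify protocol, with a hypothetical sketch (the
 * consumer-side code below is not part of this file).  A client that
 * registered a notify callback via mcache_create_ext() and that blocks
 * waiting for objects brackets its wait with mcache_waiter_inc() and
 * mcache_waiter_dec(); when objects are freed back while mc_waiter_cnt
 * is nonzero, the cache invokes the notify callback with MCN_RETRYALLOC,
 * signalling that a retry of the failed allocation is worthwhile:
 *
 *	mcache_waiter_inc(cp);
 *	while ((obj = mcache_alloc(cp, MCR_NOSLEEP)) == NULL)
 *		... sleep until the notify callback fires ...
 *	mcache_waiter_dec(cp);
 */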

/*
 * Cache destruction routine.
 */
__private_extern__ void
mcache_destroy(mcache_t *cp)
{
	void **pbuf;

	MCACHE_LIST_LOCK();
	LIST_REMOVE(cp, mc_list);
	MCACHE_LIST_UNLOCK();

	mcache_bkt_purge(cp);

	/*
	 * This cache is dead; there should be no further transaction.
	 * If it's still invoked, make sure that it induces a fault.
	 */
	cp->mc_slab_alloc = NULL;
	cp->mc_slab_free = NULL;
	cp->mc_slab_audit = NULL;

	lck_attr_free(cp->mc_bkt_lock_attr);
	lck_grp_free(cp->mc_bkt_lock_grp);
	lck_grp_attr_free(cp->mc_bkt_lock_grp_attr);

	lck_attr_free(cp->mc_cpu_lock_attr);
	lck_grp_free(cp->mc_cpu_lock_grp);
	lck_grp_attr_free(cp->mc_cpu_lock_grp_attr);

	lck_attr_free(cp->mc_sync_lock_attr);
	lck_grp_free(cp->mc_sync_lock_grp);
	lck_grp_attr_free(cp->mc_sync_lock_grp_attr);

	/*
	 * TODO: We need to destroy the zone here, but cannot do so
	 * because the zone allocator provides no way to do it.  Until
	 * then, the memory allocated for the zone structure is leaked.
	 * Once it becomes possible, uncomment these lines:
	 *
	 *	if (cp->mc_slab_zone != NULL) {
	 *		zdestroy(cp->mc_slab_zone);
	 *		cp->mc_slab_zone = NULL;
	 *	}
	 */

	/* Get the original address since we're about to free it */
	pbuf = (void **)((intptr_t)cp - sizeof (void *));

	zfree(mcache_zone, *pbuf);
}

/*
 * Internal slab allocator used as a backend for simple caches.  The current
 * implementation uses the zone allocator for simplicity reasons.
 */
static unsigned int
mcache_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
{
	mcache_t *cp = arg;
	unsigned int need = num;
	size_t offset = 0;
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	u_int32_t flags = cp->mc_flags;
	void *buf, *base, **pbuf;
	mcache_obj_t **list = *plist;

	*list = NULL;

	/*
	 * The address of the object returned to the caller is an
	 * offset from the 64-bit aligned base address only if the
	 * cache's alignment requirement is neither 1 nor 8 bytes.
	 */
	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
		offset = cp->mc_align;

	for (;;) {
		if (!(wait & MCR_NOSLEEP))
			buf = zalloc(cp->mc_slab_zone);
		else
			buf = zalloc_noblock(cp->mc_slab_zone);

		if (buf == NULL)
			break;

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof (u_int64_t),
		    sizeof (u_int64_t));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof (void *));
		*pbuf = buf;

		/*
		 * If auditing is enabled, patternize the contents of
		 * the buffer starting from the 64-bit aligned base to
		 * the end of the buffer; the length is rounded up to
		 * the nearest 64-bit multiple; this is because we use
		 * 64-bit memory access to set/check the pattern.
		 */
		if (flags & MCF_DEBUG) {
			VERIFY(((intptr_t)base + rsize) <=
			    ((intptr_t)buf + cp->mc_chunksize));
			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
		}

		/*
		 * Fix up the object's address to fulfill the cache's
		 * alignment requirement (if needed) and return this
		 * to the caller.
		 */
		VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
		    ((intptr_t)buf + cp->mc_chunksize));
		*list = (mcache_obj_t *)((intptr_t)base + offset);

		(*list)->obj_next = NULL;
		list = *plist = &(*list)->obj_next;

		/* If we got them all, return to mcache */
		if (--need == 0)
			break;
	}

	return (num - need);
}

/*
 * Internal slab deallocator used as a backend for simple caches.
 */
static void
mcache_slab_free(void *arg, mcache_obj_t *list, __unused boolean_t purged)
{
	mcache_t *cp = arg;
	mcache_obj_t *nlist;
	size_t offset = 0;
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	u_int32_t flags = cp->mc_flags;
	void *base;
	void **pbuf;

	/*
	 * The address of the object is an offset from a 64-bit
	 * aligned base address only if the cache's alignment
	 * requirement is neither 1 nor 8 bytes.
	 */
	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
		offset = cp->mc_align;

	for (;;) {
		nlist = list->obj_next;
		list->obj_next = NULL;

		/* Get the 64-bit aligned base address of this object */
		base = (void *)((intptr_t)list - offset);
		VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));

		/* Get the original address since we're about to free it */
		pbuf = (void **)((intptr_t)base - sizeof (void *));

		if (flags & MCF_DEBUG) {
			VERIFY(((intptr_t)base + rsize) <=
			    ((intptr_t)*pbuf + cp->mc_chunksize));
			mcache_audit_free_verify(NULL, base, offset, rsize);
		}

		/* Free it to zone */
		VERIFY(((intptr_t)base + offset + cp->mc_bufsize) <=
		    ((intptr_t)*pbuf + cp->mc_chunksize));
		zfree(cp->mc_slab_zone, *pbuf);

		/* No more objects to free; return to mcache */
		if ((list = nlist) == NULL)
			break;
	}
}

/*
 * Internal slab auditor for simple caches.
 */
static void
mcache_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
{
	mcache_t *cp = arg;
	size_t offset = 0;
	size_t rsize = P2ROUNDUP(cp->mc_bufsize, sizeof (u_int64_t));
	void *base, **pbuf;

	/*
	 * The address of the object returned to the caller is an
	 * offset from the 64-bit aligned base address only if the
	 * cache's alignment requirement is neither 1 nor 8 bytes.
	 */
	if (cp->mc_align != 1 && cp->mc_align != sizeof (u_int64_t))
		offset = cp->mc_align;

	while (list != NULL) {
		mcache_obj_t *next = list->obj_next;

		/* Get the 64-bit aligned base address of this object */
		base = (void *)((intptr_t)list - offset);
		VERIFY(IS_P2ALIGNED(base, sizeof (u_int64_t)));

		/* Get the original address */
		pbuf = (void **)((intptr_t)base - sizeof (void *));

		VERIFY(((intptr_t)base + rsize) <=
		    ((intptr_t)*pbuf + cp->mc_chunksize));

		if (!alloc)
			mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
		else
			mcache_audit_free_verify_set(NULL, base, offset, rsize);

		list = list->obj_next = next;
	}
}

/*
 * Refill the CPU's filled bucket with bkt and save the previous one.
 */
static void
mcache_cpu_refill(mcache_cpu_t *ccp, mcache_bkt_t *bkt, int objs)
{
	ASSERT((ccp->cc_filled == NULL && ccp->cc_objs == -1) ||
	    (ccp->cc_filled && ccp->cc_objs + objs == ccp->cc_bktsize));
	ASSERT(ccp->cc_bktsize > 0);

	ccp->cc_pfilled = ccp->cc_filled;
	ccp->cc_pobjs = ccp->cc_objs;
	ccp->cc_filled = bkt;
	ccp->cc_objs = objs;
}

/*
 * Allocate a bucket from the bucket layer.
 */
static mcache_bkt_t *
mcache_bkt_alloc(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkttype_t **btp)
{
	mcache_bkt_t *bkt;

	if (!MCACHE_LOCK_TRY(&cp->mc_bkt_lock)) {
		/*
		 * The bucket layer lock is held by another CPU; increase
		 * the contention count so that we can later resize the
		 * bucket size accordingly.
		 */
		MCACHE_LOCK(&cp->mc_bkt_lock);
		cp->mc_bkt_contention++;
	}

	if ((bkt = blp->bl_list) != NULL) {
		blp->bl_list = bkt->bkt_next;
		if (--blp->bl_total < blp->bl_min)
			blp->bl_min = blp->bl_total;
		blp->bl_alloc++;
	}

	if (btp != NULL)
		*btp = cp->cache_bkttype;

	MCACHE_UNLOCK(&cp->mc_bkt_lock);

	return (bkt);
}

/*
 * Free a bucket to the bucket layer.
 */
static void
mcache_bkt_free(mcache_t *cp, mcache_bktlist_t *blp, mcache_bkt_t *bkt)
{
	MCACHE_LOCK(&cp->mc_bkt_lock);

	bkt->bkt_next = blp->bl_list;
	blp->bl_list = bkt;
	blp->bl_total++;

	MCACHE_UNLOCK(&cp->mc_bkt_lock);
}

/*
 * Enable the bucket layer of a cache.
 */
static void
mcache_cache_bkt_enable(mcache_t *cp)
{
	mcache_cpu_t *ccp;
	int cpu;

	if (cp->mc_flags & MCF_NOCPUCACHE)
		return;

	for (cpu = 0; cpu < ncpu; cpu++) {
		ccp = &cp->mc_cpu[cpu];
		MCACHE_LOCK(&ccp->cc_lock);
		ccp->cc_bktsize = cp->cache_bkttype->bt_bktsize;
		MCACHE_UNLOCK(&ccp->cc_lock);
	}
}

/*
 * Purge all buckets from a cache and disable its bucket layer.
 */
static void
mcache_bkt_purge(mcache_t *cp)
{
	mcache_cpu_t *ccp;
	mcache_bkt_t *bp, *pbp;
	mcache_bkttype_t *btp;
	int cpu, objs, pobjs;

	for (cpu = 0; cpu < ncpu; cpu++) {
		ccp = &cp->mc_cpu[cpu];

		MCACHE_LOCK(&ccp->cc_lock);

		btp = cp->cache_bkttype;
		bp = ccp->cc_filled;
		pbp = ccp->cc_pfilled;
		objs = ccp->cc_objs;
		pobjs = ccp->cc_pobjs;
		ccp->cc_filled = NULL;
		ccp->cc_pfilled = NULL;
		ccp->cc_objs = -1;
		ccp->cc_pobjs = -1;
		ccp->cc_bktsize = 0;

		MCACHE_UNLOCK(&ccp->cc_lock);

		if (bp != NULL)
			mcache_bkt_destroy(cp, btp, bp, objs);
		if (pbp != NULL)
			mcache_bkt_destroy(cp, btp, pbp, pobjs);
	}

	/*
	 * Updating the working set back to back essentially sets
	 * the working set size to zero, so everything is reapable.
	 */
	mcache_bkt_ws_update(cp);
	mcache_bkt_ws_update(cp);

	mcache_bkt_ws_reap(cp);
}

/*
 * Free one or more objects in the bucket to the slab layer,
 * and also free the bucket itself.
 */
static void
mcache_bkt_destroy(mcache_t *cp, mcache_bkttype_t *btp, mcache_bkt_t *bkt,
    int nobjs)
{
	if (nobjs > 0) {
		mcache_obj_t *top = bkt->bkt_obj[nobjs - 1];

		if (cp->mc_flags & MCF_DEBUG) {
			mcache_obj_t *o = top;
			int cnt = 0;

			/*
			 * Verify that the chain of objects in the bucket is
			 * valid.  Any mismatch here means a mistake when the
			 * object(s) were freed to the CPU layer, so we panic.
			 */
			while (o != NULL) {
				o = o->obj_next;
				++cnt;
			}
			if (cnt != nobjs) {
				panic("mcache_bkt_destroy: %s cp %p corrupted "
				    "list in bkt %p (nobjs %d actual %d)\n",
				    cp->mc_name, (void *)cp, (void *)bkt,
				    nobjs, cnt);
			}
		}

		/* Advise the slab layer to purge the object(s) */
		(*cp->mc_slab_free)(cp->mc_private, top,
		    (cp->mc_flags & MCF_DEBUG) || cp->mc_purge_cnt);
	}
	mcache_free(btp->bt_cache, bkt);
}

/*
 * Update the bucket layer working set statistics.
 */
static void
mcache_bkt_ws_update(mcache_t *cp)
{
	MCACHE_LOCK(&cp->mc_bkt_lock);

	cp->mc_full.bl_reaplimit = cp->mc_full.bl_min;
	cp->mc_full.bl_min = cp->mc_full.bl_total;
	cp->mc_empty.bl_reaplimit = cp->mc_empty.bl_min;
	cp->mc_empty.bl_min = cp->mc_empty.bl_total;

	MCACHE_UNLOCK(&cp->mc_bkt_lock);
}

/*
 * Reap all buckets that are beyond the working set.
 */
static void
mcache_bkt_ws_reap(mcache_t *cp)
{
	long reap;
	mcache_bkt_t *bkt;
	mcache_bkttype_t *btp;

	reap = MIN(cp->mc_full.bl_reaplimit, cp->mc_full.bl_min);
	while (reap-- &&
	    (bkt = mcache_bkt_alloc(cp, &cp->mc_full, &btp)) != NULL)
		mcache_bkt_destroy(cp, btp, bkt, btp->bt_bktsize);

	reap = MIN(cp->mc_empty.bl_reaplimit, cp->mc_empty.bl_min);
	while (reap-- &&
	    (bkt = mcache_bkt_alloc(cp, &cp->mc_empty, &btp)) != NULL)
		mcache_bkt_destroy(cp, btp, bkt, 0);
}
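
/*
 * Working-set accounting by example (the numbers are hypothetical).
 * Suppose the full list holds bl_total = 10 buckets and the lowest the
 * count dipped to since the last update was bl_min = 4.  The next
 * mcache_bkt_ws_update() records bl_reaplimit = 4 and resets bl_min to
 * 10.  A subsequent mcache_bkt_ws_reap() then destroys up to
 * MIN(bl_reaplimit, bl_min) buckets, roughly the ones that sat unused
 * over the interval, and leaves the rest alone as the working set.
 * Calling the update twice back to back drives both values to bl_total,
 * which is why mcache_bkt_purge() does exactly that before reaping.
 */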

static void
mcache_reap_timeout(void *arg)
{
	volatile UInt32 *flag = arg;

	ASSERT(flag == &mcache_reaping);

	*flag = 0;
}

static void
mcache_reap_done(void *flag)
{
	timeout(mcache_reap_timeout, flag, mcache_reap_interval);
}

static void
mcache_reap_start(void *arg)
{
	UInt32 *flag = arg;

	ASSERT(flag == &mcache_reaping);

	mcache_applyall(mcache_cache_reap);
	mcache_dispatch(mcache_reap_done, flag);
}

__private_extern__ void
mcache_reap(void)
{
	UInt32 *flag = &mcache_reaping;

	if (mcache_llock_owner == current_thread() ||
	    !OSCompareAndSwap(0, 1, flag))
		return;

	mcache_dispatch(mcache_reap_start, flag);
}

static void
mcache_cache_reap(mcache_t *cp)
{
	mcache_bkt_ws_reap(cp);
}

/*
 * Performs periodic maintenance on a cache.
 */
static void
mcache_cache_update(mcache_t *cp)
{
	int need_bkt_resize = 0;
	int need_bkt_reenable = 0;

	lck_mtx_assert(mcache_llock, LCK_MTX_ASSERT_OWNED);

	mcache_bkt_ws_update(cp);

	/*
	 * Cache resize and post-purge reenable are mutually exclusive.
	 * If the cache was previously purged, there is no point in
	 * increasing the bucket size as there was an indication of
	 * memory pressure on the system.
	 */
	lck_mtx_lock_spin(&cp->mc_sync_lock);
	if (!(cp->mc_flags & MCF_NOCPUCACHE) && cp->mc_enable_cnt)
		need_bkt_reenable = 1;
	lck_mtx_unlock(&cp->mc_sync_lock);

	MCACHE_LOCK(&cp->mc_bkt_lock);
	/*
	 * If the contention count is greater than the threshold, and if
	 * we are not already at the maximum bucket size, increase it.
	 * Otherwise, if this cache was previously purged by the user
	 * then we simply reenable it.
	 */
	if ((unsigned int)cp->mc_chunksize < cp->cache_bkttype->bt_maxbuf &&
	    (int)(cp->mc_bkt_contention - cp->mc_bkt_contention_prev) >
	    mcache_bkt_contention && !need_bkt_reenable)
		need_bkt_resize = 1;

	cp->mc_bkt_contention_prev = cp->mc_bkt_contention;
	MCACHE_UNLOCK(&cp->mc_bkt_lock);

	if (need_bkt_resize)
		mcache_dispatch(mcache_cache_bkt_resize, cp);
	else if (need_bkt_reenable)
		mcache_dispatch(mcache_cache_enable, cp);
}

/*
 * Recompute a cache's bucket size.  This is an expensive operation
 * and should not be done frequently; larger buckets provide a higher
 * transfer rate per trip to the bucket layer, while smaller buckets
 * reduce memory consumption.
 */
static void
mcache_cache_bkt_resize(void *arg)
{
	mcache_t *cp = arg;
	mcache_bkttype_t *btp = cp->cache_bkttype;

	if ((unsigned int)cp->mc_chunksize < btp->bt_maxbuf) {
		mcache_bkt_purge(cp);

		/*
		 * Upgrade to the next bucket type with larger bucket size;
		 * temporarily set the previous contention snapshot to a
		 * negative number to prevent unnecessary resize requests.
		 */
		MCACHE_LOCK(&cp->mc_bkt_lock);
		cp->cache_bkttype = ++btp;
		cp->mc_bkt_contention_prev = cp->mc_bkt_contention + INT_MAX;
		MCACHE_UNLOCK(&cp->mc_bkt_lock);

		mcache_cache_enable(cp);
	}
}

/*
 * Reenable a cache that was previously disabled by a purge.
 */
static void
mcache_cache_enable(void *arg)
{
	mcache_t *cp = arg;

	lck_mtx_lock_spin(&cp->mc_sync_lock);
	cp->mc_purge_cnt = 0;
	cp->mc_enable_cnt = 0;
	lck_mtx_unlock(&cp->mc_sync_lock);

	mcache_cache_bkt_enable(cp);
}

static void
mcache_update_timeout(__unused void *arg)
{
	timeout(mcache_update, NULL, mcache_reap_interval);
}

static void
mcache_update(__unused void *arg)
{
	mcache_applyall(mcache_cache_update);
	mcache_dispatch(mcache_update_timeout, NULL);
}

static void
mcache_applyall(void (*func)(mcache_t *))
{
	mcache_t *cp;

	MCACHE_LIST_LOCK();
	LIST_FOREACH(cp, &mcache_head, mc_list) {
		func(cp);
	}
	MCACHE_LIST_UNLOCK();
}

static void
mcache_dispatch(void (*func)(void *), void *arg)
{
	ASSERT(func != NULL);
	timeout(func, arg, hz/1000);
}

__private_extern__ void
mcache_buffer_log(mcache_audit_t *mca, void *addr, mcache_t *cp)
{
	mca->mca_addr = addr;
	mca->mca_cache = cp;
	mca->mca_pthread = mca->mca_thread;
	mca->mca_thread = current_thread();
	bcopy(mca->mca_stack, mca->mca_pstack, sizeof (mca->mca_pstack));
	mca->mca_pdepth = mca->mca_depth;
	bzero(mca->mca_stack, sizeof (mca->mca_stack));
	mca->mca_depth = OSBacktrace(mca->mca_stack, MCACHE_STACK_DEPTH);
}

__private_extern__ void
mcache_set_pattern(u_int64_t pattern, void *buf_arg, size_t size)
{
	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
	u_int64_t *buf = (u_int64_t *)buf_arg;

	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));

	while (buf < buf_end)
		*buf++ = pattern;
}

__private_extern__ void *
mcache_verify_pattern(u_int64_t pattern, void *buf_arg, size_t size)
{
	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
	u_int64_t *buf;

	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));

	for (buf = buf_arg; buf < buf_end; buf++) {
		if (*buf != pattern)
			return (buf);
	}
	return (NULL);
}
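
/*
 * Debug-pattern usage sketch (illustrative): when MCF_DEBUG is set, a
 * freed buffer is filled with MCACHE_FREE_PATTERN via mcache_set_pattern(),
 * and before reuse the pattern is checked with mcache_verify_pattern() or
 * the combined verify-and-set variant below, e.g.:
 *
 *	mcache_set_pattern(MCACHE_FREE_PATTERN, base, rsize);
 *	...
 *	if (mcache_verify_pattern(MCACHE_FREE_PATTERN, base, rsize) != NULL)
 *		... the buffer was written to after it was freed ...
 *
 * Both the base address and the size must be 64-bit aligned, as the
 * VERIFY() assertions above enforce.
 */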

__private_extern__ void *
mcache_verify_set_pattern(u_int64_t old, u_int64_t new, void *buf_arg,
    size_t size)
{
	u_int64_t *buf_end = (u_int64_t *)((void *)((char *)buf_arg + size));
	u_int64_t *buf;

	VERIFY(IS_P2ALIGNED(buf_arg, sizeof (u_int64_t)));
	VERIFY(IS_P2ALIGNED(size, sizeof (u_int64_t)));

	for (buf = buf_arg; buf < buf_end; buf++) {
		if (*buf != old) {
			mcache_set_pattern(old, buf_arg,
			    (uintptr_t)buf - (uintptr_t)buf_arg);
			return (buf);
		}
		*buf = new;
	}
	return (NULL);
}

__private_extern__ void
mcache_audit_free_verify(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/* For the "obj_next" pointer in the buffer */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	if ((oaddr64 = mcache_verify_pattern(MCACHE_FREE_PATTERN,
	    (caddr_t)base, size)) != NULL) {
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	((mcache_obj_t *)addr)->obj_next = next;
}

__private_extern__ void
mcache_audit_free_verify_set(mcache_audit_t *mca, void *base, size_t offset,
    size_t size)
{
	void *addr;
	u_int64_t *oaddr64;
	mcache_obj_t *next;

	addr = (void *)((uintptr_t)base + offset);
	next = ((mcache_obj_t *)addr)->obj_next;

	/* For the "obj_next" pointer in the buffer */
	oaddr64 = (u_int64_t *)P2ROUNDDOWN(addr, sizeof (u_int64_t));
	*oaddr64 = MCACHE_FREE_PATTERN;

	if ((oaddr64 = mcache_verify_set_pattern(MCACHE_FREE_PATTERN,
	    MCACHE_UNINITIALIZED_PATTERN, (caddr_t)base, size)) != NULL) {
		mcache_audit_panic(mca, addr, (caddr_t)oaddr64 - (caddr_t)base,
		    (int64_t)MCACHE_FREE_PATTERN, (int64_t)*oaddr64);
		/* NOTREACHED */
	}
	((mcache_obj_t *)addr)->obj_next = next;
}

#undef panic

__private_extern__ char *
mcache_dump_mca(mcache_audit_t *mca)
{
	if (mca_dump_buf == NULL)
		return (NULL);

	snprintf(mca_dump_buf, DUMP_MCA_BUF_SIZE,
	    "mca %p: addr %p, cache %p (%s)\n"
	    "last transaction; thread %p, saved PC stack (%d deep):\n"
	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
	    "previous transaction; thread %p, saved PC stack (%d deep):\n"
	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n"
	    "\t%p, %p, %p, %p, %p, %p, %p, %p\n",
	    mca, mca->mca_addr, mca->mca_cache,
	    mca->mca_cache ? mca->mca_cache->mc_name : "?",
	    mca->mca_thread, mca->mca_depth,
	    mca->mca_stack[0], mca->mca_stack[1], mca->mca_stack[2],
	    mca->mca_stack[3], mca->mca_stack[4], mca->mca_stack[5],
	    mca->mca_stack[6], mca->mca_stack[7], mca->mca_stack[8],
	    mca->mca_stack[9], mca->mca_stack[10], mca->mca_stack[11],
	    mca->mca_stack[12], mca->mca_stack[13], mca->mca_stack[14],
	    mca->mca_stack[15],
	    mca->mca_pthread, mca->mca_pdepth,
	    mca->mca_pstack[0], mca->mca_pstack[1], mca->mca_pstack[2],
	    mca->mca_pstack[3], mca->mca_pstack[4], mca->mca_pstack[5],
	    mca->mca_pstack[6], mca->mca_pstack[7], mca->mca_pstack[8],
	    mca->mca_pstack[9], mca->mca_pstack[10], mca->mca_pstack[11],
	    mca->mca_pstack[12], mca->mca_pstack[13], mca->mca_pstack[14],
	    mca->mca_pstack[15]);

	return (mca_dump_buf);
}

__private_extern__ void
mcache_audit_panic(mcache_audit_t *mca, void *addr, size_t offset,
    int64_t expected, int64_t got)
{
	if (mca == NULL) {
		panic("mcache_audit: buffer %p modified after free at "
		    "offset 0x%lx (0x%llx instead of 0x%llx)\n", addr,
		    offset, got, expected);
		/* NOTREACHED */
	}

	panic("mcache_audit: buffer %p modified after free at offset 0x%lx "
	    "(0x%llx instead of 0x%llx)\n%s\n",
	    addr, offset, got, expected, mcache_dump_mca(mca));
	/* NOTREACHED */
}

__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	return (0);
}