uma_core.c revision 260301
1/*-
2 * Copyright (c) 2002-2005, 2009, 2013 Jeffrey Roberson <jeff@FreeBSD.org>
3 * Copyright (c) 2004, 2005 Bosko Milekic <bmilekic@FreeBSD.org>
4 * Copyright (c) 2004-2006 Robert N. M. Watson
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice unmodified, this list of conditions, and the following
12 *    disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 *    notice, this list of conditions and the following disclaimer in the
15 *    documentation and/or other materials provided with the distribution.
16 *
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29/*
30 * uma_core.c  Implementation of the Universal Memory allocator
31 *
32 * This allocator is intended to replace the multitude of similar object caches
33 * in the standard FreeBSD kernel.  The intent is to be flexible as well as
34 * efficient.  A primary design goal is to return unused memory to the rest of
35 * the system.  This will make the system as a whole more flexible due to the
36 * ability to move memory to subsystems which most need it instead of leaving
37 * pools of reserved memory unused.
38 *
39 * The basic ideas stem from similar slab/zone based allocators whose algorithms
40 * are well known.
41 *
42 */
43
44/*
45 * TODO:
46 *	- Improve memory usage for large allocations
47 *	- Investigate cache size adjustments
48 */
49
50#include <sys/cdefs.h>
51__FBSDID("$FreeBSD: stable/10/sys/vm/uma_core.c 260301 2014-01-04 23:37:01Z mav $");
52
53/* I should really use ktr.. */
54/*
55#define UMA_DEBUG 1
56#define UMA_DEBUG_ALLOC 1
57#define UMA_DEBUG_ALLOC_1 1
58*/
59
60#include "opt_ddb.h"
61#include "opt_param.h"
62#include "opt_vm.h"
63
64#include <sys/param.h>
65#include <sys/systm.h>
66#include <sys/bitset.h>
67#include <sys/kernel.h>
68#include <sys/types.h>
69#include <sys/queue.h>
70#include <sys/malloc.h>
71#include <sys/ktr.h>
72#include <sys/lock.h>
73#include <sys/sysctl.h>
74#include <sys/mutex.h>
75#include <sys/proc.h>
76#include <sys/rwlock.h>
77#include <sys/sbuf.h>
78#include <sys/smp.h>
79#include <sys/vmmeter.h>
80
81#include <vm/vm.h>
82#include <vm/vm_object.h>
83#include <vm/vm_page.h>
84#include <vm/vm_pageout.h>
85#include <vm/vm_param.h>
86#include <vm/vm_map.h>
87#include <vm/vm_kern.h>
88#include <vm/vm_extern.h>
89#include <vm/uma.h>
90#include <vm/uma_int.h>
91#include <vm/uma_dbg.h>
92
93#include <ddb/ddb.h>
94
95#ifdef DEBUG_MEMGUARD
96#include <vm/memguard.h>
97#endif
98
99/*
100 * This is the zone and keg from which all zones are spawned.  The idea is that
101 * even the zone & keg heads are allocated from the allocator, so we use the
102 * bss section to bootstrap ourselves.
103 */
104static struct uma_keg masterkeg;
105static struct uma_zone masterzone_k;
106static struct uma_zone masterzone_z;
107static uma_zone_t kegs = &masterzone_k;
108static uma_zone_t zones = &masterzone_z;
109
110/* This is the zone from which all of uma_slab_t's are allocated. */
111static uma_zone_t slabzone;
112static uma_zone_t slabrefzone;	/* With refcounters (for UMA_ZONE_REFCNT) */
113
114/*
115 * The initial hash tables come out of this zone so they can be allocated
116 * prior to malloc coming up.
117 */
118static uma_zone_t hashzone;
119
120/* The boot-time adjusted value for cache line alignment. */
121int uma_align_cache = 64 - 1;
122
123static MALLOC_DEFINE(M_UMAHASH, "UMAHash", "UMA Hash Buckets");
124
125/*
126 * Are we allowed to allocate buckets?
127 */
128static int bucketdisable = 1;
129
130/* Linked list of all kegs in the system */
131static LIST_HEAD(,uma_keg) uma_kegs = LIST_HEAD_INITIALIZER(uma_kegs);
132
133/* This mutex protects the keg list */
134static struct mtx_padalign uma_mtx;
135
136/* Linked list of boot time pages */
137static LIST_HEAD(,uma_slab) uma_boot_pages =
138    LIST_HEAD_INITIALIZER(uma_boot_pages);
139
140/* This mutex protects the boot time pages list */
141static struct mtx_padalign uma_boot_pages_mtx;
142
143/* Is the VM done starting up? */
144static int booted = 0;
145#define	UMA_STARTUP	1
146#define	UMA_STARTUP2	2
147
148/* Maximum number of allowed items-per-slab if the slab header is OFFPAGE */
149static const u_int uma_max_ipers = SLAB_SETSIZE;
150
151/*
152 * Only mbuf clusters use ref zones.  Just provide enough references
153 * to support the one user.  New code should not use the ref facility.
154 */
155static const u_int uma_max_ipers_ref = PAGE_SIZE / MCLBYTES;
156
157/*
158 * This is the handle used to schedule events that need to happen
159 * outside of the allocation fast path.
160 */
161static struct callout uma_callout;
162#define	UMA_TIMEOUT	20		/* Seconds for callout interval. */
163
164/*
165 * This structure is passed as the zone ctor arg so that I don't have to create
166 * a special allocation function just for zones.
167 */
168struct uma_zctor_args {
169	const char *name;
170	size_t size;
171	uma_ctor ctor;
172	uma_dtor dtor;
173	uma_init uminit;
174	uma_fini fini;
175	uma_import import;
176	uma_release release;
177	void *arg;
178	uma_keg_t keg;
179	int align;
180	uint32_t flags;
181};
182
183struct uma_kctor_args {
184	uma_zone_t zone;
185	size_t size;
186	uma_init uminit;
187	uma_fini fini;
188	int align;
189	uint32_t flags;
190};
191
192struct uma_bucket_zone {
193	uma_zone_t	ubz_zone;
194	char		*ubz_name;
195	int		ubz_entries;	/* Number of items it can hold. */
196	int		ubz_maxsize;	/* Maximum allocation size per-item. */
197};
198
199/*
200 * Compute the actual number of bucket entries to pack them in power
201 * of two sizes for more efficient space utilization.
202 */
203#define	BUCKET_SIZE(n)						\
204    (((sizeof(void *) * (n)) - sizeof(struct uma_bucket)) / sizeof(void *))
205
206#define	BUCKET_MAX	BUCKET_SIZE(128)
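
/*
 * A worked example of the packing above (illustrative only; the real
 * header size depends on struct uma_bucket and the platform): assuming
 * 8-byte pointers and a hypothetical 24-byte header, the "16 Bucket"
 * zone holds BUCKET_SIZE(16) = (16 * 8 - 24) / 8 = 13 item pointers,
 * and bucket_init() below then sizes the allocation as 24 + 13 * 8 =
 * 128 bytes, i.e. exactly 16 pointer-sized words.
 */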
207
208struct uma_bucket_zone bucket_zones[] = {
209	{ NULL, "4 Bucket", BUCKET_SIZE(4), 4096 },
210	{ NULL, "6 Bucket", BUCKET_SIZE(6), 3072 },
211	{ NULL, "8 Bucket", BUCKET_SIZE(8), 2048 },
212	{ NULL, "12 Bucket", BUCKET_SIZE(12), 1536 },
213	{ NULL, "16 Bucket", BUCKET_SIZE(16), 1024 },
214	{ NULL, "32 Bucket", BUCKET_SIZE(32), 512 },
215	{ NULL, "64 Bucket", BUCKET_SIZE(64), 256 },
216	{ NULL, "128 Bucket", BUCKET_SIZE(128), 128 },
217	{ NULL, NULL, 0}
218};
219
220/*
221 * Flags and enumerations to be passed to internal functions.
222 */
223enum zfreeskip { SKIP_NONE = 0, SKIP_DTOR, SKIP_FINI };
224
225/* Prototypes.. */
226
227static void *noobj_alloc(uma_zone_t, int, uint8_t *, int);
228static void *page_alloc(uma_zone_t, int, uint8_t *, int);
229static void *startup_alloc(uma_zone_t, int, uint8_t *, int);
230static void page_free(void *, int, uint8_t);
231static uma_slab_t keg_alloc_slab(uma_keg_t, uma_zone_t, int);
232static void cache_drain(uma_zone_t);
233static void bucket_drain(uma_zone_t, uma_bucket_t);
234static void bucket_cache_drain(uma_zone_t zone);
235static int keg_ctor(void *, int, void *, int);
236static void keg_dtor(void *, int, void *);
237static int zone_ctor(void *, int, void *, int);
238static void zone_dtor(void *, int, void *);
239static int zero_init(void *, int, int);
240static void keg_small_init(uma_keg_t keg);
241static void keg_large_init(uma_keg_t keg);
242static void zone_foreach(void (*zfunc)(uma_zone_t));
243static void zone_timeout(uma_zone_t zone);
244static int hash_alloc(struct uma_hash *);
245static int hash_expand(struct uma_hash *, struct uma_hash *);
246static void hash_free(struct uma_hash *hash);
247static void uma_timeout(void *);
248static void uma_startup3(void);
249static void *zone_alloc_item(uma_zone_t, void *, int);
250static void zone_free_item(uma_zone_t, void *, void *, enum zfreeskip);
251static void bucket_enable(void);
252static void bucket_init(void);
253static uma_bucket_t bucket_alloc(uma_zone_t zone, void *, int);
254static void bucket_free(uma_zone_t zone, uma_bucket_t, void *);
255static void bucket_zone_drain(void);
256static uma_bucket_t zone_alloc_bucket(uma_zone_t zone, void *, int flags);
257static uma_slab_t zone_fetch_slab(uma_zone_t zone, uma_keg_t last, int flags);
258static uma_slab_t zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int flags);
259static void *slab_alloc_item(uma_keg_t keg, uma_slab_t slab);
260static void slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item);
261static uma_keg_t uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit,
262    uma_fini fini, int align, uint32_t flags);
263static int zone_import(uma_zone_t zone, void **bucket, int max, int flags);
264static void zone_release(uma_zone_t zone, void **bucket, int cnt);
265
266void uma_print_zone(uma_zone_t);
267void uma_print_stats(void);
268static int sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS);
269static int sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS);
270
271SYSINIT(uma_startup3, SI_SUB_VM_CONF, SI_ORDER_SECOND, uma_startup3, NULL);
272
273SYSCTL_PROC(_vm, OID_AUTO, zone_count, CTLFLAG_RD|CTLTYPE_INT,
274    0, 0, sysctl_vm_zone_count, "I", "Number of UMA zones");
275
276SYSCTL_PROC(_vm, OID_AUTO, zone_stats, CTLFLAG_RD|CTLTYPE_STRUCT,
277    0, 0, sysctl_vm_zone_stats, "s,struct uma_type_header", "Zone Stats");
278
279static int zone_warnings = 1;
280TUNABLE_INT("vm.zone_warnings", &zone_warnings);
281SYSCTL_INT(_vm, OID_AUTO, zone_warnings, CTLFLAG_RW, &zone_warnings, 0,
282    "Warn when a UMA zone becomes full");
283
284/*
285 * This routine checks to see whether or not it's safe to enable buckets.
286 */
287static void
288bucket_enable(void)
289{
290	bucketdisable = vm_page_count_min();
291}
292
293/*
294 * Initialize bucket_zones, the array of zones of buckets of various sizes.
295 *
296 * For each zone, calculate the memory required for each bucket, consisting
297 * of the header and an array of pointers.
298 */
299static void
300bucket_init(void)
301{
302	struct uma_bucket_zone *ubz;
303	int size;
304	int i;
305
306	for (i = 0, ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++) {
307		size = roundup(sizeof(struct uma_bucket), sizeof(void *));
308		size += sizeof(void *) * ubz->ubz_entries;
309		ubz->ubz_zone = uma_zcreate(ubz->ubz_name, size,
310		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR,
311		    UMA_ZONE_MTXCLASS | UMA_ZFLAG_BUCKET);
312	}
313}
314
315/*
316 * Given a desired number of entries for a bucket, return the zone from which
317 * to allocate the bucket.
318 */
319static struct uma_bucket_zone *
320bucket_zone_lookup(int entries)
321{
322	struct uma_bucket_zone *ubz;
323
324	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
325		if (ubz->ubz_entries >= entries)
326			return (ubz);
327	ubz--;
328	return (ubz);
329}
330
331static int
332bucket_select(int size)
333{
334	struct uma_bucket_zone *ubz;
335
336	ubz = &bucket_zones[0];
337	if (size > ubz->ubz_maxsize)
338		return MAX((ubz->ubz_maxsize * ubz->ubz_entries) / size, 1);
339
340	for (; ubz->ubz_entries != 0; ubz++)
341		if (ubz->ubz_maxsize < size)
342			break;
343	ubz--;
344	return (ubz->ubz_entries);
345}
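
/*
 * Illustrative only: with the table above, bucket_select(300) walks until
 * it hits "64 Bucket" (ubz_maxsize 256 < 300), steps back one entry and
 * returns the "32 Bucket" entry count, so item sizes in the (256, 512]
 * range get BUCKET_SIZE(32) entries per bucket.  Sizes larger than the
 * first entry's ubz_maxsize (4096) take the early return, which scales
 * the "4 Bucket" entry count down proportionally, but never below 1.
 */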
346
347static uma_bucket_t
348bucket_alloc(uma_zone_t zone, void *udata, int flags)
349{
350	struct uma_bucket_zone *ubz;
351	uma_bucket_t bucket;
352
353	/*
354	 * This is to stop us from allocating per-CPU buckets while we're
355	 * still running from vm.boot_pages.  Otherwise, we would exhaust the
356	 * boot pages.  This also prevents us from allocating buckets in
357	 * low memory situations.
358	 */
359	if (bucketdisable)
360		return (NULL);
361	/*
362	 * To limit bucket recursion we store the original zone flags
363	 * in a cookie passed via zalloc_arg/zfree_arg.  This allows the
364	 * NOVM flag to persist even through deep recursions.  We also
365	 * store ZFLAG_BUCKET once we have recursed attempting to allocate
366	 * a bucket for a bucket zone so we do not allow infinite bucket
367	 * recursion.  This cookie will even persist to frees of unused
368	 * buckets via the allocation path or bucket allocations in the
369	 * free path.
370	 */
371	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
372		udata = (void *)(uintptr_t)zone->uz_flags;
373	else {
374		if ((uintptr_t)udata & UMA_ZFLAG_BUCKET)
375			return (NULL);
376		udata = (void *)((uintptr_t)udata | UMA_ZFLAG_BUCKET);
377	}
378	if ((uintptr_t)udata & UMA_ZFLAG_CACHEONLY)
379		flags |= M_NOVM;
380	ubz = bucket_zone_lookup(zone->uz_count);
381	bucket = uma_zalloc_arg(ubz->ubz_zone, udata, flags);
382	if (bucket) {
383#ifdef INVARIANTS
384		bzero(bucket->ub_bucket, sizeof(void *) * ubz->ubz_entries);
385#endif
386		bucket->ub_cnt = 0;
387		bucket->ub_entries = ubz->ubz_entries;
388	}
389
390	return (bucket);
391}
392
393static void
394bucket_free(uma_zone_t zone, uma_bucket_t bucket, void *udata)
395{
396	struct uma_bucket_zone *ubz;
397
398	KASSERT(bucket->ub_cnt == 0,
399	    ("bucket_free: Freeing a non free bucket."));
400	if ((zone->uz_flags & UMA_ZFLAG_BUCKET) == 0)
401		udata = (void *)(uintptr_t)zone->uz_flags;
402	ubz = bucket_zone_lookup(bucket->ub_entries);
403	uma_zfree_arg(ubz->ubz_zone, bucket, udata);
404}
405
406static void
407bucket_zone_drain(void)
408{
409	struct uma_bucket_zone *ubz;
410
411	for (ubz = &bucket_zones[0]; ubz->ubz_entries != 0; ubz++)
412		zone_drain(ubz->ubz_zone);
413}
414
415static void
416zone_log_warning(uma_zone_t zone)
417{
418	static const struct timeval warninterval = { 300, 0 };
419
420	if (!zone_warnings || zone->uz_warning == NULL)
421		return;
422
423	if (ratecheck(&zone->uz_ratecheck, &warninterval))
424		printf("[zone: %s] %s\n", zone->uz_name, zone->uz_warning);
425}
426
427static void
428zone_foreach_keg(uma_zone_t zone, void (*kegfn)(uma_keg_t))
429{
430	uma_klink_t klink;
431
432	LIST_FOREACH(klink, &zone->uz_kegs, kl_link)
433		kegfn(klink->kl_keg);
434}
435
436/*
437 * Routine called by timeout which is used to fire off some time interval
438 * based calculations.  (stats, hash size, etc.)
439 *
440 * Arguments:
441 *	arg   Unused
442 *
443 * Returns:
444 *	Nothing
445 */
446static void
447uma_timeout(void *unused)
448{
449	bucket_enable();
450	zone_foreach(zone_timeout);
451
452	/* Reschedule this event */
453	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
454}
455
456/*
457 * Routine to perform timeout driven calculations.  This expands the
458 * keg hash table when it becomes too small for the number of slabs.
459 *
460 *  Returns nothing.
461 */
462static void
463keg_timeout(uma_keg_t keg)
464{
465
466	KEG_LOCK(keg);
467	/*
468	 * Expand the keg hash table.
469	 *
470	 * This is done if the number of slabs is larger than the hash size.
471	 * What I'm trying to do here is eliminate collisions entirely.  This
472	 * may be a little aggressive.  Should I allow for two collisions max?
473	 */
474	if (keg->uk_flags & UMA_ZONE_HASH &&
475	    keg->uk_pages / keg->uk_ppera >= keg->uk_hash.uh_hashsize) {
476		struct uma_hash newhash;
477		struct uma_hash oldhash;
478		int ret;
479
480		/*
481		 * This is so involved because allocating and freeing
482		 * while the keg lock is held will lead to deadlock.
483		 * I have to do everything in stages and check for
484		 * races.
485		 */
486		newhash = keg->uk_hash;
487		KEG_UNLOCK(keg);
488		ret = hash_alloc(&newhash);
489		KEG_LOCK(keg);
490		if (ret) {
491			if (hash_expand(&keg->uk_hash, &newhash)) {
492				oldhash = keg->uk_hash;
493				keg->uk_hash = newhash;
494			} else
495				oldhash = newhash;
496
497			KEG_UNLOCK(keg);
498			hash_free(&oldhash);
499			return;
500		}
501	}
502	KEG_UNLOCK(keg);
503}
504
505static void
506zone_timeout(uma_zone_t zone)
507{
508
509	zone_foreach_keg(zone, &keg_timeout);
510}
511
512/*
513 * Allocate and zero fill the next sized hash table from the appropriate
514 * backing store.
515 *
516 * Arguments:
517 *	hash  A new hash structure with the old hash size in uh_hashsize
518 *
519 * Returns:
520 *	1 on success and 0 on failure.
521 */
522static int
523hash_alloc(struct uma_hash *hash)
524{
525	int oldsize;
526	int alloc;
527
528	oldsize = hash->uh_hashsize;
529
530	/* We're just going to grow to the next power of two */
531	if (oldsize)  {
532		hash->uh_hashsize = oldsize * 2;
533		alloc = sizeof(hash->uh_slab_hash[0]) * hash->uh_hashsize;
534		hash->uh_slab_hash = (struct slabhead *)malloc(alloc,
535		    M_UMAHASH, M_NOWAIT);
536	} else {
537		alloc = sizeof(hash->uh_slab_hash[0]) * UMA_HASH_SIZE_INIT;
538		hash->uh_slab_hash = zone_alloc_item(hashzone, NULL,
539		    M_WAITOK);
540		hash->uh_hashsize = UMA_HASH_SIZE_INIT;
541	}
542	if (hash->uh_slab_hash) {
543		bzero(hash->uh_slab_hash, alloc);
544		hash->uh_hashmask = hash->uh_hashsize - 1;
545		return (1);
546	}
547
548	return (0);
549}
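
/*
 * Illustrative only: the first table comes from hashzone with
 * UMA_HASH_SIZE_INIT buckets (typically 32); each later call doubles
 * uh_hashsize, e.g. 32 -> 64 -> 128, as keg_timeout() notices the slab
 * count catching up.  Since uh_hashmask is hashsize - 1, chain selection
 * is a cheap mask of the slab address rather than a modulo.
 */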
550
551/*
552 * Expands the hash table for HASH zones.  This is done from zone_timeout
553 * to reduce collisions.  This must not be done in the regular allocation
554 * path, otherwise, we can recurse on the vm while allocating pages.
555 *
556 * Arguments:
557 *	oldhash  The hash you want to expand
558 *	newhash  The hash structure for the new table
559 *
560 * Returns:
561 *	Nothing
562 *
563 * Discussion:
564 */
565static int
566hash_expand(struct uma_hash *oldhash, struct uma_hash *newhash)
567{
568	uma_slab_t slab;
569	int hval;
570	int i;
571
572	if (!newhash->uh_slab_hash)
573		return (0);
574
575	if (oldhash->uh_hashsize >= newhash->uh_hashsize)
576		return (0);
577
578	/*
579	 * I need to investigate hash algorithms for resizing without a
580	 * full rehash.
581	 */
582
583	for (i = 0; i < oldhash->uh_hashsize; i++)
584		while (!SLIST_EMPTY(&oldhash->uh_slab_hash[i])) {
585			slab = SLIST_FIRST(&oldhash->uh_slab_hash[i]);
586			SLIST_REMOVE_HEAD(&oldhash->uh_slab_hash[i], us_hlink);
587			hval = UMA_HASH(newhash, slab->us_data);
588			SLIST_INSERT_HEAD(&newhash->uh_slab_hash[hval],
589			    slab, us_hlink);
590		}
591
592	return (1);
593}
594
595/*
596 * Free the hash bucket to the appropriate backing store.
597 *
598 * Arguments:
599 *	slab_hash  The hash bucket we're freeing
600 *	hashsize   The number of entries in that hash bucket
601 *
602 * Returns:
603 *	Nothing
604 */
605static void
606hash_free(struct uma_hash *hash)
607{
608	if (hash->uh_slab_hash == NULL)
609		return;
610	if (hash->uh_hashsize == UMA_HASH_SIZE_INIT)
611		zone_free_item(hashzone, hash->uh_slab_hash, NULL, SKIP_NONE);
612	else
613		free(hash->uh_slab_hash, M_UMAHASH);
614}
615
616/*
617 * Frees all outstanding items in a bucket
618 *
619 * Arguments:
620 *	zone   The zone to free to, must be unlocked.
621 *	bucket The free/alloc bucket with items, cpu queue must be locked.
622 *
623 * Returns:
624 *	Nothing
625 */
626
627static void
628bucket_drain(uma_zone_t zone, uma_bucket_t bucket)
629{
630	int i;
631
632	if (bucket == NULL)
633		return;
634
635	if (zone->uz_fini)
636		for (i = 0; i < bucket->ub_cnt; i++)
637			zone->uz_fini(bucket->ub_bucket[i], zone->uz_size);
638	zone->uz_release(zone->uz_arg, bucket->ub_bucket, bucket->ub_cnt);
639	bucket->ub_cnt = 0;
640}
641
642/*
643 * Drains the per cpu caches for a zone.
644 *
645 * NOTE: This may only be called while the zone is being torn down, and not
646 * during normal operation.  This is necessary in order that we do not have
647 * to migrate CPUs to drain the per-CPU caches.
648 *
649 * Arguments:
650 *	zone     The zone to drain, must be unlocked.
651 *
652 * Returns:
653 *	Nothing
654 */
655static void
656cache_drain(uma_zone_t zone)
657{
658	uma_cache_t cache;
659	int cpu;
660
661	/*
662	 * XXX: It is safe to not lock the per-CPU caches, because we're
663	 * tearing down the zone anyway.  I.e., there will be no further use
664	 * of the caches at this point.
665	 *
666	 * XXX: It would be good to be able to assert that the zone is being
667	 * torn down to prevent improper use of cache_drain().
668	 *
669	 * XXX: We lock the zone before passing into bucket_cache_drain() as
670	 * it is used elsewhere.  Should the tear-down path be made special
671	 * there in some form?
672	 */
673	CPU_FOREACH(cpu) {
674		cache = &zone->uz_cpu[cpu];
675		bucket_drain(zone, cache->uc_allocbucket);
676		bucket_drain(zone, cache->uc_freebucket);
677		if (cache->uc_allocbucket != NULL)
678			bucket_free(zone, cache->uc_allocbucket, NULL);
679		if (cache->uc_freebucket != NULL)
680			bucket_free(zone, cache->uc_freebucket, NULL);
681		cache->uc_allocbucket = cache->uc_freebucket = NULL;
682	}
683	ZONE_LOCK(zone);
684	bucket_cache_drain(zone);
685	ZONE_UNLOCK(zone);
686}
687
688/*
689 * Drain the cached buckets from a zone.  Expects a locked zone on entry.
690 */
691static void
692bucket_cache_drain(uma_zone_t zone)
693{
694	uma_bucket_t bucket;
695
696	/*
697	 * Drain the bucket queues and free the buckets; we just keep two per
698	 * cpu (alloc/free).
699	 */
700	while ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
701		LIST_REMOVE(bucket, ub_link);
702		ZONE_UNLOCK(zone);
703		bucket_drain(zone, bucket);
704		bucket_free(zone, bucket, NULL);
705		ZONE_LOCK(zone);
706	}
707
708	/*
709	 * Shrink further bucket sizes.  The price of a single zone lock collision
710	 * is probably lower than the price of a global cache drain.
711	 */
712	if (zone->uz_count > zone->uz_count_min)
713		zone->uz_count--;
714}
715
716static void
717keg_free_slab(uma_keg_t keg, uma_slab_t slab, int start)
718{
719	uint8_t *mem;
720	int i;
721	uint8_t flags;
722
723	mem = slab->us_data;
724	flags = slab->us_flags;
725	i = start;
726	if (keg->uk_fini != NULL) {
727		for (i--; i > -1; i--)
728			keg->uk_fini(slab->us_data + (keg->uk_rsize * i),
729			    keg->uk_size);
730	}
731	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
732		zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
733#ifdef UMA_DEBUG
734	printf("%s: Returning %d bytes.\n", keg->uk_name,
735	    PAGE_SIZE * keg->uk_ppera);
736#endif
737	keg->uk_freef(mem, PAGE_SIZE * keg->uk_ppera, flags);
738}
739
740/*
741 * Frees pages from a keg back to the system.  This is done on demand from
742 * the pageout daemon.
743 *
744 * Returns nothing.
745 */
746static void
747keg_drain(uma_keg_t keg)
748{
749	struct slabhead freeslabs = { 0 };
750	uma_slab_t slab;
751	uma_slab_t n;
752
753	/*
754	 * We don't want to take pages from statically allocated kegs at this
755	 * time
756	 */
757	if (keg->uk_flags & UMA_ZONE_NOFREE || keg->uk_freef == NULL)
758		return;
759
760#ifdef UMA_DEBUG
761	printf("%s free items: %u\n", keg->uk_name, keg->uk_free);
762#endif
763	KEG_LOCK(keg);
764	if (keg->uk_free == 0)
765		goto finished;
766
767	slab = LIST_FIRST(&keg->uk_free_slab);
768	while (slab) {
769		n = LIST_NEXT(slab, us_link);
770
771		/* We have nowhere to free these to */
772		if (slab->us_flags & UMA_SLAB_BOOT) {
773			slab = n;
774			continue;
775		}
776
777		LIST_REMOVE(slab, us_link);
778		keg->uk_pages -= keg->uk_ppera;
779		keg->uk_free -= keg->uk_ipers;
780
781		if (keg->uk_flags & UMA_ZONE_HASH)
782			UMA_HASH_REMOVE(&keg->uk_hash, slab, slab->us_data);
783
784		SLIST_INSERT_HEAD(&freeslabs, slab, us_hlink);
785
786		slab = n;
787	}
788finished:
789	KEG_UNLOCK(keg);
790
791	while ((slab = SLIST_FIRST(&freeslabs)) != NULL) {
792		SLIST_REMOVE(&freeslabs, slab, uma_slab, us_hlink);
793		keg_free_slab(keg, slab, keg->uk_ipers);
794	}
795}
796
797static void
798zone_drain_wait(uma_zone_t zone, int waitok)
799{
800
801	/*
802	 * Set draining to interlock with zone_dtor() so we can release our
803	 * locks as we go.  Only dtor() should do a WAITOK call since it
804	 * is the only call that knows the structure will still be available
805	 * when it wakes up.
806	 */
807	ZONE_LOCK(zone);
808	while (zone->uz_flags & UMA_ZFLAG_DRAINING) {
809		if (waitok == M_NOWAIT)
810			goto out;
811		mtx_unlock(&uma_mtx);
812		msleep(zone, zone->uz_lockptr, PVM, "zonedrain", 1);
813		mtx_lock(&uma_mtx);
814	}
815	zone->uz_flags |= UMA_ZFLAG_DRAINING;
816	bucket_cache_drain(zone);
817	ZONE_UNLOCK(zone);
818	/*
819	 * The DRAINING flag protects us from being freed while
820	 * we're running.  Normally the uma_mtx would protect us but we
821	 * must be able to release and acquire the right lock for each keg.
822	 */
823	zone_foreach_keg(zone, &keg_drain);
824	ZONE_LOCK(zone);
825	zone->uz_flags &= ~UMA_ZFLAG_DRAINING;
826	wakeup(zone);
827out:
828	ZONE_UNLOCK(zone);
829}
830
831void
832zone_drain(uma_zone_t zone)
833{
834
835	zone_drain_wait(zone, M_NOWAIT);
836}
837
838/*
839 * Allocate a new slab for a keg.  This does not insert the slab onto a list.
840 *
841 * Arguments:
842 *	wait  Shall we wait?
843 *
844 * Returns:
845 *	The slab that was allocated or NULL if there is no memory and the
846 *	caller specified M_NOWAIT.
847 */
848static uma_slab_t
849keg_alloc_slab(uma_keg_t keg, uma_zone_t zone, int wait)
850{
851	uma_slabrefcnt_t slabref;
852	uma_alloc allocf;
853	uma_slab_t slab;
854	uint8_t *mem;
855	uint8_t flags;
856	int i;
857
858	mtx_assert(&keg->uk_lock, MA_OWNED);
859	slab = NULL;
860	mem = NULL;
861
862#ifdef UMA_DEBUG
863	printf("alloc_slab:  Allocating a new slab for %s\n", keg->uk_name);
864#endif
865	allocf = keg->uk_allocf;
866	KEG_UNLOCK(keg);
867
868	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
869		slab = zone_alloc_item(keg->uk_slabzone, NULL, wait);
870		if (slab == NULL)
871			goto out;
872	}
873
874	/*
875	 * This reproduces the old vm_zone behavior of zero filling pages the
876	 * first time they are added to a zone.
877	 *
878	 * Malloced items are zeroed in uma_zalloc.
879	 */
880
881	if ((keg->uk_flags & UMA_ZONE_MALLOC) == 0)
882		wait |= M_ZERO;
883	else
884		wait &= ~M_ZERO;
885
886	if (keg->uk_flags & UMA_ZONE_NODUMP)
887		wait |= M_NODUMP;
888
889	/* zone is passed for legacy reasons. */
890	mem = allocf(zone, keg->uk_ppera * PAGE_SIZE, &flags, wait);
891	if (mem == NULL) {
892		if (keg->uk_flags & UMA_ZONE_OFFPAGE)
893			zone_free_item(keg->uk_slabzone, slab, NULL, SKIP_NONE);
894		slab = NULL;
895		goto out;
896	}
897
898	/* Point the slab into the allocated memory */
899	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE))
900		slab = (uma_slab_t )(mem + keg->uk_pgoff);
901
902	if (keg->uk_flags & UMA_ZONE_VTOSLAB)
903		for (i = 0; i < keg->uk_ppera; i++)
904			vsetslab((vm_offset_t)mem + (i * PAGE_SIZE), slab);
905
906	slab->us_keg = keg;
907	slab->us_data = mem;
908	slab->us_freecount = keg->uk_ipers;
909	slab->us_flags = flags;
910	BIT_FILL(SLAB_SETSIZE, &slab->us_free);
911#ifdef INVARIANTS
912	BIT_ZERO(SLAB_SETSIZE, &slab->us_debugfree);
913#endif
914	if (keg->uk_flags & UMA_ZONE_REFCNT) {
915		slabref = (uma_slabrefcnt_t)slab;
916		for (i = 0; i < keg->uk_ipers; i++)
917			slabref->us_refcnt[i] = 0;
918	}
919
920	if (keg->uk_init != NULL) {
921		for (i = 0; i < keg->uk_ipers; i++)
922			if (keg->uk_init(slab->us_data + (keg->uk_rsize * i),
923			    keg->uk_size, wait) != 0)
924				break;
925		if (i != keg->uk_ipers) {
926			keg_free_slab(keg, slab, i);
927			slab = NULL;
928			goto out;
929		}
930	}
931out:
932	KEG_LOCK(keg);
933
934	if (slab != NULL) {
935		if (keg->uk_flags & UMA_ZONE_HASH)
936			UMA_HASH_INSERT(&keg->uk_hash, slab, mem);
937
938		keg->uk_pages += keg->uk_ppera;
939		keg->uk_free += keg->uk_ipers;
940	}
941
942	return (slab);
943}
944
945/*
946 * This function is intended to be used early on in place of page_alloc() so
947 * that we may use the boot time page cache to satisfy allocations before
948 * the VM is ready.
949 */
950static void *
951startup_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
952{
953	uma_keg_t keg;
954	uma_slab_t tmps;
955	int pages, check_pages;
956
957	keg = zone_first_keg(zone);
958	pages = howmany(bytes, PAGE_SIZE);
959	check_pages = pages - 1;
960	KASSERT(pages > 0, ("startup_alloc can't reserve 0 pages\n"));
961
962	/*
963	 * Check our small startup cache to see if it has pages remaining.
964	 */
965	mtx_lock(&uma_boot_pages_mtx);
966
967	/* First check if we have enough room. */
968	tmps = LIST_FIRST(&uma_boot_pages);
969	while (tmps != NULL && check_pages-- > 0)
970		tmps = LIST_NEXT(tmps, us_link);
971	if (tmps != NULL) {
972		/*
973		 * It's ok to lose tmps references.  The last one will
974		 * have tmps->us_data pointing to the start address of
975		 * "pages" contiguous pages of memory.
976		 */
977		while (pages-- > 0) {
978			tmps = LIST_FIRST(&uma_boot_pages);
979			LIST_REMOVE(tmps, us_link);
980		}
981		mtx_unlock(&uma_boot_pages_mtx);
982		*pflag = tmps->us_flags;
983		return (tmps->us_data);
984	}
985	mtx_unlock(&uma_boot_pages_mtx);
986	if (booted < UMA_STARTUP2)
987		panic("UMA: Increase vm.boot_pages");
988	/*
989	 * Now that we've booted, reset these users to their real allocator.
990	 */
991#ifdef UMA_MD_SMALL_ALLOC
992	keg->uk_allocf = (keg->uk_ppera > 1) ? page_alloc : uma_small_alloc;
993#else
994	keg->uk_allocf = page_alloc;
995#endif
996	return keg->uk_allocf(zone, bytes, pflag, wait);
997}
998
999/*
1000 * Allocates a number of pages from the system
1001 *
1002 * Arguments:
1003 *	bytes  The number of bytes requested
1004 *	wait  Shall we wait?
1005 *
1006 * Returns:
1007 *	A pointer to the allocated memory or possibly
1008 *	NULL if M_NOWAIT is set.
1009 */
1010static void *
1011page_alloc(uma_zone_t zone, int bytes, uint8_t *pflag, int wait)
1012{
1013	void *p;	/* Returned page */
1014
1015	*pflag = UMA_SLAB_KMEM;
1016	p = (void *) kmem_malloc(kmem_arena, bytes, wait);
1017
1018	return (p);
1019}
1020
1021/*
1022 * Allocates a number of pages from within an object
1023 *
1024 * Arguments:
1025 *	bytes  The number of bytes requested
1026 *	wait   Shall we wait?
1027 *
1028 * Returns:
1029 *	A pointer to the alloced memory or possibly
1030 *	A pointer to the allocated memory or possibly
1031 */
1032static void *
1033noobj_alloc(uma_zone_t zone, int bytes, uint8_t *flags, int wait)
1034{
1035	TAILQ_HEAD(, vm_page) alloctail;
1036	u_long npages;
1037	vm_offset_t retkva, zkva;
1038	vm_page_t p, p_next;
1039	uma_keg_t keg;
1040
1041	TAILQ_INIT(&alloctail);
1042	keg = zone_first_keg(zone);
1043
1044	npages = howmany(bytes, PAGE_SIZE);
1045	while (npages > 0) {
1046		p = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
1047		    VM_ALLOC_WIRED | VM_ALLOC_NOOBJ);
1048		if (p != NULL) {
1049			/*
1050			 * Since the page does not belong to an object, its
1051			 * listq is unused.
1052			 */
1053			TAILQ_INSERT_TAIL(&alloctail, p, listq);
1054			npages--;
1055			continue;
1056		}
1057		if (wait & M_WAITOK) {
1058			VM_WAIT;
1059			continue;
1060		}
1061
1062		/*
1063		 * Page allocation failed, free intermediate pages and
1064		 * exit.
1065		 */
1066		TAILQ_FOREACH_SAFE(p, &alloctail, listq, p_next) {
1067			vm_page_unwire(p, 0);
1068			vm_page_free(p);
1069		}
1070		return (NULL);
1071	}
1072	*flags = UMA_SLAB_PRIV;
1073	zkva = keg->uk_kva +
1074	    atomic_fetchadd_long(&keg->uk_offset, round_page(bytes));
1075	retkva = zkva;
1076	TAILQ_FOREACH(p, &alloctail, listq) {
1077		pmap_qenter(zkva, &p, 1);
1078		zkva += PAGE_SIZE;
1079	}
1080
1081	return ((void *)retkva);
1082}
1083
1084/*
1085 * Frees a number of pages to the system
1086 *
1087 * Arguments:
1088 *	mem   A pointer to the memory to be freed
1089 *	size  The size of the memory being freed
1090 *	flags The original p->us_flags field
1091 *
1092 * Returns:
1093 *	Nothing
1094 */
1095static void
1096page_free(void *mem, int size, uint8_t flags)
1097{
1098	struct vmem *vmem;
1099
1100	if (flags & UMA_SLAB_KMEM)
1101		vmem = kmem_arena;
1102	else if (flags & UMA_SLAB_KERNEL)
1103		vmem = kernel_arena;
1104	else
1105		panic("UMA: page_free used with invalid flags %d", flags);
1106
1107	kmem_free(vmem, (vm_offset_t)mem, size);
1108}
1109
1110/*
1111 * Zero fill initializer
1112 *
1113 * Arguments/Returns follow uma_init specifications
1114 */
1115static int
1116zero_init(void *mem, int size, int flags)
1117{
1118	bzero(mem, size);
1119	return (0);
1120}
1121
1122/*
1123 * Finish creating a small uma keg.  This calculates ipers, and the keg size.
1124 *
1125 * Arguments
1126 *	keg  The zone we should initialize
1127 *
1128 * Returns
1129 *	Nothing
1130 */
1131static void
1132keg_small_init(uma_keg_t keg)
1133{
1134	u_int rsize;
1135	u_int memused;
1136	u_int wastedspace;
1137	u_int shsize;
1138
1139	if (keg->uk_flags & UMA_ZONE_PCPU) {
1140		u_int ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
1141
1142		keg->uk_slabsize = sizeof(struct pcpu);
1143		keg->uk_ppera = howmany(ncpus * sizeof(struct pcpu),
1144		    PAGE_SIZE);
1145	} else {
1146		keg->uk_slabsize = UMA_SLAB_SIZE;
1147		keg->uk_ppera = 1;
1148	}
1149
1150	/*
1151	 * Calculate the size of each allocation (rsize) according to
1152	 * alignment.  If the requested size is smaller than we have
1153	 * allocation bits for, we round it up.
1154	 */
1155	rsize = keg->uk_size;
1156	if (rsize < keg->uk_slabsize / SLAB_SETSIZE)
1157		rsize = keg->uk_slabsize / SLAB_SETSIZE;
1158	if (rsize & keg->uk_align)
1159		rsize = (rsize & ~keg->uk_align) + (keg->uk_align + 1);
1160	keg->uk_rsize = rsize;
1161
1162	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0 ||
1163	    keg->uk_rsize < sizeof(struct pcpu),
1164	    ("%s: size %u too large", __func__, keg->uk_rsize));
1165
1166	if (keg->uk_flags & UMA_ZONE_REFCNT)
1167		rsize += sizeof(uint32_t);
1168
1169	if (keg->uk_flags & UMA_ZONE_OFFPAGE)
1170		shsize = 0;
1171	else
1172		shsize = sizeof(struct uma_slab);
1173
1174	keg->uk_ipers = (keg->uk_slabsize - shsize) / rsize;
1175	KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1176	    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1177
1178	memused = keg->uk_ipers * rsize + shsize;
1179	wastedspace = keg->uk_slabsize - memused;
1180
1181	/*
1182	 * We can't do OFFPAGE if we're internal or if we've been
1183	 * asked to not go to the VM for buckets.  If we do this we
1184	 * may end up going to the VM for slabs, which we do not
1185	 * want to do if we're UMA_ZFLAG_CACHEONLY as a result
1186	 * of UMA_ZONE_VM, which clearly forbids it.
1187	 */
1188	if ((keg->uk_flags & UMA_ZFLAG_INTERNAL) ||
1189	    (keg->uk_flags & UMA_ZFLAG_CACHEONLY))
1190		return;
1191
1192	/*
1193	 * See if using an OFFPAGE slab will limit our waste.  Only do
1194	 * this if it permits more items per-slab.
1195	 *
1196	 * XXX We could try growing slabsize to limit max waste as well.
1197	 * Historically this was not done because the VM could not
1198	 * efficiently handle contiguous allocations.
1199	 */
1200	if ((wastedspace >= keg->uk_slabsize / UMA_MAX_WASTE) &&
1201	    (keg->uk_ipers < (keg->uk_slabsize / keg->uk_rsize))) {
1202		keg->uk_ipers = keg->uk_slabsize / keg->uk_rsize;
1203		KASSERT(keg->uk_ipers > 0 && keg->uk_ipers <= SLAB_SETSIZE,
1204		    ("%s: keg->uk_ipers %u", __func__, keg->uk_ipers));
1205#ifdef UMA_DEBUG
1206		printf("UMA decided we need offpage slab headers for "
1207		    "keg: %s, calculated wastedspace = %d, "
1208		    "maximum wasted space allowed = %d, "
1209		    "calculated ipers = %d, "
1210		    "new wasted space = %d\n", keg->uk_name, wastedspace,
1211		    keg->uk_slabsize / UMA_MAX_WASTE, keg->uk_ipers,
1212		    keg->uk_slabsize - keg->uk_ipers * keg->uk_rsize);
1213#endif
1214		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1215	}
1216
1217	if ((keg->uk_flags & UMA_ZONE_OFFPAGE) &&
1218	    (keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1219		keg->uk_flags |= UMA_ZONE_HASH;
1220}
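
/*
 * A worked example of the sizing above (illustrative only, assuming a
 * 4096-byte slab, pointer alignment, a hypothetical 32-byte inline slab
 * header and UMA_MAX_WASTE nominally 10): a 256-byte item gives
 * rsize = 256, ipers = (4096 - 32) / 256 = 15 and
 * wastedspace = 4096 - (15 * 256 + 32) = 224.  The OFFPAGE threshold is
 * 4096 / 10 = 409 bytes, so this keg keeps its header inline; a
 * worse-packing size would only move to an OFFPAGE header if doing so
 * bought additional items per slab.
 */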
1221
1222/*
1223 * Finish creating a large (> UMA_SLAB_SIZE) uma keg.  Just give in and do
1224 * OFFPAGE for now.  When I can allow for more dynamic slab sizes this will be
1225 * more complicated.
1226 *
1227 * Arguments
1228 *	keg  The keg we should initialize
1229 *
1230 * Returns
1231 *	Nothing
1232 */
1233static void
1234keg_large_init(uma_keg_t keg)
1235{
1236
1237	KASSERT(keg != NULL, ("Keg is null in keg_large_init"));
1238	KASSERT((keg->uk_flags & UMA_ZFLAG_CACHEONLY) == 0,
1239	    ("keg_large_init: Cannot large-init a UMA_ZFLAG_CACHEONLY keg"));
1240	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1241	    ("%s: Cannot large-init a UMA_ZONE_PCPU keg", __func__));
1242
1243	keg->uk_ppera = howmany(keg->uk_size, PAGE_SIZE);
1244	keg->uk_slabsize = keg->uk_ppera * PAGE_SIZE;
1245	keg->uk_ipers = 1;
1246	keg->uk_rsize = keg->uk_size;
1247
1248	/* We can't do OFFPAGE if we're internal, bail out here. */
1249	if (keg->uk_flags & UMA_ZFLAG_INTERNAL)
1250		return;
1251
1252	keg->uk_flags |= UMA_ZONE_OFFPAGE;
1253	if ((keg->uk_flags & UMA_ZONE_VTOSLAB) == 0)
1254		keg->uk_flags |= UMA_ZONE_HASH;
1255}
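
/*
 * Illustrative only: a 5000-byte item on 4096-byte pages gives
 * uk_ppera = howmany(5000, 4096) = 2 pages and a single item per slab,
 * with the slab header kept OFFPAGE as described above (and hashed
 * unless the keg can use vtoslab()).
 */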
1256
1257static void
1258keg_cachespread_init(uma_keg_t keg)
1259{
1260	int alignsize;
1261	int trailer;
1262	int pages;
1263	int rsize;
1264
1265	KASSERT((keg->uk_flags & UMA_ZONE_PCPU) == 0,
1266	    ("%s: Cannot cachespread-init a UMA_ZONE_PCPU keg", __func__));
1267
1268	alignsize = keg->uk_align + 1;
1269	rsize = keg->uk_size;
1270	/*
1271	 * We want one item to start on every align boundary in a page.  To
1272	 * do this we will span pages.  We will also extend the item by the
1273	 * size of align if it is an even multiple of align.  Otherwise, it
1274	 * would fall on the same boundary every time.
1275	 */
1276	if (rsize & keg->uk_align)
1277		rsize = (rsize & ~keg->uk_align) + alignsize;
1278	if ((rsize & alignsize) == 0)
1279		rsize += alignsize;
1280	trailer = rsize - keg->uk_size;
1281	pages = (rsize * (PAGE_SIZE / alignsize)) / PAGE_SIZE;
1282	pages = MIN(pages, (128 * 1024) / PAGE_SIZE);
1283	keg->uk_rsize = rsize;
1284	keg->uk_ppera = pages;
1285	keg->uk_slabsize = UMA_SLAB_SIZE;
1286	keg->uk_ipers = ((pages * PAGE_SIZE) + trailer) / rsize;
1287	keg->uk_flags |= UMA_ZONE_OFFPAGE | UMA_ZONE_VTOSLAB;
1288	KASSERT(keg->uk_ipers <= uma_max_ipers,
1289	    ("%s: keg->uk_ipers too high(%d) increase max_ipers", __func__,
1290	    keg->uk_ipers));
1291}
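
/*
 * A worked example of the spreading above (illustrative only, assuming
 * 4096-byte pages and 64-byte cache-line alignment): a 128-byte item is
 * an even multiple of the alignment, so rsize is extended to 192
 * (trailer = 64).  Then pages = (192 * (4096 / 64)) / 4096 = 3 and
 * ipers = (3 * 4096 + 64) / 192 = 64, so the 64 items start 192 bytes
 * apart and walk through every cache-line offset in a page instead of
 * all landing on the same one.
 */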
1292
1293/*
1294 * Keg header ctor.  This initializes all fields, locks, etc.  And inserts
1295 * the keg onto the global keg list.
1296 *
1297 * Arguments/Returns follow uma_ctor specifications
1298 *	udata  Actually uma_kctor_args
1299 */
1300static int
1301keg_ctor(void *mem, int size, void *udata, int flags)
1302{
1303	struct uma_kctor_args *arg = udata;
1304	uma_keg_t keg = mem;
1305	uma_zone_t zone;
1306
1307	bzero(keg, size);
1308	keg->uk_size = arg->size;
1309	keg->uk_init = arg->uminit;
1310	keg->uk_fini = arg->fini;
1311	keg->uk_align = arg->align;
1312	keg->uk_free = 0;
1313	keg->uk_reserve = 0;
1314	keg->uk_pages = 0;
1315	keg->uk_flags = arg->flags;
1316	keg->uk_allocf = page_alloc;
1317	keg->uk_freef = page_free;
1318	keg->uk_slabzone = NULL;
1319
1320	/*
1321	 * The master zone is passed to us at keg-creation time.
1322	 */
1323	zone = arg->zone;
1324	keg->uk_name = zone->uz_name;
1325
1326	if (arg->flags & UMA_ZONE_VM)
1327		keg->uk_flags |= UMA_ZFLAG_CACHEONLY;
1328
1329	if (arg->flags & UMA_ZONE_ZINIT)
1330		keg->uk_init = zero_init;
1331
1332	if (arg->flags & UMA_ZONE_REFCNT || arg->flags & UMA_ZONE_MALLOC)
1333		keg->uk_flags |= UMA_ZONE_VTOSLAB;
1334
1335	if (arg->flags & UMA_ZONE_PCPU)
1336#ifdef SMP
1337		keg->uk_flags |= UMA_ZONE_OFFPAGE;
1338#else
1339		keg->uk_flags &= ~UMA_ZONE_PCPU;
1340#endif
1341
1342	if (keg->uk_flags & UMA_ZONE_CACHESPREAD) {
1343		keg_cachespread_init(keg);
1344	} else if (keg->uk_flags & UMA_ZONE_REFCNT) {
1345		if (keg->uk_size >
1346		    (UMA_SLAB_SIZE - sizeof(struct uma_slab_refcnt) -
1347		    sizeof(uint32_t)))
1348			keg_large_init(keg);
1349		else
1350			keg_small_init(keg);
1351	} else {
1352		if (keg->uk_size > (UMA_SLAB_SIZE - sizeof(struct uma_slab)))
1353			keg_large_init(keg);
1354		else
1355			keg_small_init(keg);
1356	}
1357
1358	if (keg->uk_flags & UMA_ZONE_OFFPAGE) {
1359		if (keg->uk_flags & UMA_ZONE_REFCNT) {
1360			if (keg->uk_ipers > uma_max_ipers_ref)
1361				panic("Too many ref items per zone: %d > %d\n",
1362				    keg->uk_ipers, uma_max_ipers_ref);
1363			keg->uk_slabzone = slabrefzone;
1364		} else
1365			keg->uk_slabzone = slabzone;
1366	}
1367
1368	/*
1369	 * If we haven't booted yet we need allocations to go through the
1370	 * startup cache until the vm is ready.
1371	 */
1372	if (keg->uk_ppera == 1) {
1373#ifdef UMA_MD_SMALL_ALLOC
1374		keg->uk_allocf = uma_small_alloc;
1375		keg->uk_freef = uma_small_free;
1376
1377		if (booted < UMA_STARTUP)
1378			keg->uk_allocf = startup_alloc;
1379#else
1380		if (booted < UMA_STARTUP2)
1381			keg->uk_allocf = startup_alloc;
1382#endif
1383	} else if (booted < UMA_STARTUP2 &&
1384	    (keg->uk_flags & UMA_ZFLAG_INTERNAL))
1385		keg->uk_allocf = startup_alloc;
1386
1387	/*
1388	 * Initialize keg's lock
1389	 */
1390	KEG_LOCK_INIT(keg, (arg->flags & UMA_ZONE_MTXCLASS));
1391
1392	/*
1393	 * If we're putting the slab header in the actual page we need to
1394	 * figure out where in each page it goes.  This calculates a right
1395	 * justified offset into the memory on an ALIGN_PTR boundary.
1396	 */
1397	if (!(keg->uk_flags & UMA_ZONE_OFFPAGE)) {
1398		u_int totsize;
1399
1400		/* Size of the slab struct and free list */
1401		totsize = sizeof(struct uma_slab);
1402
1403		/* Size of the reference counts. */
1404		if (keg->uk_flags & UMA_ZONE_REFCNT)
1405			totsize += keg->uk_ipers * sizeof(uint32_t);
1406
1407		if (totsize & UMA_ALIGN_PTR)
1408			totsize = (totsize & ~UMA_ALIGN_PTR) +
1409			    (UMA_ALIGN_PTR + 1);
1410		keg->uk_pgoff = (PAGE_SIZE * keg->uk_ppera) - totsize;
1411
1412		/*
1413		 * The only way the following is possible is if our
1414		 * UMA_ALIGN_PTR adjustments have made us bigger than
1415		 * UMA_SLAB_SIZE.  I haven't checked whether this is
1416		 * mathematically possible for all cases, so we make
1417		 * sure here anyway.
1418		 */
1419		totsize = keg->uk_pgoff + sizeof(struct uma_slab);
1420		if (keg->uk_flags & UMA_ZONE_REFCNT)
1421			totsize += keg->uk_ipers * sizeof(uint32_t);
1422		if (totsize > PAGE_SIZE * keg->uk_ppera) {
1423			printf("zone %s ipers %d rsize %d size %d\n",
1424			    zone->uz_name, keg->uk_ipers, keg->uk_rsize,
1425			    keg->uk_size);
1426			panic("UMA slab won't fit.");
1427		}
1428	}
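	/*
	 * Illustrative only: with 8-byte pointers, a one-page slab and a
	 * hypothetical 44-byte struct uma_slab, totsize rounds up to 48
	 * and uk_pgoff becomes 4096 - 48 = 4048, i.e. the slab header
	 * sits right-justified at the end of the page with the items in
	 * front of it.
	 */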
1429
1430	if (keg->uk_flags & UMA_ZONE_HASH)
1431		hash_alloc(&keg->uk_hash);
1432
1433#ifdef UMA_DEBUG
1434	printf("UMA: %s(%p) size %d(%d) flags %#x ipers %d ppera %d out %d free %d\n",
1435	    zone->uz_name, zone, keg->uk_size, keg->uk_rsize, keg->uk_flags,
1436	    keg->uk_ipers, keg->uk_ppera,
1437	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free);
1438#endif
1439
1440	LIST_INSERT_HEAD(&keg->uk_zones, zone, uz_link);
1441
1442	mtx_lock(&uma_mtx);
1443	LIST_INSERT_HEAD(&uma_kegs, keg, uk_link);
1444	mtx_unlock(&uma_mtx);
1445	return (0);
1446}
1447
1448/*
1449 * Zone header ctor.  This initializes all fields, locks, etc.
1450 *
1451 * Arguments/Returns follow uma_ctor specifications
1452 *	udata  Actually uma_zctor_args
1453 */
1454static int
1455zone_ctor(void *mem, int size, void *udata, int flags)
1456{
1457	struct uma_zctor_args *arg = udata;
1458	uma_zone_t zone = mem;
1459	uma_zone_t z;
1460	uma_keg_t keg;
1461
1462	bzero(zone, size);
1463	zone->uz_name = arg->name;
1464	zone->uz_ctor = arg->ctor;
1465	zone->uz_dtor = arg->dtor;
1466	zone->uz_slab = zone_fetch_slab;
1467	zone->uz_init = NULL;
1468	zone->uz_fini = NULL;
1469	zone->uz_allocs = 0;
1470	zone->uz_frees = 0;
1471	zone->uz_fails = 0;
1472	zone->uz_sleeps = 0;
1473	zone->uz_count = 0;
1474	zone->uz_count_min = 0;
1475	zone->uz_flags = 0;
1476	zone->uz_warning = NULL;
1477	timevalclear(&zone->uz_ratecheck);
1478	keg = arg->keg;
1479
1480	ZONE_LOCK_INIT(zone, (arg->flags & UMA_ZONE_MTXCLASS));
1481
1482	/*
1483	 * This is a pure cache zone, no kegs.
1484	 */
1485	if (arg->import) {
1486		if (arg->flags & UMA_ZONE_VM)
1487			arg->flags |= UMA_ZFLAG_CACHEONLY;
1488		zone->uz_flags = arg->flags;
1489		zone->uz_size = arg->size;
1490		zone->uz_import = arg->import;
1491		zone->uz_release = arg->release;
1492		zone->uz_arg = arg->arg;
1493		zone->uz_lockptr = &zone->uz_lock;
1494		goto out;
1495	}
1496
1497	/*
1498	 * Use the regular zone/keg/slab allocator.
1499	 */
1500	zone->uz_import = (uma_import)zone_import;
1501	zone->uz_release = (uma_release)zone_release;
1502	zone->uz_arg = zone;
1503
1504	if (arg->flags & UMA_ZONE_SECONDARY) {
1505		KASSERT(arg->keg != NULL, ("Secondary zone on zero'd keg"));
1506		zone->uz_init = arg->uminit;
1507		zone->uz_fini = arg->fini;
1508		zone->uz_lockptr = &keg->uk_lock;
1509		zone->uz_flags |= UMA_ZONE_SECONDARY;
1510		mtx_lock(&uma_mtx);
1511		ZONE_LOCK(zone);
1512		LIST_FOREACH(z, &keg->uk_zones, uz_link) {
1513			if (LIST_NEXT(z, uz_link) == NULL) {
1514				LIST_INSERT_AFTER(z, zone, uz_link);
1515				break;
1516			}
1517		}
1518		ZONE_UNLOCK(zone);
1519		mtx_unlock(&uma_mtx);
1520	} else if (keg == NULL) {
1521		if ((keg = uma_kcreate(zone, arg->size, arg->uminit, arg->fini,
1522		    arg->align, arg->flags)) == NULL)
1523			return (ENOMEM);
1524	} else {
1525		struct uma_kctor_args karg;
1526		int error;
1527
1528		/* We should only be here from uma_startup() */
1529		karg.size = arg->size;
1530		karg.uminit = arg->uminit;
1531		karg.fini = arg->fini;
1532		karg.align = arg->align;
1533		karg.flags = arg->flags;
1534		karg.zone = zone;
1535		error = keg_ctor(arg->keg, sizeof(struct uma_keg), &karg,
1536		    flags);
1537		if (error)
1538			return (error);
1539	}
1540
1541	/*
1542	 * Link in the first keg.
1543	 */
1544	zone->uz_klink.kl_keg = keg;
1545	LIST_INSERT_HEAD(&zone->uz_kegs, &zone->uz_klink, kl_link);
1546	zone->uz_lockptr = &keg->uk_lock;
1547	zone->uz_size = keg->uk_size;
1548	zone->uz_flags |= (keg->uk_flags &
1549	    (UMA_ZONE_INHERIT | UMA_ZFLAG_INHERIT));
1550
1551	/*
1552	 * Some internal zones don't have room allocated for the per cpu
1553	 * caches.  If we're internal, bail out here.
1554	 */
1555	if (keg->uk_flags & UMA_ZFLAG_INTERNAL) {
1556		KASSERT((zone->uz_flags & UMA_ZONE_SECONDARY) == 0,
1557		    ("Secondary zone requested UMA_ZFLAG_INTERNAL"));
1558		return (0);
1559	}
1560
1561out:
1562	if ((arg->flags & UMA_ZONE_MAXBUCKET) == 0)
1563		zone->uz_count = bucket_select(zone->uz_size);
1564	else
1565		zone->uz_count = BUCKET_MAX;
1566	zone->uz_count_min = zone->uz_count;
1567
1568	return (0);
1569}
1570
1571/*
1572 * Keg header dtor.  This frees all data, destroys locks, frees the hash
1573 * table and removes the keg from the global list.
1574 *
1575 * Arguments/Returns follow uma_dtor specifications
1576 *	udata  unused
1577 */
1578static void
1579keg_dtor(void *arg, int size, void *udata)
1580{
1581	uma_keg_t keg;
1582
1583	keg = (uma_keg_t)arg;
1584	KEG_LOCK(keg);
1585	if (keg->uk_free != 0) {
1586		printf("Freed UMA keg (%s) was not empty (%d items). "
1587		    " Lost %d pages of memory.\n",
1588		    keg->uk_name ? keg->uk_name : "",
1589		    keg->uk_free, keg->uk_pages);
1590	}
1591	KEG_UNLOCK(keg);
1592
1593	hash_free(&keg->uk_hash);
1594
1595	KEG_LOCK_FINI(keg);
1596}
1597
1598/*
1599 * Zone header dtor.
1600 *
1601 * Arguments/Returns follow uma_dtor specifications
1602 *	udata  unused
1603 */
1604static void
1605zone_dtor(void *arg, int size, void *udata)
1606{
1607	uma_klink_t klink;
1608	uma_zone_t zone;
1609	uma_keg_t keg;
1610
1611	zone = (uma_zone_t)arg;
1612	keg = zone_first_keg(zone);
1613
1614	if (!(zone->uz_flags & UMA_ZFLAG_INTERNAL))
1615		cache_drain(zone);
1616
1617	mtx_lock(&uma_mtx);
1618	LIST_REMOVE(zone, uz_link);
1619	mtx_unlock(&uma_mtx);
1620	/*
1621	 * XXX there are some races here where
1622	 * the zone can be drained but zone lock
1623	 * released and then refilled before we
1624	 * remove it... we don't care for now
1625	 */
1626	zone_drain_wait(zone, M_WAITOK);
1627	/*
1628	 * Unlink all of our kegs.
1629	 */
1630	while ((klink = LIST_FIRST(&zone->uz_kegs)) != NULL) {
1631		klink->kl_keg = NULL;
1632		LIST_REMOVE(klink, kl_link);
1633		if (klink == &zone->uz_klink)
1634			continue;
1635		free(klink, M_TEMP);
1636	}
1637	/*
1638	 * We only destroy kegs from non secondary zones.
1639	 */
1640	if (keg != NULL && (zone->uz_flags & UMA_ZONE_SECONDARY) == 0)  {
1641		mtx_lock(&uma_mtx);
1642		LIST_REMOVE(keg, uk_link);
1643		mtx_unlock(&uma_mtx);
1644		zone_free_item(kegs, keg, NULL, SKIP_NONE);
1645	}
1646	ZONE_LOCK_FINI(zone);
1647}
1648
1649/*
1650 * Traverses every zone in the system and calls a callback
1651 *
1652 * Arguments:
1653 *	zfunc  A pointer to a function which accepts a zone
1654 *		as an argument.
1655 *
1656 * Returns:
1657 *	Nothing
1658 */
1659static void
1660zone_foreach(void (*zfunc)(uma_zone_t))
1661{
1662	uma_keg_t keg;
1663	uma_zone_t zone;
1664
1665	mtx_lock(&uma_mtx);
1666	LIST_FOREACH(keg, &uma_kegs, uk_link) {
1667		LIST_FOREACH(zone, &keg->uk_zones, uz_link)
1668			zfunc(zone);
1669	}
1670	mtx_unlock(&uma_mtx);
1671}
1672
1673/* Public functions */
1674/* See uma.h */
1675void
1676uma_startup(void *bootmem, int boot_pages)
1677{
1678	struct uma_zctor_args args;
1679	uma_slab_t slab;
1680	u_int slabsize;
1681	int i;
1682
1683#ifdef UMA_DEBUG
1684	printf("Creating uma keg headers zone and keg.\n");
1685#endif
1686	mtx_init(&uma_mtx, "UMA lock", NULL, MTX_DEF);
1687
1688	/* "manually" create the initial zone */
1689	memset(&args, 0, sizeof(args));
1690	args.name = "UMA Kegs";
1691	args.size = sizeof(struct uma_keg);
1692	args.ctor = keg_ctor;
1693	args.dtor = keg_dtor;
1694	args.uminit = zero_init;
1695	args.fini = NULL;
1696	args.keg = &masterkeg;
1697	args.align = 32 - 1;
1698	args.flags = UMA_ZFLAG_INTERNAL;
1699	/* The initial zone has no per-CPU queues so it's smaller */
1700	zone_ctor(kegs, sizeof(struct uma_zone), &args, M_WAITOK);
1701
1702#ifdef UMA_DEBUG
1703	printf("Filling boot free list.\n");
1704#endif
1705	for (i = 0; i < boot_pages; i++) {
1706		slab = (uma_slab_t)((uint8_t *)bootmem + (i * UMA_SLAB_SIZE));
1707		slab->us_data = (uint8_t *)slab;
1708		slab->us_flags = UMA_SLAB_BOOT;
1709		LIST_INSERT_HEAD(&uma_boot_pages, slab, us_link);
1710	}
1711	mtx_init(&uma_boot_pages_mtx, "UMA boot pages", NULL, MTX_DEF);
1712
1713#ifdef UMA_DEBUG
1714	printf("Creating uma zone headers zone and keg.\n");
1715#endif
1716	args.name = "UMA Zones";
1717	args.size = sizeof(struct uma_zone) +
1718	    (sizeof(struct uma_cache) * (mp_maxid + 1));
1719	args.ctor = zone_ctor;
1720	args.dtor = zone_dtor;
1721	args.uminit = zero_init;
1722	args.fini = NULL;
1723	args.keg = NULL;
1724	args.align = 32 - 1;
1725	args.flags = UMA_ZFLAG_INTERNAL;
1726	/* The initial zone has no per-CPU queues so it's smaller */
1727	zone_ctor(zones, sizeof(struct uma_zone), &args, M_WAITOK);
1728
1729#ifdef UMA_DEBUG
1730	printf("Initializing pcpu cache locks.\n");
1731#endif
1732#ifdef UMA_DEBUG
1733	printf("Creating slab and hash zones.\n");
1734#endif
1735
1736	/* Now make a zone for slab headers */
1737	slabzone = uma_zcreate("UMA Slabs",
1738	    sizeof(struct uma_slab),
1739	    NULL, NULL, NULL, NULL,
1740	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1741
1742	/*
1743	 * We also create a zone for the bigger slabs with reference
1744	 * counts in them, to accommodate UMA_ZONE_REFCNT zones.
1745	 */
1746	slabsize = sizeof(struct uma_slab_refcnt);
1747	slabsize += uma_max_ipers_ref * sizeof(uint32_t);
1748	slabrefzone = uma_zcreate("UMA RCntSlabs",
1749	    slabsize,
1750	    NULL, NULL, NULL, NULL,
1751	    UMA_ALIGN_PTR,
1752	    UMA_ZFLAG_INTERNAL);
1753
1754	hashzone = uma_zcreate("UMA Hash",
1755	    sizeof(struct slabhead *) * UMA_HASH_SIZE_INIT,
1756	    NULL, NULL, NULL, NULL,
1757	    UMA_ALIGN_PTR, UMA_ZFLAG_INTERNAL);
1758
1759	bucket_init();
1760
1761	booted = UMA_STARTUP;
1762
1763#ifdef UMA_DEBUG
1764	printf("UMA startup complete.\n");
1765#endif
1766}
1767
1768/* see uma.h */
1769void
1770uma_startup2(void)
1771{
1772	booted = UMA_STARTUP2;
1773	bucket_enable();
1774#ifdef UMA_DEBUG
1775	printf("UMA startup2 complete.\n");
1776#endif
1777}
1778
1779/*
1780 * Initialize our callout handle
1781 *
1782 */
1783
1784static void
1785uma_startup3(void)
1786{
1787#ifdef UMA_DEBUG
1788	printf("Starting callout.\n");
1789#endif
1790	callout_init(&uma_callout, CALLOUT_MPSAFE);
1791	callout_reset(&uma_callout, UMA_TIMEOUT * hz, uma_timeout, NULL);
1792#ifdef UMA_DEBUG
1793	printf("UMA startup3 complete.\n");
1794#endif
1795}
1796
1797static uma_keg_t
1798uma_kcreate(uma_zone_t zone, size_t size, uma_init uminit, uma_fini fini,
1799		int align, uint32_t flags)
1800{
1801	struct uma_kctor_args args;
1802
1803	args.size = size;
1804	args.uminit = uminit;
1805	args.fini = fini;
1806	args.align = (align == UMA_ALIGN_CACHE) ? uma_align_cache : align;
1807	args.flags = flags;
1808	args.zone = zone;
1809	return (zone_alloc_item(kegs, &args, M_WAITOK));
1810}
1811
1812/* See uma.h */
1813void
1814uma_set_align(int align)
1815{
1816
1817	if (align != UMA_ALIGN_CACHE)
1818		uma_align_cache = align;
1819}
1820
1821/* See uma.h */
1822uma_zone_t
1823uma_zcreate(const char *name, size_t size, uma_ctor ctor, uma_dtor dtor,
1824		uma_init uminit, uma_fini fini, int align, uint32_t flags)
1825
1826{
1827	struct uma_zctor_args args;
1828
1829	/* This stuff is essential for the zone ctor */
1830	memset(&args, 0, sizeof(args));
1831	args.name = name;
1832	args.size = size;
1833	args.ctor = ctor;
1834	args.dtor = dtor;
1835	args.uminit = uminit;
1836	args.fini = fini;
1837	args.align = align;
1838	args.flags = flags;
1839	args.keg = NULL;
1840
1841	return (zone_alloc_item(zones, &args, M_WAITOK));
1842}
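
/*
 * A minimal usage sketch (illustrative only, not compiled): the "foo"
 * structure and zone are hypothetical, but the calls follow the public
 * interface declared in uma.h.
 */
#if 0
static uma_zone_t foo_zone;

static void
foo_example(void)
{
	struct foo *f;

	foo_zone = uma_zcreate("foo", sizeof(struct foo), NULL, NULL,
	    NULL, NULL, UMA_ALIGN_PTR, 0);
	f = uma_zalloc(foo_zone, M_WAITOK | M_ZERO);
	/* ... use the object ... */
	uma_zfree(foo_zone, f);
	uma_zdestroy(foo_zone);
}
#endif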
1843
1844/* See uma.h */
1845uma_zone_t
1846uma_zsecond_create(char *name, uma_ctor ctor, uma_dtor dtor,
1847		    uma_init zinit, uma_fini zfini, uma_zone_t master)
1848{
1849	struct uma_zctor_args args;
1850	uma_keg_t keg;
1851
1852	keg = zone_first_keg(master);
1853	memset(&args, 0, sizeof(args));
1854	args.name = name;
1855	args.size = keg->uk_size;
1856	args.ctor = ctor;
1857	args.dtor = dtor;
1858	args.uminit = zinit;
1859	args.fini = zfini;
1860	args.align = keg->uk_align;
1861	args.flags = keg->uk_flags | UMA_ZONE_SECONDARY;
1862	args.keg = keg;
1863
1864	/* XXX Attaches only one keg of potentially many. */
1865	return (zone_alloc_item(zones, &args, M_WAITOK));
1866}
1867
1868/* See uma.h */
1869uma_zone_t
1870uma_zcache_create(char *name, int size, uma_ctor ctor, uma_dtor dtor,
1871		    uma_init zinit, uma_fini zfini, uma_import zimport,
1872		    uma_release zrelease, void *arg, int flags)
1873{
1874	struct uma_zctor_args args;
1875
1876	memset(&args, 0, sizeof(args));
1877	args.name = name;
1878	args.size = size;
1879	args.ctor = ctor;
1880	args.dtor = dtor;
1881	args.uminit = zinit;
1882	args.fini = zfini;
1883	args.import = zimport;
1884	args.release = zrelease;
1885	args.arg = arg;
1886	args.align = 0;
1887	args.flags = flags;
1888
1889	return (zone_alloc_item(zones, &args, M_WAITOK));
1890}
1891
1892static void
1893zone_lock_pair(uma_zone_t a, uma_zone_t b)
1894{
1895	if (a < b) {
1896		ZONE_LOCK(a);
1897		mtx_lock_flags(b->uz_lockptr, MTX_DUPOK);
1898	} else {
1899		ZONE_LOCK(b);
1900		mtx_lock_flags(a->uz_lockptr, MTX_DUPOK);
1901	}
1902}
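
/*
 * Note: taking the two locks in address order gives every caller a
 * consistent acquisition order, so concurrent uma_zsecond_add() calls
 * cannot deadlock against each other; MTX_DUPOK tells WITNESS that
 * holding two locks of the same class here is intentional.
 */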
1903
1904static void
1905zone_unlock_pair(uma_zone_t a, uma_zone_t b)
1906{
1907
1908	ZONE_UNLOCK(a);
1909	ZONE_UNLOCK(b);
1910}
1911
1912int
1913uma_zsecond_add(uma_zone_t zone, uma_zone_t master)
1914{
1915	uma_klink_t klink;
1916	uma_klink_t kl;
1917	int error;
1918
1919	error = 0;
1920	klink = malloc(sizeof(*klink), M_TEMP, M_WAITOK | M_ZERO);
1921
1922	zone_lock_pair(zone, master);
1923	/*
1924	 * zone must use vtoslab() to resolve objects and must already be
1925	 * a secondary.
1926	 */
1927	if ((zone->uz_flags & (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY))
1928	    != (UMA_ZONE_VTOSLAB | UMA_ZONE_SECONDARY)) {
1929		error = EINVAL;
1930		goto out;
1931	}
1932	/*
1933	 * The new master must also use vtoslab().
1934	 */
1935	if ((zone->uz_flags & UMA_ZONE_VTOSLAB) != UMA_ZONE_VTOSLAB) {
1936		error = EINVAL;
1937		goto out;
1938	}
1939	/*
1940	 * Both must either be refcnt, or not be refcnt.
1941	 */
1942	if ((zone->uz_flags & UMA_ZONE_REFCNT) !=
1943	    (master->uz_flags & UMA_ZONE_REFCNT)) {
1944		error = EINVAL;
1945		goto out;
1946	}
1947	/*
1948	 * The underlying object must be the same size.  rsize
1949	 * may be different.
1950	 */
1951	if (master->uz_size != zone->uz_size) {
1952		error = E2BIG;
1953		goto out;
1954	}
1955	/*
1956	 * Put it at the end of the list.
1957	 */
1958	klink->kl_keg = zone_first_keg(master);
1959	LIST_FOREACH(kl, &zone->uz_kegs, kl_link) {
1960		if (LIST_NEXT(kl, kl_link) == NULL) {
1961			LIST_INSERT_AFTER(kl, klink, kl_link);
1962			break;
1963		}
1964	}
1965	klink = NULL;
1966	zone->uz_flags |= UMA_ZFLAG_MULTI;
1967	zone->uz_slab = zone_fetch_slab_multi;
1968
1969out:
1970	zone_unlock_pair(zone, master);
1971	if (klink != NULL)
1972		free(klink, M_TEMP);
1973
1974	return (error);
1975}
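/*
 * Sketch of attaching a second keg to an existing secondary zone.  Both the
 * zone and the new master must use vtoslab() and have kegs of the same size,
 * as the checks above enforce.  "foo_pkt_zone" is the hypothetical secondary
 * from the sketch further up; "other_master" is likewise invented.
 */
static int
foo_pkt_add_keg(uma_zone_t other_master)
{
	int error;

	error = uma_zsecond_add(foo_pkt_zone, other_master);
	if (error != 0)
		printf("foo pkt: cannot attach keg: %d\n", error);
	return (error);
}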
1976
1977
1978/* See uma.h */
1979void
1980uma_zdestroy(uma_zone_t zone)
1981{
1982
1983	zone_free_item(zones, zone, NULL, SKIP_NONE);
1984}
1985
1986/* See uma.h */
1987void *
1988uma_zalloc_arg(uma_zone_t zone, void *udata, int flags)
1989{
1990	void *item;
1991	uma_cache_t cache;
1992	uma_bucket_t bucket;
1993	int lockfail;
1994	int cpu;
1995
1996	/* This is the fast path allocation */
1997#ifdef UMA_DEBUG_ALLOC_1
1998	printf("Allocating one item from %s(%p)\n", zone->uz_name, zone);
1999#endif
2000	CTR3(KTR_UMA, "uma_zalloc_arg thread %x zone %s flags %d", curthread,
2001	    zone->uz_name, flags);
2002
2003	if (flags & M_WAITOK) {
2004		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
2005		    "uma_zalloc_arg: zone \"%s\"", zone->uz_name);
2006	}
2007#ifdef DEBUG_MEMGUARD
2008	if (memguard_cmp_zone(zone)) {
2009		item = memguard_alloc(zone->uz_size, flags);
2010		if (item != NULL) {
2011			/*
2012			 * Avoid conflict with the use-after-free
2013			 * protecting infrastructure from INVARIANTS.
2014			 */
2015			if (zone->uz_init != NULL &&
2016			    zone->uz_init != mtrash_init &&
2017			    zone->uz_init(item, zone->uz_size, flags) != 0)
2018				return (NULL);
2019			if (zone->uz_ctor != NULL &&
2020			    zone->uz_ctor != mtrash_ctor &&
2021			    zone->uz_ctor(item, zone->uz_size, udata,
2022			    flags) != 0) {
2023				zone->uz_fini(item, zone->uz_size);
2024				return (NULL);
2025			}
2026			return (item);
2027		}
2028		/* This is unfortunate but should not be fatal. */
2029	}
2030#endif
2031	/*
2032	 * If possible, allocate from the per-CPU cache.  There are two
2033	 * requirements for safe access to the per-CPU cache: (1) the thread
2034	 * accessing the cache must not be preempted or yield during access,
2035	 * and (2) the thread must not migrate CPUs without switching which
2036	 * cache it accesses.  We rely on a critical section to prevent
2037	 * preemption and migration.  We release the critical section in
2038	 * order to acquire the zone mutex if we are unable to allocate from
2039	 * the current cache; when we re-acquire the critical section, we
2040	 * must detect and handle migration if it has occurred.
2041	 */
2042	critical_enter();
2043	cpu = curcpu;
2044	cache = &zone->uz_cpu[cpu];
2045
2046zalloc_start:
2047	bucket = cache->uc_allocbucket;
2048	if (bucket != NULL && bucket->ub_cnt > 0) {
2049		bucket->ub_cnt--;
2050		item = bucket->ub_bucket[bucket->ub_cnt];
2051#ifdef INVARIANTS
2052		bucket->ub_bucket[bucket->ub_cnt] = NULL;
2053#endif
2054		KASSERT(item != NULL, ("uma_zalloc: Bucket pointer mangled."));
2055		cache->uc_allocs++;
2056		critical_exit();
2057		if (zone->uz_ctor != NULL &&
2058		    zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2059			atomic_add_long(&zone->uz_fails, 1);
2060			zone_free_item(zone, item, udata, SKIP_DTOR);
2061			return (NULL);
2062		}
2063#ifdef INVARIANTS
2064		uma_dbg_alloc(zone, NULL, item);
2065#endif
2066		if (flags & M_ZERO)
2067			bzero(item, zone->uz_size);
2068		return (item);
2069	}
2070
2071	/*
2072	 * We have run out of items in our alloc bucket.
2073	 * See if we can switch with our free bucket.
2074	 */
2075	bucket = cache->uc_freebucket;
2076	if (bucket != NULL && bucket->ub_cnt > 0) {
2077#ifdef UMA_DEBUG_ALLOC
2078		printf("uma_zalloc: Swapping empty with alloc.\n");
2079#endif
2080		cache->uc_freebucket = cache->uc_allocbucket;
2081		cache->uc_allocbucket = bucket;
2082		goto zalloc_start;
2083	}
2084
2085	/*
2086	 * Discard any empty allocation bucket while we hold no locks.
2087	 */
2088	bucket = cache->uc_allocbucket;
2089	cache->uc_allocbucket = NULL;
2090	critical_exit();
2091	if (bucket != NULL)
2092		bucket_free(zone, bucket, udata);
2093
2094	/* Short-circuit for zones without buckets and low memory. */
2095	if (zone->uz_count == 0 || bucketdisable)
2096		goto zalloc_item;
2097
2098	/*
2099	 * The attempt to retrieve the item from the per-CPU cache failed, so
2100	 * we must go back to the zone.  This requires the zone lock, so we
2101	 * must drop the critical section, then re-acquire it when we go back
2102	 * to the cache.  Since the critical section is released, we may be
2103	 * preempted or migrate.  As such, make sure not to maintain any
2104	 * thread-local state specific to the cache from prior to releasing
2105	 * the critical section.
2106	 */
2107	lockfail = 0;
2108	if (ZONE_TRYLOCK(zone) == 0) {
2109		/* Record contention to size the buckets. */
2110		ZONE_LOCK(zone);
2111		lockfail = 1;
2112	}
2113	critical_enter();
2114	cpu = curcpu;
2115	cache = &zone->uz_cpu[cpu];
2116
2117	/*
2118	 * Since we have locked the zone we may as well send back our stats.
2119	 */
2120	atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
2121	atomic_add_long(&zone->uz_frees, cache->uc_frees);
2122	cache->uc_allocs = 0;
2123	cache->uc_frees = 0;
2124
2125	/* See if we lost the race to fill the cache. */
2126	if (cache->uc_allocbucket != NULL) {
2127		ZONE_UNLOCK(zone);
2128		goto zalloc_start;
2129	}
2130
2131	/*
2132	 * Check the zone's cache of buckets.
2133	 */
2134	if ((bucket = LIST_FIRST(&zone->uz_buckets)) != NULL) {
2135		KASSERT(bucket->ub_cnt != 0,
2136		    ("uma_zalloc_arg: Returning an empty bucket."));
2137
2138		LIST_REMOVE(bucket, ub_link);
2139		cache->uc_allocbucket = bucket;
2140		ZONE_UNLOCK(zone);
2141		goto zalloc_start;
2142	}
2143	/* We are no longer associated with this CPU. */
2144	critical_exit();
2145
2146	/*
2147	 * We bump the uz count when the cache size is insufficient to
2148	 * handle the working set.
2149	 */
2150	if (lockfail && zone->uz_count < BUCKET_MAX)
2151		zone->uz_count++;
2152	ZONE_UNLOCK(zone);
2153
2154	/*
2155	 * Now let's just fill a bucket and put it on the free list.  If that
2156	 * works we'll restart the allocation from the beginning and it
2157	 * will use the just filled bucket.
2158	 */
2159	bucket = zone_alloc_bucket(zone, udata, flags);
2160	if (bucket != NULL) {
2161		ZONE_LOCK(zone);
2162		critical_enter();
2163		cpu = curcpu;
2164		cache = &zone->uz_cpu[cpu];
2165		/*
2166		 * See if we lost the race or were migrated.  Cache the
2167		 * initialized bucket to make this less likely or claim
2168		 * the memory directly.
2169		 */
2170		if (cache->uc_allocbucket == NULL)
2171			cache->uc_allocbucket = bucket;
2172		else
2173			LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
2174		ZONE_UNLOCK(zone);
2175		goto zalloc_start;
2176	}
2177
2178	/*
2179	 * We may not be able to get a bucket so return an actual item.
2180	 */
2181#ifdef UMA_DEBUG
2182	printf("uma_zalloc_arg: Bucketzone returned NULL\n");
2183#endif
2184
2185zalloc_item:
2186	item = zone_alloc_item(zone, udata, flags);
2187
2188	return (item);
2189}
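/*
 * Caller-side sketch of the fast path above (hypothetical zone and struct):
 * most consumers go through the uma_zalloc() wrapper in uma.h, which is
 * uma_zalloc_arg() with a NULL udata.
 */
static struct foo *
foo_alloc(int how)
{
	struct foo *fp;

	/* "how" is M_WAITOK or M_NOWAIT; M_ZERO zeroes the whole item. */
	fp = uma_zalloc_arg(foo_zone, NULL, how | M_ZERO);
	if (fp == NULL)
		return (NULL);
	fp->foo_state = 1;
	return (fp);
}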
2190
2191static uma_slab_t
2192keg_fetch_slab(uma_keg_t keg, uma_zone_t zone, int flags)
2193{
2194	uma_slab_t slab;
2195	int reserve;
2196
2197	mtx_assert(&keg->uk_lock, MA_OWNED);
2198	slab = NULL;
2199	reserve = 0;
2200	if ((flags & M_USE_RESERVE) == 0)
2201		reserve = keg->uk_reserve;
2202
2203	for (;;) {
2204		/*
2205		 * Find a slab with some space.  Prefer slabs that are partially
2206		 * used over those that are totally full.  This helps to reduce
2207		 * fragmentation.
2208		 */
2209		if (keg->uk_free > reserve) {
2210			if (!LIST_EMPTY(&keg->uk_part_slab)) {
2211				slab = LIST_FIRST(&keg->uk_part_slab);
2212			} else {
2213				slab = LIST_FIRST(&keg->uk_free_slab);
2214				LIST_REMOVE(slab, us_link);
2215				LIST_INSERT_HEAD(&keg->uk_part_slab, slab,
2216				    us_link);
2217			}
2218			MPASS(slab->us_keg == keg);
2219			return (slab);
2220		}
2221
2222		/*
2223		 * M_NOVM means don't ask at all!
2224		 */
2225		if (flags & M_NOVM)
2226			break;
2227
2228		if (keg->uk_maxpages && keg->uk_pages >= keg->uk_maxpages) {
2229			keg->uk_flags |= UMA_ZFLAG_FULL;
2230			/*
2231			 * If this is not a multi-zone, set the FULL bit.
2232			 * Otherwise slab_multi() takes care of it.
2233			 */
2234			if ((zone->uz_flags & UMA_ZFLAG_MULTI) == 0) {
2235				zone->uz_flags |= UMA_ZFLAG_FULL;
2236				zone_log_warning(zone);
2237			}
2238			if (flags & M_NOWAIT)
2239				break;
2240			zone->uz_sleeps++;
2241			msleep(keg, &keg->uk_lock, PVM, "keglimit", 0);
2242			continue;
2243		}
2244		slab = keg_alloc_slab(keg, zone, flags);
2245		/*
2246		 * If we got a slab here it's safe to mark it partially used
2247		 * and return.  We assume that the caller is going to remove
2248		 * at least one item.
2249		 */
2250		if (slab) {
2251			MPASS(slab->us_keg == keg);
2252			LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2253			return (slab);
2254		}
2255		/*
2256		 * We might not have been able to get a slab but another cpu
2257		 * could have while we were unlocked.  Check again before we
2258		 * fail.
2259		 */
2260		flags |= M_NOVM;
2261	}
2262	return (slab);
2263}
2264
2265static uma_slab_t
2266zone_fetch_slab(uma_zone_t zone, uma_keg_t keg, int flags)
2267{
2268	uma_slab_t slab;
2269
2270	if (keg == NULL) {
2271		keg = zone_first_keg(zone);
2272		KEG_LOCK(keg);
2273	}
2274
2275	for (;;) {
2276		slab = keg_fetch_slab(keg, zone, flags);
2277		if (slab)
2278			return (slab);
2279		if (flags & (M_NOWAIT | M_NOVM))
2280			break;
2281	}
2282	KEG_UNLOCK(keg);
2283	return (NULL);
2284}
2285
2286/*
2287 * zone_fetch_slab_multi:  Fetches a slab from one available keg.  Returns
2288 * with the keg locked.  On NULL no lock is held.
2289 *
2290 * The last pointer is used to seed the search.  It is not required.
2291 */
2292static uma_slab_t
2293zone_fetch_slab_multi(uma_zone_t zone, uma_keg_t last, int rflags)
2294{
2295	uma_klink_t klink;
2296	uma_slab_t slab;
2297	uma_keg_t keg;
2298	int flags;
2299	int empty;
2300	int full;
2301
2302	/*
2303	 * Don't wait on the first pass.  This will skip limit tests
2304	 * as well.  We don't want to block if we can find a provider
2305	 * without blocking.
2306	 */
2307	flags = (rflags & ~M_WAITOK) | M_NOWAIT;
2308	/*
2309	 * Use the last slab allocated as a hint for where to start
2310	 * the search.
2311	 */
2312	if (last != NULL) {
2313		slab = keg_fetch_slab(last, zone, flags);
2314		if (slab)
2315			return (slab);
2316		KEG_UNLOCK(last);
2317	}
2318	/*
2319	 * Loop until we have a slab in case of transient failures
2320	 * while M_WAITOK is specified.  I'm not sure this is 100%
2321	 * required but we've done it for so long now.
2322	 */
2323	for (;;) {
2324		empty = 0;
2325		full = 0;
2326		/*
2327		 * Search the available kegs for slabs.  Be careful to hold the
2328		 * correct lock while calling into the keg layer.
2329		 */
2330		LIST_FOREACH(klink, &zone->uz_kegs, kl_link) {
2331			keg = klink->kl_keg;
2332			KEG_LOCK(keg);
2333			if ((keg->uk_flags & UMA_ZFLAG_FULL) == 0) {
2334				slab = keg_fetch_slab(keg, zone, flags);
2335				if (slab)
2336					return (slab);
2337			}
2338			if (keg->uk_flags & UMA_ZFLAG_FULL)
2339				full++;
2340			else
2341				empty++;
2342			KEG_UNLOCK(keg);
2343		}
2344		if (rflags & (M_NOWAIT | M_NOVM))
2345			break;
2346		flags = rflags;
2347		/*
2348		 * All kegs are full.  XXX We can't atomically check all kegs
2349		 * and sleep so just sleep for a short period and retry.
2350		 */
2351		if (full && !empty) {
2352			ZONE_LOCK(zone);
2353			zone->uz_flags |= UMA_ZFLAG_FULL;
2354			zone->uz_sleeps++;
2355			zone_log_warning(zone);
2356			msleep(zone, zone->uz_lockptr, PVM,
2357			    "zonelimit", hz/100);
2358			zone->uz_flags &= ~UMA_ZFLAG_FULL;
2359			ZONE_UNLOCK(zone);
2360			continue;
2361		}
2362	}
2363	return (NULL);
2364}
2365
2366static void *
2367slab_alloc_item(uma_keg_t keg, uma_slab_t slab)
2368{
2369	void *item;
2370	uint8_t freei;
2371
2372	MPASS(keg == slab->us_keg);
2373	mtx_assert(&keg->uk_lock, MA_OWNED);
2374
2375	freei = BIT_FFS(SLAB_SETSIZE, &slab->us_free) - 1;
2376	BIT_CLR(SLAB_SETSIZE, freei, &slab->us_free);
2377	item = slab->us_data + (keg->uk_rsize * freei);
2378	slab->us_freecount--;
2379	keg->uk_free--;
2380
2381	/* Move this slab to the full list */
2382	if (slab->us_freecount == 0) {
2383		LIST_REMOVE(slab, us_link);
2384		LIST_INSERT_HEAD(&keg->uk_full_slab, slab, us_link);
2385	}
2386
2387	return (item);
2388}
2389
2390static int
2391zone_import(uma_zone_t zone, void **bucket, int max, int flags)
2392{
2393	uma_slab_t slab;
2394	uma_keg_t keg;
2395	int i;
2396
2397	slab = NULL;
2398	keg = NULL;
2399	/* Try to keep the buckets totally full */
2400	for (i = 0; i < max; ) {
2401		if ((slab = zone->uz_slab(zone, keg, flags)) == NULL)
2402			break;
2403		keg = slab->us_keg;
2404		while (slab->us_freecount && i < max) {
2405			bucket[i++] = slab_alloc_item(keg, slab);
2406			if (keg->uk_free <= keg->uk_reserve)
2407				break;
2408		}
2409		/* Don't grab more than one slab at a time. */
2410		flags &= ~M_WAITOK;
2411		flags |= M_NOWAIT;
2412	}
2413	if (slab != NULL)
2414		KEG_UNLOCK(keg);
2415
2416	return (i);
2417}
2418
2419static uma_bucket_t
2420zone_alloc_bucket(uma_zone_t zone, void *udata, int flags)
2421{
2422	uma_bucket_t bucket;
2423	int max;
2424
2425	/* Don't wait for buckets, preserve caller's NOVM setting. */
2426	bucket = bucket_alloc(zone, udata, M_NOWAIT | (flags & M_NOVM));
2427	if (bucket == NULL)
2428		goto out;
2429
2430	max = MIN(bucket->ub_entries, zone->uz_count);
2431	bucket->ub_cnt = zone->uz_import(zone->uz_arg, bucket->ub_bucket,
2432	    max, flags);
2433
2434	/*
2435	 * Initialize the memory if necessary.
2436	 */
2437	if (bucket->ub_cnt != 0 && zone->uz_init != NULL) {
2438		int i;
2439
2440		for (i = 0; i < bucket->ub_cnt; i++)
2441			if (zone->uz_init(bucket->ub_bucket[i], zone->uz_size,
2442			    flags) != 0)
2443				break;
2444		/*
2445		 * If we couldn't initialize the whole bucket, put the
2446		 * rest back onto the freelist.
2447		 */
2448		if (i != bucket->ub_cnt) {
2449			zone->uz_release(zone->uz_arg, &bucket->ub_bucket[i],
2450			    bucket->ub_cnt - i);
2451#ifdef INVARIANTS
2452			bzero(&bucket->ub_bucket[i],
2453			    sizeof(void *) * (bucket->ub_cnt - i));
2454#endif
2455			bucket->ub_cnt = i;
2456		}
2457	}
2458
2459out:
2460	if (bucket == NULL || bucket->ub_cnt == 0) {
2461		if (bucket != NULL)
2462			bucket_free(zone, bucket, udata);
2463		atomic_add_long(&zone->uz_fails, 1);
2464		return (NULL);
2465	}
2466
2467	return (bucket);
2468}
2469
2470/*
2471 * Allocates a single item from a zone.
2472 *
2473 * Arguments
2474 *	zone   The zone to alloc for.
2475 *	udata  The data to be passed to the constructor.
2476 *	flags  M_WAITOK, M_NOWAIT, M_ZERO.
2477 *
2478 * Returns
2479 *	NULL if there is no memory and M_NOWAIT is set
2480 *	An item if successful
2481 */
2482
2483static void *
2484zone_alloc_item(uma_zone_t zone, void *udata, int flags)
2485{
2486	void *item;
2487
2488	item = NULL;
2489
2490#ifdef UMA_DEBUG_ALLOC
2491	printf("INTERNAL: Allocating one item from %s(%p)\n", zone->uz_name, zone);
2492#endif
2493	if (zone->uz_import(zone->uz_arg, &item, 1, flags) != 1)
2494		goto fail;
2495	atomic_add_long(&zone->uz_allocs, 1);
2496
2497	/*
2498	 * We have to call both the zone's init (not the keg's init)
2499	 * and the zone's ctor.  This is because the item is going from
2500	 * a keg slab directly to the user, and the user is expecting it
2501	 * to be both zone-init'd as well as zone-ctor'd.
2502	 */
2503	if (zone->uz_init != NULL) {
2504		if (zone->uz_init(item, zone->uz_size, flags) != 0) {
2505			zone_free_item(zone, item, udata, SKIP_FINI);
2506			goto fail;
2507		}
2508	}
2509	if (zone->uz_ctor != NULL) {
2510		if (zone->uz_ctor(item, zone->uz_size, udata, flags) != 0) {
2511			zone_free_item(zone, item, udata, SKIP_DTOR);
2512			goto fail;
2513		}
2514	}
2515#ifdef INVARIANTS
2516	uma_dbg_alloc(zone, NULL, item);
2517#endif
2518	if (flags & M_ZERO)
2519		bzero(item, zone->uz_size);
2520
2521	return (item);
2522
2523fail:
2524	atomic_add_long(&zone->uz_fails, 1);
2525	return (NULL);
2526}
2527
2528/* See uma.h */
2529void
2530uma_zfree_arg(uma_zone_t zone, void *item, void *udata)
2531{
2532	uma_cache_t cache;
2533	uma_bucket_t bucket;
2534	int cpu;
2535
2536#ifdef UMA_DEBUG_ALLOC_1
2537	printf("Freeing item %p to %s(%p)\n", item, zone->uz_name, zone);
2538#endif
2539	CTR2(KTR_UMA, "uma_zfree_arg thread %x zone %s", curthread,
2540	    zone->uz_name);
2541
2542	/* uma_zfree(..., NULL) does nothing, to match free(9). */
2543	if (item == NULL)
2544		return;
2545#ifdef DEBUG_MEMGUARD
2546	if (is_memguard_addr(item)) {
2547		if (zone->uz_dtor != NULL && zone->uz_dtor != mtrash_dtor)
2548			zone->uz_dtor(item, zone->uz_size, udata);
2549		if (zone->uz_fini != NULL && zone->uz_fini != mtrash_fini)
2550			zone->uz_fini(item, zone->uz_size);
2551		memguard_free(item);
2552		return;
2553	}
2554#endif
2555#ifdef INVARIANTS
2556	if (zone->uz_flags & UMA_ZONE_MALLOC)
2557		uma_dbg_free(zone, udata, item);
2558	else
2559		uma_dbg_free(zone, NULL, item);
2560#endif
2561	if (zone->uz_dtor != NULL)
2562		zone->uz_dtor(item, zone->uz_size, udata);
2563
2564	/*
2565	 * The race here is acceptable.  If we miss it we'll just have to wait
2566	 * a little longer for the limits to be reset.
2567	 */
2568	if (zone->uz_flags & UMA_ZFLAG_FULL)
2569		goto zfree_item;
2570
2571	/*
2572	 * If possible, free to the per-CPU cache.  There are two
2573	 * requirements for safe access to the per-CPU cache: (1) the thread
2574	 * accessing the cache must not be preempted or yield during access,
2575	 * and (2) the thread must not migrate CPUs without switching which
2576	 * cache it accesses.  We rely on a critical section to prevent
2577	 * preemption and migration.  We release the critical section in
2578	 * order to acquire the zone mutex if we are unable to free to the
2579	 * current cache; when we re-acquire the critical section, we must
2580	 * detect and handle migration if it has occurred.
2581	 */
2582zfree_restart:
2583	critical_enter();
2584	cpu = curcpu;
2585	cache = &zone->uz_cpu[cpu];
2586
2587zfree_start:
2588	/*
2589	 * Try to free into the allocbucket first to give LIFO ordering
2590	 * for cache-hot datastructures.  Spill over into the freebucket
2591	 * if necessary.  Alloc will swap them if one runs dry.
2592	 */
2593	bucket = cache->uc_allocbucket;
2594	if (bucket == NULL || bucket->ub_cnt >= bucket->ub_entries)
2595		bucket = cache->uc_freebucket;
2596	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
2597		KASSERT(bucket->ub_bucket[bucket->ub_cnt] == NULL,
2598		    ("uma_zfree: Freeing to non free bucket index."));
2599		bucket->ub_bucket[bucket->ub_cnt] = item;
2600		bucket->ub_cnt++;
2601		cache->uc_frees++;
2602		critical_exit();
2603		return;
2604	}
2605
2606	/*
2607	 * We must go back to the zone, which requires acquiring the zone lock,
2608	 * which in turn means we must release and re-acquire the critical
2609	 * section.  Since the critical section is released, we may be
2610	 * preempted or migrate.  As such, make sure not to maintain any
2611	 * thread-local state specific to the cache from prior to releasing
2612	 * the critical section.
2613	 */
2614	critical_exit();
2615	if (zone->uz_count == 0 || bucketdisable)
2616		goto zfree_item;
2617
2618	ZONE_LOCK(zone);
2619	critical_enter();
2620	cpu = curcpu;
2621	cache = &zone->uz_cpu[cpu];
2622
2623	/*
2624	 * Since we have locked the zone we may as well send back our stats.
2625	 */
2626	atomic_add_long(&zone->uz_allocs, cache->uc_allocs);
2627	atomic_add_long(&zone->uz_frees, cache->uc_frees);
2628	cache->uc_allocs = 0;
2629	cache->uc_frees = 0;
2630
2631	bucket = cache->uc_freebucket;
2632	if (bucket != NULL && bucket->ub_cnt < bucket->ub_entries) {
2633		ZONE_UNLOCK(zone);
2634		goto zfree_start;
2635	}
2636	cache->uc_freebucket = NULL;
2637
2638	/* Can we throw this on the zone full list? */
2639	if (bucket != NULL) {
2640#ifdef UMA_DEBUG_ALLOC
2641		printf("uma_zfree: Putting old bucket on the free list.\n");
2642#endif
2643		/* ub_cnt is pointing to the last free item */
2644		KASSERT(bucket->ub_cnt != 0,
2645		    ("uma_zfree: Attempting to insert an empty bucket onto the full list.\n"));
2646		LIST_INSERT_HEAD(&zone->uz_buckets, bucket, ub_link);
2647	}
2648
2649	/* We are no longer associated with this CPU. */
2650	critical_exit();
2651
2652	/* And the zone.. */
2653	ZONE_UNLOCK(zone);
2654
2655#ifdef UMA_DEBUG_ALLOC
2656	printf("uma_zfree: Allocating new free bucket.\n");
2657#endif
2658	bucket = bucket_alloc(zone, udata, M_NOWAIT);
2659	if (bucket) {
2660		critical_enter();
2661		cpu = curcpu;
2662		cache = &zone->uz_cpu[cpu];
2663		if (cache->uc_freebucket == NULL) {
2664			cache->uc_freebucket = bucket;
2665			goto zfree_start;
2666		}
2667		/*
2668		 * We lost the race, start over.  We have to drop our
2669		 * critical section to free the bucket.
2670		 */
2671		critical_exit();
2672		bucket_free(zone, bucket, udata);
2673		goto zfree_restart;
2674	}
2675
2676	/*
2677	 * If nothing else caught this, we'll just do an internal free.
2678	 */
2679zfree_item:
2680	zone_free_item(zone, item, udata, SKIP_DTOR);
2681
2682	return;
2683}
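/*
 * Matching free-path sketch (hypothetical names): uma.h's uma_zfree() is
 * uma_zfree_arg() with a NULL udata; a non-NULL udata is passed through to
 * the zone's dtor, as shown above.
 */
static void
foo_free(struct foo *fp)
{

	/* Freeing NULL is a no-op, matching free(9). */
	uma_zfree_arg(foo_zone, fp, NULL);
}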
2684
2685static void
2686slab_free_item(uma_keg_t keg, uma_slab_t slab, void *item)
2687{
2688	uint8_t freei;
2689
2690	mtx_assert(&keg->uk_lock, MA_OWNED);
2691	MPASS(keg == slab->us_keg);
2692
2693	/* Do we need to remove from any lists? */
2694	if (slab->us_freecount+1 == keg->uk_ipers) {
2695		LIST_REMOVE(slab, us_link);
2696		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
2697	} else if (slab->us_freecount == 0) {
2698		LIST_REMOVE(slab, us_link);
2699		LIST_INSERT_HEAD(&keg->uk_part_slab, slab, us_link);
2700	}
2701
2702	/* Slab management. */
2703	freei = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
2704	BIT_SET(SLAB_SETSIZE, freei, &slab->us_free);
2705	slab->us_freecount++;
2706
2707	/* Keg statistics. */
2708	keg->uk_free++;
2709}
2710
2711static void
2712zone_release(uma_zone_t zone, void **bucket, int cnt)
2713{
2714	void *item;
2715	uma_slab_t slab;
2716	uma_keg_t keg;
2717	uint8_t *mem;
2718	int clearfull;
2719	int i;
2720
2721	clearfull = 0;
2722	keg = zone_first_keg(zone);
2723	KEG_LOCK(keg);
2724	for (i = 0; i < cnt; i++) {
2725		item = bucket[i];
2726		if (!(zone->uz_flags & UMA_ZONE_VTOSLAB)) {
2727			mem = (uint8_t *)((uintptr_t)item & (~UMA_SLAB_MASK));
2728			if (zone->uz_flags & UMA_ZONE_HASH) {
2729				slab = hash_sfind(&keg->uk_hash, mem);
2730			} else {
2731				mem += keg->uk_pgoff;
2732				slab = (uma_slab_t)mem;
2733			}
2734		} else {
2735			slab = vtoslab((vm_offset_t)item);
2736			if (slab->us_keg != keg) {
2737				KEG_UNLOCK(keg);
2738				keg = slab->us_keg;
2739				KEG_LOCK(keg);
2740			}
2741		}
2742		slab_free_item(keg, slab, item);
2743		if (keg->uk_flags & UMA_ZFLAG_FULL) {
2744			if (keg->uk_pages < keg->uk_maxpages) {
2745				keg->uk_flags &= ~UMA_ZFLAG_FULL;
2746				clearfull = 1;
2747			}
2748
2749			/*
2750			 * We can handle one more allocation. Since we're
2751			 * clearing ZFLAG_FULL, wake up all procs blocked
2752			 * on pages. This should be uncommon, so keeping this
2753			 * simple for now (rather than adding count of blocked
2754			 * threads etc).
2755			 */
2756			wakeup(keg);
2757		}
2758	}
2759	KEG_UNLOCK(keg);
2760	if (clearfull) {
2761		ZONE_LOCK(zone);
2762		zone->uz_flags &= ~UMA_ZFLAG_FULL;
2763		wakeup(zone);
2764		ZONE_UNLOCK(zone);
2765	}
2766
2767}
2768
2769/*
2770 * Frees a single item to any zone.
2771 *
2772 * Arguments:
2773 *	zone   The zone to free to
2774 *	item   The item we're freeing
2775 *	udata  User supplied data for the dtor
2776 *	skip   Skip dtors and finis
2777 */
2778static void
2779zone_free_item(uma_zone_t zone, void *item, void *udata, enum zfreeskip skip)
2780{
2781
2782#ifdef INVARIANTS
2783	if (skip == SKIP_NONE) {
2784		if (zone->uz_flags & UMA_ZONE_MALLOC)
2785			uma_dbg_free(zone, udata, item);
2786		else
2787			uma_dbg_free(zone, NULL, item);
2788	}
2789#endif
2790	if (skip < SKIP_DTOR && zone->uz_dtor)
2791		zone->uz_dtor(item, zone->uz_size, udata);
2792
2793	if (skip < SKIP_FINI && zone->uz_fini)
2794		zone->uz_fini(item, zone->uz_size);
2795
2796	atomic_add_long(&zone->uz_frees, 1);
2797	zone->uz_release(zone->uz_arg, &item, 1);
2798}
2799
2800/* See uma.h */
2801int
2802uma_zone_set_max(uma_zone_t zone, int nitems)
2803{
2804	uma_keg_t keg;
2805
2806	keg = zone_first_keg(zone);
2807	if (keg == NULL)
2808		return (0);
2809	KEG_LOCK(keg);
2810	keg->uk_maxpages = (nitems / keg->uk_ipers) * keg->uk_ppera;
2811	if (keg->uk_maxpages * keg->uk_ipers < nitems)
2812		keg->uk_maxpages += keg->uk_ppera;
2813	nitems = keg->uk_maxpages * keg->uk_ipers;
2814	KEG_UNLOCK(keg);
2815
2816	return (nitems);
2817}
2818
2819/* See uma.h */
2820int
2821uma_zone_get_max(uma_zone_t zone)
2822{
2823	int nitems;
2824	uma_keg_t keg;
2825
2826	keg = zone_first_keg(zone);
2827	if (keg == NULL)
2828		return (0);
2829	KEG_LOCK(keg);
2830	nitems = keg->uk_maxpages * keg->uk_ipers;
2831	KEG_UNLOCK(keg);
2832
2833	return (nitems);
2834}
2835
2836/* See uma.h */
2837void
2838uma_zone_set_warning(uma_zone_t zone, const char *warning)
2839{
2840
2841	ZONE_LOCK(zone);
2842	zone->uz_warning = warning;
2843	ZONE_UNLOCK(zone);
2844}
2845
2846/* See uma.h */
2847int
2848uma_zone_get_cur(uma_zone_t zone)
2849{
2850	int64_t nitems;
2851	u_int i;
2852
2853	ZONE_LOCK(zone);
2854	nitems = zone->uz_allocs - zone->uz_frees;
2855	CPU_FOREACH(i) {
2856		/*
2857		 * See the comment in sysctl_vm_zone_stats() regarding the
2858		 * safety of accessing the per-cpu caches. With the zone lock
2859		 * held, it is safe, but can potentially result in stale data.
2860		 */
2861		nitems += zone->uz_cpu[i].uc_allocs -
2862		    zone->uz_cpu[i].uc_frees;
2863	}
2864	ZONE_UNLOCK(zone);
2865
2866	return (nitems < 0 ? 0 : nitems);
2867}
2868
2869/* See uma.h */
2870void
2871uma_zone_set_init(uma_zone_t zone, uma_init uminit)
2872{
2873	uma_keg_t keg;
2874
2875	keg = zone_first_keg(zone);
2876	KASSERT(keg != NULL, ("uma_zone_set_init: Invalid zone type"));
2877	KEG_LOCK(keg);
2878	KASSERT(keg->uk_pages == 0,
2879	    ("uma_zone_set_init on non-empty keg"));
2880	keg->uk_init = uminit;
2881	KEG_UNLOCK(keg);
2882}
2883
2884/* See uma.h */
2885void
2886uma_zone_set_fini(uma_zone_t zone, uma_fini fini)
2887{
2888	uma_keg_t keg;
2889
2890	keg = zone_first_keg(zone);
2891	KASSERT(keg != NULL, ("uma_zone_set_fini: Invalid zone type"));
2892	KEG_LOCK(keg);
2893	KASSERT(keg->uk_pages == 0,
2894	    ("uma_zone_set_fini on non-empty keg"));
2895	keg->uk_fini = fini;
2896	KEG_UNLOCK(keg);
2897}
2898
2899/* See uma.h */
2900void
2901uma_zone_set_zinit(uma_zone_t zone, uma_init zinit)
2902{
2903
2904	ZONE_LOCK(zone);
2905	KASSERT(zone_first_keg(zone)->uk_pages == 0,
2906	    ("uma_zone_set_zinit on non-empty keg"));
2907	zone->uz_init = zinit;
2908	ZONE_UNLOCK(zone);
2909}
2910
2911/* See uma.h */
2912void
2913uma_zone_set_zfini(uma_zone_t zone, uma_fini zfini)
2914{
2915
2916	ZONE_LOCK(zone);
2917	KASSERT(zone_first_keg(zone)->uk_pages == 0,
2918	    ("uma_zone_set_zfini on non-empty keg"));
2919	zone->uz_fini = zfini;
2920	ZONE_UNLOCK(zone);
2921}
2922
2923/* See uma.h */
2924/* XXX uk_freef is not actually used with the zone locked */
2925void
2926uma_zone_set_freef(uma_zone_t zone, uma_free freef)
2927{
2928	uma_keg_t keg;
2929
2930	keg = zone_first_keg(zone);
2931	KASSERT(keg != NULL, ("uma_zone_set_freef: Invalid zone type"));
2932	KEG_LOCK(keg);
2933	keg->uk_freef = freef;
2934	KEG_UNLOCK(keg);
2935}
2936
2937/* See uma.h */
2938/* XXX uk_allocf is not actually used with the zone locked */
2939void
2940uma_zone_set_allocf(uma_zone_t zone, uma_alloc allocf)
2941{
2942	uma_keg_t keg;
2943
2944	keg = zone_first_keg(zone);
2945	KEG_LOCK(keg);
2946	keg->uk_allocf = allocf;
2947	KEG_UNLOCK(keg);
2948}
2949
2950/* See uma.h */
2951void
2952uma_zone_reserve(uma_zone_t zone, int items)
2953{
2954	uma_keg_t keg;
2955
2956	keg = zone_first_keg(zone);
2957	if (keg == NULL)
2958		return;
2959	KEG_LOCK(keg);
2960	keg->uk_reserve = items;
2961	KEG_UNLOCK(keg);
2962
2963	return;
2964}
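/*
 * Sketch of the reserve (hypothetical zone): ordinary allocations stop once
 * only uk_reserve items remain on the keg, so a critical consumer must pass
 * M_USE_RESERVE to dip into them.  The reserve is only as deep as the slabs
 * actually allocated, so it is typically backed with uma_prealloc().
 */
static void
foo_reserve_init(void)
{

	uma_zone_reserve(foo_zone, 8);
	uma_prealloc(foo_zone, 8);
}

static void *
foo_alloc_critical(void)
{

	/* May consume items that ordinary M_NOWAIT allocations cannot. */
	return (uma_zalloc_arg(foo_zone, NULL, M_NOWAIT | M_USE_RESERVE));
}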
2965
2966/* See uma.h */
2967int
2968uma_zone_reserve_kva(uma_zone_t zone, int count)
2969{
2970	uma_keg_t keg;
2971	vm_offset_t kva;
2972	int pages;
2973
2974	keg = zone_first_keg(zone);
2975	if (keg == NULL)
2976		return (0);
2977	pages = count / keg->uk_ipers;
2978
2979	if (pages * keg->uk_ipers < count)
2980		pages++;
2981
2982#ifdef UMA_MD_SMALL_ALLOC
2983	if (keg->uk_ppera > 1) {
2984#else
2985	if (1) {
2986#endif
2987		kva = kva_alloc(pages * UMA_SLAB_SIZE);
2988		if (kva == 0)
2989			return (0);
2990	} else
2991		kva = 0;
2992	KEG_LOCK(keg);
2993	keg->uk_kva = kva;
2994	keg->uk_offset = 0;
2995	keg->uk_maxpages = pages;
2996#ifdef UMA_MD_SMALL_ALLOC
2997	keg->uk_allocf = (keg->uk_ppera > 1) ? noobj_alloc : uma_small_alloc;
2998#else
2999	keg->uk_allocf = noobj_alloc;
3000#endif
3001	keg->uk_flags |= UMA_ZONE_NOFREE;
3002	KEG_UNLOCK(keg);
3003
3004	return (1);
3005}
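/*
 * Sketch (hypothetical zone): reserving kernel virtual address space up
 * front caps the zone at "maxitems" and lets it grow without competing for
 * KVA later; uma_zone_reserve_kva() returns 0 if the reservation fails.
 */
static void
foo_kva_init(int maxitems)
{

	if (uma_zone_reserve_kva(foo_zone, maxitems) == 0)
		panic("foo: cannot reserve KVA for %d items", maxitems);
}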
3006
3007/* See uma.h */
3008void
3009uma_prealloc(uma_zone_t zone, int items)
3010{
3011	int slabs;
3012	uma_slab_t slab;
3013	uma_keg_t keg;
3014
3015	keg = zone_first_keg(zone);
3016	if (keg == NULL)
3017		return;
3018	KEG_LOCK(keg);
3019	slabs = items / keg->uk_ipers;
3020	if (slabs * keg->uk_ipers < items)
3021		slabs++;
3022	while (slabs > 0) {
3023		slab = keg_alloc_slab(keg, zone, M_WAITOK);
3024		if (slab == NULL)
3025			break;
3026		MPASS(slab->us_keg == keg);
3027		LIST_INSERT_HEAD(&keg->uk_free_slab, slab, us_link);
3028		slabs--;
3029	}
3030	KEG_UNLOCK(keg);
3031}
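/*
 * Sketch (hypothetical zone): pre-populating a zone with free slabs so that
 * early or atomic (M_NOWAIT) consumers do not find an empty keg.
 */
static void
foo_warm_up(int nitems)
{

	/* Allocates enough whole slabs to hold at least nitems items. */
	uma_prealloc(foo_zone, nitems);
}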
3032
3033/* See uma.h */
3034uint32_t *
3035uma_find_refcnt(uma_zone_t zone, void *item)
3036{
3037	uma_slabrefcnt_t slabref;
3038	uma_slab_t slab;
3039	uma_keg_t keg;
3040	uint32_t *refcnt;
3041	int idx;
3042
3043	slab = vtoslab((vm_offset_t)item & (~UMA_SLAB_MASK));
3044	slabref = (uma_slabrefcnt_t)slab;
3045	keg = slab->us_keg;
3046	KASSERT(keg->uk_flags & UMA_ZONE_REFCNT,
3047	    ("uma_find_refcnt(): zone possibly not UMA_ZONE_REFCNT"));
3048	idx = ((uintptr_t)item - (uintptr_t)slab->us_data) / keg->uk_rsize;
3049	refcnt = &slabref->us_refcnt[idx];
3050	return (refcnt);
3051}
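/*
 * Sketch of UMA_ZONE_REFCNT usage (hypothetical names): the slab keeps a
 * uint32_t per item and uma_find_refcnt() maps an item back to its counter,
 * in the same way the mbuf cluster zones track external references.
 */
static uma_zone_t foo_ref_zone;

static void
foo_ref_zone_init(void)
{

	foo_ref_zone = uma_zcreate("foo ref", 2048, NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
}

static void *
foo_ref_alloc(void)
{
	uint32_t *refcnt;
	void *item;

	item = uma_zalloc_arg(foo_ref_zone, NULL, M_NOWAIT);
	if (item == NULL)
		return (NULL);
	refcnt = uma_find_refcnt(foo_ref_zone, item);
	*refcnt = 1;
	return (item);
}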
3052
3053/* See uma.h */
3054void
3055uma_reclaim(void)
3056{
3057#ifdef UMA_DEBUG
3058	printf("UMA: vm asked us to release pages!\n");
3059#endif
3060	bucket_enable();
3061	zone_foreach(zone_drain);
3062	/*
3063	 * Some slabs may have been freed, but the slab zones were visited early
3064	 * in the pass above, so drain them again to free pages that only became
3065	 * empty once the other zones were drained.  We have to do the same for buckets.
3066	 */
3067	zone_drain(slabzone);
3068	zone_drain(slabrefzone);
3069	bucket_zone_drain();
3070}
3071
3072/* See uma.h */
3073int
3074uma_zone_exhausted(uma_zone_t zone)
3075{
3076	int full;
3077
3078	ZONE_LOCK(zone);
3079	full = (zone->uz_flags & UMA_ZFLAG_FULL);
3080	ZONE_UNLOCK(zone);
3081	return (full);
3082}
3083
3084int
3085uma_zone_exhausted_nolock(uma_zone_t zone)
3086{
3087	return (zone->uz_flags & UMA_ZFLAG_FULL);
3088}
3089
3090void *
3091uma_large_malloc(int size, int wait)
3092{
3093	void *mem;
3094	uma_slab_t slab;
3095	uint8_t flags;
3096
3097	slab = zone_alloc_item(slabzone, NULL, wait);
3098	if (slab == NULL)
3099		return (NULL);
3100	mem = page_alloc(NULL, size, &flags, wait);
3101	if (mem) {
3102		vsetslab((vm_offset_t)mem, slab);
3103		slab->us_data = mem;
3104		slab->us_flags = flags | UMA_SLAB_MALLOC;
3105		slab->us_size = size;
3106	} else {
3107		zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3108	}
3109
3110	return (mem);
3111}
3112
3113void
3114uma_large_free(uma_slab_t slab)
3115{
3116
3117	page_free(slab->us_data, slab->us_size, slab->us_flags);
3118	zone_free_item(slabzone, slab, NULL, SKIP_NONE);
3119}
3120
3121void
3122uma_print_stats(void)
3123{
3124	zone_foreach(uma_print_zone);
3125}
3126
3127static void
3128slab_print(uma_slab_t slab)
3129{
3130	printf("slab: keg %p, data %p, freecount %d\n",
3131		slab->us_keg, slab->us_data, slab->us_freecount);
3132}
3133
3134static void
3135cache_print(uma_cache_t cache)
3136{
3137	printf("alloc: %p(%d), free: %p(%d)\n",
3138		cache->uc_allocbucket,
3139		cache->uc_allocbucket ? cache->uc_allocbucket->ub_cnt : 0,
3140		cache->uc_freebucket,
3141		cache->uc_freebucket ? cache->uc_freebucket->ub_cnt : 0);
3142}
3143
3144static void
3145uma_print_keg(uma_keg_t keg)
3146{
3147	uma_slab_t slab;
3148
3149	printf("keg: %s(%p) size %d(%d) flags %#x ipers %d ppera %d "
3150	    "out %d free %d limit %d\n",
3151	    keg->uk_name, keg, keg->uk_size, keg->uk_rsize, keg->uk_flags,
3152	    keg->uk_ipers, keg->uk_ppera,
3153	    (keg->uk_ipers * keg->uk_pages) - keg->uk_free, keg->uk_free,
3154	    (keg->uk_maxpages / keg->uk_ppera) * keg->uk_ipers);
3155	printf("Part slabs:\n");
3156	LIST_FOREACH(slab, &keg->uk_part_slab, us_link)
3157		slab_print(slab);
3158	printf("Free slabs:\n");
3159	LIST_FOREACH(slab, &keg->uk_free_slab, us_link)
3160		slab_print(slab);
3161	printf("Full slabs:\n");
3162	LIST_FOREACH(slab, &keg->uk_full_slab, us_link)
3163		slab_print(slab);
3164}
3165
3166void
3167uma_print_zone(uma_zone_t zone)
3168{
3169	uma_cache_t cache;
3170	uma_klink_t kl;
3171	int i;
3172
3173	printf("zone: %s(%p) size %d flags %#x\n",
3174	    zone->uz_name, zone, zone->uz_size, zone->uz_flags);
3175	LIST_FOREACH(kl, &zone->uz_kegs, kl_link)
3176		uma_print_keg(kl->kl_keg);
3177	CPU_FOREACH(i) {
3178		cache = &zone->uz_cpu[i];
3179		printf("CPU %d Cache:\n", i);
3180		cache_print(cache);
3181	}
3182}
3183
3184#ifdef DDB
3185/*
3186 * Generate statistics across both the zone and its per-CPU caches.  Return
3187 * each statistic through its pointer argument when that pointer is non-NULL.
3188 *
3189 * Note: does not update the zone statistics, as it can't safely clear the
3190 * per-CPU cache statistic.
3191 *
3192 * XXXRW: Following the uc_allocbucket and uc_freebucket pointers here isn't
3193 * safe from off-CPU; we should modify the caches to track this information
3194 * directly so that we don't have to.
3195 */
3196static void
3197uma_zone_sumstat(uma_zone_t z, int *cachefreep, uint64_t *allocsp,
3198    uint64_t *freesp, uint64_t *sleepsp)
3199{
3200	uma_cache_t cache;
3201	uint64_t allocs, frees, sleeps;
3202	int cachefree, cpu;
3203
3204	allocs = frees = sleeps = 0;
3205	cachefree = 0;
3206	CPU_FOREACH(cpu) {
3207		cache = &z->uz_cpu[cpu];
3208		if (cache->uc_allocbucket != NULL)
3209			cachefree += cache->uc_allocbucket->ub_cnt;
3210		if (cache->uc_freebucket != NULL)
3211			cachefree += cache->uc_freebucket->ub_cnt;
3212		allocs += cache->uc_allocs;
3213		frees += cache->uc_frees;
3214	}
3215	allocs += z->uz_allocs;
3216	frees += z->uz_frees;
3217	sleeps += z->uz_sleeps;
3218	if (cachefreep != NULL)
3219		*cachefreep = cachefree;
3220	if (allocsp != NULL)
3221		*allocsp = allocs;
3222	if (freesp != NULL)
3223		*freesp = frees;
3224	if (sleepsp != NULL)
3225		*sleepsp = sleeps;
3226}
3227#endif /* DDB */
3228
3229static int
3230sysctl_vm_zone_count(SYSCTL_HANDLER_ARGS)
3231{
3232	uma_keg_t kz;
3233	uma_zone_t z;
3234	int count;
3235
3236	count = 0;
3237	mtx_lock(&uma_mtx);
3238	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3239		LIST_FOREACH(z, &kz->uk_zones, uz_link)
3240			count++;
3241	}
3242	mtx_unlock(&uma_mtx);
3243	return (sysctl_handle_int(oidp, &count, 0, req));
3244}
3245
3246static int
3247sysctl_vm_zone_stats(SYSCTL_HANDLER_ARGS)
3248{
3249	struct uma_stream_header ush;
3250	struct uma_type_header uth;
3251	struct uma_percpu_stat ups;
3252	uma_bucket_t bucket;
3253	struct sbuf sbuf;
3254	uma_cache_t cache;
3255	uma_klink_t kl;
3256	uma_keg_t kz;
3257	uma_zone_t z;
3258	uma_keg_t k;
3259	int count, error, i;
3260
3261	error = sysctl_wire_old_buffer(req, 0);
3262	if (error != 0)
3263		return (error);
3264	sbuf_new_for_sysctl(&sbuf, NULL, 128, req);
3265
3266	count = 0;
3267	mtx_lock(&uma_mtx);
3268	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3269		LIST_FOREACH(z, &kz->uk_zones, uz_link)
3270			count++;
3271	}
3272
3273	/*
3274	 * Insert stream header.
3275	 */
3276	bzero(&ush, sizeof(ush));
3277	ush.ush_version = UMA_STREAM_VERSION;
3278	ush.ush_maxcpus = (mp_maxid + 1);
3279	ush.ush_count = count;
3280	(void)sbuf_bcat(&sbuf, &ush, sizeof(ush));
3281
3282	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3283		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3284			bzero(&uth, sizeof(uth));
3285			ZONE_LOCK(z);
3286			strlcpy(uth.uth_name, z->uz_name, UTH_MAX_NAME);
3287			uth.uth_align = kz->uk_align;
3288			uth.uth_size = kz->uk_size;
3289			uth.uth_rsize = kz->uk_rsize;
3290			LIST_FOREACH(kl, &z->uz_kegs, kl_link) {
3291				k = kl->kl_keg;
3292				uth.uth_maxpages += k->uk_maxpages;
3293				uth.uth_pages += k->uk_pages;
3294				uth.uth_keg_free += k->uk_free;
3295				uth.uth_limit = (k->uk_maxpages / k->uk_ppera)
3296				    * k->uk_ipers;
3297			}
3298
3299			/*
3300			 * A zone is secondary if it is not the first entry
3301			 * on the keg's zone list.
3302			 */
3303			if ((z->uz_flags & UMA_ZONE_SECONDARY) &&
3304			    (LIST_FIRST(&kz->uk_zones) != z))
3305				uth.uth_zone_flags = UTH_ZONE_SECONDARY;
3306
3307			LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
3308				uth.uth_zone_free += bucket->ub_cnt;
3309			uth.uth_allocs = z->uz_allocs;
3310			uth.uth_frees = z->uz_frees;
3311			uth.uth_fails = z->uz_fails;
3312			uth.uth_sleeps = z->uz_sleeps;
3313			(void)sbuf_bcat(&sbuf, &uth, sizeof(uth));
3314			/*
3315			 * While it is not normally safe to access the cache
3316			 * bucket pointers while not on the CPU that owns the
3317			 * cache, we only allow the pointers to be exchanged
3318			 * without the zone lock held, not invalidated, so
3319			 * accept the possible race associated with bucket
3320			 * exchange during monitoring.
3321			 */
3322			for (i = 0; i < (mp_maxid + 1); i++) {
3323				bzero(&ups, sizeof(ups));
3324				if (kz->uk_flags & UMA_ZFLAG_INTERNAL)
3325					goto skip;
3326				if (CPU_ABSENT(i))
3327					goto skip;
3328				cache = &z->uz_cpu[i];
3329				if (cache->uc_allocbucket != NULL)
3330					ups.ups_cache_free +=
3331					    cache->uc_allocbucket->ub_cnt;
3332				if (cache->uc_freebucket != NULL)
3333					ups.ups_cache_free +=
3334					    cache->uc_freebucket->ub_cnt;
3335				ups.ups_allocs = cache->uc_allocs;
3336				ups.ups_frees = cache->uc_frees;
3337skip:
3338				(void)sbuf_bcat(&sbuf, &ups, sizeof(ups));
3339			}
3340			ZONE_UNLOCK(z);
3341		}
3342	}
3343	mtx_unlock(&uma_mtx);
3344	error = sbuf_finish(&sbuf);
3345	sbuf_delete(&sbuf);
3346	return (error);
3347}
3348
3349#ifdef DDB
3350DB_SHOW_COMMAND(uma, db_show_uma)
3351{
3352	uint64_t allocs, frees, sleeps;
3353	uma_bucket_t bucket;
3354	uma_keg_t kz;
3355	uma_zone_t z;
3356	int cachefree;
3357
3358	db_printf("%18s %8s %8s %8s %12s %8s\n", "Zone", "Size", "Used", "Free",
3359	    "Requests", "Sleeps");
3360	LIST_FOREACH(kz, &uma_kegs, uk_link) {
3361		LIST_FOREACH(z, &kz->uk_zones, uz_link) {
3362			if (kz->uk_flags & UMA_ZFLAG_INTERNAL) {
3363				allocs = z->uz_allocs;
3364				frees = z->uz_frees;
3365				sleeps = z->uz_sleeps;
3366				cachefree = 0;
3367			} else
3368				uma_zone_sumstat(z, &cachefree, &allocs,
3369				    &frees, &sleeps);
3370			if (!((z->uz_flags & UMA_ZONE_SECONDARY) &&
3371			    (LIST_FIRST(&kz->uk_zones) != z)))
3372				cachefree += kz->uk_free;
3373			LIST_FOREACH(bucket, &z->uz_buckets, ub_link)
3374				cachefree += bucket->ub_cnt;
3375			db_printf("%18s %8ju %8jd %8d %12ju %8ju\n", z->uz_name,
3376			    (uintmax_t)kz->uk_size,
3377			    (intmax_t)(allocs - frees), cachefree,
3378			    (uintmax_t)allocs, sleeps);
3379			if (db_pager_quit)
3380				return;
3381		}
3382	}
3383}
3384#endif
3385