1/*
2 * Copyright (c) 1998-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1991, 1993
31 *	The Regents of the University of California.  All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 *    notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 *    notice, this list of conditions and the following disclaimer in the
40 *    documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 *    must display the following acknowledgement:
43 *	This product includes software developed by the University of
44 *	California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 *    may be used to endorse or promote products derived from this software
47 *    without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections.  This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/malloc.h>
73#include <sys/mbuf.h>
74#include <sys/kernel.h>
75#include <sys/sysctl.h>
76#include <sys/syslog.h>
77#include <sys/protosw.h>
78#include <sys/domain.h>
79#include <sys/queue.h>
80#include <sys/proc.h>
81
82#include <kern/kern_types.h>
83#include <kern/simple_lock.h>
84#include <kern/queue.h>
85#include <kern/sched_prim.h>
86#include <kern/cpu_number.h>
87#include <kern/zalloc.h>
88
89#include <libkern/OSAtomic.h>
90#include <libkern/libkern.h>
91
92#include <IOKit/IOMapper.h>
93
94#include <machine/limits.h>
95#include <machine/machine_routines.h>
96
97#if CONFIG_MACF_NET
98#include <security/mac_framework.h>
99#endif /* MAC_NET */
100
101#include <sys/mcache.h>
102
103/*
104 * MBUF IMPLEMENTATION NOTES.
105 *
106 * There is a total of 5 per-CPU caches:
107 *
108 * MC_MBUF:
109 *	This is a cache of rudimentary objects of MSIZE in size; each
110 *	object represents an mbuf structure.  This cache preserves only
111 *	the m_type field of the mbuf during its transactions.
112 *
113 * MC_CL:
114 *	This is a cache of rudimentary objects of MCLBYTES in size; each
115 *	object represents a mcluster structure.  This cache does not
116 *	preserve the contents of the objects during its transactions.
117 *
118 * MC_BIGCL:
119 *	This is a cache of rudimentary objects of MBIGCLBYTES in size; each
120 *	object represents a mbigcluster structure.  This cache does not
121 *	preserve the contents of the objects during its transaction.
122 *
123 * MC_MBUF_CL:
124 *	This is a cache of mbufs each having a cluster attached to it.
125 *	It is backed by MC_MBUF and MC_CL rudimentary caches.  Several
126 *	fields of the mbuf related to the external cluster are preserved
127 *	during transactions.
128 *
129 * MC_MBUF_BIGCL:
130 *	This is a cache of mbufs each having a big cluster attached to it.
131 *	It is backed by MC_MBUF and MC_BIGCL rudimentary caches.  Several
132 *	fields of the mbuf related to the external cluster are preserved
133 *	during transactions.
134 *
135 * OBJECT ALLOCATION:
136 *
137 * Allocation requests are handled first at the per-CPU (mcache) layer
138 * before falling back to the slab layer.  Performance is optimal when
139 * the request is satisfied at the CPU layer because global data/lock
140 * never gets accessed.  When the slab layer is entered for allocation,
141 * the slab freelist will be checked first for available objects before
142 * the VM backing store is invoked.  Slab layer operations are serialized
143 * for all of the caches as the mbuf global lock is held most of the time.
144 * Allocation paths are different depending on the class of objects:
145 *
146 * a. Rudimentary object:
147 *
148 *	{ m_get_common(), m_clattach(), m_mclget(),
149 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
150 *	  composite object allocation }
151 *			|	^
152 *			|	|
153 *			|	+-----------------------+
154 *			v				|
155 *	   mcache_alloc/mcache_alloc_ext()	mbuf_slab_audit()
156 *			|				^
157 *			v				|
158 *		   [CPU cache] ------->	(found?) -------+
159 *			|				|
160 *			v				|
161 *		 mbuf_slab_alloc()			|
162 *			|				|
163 *			v				|
164 *	+---------> [freelist] ------->	(found?) -------+
165 *	|		|
166 *	|		v
167 *	|	    m_clalloc()
168 *	|		|
169 *	|		v
170 *	+---<<---- kmem_mb_alloc()
171 *
172 * b. Composite object:
173 *
174 *	{ m_getpackets_internal(), m_allocpacket_internal() }
175 *			|	^
176 *			|	|
177 *			|	+------	(done) ---------+
178 *			v				|
179 *	   mcache_alloc/mcache_alloc_ext()	mbuf_cslab_audit()
180 *			|				^
181 *			v				|
182 *		   [CPU cache] ------->	(found?) -------+
183 *			|				|
184 *			v				|
185 *		 mbuf_cslab_alloc()			|
186 *			|				|
187 *			v				|
188 *		    [freelist] ------->	(found?) -------+
189 *			|				|
190 *			v				|
191 *		(rudimentary object)			|
192 *	   mcache_alloc/mcache_alloc_ext() ------>>-----+
193 *
194 * Auditing notes: If auditing is enabled, buffers will be subjected to
195 * integrity checks by the audit routine.  This is done by verifying their
196 * contents against DEADBEEF (free) pattern before returning them to caller.
197 * As part of this step, the routine will also record the transaction and
198 * pattern-fill the buffers with BADDCAFE (uninitialized) pattern.  It will
199 * also restore any constructed data structure fields if necessary.
200 *
201 * OBJECT DEALLOCATION:
202 *
203 * Freeing an object simply involves placing it into the CPU cache; this
204 * pollutes the cache to benefit subsequent allocations.  The slab layer
205 * will only be entered if the object is to be purged out of the cache.
206 * During normal operations, this happens only when the CPU layer resizes
207 * its bucket while it's adjusting to the allocation load.  Deallocation
208 * paths are different depending on the class of objects:
209 *
210 * a. Rudimentary object:
211 *
212 *	{ m_free(), m_freem_list(), composite object deallocation }
213 *			|	^
214 *			|	|
215 *			|	+------	(done) ---------+
216 *			v				|
217 *	   mcache_free/mcache_free_ext()		|
218 *			|				|
219 *			v				|
220 *		mbuf_slab_audit()			|
221 *			|				|
222 *			v				|
223 *		   [CPU cache] ---> (not purging?) -----+
224 *			|				|
225 *			v				|
226 *		 mbuf_slab_free()			|
227 *			|				|
228 *			v				|
229 *		    [freelist] ----------->>------------+
230 *	 (objects never get purged to VM)
231 *
232 * b. Composite object:
233 *
234 *	{ m_free(), m_freem_list() }
235 *			|	^
236 *			|	|
237 *			|	+------	(done) ---------+
238 *			v				|
239 *	   mcache_free/mcache_free_ext()		|
240 *			|				|
241 *			v				|
242 *		mbuf_cslab_audit()			|
243 *			|				|
244 *			v				|
245 *		   [CPU cache] ---> (not purging?) -----+
246 *			|				|
247 *			v				|
248 *		 mbuf_cslab_free()			|
249 *			|				|
250 *			v				|
251 *		    [freelist] ---> (not purging?) -----+
252 *			|				|
253 *			v				|
254 *		(rudimentary object)			|
255 *	   mcache_free/mcache_free_ext() ------->>------+
256 *
257 * Auditing notes: If auditing is enabled, the audit routine will save
258 * any constructed data structure fields (if necessary) before filling the
259 * contents of the buffers with DEADBEEF (free) pattern and recording the
260 * transaction.  Buffers that are freed (whether at CPU or slab layer) are
261 * expected to contain the free pattern.
262 *
263 * DEBUGGING:
264 *
265 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
266 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
267 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
268 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
269 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
270 * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
271 *
272 * Each object is associated with exactly one mcache_audit_t structure that
273 * contains the information related to its last buffer transaction.  Given
274 * an address of an object, the audit structure can be retrieved by finding
275 * the position of the object relevant to the base address of the cluster:
276 *
277 *	+------------+			+=============+
278 *	| mbuf addr  |			| mclaudit[i] |
279 *	+------------+			+=============+
280 *	      |				| cl_audit[0] |
281 *	i = MTOBG(addr)			+-------------+
282 *	      |			+----->	| cl_audit[1] | -----> mcache_audit_t
283 *	b = BGTOM(i)		|	+-------------+
284 *	      |			|	|     ...     |
285 *	x = MCLIDX(b, addr)	|	+-------------+
286 *	      |			|	| cl_audit[7] |
287 *	      +-----------------+	+-------------+
288 *		 (e.g. x == 1)
289 *
290 * The mclaudit[] array is allocated at initialization time, but its contents
291 * get populated when the corresponding cluster is created.  Because a page
292 * can be turned into NMBPBG number of mbufs, we preserve enough space for the
293 * mbufs so that there is a 1-to-1 mapping between them.  A page that never
294 * gets (or has not yet) turned into mbufs will use only cl_audit[0] with the
295 * remaining entries unused.  For 16KB cluster, only one entry from the first
296 * page is allocated and used for the entire object.
297 */
298
299/* TODO: should be in header file */
300/* kernel translater */
301extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
302extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
303extern vm_map_t mb_map;		/* special map */
304
305/* Global lock */
306decl_lck_mtx_data(static, mbuf_mlock_data);
307static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
308static lck_attr_t *mbuf_mlock_attr;
309static lck_grp_t *mbuf_mlock_grp;
310static lck_grp_attr_t *mbuf_mlock_grp_attr;
311
312/* Back-end (common) layer */
313static void *mbuf_worker_run;	/* wait channel for worker thread */
314static int mbuf_worker_ready;	/* worker thread is runnable */
315static int mbuf_expand_mcl;	/* number of cluster creation requets */
316static int mbuf_expand_big;	/* number of big cluster creation requests */
317static int mbuf_expand_16k;	/* number of 16KB cluster creation requests */
318static int ncpu;		/* number of CPUs */
319static ppnum_t *mcl_paddr;	/* Array of cluster physical addresses */
320static ppnum_t mcl_pages;	/* Size of array (# physical pages) */
321static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
322static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
323static mcache_t *mcl_audit_con_cache; /* Audit contents cache */
324static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
325static unsigned int mb_normalized; /* number of packets "normalized" */
326
327#define	MB_GROWTH_AGGRESSIVE	1	/* Threshold: 1/2 of total */
328#define	MB_GROWTH_NORMAL	2	/* Threshold: 3/4 of total */
329
330typedef enum {
331	MC_MBUF = 0,	/* Regular mbuf */
332	MC_CL,		/* Cluster */
333	MC_BIGCL,	/* Large (4KB) cluster */
334	MC_16KCL,	/* Jumbo (16KB) cluster */
335	MC_MBUF_CL,	/* mbuf + cluster */
336	MC_MBUF_BIGCL,	/* mbuf + large (4KB) cluster */
337	MC_MBUF_16KCL	/* mbuf + jumbo (16KB) cluster */
338} mbuf_class_t;
339
340#define	MBUF_CLASS_MIN		MC_MBUF
341#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
342#define	MBUF_CLASS_LAST		MC_16KCL
343#define	MBUF_CLASS_VALID(c) \
344	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
345#define	MBUF_CLASS_COMPOSITE(c) \
346	((int)(c) > MBUF_CLASS_LAST)
347
348
349/*
350 * mbuf specific mcache allocation request flags.
351 */
352#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
353
354/*
355 * Per-cluster slab structure.
356 *
357 * A slab is a cluster control structure that contains one or more object
358 * chunks; the available chunks are chained in the slab's freelist (sl_head).
359 * Each time a chunk is taken out of the slab, the slab's reference count
360 * gets incremented.  When all chunks have been taken out, the empty slab
361 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
362 * returned to a slab causes the slab's reference count to be decremented;
363 * it also causes the slab to be reinserted back to class's slab list, if
364 * it's not already done.
365 *
366 * Compartmentalizing of the object chunks into slabs allows us to easily
367 * merge one or more slabs together when the adjacent slabs are idle, as
368 * well as to convert or move a slab from one class to another; e.g. the
369 * mbuf cluster slab can be converted to a regular cluster slab when all
370 * mbufs in the slab have been freed.
371 *
372 * A slab may also span across multiple clusters for chunks larger than
373 * a cluster's size.  In this case, only the slab of the first cluster is
374 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
375 * that they are part of the larger slab.
376 *
377 * Each slab controls a page of memory.
378 */
379typedef struct mcl_slab {
380	struct mcl_slab	*sl_next;	/* neighboring slab */
381	u_int8_t	sl_class;	/* controlling mbuf class */
382	int8_t		sl_refcnt;	/* outstanding allocations */
383	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
384	u_int16_t	sl_flags;	/* slab flags (see below) */
385	u_int16_t	sl_len;		/* slab length */
386	void		*sl_base;	/* base of allocated memory */
387	void		*sl_head;	/* first free buffer */
388	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
389} mcl_slab_t;
390
391#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
392#define	SLF_PARTIAL	0x0002		/* part of another slab */
393#define	SLF_DETACHED	0x0004		/* not in slab freelist */
394
395/*
396 * The array of slabs are broken into groups of arrays per 1MB of kernel
397 * memory to reduce the footprint.  Each group is allocated on demand
398 * whenever a new piece of memory mapped in from the VM crosses the 1MB
399 * boundary.
400 */
401#define	NSLABSPMB	((1 << MBSHIFT) >> PGSHIFT)	/* 256 slabs/grp */
402
403typedef struct mcl_slabg {
404	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
405} mcl_slabg_t;
406
407/*
408 * Number of slabs needed to control a 16KB cluster object.
409 */
410#define	NSLABSP16KB	(M16KCLBYTES >> PGSHIFT)
411
412/*
413 * Per-cluster audit structure.
414 */
415typedef struct {
416	mcache_audit_t	*cl_audit[NMBPBG];	/* array of audits */
417} mcl_audit_t;
418
419/*
420 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
421 * and m_ext structures.  If auditing is enabled, we allocate a shadow
422 * mbuf structure of this size inside each audit structure, and the
423 * contents of the real mbuf gets copied into it when the mbuf is freed.
424 * This allows us to pattern-fill the mbuf for integrity check, and to
425 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
426 * Note that we don't save the contents of clusters when they are freed;
427 * we simply pattern-fill them.
428 */
429#define	AUDIT_CONTENTS_SIZE	((MSIZE - MHLEN) + sizeof (_m_ext_t))
430
431/*
432 * mbuf specific mcache audit flags
433 */
434#define	MB_INUSE	0x01	/* object has not been returned to slab */
435#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
436#define	MB_SCVALID	0x04	/* object has valid saved contents */
437
438/*
439 * Each of the following two arrays hold up to nmbclusters elements.
440 */
441static mcl_audit_t *mclaudit;	/* array of cluster audit information */
442static unsigned int maxclaudit;	/* max # of entries in audit table */
443static mcl_slabg_t **slabstbl;	/* cluster slabs table */
444static unsigned int maxslabgrp;	/* max # of entries in slabs table */
445static unsigned int slabgrp;	/* # of entries in slabs table */
446
447/* Globals */
448int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
449int njcl;			/* # of clusters for jumbo sizes */
450int njclbytes;			/* size of a jumbo cluster */
451union mbigcluster *mbutl;	/* first mapped cluster address */
452union mbigcluster *embutl;	/* ending virtual address of mclusters */
453int _max_linkhdr;		/* largest link-level header */
454int _max_protohdr;		/* largest protocol header */
455int max_hdr;			/* largest link+protocol header */
456int max_datalen;		/* MHLEN - max_hdr */
457
458static boolean_t mclverify;	/* debug: pattern-checking */
459static boolean_t mcltrace;	/* debug: stack tracing */
460static boolean_t mclfindleak;	/* debug: leak detection */
461static boolean_t mclexpleak;	/* debug: expose leak info to user space */
462
463/* mbuf leak detection variables */
464static struct mleak_table mleak_table;
465static mleak_stat_t *mleak_stat;
466
467#define	MLEAK_STAT_SIZE(n) \
468	((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))
469
470struct mallocation {
471	mcache_obj_t *element;	/* the alloc'ed element, NULL if unused */
472	u_int32_t trace_index;	/* mtrace index for corresponding backtrace */
473	u_int32_t count;	/* How many objects were requested */
474	u_int64_t hitcount;	/* for determining hash effectiveness */
475};
476
477struct mtrace {
478	u_int64_t	collisions;
479	u_int64_t	hitcount;
480	u_int64_t	allocs;
481	u_int64_t	depth;
482	uintptr_t	addr[MLEAK_STACK_DEPTH];
483};
484
485/* Size must be a power of two for the zhash to be able to just mask off bits */
486#define	MLEAK_ALLOCATION_MAP_NUM	512
487#define	MLEAK_TRACE_MAP_NUM		256
488
489/*
490 * Sample factor for how often to record a trace.  This is overwritable
491 * by the boot-arg mleak_sample_factor.
492 */
493#define	MLEAK_SAMPLE_FACTOR		500
494
495/*
496 * Number of top leakers recorded.
497 */
498#define	MLEAK_NUM_TRACES		5
499
500#define	MB_LEAK_SPACING_64 "                    "
501#define MB_LEAK_SPACING_32 "            "
502
503
504#define	MB_LEAK_HDR_32	"\n\
505    trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
506    ----------  ----------  ----------  ----------  ---------- \n\
507"
508
509#define	MB_LEAK_HDR_64	"\n\
510    trace [1]           trace [2]           trace [3]       \
511        trace [4]           trace [5]      \n\
512    ------------------  ------------------  ------------------  \
513    ------------------  ------------------ \n\
514"
515
516static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
517static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;
518
519/* Hashmaps of allocations and their corresponding traces */
520static struct mallocation *mleak_allocations;
521static struct mtrace *mleak_traces;
522static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];
523
524/* Lock to protect mleak tables from concurrent modification */
525decl_lck_mtx_data(static, mleak_lock_data);
526static lck_mtx_t *mleak_lock = &mleak_lock_data;
527static lck_attr_t *mleak_lock_attr;
528static lck_grp_t *mleak_lock_grp;
529static lck_grp_attr_t *mleak_lock_grp_attr;
530
531extern u_int32_t high_sb_max;
532
533/* TODO: should be in header file */
534int do_reclaim = 0;
535
536/* The minimum number of objects that are allocated, to start. */
537#define	MINCL		32
538#define	MINBIGCL	(MINCL >> 1)
539#define	MIN16KCL	(MINCL >> 2)
540
541/* Low watermarks (only map in pages once free counts go below) */
542#define	MBIGCL_LOWAT	MINBIGCL
543#define	M16KCL_LOWAT	MIN16KCL
544
545typedef struct {
546	mbuf_class_t	mtbl_class;	/* class type */
547	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
548	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
549	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
550	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
551	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
552	int		mtbl_minlimit;	/* minimum allowed */
553	int		mtbl_maxlimit;	/* maximum allowed */
554	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
555} mbuf_table_t;
556
557#define	m_class(c)	mbuf_table[c].mtbl_class
558#define	m_cache(c)	mbuf_table[c].mtbl_cache
559#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
560#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
561#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
562#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
563#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
564#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
565#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
566#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
567#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
568#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
569#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
570#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
571#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
572#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
573#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
574#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
575#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
576#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal
577
578static mbuf_table_t mbuf_table[] = {
579	/*
580	 * The caches for mbufs, regular clusters and big clusters.
581	 */
582	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
583	    NULL, NULL, 0, 0, 0, 0 },
584	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
585	    NULL, NULL, 0, 0, 0, 0 },
586	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
587	    NULL, NULL, 0, 0, 0, 0 },
588	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
589	    NULL, NULL, 0, 0, 0, 0 },
590	/*
591	 * The following are special caches; they serve as intermediate
592	 * caches backed by the above rudimentary caches.  Each object
593	 * in the cache is an mbuf with a cluster attached to it.  Unlike
594	 * the above caches, these intermediate caches do not directly
595	 * deal with the slab structures; instead, the constructed
596	 * cached elements are simply stored in the freelists.
597	 */
598	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
599	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
600	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
601};
602
603#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))
604
605static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
606static int mb_waiters;			/* number of waiters */
607
608#define	MB_WDT_MAXTIME	10		/* # of secs before watchdog panic */
609static struct timeval mb_wdtstart;	/* watchdog start timestamp */
610static char *mbuf_dump_buf;
611
612#define	MBUF_DUMP_BUF_SIZE	2048
613
614/*
615 * mbuf watchdog is enabled by default on embedded platforms.  It is
616 * also toggeable via the kern.ipc.mb_watchdog sysctl.
617 */
618#if CONFIG_EMBEDDED
619static unsigned int mb_watchdog = 1;
620#else
621static unsigned int mb_watchdog = 0;
622#endif /* CONFIG_EMBEDDED */
623
624/* The following are used to serialize m_clalloc() */
625static boolean_t mb_clalloc_busy;
626static void *mb_clalloc_waitchan = &mb_clalloc_busy;
627static int mb_clalloc_waiters;
628
629static void mbuf_mtypes_sync(boolean_t);
630static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
631static void mbuf_stat_sync(void);
632static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
633static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
634static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
635static char *mbuf_dump(void);
636static void mbuf_table_init(void);
637static inline void m_incref(struct mbuf *);
638static inline u_int32_t m_decref(struct mbuf *);
639static int m_clalloc(const u_int32_t, const int, const u_int32_t);
640static void mbuf_worker_thread_init(void);
641static mcache_obj_t *slab_alloc(mbuf_class_t, int);
642static void slab_free(mbuf_class_t, mcache_obj_t *);
643static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
644    unsigned int, int);
645static void mbuf_slab_free(void *, mcache_obj_t *, int);
646static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
647static void mbuf_slab_notify(void *, u_int32_t);
648static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
649    unsigned int);
650static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
651static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
652    unsigned int, int);
653static void mbuf_cslab_free(void *, mcache_obj_t *, int);
654static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
655static int freelist_populate(mbuf_class_t, unsigned int, int);
656static void freelist_init(mbuf_class_t);
657static boolean_t mbuf_cached_above(mbuf_class_t, int);
658static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
659static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
660static int m_howmany(int, size_t);
661static void mbuf_worker_thread(void);
662static void mbuf_watchdog(void);
663static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);
664
665static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
666    size_t, unsigned int);
667static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
668static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
669static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
670    boolean_t);
671static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
672static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
673static void mcl_audit_mcheck_panic(struct mbuf *);
674static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);
675
676static void mleak_activate(void);
677static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
678static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
679static void mleak_free(mcache_obj_t *);
680static void mleak_sort_traces(void);
681static void mleak_update_stats(void);
682
683static mcl_slab_t *slab_get(void *);
684static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
685    void *, void *, unsigned int, int, int);
686static void slab_insert(mcl_slab_t *, mbuf_class_t);
687static void slab_remove(mcl_slab_t *, mbuf_class_t);
688static boolean_t slab_inrange(mcl_slab_t *, void *);
689static void slab_nextptr_panic(mcl_slab_t *, void *);
690static void slab_detach(mcl_slab_t *);
691static boolean_t slab_is_detached(mcl_slab_t *);
692
693static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
694static struct mbuf *m_split0(struct mbuf *, int, int, int);
695
696/* flags for m_copyback0 */
697#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
698#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
699#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
700#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */
701
702/*
703 * This flag is set for all mbufs that come out of and into the composite
704 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
705 * are marked with such a flag have clusters attached to them, and will be
706 * treated differently when they are freed; instead of being placed back
707 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
708 * are placed back into the appropriate composite cache's freelist, and the
709 * actual freeing is deferred until the composite objects are purged.  At
710 * such a time, this flag will be cleared from the mbufs and the objects
711 * will be freed into their own separate freelists.
712 */
713#define	EXTF_COMPOSITE	0x1
714
715/*
716 * This flag indicates that the external cluster is read-only, i.e. it is
717 * or was referred to by more than one mbufs.  Once set, this flag is never
718 * cleared.
719 */
720#define	EXTF_READONLY	0x2
721#define	EXTF_MASK	(EXTF_COMPOSITE | EXTF_READONLY)
722
723#define	MEXT_RFA(m)		((m)->m_ext.ext_refflags)
724#define	MEXT_REF(m)		(MEXT_RFA(m)->refcnt)
725#define	MEXT_FLAGS(m)		(MEXT_RFA(m)->flags)
726#define	MBUF_IS_COMPOSITE(m)	\
727	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
728
729/*
730 * Macros used to verify the integrity of the mbuf.
731 */
732#define	_MCHECK(m) {							\
733	if ((m)->m_type != MT_FREE) {					\
734		if (mclaudit == NULL)					\
735			panic("MCHECK: m_type=%d m=%p",			\
736			    (u_int16_t)(m)->m_type, m);			\
737		else							\
738			mcl_audit_mcheck_panic(m);			\
739	}								\
740}
741
742#define	MBUF_IN_MAP(addr)						\
743	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)
744
745#define	MRANGE(addr) {							\
746	if (!MBUF_IN_MAP(addr))						\
747		panic("MRANGE: address out of range 0x%p", addr);	\
748}
749
750/*
751 * Macro version of mtod.
752 */
753#define	MTOD(m, t)	((t)((m)->m_data))
754
755/*
756 * Macros to obtain (4KB) cluster index and base cluster address.
757 */
758
759#define	MTOBG(x)	(((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
760#define	BGTOM(x)	((union mbigcluster *)(mbutl + (x)))
761
762/*
763 * Macro to find the mbuf index relative to a base.
764 */
765#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> MSIZESHIFT)
766
767/*
768 * Same thing for 2KB cluster index.
769 */
770#define	CLBGIDX(c, m)	(((char *)(m) - (char *)(c)) >> MCLSHIFT)
771
772/*
773 * Macros used during mbuf and cluster initialization.
774 */
775#define	MBUF_INIT(m, pkthdr, type) {					\
776	_MCHECK(m);							\
777	(m)->m_next = (m)->m_nextpkt = NULL;				\
778	(m)->m_len = 0;							\
779	(m)->m_type = type;						\
780	if ((pkthdr) == 0) {						\
781		(m)->m_data = (m)->m_dat;				\
782		(m)->m_flags = 0;					\
783	} else {							\
784		(m)->m_data = (m)->m_pktdat;				\
785		(m)->m_flags = M_PKTHDR;				\
786		(m)->m_pkthdr.rcvif = NULL;				\
787		(m)->m_pkthdr.len = 0;					\
788		(m)->m_pkthdr.header = NULL;				\
789		(m)->m_pkthdr.csum_flags = 0;				\
790		(m)->m_pkthdr.csum_data = 0;				\
791		(m)->m_pkthdr.tso_segsz = 0;				\
792		(m)->m_pkthdr.vlan_tag = 0;				\
793		(m)->m_pkthdr.socket_id = 0;				\
794		(m)->m_pkthdr.vt_nrecs = 0;				\
795		(m)->m_pkthdr.aux_flags = 0;				\
796		m_tag_init(m);						\
797		m_service_class_init(m);				\
798	}								\
799}
800
801#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
802	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
803	(m)->m_flags |= M_EXT;						\
804	(m)->m_ext.ext_size = (size);					\
805	(m)->m_ext.ext_free = (free);					\
806	(m)->m_ext.ext_arg = (arg);					\
807	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
808	    &(m)->m_ext.ext_refs;					\
809	MEXT_RFA(m) = (rfa);						\
810	MEXT_REF(m) = (ref);						\
811	MEXT_FLAGS(m) = (flag);						\
812}
813
814#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
815	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)
816
817#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
818	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)
819
820#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
821	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)
822
823/*
824 * Macro to convert BSD malloc sleep flag to mcache's
825 */
826#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)
827
828/*
829 * The structure that holds all mbuf class statistics exportable via sysctl.
830 * Similar to mbstat structure, the mb_stat structure is protected by the
831 * global mbuf lock.  It contains additional information about the classes
832 * that allows for a more accurate view of the state of the allocator.
833 */
834struct mb_stat *mb_stat;
835struct omb_stat *omb_stat;	/* For backwards compatibility */
836
837#define	MB_STAT_SIZE(n) \
838	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
839#define	OMB_STAT_SIZE(n) \
840	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))
841
842/*
843 * The legacy structure holding all of the mbuf allocation statistics.
844 * The actual statistics used by the kernel are stored in the mbuf_table
845 * instead, and are updated atomically while the global mbuf lock is held.
846 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
847 * Unlike before, the kernel no longer relies on the contents of mbstat for
848 * its operations (e.g. cluster expansion) because the structure is exposed
849 * to outside and could possibly be modified, therefore making it unsafe.
850 * With the exception of the mbstat.m_mtypes array (see below), all of the
851 * statistics are updated as they change.
852 */
853struct mbstat mbstat;
854
855#define	MBSTAT_MTYPES_MAX \
856	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))
857
858/*
859 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
860 * atomically and stored in a per-CPU structure which is lock-free; this is
861 * done in order to avoid writing to the global mbstat data structure which
862 * would cause false sharing.  During sysctl request for kern.ipc.mbstat,
863 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
864 * array and returned to the application.  Any updates for types greater or
865 * equal than MT_MAX would be done atomically to the mbstat; this slows down
866 * performance but is okay since the kernel uses only up to MT_MAX-1 while
867 * anything beyond that (up to type 255) is considered a corner case.
868 */
869typedef struct {
870	unsigned int	cpu_mtypes[MT_MAX];
871} __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;
872
873typedef struct {
874	mtypes_cpu_t	mbs_cpu[1];
875} mbuf_mtypes_t;
876
877static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */
878
879#define	MBUF_MTYPES_SIZE(n) \
880	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))
881
882#define	MTYPES_CPU(p) \
883	((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))
884
885#define	mtype_stat_add(type, n) {					\
886	if ((unsigned)(type) < MT_MAX) {				\
887		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);		\
888		atomic_add_32(&mbs->cpu_mtypes[type], n);		\
889	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {	\
890		atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);	\
891	}								\
892}
893
894#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
895#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
896#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)
897
898static void
899mbuf_mtypes_sync(boolean_t locked)
900{
901	int m, n;
902	mtypes_cpu_t mtc;
903
904	if (locked)
905		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
906
907	bzero(&mtc, sizeof (mtc));
908	for (m = 0; m < ncpu; m++) {
909		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
910		mtypes_cpu_t temp;
911
912		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
913		    sizeof (temp.cpu_mtypes));
914
915		for (n = 0; n < MT_MAX; n++)
916			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
917	}
918	if (!locked)
919		lck_mtx_lock(mbuf_mlock);
920	for (n = 0; n < MT_MAX; n++)
921		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
922	if (!locked)
923		lck_mtx_unlock(mbuf_mlock);
924}
925
926static int
927mbstat_sysctl SYSCTL_HANDLER_ARGS
928{
929#pragma unused(oidp, arg1, arg2)
930	mbuf_mtypes_sync(FALSE);
931
932	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
933}
934
935static void
936mbuf_stat_sync(void)
937{
938	mb_class_stat_t *sp;
939	mcache_cpu_t *ccp;
940	mcache_t *cp;
941	int k, m, bktsize;
942
943	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
944
945	for (k = 0; k < NELEM(mbuf_table); k++) {
946		cp = m_cache(k);
947		ccp = &cp->mc_cpu[0];
948		bktsize = ccp->cc_bktsize;
949		sp = mbuf_table[k].mtbl_stats;
950
951		if (cp->mc_flags & MCF_NOCPUCACHE)
952			sp->mbcl_mc_state = MCS_DISABLED;
953		else if (cp->mc_purge_cnt > 0)
954			sp->mbcl_mc_state = MCS_PURGING;
955		else if (bktsize == 0)
956			sp->mbcl_mc_state = MCS_OFFLINE;
957		else
958			sp->mbcl_mc_state = MCS_ONLINE;
959
960		sp->mbcl_mc_cached = 0;
961		for (m = 0; m < ncpu; m++) {
962			ccp = &cp->mc_cpu[m];
963			if (ccp->cc_objs > 0)
964				sp->mbcl_mc_cached += ccp->cc_objs;
965			if (ccp->cc_pobjs > 0)
966				sp->mbcl_mc_cached += ccp->cc_pobjs;
967		}
968		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
969		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
970		    sp->mbcl_infree;
971
972		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
973		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
974		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;
975
976		/* Calculate total count specific to each class */
977		sp->mbcl_ctotal = sp->mbcl_total;
978		switch (m_class(k)) {
979		case MC_MBUF:
980			/* Deduct mbufs used in composite caches */
981			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
982			    m_total(MC_MBUF_BIGCL));
983			break;
984
985		case MC_CL:
986			/* Deduct clusters used in composite cache */
987			sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
988			break;
989
990		case MC_BIGCL:
991			/* Deduct clusters used in composite cache */
992			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
993			break;
994
995		case MC_16KCL:
996			/* Deduct clusters used in composite cache */
997			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
998			break;
999
1000		default:
1001			break;
1002		}
1003	}
1004}
1005
1006static int
1007mb_stat_sysctl SYSCTL_HANDLER_ARGS
1008{
1009#pragma unused(oidp, arg1, arg2)
1010	void *statp;
1011	int k, statsz, proc64 = proc_is64bit(req->p);
1012
1013	lck_mtx_lock(mbuf_mlock);
1014	mbuf_stat_sync();
1015
1016	if (!proc64) {
1017		struct omb_class_stat *oc;
1018		struct mb_class_stat *c;
1019
1020		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
1021		oc = &omb_stat->mbs_class[0];
1022		c = &mb_stat->mbs_class[0];
1023		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
1024			(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
1025			    "%s", c->mbcl_cname);
1026			oc->mbcl_size = c->mbcl_size;
1027			oc->mbcl_total = c->mbcl_total;
1028			oc->mbcl_active = c->mbcl_active;
1029			oc->mbcl_infree = c->mbcl_infree;
1030			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
1031			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
1032			oc->mbcl_free_cnt = c->mbcl_free_cnt;
1033			oc->mbcl_notified = c->mbcl_notified;
1034			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
1035			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
1036			oc->mbcl_ctotal = c->mbcl_ctotal;
1037			oc->mbcl_mc_state = c->mbcl_mc_state;
1038			oc->mbcl_mc_cached = c->mbcl_mc_cached;
1039			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
1040			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
1041			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
1042		}
1043		statp = omb_stat;
1044		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
1045	} else {
1046		statp = mb_stat;
1047		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
1048	}
1049
1050	lck_mtx_unlock(mbuf_mlock);
1051
1052	return (SYSCTL_OUT(req, statp, statsz));
1053}
1054
1055static int
1056mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
1057{
1058#pragma unused(oidp, arg1, arg2)
1059	int i;
1060
1061	/* Ensure leak tracing turned on */
1062	if (!mclfindleak || !mclexpleak)
1063		return (ENXIO);
1064
1065	lck_mtx_lock(mleak_lock);
1066	mleak_update_stats();
1067	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
1068	lck_mtx_unlock(mleak_lock);
1069
1070	return (i);
1071}
1072
1073static int
1074mleak_table_sysctl SYSCTL_HANDLER_ARGS
1075{
1076#pragma unused(oidp, arg1, arg2)
1077	int i = 0;
1078
1079	/* Ensure leak tracing turned on */
1080	if (!mclfindleak || !mclexpleak)
1081		return (ENXIO);
1082
1083	lck_mtx_lock(mleak_lock);
1084	i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
1085	lck_mtx_unlock(mleak_lock);
1086
1087	return (i);
1088}
1089
1090static inline void
1091m_incref(struct mbuf *m)
1092{
1093	UInt32 old, new;
1094	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1095
1096	do {
1097		old = *addr;
1098		new = old + 1;
1099		ASSERT(new != 0);
1100	} while (!OSCompareAndSwap(old, new, addr));
1101
1102	/*
1103	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
1104	 * we don't clear the flag when the refcount goes back to 1
1105	 * to simplify code calling m_mclhasreference().
1106	 */
1107	if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
1108		(void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
1109}
1110
1111static inline u_int32_t
1112m_decref(struct mbuf *m)
1113{
1114	UInt32 old, new;
1115	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);
1116
1117	do {
1118		old = *addr;
1119		new = old - 1;
1120		ASSERT(old != 0);
1121	} while (!OSCompareAndSwap(old, new, addr));
1122
1123	return (new);
1124}
1125
1126static void
1127mbuf_table_init(void)
1128{
1129	unsigned int b, c, s;
1130	int m;
1131
1132	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
1133	    M_TEMP, M_WAITOK | M_ZERO);
1134	VERIFY(omb_stat != NULL);
1135
1136	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
1137	    M_TEMP, M_WAITOK | M_ZERO);
1138	VERIFY(mb_stat != NULL);
1139
1140	mb_stat->mbs_cnt = NELEM(mbuf_table);
1141	for (m = 0; m < NELEM(mbuf_table); m++)
1142		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];
1143
1144#if CONFIG_MBUF_JUMBO
1145	/*
1146	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
1147	 * this only on platforms where jumbo cluster pool is enabled.
1148	 */
1149	njcl = nmbclusters / 3;
1150	njclbytes = M16KCLBYTES;
1151#endif /* CONFIG_MBUF_JUMBO */
1152
1153	/*
1154	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
1155	 * a multiple of 4KB clusters.
1156	 */
1157	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1158	if (njcl > 0) {
1159		/*
1160		 * Each jumbo cluster takes 8 2KB clusters, so make
1161		 * sure that the pool size is evenly divisible by 8;
1162		 * njcl is in 2KB unit, hence treated as such.
1163		 */
1164		njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);
1165
1166		/* Update nclusters with rounded down value of njcl */
1167		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
1168	}
1169
1170	/*
1171	 * njcl is valid only on platforms with 16KB jumbo clusters, where
1172	 * it is configured to 1/3 of the pool size.  On these platforms,
1173	 * the remaining is used for 2KB and 4KB clusters.  On platforms
1174	 * without 16KB jumbo clusters, the entire pool is used for both
1175	 * 2KB and 4KB clusters.  A 4KB cluster can either be splitted into
1176	 * 16 mbufs, or into 2 2KB clusters.
1177	 *
1178	 *  +---+---+------------ ... -----------+------- ... -------+
1179	 *  | c | b |              s             |        njcl       |
1180	 *  +---+---+------------ ... -----------+------- ... -------+
1181	 *
1182	 * 1/32th of the shared region is reserved for pure 2KB and 4KB
1183	 * clusters (1/64th each.)
1184	 */
1185	c = P2ROUNDDOWN((nclusters >> 6), 2);		/* in 2KB unit */
1186	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
1187	s = nclusters - (c + (b << NCLPBGSHIFT));	/* in 2KB unit */
1188
1189	/*
1190	 * 1/64th (c) is reserved for 2KB clusters.
1191	 */
1192	m_minlimit(MC_CL) = c;
1193	m_maxlimit(MC_CL) = s + c;			/* in 2KB unit */
1194	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
1195	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");
1196
1197	/*
1198	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
1199	 * It cannot be turned into 2KB clusters or mbufs.
1200	 */
1201	m_minlimit(MC_BIGCL) = b;
1202	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;	/* in 4KB unit */
1203	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
1204	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");
1205
1206	/*
1207	 * The remaining 31/32ths (s) are all-purpose (mbufs, 2KB, or 4KB)
1208	 */
1209	m_minlimit(MC_MBUF) = 0;
1210	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);	/* in mbuf unit */
1211	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
1212	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");
1213
1214	/*
1215	 * Set limits for the composite classes.
1216	 */
1217	m_minlimit(MC_MBUF_CL) = 0;
1218	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
1219	m_maxsize(MC_MBUF_CL) = MCLBYTES;
1220	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
1221	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");
1222
1223	m_minlimit(MC_MBUF_BIGCL) = 0;
1224	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
1225	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
1226	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
1227	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");
1228
1229	/*
1230	 * And for jumbo classes.
1231	 */
1232	m_minlimit(MC_16KCL) = 0;
1233	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);	/* in 16KB unit */
1234	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
1235	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");
1236
1237	m_minlimit(MC_MBUF_16KCL) = 0;
1238	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
1239	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
1240	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
1241	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
1242
1243	/*
1244	 * Initialize the legacy mbstat structure.
1245	 */
1246	bzero(&mbstat, sizeof (mbstat));
1247	mbstat.m_msize = m_maxsize(MC_MBUF);
1248	mbstat.m_mclbytes = m_maxsize(MC_CL);
1249	mbstat.m_minclsize = MINCLSIZE;
1250	mbstat.m_mlen = MLEN;
1251	mbstat.m_mhlen = MHLEN;
1252	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
1253}
1254
1255#if defined(__LP64__)
1256typedef struct ncl_tbl {
1257	uint64_t nt_maxmem;	/* memory (sane) size */
1258	uint32_t nt_mbpool;	/* mbuf pool size */
1259} ncl_tbl_t;
1260
1261/* Non-server */
1262static ncl_tbl_t ncl_table[] = {
1263	{ (1ULL << GBSHIFT)	  /*  1 GB */,	(64 << MBSHIFT)	 /*  64 MB */ },
1264	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */,	(96 << MBSHIFT)	 /*  96 MB */ },
1265	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */,	(128 << MBSHIFT) /* 128 MB */ },
1266	{ 0, 0 }
1267};
1268
1269/* Server */
1270static ncl_tbl_t ncl_table_srv[] = {
1271	{ (1ULL << GBSHIFT)	  /*  1 GB */,	(96 << MBSHIFT)  /*  96 MB */ },
1272	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */,	(128 << MBSHIFT) /* 128 MB */ },
1273	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */,	(160 << MBSHIFT) /* 160 MB */ },
1274	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */,	(192 << MBSHIFT) /* 192 MB */ },
1275	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */,	(256 << MBSHIFT) /* 256 MB */ },
1276	{ (1ULL << (GBSHIFT + 6)) /* 64 GB */,	(384 << MBSHIFT) /* 384 MB */ },
1277	{ 0, 0 }
1278};
1279#endif /* __LP64__ */
1280
1281__private_extern__ unsigned int
1282mbuf_default_ncl(int server, uint64_t mem)
1283{
1284#if !defined(__LP64__)
1285#pragma unused(server)
1286	unsigned int n;
1287	/*
1288	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
1289	 */
1290	if ((n = ((mem / 16) / MCLBYTES)) > 32768)
1291		n = 32768;
1292#else
1293	unsigned int n, i;
1294	ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
1295	/*
1296	 * 64-bit kernel (mbuf pool size based on table).
1297	 */
1298	n = tbl[0].nt_mbpool;
1299	for (i = 0; tbl[i].nt_mbpool != 0; i++) {
1300		if (mem < tbl[i].nt_maxmem)
1301			break;
1302		n = tbl[i].nt_mbpool;
1303	}
1304	n >>= MCLSHIFT;
1305#endif /* !__LP64__ */
1306	return (n);
1307}
1308
1309__private_extern__ void
1310mbinit(void)
1311{
1312	unsigned int m;
1313	unsigned int initmcl = 0;
1314	void *buf;
1315	thread_t thread = THREAD_NULL;
1316
1317	/*
1318	 * These MBUF_ values must be equal to their private counterparts.
1319	 */
1320	_CASSERT(MBUF_EXT == M_EXT);
1321	_CASSERT(MBUF_PKTHDR == M_PKTHDR);
1322	_CASSERT(MBUF_EOR == M_EOR);
1323	_CASSERT(MBUF_LOOP == M_LOOP);
1324	_CASSERT(MBUF_BCAST == M_BCAST);
1325	_CASSERT(MBUF_MCAST == M_MCAST);
1326	_CASSERT(MBUF_FRAG == M_FRAG);
1327	_CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
1328	_CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
1329	_CASSERT(MBUF_PROMISC == M_PROMISC);
1330	_CASSERT(MBUF_HASFCS == M_HASFCS);
1331
1332	_CASSERT(MBUF_TYPE_FREE == MT_FREE);
1333	_CASSERT(MBUF_TYPE_DATA == MT_DATA);
1334	_CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
1335	_CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
1336	_CASSERT(MBUF_TYPE_PCB == MT_PCB);
1337	_CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
1338	_CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
1339	_CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
1340	_CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
1341	_CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
1342	_CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
1343	_CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
1344	_CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
1345	_CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
1346	_CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);
1347
1348	_CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
1349	_CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
1350	_CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_TCP_SUM16);
1351	_CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
1352	_CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
1353	_CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
1354	_CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
1355	_CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
1356	_CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
1357	_CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
1358	_CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
1359	_CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
1360	_CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);
1361
1362	_CASSERT(MBUF_WAITOK == M_WAIT);
1363	_CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
1364	_CASSERT(MBUF_COPYALL == M_COPYALL);
1365
1366	_CASSERT(MBUF_PKTAUXF_INET_RESOLVE_RTR == MAUXF_INET_RESOLVE_RTR);
1367	_CASSERT(MBUF_PKTAUXF_INET6_RESOLVE_RTR == MAUXF_INET6_RESOLVE_RTR);
1368
1369	_CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
1370	_CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
1371	_CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
1372	_CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
1373	_CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
1374	_CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
1375	_CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
1376	_CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
1377	_CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
1378	_CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);
1379
1380	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
1381	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
1382	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
1383	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);
1384
1385	if (nmbclusters == 0)
1386		nmbclusters = NMBCLUSTERS;
1387
1388	/* This should be a sane (at least even) value by now */
1389	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));
1390
1391	/* Setup the mbuf table */
1392	mbuf_table_init();
1393
1394	/* Global lock for common layer */
1395	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
1396	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
1397	mbuf_mlock_attr = lck_attr_alloc_init();
1398	lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);
1399
1400	/*
1401	 * Allocate cluster slabs table:
1402	 *
1403	 *	maxslabgrp = (N * 2048) / (1024 * 1024)
1404	 *
1405	 * Where N is nmbclusters rounded up to the nearest 512.  This yields
1406	 * mcl_slab_g_t units, each one representing a MB of memory.
1407	 */
1408	maxslabgrp =
1409	    (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
1410	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
1411	    M_TEMP, M_WAITOK | M_ZERO);
1412	VERIFY(slabstbl != NULL);
1413
1414	/*
1415	 * Allocate audit structures, if needed:
1416	 *
1417	 *	maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
1418	 *
1419	 * This yields mcl_audit_t units, each one representing a page.
1420	 */
1421	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
1422	mbuf_debug |= mcache_getflags();
1423	if (mbuf_debug & MCF_DEBUG) {
1424		maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
1425		MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
1426		    M_TEMP, M_WAITOK | M_ZERO);
1427		VERIFY(mclaudit != NULL);
1428
1429		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
1430		    AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
1431		VERIFY(mcl_audit_con_cache != NULL);
1432	}
1433	mclverify = (mbuf_debug & MCF_VERIFY);
1434	mcltrace = (mbuf_debug & MCF_TRACE);
1435	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
1436	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);
1437
1438	/* Enable mbuf leak logging, with a lock to protect the tables */
1439
1440	mleak_lock_grp_attr = lck_grp_attr_alloc_init();
1441	mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
1442	mleak_lock_attr = lck_attr_alloc_init();
1443	lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);
1444
1445	mleak_activate();
1446
1447	/* Calculate the number of pages assigned to the cluster pool */
1448	mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
1449	MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
1450	    M_TEMP, M_WAITOK);
1451	VERIFY(mcl_paddr != NULL);
1452
1453	/* Register with the I/O Bus mapper */
1454	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
1455	bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));
1456
1457	embutl = (union mbigcluster *)
1458	    ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
1459	VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);
1460
1461	/* Prime up the freelist */
1462	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
1463	if (initmcl != 0) {
1464		initmcl >>= NCLPBGSHIFT;	/* become a 4K unit */
1465		if (initmcl > m_maxlimit(MC_BIGCL))
1466			initmcl = m_maxlimit(MC_BIGCL);
1467	}
1468	if (initmcl < m_minlimit(MC_BIGCL))
1469		initmcl = m_minlimit(MC_BIGCL);
1470
1471	lck_mtx_lock(mbuf_mlock);
1472
1473	/*
1474	 * For classes with non-zero minimum limits, populate their freelists
1475	 * so that m_total(class) is at least m_minlimit(class).
1476	 */
1477	VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
1478	freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
1479	VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
1480	freelist_init(m_class(MC_CL));
1481
1482	for (m = 0; m < NELEM(mbuf_table); m++) {
1483		/* Make sure we didn't miss any */
1484		VERIFY(m_minlimit(m_class(m)) == 0 ||
1485		    m_total(m_class(m)) >= m_minlimit(m_class(m)));
1486	}
1487
1488	lck_mtx_unlock(mbuf_mlock);
1489
1490	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
1491	    NULL, &thread);
1492	thread_deallocate(thread);
1493
1494	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
1495	    0, 0, MCR_SLEEP);
1496
1497	/* Create the cache for each class */
1498	for (m = 0; m < NELEM(mbuf_table); m++) {
1499		void *allocfunc, *freefunc, *auditfunc, *logfunc;
1500		u_int32_t flags;
1501
1502		flags = mbuf_debug;
1503		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
1504		    m_class(m) == MC_MBUF_16KCL) {
1505			allocfunc = mbuf_cslab_alloc;
1506			freefunc = mbuf_cslab_free;
1507			auditfunc = mbuf_cslab_audit;
1508			logfunc = mleak_logger;
1509		} else {
1510			allocfunc = mbuf_slab_alloc;
1511			freefunc = mbuf_slab_free;
1512			auditfunc = mbuf_slab_audit;
1513			logfunc = mleak_logger;
1514		}
1515
1516		/*
1517		 * Disable per-CPU caches for jumbo classes if there
1518		 * is no jumbo cluster pool available in the system.
1519		 * The cache itself is still created (but will never
1520		 * be populated) since it simplifies the code.
1521		 */
1522		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
1523		    njcl == 0)
1524			flags |= MCF_NOCPUCACHE;
1525
1526		if (!mclfindleak)
1527			flags |= MCF_NOLEAKLOG;
1528
1529		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
1530		    allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
1531		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
1532	}
1533
1534	/*
1535	 * Allocate structure for per-CPU statistics that's aligned
1536	 * on the CPU cache boundary; this code assumes that we never
1537	 * uninitialize this framework, since the original address
1538	 * before alignment is not saved.
1539	 */
1540	ncpu = ml_get_max_cpus();
1541	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
1542	    M_TEMP, M_WAITOK);
1543	VERIFY(buf != NULL);
1544
1545	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
1546	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));
1547
1548	/*
1549	 * Set the max limit on sb_max to be 1/16 th of the size of
1550	 * memory allocated for mbuf clusters.
1551	 */
1552	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
1553	if (high_sb_max < sb_max) {
1554		/* sb_max is too large for this configuration, scale it down */
1555		if (high_sb_max > (1 << MBSHIFT)) {
1556			/* We have atleast 16 M of mbuf pool */
1557			sb_max = high_sb_max;
1558		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
1559			/*
1560			 * If we have more than 1M of mbufpool, cap the size of
1561			 * max sock buf at 1M
1562			 */
1563			sb_max = high_sb_max = (1 << MBSHIFT);
1564		} else {
1565			sb_max = high_sb_max;
1566		}
1567	}
1568
1569	/* allocate space for mbuf_dump_buf */
1570	MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
1571	VERIFY(mbuf_dump_buf != NULL);
1572
1573	printf("mbinit: done [%d MB total pool size, (%d/%d) split]\n",
1574	    (nmbclusters << MCLSHIFT) >> MBSHIFT,
1575	    (nclusters << MCLSHIFT) >> MBSHIFT,
1576	    (njcl << MCLSHIFT) >> MBSHIFT);
1577}
1578
1579/*
1580 * Obtain a slab of object(s) from the class's freelist.
1581 */
1582static mcache_obj_t *
1583slab_alloc(mbuf_class_t class, int wait)
1584{
1585	mcl_slab_t *sp;
1586	mcache_obj_t *buf;
1587
1588	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1589
1590	VERIFY(class != MC_16KCL || njcl > 0);
1591
1592	/* This should always be NULL for us */
1593	VERIFY(m_cobjlist(class) == NULL);
1594
1595	/*
1596	 * Treat composite objects as having longer lifespan by using
1597	 * a slab from the reverse direction, in hoping that this could
1598	 * reduce the probability of fragmentation for slabs that hold
1599	 * more than one buffer chunks (e.g. mbuf slabs).  For other
1600	 * slabs, this probably doesn't make much of a difference.
1601	 */
1602	if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP))
1603		sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead);
1604	else
1605		sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class));
1606
1607	if (sp == NULL) {
1608		VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0);
1609		/* The slab list for this class is empty */
1610		return (NULL);
1611	}
1612
1613	VERIFY(m_infree(class) > 0);
1614	VERIFY(!slab_is_detached(sp));
1615	VERIFY(sp->sl_class == class &&
1616	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1617	buf = sp->sl_head;
1618	VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf));
1619
1620	if (class == MC_MBUF) {
1621		sp->sl_head = buf->obj_next;
1622		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1));
1623	} else if (class == MC_CL) {
1624		sp->sl_head = buf->obj_next;
1625		VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1));
1626	} else {
1627		sp->sl_head = NULL;
1628	}
1629	if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) {
1630		slab_nextptr_panic(sp, sp->sl_head);
1631		/* In case sl_head is in the map but not in the slab */
1632		VERIFY(slab_inrange(sp, sp->sl_head));
1633		/* NOTREACHED */
1634	}
1635
1636	/* Increment slab reference */
1637	sp->sl_refcnt++;
1638
1639	if (mclaudit != NULL) {
1640		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1641		mca->mca_uflags = 0;
1642		/* Save contents on mbuf objects only */
1643		if (class == MC_MBUF)
1644			mca->mca_uflags |= MB_SCVALID;
1645	}
1646
1647	if (class == MC_CL) {
1648		mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1649		/*
1650		 * A 2K cluster slab can have at most NCLPBG references.
1651		 */
1652		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG &&
1653		    sp->sl_chunks == NCLPBG &&
1654		    sp->sl_len == m_maxsize(MC_BIGCL));
1655		VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL);
1656	} else if (class == MC_BIGCL) {
1657		mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) +
1658		    m_infree(MC_MBUF_BIGCL);
1659		/*
1660		 * A 4K cluster slab can have at most 1 reference.
1661		 */
1662		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1663		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1664	} else if (class == MC_16KCL) {
1665		mcl_slab_t *nsp;
1666		int k;
1667
1668		--m_infree(MC_16KCL);
1669		VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 &&
1670		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1671		/*
1672		 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB.
1673		 * A 16KB big cluster takes NSLABSP16KB slabs, each having at
1674		 * most 1 reference.
1675		 */
1676		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1677			nsp = nsp->sl_next;
1678			/* Next slab must already be present */
1679			VERIFY(nsp != NULL);
1680			nsp->sl_refcnt++;
1681			VERIFY(!slab_is_detached(nsp));
1682			VERIFY(nsp->sl_class == MC_16KCL &&
1683			    nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) &&
1684			    nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 &&
1685			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1686			    nsp->sl_head == NULL);
1687		}
1688	} else {
1689		VERIFY(class == MC_MBUF);
1690		--m_infree(MC_MBUF);
1691		/*
1692		 * If auditing is turned on, this check is
1693		 * deferred until later in mbuf_slab_audit().
1694		 */
1695		if (mclaudit == NULL)
1696			_MCHECK((struct mbuf *)buf);
1697		/*
1698		 * Since we have incremented the reference count above,
1699		 * an mbuf slab (formerly a 4KB cluster slab that was cut
1700		 * up into mbufs) must have a reference count between 1
1701		 * and NMBPBG at this point.
1702		 */
1703		VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG &&
1704		    sp->sl_chunks == NMBPBG &&
1705		    sp->sl_len == m_maxsize(MC_BIGCL));
1706		VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL);
1707	}
1708
1709	/* If empty, remove this slab from the class's freelist */
1710	if (sp->sl_head == NULL) {
1711		VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG);
1712		VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG);
1713		slab_remove(sp, class);
1714	}
1715
1716	return (buf);
1717}
1718
1719/*
1720 * Place a slab of object(s) back into a class's slab list.
1721 */
1722static void
1723slab_free(mbuf_class_t class, mcache_obj_t *buf)
1724{
1725	mcl_slab_t *sp;
1726
1727	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1728
1729	VERIFY(class != MC_16KCL || njcl > 0);
1730	VERIFY(buf->obj_next == NULL);
1731	sp = slab_get(buf);
1732	VERIFY(sp->sl_class == class && slab_inrange(sp, buf) &&
1733	    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
1734
1735	/* Decrement slab reference */
1736	sp->sl_refcnt--;
1737
1738	if (class == MC_CL) {
1739		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1740		/*
1741		 * A slab that has been splitted for 2KB clusters can have
1742		 * at most 1 outstanding reference at this point.
1743		 */
1744		VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) &&
1745		    sp->sl_chunks == NCLPBG &&
1746		    sp->sl_len == m_maxsize(MC_BIGCL));
1747		VERIFY(sp->sl_refcnt < (NCLPBG - 1) ||
1748		    (slab_is_detached(sp) && sp->sl_head == NULL));
1749	} else if (class == MC_BIGCL) {
1750		VERIFY(IS_P2ALIGNED(buf, MCLBYTES));
1751		/*
1752		 * A 4KB cluster slab can have at most 1 reference
1753		 * which must be 0 at this point.
1754		 */
1755		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1756		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1757		VERIFY(slab_is_detached(sp));
1758	} else if (class == MC_16KCL) {
1759		mcl_slab_t *nsp;
1760		int k;
1761		/*
1762		 * A 16KB cluster takes NSLABSP16KB slabs, all must
1763		 * now have 0 reference.
1764		 */
1765		VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES));
1766		VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 &&
1767		    sp->sl_len == m_maxsize(class) && sp->sl_head == NULL);
1768		VERIFY(slab_is_detached(sp));
1769		for (nsp = sp, k = 1; k < NSLABSP16KB; k++) {
1770			nsp = nsp->sl_next;
1771			/* Next slab must already be present */
1772			VERIFY(nsp != NULL);
1773			nsp->sl_refcnt--;
1774			VERIFY(slab_is_detached(nsp));
1775			VERIFY(nsp->sl_class == MC_16KCL &&
1776			    (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) &&
1777			    nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 &&
1778			    nsp->sl_len == 0 && nsp->sl_base == sp->sl_base &&
1779			    nsp->sl_head == NULL);
1780		}
1781	} else {
1782		/*
1783		 * A slab that has been splitted for mbufs has at most NMBPBG
1784		 * reference counts.  Since we have decremented one reference
1785		 * above, it must now be between 0 and NMBPBG-1.
1786		 */
1787		VERIFY(class == MC_MBUF);
1788		VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) &&
1789		    sp->sl_chunks == NMBPBG &&
1790		    sp->sl_len == m_maxsize(MC_BIGCL));
1791		VERIFY(sp->sl_refcnt < (NMBPBG - 1) ||
1792		    (slab_is_detached(sp) && sp->sl_head == NULL));
1793	}
1794
1795	/*
1796	 * When auditing is enabled, ensure that the buffer still
1797	 * contains the free pattern.  Otherwise it got corrupted
1798	 * while at the CPU cache layer.
1799	 */
1800	if (mclaudit != NULL) {
1801		mcache_audit_t *mca = mcl_audit_buf2mca(class, buf);
1802		if (mclverify) {
1803			mcache_audit_free_verify(mca, buf, 0, m_maxsize(class));
1804		}
1805		mca->mca_uflags &= ~MB_SCVALID;
1806	}
1807
1808	if (class == MC_CL) {
1809		mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL);
1810		buf->obj_next = sp->sl_head;
1811	} else if (class == MC_BIGCL) {
1812		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1813		    m_infree(MC_MBUF_BIGCL);
1814	} else if (class == MC_16KCL) {
1815		++m_infree(MC_16KCL);
1816	} else {
1817		++m_infree(MC_MBUF);
1818		buf->obj_next = sp->sl_head;
1819	}
1820	sp->sl_head = buf;
1821
1822	/*
1823	 * If a slab has been splitted to either one which holds 2KB clusters,
1824	 * or one which holds mbufs, turn it back to one which holds a 4KB
1825	 * cluster.
1826	 */
1827	if (class == MC_MBUF && sp->sl_refcnt == 0 &&
1828	    m_total(class) > m_minlimit(class) &&
1829	    m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1830		int i = NMBPBG;
1831
1832		m_total(MC_BIGCL)++;
1833		mbstat.m_bigclusters = m_total(MC_BIGCL);
1834		m_total(MC_MBUF) -= NMBPBG;
1835		mbstat.m_mbufs = m_total(MC_MBUF);
1836		m_infree(MC_MBUF) -= NMBPBG;
1837		mtype_stat_add(MT_FREE, -((unsigned)NMBPBG));
1838
1839		VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1840		VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF));
1841
1842		while (i--) {
1843			struct mbuf *m = sp->sl_head;
1844			VERIFY(m != NULL);
1845			sp->sl_head = m->m_next;
1846			m->m_next = NULL;
1847		}
1848		VERIFY(sp->sl_head == NULL);
1849
1850		/* Remove the slab from the mbuf class's slab list */
1851		slab_remove(sp, class);
1852
1853		/* Reinitialize it as a 4KB cluster slab */
1854		slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1855		    sp->sl_len, 0, 1);
1856
1857		if (mclverify) {
1858			mcache_set_pattern(MCACHE_FREE_PATTERN,
1859			    (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1860		}
1861		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1862		    m_infree(MC_MBUF_BIGCL);
1863
1864		VERIFY(slab_is_detached(sp));
1865		/* And finally switch class */
1866		class = MC_BIGCL;
1867	} else if (class == MC_CL && sp->sl_refcnt == 0 &&
1868	    m_total(class) > m_minlimit(class) &&
1869	    m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) {
1870		int i = NCLPBG;
1871
1872		m_total(MC_BIGCL)++;
1873		mbstat.m_bigclusters = m_total(MC_BIGCL);
1874		m_total(MC_CL) -= NCLPBG;
1875		mbstat.m_clusters = m_total(MC_CL);
1876		m_infree(MC_CL) -= NCLPBG;
1877		VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
1878		VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL));
1879
1880		while (i--) {
1881			union mcluster *c = sp->sl_head;
1882			VERIFY(c != NULL);
1883			sp->sl_head = c->mcl_next;
1884			c->mcl_next = NULL;
1885		}
1886		VERIFY(sp->sl_head == NULL);
1887
1888		/* Remove the slab from the 2KB cluster class's slab list */
1889		slab_remove(sp, class);
1890
1891		/* Reinitialize it as a 4KB cluster slab */
1892		slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base,
1893		    sp->sl_len, 0, 1);
1894
1895		if (mclverify) {
1896			mcache_set_pattern(MCACHE_FREE_PATTERN,
1897			    (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL));
1898		}
1899		mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) +
1900		    m_infree(MC_MBUF_BIGCL);
1901
1902		VERIFY(slab_is_detached(sp));
1903		/* And finally switch class */
1904		class = MC_BIGCL;
1905	}
1906
1907	/* Reinsert the slab to the class's slab list */
1908	if (slab_is_detached(sp))
1909		slab_insert(sp, class);
1910}
1911
1912/*
1913 * Common allocator for rudimentary objects called by the CPU cache layer
1914 * during an allocation request whenever there is no available element in the
1915 * bucket layer.  It returns one or more elements from the appropriate global
1916 * freelist.  If the freelist is empty, it will attempt to populate it and
1917 * retry the allocation.
1918 */
1919static unsigned int
1920mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait)
1921{
1922	mbuf_class_t class = (mbuf_class_t)arg;
1923	unsigned int need = num;
1924	mcache_obj_t **list = *plist;
1925
1926	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
1927	ASSERT(need > 0);
1928
1929	lck_mtx_lock(mbuf_mlock);
1930
1931	for (;;) {
1932		if ((*list = slab_alloc(class, wait)) != NULL) {
1933			(*list)->obj_next = NULL;
1934			list = *plist = &(*list)->obj_next;
1935
1936			if (--need == 0) {
1937				/*
1938				 * If the number of elements in freelist has
1939				 * dropped below low watermark, asynchronously
1940				 * populate the freelist now rather than doing
1941				 * it later when we run out of elements.
1942				 */
1943				if (!mbuf_cached_above(class, wait) &&
1944				    m_infree(class) < m_total(class) >> 5) {
1945					(void) freelist_populate(class, 1,
1946					    M_DONTWAIT);
1947				}
1948				break;
1949			}
1950		} else {
1951			VERIFY(m_infree(class) == 0 || class == MC_CL);
1952
1953			(void) freelist_populate(class, 1,
1954			    (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT);
1955
1956			if (m_infree(class) > 0)
1957				continue;
1958
1959			/* Check if there's anything at the cache layer */
1960			if (mbuf_cached_above(class, wait))
1961				break;
1962
1963			/* watchdog checkpoint */
1964			mbuf_watchdog();
1965
1966			/* We have nothing and cannot block; give up */
1967			if (wait & MCR_NOSLEEP) {
1968				if (!(wait & MCR_TRYHARD)) {
1969					m_fail_cnt(class)++;
1970					mbstat.m_drops++;
1971					break;
1972				}
1973			}
1974
1975			/*
1976			 * If the freelist is still empty and the caller is
1977			 * willing to be blocked, sleep on the wait channel
1978			 * until an element is available.  Otherwise, if
1979			 * MCR_TRYHARD is set, do our best to satisfy the
1980			 * request without having to go to sleep.
1981			 */
1982			if (mbuf_worker_ready &&
1983			    mbuf_sleep(class, need, wait))
1984				break;
1985
1986			lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
1987		}
1988	}
1989
1990	m_alloc_cnt(class) += num - need;
1991	lck_mtx_unlock(mbuf_mlock);
1992
1993	return (num - need);
1994}
1995
1996/*
1997 * Common de-allocator for rudimentary objects called by the CPU cache
1998 * layer when one or more elements need to be returned to the appropriate
1999 * global freelist.
2000 */
2001static void
2002mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged)
2003{
2004	mbuf_class_t class = (mbuf_class_t)arg;
2005	mcache_obj_t *nlist;
2006	unsigned int num = 0;
2007	int w;
2008
2009	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2010
2011	lck_mtx_lock(mbuf_mlock);
2012
2013	for (;;) {
2014		nlist = list->obj_next;
2015		list->obj_next = NULL;
2016		slab_free(class, list);
2017		++num;
2018		if ((list = nlist) == NULL)
2019			break;
2020	}
2021	m_free_cnt(class) += num;
2022
2023	if ((w = mb_waiters) > 0)
2024		mb_waiters = 0;
2025
2026	lck_mtx_unlock(mbuf_mlock);
2027
2028	if (w != 0)
2029		wakeup(mb_waitchan);
2030}
2031
2032/*
2033 * Common auditor for rudimentary objects called by the CPU cache layer
2034 * during an allocation or free request.  For the former, this is called
2035 * after the objects are obtained from either the bucket or slab layer
2036 * and before they are returned to the caller.  For the latter, this is
2037 * called immediately during free and before placing the objects into
2038 * the bucket or slab layer.
2039 */
2040static void
2041mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2042{
2043	mbuf_class_t class = (mbuf_class_t)arg;
2044	mcache_audit_t *mca;
2045
2046	ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class));
2047
2048	while (list != NULL) {
2049		lck_mtx_lock(mbuf_mlock);
2050		mca = mcl_audit_buf2mca(class, list);
2051
2052		/* Do the sanity checks */
2053		if (class == MC_MBUF) {
2054			mcl_audit_mbuf(mca, list, FALSE, alloc);
2055			ASSERT(mca->mca_uflags & MB_SCVALID);
2056		} else {
2057			mcl_audit_cluster(mca, list, m_maxsize(class),
2058			    alloc, TRUE);
2059			ASSERT(!(mca->mca_uflags & MB_SCVALID));
2060		}
2061		/* Record this transaction */
2062		if (mcltrace)
2063			mcache_buffer_log(mca, list, m_cache(class));
2064
2065		if (alloc)
2066			mca->mca_uflags |= MB_INUSE;
2067		else
2068			mca->mca_uflags &= ~MB_INUSE;
2069		/* Unpair the object (unconditionally) */
2070		mca->mca_uptr = NULL;
2071		lck_mtx_unlock(mbuf_mlock);
2072
2073		list = list->obj_next;
2074	}
2075}
2076
2077/*
2078 * Common notify routine for all caches.  It is called by mcache when
2079 * one or more objects get freed.  We use this indication to trigger
2080 * the wakeup of any sleeping threads so that they can retry their
2081 * allocation requests.
2082 */
2083static void
2084mbuf_slab_notify(void *arg, u_int32_t reason)
2085{
2086	mbuf_class_t class = (mbuf_class_t)arg;
2087	int w;
2088
2089	ASSERT(MBUF_CLASS_VALID(class));
2090
2091	if (reason != MCN_RETRYALLOC)
2092		return;
2093
2094	lck_mtx_lock(mbuf_mlock);
2095	if ((w = mb_waiters) > 0) {
2096		m_notified(class)++;
2097		mb_waiters = 0;
2098	}
2099	lck_mtx_unlock(mbuf_mlock);
2100
2101	if (w != 0)
2102		wakeup(mb_waitchan);
2103}
2104
2105/*
2106 * Obtain object(s) from the composite class's freelist.
2107 */
2108static unsigned int
2109cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num)
2110{
2111	unsigned int need = num;
2112	mcl_slab_t *sp, *clsp, *nsp;
2113	struct mbuf *m;
2114	mcache_obj_t **list = *plist;
2115	void *cl;
2116
2117	VERIFY(need > 0);
2118	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2119	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2120
2121	/* Get what we can from the freelist */
2122	while ((*list = m_cobjlist(class)) != NULL) {
2123		MRANGE(*list);
2124
2125		m = (struct mbuf *)*list;
2126		sp = slab_get(m);
2127		cl = m->m_ext.ext_buf;
2128		clsp = slab_get(cl);
2129		VERIFY(m->m_flags == M_EXT && cl != NULL);
2130		VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m));
2131
2132		if (class == MC_MBUF_CL) {
2133			VERIFY(clsp->sl_refcnt >= 1 &&
2134			    clsp->sl_refcnt <= NCLPBG);
2135		} else {
2136			VERIFY(clsp->sl_refcnt == 1);
2137		}
2138
2139		if (class == MC_MBUF_16KCL) {
2140			int k;
2141			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2142				nsp = nsp->sl_next;
2143				/* Next slab must already be present */
2144				VERIFY(nsp != NULL);
2145				VERIFY(nsp->sl_refcnt == 1);
2146			}
2147		}
2148
2149		if ((m_cobjlist(class) = (*list)->obj_next) != NULL &&
2150		    !MBUF_IN_MAP(m_cobjlist(class))) {
2151			slab_nextptr_panic(sp, m_cobjlist(class));
2152			/* NOTREACHED */
2153		}
2154		(*list)->obj_next = NULL;
2155		list = *plist = &(*list)->obj_next;
2156
2157		if (--need == 0)
2158			break;
2159	}
2160	m_infree(class) -= (num - need);
2161
2162	return (num - need);
2163}
2164
2165/*
2166 * Place object(s) back into a composite class's freelist.
2167 */
2168static unsigned int
2169cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged)
2170{
2171	mcache_obj_t *o, *tail;
2172	unsigned int num = 0;
2173	struct mbuf *m, *ms;
2174	mcache_audit_t *mca = NULL;
2175	mcache_obj_t *ref_list = NULL;
2176	mcl_slab_t *clsp, *nsp;
2177	void *cl;
2178	mbuf_class_t cl_class;
2179
2180	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2181	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2182	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2183
2184	if (class == MC_MBUF_CL) {
2185		cl_class = MC_CL;
2186	} else if (class == MC_MBUF_BIGCL) {
2187		cl_class = MC_BIGCL;
2188	} else {
2189		VERIFY(class == MC_MBUF_16KCL);
2190		cl_class = MC_16KCL;
2191	}
2192
2193	o = tail = list;
2194
2195	while ((m = ms = (struct mbuf *)o) != NULL) {
2196		mcache_obj_t *rfa, *nexto = o->obj_next;
2197
2198		/* Do the mbuf sanity checks */
2199		if (mclaudit != NULL) {
2200			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2201			if (mclverify) {
2202				mcache_audit_free_verify(mca, m, 0,
2203				    m_maxsize(MC_MBUF));
2204			}
2205			ms = (struct mbuf *)mca->mca_contents;
2206		}
2207
2208		/* Do the cluster sanity checks */
2209		cl = ms->m_ext.ext_buf;
2210		clsp = slab_get(cl);
2211		if (mclverify) {
2212			size_t size = m_maxsize(cl_class);
2213			mcache_audit_free_verify(mcl_audit_buf2mca(cl_class,
2214			    (mcache_obj_t *)cl), cl, 0, size);
2215		}
2216		VERIFY(ms->m_type == MT_FREE);
2217		VERIFY(ms->m_flags == M_EXT);
2218		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2219		if (cl_class == MC_CL) {
2220			VERIFY(clsp->sl_refcnt >= 1 &&
2221			    clsp->sl_refcnt <= NCLPBG);
2222		} else {
2223			VERIFY(clsp->sl_refcnt == 1);
2224		}
2225		if (cl_class == MC_16KCL) {
2226			int k;
2227			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2228				nsp = nsp->sl_next;
2229				/* Next slab must already be present */
2230				VERIFY(nsp != NULL);
2231				VERIFY(nsp->sl_refcnt == 1);
2232			}
2233		}
2234
2235		/*
2236		 * If we're asked to purge, restore the actual mbuf using
2237		 * contents of the shadow structure (if auditing is enabled)
2238		 * and clear EXTF_COMPOSITE flag from the mbuf, as we are
2239		 * about to free it and the attached cluster into their caches.
2240		 */
2241		if (purged) {
2242			/* Restore constructed mbuf fields */
2243			if (mclaudit != NULL)
2244				mcl_audit_restore_mbuf(m, mca, TRUE);
2245
2246			MEXT_REF(m) = 0;
2247			MEXT_FLAGS(m) = 0;
2248
2249			rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
2250			rfa->obj_next = ref_list;
2251			ref_list = rfa;
2252			MEXT_RFA(m) = NULL;
2253
2254			m->m_type = MT_FREE;
2255			m->m_flags = m->m_len = 0;
2256			m->m_next = m->m_nextpkt = NULL;
2257
2258			/* Save mbuf fields and make auditing happy */
2259			if (mclaudit != NULL)
2260				mcl_audit_mbuf(mca, o, FALSE, FALSE);
2261
2262			VERIFY(m_total(class) > 0);
2263			m_total(class)--;
2264
2265			/* Free the mbuf */
2266			o->obj_next = NULL;
2267			slab_free(MC_MBUF, o);
2268
2269			/* And free the cluster */
2270			((mcache_obj_t *)cl)->obj_next = NULL;
2271			if (class == MC_MBUF_CL)
2272				slab_free(MC_CL, cl);
2273			else if (class == MC_MBUF_BIGCL)
2274				slab_free(MC_BIGCL, cl);
2275			else
2276				slab_free(MC_16KCL, cl);
2277		}
2278
2279		++num;
2280		tail = o;
2281		o = nexto;
2282	}
2283
2284	if (!purged) {
2285		tail->obj_next = m_cobjlist(class);
2286		m_cobjlist(class) = list;
2287		m_infree(class) += num;
2288	} else if (ref_list != NULL) {
2289		mcache_free_ext(ref_cache, ref_list);
2290	}
2291
2292	return (num);
2293}
2294
2295/*
2296 * Common allocator for composite objects called by the CPU cache layer
2297 * during an allocation request whenever there is no available element in
2298 * the bucket layer.  It returns one or more composite elements from the
2299 * appropriate global freelist.  If the freelist is empty, it will attempt
2300 * to obtain the rudimentary objects from their caches and construct them
2301 * into composite mbuf + cluster objects.
2302 */
2303static unsigned int
2304mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed,
2305    int wait)
2306{
2307	mbuf_class_t class = (mbuf_class_t)arg;
2308	mbuf_class_t cl_class = 0;
2309	unsigned int num = 0, cnum = 0, want = needed;
2310	mcache_obj_t *ref_list = NULL;
2311	mcache_obj_t *mp_list = NULL;
2312	mcache_obj_t *clp_list = NULL;
2313	mcache_obj_t **list;
2314	struct ext_ref *rfa;
2315	struct mbuf *m;
2316	void *cl;
2317
2318	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2319	ASSERT(needed > 0);
2320
2321	VERIFY(class != MC_MBUF_16KCL || njcl > 0);
2322
2323	/* There should not be any slab for this class */
2324	VERIFY(m_slab_cnt(class) == 0 &&
2325	    m_slablist(class).tqh_first == NULL &&
2326	    m_slablist(class).tqh_last == NULL);
2327
2328	lck_mtx_lock(mbuf_mlock);
2329
2330	/* Try using the freelist first */
2331	num = cslab_alloc(class, plist, needed);
2332	list = *plist;
2333	if (num == needed) {
2334		m_alloc_cnt(class) += num;
2335		lck_mtx_unlock(mbuf_mlock);
2336		return (needed);
2337	}
2338
2339	lck_mtx_unlock(mbuf_mlock);
2340
2341	/*
2342	 * We could not satisfy the request using the freelist alone;
2343	 * allocate from the appropriate rudimentary caches and use
2344	 * whatever we can get to construct the composite objects.
2345	 */
2346	needed -= num;
2347
2348	/*
2349	 * Mark these allocation requests as coming from a composite cache.
2350	 * Also, if the caller is willing to be blocked, mark the request
2351	 * with MCR_FAILOK such that we don't end up sleeping at the mbuf
2352	 * slab layer waiting for the individual object when one or more
2353	 * of the already-constructed composite objects are available.
2354	 */
2355	wait |= MCR_COMP;
2356	if (!(wait & MCR_NOSLEEP))
2357		wait |= MCR_FAILOK;
2358
2359	/* allocate mbufs */
2360	needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait);
2361	if (needed == 0) {
2362		ASSERT(mp_list == NULL);
2363		goto fail;
2364	}
2365
2366	/* allocate clusters */
2367	if (class == MC_MBUF_CL) {
2368		cl_class = MC_CL;
2369	} else if (class == MC_MBUF_BIGCL) {
2370		cl_class = MC_BIGCL;
2371	} else {
2372		VERIFY(class == MC_MBUF_16KCL);
2373		cl_class = MC_16KCL;
2374	}
2375	needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait);
2376	if (needed == 0) {
2377		ASSERT(clp_list == NULL);
2378		goto fail;
2379	}
2380
2381	needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait);
2382	if (needed == 0) {
2383		ASSERT(ref_list == NULL);
2384		goto fail;
2385	}
2386
2387	/*
2388	 * By this time "needed" is MIN(mbuf, cluster, ref).  Any left
2389	 * overs will get freed accordingly before we return to caller.
2390	 */
2391	for (cnum = 0; cnum < needed; cnum++) {
2392		struct mbuf *ms;
2393
2394		m = ms = (struct mbuf *)mp_list;
2395		mp_list = mp_list->obj_next;
2396
2397		cl = clp_list;
2398		clp_list = clp_list->obj_next;
2399		((mcache_obj_t *)cl)->obj_next = NULL;
2400
2401		rfa = (struct ext_ref *)ref_list;
2402		ref_list = ref_list->obj_next;
2403		((mcache_obj_t *)(void *)rfa)->obj_next = NULL;
2404
2405		/*
2406		 * If auditing is enabled, construct the shadow mbuf
2407		 * in the audit structure instead of in the actual one.
2408		 * mbuf_cslab_audit() will take care of restoring the
2409		 * contents after the integrity check.
2410		 */
2411		if (mclaudit != NULL) {
2412			mcache_audit_t *mca, *cl_mca;
2413
2414			lck_mtx_lock(mbuf_mlock);
2415			mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2416			ms = ((struct mbuf *)mca->mca_contents);
2417			cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl);
2418
2419			/*
2420			 * Pair them up.  Note that this is done at the time
2421			 * the mbuf+cluster objects are constructed.  This
2422			 * information should be treated as "best effort"
2423			 * debugging hint since more than one mbufs can refer
2424			 * to a cluster.  In that case, the cluster might not
2425			 * be freed along with the mbuf it was paired with.
2426			 */
2427			mca->mca_uptr = cl_mca;
2428			cl_mca->mca_uptr = mca;
2429
2430			ASSERT(mca->mca_uflags & MB_SCVALID);
2431			ASSERT(!(cl_mca->mca_uflags & MB_SCVALID));
2432			lck_mtx_unlock(mbuf_mlock);
2433
2434			/* Technically, they are in the freelist */
2435			if (mclverify) {
2436				size_t size;
2437
2438				mcache_set_pattern(MCACHE_FREE_PATTERN, m,
2439				    m_maxsize(MC_MBUF));
2440
2441				if (class == MC_MBUF_CL)
2442					size = m_maxsize(MC_CL);
2443				else if (class == MC_MBUF_BIGCL)
2444					size = m_maxsize(MC_BIGCL);
2445				else
2446					size = m_maxsize(MC_16KCL);
2447
2448				mcache_set_pattern(MCACHE_FREE_PATTERN, cl,
2449				    size);
2450			}
2451		}
2452
2453		MBUF_INIT(ms, 0, MT_FREE);
2454		if (class == MC_MBUF_16KCL) {
2455			MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2456		} else if (class == MC_MBUF_BIGCL) {
2457			MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2458		} else {
2459			MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE);
2460		}
2461		VERIFY(ms->m_flags == M_EXT);
2462		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2463
2464		*list = (mcache_obj_t *)m;
2465		(*list)->obj_next = NULL;
2466		list = *plist = &(*list)->obj_next;
2467	}
2468
2469fail:
2470	/*
2471	 * Free up what's left of the above.
2472	 */
2473	if (mp_list != NULL)
2474		mcache_free_ext(m_cache(MC_MBUF), mp_list);
2475	if (clp_list != NULL)
2476		mcache_free_ext(m_cache(cl_class), clp_list);
2477	if (ref_list != NULL)
2478		mcache_free_ext(ref_cache, ref_list);
2479
2480	lck_mtx_lock(mbuf_mlock);
2481	if (num > 0 || cnum > 0) {
2482		m_total(class) += cnum;
2483		VERIFY(m_total(class) <= m_maxlimit(class));
2484		m_alloc_cnt(class) += num + cnum;
2485	}
2486	if ((num + cnum) < want)
2487		m_fail_cnt(class) += (want - (num + cnum));
2488	lck_mtx_unlock(mbuf_mlock);
2489
2490	return (num + cnum);
2491}
2492
2493/*
2494 * Common de-allocator for composite objects called by the CPU cache
2495 * layer when one or more elements need to be returned to the appropriate
2496 * global freelist.
2497 */
2498static void
2499mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged)
2500{
2501	mbuf_class_t class = (mbuf_class_t)arg;
2502	unsigned int num;
2503	int w;
2504
2505	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2506
2507	lck_mtx_lock(mbuf_mlock);
2508
2509	num = cslab_free(class, list, purged);
2510	m_free_cnt(class) += num;
2511
2512	if ((w = mb_waiters) > 0)
2513		mb_waiters = 0;
2514
2515	lck_mtx_unlock(mbuf_mlock);
2516
2517	if (w != 0)
2518		wakeup(mb_waitchan);
2519}
2520
2521/*
2522 * Common auditor for composite objects called by the CPU cache layer
2523 * during an allocation or free request.  For the former, this is called
2524 * after the objects are obtained from either the bucket or slab layer
2525 * and before they are returned to the caller.  For the latter, this is
2526 * called immediately during free and before placing the objects into
2527 * the bucket or slab layer.
2528 */
2529static void
2530mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc)
2531{
2532	mbuf_class_t class = (mbuf_class_t)arg;
2533	mcache_audit_t *mca;
2534	struct mbuf *m, *ms;
2535	mcl_slab_t *clsp, *nsp;
2536	size_t size;
2537	void *cl;
2538
2539	ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class));
2540
2541	while ((m = ms = (struct mbuf *)list) != NULL) {
2542		lck_mtx_lock(mbuf_mlock);
2543		/* Do the mbuf sanity checks and record its transaction */
2544		mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
2545		mcl_audit_mbuf(mca, m, TRUE, alloc);
2546		if (mcltrace)
2547			mcache_buffer_log(mca, m, m_cache(class));
2548
2549		if (alloc)
2550			mca->mca_uflags |= MB_COMP_INUSE;
2551		else
2552			mca->mca_uflags &= ~MB_COMP_INUSE;
2553
2554		/*
2555		 * Use the shadow mbuf in the audit structure if we are
2556		 * freeing, since the contents of the actual mbuf has been
2557		 * pattern-filled by the above call to mcl_audit_mbuf().
2558		 */
2559		if (!alloc && mclverify)
2560			ms = (struct mbuf *)mca->mca_contents;
2561
2562		/* Do the cluster sanity checks and record its transaction */
2563		cl = ms->m_ext.ext_buf;
2564		clsp = slab_get(cl);
2565		VERIFY(ms->m_flags == M_EXT && cl != NULL);
2566		VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms));
2567		if (class == MC_MBUF_CL)
2568			VERIFY(clsp->sl_refcnt >= 1 &&
2569			    clsp->sl_refcnt <= NCLPBG);
2570		else
2571			VERIFY(clsp->sl_refcnt == 1);
2572
2573		if (class == MC_MBUF_16KCL) {
2574			int k;
2575			for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) {
2576				nsp = nsp->sl_next;
2577				/* Next slab must already be present */
2578				VERIFY(nsp != NULL);
2579				VERIFY(nsp->sl_refcnt == 1);
2580			}
2581		}
2582
2583		mca = mcl_audit_buf2mca(MC_CL, cl);
2584		if (class == MC_MBUF_CL)
2585			size = m_maxsize(MC_CL);
2586		else if (class == MC_MBUF_BIGCL)
2587			size = m_maxsize(MC_BIGCL);
2588		else
2589			size = m_maxsize(MC_16KCL);
2590		mcl_audit_cluster(mca, cl, size, alloc, FALSE);
2591		if (mcltrace)
2592			mcache_buffer_log(mca, cl, m_cache(class));
2593
2594		if (alloc)
2595			mca->mca_uflags |= MB_COMP_INUSE;
2596		else
2597			mca->mca_uflags &= ~MB_COMP_INUSE;
2598		lck_mtx_unlock(mbuf_mlock);
2599
2600		list = list->obj_next;
2601	}
2602}
2603
2604/*
2605 * Allocate some number of mbuf clusters and place on cluster freelist.
2606 */
2607static int
2608m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize)
2609{
2610	int i;
2611	vm_size_t size = 0;
2612	int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL));
2613	vm_offset_t page = 0;
2614	mcache_audit_t *mca_list = NULL;
2615	mcache_obj_t *con_list = NULL;
2616	mcl_slab_t *sp;
2617
2618	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
2619	    bufsize == m_maxsize(MC_16KCL));
2620
2621	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2622
2623	/*
2624	 * Multiple threads may attempt to populate the cluster map one
2625	 * after another.  Since we drop the lock below prior to acquiring
2626	 * the physical page(s), our view of the cluster map may no longer
2627	 * be accurate, and we could end up over-committing the pages beyond
2628	 * the maximum allowed for each class.  To prevent it, this entire
2629	 * operation (including the page mapping) is serialized.
2630	 */
2631	while (mb_clalloc_busy) {
2632		mb_clalloc_waiters++;
2633		(void) msleep(mb_clalloc_waitchan, mbuf_mlock,
2634		    (PZERO-1), "m_clalloc", NULL);
2635		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2636	}
2637
2638	/* We are busy now; tell everyone else to go away */
2639	mb_clalloc_busy = TRUE;
2640
2641	/*
2642	 * Honor the caller's wish to block or not block.  We have a way
2643	 * to grow the pool asynchronously using the mbuf worker thread.
2644	 */
2645	i = m_howmany(num, bufsize);
2646	if (i == 0 || (wait & M_DONTWAIT))
2647		goto out;
2648
2649	lck_mtx_unlock(mbuf_mlock);
2650
2651	size = round_page(i * bufsize);
2652	page = kmem_mb_alloc(mb_map, size, large_buffer);
2653
2654	/*
2655	 * If we did ask for "n" 16KB physically contiguous chunks
2656	 * and didn't get them, then please try again without this
2657	 * restriction.
2658	 */
2659	if (large_buffer && page == 0)
2660		page = kmem_mb_alloc(mb_map, size, 0);
2661
2662	if (page == 0) {
2663		if (bufsize == m_maxsize(MC_BIGCL)) {
2664			/* Try for 1 page if failed, only 4KB request */
2665			size = NBPG;
2666			page = kmem_mb_alloc(mb_map, size, 0);
2667		}
2668
2669		if (page == 0) {
2670			lck_mtx_lock(mbuf_mlock);
2671			goto out;
2672		}
2673	}
2674
2675	VERIFY(IS_P2ALIGNED(page, NBPG));
2676	numpages = size / NBPG;
2677
2678	/* If auditing is enabled, allocate the audit structures now */
2679	if (mclaudit != NULL) {
2680		int needed;
2681
2682		/*
2683		 * Yes, I realize this is a waste of memory for clusters
2684		 * that never get transformed into mbufs, as we may end
2685		 * up with NMBPBG-1 unused audit structures per cluster.
2686		 * But doing so tremendously simplifies the allocation
2687		 * strategy, since at this point we are not holding the
2688		 * mbuf lock and the caller is okay to be blocked.
2689		 */
2690		if (bufsize == m_maxsize(MC_BIGCL)) {
2691			needed = numpages * NMBPBG;
2692
2693			i = mcache_alloc_ext(mcl_audit_con_cache,
2694			    &con_list, needed, MCR_SLEEP);
2695
2696			VERIFY(con_list != NULL && i == needed);
2697		} else {
2698			needed = numpages / NSLABSP16KB;
2699		}
2700
2701		i = mcache_alloc_ext(mcache_audit_cache,
2702		    (mcache_obj_t **)&mca_list, needed, MCR_SLEEP);
2703
2704		VERIFY(mca_list != NULL && i == needed);
2705	}
2706
2707	lck_mtx_lock(mbuf_mlock);
2708
2709	for (i = 0; i < numpages; i++, page += NBPG) {
2710		ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG;
2711		ppnum_t new_page = pmap_find_phys(kernel_pmap, page);
2712
2713		/*
2714		 * In the case of no mapper being available the following
2715		 * code noops and returns the input page; if there is a
2716		 * mapper the appropriate I/O page is returned.
2717		 */
2718		VERIFY(offset < mcl_pages);
2719		if (mcl_paddr_base) {
2720		    bzero((void *)(uintptr_t) page, page_size);
2721		    new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page);
2722		}
2723		mcl_paddr[offset] = new_page << PGSHIFT;
2724
2725		/* Pattern-fill this fresh page */
2726		if (mclverify) {
2727			mcache_set_pattern(MCACHE_FREE_PATTERN,
2728			    (caddr_t)page, NBPG);
2729		}
2730		if (bufsize == m_maxsize(MC_BIGCL)) {
2731			union mbigcluster *mbc = (union mbigcluster *)page;
2732
2733			/* One for the entire page */
2734			sp = slab_get(mbc);
2735			if (mclaudit != NULL) {
2736				mcl_audit_init(mbc, &mca_list, &con_list,
2737				    AUDIT_CONTENTS_SIZE, NMBPBG);
2738			}
2739			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2740			slab_init(sp, MC_BIGCL, SLF_MAPPED,
2741			    mbc, mbc, bufsize, 0, 1);
2742
2743			/* Insert this slab */
2744			slab_insert(sp, MC_BIGCL);
2745
2746			/* Update stats now since slab_get() drops the lock */
2747			mbstat.m_bigclfree = ++m_infree(MC_BIGCL) +
2748			    m_infree(MC_MBUF_BIGCL);
2749			mbstat.m_bigclusters = ++m_total(MC_BIGCL);
2750			VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
2751		} else if ((i % NSLABSP16KB) == 0) {
2752			union m16kcluster *m16kcl = (union m16kcluster *)page;
2753			mcl_slab_t *nsp;
2754			int k;
2755
2756			VERIFY(njcl > 0);
2757			/* One for the entire 16KB */
2758			sp = slab_get(m16kcl);
2759			if (mclaudit != NULL)
2760				mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1);
2761
2762			VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0);
2763			slab_init(sp, MC_16KCL, SLF_MAPPED,
2764			    m16kcl, m16kcl, bufsize, 0, 1);
2765
2766			/*
2767			 * 2nd-Nth page's slab is part of the first one,
2768			 * where N is NSLABSP16KB.
2769			 */
2770			for (k = 1; k < NSLABSP16KB; k++) {
2771				nsp = slab_get(((union mbigcluster *)page) + k);
2772				VERIFY(nsp->sl_refcnt == 0 &&
2773				    nsp->sl_flags == 0);
2774				slab_init(nsp, MC_16KCL,
2775				    SLF_MAPPED | SLF_PARTIAL,
2776				    m16kcl, NULL, 0, 0, 0);
2777			}
2778
2779			/* Insert this slab */
2780			slab_insert(sp, MC_16KCL);
2781
2782			/* Update stats now since slab_get() drops the lock */
2783			m_infree(MC_16KCL)++;
2784			m_total(MC_16KCL)++;
2785			VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
2786		}
2787	}
2788	VERIFY(mca_list == NULL && con_list == NULL);
2789
2790	/* We're done; let others enter */
2791	mb_clalloc_busy = FALSE;
2792	if (mb_clalloc_waiters > 0) {
2793		mb_clalloc_waiters = 0;
2794		wakeup(mb_clalloc_waitchan);
2795	}
2796
2797	if (bufsize == m_maxsize(MC_BIGCL))
2798		return (numpages);
2799
2800	VERIFY(bufsize == m_maxsize(MC_16KCL));
2801	return (numpages / NSLABSP16KB);
2802
2803out:
2804	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2805
2806	/* We're done; let others enter */
2807	mb_clalloc_busy = FALSE;
2808	if (mb_clalloc_waiters > 0) {
2809		mb_clalloc_waiters = 0;
2810		wakeup(mb_clalloc_waitchan);
2811	}
2812
2813	/*
2814	 * When non-blocking we kick a thread if we have to grow the
2815	 * pool or if the number of free clusters is less than requested.
2816	 */
2817	if (bufsize == m_maxsize(MC_BIGCL)) {
2818		if (i > 0) {
2819			/*
2820			 * Remember total number of 4KB clusters needed
2821			 * at this time.
2822			 */
2823			i += m_total(MC_BIGCL);
2824			if (i > mbuf_expand_big) {
2825				mbuf_expand_big = i;
2826				if (mbuf_worker_ready)
2827					wakeup((caddr_t)&mbuf_worker_run);
2828			}
2829		}
2830
2831		if (m_infree(MC_BIGCL) >= num)
2832			return (1);
2833	} else {
2834		if (i > 0) {
2835			/*
2836			 * Remember total number of 16KB clusters needed
2837			 * at this time.
2838			 */
2839			i += m_total(MC_16KCL);
2840			if (i > mbuf_expand_16k) {
2841				mbuf_expand_16k = i;
2842				if (mbuf_worker_ready)
2843					wakeup((caddr_t)&mbuf_worker_run);
2844			}
2845		}
2846
2847		if (m_infree(MC_16KCL) >= num)
2848			return (1);
2849	}
2850	return (0);
2851}
2852
2853/*
2854 * Populate the global freelist of the corresponding buffer class.
2855 */
2856static int
2857freelist_populate(mbuf_class_t class, unsigned int num, int wait)
2858{
2859	mcache_obj_t *o = NULL;
2860	int i, numpages = 0, count;
2861
2862	VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL ||
2863	    class == MC_16KCL);
2864
2865	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2866
2867	switch (class) {
2868	case MC_MBUF:
2869	case MC_CL:
2870	case MC_BIGCL:
2871		numpages = (num * m_size(class) + NBPG - 1) / NBPG;
2872		i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL));
2873
2874		/* Respect the 4KB clusters minimum limit */
2875		if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) &&
2876		    m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) {
2877			if (class != MC_BIGCL || (wait & MCR_COMP))
2878				return (0);
2879		}
2880		if (class == MC_BIGCL)
2881			return (i != 0);
2882		break;
2883
2884	case MC_16KCL:
2885		return (m_clalloc(num, wait, m_maxsize(class)) != 0);
2886		/* NOTREACHED */
2887
2888	default:
2889		VERIFY(0);
2890		/* NOTREACHED */
2891	}
2892
2893	VERIFY(class == MC_MBUF || class == MC_CL);
2894
2895	/* how many objects will we cut the page into? */
2896	int numobj = (class == MC_MBUF ? NMBPBG : NCLPBG);
2897
2898	for (count = 0; count < numpages; count++) {
2899
2900		/* respect totals, minlimit, maxlimit */
2901		if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) ||
2902		    m_total(class) >= m_maxlimit(class))
2903			break;
2904
2905		if ((o = slab_alloc(MC_BIGCL, wait)) == NULL)
2906			break;
2907
2908		struct mbuf *m = (struct mbuf *)o;
2909		union mcluster *c = (union mcluster *)o;
2910		mcl_slab_t *sp = slab_get(o);
2911		mcache_audit_t *mca = NULL;
2912
2913		VERIFY(slab_is_detached(sp) &&
2914		    (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED);
2915
2916		/*
2917		 * Make sure that the cluster is unmolested
2918		 * while in freelist
2919		 */
2920		if (mclverify) {
2921			mca = mcl_audit_buf2mca(MC_BIGCL, o);
2922			mcache_audit_free_verify(mca, o, 0,
2923			    m_maxsize(MC_BIGCL));
2924		}
2925
2926		/* Reinitialize it as an mbuf or 2K slab */
2927		slab_init(sp, class, sp->sl_flags,
2928		    sp->sl_base, NULL, sp->sl_len, 0, numobj);
2929
2930		VERIFY(o == (mcache_obj_t *)sp->sl_base);
2931		VERIFY(sp->sl_head == NULL);
2932
2933		VERIFY(m_total(MC_BIGCL) > 0);
2934		m_total(MC_BIGCL)--;
2935		mbstat.m_bigclusters = m_total(MC_BIGCL);
2936
2937		m_total(class) += numobj;
2938		m_infree(class) += numobj;
2939
2940		VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
2941		VERIFY(m_total(class) <= m_maxlimit(class));
2942
2943		i = numobj;
2944		if (class == MC_MBUF) {
2945			mbstat.m_mbufs = m_total(MC_MBUF);
2946			mtype_stat_add(MT_FREE, NMBPBG);
2947			while (i--) {
2948				/*
2949				 * If auditing is enabled, construct the
2950				 * shadow mbuf in the audit structure
2951				 * instead of the actual one.
2952				 * mbuf_slab_audit() will take care of
2953				 * restoring the contents after the
2954				 * integrity check.
2955				 */
2956				if (mclaudit != NULL) {
2957					struct mbuf *ms;
2958					mca = mcl_audit_buf2mca(MC_MBUF,
2959					    (mcache_obj_t *)m);
2960					ms = ((struct mbuf *)
2961					    mca->mca_contents);
2962					ms->m_type = MT_FREE;
2963				} else {
2964					m->m_type = MT_FREE;
2965				}
2966				m->m_next = sp->sl_head;
2967				sp->sl_head = (void *)m++;
2968			}
2969		} else { /* MC_CL */
2970			mbstat.m_clfree =
2971			    m_infree(MC_CL) + m_infree(MC_MBUF_CL);
2972			mbstat.m_clusters = m_total(MC_CL);
2973			while (i--) {
2974				c->mcl_next = sp->sl_head;
2975				sp->sl_head = (void *)c++;
2976			}
2977		}
2978
2979		/* Insert into the mbuf or 2k slab list */
2980		slab_insert(sp, class);
2981
2982		if ((i = mb_waiters) > 0)
2983			mb_waiters = 0;
2984		if (i != 0)
2985			wakeup(mb_waitchan);
2986	}
2987	return (count != 0);
2988}
2989
2990/*
2991 * For each class, initialize the freelist to hold m_minlimit() objects.
2992 */
2993static void
2994freelist_init(mbuf_class_t class)
2995{
2996	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
2997
2998	VERIFY(class == MC_CL || class == MC_BIGCL);
2999	VERIFY(m_total(class) == 0);
3000	VERIFY(m_minlimit(class) > 0);
3001
3002	while (m_total(class) < m_minlimit(class))
3003		(void) freelist_populate(class, m_minlimit(class), M_WAIT);
3004
3005	VERIFY(m_total(class) >= m_minlimit(class));
3006}
3007
3008/*
3009 * (Inaccurately) check if it might be worth a trip back to the
3010 * mcache layer due the availability of objects there.  We'll
3011 * end up back here if there's nothing up there.
3012 */
3013static boolean_t
3014mbuf_cached_above(mbuf_class_t class, int wait)
3015{
3016	switch (class) {
3017	case MC_MBUF:
3018		if (wait & MCR_COMP)
3019			return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) ||
3020			    !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3021		break;
3022
3023	case MC_CL:
3024		if (wait & MCR_COMP)
3025			return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)));
3026		break;
3027
3028	case MC_BIGCL:
3029		if (wait & MCR_COMP)
3030			return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL)));
3031		break;
3032
3033	case MC_16KCL:
3034		if (wait & MCR_COMP)
3035			return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL)));
3036		break;
3037
3038	case MC_MBUF_CL:
3039	case MC_MBUF_BIGCL:
3040	case MC_MBUF_16KCL:
3041		break;
3042
3043	default:
3044		VERIFY(0);
3045		/* NOTREACHED */
3046	}
3047
3048	return (!mcache_bkt_isempty(m_cache(class)));
3049}
3050
3051/*
3052 * If possible, convert constructed objects to raw ones.
3053 */
3054static boolean_t
3055mbuf_steal(mbuf_class_t class, unsigned int num)
3056{
3057	mcache_obj_t *top = NULL;
3058	mcache_obj_t **list = &top;
3059	unsigned int tot = 0;
3060
3061	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3062
3063	switch (class) {
3064	case MC_MBUF:
3065	case MC_CL:
3066	case MC_BIGCL:
3067	case MC_16KCL:
3068		return (FALSE);
3069
3070	case MC_MBUF_CL:
3071	case MC_MBUF_BIGCL:
3072	case MC_MBUF_16KCL:
3073		/* Get the required number of constructed objects if possible */
3074		if (m_infree(class) > m_minlimit(class)) {
3075			tot = cslab_alloc(class, &list,
3076			    MIN(num, m_infree(class)));
3077		}
3078
3079		/* And destroy them to get back the raw objects */
3080		if (top != NULL)
3081			(void) cslab_free(class, top, 1);
3082		break;
3083
3084	default:
3085		VERIFY(0);
3086		/* NOTREACHED */
3087	}
3088
3089	return (tot == num);
3090}
3091
3092static void
3093m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp)
3094{
3095	int m, bmap = 0;
3096
3097	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
3098
3099	VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL));
3100	VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL));
3101	VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL));
3102
3103	/*
3104	 * This logic can be made smarter; for now, simply mark
3105	 * all other related classes as potential victims.
3106	 */
3107	switch (class) {
3108	case MC_MBUF:
3109		m_wantpurge(MC_CL)++;
3110		m_wantpurge(MC_BIGCL)++;
3111		m_wantpurge(MC_MBUF_CL)++;
3112		m_wantpurge(MC_MBUF_BIGCL)++;
3113		break;
3114
3115	case MC_CL:
3116		m_wantpurge(MC_MBUF)++;
3117		m_wantpurge(MC_BIGCL)++;
3118		m_wantpurge(MC_MBUF_BIGCL)++;
3119		if (!comp)
3120			m_wantpurge(MC_MBUF_CL)++;
3121		break;
3122
3123	case MC_BIGCL:
3124		m_wantpurge(MC_MBUF)++;
3125		m_wantpurge(MC_CL)++;
3126		m_wantpurge(MC_MBUF_CL)++;
3127		if (!comp)
3128			m_wantpurge(MC_MBUF_BIGCL)++;
3129		break;
3130
3131	case MC_16KCL:
3132		if (!comp)
3133			m_wantpurge(MC_MBUF_16KCL)++;
3134		break;
3135
3136	default:
3137		VERIFY(0);
3138		/* NOTREACHED */
3139	}
3140
3141	/*
3142	 * Run through each marked class and check if we really need to
3143	 * purge (and therefore temporarily disable) the per-CPU caches
3144	 * layer used by the class.  If so, remember the classes since
3145	 * we are going to drop the lock below prior to purging.
3146	 */
3147	for (m = 0; m < NELEM(mbuf_table); m++) {
3148		if (m_wantpurge(m) > 0) {
3149			m_wantpurge(m) = 0;
3150			/*
3151			 * Try hard to steal the required number of objects
3152			 * from the freelist of other mbuf classes.  Only
3153			 * purge and disable the per-CPU caches layer when
3154			 * we don't have enough; it's the last resort.
3155			 */
3156			if (!mbuf_steal(m, num))
3157				bmap |= (1 << m);
3158		}
3159	}
3160
3161	lck_mtx_unlock(mbuf_mlock);
3162
3163	if (bmap != 0) {
3164		/* drain is performed in pfslowtimo(), to avoid deadlocks */
3165		do_reclaim = 1;
3166
3167		/* Sigh; we have no other choices but to ask mcache to purge */
3168		for (m = 0; m < NELEM(mbuf_table); m++) {
3169			if ((bmap & (1 << m)) &&
3170			    mcache_purge_cache(m_cache(m))) {
3171				lck_mtx_lock(mbuf_mlock);
3172				m_purge_cnt(m)++;
3173				mbstat.m_drain++;
3174				lck_mtx_unlock(mbuf_mlock);
3175			}
3176		}
3177	} else {
3178		/*
3179		 * Request mcache to reap extra elements from all of its caches;
3180		 * note that all reaps are serialized and happen only at a fixed
3181		 * interval.
3182		 */
3183		mcache_reap();
3184	}
3185	lck_mtx_lock(mbuf_mlock);
3186}
3187
3188static inline struct mbuf *
3189m_get_common(int wait, short type, int hdr)
3190{
3191	struct mbuf *m;
3192	int mcflags = MSLEEPF(wait);
3193
3194	/* Is this due to a non-blocking retry?  If so, then try harder */
3195	if (mcflags & MCR_NOSLEEP)
3196		mcflags |= MCR_TRYHARD;
3197
3198	m = mcache_alloc(m_cache(MC_MBUF), mcflags);
3199	if (m != NULL) {
3200		MBUF_INIT(m, hdr, type);
3201		mtype_stat_inc(type);
3202		mtype_stat_dec(MT_FREE);
3203#if CONFIG_MACF_NET
3204		if (hdr && mac_init_mbuf(m, wait) != 0) {
3205			m_free(m);
3206			return (NULL);
3207		}
3208#endif /* MAC_NET */
3209	}
3210	return (m);
3211}
3212
3213/*
3214 * Space allocation routines; these are also available as macros
3215 * for critical paths.
3216 */
3217#define	_M_GET(wait, type)	m_get_common(wait, type, 0)
3218#define	_M_GETHDR(wait, type)	m_get_common(wait, type, 1)
3219#define	_M_RETRY(wait, type)	_M_GET(wait, type)
3220#define	_M_RETRYHDR(wait, type)	_M_GETHDR(wait, type)
3221#define	_MGET(m, how, type)	((m) = _M_GET(how, type))
3222#define	_MGETHDR(m, how, type)	((m) = _M_GETHDR(how, type))
3223
3224struct mbuf *
3225m_get(int wait, int type)
3226{
3227	return (_M_GET(wait, type));
3228}
3229
3230struct mbuf *
3231m_gethdr(int wait, int type)
3232{
3233	return (_M_GETHDR(wait, type));
3234}
3235
3236struct mbuf *
3237m_retry(int wait, int type)
3238{
3239	return (_M_RETRY(wait, type));
3240}
3241
3242struct mbuf *
3243m_retryhdr(int wait, int type)
3244{
3245	return (_M_RETRYHDR(wait, type));
3246}
3247
3248struct mbuf *
3249m_getclr(int wait, int type)
3250{
3251	struct mbuf *m;
3252
3253	_MGET(m, wait, type);
3254	if (m != NULL)
3255		bzero(MTOD(m, caddr_t), MLEN);
3256	return (m);
3257}
3258
3259struct mbuf *
3260m_free(struct mbuf *m)
3261{
3262	struct mbuf *n = m->m_next;
3263
3264	if (m->m_type == MT_FREE)
3265		panic("m_free: freeing an already freed mbuf");
3266
3267	/* Free the aux data and tags if there is any */
3268	if (m->m_flags & M_PKTHDR) {
3269		m_tag_delete_chain(m, NULL);
3270	}
3271
3272	if (m->m_flags & M_EXT) {
3273		u_int32_t refcnt;
3274		u_int32_t composite;
3275
3276		refcnt = m_decref(m);
3277		composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3278		if (refcnt == 0 && !composite) {
3279			if (m->m_ext.ext_free == NULL) {
3280				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3281			} else if (m->m_ext.ext_free == m_bigfree) {
3282				mcache_free(m_cache(MC_BIGCL),
3283				    m->m_ext.ext_buf);
3284			} else if (m->m_ext.ext_free == m_16kfree) {
3285				mcache_free(m_cache(MC_16KCL),
3286				    m->m_ext.ext_buf);
3287			} else {
3288				(*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3289				    m->m_ext.ext_size, m->m_ext.ext_arg);
3290			}
3291			mcache_free(ref_cache, MEXT_RFA(m));
3292			MEXT_RFA(m) = NULL;
3293		} else if (refcnt == 0 && composite) {
3294			VERIFY(m->m_type != MT_FREE);
3295
3296			mtype_stat_dec(m->m_type);
3297			mtype_stat_inc(MT_FREE);
3298
3299			m->m_type = MT_FREE;
3300			m->m_flags = M_EXT;
3301			m->m_len = 0;
3302			m->m_next = m->m_nextpkt = NULL;
3303
3304			MEXT_FLAGS(m) &= ~EXTF_READONLY;
3305
3306			/* "Free" into the intermediate cache */
3307			if (m->m_ext.ext_free == NULL) {
3308				mcache_free(m_cache(MC_MBUF_CL), m);
3309			} else if (m->m_ext.ext_free == m_bigfree) {
3310				mcache_free(m_cache(MC_MBUF_BIGCL), m);
3311			} else {
3312				VERIFY(m->m_ext.ext_free == m_16kfree);
3313				mcache_free(m_cache(MC_MBUF_16KCL), m);
3314			}
3315			return (n);
3316		}
3317	}
3318
3319	if (m->m_type != MT_FREE) {
3320		mtype_stat_dec(m->m_type);
3321		mtype_stat_inc(MT_FREE);
3322	}
3323
3324	m->m_type = MT_FREE;
3325	m->m_flags = m->m_len = 0;
3326	m->m_next = m->m_nextpkt = NULL;
3327
3328	mcache_free(m_cache(MC_MBUF), m);
3329
3330	return (n);
3331}
3332
3333__private_extern__ struct mbuf *
3334m_clattach(struct mbuf *m, int type, caddr_t extbuf,
3335    void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg,
3336    int wait)
3337{
3338	struct ext_ref *rfa = NULL;
3339
3340	if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL)
3341		return (NULL);
3342
3343	if (m->m_flags & M_EXT) {
3344		u_int32_t refcnt;
3345		u_int32_t composite;
3346
3347		refcnt = m_decref(m);
3348		composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
3349		if (refcnt == 0 && !composite) {
3350			if (m->m_ext.ext_free == NULL) {
3351				mcache_free(m_cache(MC_CL), m->m_ext.ext_buf);
3352			} else if (m->m_ext.ext_free == m_bigfree) {
3353				mcache_free(m_cache(MC_BIGCL),
3354				    m->m_ext.ext_buf);
3355			} else if (m->m_ext.ext_free == m_16kfree) {
3356				mcache_free(m_cache(MC_16KCL),
3357				    m->m_ext.ext_buf);
3358			} else {
3359				(*(m->m_ext.ext_free))(m->m_ext.ext_buf,
3360				    m->m_ext.ext_size, m->m_ext.ext_arg);
3361			}
3362			/* Re-use the reference structure */
3363			rfa = MEXT_RFA(m);
3364		} else if (refcnt == 0 && composite) {
3365			VERIFY(m->m_type != MT_FREE);
3366
3367			mtype_stat_dec(m->m_type);
3368			mtype_stat_inc(MT_FREE);
3369
3370			m->m_type = MT_FREE;
3371			m->m_flags = M_EXT;
3372			m->m_len = 0;
3373			m->m_next = m->m_nextpkt = NULL;
3374
3375			MEXT_FLAGS(m) &= ~EXTF_READONLY;
3376
3377			/* "Free" into the intermediate cache */
3378			if (m->m_ext.ext_free == NULL) {
3379				mcache_free(m_cache(MC_MBUF_CL), m);
3380			} else if (m->m_ext.ext_free == m_bigfree) {
3381				mcache_free(m_cache(MC_MBUF_BIGCL), m);
3382			} else {
3383				VERIFY(m->m_ext.ext_free == m_16kfree);
3384				mcache_free(m_cache(MC_MBUF_16KCL), m);
3385			}
3386			/*
3387			 * Allocate a new mbuf, since we didn't divorce
3388			 * the composite mbuf + cluster pair above.
3389			 */
3390			if ((m = _M_GETHDR(wait, type)) == NULL)
3391				return (NULL);
3392		}
3393	}
3394
3395	if (rfa == NULL &&
3396	    (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) {
3397		m_free(m);
3398		return (NULL);
3399	}
3400
3401	MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0);
3402
3403	return (m);
3404}
3405
3406/*
3407 * Perform `fast' allocation mbuf clusters from a cache of recently-freed
3408 * clusters. (If the cache is empty, new clusters are allocated en-masse.)
3409 */
3410struct mbuf *
3411m_getcl(int wait, int type, int flags)
3412{
3413	struct mbuf *m;
3414	int mcflags = MSLEEPF(wait);
3415	int hdr = (flags & M_PKTHDR);
3416
3417	/* Is this due to a non-blocking retry?  If so, then try harder */
3418	if (mcflags & MCR_NOSLEEP)
3419		mcflags |= MCR_TRYHARD;
3420
3421	m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags);
3422	if (m != NULL) {
3423		u_int32_t flag;
3424		struct ext_ref *rfa;
3425		void *cl;
3426
3427		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3428		cl = m->m_ext.ext_buf;
3429		rfa = MEXT_RFA(m);
3430
3431		ASSERT(cl != NULL && rfa != NULL);
3432		VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL);
3433
3434		flag = MEXT_FLAGS(m);
3435
3436		MBUF_INIT(m, hdr, type);
3437		MBUF_CL_INIT(m, cl, rfa, 1, flag);
3438
3439		mtype_stat_inc(type);
3440		mtype_stat_dec(MT_FREE);
3441#if CONFIG_MACF_NET
3442		if (hdr && mac_init_mbuf(m, wait) != 0) {
3443			m_freem(m);
3444			return (NULL);
3445		}
3446#endif /* MAC_NET */
3447	}
3448	return (m);
3449}
3450
3451/* m_mclget() add an mbuf cluster to a normal mbuf */
3452struct mbuf *
3453m_mclget(struct mbuf *m, int wait)
3454{
3455	struct ext_ref *rfa;
3456
3457	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3458		return (m);
3459
3460	m->m_ext.ext_buf = m_mclalloc(wait);
3461	if (m->m_ext.ext_buf != NULL) {
3462		MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3463	} else {
3464		mcache_free(ref_cache, rfa);
3465	}
3466	return (m);
3467}
3468
3469/* Allocate an mbuf cluster */
3470caddr_t
3471m_mclalloc(int wait)
3472{
3473	int mcflags = MSLEEPF(wait);
3474
3475	/* Is this due to a non-blocking retry?  If so, then try harder */
3476	if (mcflags & MCR_NOSLEEP)
3477		mcflags |= MCR_TRYHARD;
3478
3479	return (mcache_alloc(m_cache(MC_CL), mcflags));
3480}
3481
3482/* Free an mbuf cluster */
3483void
3484m_mclfree(caddr_t p)
3485{
3486	mcache_free(m_cache(MC_CL), p);
3487}
3488
3489/*
3490 * mcl_hasreference() checks if a cluster of an mbuf is referenced by
3491 * another mbuf; see comments in m_incref() regarding EXTF_READONLY.
3492 */
3493int
3494m_mclhasreference(struct mbuf *m)
3495{
3496	if (!(m->m_flags & M_EXT))
3497		return (0);
3498
3499	ASSERT(MEXT_RFA(m) != NULL);
3500
3501	return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0);
3502}
3503
3504__private_extern__ caddr_t
3505m_bigalloc(int wait)
3506{
3507	int mcflags = MSLEEPF(wait);
3508
3509	/* Is this due to a non-blocking retry?  If so, then try harder */
3510	if (mcflags & MCR_NOSLEEP)
3511		mcflags |= MCR_TRYHARD;
3512
3513	return (mcache_alloc(m_cache(MC_BIGCL), mcflags));
3514}
3515
3516__private_extern__ void
3517m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3518{
3519	mcache_free(m_cache(MC_BIGCL), p);
3520}
3521
3522/* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */
3523__private_extern__ struct mbuf *
3524m_mbigget(struct mbuf *m, int wait)
3525{
3526	struct ext_ref *rfa;
3527
3528	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3529		return (m);
3530
3531	m->m_ext.ext_buf =  m_bigalloc(wait);
3532	if (m->m_ext.ext_buf != NULL) {
3533		MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3534	} else {
3535		mcache_free(ref_cache, rfa);
3536	}
3537	return (m);
3538}
3539
3540__private_extern__ caddr_t
3541m_16kalloc(int wait)
3542{
3543	int mcflags = MSLEEPF(wait);
3544
3545	/* Is this due to a non-blocking retry?  If so, then try harder */
3546	if (mcflags & MCR_NOSLEEP)
3547		mcflags |= MCR_TRYHARD;
3548
3549	return (mcache_alloc(m_cache(MC_16KCL), mcflags));
3550}
3551
3552__private_extern__ void
3553m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg)
3554{
3555	mcache_free(m_cache(MC_16KCL), p);
3556}
3557
3558/* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */
3559__private_extern__ struct mbuf *
3560m_m16kget(struct mbuf *m, int wait)
3561{
3562	struct ext_ref *rfa;
3563
3564	if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL)
3565		return (m);
3566
3567	m->m_ext.ext_buf =  m_16kalloc(wait);
3568	if (m->m_ext.ext_buf != NULL) {
3569		MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0);
3570	} else {
3571		mcache_free(ref_cache, rfa);
3572	}
3573	return (m);
3574}
3575
3576/*
3577 * "Move" mbuf pkthdr from "from" to "to".
3578 * "from" must have M_PKTHDR set, and "to" must be empty.
3579 */
3580void
3581m_copy_pkthdr(struct mbuf *to, struct mbuf *from)
3582{
3583	/* We will be taking over the tags of 'to' */
3584	if (to->m_flags & M_PKTHDR)
3585		m_tag_delete_chain(to, NULL);
3586	to->m_pkthdr = from->m_pkthdr;		/* especially tags */
3587	m_tag_init(from);			/* purge tags from src */
3588	m_service_class_init(from);		/* reset svc class from src */
3589	from->m_pkthdr.aux_flags = 0;		/* clear aux flags from src */
3590	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3591	if ((to->m_flags & M_EXT) == 0)
3592		to->m_data = to->m_pktdat;
3593}
3594
3595/*
3596 * Duplicate "from"'s mbuf pkthdr in "to".
3597 * "from" must have M_PKTHDR set, and "to" must be empty.
3598 * In particular, this does a deep copy of the packet tags.
3599 */
3600static int
3601m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how)
3602{
3603	if (to->m_flags & M_PKTHDR)
3604		m_tag_delete_chain(to, NULL);
3605	to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT);
3606	if ((to->m_flags & M_EXT) == 0)
3607		to->m_data = to->m_pktdat;
3608	to->m_pkthdr = from->m_pkthdr;
3609	m_tag_init(to);
3610	return (m_tag_copy_chain(to, from, how));
3611}
3612
3613void
3614m_copy_pftag(struct mbuf *to, struct mbuf *from)
3615{
3616	to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag;
3617	to->m_pkthdr.pf_mtag.pftag_hdr = NULL;
3618	to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6);
3619}
3620
3621/*
3622 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
3623 * if wantall is not set, return whatever number were available.  Set up the
3624 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these
3625 * are chained on the m_nextpkt field.  Any packets requested beyond this
3626 * are chained onto the last packet header's m_next field.  The size of
3627 * the cluster is controlled by the parameter bufsize.
3628 */
3629__private_extern__ struct mbuf *
3630m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs,
3631    int wait, int wantall, size_t bufsize)
3632{
3633	struct mbuf *m;
3634	struct mbuf **np, *top;
3635	unsigned int pnum, needed = *num_needed;
3636	mcache_obj_t *mp_list = NULL;
3637	int mcflags = MSLEEPF(wait);
3638	u_int32_t flag;
3639	struct ext_ref *rfa;
3640	mcache_t *cp;
3641	void *cl;
3642
3643	ASSERT(bufsize == m_maxsize(MC_CL) ||
3644	    bufsize == m_maxsize(MC_BIGCL) ||
3645	    bufsize == m_maxsize(MC_16KCL));
3646
3647	/*
3648	 * Caller must first check for njcl because this
3649	 * routine is internal and not exposed/used via KPI.
3650	 */
3651	VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0);
3652
3653	top = NULL;
3654	np = &top;
3655	pnum = 0;
3656
3657	/*
3658	 * The caller doesn't want all the requested buffers; only some.
3659	 * Try hard to get what we can, but don't block.  This effectively
3660	 * overrides MCR_SLEEP, since this thread will not go to sleep
3661	 * if we can't get all the buffers.
3662	 */
3663	if (!wantall || (mcflags & MCR_NOSLEEP))
3664		mcflags |= MCR_TRYHARD;
3665
3666	/* Allocate the composite mbuf + cluster elements from the cache */
3667	if (bufsize == m_maxsize(MC_CL))
3668		cp = m_cache(MC_MBUF_CL);
3669	else if (bufsize == m_maxsize(MC_BIGCL))
3670		cp = m_cache(MC_MBUF_BIGCL);
3671	else
3672		cp = m_cache(MC_MBUF_16KCL);
3673	needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags);
3674
3675	for (pnum = 0; pnum < needed; pnum++) {
3676		m = (struct mbuf *)mp_list;
3677		mp_list = mp_list->obj_next;
3678
3679		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3680		cl = m->m_ext.ext_buf;
3681		rfa = MEXT_RFA(m);
3682
3683		ASSERT(cl != NULL && rfa != NULL);
3684		VERIFY(MBUF_IS_COMPOSITE(m));
3685
3686		flag = MEXT_FLAGS(m);
3687
3688		MBUF_INIT(m, num_with_pkthdrs, MT_DATA);
3689		if (bufsize == m_maxsize(MC_16KCL)) {
3690			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
3691		} else if (bufsize == m_maxsize(MC_BIGCL)) {
3692			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
3693		} else {
3694			MBUF_CL_INIT(m, cl, rfa, 1, flag);
3695		}
3696
3697		if (num_with_pkthdrs > 0) {
3698			--num_with_pkthdrs;
3699#if CONFIG_MACF_NET
3700			if (mac_mbuf_label_init(m, wait) != 0) {
3701				m_freem(m);
3702				break;
3703			}
3704#endif /* MAC_NET */
3705		}
3706
3707		*np = m;
3708		if (num_with_pkthdrs > 0)
3709			np = &m->m_nextpkt;
3710		else
3711			np = &m->m_next;
3712	}
3713	ASSERT(pnum != *num_needed || mp_list == NULL);
3714	if (mp_list != NULL)
3715		mcache_free_ext(cp, mp_list);
3716
3717	if (pnum > 0) {
3718		mtype_stat_add(MT_DATA, pnum);
3719		mtype_stat_sub(MT_FREE, pnum);
3720	}
3721
3722	if (wantall && (pnum != *num_needed)) {
3723		if (top != NULL)
3724			m_freem_list(top);
3725		return (NULL);
3726	}
3727
3728	if (pnum > *num_needed) {
3729		printf("%s: File a radar related to <rdar://10146739>. \
3730			needed = %u, pnum = %u, num_needed = %u \n",
3731			__func__, needed, pnum, *num_needed);
3732	}
3733
3734	*num_needed = pnum;
3735	return (top);
3736}
3737
3738/*
3739 * Return list of mbuf linked by m_nextpkt.  Try for numlist, and if
3740 * wantall is not set, return whatever number were available.  The size of
3741 * each mbuf in the list is controlled by the parameter packetlen.  Each
3742 * mbuf of the list may have a chain of mbufs linked by m_next.  Each mbuf
3743 * in the chain is called a segment.  If maxsegments is not null and the
3744 * value pointed to is not null, this specify the maximum number of segments
3745 * for a chain of mbufs.  If maxsegments is zero or the value pointed to
3746 * is zero the caller does not have any restriction on the number of segments.
3747 * The actual  number of segments of a mbuf chain is return in the value
3748 * pointed to by maxsegments.
3749 */
3750__private_extern__ struct mbuf *
3751m_allocpacket_internal(unsigned int *numlist, size_t packetlen,
3752    unsigned int *maxsegments, int wait, int wantall, size_t wantsize)
3753{
3754	struct mbuf **np, *top, *first = NULL;
3755	size_t bufsize, r_bufsize;
3756	unsigned int num = 0;
3757	unsigned int nsegs = 0;
3758	unsigned int needed, resid;
3759	int mcflags = MSLEEPF(wait);
3760	mcache_obj_t *mp_list = NULL, *rmp_list = NULL;
3761	mcache_t *cp = NULL, *rcp = NULL;
3762
3763	if (*numlist == 0)
3764		return (NULL);
3765
3766	top = NULL;
3767	np = &top;
3768
3769	if (wantsize == 0) {
3770		if (packetlen <= MINCLSIZE) {
3771			bufsize = packetlen;
3772		} else if (packetlen > m_maxsize(MC_CL)) {
3773			/* Use 4KB if jumbo cluster pool isn't available */
3774			if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0)
3775				bufsize = m_maxsize(MC_BIGCL);
3776			else
3777				bufsize = m_maxsize(MC_16KCL);
3778		} else {
3779			bufsize = m_maxsize(MC_CL);
3780		}
3781	} else if (wantsize == m_maxsize(MC_CL) ||
3782	    wantsize == m_maxsize(MC_BIGCL) ||
3783	    (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) {
3784		bufsize = wantsize;
3785	} else {
3786		return (NULL);
3787	}
3788
3789	if (bufsize <= MHLEN) {
3790		nsegs = 1;
3791	} else if (bufsize <= MINCLSIZE) {
3792		if (maxsegments != NULL && *maxsegments == 1) {
3793			bufsize = m_maxsize(MC_CL);
3794			nsegs = 1;
3795		} else {
3796			nsegs = 2;
3797		}
3798	} else if (bufsize == m_maxsize(MC_16KCL)) {
3799		VERIFY(njcl > 0);
3800		nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1;
3801	} else if (bufsize == m_maxsize(MC_BIGCL)) {
3802		nsegs = ((packetlen - 1) >> PGSHIFT) + 1;
3803	} else {
3804		nsegs = ((packetlen - 1) >> MCLSHIFT) + 1;
3805	}
3806	if (maxsegments != NULL) {
3807		if (*maxsegments && nsegs > *maxsegments) {
3808			*maxsegments = nsegs;
3809			return (NULL);
3810		}
3811		*maxsegments = nsegs;
3812	}
3813
3814	/*
3815	 * The caller doesn't want all the requested buffers; only some.
3816	 * Try hard to get what we can, but don't block.  This effectively
3817	 * overrides MCR_SLEEP, since this thread will not go to sleep
3818	 * if we can't get all the buffers.
3819	 */
3820	if (!wantall || (mcflags & MCR_NOSLEEP))
3821		mcflags |= MCR_TRYHARD;
3822
3823	/*
3824	 * Simple case where all elements in the lists/chains are mbufs.
3825	 * Unless bufsize is greater than MHLEN, each segment chain is made
3826	 * up of exactly 1 mbuf.  Otherwise, each segment chain is made up
3827	 * of 2 mbufs; the second one is used for the residual data, i.e.
3828	 * the remaining data that cannot fit into the first mbuf.
3829	 */
3830	if (bufsize <= MINCLSIZE) {
3831		/* Allocate the elements in one shot from the mbuf cache */
3832		ASSERT(bufsize <= MHLEN || nsegs == 2);
3833		cp = m_cache(MC_MBUF);
3834		needed = mcache_alloc_ext(cp, &mp_list,
3835		    (*numlist) * nsegs, mcflags);
3836
3837		/*
3838		 * The number of elements must be even if we are to use an
3839		 * mbuf (instead of a cluster) to store the residual data.
3840		 * If we couldn't allocate the requested number of mbufs,
3841		 * trim the number down (if it's odd) in order to avoid
3842		 * creating a partial segment chain.
3843		 */
3844		if (bufsize > MHLEN && (needed & 0x1))
3845			needed--;
3846
3847		while (num < needed) {
3848			struct mbuf *m;
3849
3850			m = (struct mbuf *)mp_list;
3851			mp_list = mp_list->obj_next;
3852			ASSERT(m != NULL);
3853
3854			MBUF_INIT(m, 1, MT_DATA);
3855#if CONFIG_MACF_NET
3856			if (mac_init_mbuf(m, wait) != 0) {
3857				m_free(m);
3858				break;
3859			}
3860#endif /* MAC_NET */
3861			num++;
3862			if (bufsize > MHLEN) {
3863				/* A second mbuf for this segment chain */
3864				m->m_next = (struct mbuf *)mp_list;
3865				mp_list = mp_list->obj_next;
3866				ASSERT(m->m_next != NULL);
3867
3868				MBUF_INIT(m->m_next, 0, MT_DATA);
3869				num++;
3870			}
3871			*np = m;
3872			np = &m->m_nextpkt;
3873		}
3874		ASSERT(num != *numlist || mp_list == NULL);
3875
3876		if (num > 0) {
3877			mtype_stat_add(MT_DATA, num);
3878			mtype_stat_sub(MT_FREE, num);
3879		}
3880		num /= nsegs;
3881
3882		/* We've got them all; return to caller */
3883		if (num == *numlist)
3884			return (top);
3885
3886		goto fail;
3887	}
3888
3889	/*
3890	 * Complex cases where elements are made up of one or more composite
3891	 * mbufs + cluster, depending on packetlen.  Each N-segment chain can
3892	 * be illustrated as follows:
3893	 *
3894	 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N]
3895	 *
3896	 * Every composite mbuf + cluster element comes from the intermediate
3897	 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL).  For space efficiency,
3898	 * the last composite element will come from the MC_MBUF_CL cache,
3899	 * unless the residual data is larger than 2KB where we use the
3900	 * big cluster composite cache (MC_MBUF_BIGCL) instead.  Residual
3901	 * data is defined as extra data beyond the first element that cannot
3902	 * fit into the previous element, i.e. there is no residual data if
3903	 * the chain only has 1 segment.
3904	 */
3905	r_bufsize = bufsize;
3906	resid = packetlen > bufsize ? packetlen % bufsize : 0;
3907	if (resid > 0) {
3908		/* There is residual data; figure out the cluster size */
3909		if (wantsize == 0 && packetlen > MINCLSIZE) {
3910			/*
3911			 * Caller didn't request that all of the segments
3912			 * in the chain use the same cluster size; use the
3913			 * smaller of the cluster sizes.
3914			 */
3915			if (njcl > 0 && resid > m_maxsize(MC_BIGCL))
3916				r_bufsize = m_maxsize(MC_16KCL);
3917			else if (resid > m_maxsize(MC_CL))
3918				r_bufsize = m_maxsize(MC_BIGCL);
3919			else
3920				r_bufsize = m_maxsize(MC_CL);
3921		} else {
3922			/* Use the same cluster size as the other segments */
3923			resid = 0;
3924		}
3925	}
3926
3927	needed = *numlist;
3928	if (resid > 0) {
3929		/*
3930		 * Attempt to allocate composite mbuf + cluster elements for
3931		 * the residual data in each chain; record the number of such
3932		 * elements that can be allocated so that we know how many
3933		 * segment chains we can afford to create.
3934		 */
3935		if (r_bufsize <= m_maxsize(MC_CL))
3936			rcp = m_cache(MC_MBUF_CL);
3937		else if (r_bufsize <= m_maxsize(MC_BIGCL))
3938			rcp = m_cache(MC_MBUF_BIGCL);
3939		else
3940			rcp = m_cache(MC_MBUF_16KCL);
3941		needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags);
3942
3943		if (needed == 0)
3944			goto fail;
3945
3946		/* This is temporarily reduced for calculation */
3947		ASSERT(nsegs > 1);
3948		nsegs--;
3949	}
3950
3951	/*
3952	 * Attempt to allocate the rest of the composite mbuf + cluster
3953	 * elements for the number of segment chains that we need.
3954	 */
3955	if (bufsize <= m_maxsize(MC_CL))
3956		cp = m_cache(MC_MBUF_CL);
3957	else if (bufsize <= m_maxsize(MC_BIGCL))
3958		cp = m_cache(MC_MBUF_BIGCL);
3959	else
3960		cp = m_cache(MC_MBUF_16KCL);
3961	needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags);
3962
3963	/* Round it down to avoid creating a partial segment chain */
3964	needed = (needed / nsegs) * nsegs;
3965	if (needed == 0)
3966		goto fail;
3967
3968	if (resid > 0) {
3969		/*
3970		 * We're about to construct the chain(s); take into account
3971		 * the number of segments we have created above to hold the
3972		 * residual data for each chain, as well as restore the
3973		 * original count of segments per chain.
3974		 */
3975		ASSERT(nsegs > 0);
3976		needed += needed / nsegs;
3977		nsegs++;
3978	}
3979
3980	for (;;) {
3981		struct mbuf *m;
3982		u_int32_t flag;
3983		struct ext_ref *rfa;
3984		void *cl;
3985		int pkthdr;
3986
3987		++num;
3988		if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) {
3989			m = (struct mbuf *)mp_list;
3990			mp_list = mp_list->obj_next;
3991		} else {
3992			m = (struct mbuf *)rmp_list;
3993			rmp_list = rmp_list->obj_next;
3994		}
3995		ASSERT(m != NULL);
3996		VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT);
3997		VERIFY(m->m_ext.ext_free == NULL ||
3998		    m->m_ext.ext_free == m_bigfree ||
3999		    m->m_ext.ext_free == m_16kfree);
4000
4001		cl = m->m_ext.ext_buf;
4002		rfa = MEXT_RFA(m);
4003
4004		ASSERT(cl != NULL && rfa != NULL);
4005		VERIFY(MBUF_IS_COMPOSITE(m));
4006
4007		flag = MEXT_FLAGS(m);
4008
4009		pkthdr = (nsegs == 1 || (num % nsegs) == 1);
4010		if (pkthdr)
4011			first = m;
4012		MBUF_INIT(m, pkthdr, MT_DATA);
4013		if (m->m_ext.ext_free == m_16kfree) {
4014			MBUF_16KCL_INIT(m, cl, rfa, 1, flag);
4015		} else if (m->m_ext.ext_free == m_bigfree) {
4016			MBUF_BIGCL_INIT(m, cl, rfa, 1, flag);
4017		} else {
4018			MBUF_CL_INIT(m, cl, rfa, 1, flag);
4019		}
4020#if CONFIG_MACF_NET
4021		if (pkthdr && mac_init_mbuf(m, wait) != 0) {
4022			--num;
4023			m_freem(m);
4024			break;
4025		}
4026#endif /* MAC_NET */
4027
4028		*np = m;
4029		if ((num % nsegs) == 0)
4030			np = &first->m_nextpkt;
4031		else
4032			np = &m->m_next;
4033
4034		if (num == needed)
4035			break;
4036	}
4037
4038	if (num > 0) {
4039		mtype_stat_add(MT_DATA, num);
4040		mtype_stat_sub(MT_FREE, num);
4041	}
4042
4043	num /= nsegs;
4044
4045	/* We've got them all; return to caller */
4046	if (num == *numlist) {
4047		ASSERT(mp_list == NULL && rmp_list == NULL);
4048		return (top);
4049	}
4050
4051fail:
4052	/* Free up what's left of the above */
4053	if (mp_list != NULL)
4054		mcache_free_ext(cp, mp_list);
4055	if (rmp_list != NULL)
4056		mcache_free_ext(rcp, rmp_list);
4057	if (wantall && top != NULL) {
4058		m_freem(top);
4059		return (NULL);
4060	}
4061	*numlist = num;
4062	return (top);
4063}
4064
4065/*
4066 * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4067 * packets on receive ring.
4068 */
4069__private_extern__ struct mbuf *
4070m_getpacket_how(int wait)
4071{
4072	unsigned int num_needed = 1;
4073
4074	return (m_getpackets_internal(&num_needed, 1, wait, 1,
4075	    m_maxsize(MC_CL)));
4076}
4077
4078/*
4079 * Best effort to get a mbuf cluster + pkthdr.  Used by drivers to allocated
4080 * packets on receive ring.
4081 */
4082struct mbuf *
4083m_getpacket(void)
4084{
4085	unsigned int num_needed = 1;
4086
4087	return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1,
4088	    m_maxsize(MC_CL)));
4089}
4090
4091/*
4092 * Return a list of mbuf hdrs that point to clusters.  Try for num_needed;
4093 * if this can't be met, return whatever number were available.  Set up the
4094 * first num_with_pkthdrs with mbuf hdrs configured as packet headers.  These
4095 * are chained on the m_nextpkt field.  Any packets requested beyond this are
4096 * chained onto the last packet header's m_next field.
4097 */
4098struct mbuf *
4099m_getpackets(int num_needed, int num_with_pkthdrs, int how)
4100{
4101	unsigned int n = num_needed;
4102
4103	return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0,
4104	    m_maxsize(MC_CL)));
4105}
4106
4107/*
4108 * Return a list of mbuf hdrs set up as packet hdrs chained together
4109 * on the m_nextpkt field
4110 */
4111struct mbuf *
4112m_getpackethdrs(int num_needed, int how)
4113{
4114	struct mbuf *m;
4115	struct mbuf **np, *top;
4116
4117	top = NULL;
4118	np = &top;
4119
4120	while (num_needed--) {
4121		m = _M_RETRYHDR(how, MT_DATA);
4122		if (m == NULL)
4123			break;
4124
4125		*np = m;
4126		np = &m->m_nextpkt;
4127	}
4128
4129	return (top);
4130}
4131
4132/*
4133 * Free an mbuf list (m_nextpkt) while following m_next.  Returns the count
4134 * for mbufs packets freed.  Used by the drivers.
4135 */
4136int
4137m_freem_list(struct mbuf *m)
4138{
4139	struct mbuf *nextpkt;
4140	mcache_obj_t *mp_list = NULL;
4141	mcache_obj_t *mcl_list = NULL;
4142	mcache_obj_t *mbc_list = NULL;
4143	mcache_obj_t *m16k_list = NULL;
4144	mcache_obj_t *m_mcl_list = NULL;
4145	mcache_obj_t *m_mbc_list = NULL;
4146	mcache_obj_t *m_m16k_list = NULL;
4147	mcache_obj_t *ref_list = NULL;
4148	int pktcount = 0;
4149	int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0;
4150
4151	while (m != NULL) {
4152		pktcount++;
4153
4154		nextpkt = m->m_nextpkt;
4155		m->m_nextpkt = NULL;
4156
4157		while (m != NULL) {
4158			struct mbuf *next = m->m_next;
4159			mcache_obj_t *o, *rfa;
4160			u_int32_t refcnt, composite;
4161
4162			if (m->m_type == MT_FREE)
4163				panic("m_free: freeing an already freed mbuf");
4164
4165			if (m->m_type != MT_FREE)
4166				mt_free++;
4167
4168			if (m->m_flags & M_PKTHDR) {
4169				m_tag_delete_chain(m, NULL);
4170			}
4171
4172			if (!(m->m_flags & M_EXT))
4173				goto simple_free;
4174
4175			o = (mcache_obj_t *)(void *)m->m_ext.ext_buf;
4176			refcnt = m_decref(m);
4177			composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE);
4178			if (refcnt == 0 && !composite) {
4179				if (m->m_ext.ext_free == NULL) {
4180					o->obj_next = mcl_list;
4181					mcl_list = o;
4182				} else if (m->m_ext.ext_free == m_bigfree) {
4183					o->obj_next = mbc_list;
4184					mbc_list = o;
4185				} else if (m->m_ext.ext_free == m_16kfree) {
4186					o->obj_next = m16k_list;
4187					m16k_list = o;
4188				} else {
4189					(*(m->m_ext.ext_free))((caddr_t)o,
4190					    m->m_ext.ext_size,
4191					    m->m_ext.ext_arg);
4192				}
4193				rfa = (mcache_obj_t *)(void *)MEXT_RFA(m);
4194				rfa->obj_next = ref_list;
4195				ref_list = rfa;
4196				MEXT_RFA(m) = NULL;
4197			} else if (refcnt == 0 && composite) {
4198				VERIFY(m->m_type != MT_FREE);
4199				/*
4200				 * Amortize the costs of atomic operations
4201				 * by doing them at the end, if possible.
4202				 */
4203				if (m->m_type == MT_DATA)
4204					mt_data++;
4205				else if (m->m_type == MT_HEADER)
4206					mt_header++;
4207				else if (m->m_type == MT_SONAME)
4208					mt_soname++;
4209				else if (m->m_type == MT_TAG)
4210					mt_tag++;
4211				else
4212					mtype_stat_dec(m->m_type);
4213
4214				m->m_type = MT_FREE;
4215				m->m_flags = M_EXT;
4216				m->m_len = 0;
4217				m->m_next = m->m_nextpkt = NULL;
4218
4219				MEXT_FLAGS(m) &= ~EXTF_READONLY;
4220
4221				/* "Free" into the intermediate cache */
4222				o = (mcache_obj_t *)m;
4223				if (m->m_ext.ext_free == NULL) {
4224					o->obj_next = m_mcl_list;
4225					m_mcl_list = o;
4226				} else if (m->m_ext.ext_free == m_bigfree) {
4227					o->obj_next = m_mbc_list;
4228					m_mbc_list = o;
4229				} else {
4230					VERIFY(m->m_ext.ext_free == m_16kfree);
4231					o->obj_next = m_m16k_list;
4232					m_m16k_list = o;
4233				}
4234				m = next;
4235				continue;
4236			}
4237simple_free:
4238			/*
4239			 * Amortize the costs of atomic operations
4240			 * by doing them at the end, if possible.
4241			 */
4242			if (m->m_type == MT_DATA)
4243				mt_data++;
4244			else if (m->m_type == MT_HEADER)
4245				mt_header++;
4246			else if (m->m_type == MT_SONAME)
4247				mt_soname++;
4248			else if (m->m_type == MT_TAG)
4249				mt_tag++;
4250			else if (m->m_type != MT_FREE)
4251				mtype_stat_dec(m->m_type);
4252
4253			m->m_type = MT_FREE;
4254			m->m_flags = m->m_len = 0;
4255			m->m_next = m->m_nextpkt = NULL;
4256
4257			((mcache_obj_t *)m)->obj_next = mp_list;
4258			mp_list = (mcache_obj_t *)m;
4259
4260			m = next;
4261		}
4262
4263		m = nextpkt;
4264	}
4265
4266	if (mt_free > 0)
4267		mtype_stat_add(MT_FREE, mt_free);
4268	if (mt_data > 0)
4269		mtype_stat_sub(MT_DATA, mt_data);
4270	if (mt_header > 0)
4271		mtype_stat_sub(MT_HEADER, mt_header);
4272	if (mt_soname > 0)
4273		mtype_stat_sub(MT_SONAME, mt_soname);
4274	if (mt_tag > 0)
4275		mtype_stat_sub(MT_TAG, mt_tag);
4276
4277	if (mp_list != NULL)
4278		mcache_free_ext(m_cache(MC_MBUF), mp_list);
4279	if (mcl_list != NULL)
4280		mcache_free_ext(m_cache(MC_CL), mcl_list);
4281	if (mbc_list != NULL)
4282		mcache_free_ext(m_cache(MC_BIGCL), mbc_list);
4283	if (m16k_list != NULL)
4284		mcache_free_ext(m_cache(MC_16KCL), m16k_list);
4285	if (m_mcl_list != NULL)
4286		mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list);
4287	if (m_mbc_list != NULL)
4288		mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list);
4289	if (m_m16k_list != NULL)
4290		mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list);
4291	if (ref_list != NULL)
4292		mcache_free_ext(ref_cache, ref_list);
4293
4294	return (pktcount);
4295}
4296
4297void
4298m_freem(struct mbuf *m)
4299{
4300	while (m != NULL)
4301		m = m_free(m);
4302}
4303
4304/*
4305 * Mbuffer utility routines.
4306 */
4307
4308/*
4309 * Compute the amount of space available before the current start
4310 * of data in an mbuf.
4311 */
4312int
4313m_leadingspace(struct mbuf *m)
4314{
4315	if (m->m_flags & M_EXT) {
4316		if (MCLHASREFERENCE(m))
4317			return (0);
4318		return (m->m_data - m->m_ext.ext_buf);
4319	}
4320	if (m->m_flags & M_PKTHDR)
4321		return (m->m_data - m->m_pktdat);
4322	return (m->m_data - m->m_dat);
4323}
4324
4325/*
4326 * Compute the amount of space available after the end of data in an mbuf.
4327 */
4328int
4329m_trailingspace(struct mbuf *m)
4330{
4331	if (m->m_flags & M_EXT) {
4332		if (MCLHASREFERENCE(m))
4333			return (0);
4334		return (m->m_ext.ext_buf + m->m_ext.ext_size -
4335		    (m->m_data + m->m_len));
4336	}
4337	return (&m->m_dat[MLEN] - (m->m_data + m->m_len));
4338}
4339
4340/*
4341 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain,
4342 * copy junk along.  Does not adjust packet header length.
4343 */
4344struct mbuf *
4345m_prepend(struct mbuf *m, int len, int how)
4346{
4347	struct mbuf *mn;
4348
4349	_MGET(mn, how, m->m_type);
4350	if (mn == NULL) {
4351		m_freem(m);
4352		return (NULL);
4353	}
4354	if (m->m_flags & M_PKTHDR) {
4355		M_COPY_PKTHDR(mn, m);
4356		m->m_flags &= ~M_PKTHDR;
4357	}
4358	mn->m_next = m;
4359	m = mn;
4360	if (len < MHLEN)
4361		MH_ALIGN(m, len);
4362	m->m_len = len;
4363	return (m);
4364}
4365
4366/*
4367 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to
4368 * chain, copy junk along, and adjust length.
4369 */
4370struct mbuf *
4371m_prepend_2(struct mbuf *m, int len, int how)
4372{
4373	if (M_LEADINGSPACE(m) >= len) {
4374		m->m_data -= len;
4375		m->m_len += len;
4376	} else {
4377		m = m_prepend(m, len, how);
4378	}
4379	if ((m) && (m->m_flags & M_PKTHDR))
4380		m->m_pkthdr.len += len;
4381	return (m);
4382}
4383
4384/*
4385 * Make a copy of an mbuf chain starting "off0" bytes from the beginning,
4386 * continuing for "len" bytes.  If len is M_COPYALL, copy to end of mbuf.
4387 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
4388 */
4389int MCFail;
4390
4391struct mbuf *
4392m_copym(struct mbuf *m, int off0, int len, int wait)
4393{
4394	struct mbuf *n, *mhdr = NULL, **np;
4395	int off = off0;
4396	struct mbuf *top;
4397	int copyhdr = 0;
4398
4399	if (off < 0 || len < 0)
4400		panic("m_copym: invalid offset %d or len %d", off, len);
4401
4402	if (off == 0 && (m->m_flags & M_PKTHDR)) {
4403		mhdr = m;
4404		copyhdr = 1;
4405	}
4406
4407	while (off >= m->m_len) {
4408		if (m->m_next == NULL)
4409			panic("m_copym: invalid mbuf chain");
4410		off -= m->m_len;
4411		m = m->m_next;
4412	}
4413	np = &top;
4414	top = NULL;
4415
4416	while (len > 0) {
4417		if (m == NULL) {
4418			if (len != M_COPYALL)
4419				panic("m_copym: len != M_COPYALL");
4420			break;
4421		}
4422
4423		n = _M_RETRY(wait, m->m_type);
4424		*np = n;
4425
4426		if (n == NULL)
4427			goto nospace;
4428
4429		if (copyhdr != 0) {
4430			M_COPY_PKTHDR(n, mhdr);
4431			if (len == M_COPYALL)
4432				n->m_pkthdr.len -= off0;
4433			else
4434				n->m_pkthdr.len = len;
4435			copyhdr = 0;
4436		}
4437		if (len == M_COPYALL) {
4438			if (MIN(len, (m->m_len - off)) == len) {
4439				printf("m->m_len %d - off %d = %d, %d\n",
4440				    m->m_len, off, m->m_len - off,
4441				    MIN(len, (m->m_len - off)));
4442			}
4443		}
4444		n->m_len = MIN(len, (m->m_len - off));
4445		if (n->m_len == M_COPYALL) {
4446			printf("n->m_len == M_COPYALL, fixing\n");
4447			n->m_len = MHLEN;
4448		}
4449		if (m->m_flags & M_EXT) {
4450			n->m_ext = m->m_ext;
4451			m_incref(m);
4452			n->m_data = m->m_data + off;
4453			n->m_flags |= M_EXT;
4454		} else {
4455			bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4456			    (unsigned)n->m_len);
4457		}
4458		if (len != M_COPYALL)
4459			len -= n->m_len;
4460		off = 0;
4461		m = m->m_next;
4462		np = &n->m_next;
4463	}
4464
4465	if (top == NULL)
4466		MCFail++;
4467
4468	return (top);
4469nospace:
4470
4471	m_freem(top);
4472	MCFail++;
4473	return (NULL);
4474}
4475
4476/*
4477 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated
4478 * within this routine also, the last mbuf and offset accessed are passed
4479 * out and can be passed back in to avoid having to rescan the entire mbuf
4480 * list (normally hung off of the socket)
4481 */
4482struct mbuf *
4483m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait,
4484    struct mbuf **m_lastm, int *m_off)
4485{
4486	struct mbuf *n, **np = NULL;
4487	int off = off0, len = len0;
4488	struct mbuf *top = NULL;
4489	int mcflags = MSLEEPF(wait);
4490	int copyhdr = 0;
4491	int type = 0;
4492	mcache_obj_t *list = NULL;
4493	int needed = 0;
4494
4495	if (off == 0 && (m->m_flags & M_PKTHDR))
4496		copyhdr = 1;
4497
4498	if (*m_lastm != NULL) {
4499		m = *m_lastm;
4500		off = *m_off;
4501	} else {
4502		while (off >= m->m_len) {
4503			off -= m->m_len;
4504			m = m->m_next;
4505		}
4506	}
4507
4508	n = m;
4509	while (len > 0) {
4510		needed++;
4511		ASSERT(n != NULL);
4512		len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0)));
4513		n = n->m_next;
4514	}
4515	needed++;
4516	len = len0;
4517
4518	/*
4519	 * If the caller doesn't want to be put to sleep, mark it with
4520	 * MCR_TRYHARD so that we may reclaim buffers from other places
4521	 * before giving up.
4522	 */
4523	if (mcflags & MCR_NOSLEEP)
4524		mcflags |= MCR_TRYHARD;
4525
4526	if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed,
4527	    mcflags) != needed)
4528		goto nospace;
4529
4530	needed = 0;
4531	while (len > 0) {
4532		n = (struct mbuf *)list;
4533		list = list->obj_next;
4534		ASSERT(n != NULL && m != NULL);
4535
4536		type = (top == NULL) ? MT_HEADER : m->m_type;
4537		MBUF_INIT(n, (top == NULL), type);
4538#if CONFIG_MACF_NET
4539		if (top == NULL && mac_mbuf_label_init(n, wait) != 0) {
4540			mtype_stat_inc(MT_HEADER);
4541			mtype_stat_dec(MT_FREE);
4542			m_free(n);
4543			goto nospace;
4544		}
4545#endif /* MAC_NET */
4546
4547		if (top == NULL) {
4548			top = n;
4549			np = &top->m_next;
4550			continue;
4551		} else {
4552			needed++;
4553			*np = n;
4554		}
4555
4556		if (copyhdr) {
4557			M_COPY_PKTHDR(n, m);
4558			n->m_pkthdr.len = len;
4559			copyhdr = 0;
4560		}
4561		n->m_len = MIN(len, (m->m_len - off));
4562
4563		if (m->m_flags & M_EXT) {
4564			n->m_ext = m->m_ext;
4565			m_incref(m);
4566			n->m_data = m->m_data + off;
4567			n->m_flags |= M_EXT;
4568		} else {
4569			bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t),
4570			    (unsigned)n->m_len);
4571		}
4572		len -= n->m_len;
4573
4574		if (len == 0) {
4575			if ((off + n->m_len) == m->m_len) {
4576				*m_lastm = m->m_next;
4577				*m_off  = 0;
4578			} else {
4579				*m_lastm = m;
4580				*m_off  = off + n->m_len;
4581			}
4582			break;
4583		}
4584		off = 0;
4585		m = m->m_next;
4586		np = &n->m_next;
4587	}
4588
4589	mtype_stat_inc(MT_HEADER);
4590	mtype_stat_add(type, needed);
4591	mtype_stat_sub(MT_FREE, needed + 1);
4592
4593	ASSERT(list == NULL);
4594	return (top);
4595
4596nospace:
4597	if (list != NULL)
4598		mcache_free_ext(m_cache(MC_MBUF), list);
4599	if (top != NULL)
4600		m_freem(top);
4601	MCFail++;
4602	return (NULL);
4603}
4604
4605/*
4606 * Copy data from an mbuf chain starting "off" bytes from the beginning,
4607 * continuing for "len" bytes, into the indicated buffer.
4608 */
4609void
4610m_copydata(struct mbuf *m, int off, int len, void *vp)
4611{
4612	unsigned count;
4613	char *cp = vp;
4614
4615	if (off < 0 || len < 0)
4616		panic("m_copydata: invalid offset %d or len %d", off, len);
4617
4618	while (off > 0) {
4619		if (m == NULL)
4620			panic("m_copydata: invalid mbuf chain");
4621		if (off < m->m_len)
4622			break;
4623		off -= m->m_len;
4624		m = m->m_next;
4625	}
4626	while (len > 0) {
4627		if (m == NULL)
4628			panic("m_copydata: invalid mbuf chain");
4629		count = MIN(m->m_len - off, len);
4630		bcopy(MTOD(m, caddr_t) + off, cp, count);
4631		len -= count;
4632		cp += count;
4633		off = 0;
4634		m = m->m_next;
4635	}
4636}
4637
4638/*
4639 * Concatenate mbuf chain n to m.  Both chains must be of the same type
4640 * (e.g. MT_DATA).  Any m_pkthdr is not updated.
4641 */
4642void
4643m_cat(struct mbuf *m, struct mbuf *n)
4644{
4645	while (m->m_next)
4646		m = m->m_next;
4647	while (n) {
4648		if ((m->m_flags & M_EXT) ||
4649		    m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) {
4650			/* just join the two chains */
4651			m->m_next = n;
4652			return;
4653		}
4654		/* splat the data from one into the other */
4655		bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4656		    (u_int)n->m_len);
4657		m->m_len += n->m_len;
4658		n = m_free(n);
4659	}
4660}
4661
4662void
4663m_adj(struct mbuf *mp, int req_len)
4664{
4665	int len = req_len;
4666	struct mbuf *m;
4667	int count;
4668
4669	if ((m = mp) == NULL)
4670		return;
4671	if (len >= 0) {
4672		/*
4673		 * Trim from head.
4674		 */
4675		while (m != NULL && len > 0) {
4676			if (m->m_len <= len) {
4677				len -= m->m_len;
4678				m->m_len = 0;
4679				m = m->m_next;
4680			} else {
4681				m->m_len -= len;
4682				m->m_data += len;
4683				len = 0;
4684			}
4685		}
4686		m = mp;
4687		if (m->m_flags & M_PKTHDR)
4688			m->m_pkthdr.len -= (req_len - len);
4689	} else {
4690		/*
4691		 * Trim from tail.  Scan the mbuf chain,
4692		 * calculating its length and finding the last mbuf.
4693		 * If the adjustment only affects this mbuf, then just
4694		 * adjust and return.  Otherwise, rescan and truncate
4695		 * after the remaining size.
4696		 */
4697		len = -len;
4698		count = 0;
4699		for (;;) {
4700			count += m->m_len;
4701			if (m->m_next == (struct mbuf *)0)
4702				break;
4703			m = m->m_next;
4704		}
4705		if (m->m_len >= len) {
4706			m->m_len -= len;
4707			m = mp;
4708			if (m->m_flags & M_PKTHDR)
4709				m->m_pkthdr.len -= len;
4710			return;
4711		}
4712		count -= len;
4713		if (count < 0)
4714			count = 0;
4715		/*
4716		 * Correct length for chain is "count".
4717		 * Find the mbuf with last data, adjust its length,
4718		 * and toss data from remaining mbufs on chain.
4719		 */
4720		m = mp;
4721		if (m->m_flags & M_PKTHDR)
4722			m->m_pkthdr.len = count;
4723		for (; m; m = m->m_next) {
4724			if (m->m_len >= count) {
4725				m->m_len = count;
4726				break;
4727			}
4728			count -= m->m_len;
4729		}
4730		while ((m = m->m_next))
4731			m->m_len = 0;
4732	}
4733}
4734
4735/*
4736 * Rearange an mbuf chain so that len bytes are contiguous
4737 * and in the data area of an mbuf (so that mtod and dtom
4738 * will work for a structure of size len).  Returns the resulting
4739 * mbuf chain on success, frees it and returns null on failure.
4740 * If there is room, it will add up to max_protohdr-len extra bytes to the
4741 * contiguous region in an attempt to avoid being called next time.
4742 */
4743int MPFail;
4744
4745struct mbuf *
4746m_pullup(struct mbuf *n, int len)
4747{
4748	struct mbuf *m;
4749	int count;
4750	int space;
4751
4752	/*
4753	 * If first mbuf has no cluster, and has room for len bytes
4754	 * without shifting current data, pullup into it,
4755	 * otherwise allocate a new mbuf to prepend to the chain.
4756	 */
4757	if ((n->m_flags & M_EXT) == 0 &&
4758	    n->m_data + len < &n->m_dat[MLEN] && n->m_next) {
4759		if (n->m_len >= len)
4760			return (n);
4761		m = n;
4762		n = n->m_next;
4763		len -= m->m_len;
4764	} else {
4765		if (len > MHLEN)
4766			goto bad;
4767		_MGET(m, M_DONTWAIT, n->m_type);
4768		if (m == 0)
4769			goto bad;
4770		m->m_len = 0;
4771		if (n->m_flags & M_PKTHDR) {
4772			M_COPY_PKTHDR(m, n);
4773			n->m_flags &= ~M_PKTHDR;
4774		}
4775	}
4776	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4777	do {
4778		count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len);
4779		bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len,
4780		    (unsigned)count);
4781		len -= count;
4782		m->m_len += count;
4783		n->m_len -= count;
4784		space -= count;
4785		if (n->m_len)
4786			n->m_data += count;
4787		else
4788			n = m_free(n);
4789	} while (len > 0 && n);
4790	if (len > 0) {
4791		(void) m_free(m);
4792		goto bad;
4793	}
4794	m->m_next = n;
4795	return (m);
4796bad:
4797	m_freem(n);
4798	MPFail++;
4799	return (0);
4800}
4801
4802/*
4803 * Like m_pullup(), except a new mbuf is always allocated, and we allow
4804 * the amount of empty space before the data in the new mbuf to be specified
4805 * (in the event that the caller expects to prepend later).
4806 */
4807__private_extern__ int MSFail = 0;
4808
4809__private_extern__ struct mbuf *
4810m_copyup(struct mbuf *n, int len, int dstoff)
4811{
4812	struct mbuf *m;
4813	int count, space;
4814
4815	if (len > (MHLEN - dstoff))
4816		goto bad;
4817	MGET(m, M_DONTWAIT, n->m_type);
4818	if (m == NULL)
4819		goto bad;
4820	m->m_len = 0;
4821	if (n->m_flags & M_PKTHDR) {
4822		m_copy_pkthdr(m, n);
4823		n->m_flags &= ~M_PKTHDR;
4824	}
4825	m->m_data += dstoff;
4826	space = &m->m_dat[MLEN] - (m->m_data + m->m_len);
4827	do {
4828		count = min(min(max(len, max_protohdr), space), n->m_len);
4829		memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t),
4830		    (unsigned)count);
4831		len -= count;
4832		m->m_len += count;
4833		n->m_len -= count;
4834		space -= count;
4835		if (n->m_len)
4836			n->m_data += count;
4837		else
4838			n = m_free(n);
4839	} while (len > 0 && n);
4840	if (len > 0) {
4841		(void) m_free(m);
4842		goto bad;
4843	}
4844	m->m_next = n;
4845	return (m);
4846bad:
4847	m_freem(n);
4848	MSFail++;
4849	return (NULL);
4850}
4851
4852/*
4853 * Partition an mbuf chain in two pieces, returning the tail --
4854 * all but the first len0 bytes.  In case of failure, it returns NULL and
4855 * attempts to restore the chain to its original state.
4856 */
4857struct mbuf *
4858m_split(struct mbuf *m0, int len0, int wait)
4859{
4860	return (m_split0(m0, len0, wait, 1));
4861}
4862
4863static struct mbuf *
4864m_split0(struct mbuf *m0, int len0, int wait, int copyhdr)
4865{
4866	struct mbuf *m, *n;
4867	unsigned len = len0, remain;
4868
4869	for (m = m0; m && len > m->m_len; m = m->m_next)
4870		len -= m->m_len;
4871	if (m == NULL)
4872		return (NULL);
4873	remain = m->m_len - len;
4874	if (copyhdr && (m0->m_flags & M_PKTHDR)) {
4875		_MGETHDR(n, wait, m0->m_type);
4876		if (n == NULL)
4877			return (NULL);
4878		n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif;
4879		n->m_pkthdr.len = m0->m_pkthdr.len - len0;
4880		m0->m_pkthdr.len = len0;
4881		if (m->m_flags & M_EXT)
4882			goto extpacket;
4883		if (remain > MHLEN) {
4884			/* m can't be the lead packet */
4885			MH_ALIGN(n, 0);
4886			n->m_next = m_split(m, len, wait);
4887			if (n->m_next == NULL) {
4888				(void) m_free(n);
4889				return (NULL);
4890			} else
4891				return (n);
4892		} else
4893			MH_ALIGN(n, remain);
4894	} else if (remain == 0) {
4895		n = m->m_next;
4896		m->m_next = NULL;
4897		return (n);
4898	} else {
4899		_MGET(n, wait, m->m_type);
4900		if (n == NULL)
4901			return (NULL);
4902		M_ALIGN(n, remain);
4903	}
4904extpacket:
4905	if (m->m_flags & M_EXT) {
4906		n->m_flags |= M_EXT;
4907		n->m_ext = m->m_ext;
4908		m_incref(m);
4909		n->m_data = m->m_data + len;
4910	} else {
4911		bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain);
4912	}
4913	n->m_len = remain;
4914	m->m_len = len;
4915	n->m_next = m->m_next;
4916	m->m_next = NULL;
4917	return (n);
4918}
4919
4920/*
4921 * Routine to copy from device local memory into mbufs.
4922 */
4923struct mbuf *
4924m_devget(char *buf, int totlen, int off0, struct ifnet *ifp,
4925    void (*copy)(const void *, void *, size_t))
4926{
4927	struct mbuf *m;
4928	struct mbuf *top = NULL, **mp = &top;
4929	int off = off0, len;
4930	char *cp;
4931	char *epkt;
4932
4933	cp = buf;
4934	epkt = cp + totlen;
4935	if (off) {
4936		/*
4937		 * If 'off' is non-zero, packet is trailer-encapsulated,
4938		 * so we have to skip the type and length fields.
4939		 */
4940		cp += off + 2 * sizeof (u_int16_t);
4941		totlen -= 2 * sizeof (u_int16_t);
4942	}
4943	_MGETHDR(m, M_DONTWAIT, MT_DATA);
4944	if (m == NULL)
4945		return (NULL);
4946	m->m_pkthdr.rcvif = ifp;
4947	m->m_pkthdr.len = totlen;
4948	m->m_len = MHLEN;
4949
4950	while (totlen > 0) {
4951		if (top != NULL) {
4952			_MGET(m, M_DONTWAIT, MT_DATA);
4953			if (m == NULL) {
4954				m_freem(top);
4955				return (NULL);
4956			}
4957			m->m_len = MLEN;
4958		}
4959		len = MIN(totlen, epkt - cp);
4960		if (len >= MINCLSIZE) {
4961			MCLGET(m, M_DONTWAIT);
4962			if (m->m_flags & M_EXT) {
4963				m->m_len = len = MIN(len, m_maxsize(MC_CL));
4964			} else {
4965				/* give up when it's out of cluster mbufs */
4966				if (top != NULL)
4967					m_freem(top);
4968				m_freem(m);
4969				return (NULL);
4970			}
4971		} else {
4972			/*
4973			 * Place initial small packet/header at end of mbuf.
4974			 */
4975			if (len < m->m_len) {
4976				if (top == NULL &&
4977				    len + max_linkhdr <= m->m_len)
4978					m->m_data += max_linkhdr;
4979				m->m_len = len;
4980			} else {
4981				len = m->m_len;
4982			}
4983		}
4984		if (copy)
4985			copy(cp, MTOD(m, caddr_t), (unsigned)len);
4986		else
4987			bcopy(cp, MTOD(m, caddr_t), (unsigned)len);
4988		cp += len;
4989		*mp = m;
4990		mp = &m->m_next;
4991		totlen -= len;
4992		if (cp == epkt)
4993			cp = buf;
4994	}
4995	return (top);
4996}
4997
4998#ifndef MBUF_GROWTH_NORMAL_THRESH
4999#define	MBUF_GROWTH_NORMAL_THRESH 25
5000#endif
5001
5002/*
5003 * Cluster freelist allocation check.
5004 */
5005static int
5006m_howmany(int num, size_t bufsize)
5007{
5008	int i = 0, j = 0;
5009	u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters;
5010	u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree;
5011	u_int32_t sumclusters, freeclusters;
5012	u_int32_t percent_pool, percent_kmem;
5013	u_int32_t mb_growth, mb_growth_thresh;
5014
5015	VERIFY(bufsize == m_maxsize(MC_BIGCL) ||
5016	    bufsize == m_maxsize(MC_16KCL));
5017
5018	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5019
5020	/* Numbers in 2K cluster units */
5021	m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT;
5022	m_clusters = m_total(MC_CL);
5023	m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT;
5024	m_16kclusters = m_total(MC_16KCL);
5025	sumclusters = m_mbclusters + m_clusters + m_bigclusters;
5026
5027	m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT;
5028	m_clfree = m_infree(MC_CL);
5029	m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT;
5030	m_16kclfree = m_infree(MC_16KCL);
5031	freeclusters = m_mbfree + m_clfree + m_bigclfree;
5032
5033	/* Bail if we've maxed out the mbuf memory map */
5034	if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) ||
5035	    (njcl > 0 && bufsize == m_maxsize(MC_16KCL) &&
5036	    (m_16kclusters << NCLPJCLSHIFT) >= njcl)) {
5037		return (0);
5038	}
5039
5040	if (bufsize == m_maxsize(MC_BIGCL)) {
5041		/* Under minimum */
5042		if (m_bigclusters < m_minlimit(MC_BIGCL))
5043			return (m_minlimit(MC_BIGCL) - m_bigclusters);
5044
5045		percent_pool =
5046		    ((sumclusters - freeclusters) * 100) / sumclusters;
5047		percent_kmem = (sumclusters * 100) / nclusters;
5048
5049		/*
5050		 * If a light/normal user, grow conservatively (75%)
5051		 * If a heavy user, grow aggressively (50%)
5052		 */
5053		if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH)
5054			mb_growth = MB_GROWTH_NORMAL;
5055		else
5056			mb_growth = MB_GROWTH_AGGRESSIVE;
5057
5058		if (percent_kmem < 5) {
5059			/* For initial allocations */
5060			i = num;
5061		} else {
5062			/* Return if >= MBIGCL_LOWAT clusters available */
5063			if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT &&
5064			    m_total(MC_BIGCL) >=
5065			    MBIGCL_LOWAT + m_minlimit(MC_BIGCL))
5066				return (0);
5067
5068			/* Ensure at least num clusters are accessible */
5069			if (num >= m_infree(MC_BIGCL))
5070				i = num - m_infree(MC_BIGCL);
5071			if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL))
5072				j = num - (m_total(MC_BIGCL) -
5073				    m_minlimit(MC_BIGCL));
5074
5075			i = MAX(i, j);
5076
5077			/*
5078			 * Grow pool if percent_pool > 75 (normal growth)
5079			 * or percent_pool > 50 (aggressive growth).
5080			 */
5081			mb_growth_thresh = 100 - (100 / (1 << mb_growth));
5082			if (percent_pool > mb_growth_thresh)
5083				j = ((sumclusters + num) >> mb_growth) -
5084				    freeclusters;
5085			i = MAX(i, j);
5086		}
5087
5088		/* Check to ensure we didn't go over limits */
5089		if (i + m_bigclusters >= m_maxlimit(MC_BIGCL))
5090			i = m_maxlimit(MC_BIGCL) - m_bigclusters;
5091		if ((i << 1) + sumclusters >= nclusters)
5092			i = (nclusters - sumclusters) >> 1;
5093		VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL));
5094		VERIFY(sumclusters + (i << 1) <= nclusters);
5095
5096	} else { /* 16K CL */
5097		VERIFY(njcl > 0);
5098		/* Under minimum */
5099		if (m_16kclusters < MIN16KCL)
5100			return (MIN16KCL - m_16kclusters);
5101		if (m_16kclfree >= M16KCL_LOWAT)
5102			return (0);
5103
5104		/* Ensure at least num clusters are available */
5105		if (num >= m_16kclfree)
5106			i = num - m_16kclfree;
5107
5108		/* Always grow 16KCL pool aggressively */
5109		if (((m_16kclusters + num) >> 1) > m_16kclfree)
5110			j = ((m_16kclusters + num) >> 1) - m_16kclfree;
5111		i = MAX(i, j);
5112
5113		/* Check to ensure we don't go over limit */
5114		if (i + m_16kclusters >= m_maxlimit(MC_16KCL))
5115			i = m_maxlimit(MC_16KCL) - m_16kclusters;
5116		VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL));
5117	}
5118	return (i);
5119}
5120/*
5121 * Return the number of bytes in the mbuf chain, m.
5122 */
5123unsigned int
5124m_length(struct mbuf *m)
5125{
5126	struct mbuf *m0;
5127	unsigned int pktlen;
5128
5129	if (m->m_flags & M_PKTHDR)
5130		return (m->m_pkthdr.len);
5131
5132	pktlen = 0;
5133	for (m0 = m; m0 != NULL; m0 = m0->m_next)
5134		pktlen += m0->m_len;
5135	return (pktlen);
5136}
5137
5138/*
5139 * Copy data from a buffer back into the indicated mbuf chain,
5140 * starting "off" bytes from the beginning, extending the mbuf
5141 * chain if necessary.
5142 */
5143void
5144m_copyback(struct mbuf *m0, int off, int len, const void *cp)
5145{
5146#if DEBUG
5147	struct mbuf *origm = m0;
5148	int error;
5149#endif /* DEBUG */
5150
5151	if (m0 == NULL)
5152		return;
5153
5154#if DEBUG
5155	error =
5156#endif /* DEBUG */
5157	m_copyback0(&m0, off, len, cp,
5158	    M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT);
5159
5160#if DEBUG
5161	if (error != 0 || (m0 != NULL && origm != m0))
5162		panic("m_copyback");
5163#endif /* DEBUG */
5164}
5165
5166struct mbuf *
5167m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how)
5168{
5169	int error;
5170
5171	/* don't support chain expansion */
5172	VERIFY(off + len <= m_length(m0));
5173
5174	error = m_copyback0(&m0, off, len, cp,
5175	    M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how);
5176	if (error) {
5177		/*
5178		 * no way to recover from partial success.
5179		 * just free the chain.
5180		 */
5181		m_freem(m0);
5182		return (NULL);
5183	}
5184	return (m0);
5185}
5186
5187/*
5188 * m_makewritable: ensure the specified range writable.
5189 */
5190int
5191m_makewritable(struct mbuf **mp, int off, int len, int how)
5192{
5193	int error;
5194#if DEBUG
5195	struct mbuf *n;
5196	int origlen, reslen;
5197
5198	origlen = m_length(*mp);
5199#endif /* DEBUG */
5200
5201#if 0 /* M_COPYALL is large enough */
5202	if (len == M_COPYALL)
5203		len = m_length(*mp) - off; /* XXX */
5204#endif
5205
5206	error = m_copyback0(mp, off, len, NULL,
5207	    M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how);
5208
5209#if DEBUG
5210	reslen = 0;
5211	for (n = *mp; n; n = n->m_next)
5212		reslen += n->m_len;
5213	if (origlen != reslen)
5214		panic("m_makewritable: length changed");
5215	if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len)
5216		panic("m_makewritable: inconsist");
5217#endif /* DEBUG */
5218
5219	return (error);
5220}
5221
5222static int
5223m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags,
5224    int how)
5225{
5226	int mlen;
5227	struct mbuf *m, *n;
5228	struct mbuf **mp;
5229	int totlen = 0;
5230	const char *cp = vp;
5231
5232	VERIFY(mp0 != NULL);
5233	VERIFY(*mp0 != NULL);
5234	VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL);
5235	VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL);
5236
5237	/*
5238	 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW,
5239	 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive.
5240	 */
5241
5242	VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0);
5243
5244	mp = mp0;
5245	m = *mp;
5246	while (off > (mlen = m->m_len)) {
5247		off -= mlen;
5248		totlen += mlen;
5249		if (m->m_next == NULL) {
5250			int tspace;
5251extend:
5252			if (!(flags & M_COPYBACK0_EXTEND))
5253				goto out;
5254
5255			/*
5256			 * try to make some space at the end of "m".
5257			 */
5258
5259			mlen = m->m_len;
5260			if (off + len >= MINCLSIZE &&
5261			    !(m->m_flags & M_EXT) && m->m_len == 0) {
5262				MCLGET(m, how);
5263			}
5264			tspace = M_TRAILINGSPACE(m);
5265			if (tspace > 0) {
5266				tspace = MIN(tspace, off + len);
5267				VERIFY(tspace > 0);
5268				bzero(mtod(m, char *) + m->m_len,
5269				    MIN(off, tspace));
5270				m->m_len += tspace;
5271				off += mlen;
5272				totlen -= mlen;
5273				continue;
5274			}
5275
5276			/*
5277			 * need to allocate an mbuf.
5278			 */
5279
5280			if (off + len >= MINCLSIZE) {
5281				n = m_getcl(how, m->m_type, 0);
5282			} else {
5283				n = _M_GET(how, m->m_type);
5284			}
5285			if (n == NULL) {
5286				goto out;
5287			}
5288			n->m_len = 0;
5289			n->m_len = MIN(M_TRAILINGSPACE(n), off + len);
5290			bzero(mtod(n, char *), MIN(n->m_len, off));
5291			m->m_next = n;
5292		}
5293		mp = &m->m_next;
5294		m = m->m_next;
5295	}
5296	while (len > 0) {
5297		mlen = m->m_len - off;
5298		if (mlen != 0 && m_mclhasreference(m)) {
5299			char *datap;
5300			int eatlen;
5301
5302			/*
5303			 * this mbuf is read-only.
5304			 * allocate a new writable mbuf and try again.
5305			 */
5306
5307#if defined(DIAGNOSTIC)
5308			if (!(flags & M_COPYBACK0_COW))
5309				panic("m_copyback0: read-only");
5310#endif /* defined(DIAGNOSTIC) */
5311
5312			/*
5313			 * if we're going to write into the middle of
5314			 * a mbuf, split it first.
5315			 */
5316			if (off > 0 && len < mlen) {
5317				n = m_split0(m, off, how, 0);
5318				if (n == NULL)
5319					goto enobufs;
5320				m->m_next = n;
5321				mp = &m->m_next;
5322				m = n;
5323				off = 0;
5324				continue;
5325			}
5326
5327			/*
5328			 * XXX TODO coalesce into the trailingspace of
5329			 * the previous mbuf when possible.
5330			 */
5331
5332			/*
5333			 * allocate a new mbuf.  copy packet header if needed.
5334			 */
5335			n = _M_GET(how, m->m_type);
5336			if (n == NULL)
5337				goto enobufs;
5338			if (off == 0 && (m->m_flags & M_PKTHDR)) {
5339				M_COPY_PKTHDR(n, m);
5340				n->m_len = MHLEN;
5341			} else {
5342				if (len >= MINCLSIZE)
5343					MCLGET(n, M_DONTWAIT);
5344				n->m_len =
5345				    (n->m_flags & M_EXT) ? MCLBYTES : MLEN;
5346			}
5347			if (n->m_len > len)
5348				n->m_len = len;
5349
5350			/*
5351			 * free the region which has been overwritten.
5352			 * copying data from old mbufs if requested.
5353			 */
5354			if (flags & M_COPYBACK0_PRESERVE)
5355				datap = mtod(n, char *);
5356			else
5357				datap = NULL;
5358			eatlen = n->m_len;
5359			VERIFY(off == 0 || eatlen >= mlen);
5360			if (off > 0) {
5361				VERIFY(len >= mlen);
5362				m->m_len = off;
5363				m->m_next = n;
5364				if (datap) {
5365					m_copydata(m, off, mlen, datap);
5366					datap += mlen;
5367				}
5368				eatlen -= mlen;
5369				mp = &m->m_next;
5370				m = m->m_next;
5371			}
5372			while (m != NULL && m_mclhasreference(m) &&
5373			    n->m_type == m->m_type && eatlen > 0) {
5374				mlen = MIN(eatlen, m->m_len);
5375				if (datap) {
5376					m_copydata(m, 0, mlen, datap);
5377					datap += mlen;
5378				}
5379				m->m_data += mlen;
5380				m->m_len -= mlen;
5381				eatlen -= mlen;
5382				if (m->m_len == 0)
5383					*mp = m = m_free(m);
5384			}
5385			if (eatlen > 0)
5386				n->m_len -= eatlen;
5387			n->m_next = m;
5388			*mp = m = n;
5389			continue;
5390		}
5391		mlen = MIN(mlen, len);
5392		if (flags & M_COPYBACK0_COPYBACK) {
5393			bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen);
5394			cp += mlen;
5395		}
5396		len -= mlen;
5397		mlen += off;
5398		off = 0;
5399		totlen += mlen;
5400		if (len == 0)
5401			break;
5402		if (m->m_next == NULL) {
5403			goto extend;
5404		}
5405		mp = &m->m_next;
5406		m = m->m_next;
5407	}
5408out:
5409	if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) {
5410		VERIFY(flags & M_COPYBACK0_EXTEND);
5411		m->m_pkthdr.len = totlen;
5412	}
5413
5414	return (0);
5415
5416enobufs:
5417	return (ENOBUFS);
5418}
5419
5420char *
5421mcl_to_paddr(char *addr)
5422{
5423	vm_offset_t base_phys;
5424
5425	if (!MBUF_IN_MAP(addr))
5426		return (NULL);
5427	base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT];
5428
5429	if (base_phys == 0)
5430		return (NULL);
5431	return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET)));
5432}
5433
5434/*
5435 * Dup the mbuf chain passed in.  The whole thing.  No cute additional cruft.
5436 * And really copy the thing.  That way, we don't "precompute" checksums
5437 * for unsuspecting consumers.  Assumption: m->m_nextpkt == 0.  Trick: for
5438 * small packets, don't dup into a cluster.  That way received  packets
5439 * don't take up too much room in the sockbuf (cf. sbspace()).
5440 */
5441int MDFail;
5442
5443struct mbuf *
5444m_dup(struct mbuf *m, int how)
5445{
5446	struct mbuf *n, **np;
5447	struct mbuf *top;
5448	int copyhdr = 0;
5449
5450	np = &top;
5451	top = NULL;
5452	if (m->m_flags & M_PKTHDR)
5453		copyhdr = 1;
5454
5455	/*
5456	 * Quick check: if we have one mbuf and its data fits in an
5457	 *  mbuf with packet header, just copy and go.
5458	 */
5459	if (m->m_next == NULL) {
5460		/* Then just move the data into an mbuf and be done... */
5461		if (copyhdr) {
5462			if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) {
5463				if ((n = _M_GETHDR(how, m->m_type)) == NULL)
5464					return (NULL);
5465				n->m_len = m->m_len;
5466				m_dup_pkthdr(n, m, how);
5467				bcopy(m->m_data, n->m_data, m->m_len);
5468				return (n);
5469			}
5470		} else if (m->m_len <= MLEN) {
5471			if ((n = _M_GET(how, m->m_type)) == NULL)
5472				return (NULL);
5473			bcopy(m->m_data, n->m_data, m->m_len);
5474			n->m_len = m->m_len;
5475			return (n);
5476		}
5477	}
5478	while (m != NULL) {
5479#if BLUE_DEBUG
5480		kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len,
5481		    m->m_data);
5482#endif
5483		if (copyhdr)
5484			n = _M_GETHDR(how, m->m_type);
5485		else
5486			n = _M_GET(how, m->m_type);
5487		if (n == NULL)
5488			goto nospace;
5489		if (m->m_flags & M_EXT) {
5490			if (m->m_len <= m_maxsize(MC_CL))
5491				MCLGET(n, how);
5492			else if (m->m_len <= m_maxsize(MC_BIGCL))
5493				n = m_mbigget(n, how);
5494			else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0)
5495				n = m_m16kget(n, how);
5496			if (!(n->m_flags & M_EXT)) {
5497				(void) m_free(n);
5498				goto nospace;
5499			}
5500		}
5501		*np = n;
5502		if (copyhdr) {
5503			/* Don't use M_COPY_PKTHDR: preserve m_data */
5504			m_dup_pkthdr(n, m, how);
5505			copyhdr = 0;
5506			if (!(n->m_flags & M_EXT))
5507				n->m_data = n->m_pktdat;
5508		}
5509		n->m_len = m->m_len;
5510		/*
5511		 * Get the dup on the same bdry as the original
5512		 * Assume that the two mbufs have the same offset to data area
5513		 * (up to word boundaries)
5514		 */
5515		bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len);
5516		m = m->m_next;
5517		np = &n->m_next;
5518#if BLUE_DEBUG
5519		kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len,
5520		    n->m_data);
5521#endif
5522	}
5523
5524	if (top == NULL)
5525		MDFail++;
5526	return (top);
5527
5528nospace:
5529	m_freem(top);
5530	MDFail++;
5531	return (NULL);
5532}
5533
5534#define	MBUF_MULTIPAGES(m)						\
5535	(((m)->m_flags & M_EXT) &&					\
5536	((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) ||	\
5537	(!IS_P2ALIGNED((m)->m_data, NBPG) &&				\
5538	P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len))))
5539
5540static struct mbuf *
5541m_expand(struct mbuf *m, struct mbuf **last)
5542{
5543	struct mbuf *top = NULL;
5544	struct mbuf **nm = &top;
5545	uintptr_t data0, data;
5546	unsigned int len0, len;
5547
5548	VERIFY(MBUF_MULTIPAGES(m));
5549	VERIFY(m->m_next == NULL);
5550	data0 = (uintptr_t)m->m_data;
5551	len0 = m->m_len;
5552	*last = top;
5553
5554	for (;;) {
5555		struct mbuf *n;
5556
5557		data = data0;
5558		if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG)
5559			len = NBPG;
5560		else if (!IS_P2ALIGNED(data, NBPG) &&
5561		    P2ROUNDUP(data, NBPG) < (data + len0))
5562			len = P2ROUNDUP(data, NBPG) - data;
5563		else
5564			len = len0;
5565
5566		VERIFY(len > 0);
5567		VERIFY(m->m_flags & M_EXT);
5568		m->m_data = (void *)data;
5569		m->m_len = len;
5570
5571		*nm = *last = m;
5572		nm = &m->m_next;
5573		m->m_next = NULL;
5574
5575		data0 += len;
5576		len0 -= len;
5577		if (len0 == 0)
5578			break;
5579
5580		n = _M_RETRY(M_DONTWAIT, MT_DATA);
5581		if (n == NULL) {
5582			m_freem(top);
5583			top = *last = NULL;
5584			break;
5585		}
5586
5587		n->m_ext = m->m_ext;
5588		m_incref(m);
5589		n->m_flags |= M_EXT;
5590		m = n;
5591	}
5592	return (top);
5593}
5594
5595struct mbuf *
5596m_normalize(struct mbuf *m)
5597{
5598	struct mbuf *top = NULL;
5599	struct mbuf **nm = &top;
5600	boolean_t expanded = FALSE;
5601
5602	while (m != NULL) {
5603		struct mbuf *n;
5604
5605		n = m->m_next;
5606		m->m_next = NULL;
5607
5608		/* Does the data cross one or more page boundaries? */
5609		if (MBUF_MULTIPAGES(m)) {
5610			struct mbuf *last;
5611			if ((m = m_expand(m, &last)) == NULL) {
5612				m_freem(n);
5613				m_freem(top);
5614				top = NULL;
5615				break;
5616			}
5617			*nm = m;
5618			nm = &last->m_next;
5619			expanded = TRUE;
5620		} else {
5621			*nm = m;
5622			nm = &m->m_next;
5623		}
5624		m = n;
5625	}
5626	if (expanded)
5627		atomic_add_32(&mb_normalized, 1);
5628	return (top);
5629}
5630
5631/*
5632 * Append the specified data to the indicated mbuf chain,
5633 * Extend the mbuf chain if the new data does not fit in
5634 * existing space.
5635 *
5636 * Return 1 if able to complete the job; otherwise 0.
5637 */
5638int
5639m_append(struct mbuf *m0, int len, caddr_t cp)
5640{
5641	struct mbuf *m, *n;
5642	int remainder, space;
5643
5644	for (m = m0; m->m_next != NULL; m = m->m_next)
5645		;
5646	remainder = len;
5647	space = M_TRAILINGSPACE(m);
5648	if (space > 0) {
5649		/*
5650		 * Copy into available space.
5651		 */
5652		if (space > remainder)
5653			space = remainder;
5654		bcopy(cp, mtod(m, caddr_t) + m->m_len, space);
5655		m->m_len += space;
5656		cp += space, remainder -= space;
5657	}
5658	while (remainder > 0) {
5659		/*
5660		 * Allocate a new mbuf; could check space
5661		 * and allocate a cluster instead.
5662		 */
5663		n = m_get(M_WAITOK, m->m_type);
5664		if (n == NULL)
5665			break;
5666		n->m_len = min(MLEN, remainder);
5667		bcopy(cp, mtod(n, caddr_t), n->m_len);
5668		cp += n->m_len;
5669		remainder -= n->m_len;
5670		m->m_next = n;
5671		m = n;
5672	}
5673	if (m0->m_flags & M_PKTHDR)
5674		m0->m_pkthdr.len += len - remainder;
5675	return (remainder == 0);
5676}
5677
5678struct mbuf *
5679m_last(struct mbuf *m)
5680{
5681	while (m->m_next != NULL)
5682		m = m->m_next;
5683	return (m);
5684}
5685
5686unsigned int
5687m_fixhdr(struct mbuf *m0)
5688{
5689	u_int len;
5690
5691	len = m_length2(m0, NULL);
5692	m0->m_pkthdr.len = len;
5693	return (len);
5694}
5695
5696unsigned int
5697m_length2(struct mbuf *m0, struct mbuf **last)
5698{
5699	struct mbuf *m;
5700	u_int len;
5701
5702	len = 0;
5703	for (m = m0; m != NULL; m = m->m_next) {
5704		len += m->m_len;
5705		if (m->m_next == NULL)
5706			break;
5707	}
5708	if (last != NULL)
5709		*last = m;
5710	return (len);
5711}
5712
5713/*
5714 * Defragment a mbuf chain, returning the shortest possible chain of mbufs
5715 * and clusters.  If allocation fails and this cannot be completed, NULL will
5716 * be returned, but the passed in chain will be unchanged.  Upon success,
5717 * the original chain will be freed, and the new chain will be returned.
5718 *
5719 * If a non-packet header is passed in, the original mbuf (chain?) will
5720 * be returned unharmed.
5721 *
5722 * If offset is specfied, the first mbuf in the chain will have a leading
5723 * space of the amount stated by the "off" parameter.
5724 *
5725 * This routine requires that the m_pkthdr.header field of the original
5726 * mbuf chain is cleared by the caller.
5727 */
5728struct mbuf *
5729m_defrag_offset(struct mbuf *m0, u_int32_t off, int how)
5730{
5731	struct mbuf *m_new = NULL, *m_final = NULL;
5732	int progress = 0, length, pktlen;
5733
5734	if (!(m0->m_flags & M_PKTHDR))
5735		return (m0);
5736
5737	VERIFY(off < MHLEN);
5738	m_fixhdr(m0); /* Needed sanity check */
5739
5740	pktlen = m0->m_pkthdr.len + off;
5741	if (pktlen > MHLEN)
5742		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
5743	else
5744		m_final = m_gethdr(how, MT_DATA);
5745
5746	if (m_final == NULL)
5747		goto nospace;
5748
5749	if (off > 0) {
5750		pktlen -= off;
5751		m_final->m_len -= off;
5752		m_final->m_data += off;
5753	}
5754
5755	/*
5756	 * Caller must have handled the contents pointed to by this
5757	 * pointer before coming here, as otherwise it will point to
5758	 * the original mbuf which will get freed upon success.
5759	 */
5760	VERIFY(m0->m_pkthdr.header == NULL);
5761
5762	if (m_dup_pkthdr(m_final, m0, how) == 0)
5763		goto nospace;
5764
5765	m_new = m_final;
5766
5767	while (progress < pktlen) {
5768		length = pktlen - progress;
5769		if (length > MCLBYTES)
5770			length = MCLBYTES;
5771
5772		if (m_new == NULL) {
5773			if (length > MLEN)
5774				m_new = m_getcl(how, MT_DATA, 0);
5775			else
5776				m_new = m_get(how, MT_DATA);
5777			if (m_new == NULL)
5778				goto nospace;
5779		}
5780
5781		m_copydata(m0, progress, length, mtod(m_new, caddr_t));
5782		progress += length;
5783		m_new->m_len = length;
5784		if (m_new != m_final)
5785			m_cat(m_final, m_new);
5786		m_new = NULL;
5787	}
5788	m_freem(m0);
5789	m0 = m_final;
5790	return (m0);
5791nospace:
5792	if (m_final)
5793		m_freem(m_final);
5794	return (NULL);
5795}
5796
5797struct mbuf *
5798m_defrag(struct mbuf *m0, int how)
5799{
5800	return (m_defrag_offset(m0, 0, how));
5801}
5802
5803void
5804m_mchtype(struct mbuf *m, int t)
5805{
5806	mtype_stat_inc(t);
5807	mtype_stat_dec(m->m_type);
5808	(m)->m_type = t;
5809}
5810
5811void *
5812m_mtod(struct mbuf *m)
5813{
5814	return (MTOD(m, void *));
5815}
5816
5817struct mbuf *
5818m_dtom(void *x)
5819{
5820	return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1)));
5821}
5822
5823void
5824m_mcheck(struct mbuf *m)
5825{
5826	_MCHECK(m);
5827}
5828
5829/*
5830 * Return a pointer to mbuf/offset of location in mbuf chain.
5831 */
5832struct mbuf *
5833m_getptr(struct mbuf *m, int loc, int *off)
5834{
5835
5836	while (loc >= 0) {
5837		/* Normal end of search. */
5838		if (m->m_len > loc) {
5839			*off = loc;
5840			return (m);
5841		} else {
5842			loc -= m->m_len;
5843			if (m->m_next == NULL) {
5844				if (loc == 0) {
5845					/* Point at the end of valid data. */
5846					*off = m->m_len;
5847					return (m);
5848				}
5849				return (NULL);
5850			}
5851			m = m->m_next;
5852		}
5853	}
5854	return (NULL);
5855}
5856
5857/*
5858 * Inform the corresponding mcache(s) that there's a waiter below.
5859 */
5860static void
5861mbuf_waiter_inc(mbuf_class_t class, boolean_t comp)
5862{
5863	mcache_waiter_inc(m_cache(class));
5864	if (comp) {
5865		if (class == MC_CL) {
5866			mcache_waiter_inc(m_cache(MC_MBUF_CL));
5867		} else if (class == MC_BIGCL) {
5868			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5869		} else if (class == MC_16KCL) {
5870			mcache_waiter_inc(m_cache(MC_MBUF_16KCL));
5871		} else {
5872			mcache_waiter_inc(m_cache(MC_MBUF_CL));
5873			mcache_waiter_inc(m_cache(MC_MBUF_BIGCL));
5874		}
5875	}
5876}
5877
5878/*
5879 * Inform the corresponding mcache(s) that there's no more waiter below.
5880 */
5881static void
5882mbuf_waiter_dec(mbuf_class_t class, boolean_t comp)
5883{
5884	mcache_waiter_dec(m_cache(class));
5885	if (comp) {
5886		if (class == MC_CL) {
5887			mcache_waiter_dec(m_cache(MC_MBUF_CL));
5888		} else if (class == MC_BIGCL) {
5889			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5890		} else if (class == MC_16KCL) {
5891			mcache_waiter_dec(m_cache(MC_MBUF_16KCL));
5892		} else {
5893			mcache_waiter_dec(m_cache(MC_MBUF_CL));
5894			mcache_waiter_dec(m_cache(MC_MBUF_BIGCL));
5895		}
5896	}
5897}
5898
5899/*
5900 * Called during slab (blocking and non-blocking) allocation.  If there
5901 * is at least one waiter, and the time since the first waiter is blocked
5902 * is greater than the watchdog timeout, panic the system.
5903 */
5904static void
5905mbuf_watchdog(void)
5906{
5907	struct timeval now;
5908	unsigned int since;
5909
5910	if (mb_waiters == 0 || !mb_watchdog)
5911		return;
5912
5913	microuptime(&now);
5914	since = now.tv_sec - mb_wdtstart.tv_sec;
5915	if (since >= MB_WDT_MAXTIME) {
5916		panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__,
5917		    mb_waiters, since, mbuf_dump());
5918		/* NOTREACHED */
5919	}
5920}
5921
5922/*
5923 * Called during blocking allocation.  Returns TRUE if one or more objects
5924 * are available at the per-CPU caches layer and that allocation should be
5925 * retried at that level.
5926 */
5927static boolean_t
5928mbuf_sleep(mbuf_class_t class, unsigned int num, int wait)
5929{
5930	boolean_t mcache_retry = FALSE;
5931
5932	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
5933
5934	/* Check if there's anything at the cache layer */
5935	if (mbuf_cached_above(class, wait)) {
5936		mcache_retry = TRUE;
5937		goto done;
5938	}
5939
5940	/* Nothing?  Then try hard to get it from somewhere */
5941	m_reclaim(class, num, (wait & MCR_COMP));
5942
5943	/* We tried hard and got something? */
5944	if (m_infree(class) > 0) {
5945		mbstat.m_wait++;
5946		goto done;
5947	} else if (mbuf_cached_above(class, wait)) {
5948		mbstat.m_wait++;
5949		mcache_retry = TRUE;
5950		goto done;
5951	} else if (wait & MCR_TRYHARD) {
5952		mcache_retry = TRUE;
5953		goto done;
5954	}
5955
5956	/*
5957	 * There's really nothing for us right now; inform the
5958	 * cache(s) that there is a waiter below and go to sleep.
5959	 */
5960	mbuf_waiter_inc(class, (wait & MCR_COMP));
5961
5962	VERIFY(!(wait & MCR_NOSLEEP));
5963
5964	/*
5965	 * If this is the first waiter, arm the watchdog timer.  Otherwise
5966	 * check if we need to panic the system due to watchdog timeout.
5967	 */
5968	if (mb_waiters == 0)
5969		microuptime(&mb_wdtstart);
5970	else
5971		mbuf_watchdog();
5972
5973	mb_waiters++;
5974	(void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL);
5975
5976	/* We are now up; stop getting notified until next round */
5977	mbuf_waiter_dec(class, (wait & MCR_COMP));
5978
5979	/* We waited and got something */
5980	if (m_infree(class) > 0) {
5981		mbstat.m_wait++;
5982		goto done;
5983	} else if (mbuf_cached_above(class, wait)) {
5984		mbstat.m_wait++;
5985		mcache_retry = TRUE;
5986	}
5987done:
5988	return (mcache_retry);
5989}
5990
5991static void
5992mbuf_worker_thread(void)
5993{
5994	int mbuf_expand;
5995
5996	while (1) {
5997		lck_mtx_lock(mbuf_mlock);
5998
5999		mbuf_expand = 0;
6000		if (mbuf_expand_mcl) {
6001			int n;
6002
6003			/* Adjust to current number of cluster in use */
6004			n = mbuf_expand_mcl -
6005			    (m_total(MC_CL) - m_infree(MC_CL));
6006			if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL))
6007				n = m_maxlimit(MC_CL) - m_total(MC_CL);
6008			mbuf_expand_mcl = 0;
6009
6010			if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0)
6011				mbuf_expand++;
6012		}
6013		if (mbuf_expand_big) {
6014			int n;
6015
6016			/* Adjust to current number of 4 KB cluster in use */
6017			n = mbuf_expand_big -
6018			    (m_total(MC_BIGCL) - m_infree(MC_BIGCL));
6019			if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL))
6020				n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL);
6021			mbuf_expand_big = 0;
6022
6023			if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0)
6024				mbuf_expand++;
6025		}
6026		if (mbuf_expand_16k) {
6027			int n;
6028
6029			/* Adjust to current number of 16 KB cluster in use */
6030			n = mbuf_expand_16k -
6031			    (m_total(MC_16KCL) - m_infree(MC_16KCL));
6032			if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL))
6033				n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL);
6034			mbuf_expand_16k = 0;
6035
6036			if (n > 0)
6037				(void) freelist_populate(MC_16KCL, n, M_WAIT);
6038		}
6039
6040		/*
6041		 * Because we can run out of memory before filling the mbuf
6042		 * map, we should not allocate more clusters than they are
6043		 * mbufs -- otherwise we could have a large number of useless
6044		 * clusters allocated.
6045		 */
6046		if (mbuf_expand) {
6047			while (m_total(MC_MBUF) <
6048			    (m_total(MC_BIGCL) + m_total(MC_CL))) {
6049				if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0)
6050					break;
6051			}
6052		}
6053
6054		lck_mtx_unlock(mbuf_mlock);
6055
6056		assert_wait(&mbuf_worker_run, THREAD_UNINT);
6057		(void) thread_block((thread_continue_t)mbuf_worker_thread);
6058	}
6059}
6060
6061static void
6062mbuf_worker_thread_init(void)
6063{
6064	mbuf_worker_ready++;
6065	mbuf_worker_thread();
6066}
6067
6068static mcl_slab_t *
6069slab_get(void *buf)
6070{
6071	mcl_slabg_t *slg;
6072	unsigned int ix, k;
6073
6074	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);
6075
6076	VERIFY(MBUF_IN_MAP(buf));
6077	ix = ((char *)buf - (char *)mbutl) >> MBSHIFT;
6078	VERIFY(ix < maxslabgrp);
6079
6080	if ((slg = slabstbl[ix]) == NULL) {
6081		/*
6082		 * In the current implementation, we never shrink the memory
6083		 * pool (hence the cluster map); if we attempt to reallocate
6084		 * a cluster group when it's already allocated, panic since
6085		 * this is a sign of a memory corruption (slabstbl[ix] got
6086		 * nullified).  This also means that there shouldn't be any
6087		 * hole in the kernel sub-map for the mbuf pool.
6088		 */
6089		++slabgrp;
6090		VERIFY(ix < slabgrp);
6091		/*
6092		 * Slabs expansion can only be done single threaded; when
6093		 * we get here, it must be as a result of m_clalloc() which
6094		 * is serialized and therefore mb_clalloc_busy must be set.
6095		 */
6096		VERIFY(mb_clalloc_busy);
6097		lck_mtx_unlock(mbuf_mlock);
6098
6099		/* This is a new buffer; create the slabs group for it */
6100		MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP,
6101		    M_WAITOK | M_ZERO);
6102		VERIFY(slg != NULL);
6103
6104		lck_mtx_lock(mbuf_mlock);
6105		/*
6106		 * No other thread could have gone into m_clalloc() after
6107		 * we dropped the lock above, so verify that it's true.
6108		 */
6109		VERIFY(mb_clalloc_busy);
6110
6111		slabstbl[ix] = slg;
6112
6113		/* Chain each slab in the group to its forward neighbor */
6114		for (k = 1; k < NSLABSPMB; k++)
6115			slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k];
6116		VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL);
6117
6118		/* And chain the last slab in the previous group to this */
6119		if (ix > 0) {
6120			VERIFY(slabstbl[ix - 1]->
6121			    slg_slab[NSLABSPMB - 1].sl_next == NULL);
6122			slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next =
6123			    &slg->slg_slab[0];
6124		}
6125	}
6126
6127	ix = MTOBG(buf) % NSLABSPMB;
6128	VERIFY(ix < NSLABSPMB);
6129
6130	return (&slg->slg_slab[ix]);
6131}
6132
6133static void
6134slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags,
6135    void *base, void *head, unsigned int len, int refcnt, int chunks)
6136{
6137	sp->sl_class = class;
6138	sp->sl_flags = flags;
6139	sp->sl_base = base;
6140	sp->sl_head = head;
6141	sp->sl_len = len;
6142	sp->sl_refcnt = refcnt;
6143	sp->sl_chunks = chunks;
6144	slab_detach(sp);
6145}
6146
6147static void
6148slab_insert(mcl_slab_t *sp, mbuf_class_t class)
6149{
6150	VERIFY(slab_is_detached(sp));
6151	m_slab_cnt(class)++;
6152	TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link);
6153	sp->sl_flags &= ~SLF_DETACHED;
6154	if (class == MC_16KCL) {
6155		int k;
6156		for (k = 1; k < NSLABSP16KB; k++) {
6157			sp = sp->sl_next;
6158			/* Next slab must already be present */
6159			VERIFY(sp != NULL);
6160			VERIFY(slab_is_detached(sp));
6161			sp->sl_flags &= ~SLF_DETACHED;
6162		}
6163	}
6164}
6165
6166static void
6167slab_remove(mcl_slab_t *sp, mbuf_class_t class)
6168{
6169	VERIFY(!slab_is_detached(sp));
6170	VERIFY(m_slab_cnt(class) > 0);
6171	m_slab_cnt(class)--;
6172	TAILQ_REMOVE(&m_slablist(class), sp, sl_link);
6173	slab_detach(sp);
6174	if (class == MC_16KCL) {
6175		int k;
6176		for (k = 1; k < NSLABSP16KB; k++) {
6177			sp = sp->sl_next;
6178			/* Next slab must already be present */
6179			VERIFY(sp != NULL);
6180			VERIFY(!slab_is_detached(sp));
6181			slab_detach(sp);
6182		}
6183	}
6184}
6185
6186static boolean_t
6187slab_inrange(mcl_slab_t *sp, void *buf)
6188{
6189	return ((uintptr_t)buf >= (uintptr_t)sp->sl_base &&
6190	    (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len));
6191}
6192
6193#undef panic
6194
6195static void
6196slab_nextptr_panic(mcl_slab_t *sp, void *addr)
6197{
6198	int i;
6199	unsigned int chunk_len = sp->sl_len / sp->sl_chunks;
6200	uintptr_t buf = (uintptr_t)sp->sl_base;
6201
6202	for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) {
6203		void *next = ((mcache_obj_t *)buf)->obj_next;
6204		if (next != addr)
6205			continue;
6206		if (!mclverify) {
6207			if (next != NULL && !MBUF_IN_MAP(next)) {
6208				mcache_t *cp = m_cache(sp->sl_class);
6209				panic("%s: %s buffer %p in slab %p modified "
6210				    "after free at offset 0: %p out of range "
6211				    "[%p-%p)\n", __func__, cp->mc_name,
6212				    (void *)buf, sp, next, mbutl, embutl);
6213				/* NOTREACHED */
6214			}
6215		} else {
6216			mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class,
6217			    (mcache_obj_t *)buf);
6218			mcl_audit_verify_nextptr(next, mca);
6219		}
6220	}
6221}
6222
6223static void
6224slab_detach(mcl_slab_t *sp)
6225{
6226	sp->sl_link.tqe_next = (mcl_slab_t *)-1;
6227	sp->sl_link.tqe_prev = (mcl_slab_t **)-1;
6228	sp->sl_flags |= SLF_DETACHED;
6229}
6230
6231static boolean_t
6232slab_is_detached(mcl_slab_t *sp)
6233{
6234	return ((intptr_t)sp->sl_link.tqe_next == -1 &&
6235	    (intptr_t)sp->sl_link.tqe_prev == -1 &&
6236	    (sp->sl_flags & SLF_DETACHED));
6237}
6238
6239static void
6240mcl_audit_init(void *buf, mcache_audit_t **mca_list,
6241    mcache_obj_t **con_list, size_t con_size, unsigned int num)
6242{
6243	mcache_audit_t *mca, *mca_tail;
6244	mcache_obj_t *con = NULL;
6245	boolean_t save_contents = (con_list != NULL);
6246	unsigned int i, ix;
6247
6248	ASSERT(num <= NMBPBG);
6249	ASSERT(con_list == NULL || con_size != 0);
6250
6251	ix = MTOBG(buf);
6252	VERIFY(ix < maxclaudit);
6253
6254	/* Make sure we haven't been here before */
6255	for (i = 0; i < NMBPBG; i++)
6256		VERIFY(mclaudit[ix].cl_audit[i] == NULL);
6257
6258	mca = mca_tail = *mca_list;
6259	if (save_contents)
6260		con = *con_list;
6261
6262	for (i = 0; i < num; i++) {
6263		mcache_audit_t *next;
6264
6265		next = mca->mca_next;
6266		bzero(mca, sizeof (*mca));
6267		mca->mca_next = next;
6268		mclaudit[ix].cl_audit[i] = mca;
6269
6270		/* Attach the contents buffer if requested */
6271		if (save_contents) {
6272			VERIFY(con != NULL);
6273			mca->mca_contents_size = con_size;
6274			mca->mca_contents = con;
6275			con = con->obj_next;
6276			bzero(mca->mca_contents, mca->mca_contents_size);
6277		}
6278
6279		mca_tail = mca;
6280		mca = mca->mca_next;
6281	}
6282
6283	if (save_contents)
6284		*con_list = con;
6285
6286	*mca_list = mca_tail->mca_next;
6287	mca_tail->mca_next = NULL;
6288}
6289
6290/*
6291 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return
6292 * the corresponding audit structure for that buffer.
6293 */
6294static mcache_audit_t *
6295mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o)
6296{
6297	mcache_audit_t *mca = NULL;
6298	int ix = MTOBG(o);
6299
6300	VERIFY(ix < maxclaudit);
6301	VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG)));
6302
6303	switch (class) {
6304	case MC_MBUF:
6305		/*
6306		 * For the mbuf case, find the index of the page
6307		 * used by the mbuf and use that index to locate the
6308		 * base address of the page.  Then find out the
6309		 * mbuf index relative to the page base and use
6310		 * it to locate the audit structure.
6311		 */
6312		VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG);
6313		mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)];
6314		break;
6315
6316	case MC_CL:
6317		/*
6318		 * Same thing as above, but for 2KB clusters in a page.
6319		 */
6320		VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG);
6321		mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)];
6322		break;
6323
6324	case MC_BIGCL:
6325	case MC_16KCL:
6326		/*
6327		 * Same as above, but only return the first element.
6328		 */
6329		mca = mclaudit[ix].cl_audit[0];
6330		break;
6331
6332	default:
6333		VERIFY(0);
6334		/* NOTREACHED */
6335	}
6336
6337	return (mca);
6338}
6339
6340static void
6341mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite,
6342    boolean_t alloc)
6343{
6344	struct mbuf *m = addr;
6345	mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next;
6346
6347	VERIFY(mca->mca_contents != NULL &&
6348	    mca->mca_contents_size == AUDIT_CONTENTS_SIZE);
6349
6350	if (mclverify)
6351		mcl_audit_verify_nextptr(next, mca);
6352
6353	if (!alloc) {
6354		/* Save constructed mbuf fields */
6355		mcl_audit_save_mbuf(m, mca);
6356		if (mclverify) {
6357			mcache_set_pattern(MCACHE_FREE_PATTERN, m,
6358			    m_maxsize(MC_MBUF));
6359		}
6360		((mcache_obj_t *)m)->obj_next = next;
6361		return;
6362	}
6363
6364	/* Check if the buffer has been corrupted while in freelist */
6365	if (mclverify) {
6366		mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF));
6367	}
6368	/* Restore constructed mbuf fields */
6369	mcl_audit_restore_mbuf(m, mca, composite);
6370}
6371
6372static void
6373mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite)
6374{
6375	struct mbuf *ms = (struct mbuf *)mca->mca_contents;
6376
6377	if (composite) {
6378		struct mbuf *next = m->m_next;
6379		VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL &&
6380		    MBUF_IS_COMPOSITE(ms));
6381		/*
6382		 * We could have hand-picked the mbuf fields and restore
6383		 * them individually, but that will be a maintenance
6384		 * headache.  Instead, restore everything that was saved;
6385		 * the mbuf layer will recheck and reinitialize anyway.
6386		 */
6387		bcopy(ms, m, mca->mca_contents_size);
6388		m->m_next = next;
6389	} else {
6390		/*
6391		 * For a regular mbuf (no cluster attached) there's nothing
6392		 * to restore other than the type field, which is expected
6393		 * to be MT_FREE.
6394		 */
6395		m->m_type = ms->m_type;
6396	}
6397	_MCHECK(m);
6398}
6399
6400static void
6401mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca)
6402{
6403	_MCHECK(m);
6404	bcopy(m, mca->mca_contents, mca->mca_contents_size);
6405}
6406
6407static void
6408mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc,
6409    boolean_t save_next)
6410{
6411	mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next;
6412
6413	if (!alloc) {
6414		if (mclverify) {
6415			mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size);
6416		}
6417		if (save_next) {
6418			mcl_audit_verify_nextptr(next, mca);
6419			((mcache_obj_t *)addr)->obj_next = next;
6420		}
6421	} else if (mclverify) {
6422		/* Check if the buffer has been corrupted while in freelist */
6423		mcl_audit_verify_nextptr(next, mca);
6424		mcache_audit_free_verify_set(mca, addr, 0, size);
6425	}
6426}
6427
6428static void
6429mcl_audit_mcheck_panic(struct mbuf *m)
6430{
6431	mcache_audit_t *mca;
6432
6433	MRANGE(m);
6434	mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m);
6435
6436	panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n",
6437	    m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca));
6438	/* NOTREACHED */
6439}
6440
6441static void
6442mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca)
6443{
6444	if (next != NULL && !MBUF_IN_MAP(next) &&
6445	    (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) {
6446		panic("mcl_audit: buffer %p modified after free at offset 0: "
6447		    "%p out of range [%p-%p)\n%s\n",
6448		    mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca));
6449		/* NOTREACHED */
6450	}
6451}
6452
6453/* This function turns on mbuf leak detection */
6454static void
6455mleak_activate(void)
6456{
6457	mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR;
6458	PE_parse_boot_argn("mleak_sample_factor",
6459	    &mleak_table.mleak_sample_factor,
6460	    sizeof (mleak_table.mleak_sample_factor));
6461
6462	if (mleak_table.mleak_sample_factor == 0)
6463		mclfindleak = 0;
6464
6465	if (mclfindleak == 0)
6466		return;
6467
6468	vm_size_t alloc_size =
6469	    mleak_alloc_buckets * sizeof (struct mallocation);
6470	vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace);
6471
6472	MALLOC(mleak_allocations, struct mallocation *, alloc_size,
6473	    M_TEMP, M_WAITOK | M_ZERO);
6474	VERIFY(mleak_allocations != NULL);
6475
6476	MALLOC(mleak_traces, struct mtrace *, trace_size,
6477	    M_TEMP, M_WAITOK | M_ZERO);
6478	VERIFY(mleak_traces != NULL);
6479
6480	MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES),
6481	    M_TEMP, M_WAITOK | M_ZERO);
6482	VERIFY(mleak_stat != NULL);
6483	mleak_stat->ml_cnt = MLEAK_NUM_TRACES;
6484#ifdef __LP64__
6485	mleak_stat->ml_isaddr64 = 1;
6486#endif /* __LP64__ */
6487}
6488
6489static void
6490mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc)
6491{
6492	int temp;
6493
6494	if (mclfindleak == 0)
6495		return;
6496
6497	if (!alloc)
6498		return (mleak_free(addr));
6499
6500	temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1);
6501
6502	if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) {
6503		uintptr_t bt[MLEAK_STACK_DEPTH];
6504		int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH);
6505		mleak_log(bt, addr, logged, num);
6506	}
6507}
6508
6509/*
6510 * This function records the allocation in the mleak_allocations table
6511 * and the backtrace in the mleak_traces table; if allocation slot is in use,
6512 * replace old allocation with new one if the trace slot is in use, return
6513 * (or increment refcount if same trace).
6514 */
6515static boolean_t
6516mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num)
6517{
6518	struct mallocation *allocation;
6519	struct mtrace *trace;
6520	uint32_t trace_index;
6521
6522	/* Quit if someone else modifying the tables */
6523	if (!lck_mtx_try_lock_spin(mleak_lock)) {
6524		mleak_table.total_conflicts++;
6525		return (FALSE);
6526	}
6527
6528	allocation = &mleak_allocations[hashaddr((uintptr_t)addr,
6529	    mleak_alloc_buckets)];
6530	trace_index = hashbacktrace(bt, depth, mleak_trace_buckets);
6531	trace = &mleak_traces[trace_index];
6532
6533	VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]);
6534	VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]);
6535
6536	allocation->hitcount++;
6537	trace->hitcount++;
6538
6539	/*
6540	 * If the allocation bucket we want is occupied
6541	 * and the occupier has the same trace, just bail.
6542	 */
6543	if (allocation->element != NULL &&
6544	    trace_index == allocation->trace_index) {
6545		mleak_table.alloc_collisions++;
6546		lck_mtx_unlock(mleak_lock);
6547		return (TRUE);
6548	}
6549
6550	/*
6551	 * Store the backtrace in the traces array;
6552	 * Size of zero = trace bucket is free.
6553	 */
6554	if (trace->allocs > 0 &&
6555	    bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) {
6556		/* Different, unique trace, but the same hash! Bail out. */
6557		trace->collisions++;
6558		mleak_table.trace_collisions++;
6559		lck_mtx_unlock(mleak_lock);
6560		return (TRUE);
6561	} else if (trace->allocs > 0) {
6562		/* Same trace, already added, so increment refcount */
6563		trace->allocs++;
6564	} else {
6565		/* Found an unused trace bucket, so record the trace here */
6566		if (trace->depth != 0) {
6567			/* this slot previously used but not currently in use */
6568			mleak_table.trace_overwrites++;
6569		}
6570		mleak_table.trace_recorded++;
6571		trace->allocs = 1;
6572		memcpy(trace->addr, bt, (depth * sizeof (uintptr_t)));
6573		trace->depth = depth;
6574		trace->collisions = 0;
6575	}
6576
6577	/* Step 2: Store the allocation record in the allocations array */
6578	if (allocation->element != NULL) {
6579		/*
6580		 * Replace an existing allocation.  No need to preserve
6581		 * because only a subset of the allocations are being
6582		 * recorded anyway.
6583		 */
6584		mleak_table.alloc_collisions++;
6585	} else if (allocation->trace_index != 0) {
6586		mleak_table.alloc_overwrites++;
6587	}
6588	allocation->element = addr;
6589	allocation->trace_index = trace_index;
6590	allocation->count = num;
6591	mleak_table.alloc_recorded++;
6592	mleak_table.outstanding_allocs++;
6593
6594	lck_mtx_unlock(mleak_lock);
6595	return (TRUE);
6596}
6597
6598static void
6599mleak_free(mcache_obj_t *addr)
6600{
6601	while (addr != NULL) {
6602		struct mallocation *allocation = &mleak_allocations
6603		    [hashaddr((uintptr_t)addr, mleak_alloc_buckets)];
6604
6605		if (allocation->element == addr &&
6606		    allocation->trace_index < mleak_trace_buckets) {
6607			lck_mtx_lock_spin(mleak_lock);
6608			if (allocation->element == addr &&
6609			    allocation->trace_index < mleak_trace_buckets) {
6610				struct mtrace *trace;
6611				trace = &mleak_traces[allocation->trace_index];
6612				/* allocs = 0 means trace bucket is unused */
6613				if (trace->allocs > 0)
6614					trace->allocs--;
6615				if (trace->allocs == 0)
6616					trace->depth = 0;
6617				/* NULL element means alloc bucket is unused */
6618				allocation->element = NULL;
6619				mleak_table.outstanding_allocs--;
6620			}
6621			lck_mtx_unlock(mleak_lock);
6622		}
6623		addr = addr->obj_next;
6624	}
6625}
6626
6627static void
6628mleak_sort_traces()
6629{
6630	int i, j, k;
6631	struct mtrace *swap;
6632
6633	for(i = 0; i < MLEAK_NUM_TRACES; i++)
6634		mleak_top_trace[i] = NULL;
6635
6636	for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++)
6637	{
6638		if (mleak_traces[i].allocs <= 0)
6639			continue;
6640
6641		mleak_top_trace[j] = &mleak_traces[i];
6642		for (k = j; k > 0; k--) {
6643			if (mleak_top_trace[k]->allocs <=
6644			    mleak_top_trace[k-1]->allocs)
6645				break;
6646
6647			swap = mleak_top_trace[k-1];
6648			mleak_top_trace[k-1] = mleak_top_trace[k];
6649			mleak_top_trace[k] = swap;
6650		}
6651		j++;
6652	}
6653
6654	j--;
6655	for(; i < mleak_trace_buckets; i++) {
6656		if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs)
6657			continue;
6658
6659		mleak_top_trace[j] = &mleak_traces[i];
6660
6661		for (k = j; k > 0; k--) {
6662			if (mleak_top_trace[k]->allocs <=
6663			    mleak_top_trace[k-1]->allocs)
6664				break;
6665
6666			swap = mleak_top_trace[k-1];
6667			mleak_top_trace[k-1] = mleak_top_trace[k];
6668			mleak_top_trace[k] = swap;
6669		}
6670	}
6671}
6672
6673static void
6674mleak_update_stats()
6675{
6676	mleak_trace_stat_t *mltr;
6677	int i;
6678
6679	VERIFY(mleak_stat != NULL);
6680#ifdef __LP64__
6681	VERIFY(mleak_stat->ml_isaddr64);
6682#else
6683	VERIFY(!mleak_stat->ml_isaddr64);
6684#endif /* !__LP64__ */
6685	VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES);
6686
6687	mleak_sort_traces();
6688
6689	mltr = &mleak_stat->ml_trace[0];
6690	bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES);
6691	for (i = 0; i < MLEAK_NUM_TRACES; i++) {
6692	int j;
6693
6694		if (mleak_top_trace[i] == NULL ||
6695		    mleak_top_trace[i]->allocs == 0)
6696			continue;
6697
6698		mltr->mltr_collisions	= mleak_top_trace[i]->collisions;
6699		mltr->mltr_hitcount	= mleak_top_trace[i]->hitcount;
6700		mltr->mltr_allocs	= mleak_top_trace[i]->allocs;
6701		mltr->mltr_depth	= mleak_top_trace[i]->depth;
6702
6703		VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH);
6704		for (j = 0; j < mltr->mltr_depth; j++)
6705			mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j];
6706
6707		mltr++;
6708	}
6709}
6710
6711static struct mbtypes {
6712	int		mt_type;
6713	const char	*mt_name;
6714} mbtypes[] = {
6715	{ MT_DATA,	"data" },
6716	{ MT_OOBDATA,	"oob data" },
6717	{ MT_CONTROL,	"ancillary data" },
6718	{ MT_HEADER,	"packet headers" },
6719	{ MT_SOCKET,	"socket structures" },
6720	{ MT_PCB,	"protocol control blocks" },
6721	{ MT_RTABLE,	"routing table entries" },
6722	{ MT_HTABLE,	"IMP host table entries" },
6723	{ MT_ATABLE,	"address resolution tables" },
6724	{ MT_FTABLE,	"fragment reassembly queue headers" },
6725	{ MT_SONAME,	"socket names and addresses" },
6726	{ MT_SOOPTS,	"socket options" },
6727	{ MT_RIGHTS,	"access rights" },
6728	{ MT_IFADDR,	"interface addresses" },
6729	{ MT_TAG,	"packet tags" },
6730	{ 0,		NULL }
6731};
6732
6733#define	MBUF_DUMP_BUF_CHK() {	\
6734	clen -= k;		\
6735	if (clen < 1)		\
6736		goto done;	\
6737	c += k;			\
6738}
6739
6740static char *
6741mbuf_dump(void)
6742{
6743	unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct;
6744	u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0;
6745	u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0;
6746	u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0;
6747	int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short);
6748	uint8_t seen[256];
6749	struct mbtypes *mp;
6750	mb_class_stat_t *sp;
6751	mleak_trace_stat_t *mltr;
6752	char *c = mbuf_dump_buf;
6753	int i, k, clen = MBUF_DUMP_BUF_SIZE;
6754
6755	mbuf_dump_buf[0] = '\0';
6756
6757	/* synchronize all statistics in the mbuf table */
6758	mbuf_stat_sync();
6759	mbuf_mtypes_sync(TRUE);
6760
6761	sp = &mb_stat->mbs_class[0];
6762	for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) {
6763		u_int32_t mem;
6764
6765		if (m_class(i) == MC_MBUF) {
6766			m_mbufs = sp->mbcl_active;
6767		} else if (m_class(i) == MC_CL) {
6768			m_clfree = sp->mbcl_total - sp->mbcl_active;
6769		} else if (m_class(i) == MC_BIGCL) {
6770			m_bigclfree = sp->mbcl_total - sp->mbcl_active;
6771		} else if (njcl > 0 && m_class(i) == MC_16KCL) {
6772			m_16kclfree = sp->mbcl_total - sp->mbcl_active;
6773			m_16kclusters = sp->mbcl_total;
6774		} else if (m_class(i) == MC_MBUF_CL) {
6775			m_mbufclfree = sp->mbcl_total - sp->mbcl_active;
6776		} else if (m_class(i) == MC_MBUF_BIGCL) {
6777			m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active;
6778		} else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) {
6779			m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active;
6780		}
6781
6782		mem = sp->mbcl_ctotal * sp->mbcl_size;
6783		totmem += mem;
6784		totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) *
6785		    sp->mbcl_size;
6786
6787	}
6788
6789	/* adjust free counts to include composite caches */
6790	m_clfree += m_mbufclfree;
6791	m_bigclfree += m_mbufbigclfree;
6792	m_16kclfree += m_mbuf16kclfree;
6793
6794	totmbufs = 0;
6795	for (mp = mbtypes; mp->mt_name != NULL; mp++)
6796		totmbufs += mbstat.m_mtypes[mp->mt_type];
6797	if (totmbufs > m_mbufs)
6798		totmbufs = m_mbufs;
6799	k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs);
6800	MBUF_DUMP_BUF_CHK();
6801
6802	bzero(&seen, sizeof (seen));
6803	for (mp = mbtypes; mp->mt_name != NULL; mp++) {
6804		if (mbstat.m_mtypes[mp->mt_type] != 0) {
6805			seen[mp->mt_type] = 1;
6806			k = snprintf(c, clen, "\t%u mbufs allocated to %s\n",
6807			    mbstat.m_mtypes[mp->mt_type], mp->mt_name);
6808			MBUF_DUMP_BUF_CHK();
6809		}
6810	}
6811	seen[MT_FREE] = 1;
6812	for (i = 0; i < nmbtypes; i++)
6813		if (!seen[i] && mbstat.m_mtypes[i] != 0) {
6814			k = snprintf(c, clen, "\t%u mbufs allocated to "
6815			    "<mbuf type %d>\n", mbstat.m_mtypes[i], i);
6816			MBUF_DUMP_BUF_CHK();
6817		}
6818	if ((m_mbufs - totmbufs) > 0) {
6819		k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n",
6820		    m_mbufs - totmbufs);
6821		MBUF_DUMP_BUF_CHK();
6822	}
6823	k = snprintf(c, clen, "%u/%u mbuf 2KB clusters in use\n"
6824	    "%u/%u mbuf 4KB clusters in use\n",
6825	    (unsigned int)(mbstat.m_clusters - m_clfree),
6826	    (unsigned int)mbstat.m_clusters,
6827	    (unsigned int)(mbstat.m_bigclusters - m_bigclfree),
6828	    (unsigned int)mbstat.m_bigclusters);
6829	MBUF_DUMP_BUF_CHK();
6830
6831	if (njcl > 0) {
6832		k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n",
6833		    m_16kclusters - m_16kclfree, m_16kclusters,
6834		    njclbytes / 1024);
6835		MBUF_DUMP_BUF_CHK();
6836	}
6837	totused = totmem - totfree;
6838	if (totmem == 0) {
6839		totpct = 0;
6840	} else if (totused < (ULONG_MAX / 100)) {
6841		totpct = (totused * 100) / totmem;
6842	} else {
6843		u_long totmem1 = totmem / 100;
6844		u_long totused1 = totused / 100;
6845		totpct = (totused1 * 100) / totmem1;
6846	}
6847	k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% "
6848	    "in use)\n", totmem / 1024, totpct);
6849	MBUF_DUMP_BUF_CHK();
6850
6851	/* mbuf leak detection statistics */
6852	mleak_update_stats();
6853
6854	k = snprintf(c, clen, "\nmbuf leak detection table:\n");
6855	MBUF_DUMP_BUF_CHK();
6856	k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n",
6857	    mleak_table.mleak_capture / mleak_table.mleak_sample_factor,
6858	    mleak_table.mleak_sample_factor);
6859	MBUF_DUMP_BUF_CHK();
6860	k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n",
6861	    mleak_table.outstanding_allocs);
6862	MBUF_DUMP_BUF_CHK();
6863	k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n",
6864	    mleak_table.alloc_recorded, mleak_table.trace_recorded);
6865	MBUF_DUMP_BUF_CHK();
6866	k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n",
6867	    mleak_table.alloc_collisions, mleak_table.trace_collisions);
6868	MBUF_DUMP_BUF_CHK();
6869	k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n",
6870	    mleak_table.alloc_overwrites, mleak_table.trace_overwrites);
6871	MBUF_DUMP_BUF_CHK();
6872	k = snprintf(c, clen, "\tlock conflicts: %llu\n\n",
6873	    mleak_table.total_conflicts);
6874	MBUF_DUMP_BUF_CHK();
6875
6876	k = snprintf(c, clen, "top %d outstanding traces:\n",
6877	    mleak_stat->ml_cnt);
6878	MBUF_DUMP_BUF_CHK();
6879	for (i = 0; i < mleak_stat->ml_cnt; i++) {
6880		mltr = &mleak_stat->ml_trace[i];
6881		k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), "
6882		    "%llu hit(s), %llu collision(s)\n", (i + 1),
6883		    mltr->mltr_allocs, mltr->mltr_hitcount,
6884		    mltr->mltr_collisions);
6885		MBUF_DUMP_BUF_CHK();
6886	}
6887
6888	if (mleak_stat->ml_isaddr64)
6889		k = snprintf(c, clen, MB_LEAK_HDR_64);
6890	else
6891		k = snprintf(c, clen, MB_LEAK_HDR_32);
6892	MBUF_DUMP_BUF_CHK();
6893
6894	for (i = 0; i < MLEAK_STACK_DEPTH; i++) {
6895		int j;
6896		k = snprintf(c, clen, "%2d: ", (i + 1));
6897		MBUF_DUMP_BUF_CHK();
6898		for (j = 0; j < mleak_stat->ml_cnt; j++) {
6899			mltr = &mleak_stat->ml_trace[j];
6900			if (i < mltr->mltr_depth) {
6901				if (mleak_stat->ml_isaddr64) {
6902					k = snprintf(c, clen, "0x%0llx  ",
6903					    mltr->mltr_addr[i]);
6904				} else {
6905					k = snprintf(c, clen,
6906					    "0x%08x  ",
6907					    (u_int32_t)mltr->mltr_addr[i]);
6908				}
6909			} else {
6910				if (mleak_stat->ml_isaddr64)
6911					k = snprintf(c, clen,
6912					    MB_LEAK_SPACING_64);
6913				else
6914					k = snprintf(c, clen,
6915					    MB_LEAK_SPACING_32);
6916			}
6917			MBUF_DUMP_BUF_CHK();
6918		}
6919		k = snprintf(c, clen, "\n");
6920		MBUF_DUMP_BUF_CHK();
6921	}
6922done:
6923	return (mbuf_dump_buf);
6924}
6925
6926#undef MBUF_DUMP_BUF_CHK
6927
6928SYSCTL_DECL(_kern_ipc);
6929SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat,
6930    CTLFLAG_RD | CTLFLAG_LOCKED,
6931    0, 0, mbstat_sysctl, "S,mbstat", "");
6932SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat,
6933    CTLFLAG_RD | CTLFLAG_LOCKED,
6934    0, 0, mb_stat_sysctl, "S,mb_stat", "");
6935SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace,
6936    CTLFLAG_RD | CTLFLAG_LOCKED,
6937    0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", "");
6938SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table,
6939    CTLFLAG_RD | CTLFLAG_LOCKED,
6940    0, 0, mleak_table_sysctl, "S,mleak_table", "");
6941SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor,
6942    CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, "");
6943SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized,
6944    CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, "");
6945SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog,
6946    CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, "");
6947