/*
 * Copyright (c) 1998-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_mbuf.c	8.2 (Berkeley) 1/4/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.
 * This notice is included in support of clause 2.2 (b) of the Apple Public
 * License, Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/queue.h>
#include <sys/proc.h>

#include <kern/kern_types.h>
#include <kern/simple_lock.h>
#include <kern/queue.h>
#include <kern/sched_prim.h>
#include <kern/cpu_number.h>
#include <kern/zalloc.h>

#include <libkern/OSAtomic.h>
#include <libkern/libkern.h>

#include <IOKit/IOMapper.h>

#include <machine/limits.h>
#include <machine/machine_routines.h>

#if CONFIG_MACF_NET
#include <security/mac_framework.h>
#endif /* CONFIG_MACF_NET */

#include <sys/mcache.h>

/*
 * MBUF IMPLEMENTATION NOTES.
 *
 * There is a total of 5 per-CPU caches:
 *
 * MC_MBUF:
 *	This is a cache of rudimentary objects of MSIZE in size; each
 *	object represents an mbuf structure.  This cache preserves only
 *	the m_type field of the mbuf during its transactions.
 *
 * MC_CL:
 *	This is a cache of rudimentary objects of MCLBYTES in size; each
 *	object represents an mcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_BIGCL:
 *	This is a cache of rudimentary objects of MBIGCLBYTES in size; each
 *	object represents an mbigcluster structure.  This cache does not
 *	preserve the contents of the objects during its transactions.
 *
 * MC_MBUF_CL:
 *	This is a cache of mbufs each having a cluster attached to it.
 *	It is backed by the MC_MBUF and MC_CL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * MC_MBUF_BIGCL:
 *	This is a cache of mbufs each having a big cluster attached to it.
 *	It is backed by the MC_MBUF and MC_BIGCL rudimentary caches.  Several
 *	fields of the mbuf related to the external cluster are preserved
 *	during transactions.
 *
 * OBJECT ALLOCATION:
 *
 * Allocation requests are handled first at the per-CPU (mcache) layer
 * before falling back to the slab layer.  Performance is optimal when
 * the request is satisfied at the CPU layer because global data/lock
 * never gets accessed.  When the slab layer is entered for allocation,
 * the slab freelist will be checked first for available objects before
 * the VM backing store is invoked.  Slab layer operations are serialized
 * for all of the caches as the mbuf global lock is held most of the time.
 * Allocation paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_get_common(), m_clattach(), m_mclget(),
 *	  m_mclalloc(), m_bigalloc(), m_copym_with_hdrs(),
 *	  composite object allocation }
 *		|	^
 *		|	|
 *		|	+-----------------------+
 *		v				|
 *	mcache_alloc/mcache_alloc_ext()	  mbuf_slab_audit()
 *		|				^
 *		v				|
 *	[CPU cache] -------> (found?) ----------+
 *		|				|
 *		v				|
 *	mbuf_slab_alloc()			|
 *		|				|
 *		v				|
 *	+---------> [freelist] -> (found?) -----+
 *	|		|
 *	|		v
 *	|	    m_clalloc()
 *	|		|
 *	|		v
 *	+---<<---- kmem_mb_alloc()
 *
 * b. Composite object:
 *
 *	{ m_getpackets_internal(), m_allocpacket_internal() }
 *		|	^
 *		|	|
 *		|	+------ (done) ---------+
 *		v				|
 *	mcache_alloc/mcache_alloc_ext()	  mbuf_cslab_audit()
 *		|				^
 *		v				|
 *	[CPU cache] -------> (found?) ----------+
 *		|				|
 *		v				|
 *	mbuf_cslab_alloc()			|
 *		|				|
 *		v				|
 *	[freelist] ---------> (found?) ---------+
 *		|				|
 *		v				|
 *	(rudimentary object)			|
 *	mcache_alloc/mcache_alloc_ext() --->>---+
 *
 * Auditing notes: If auditing is enabled, buffers will be subjected to
 * integrity checks by the audit routine.  This is done by verifying their
 * contents against the DEADBEEF (free) pattern before returning them to the
 * caller.  As part of this step, the routine will also record the transaction
 * and pattern-fill the buffers with the BADDCAFE (uninitialized) pattern.
 * It will also restore any constructed data structure fields if necessary.
 *
 * OBJECT DEALLOCATION:
 *
 * Freeing an object simply involves placing it into the CPU cache; this
 * pollutes the cache to benefit subsequent allocations.  The slab layer
 * will only be entered if the object is to be purged out of the cache.
 * During normal operations, this happens only when the CPU layer resizes
 * its bucket while it's adjusting to the allocation load.  Deallocation
 * paths are different depending on the class of objects:
 *
 * a. Rudimentary object:
 *
 *	{ m_free(), m_freem_list(), composite object deallocation }
 *		|	^
 *		|	|
 *		|	+------ (done) ---------+
 *		v				|
 *	mcache_free/mcache_free_ext()		|
 *		|				|
 *		v				|
 *	mbuf_slab_audit()			|
 *		|				|
 *		v				|
 *	[CPU cache] ---> (not purging?) --------+
 *		|				|
 *		v				|
 *	mbuf_slab_free()			|
 *		|				|
 *		v				|
 *	[freelist] ----------->>----------------+
 *	(objects never get purged to VM)
 *
 * b. Composite object:
 *
 *	{ m_free(), m_freem_list() }
 *		|	^
 *		|	|
 *		|	+------ (done) ---------+
 *		v				|
 *	mcache_free/mcache_free_ext()		|
 *		|				|
 *		v				|
 *	mbuf_cslab_audit()			|
 *		|				|
 *		v				|
 *	[CPU cache] ---> (not purging?) --------+
 *		|				|
 *		v				|
 *	mbuf_cslab_free()			|
 *		|				|
 *		v				|
 *	[freelist] ---> (not purging?) ---------+
 *		|				|
 *		v				|
 *	(rudimentary object)			|
 *	mcache_free/mcache_free_ext() --->>-----+
 *
 * Auditing notes: If auditing is enabled, the audit routine will save
 * any constructed data structure fields (if necessary) before filling the
 * contents of the buffers with the DEADBEEF (free) pattern and recording
 * the transaction.  Buffers that are freed (whether at the CPU or slab
 * layer) are expected to contain the free pattern.
 *
 * DEBUGGING:
 *
 * Debugging can be enabled by adding "mbuf_debug=0x3" to boot-args; this
 * translates to the mcache flags (MCF_VERIFY | MCF_AUDIT).  Additionally,
 * the CPU layer cache can be disabled by setting the MCF_NOCPUCACHE flag,
 * i.e. modify the boot argument parameter to "mbuf_debug=0x13".  Leak
 * detection may also be disabled by setting the MCF_NOLEAKLOG flag, e.g.
 * "mbuf_debug=0x113".  Note that debugging consumes more CPU and memory.
 *
 * Each object is associated with exactly one mcache_audit_t structure that
 * contains the information related to its last buffer transaction.
 * Given the address of an object, the audit structure can be retrieved by
 * finding the position of the object relative to the base address of the
 * cluster:
 *
 *	+------------+			+=============+
 *	| mbuf addr  |			| mclaudit[i] |
 *	+------------+			+=============+
 *	      |				| cl_audit[0] |
 *	i = MTOBG(addr)			+-------------+
 *	      |			+-----> | cl_audit[1] | -----> mcache_audit_t
 *	b = BGTOM(i)		|	+-------------+
 *	      |			|	|     ...     |
 *	x = MCLIDX(b, addr)	|	+-------------+
 *	      |			|	| cl_audit[7] |
 *	      +-----------------+	+-------------+
 *		 (e.g. x == 1)
 *
 * The mclaudit[] array is allocated at initialization time, but its contents
 * get populated when the corresponding cluster is created.  Because a page
 * can be turned into NMBPBG mbufs, we preserve enough space for the mbufs
 * so that there is a 1-to-1 mapping between them.  A page that never gets
 * (or has not yet been) turned into mbufs will use only cl_audit[0] with the
 * remaining entries unused.  For a 16KB cluster, only one entry from the
 * first page is allocated and used for the entire object.
 */

/* TODO: should be in header file */
/* kernel translator */
extern vm_offset_t kmem_mb_alloc(vm_map_t, int, int);
extern ppnum_t pmap_find_phys(pmap_t pmap, addr64_t va);
extern vm_map_t mb_map;		/* special map */

/* Global lock */
decl_lck_mtx_data(static, mbuf_mlock_data);
static lck_mtx_t *mbuf_mlock = &mbuf_mlock_data;
static lck_attr_t *mbuf_mlock_attr;
static lck_grp_t *mbuf_mlock_grp;
static lck_grp_attr_t *mbuf_mlock_grp_attr;

/* Back-end (common) layer */
static void *mbuf_worker_run;	/* wait channel for worker thread */
static int mbuf_worker_ready;	/* worker thread is runnable */
static int mbuf_expand_mcl;	/* number of cluster creation requests */
static int mbuf_expand_big;	/* number of big cluster creation requests */
static int mbuf_expand_16k;	/* number of 16KB cluster creation requests */
static int ncpu;		/* number of CPUs */
static ppnum_t *mcl_paddr;	/* Array of cluster physical addresses */
static ppnum_t mcl_pages;	/* Size of array (# physical pages) */
static ppnum_t mcl_paddr_base;	/* Handle returned by IOMapper::iovmAlloc() */
static mcache_t *ref_cache;	/* Cache of cluster reference & flags */
static mcache_t *mcl_audit_con_cache;	/* Audit contents cache */
static unsigned int mbuf_debug;	/* patchable mbuf mcache flags */
static unsigned int mb_normalized;	/* number of packets "normalized" */

#define	MB_GROWTH_AGGRESSIVE	1	/* Threshold: 1/2 of total */
#define	MB_GROWTH_NORMAL	2	/* Threshold: 3/4 of total */

typedef enum {
	MC_MBUF = 0,	/* Regular mbuf */
	MC_CL,		/* Cluster */
	MC_BIGCL,	/* Large (4KB) cluster */
	MC_16KCL,	/* Jumbo (16KB) cluster */
	MC_MBUF_CL,	/* mbuf + cluster */
	MC_MBUF_BIGCL,	/* mbuf + large (4KB) cluster */
	MC_MBUF_16KCL	/* mbuf + jumbo (16KB) cluster */
} mbuf_class_t;

#define	MBUF_CLASS_MIN		MC_MBUF
#define	MBUF_CLASS_MAX		MC_MBUF_16KCL
#define	MBUF_CLASS_LAST		MC_16KCL
#define	MBUF_CLASS_VALID(c) \
	((int)(c) >= MBUF_CLASS_MIN && (int)(c) <= MBUF_CLASS_MAX)
#define	MBUF_CLASS_COMPOSITE(c) \
	((int)(c) > MBUF_CLASS_LAST)


/*
 * mbuf specific mcache allocation request flags.
 */
#define	MCR_COMP	MCR_USR1 /* for MC_MBUF_{CL,BIGCL,16KCL} caches */
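
/*
 * Illustrative sketch (not compiled): locating the audit record for an
 * mbuf address using the mapping described in the implementation notes
 * above.  The MTOBG()/BGTOM()/MCLIDX() macros and the mclaudit[] array
 * referenced here are defined further below in this file; the local
 * variable names are hypothetical and 'addr' is assumed to point
 * somewhere inside the mbuf cluster map.
 */
#if 0
	int i = MTOBG(addr);			/* index of the owning 4KB cluster */
	union mbigcluster *b = BGTOM(i);	/* base address of that cluster */
	int x = MCLIDX(b, addr);		/* mbuf slot within that cluster */
	mcache_audit_t *mca = mclaudit[i].cl_audit[x];
#endif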

/*
 * Per-cluster slab structure.
 *
 * A slab is a cluster control structure that contains one or more object
 * chunks; the available chunks are chained in the slab's freelist (sl_head).
 * Each time a chunk is taken out of the slab, the slab's reference count
 * gets incremented.  When all chunks have been taken out, the empty slab
 * gets removed (SLF_DETACHED) from the class's slab list.  A chunk that is
 * returned to a slab causes the slab's reference count to be decremented;
 * it also causes the slab to be reinserted into the class's slab list, if
 * it's not already there.
 *
 * Compartmentalizing the object chunks into slabs allows us to easily
 * merge one or more slabs together when the adjacent slabs are idle, as
 * well as to convert or move a slab from one class to another; e.g. the
 * mbuf cluster slab can be converted to a regular cluster slab when all
 * mbufs in the slab have been freed.
 *
 * A slab may also span across multiple clusters for chunks larger than
 * a cluster's size.  In this case, only the slab of the first cluster is
 * used.  The rest of the slabs are marked with SLF_PARTIAL to indicate
 * that they are part of the larger slab.
 *
 * Each slab controls a page of memory.
 */
typedef struct mcl_slab {
	struct mcl_slab	*sl_next;	/* neighboring slab */
	u_int8_t	sl_class;	/* controlling mbuf class */
	int8_t		sl_refcnt;	/* outstanding allocations */
	int8_t		sl_chunks;	/* chunks (bufs) in this slab */
	u_int16_t	sl_flags;	/* slab flags (see below) */
	u_int16_t	sl_len;		/* slab length */
	void		*sl_base;	/* base of allocated memory */
	void		*sl_head;	/* first free buffer */
	TAILQ_ENTRY(mcl_slab) sl_link;	/* next/prev slab on freelist */
} mcl_slab_t;

#define	SLF_MAPPED	0x0001		/* backed by a mapped page */
#define	SLF_PARTIAL	0x0002		/* part of another slab */
#define	SLF_DETACHED	0x0004		/* not in slab freelist */

/*
 * The array of slabs is broken into groups of arrays per 1MB of kernel
 * memory to reduce the footprint.  Each group is allocated on demand
 * whenever a new piece of memory mapped in from the VM crosses the 1MB
 * boundary.
 */
#define	NSLABSPMB	((1 << MBSHIFT) >> PGSHIFT)	/* 256 slabs/grp */

typedef struct mcl_slabg {
	mcl_slab_t	slg_slab[NSLABSPMB];	/* group of slabs */
} mcl_slabg_t;

/*
 * Number of slabs needed to control a 16KB cluster object.
 */
#define	NSLABSP16KB	(M16KCLBYTES >> PGSHIFT)

/*
 * Per-cluster audit structure.
 */
typedef struct {
	mcache_audit_t	*cl_audit[NMBPBG];	/* array of audits */
} mcl_audit_t;

/*
 * Size of data from the beginning of an mbuf that covers m_hdr, pkthdr
 * and m_ext structures.  If auditing is enabled, we allocate a shadow
 * mbuf structure of this size inside each audit structure, and the
 * contents of the real mbuf get copied into it when the mbuf is freed.
 * This allows us to pattern-fill the mbuf for integrity check, and to
 * preserve any constructed mbuf fields (e.g. mbuf + cluster cache case).
 * Note that we don't save the contents of clusters when they are freed;
 * we simply pattern-fill them.
 */
#define	AUDIT_CONTENTS_SIZE	((MSIZE - MHLEN) + sizeof (_m_ext_t))

/*
 * mbuf specific mcache audit flags
 */
#define	MB_INUSE	0x01	/* object has not been returned to slab */
#define	MB_COMP_INUSE	0x02	/* object has not been returned to cslab */
#define	MB_SCVALID	0x04	/* object has valid saved contents */

/*
 * Each of the following two arrays holds up to nmbclusters elements.
 */
static mcl_audit_t *mclaudit;	/* array of cluster audit information */
static unsigned int maxclaudit;	/* max # of entries in audit table */
static mcl_slabg_t **slabstbl;	/* cluster slabs table */
static unsigned int maxslabgrp;	/* max # of entries in slabs table */
static unsigned int slabgrp;	/* # of entries in slabs table */

/* Globals */
int nclusters;			/* # of clusters for non-jumbo (legacy) sizes */
int njcl;			/* # of clusters for jumbo sizes */
int njclbytes;			/* size of a jumbo cluster */
union mbigcluster *mbutl;	/* first mapped cluster address */
union mbigcluster *embutl;	/* ending virtual address of mclusters */
int _max_linkhdr;		/* largest link-level header */
int _max_protohdr;		/* largest protocol header */
int max_hdr;			/* largest link+protocol header */
int max_datalen;		/* MHLEN - max_hdr */

static boolean_t mclverify;	/* debug: pattern-checking */
static boolean_t mcltrace;	/* debug: stack tracing */
static boolean_t mclfindleak;	/* debug: leak detection */
static boolean_t mclexpleak;	/* debug: expose leak info to user space */

/* mbuf leak detection variables */
static struct mleak_table mleak_table;
static mleak_stat_t *mleak_stat;

#define	MLEAK_STAT_SIZE(n) \
	((size_t)(&((mleak_stat_t *)0)->ml_trace[n]))

struct mallocation {
	mcache_obj_t *element;	/* the alloc'ed element, NULL if unused */
	u_int32_t trace_index;	/* mtrace index for corresponding backtrace */
	u_int32_t count;	/* How many objects were requested */
	u_int64_t hitcount;	/* for determining hash effectiveness */
};

struct mtrace {
	u_int64_t	collisions;
	u_int64_t	hitcount;
	u_int64_t	allocs;
	u_int64_t	depth;
	uintptr_t	addr[MLEAK_STACK_DEPTH];
};

/* Size must be a power of two for the zhash to be able to just mask off bits */
#define	MLEAK_ALLOCATION_MAP_NUM	512
#define	MLEAK_TRACE_MAP_NUM		256

/*
 * Sample factor for how often to record a trace.  This is overwritable
 * by the boot-arg mleak_sample_factor.
 */
#define	MLEAK_SAMPLE_FACTOR		500

/*
 * Number of top leakers recorded.
 */
#define	MLEAK_NUM_TRACES		5

#define	MB_LEAK_SPACING_64	"                    "
#define	MB_LEAK_SPACING_32	"            "

#define	MB_LEAK_HDR_32	"\n\
    trace [1]   trace [2]   trace [3]   trace [4]   trace [5]  \n\
    ----------  ----------  ----------  ----------  ---------- \n\
"

#define	MB_LEAK_HDR_64	"\n\
    trace [1]           trace [2]           trace [3]       \
        trace [4]           trace [5]      \n\
    ------------------  ------------------  ------------------ \
    ------------------  ------------------ \n\
"

static uint32_t mleak_alloc_buckets = MLEAK_ALLOCATION_MAP_NUM;
static uint32_t mleak_trace_buckets = MLEAK_TRACE_MAP_NUM;

/* Hashmaps of allocations and their corresponding traces */
static struct mallocation *mleak_allocations;
static struct mtrace *mleak_traces;
static struct mtrace *mleak_top_trace[MLEAK_NUM_TRACES];

/* Lock to protect mleak tables from concurrent modification */
decl_lck_mtx_data(static, mleak_lock_data);
static lck_mtx_t *mleak_lock = &mleak_lock_data;
static lck_attr_t *mleak_lock_attr;
static lck_grp_t *mleak_lock_grp;
static lck_grp_attr_t *mleak_lock_grp_attr;

extern u_int32_t high_sb_max;

/* TODO: should be in header file */
int do_reclaim = 0;

/* The minimum number of objects that are allocated, to start. */
#define	MINCL		32
#define	MINBIGCL	(MINCL >> 1)
#define	MIN16KCL	(MINCL >> 2)

/* Low watermarks (only map in pages once free counts go below) */
#define	MBIGCL_LOWAT	MINBIGCL
#define	M16KCL_LOWAT	MIN16KCL

typedef struct {
	mbuf_class_t	mtbl_class;	/* class type */
	mcache_t	*mtbl_cache;	/* mcache for this buffer class */
	TAILQ_HEAD(mcl_slhead, mcl_slab) mtbl_slablist; /* slab list */
	mcache_obj_t	*mtbl_cobjlist;	/* composite objects freelist */
	mb_class_stat_t	*mtbl_stats;	/* statistics fetchable via sysctl */
	u_int32_t	mtbl_maxsize;	/* maximum buffer size */
	int		mtbl_minlimit;	/* minimum allowed */
	int		mtbl_maxlimit;	/* maximum allowed */
	u_int32_t	mtbl_wantpurge;	/* purge during next reclaim */
} mbuf_table_t;

#define	m_class(c)	mbuf_table[c].mtbl_class
#define	m_cache(c)	mbuf_table[c].mtbl_cache
#define	m_slablist(c)	mbuf_table[c].mtbl_slablist
#define	m_cobjlist(c)	mbuf_table[c].mtbl_cobjlist
#define	m_maxsize(c)	mbuf_table[c].mtbl_maxsize
#define	m_minlimit(c)	mbuf_table[c].mtbl_minlimit
#define	m_maxlimit(c)	mbuf_table[c].mtbl_maxlimit
#define	m_wantpurge(c)	mbuf_table[c].mtbl_wantpurge
#define	m_cname(c)	mbuf_table[c].mtbl_stats->mbcl_cname
#define	m_size(c)	mbuf_table[c].mtbl_stats->mbcl_size
#define	m_total(c)	mbuf_table[c].mtbl_stats->mbcl_total
#define	m_active(c)	mbuf_table[c].mtbl_stats->mbcl_active
#define	m_infree(c)	mbuf_table[c].mtbl_stats->mbcl_infree
#define	m_slab_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_slab_cnt
#define	m_alloc_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_alloc_cnt
#define	m_free_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_free_cnt
#define	m_notified(c)	mbuf_table[c].mtbl_stats->mbcl_notified
#define	m_purge_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_purge_cnt
#define	m_fail_cnt(c)	mbuf_table[c].mtbl_stats->mbcl_fail_cnt
#define	m_ctotal(c)	mbuf_table[c].mtbl_stats->mbcl_ctotal

static mbuf_table_t mbuf_table[] = {
	/*
	 * The caches for mbufs, regular clusters and big clusters.
	 */
	{ MC_MBUF, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_MBUF)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_CL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_CL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_BIGCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_BIGCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	{ MC_16KCL, NULL, TAILQ_HEAD_INITIALIZER(m_slablist(MC_16KCL)),
	    NULL, NULL, 0, 0, 0, 0 },
	/*
	 * The following are special caches; they serve as intermediate
	 * caches backed by the above rudimentary caches.  Each object
	 * in the cache is an mbuf with a cluster attached to it.  Unlike
	 * the above caches, these intermediate caches do not directly
	 * deal with the slab structures; instead, the constructed
	 * cached elements are simply stored in the freelists.
	 */
	{ MC_MBUF_CL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_BIGCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
	{ MC_MBUF_16KCL, NULL, { NULL, NULL }, NULL, NULL, 0, 0, 0, 0 },
};

#define	NELEM(a)	(sizeof (a) / sizeof ((a)[0]))

static void *mb_waitchan = &mbuf_table;	/* wait channel for all caches */
static int mb_waiters;			/* number of waiters */

#define	MB_WDT_MAXTIME	10		/* # of secs before watchdog panic */
static struct timeval mb_wdtstart;	/* watchdog start timestamp */
static char *mbuf_dump_buf;

#define	MBUF_DUMP_BUF_SIZE	2048

/*
 * mbuf watchdog is enabled by default on embedded platforms.  It is
 * also toggleable via the kern.ipc.mb_watchdog sysctl.
 */
#if CONFIG_EMBEDDED
static unsigned int mb_watchdog = 1;
#else
static unsigned int mb_watchdog = 0;
#endif /* CONFIG_EMBEDDED */

/* The following are used to serialize m_clalloc() */
static boolean_t mb_clalloc_busy;
static void *mb_clalloc_waitchan = &mb_clalloc_busy;
static int mb_clalloc_waiters;

static void mbuf_mtypes_sync(boolean_t);
static int mbstat_sysctl SYSCTL_HANDLER_ARGS;
static void mbuf_stat_sync(void);
static int mb_stat_sysctl SYSCTL_HANDLER_ARGS;
static int mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS;
static int mleak_table_sysctl SYSCTL_HANDLER_ARGS;
static char *mbuf_dump(void);
static void mbuf_table_init(void);
static inline void m_incref(struct mbuf *);
static inline u_int32_t m_decref(struct mbuf *);
static int m_clalloc(const u_int32_t, const int, const u_int32_t);
static void mbuf_worker_thread_init(void);
static mcache_obj_t *slab_alloc(mbuf_class_t, int);
static void slab_free(mbuf_class_t, mcache_obj_t *);
static unsigned int mbuf_slab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_slab_free(void *, mcache_obj_t *, int);
static void mbuf_slab_audit(void *, mcache_obj_t *, boolean_t);
static void mbuf_slab_notify(void *, u_int32_t);
static unsigned int cslab_alloc(mbuf_class_t, mcache_obj_t ***,
    unsigned int);
static unsigned int cslab_free(mbuf_class_t, mcache_obj_t *, int);
static unsigned int mbuf_cslab_alloc(void *, mcache_obj_t ***,
    unsigned int, int);
static void mbuf_cslab_free(void *, mcache_obj_t *, int);
static void mbuf_cslab_audit(void *, mcache_obj_t *, boolean_t);
static int freelist_populate(mbuf_class_t, unsigned int, int);
static void freelist_init(mbuf_class_t);
static boolean_t mbuf_cached_above(mbuf_class_t, int);
static boolean_t mbuf_steal(mbuf_class_t, unsigned int);
static void m_reclaim(mbuf_class_t, unsigned int, boolean_t);
static int m_howmany(int, size_t);
static void mbuf_worker_thread(void);
static void mbuf_watchdog(void);
static boolean_t mbuf_sleep(mbuf_class_t, unsigned int, int);

static void mcl_audit_init(void *, mcache_audit_t **, mcache_obj_t **,
    size_t, unsigned int);
static mcache_audit_t *mcl_audit_buf2mca(mbuf_class_t, mcache_obj_t *);
static void mcl_audit_mbuf(mcache_audit_t *, void *, boolean_t, boolean_t);
static void mcl_audit_cluster(mcache_audit_t *, void *, size_t, boolean_t,
    boolean_t);
static void mcl_audit_restore_mbuf(struct mbuf *, mcache_audit_t *, boolean_t);
static void mcl_audit_save_mbuf(struct mbuf *, mcache_audit_t *);
static void mcl_audit_mcheck_panic(struct mbuf *);
static void mcl_audit_verify_nextptr(void *, mcache_audit_t *);

static void mleak_activate(void);
static void mleak_logger(u_int32_t, mcache_obj_t *, boolean_t);
static boolean_t mleak_log(uintptr_t *, mcache_obj_t *, uint32_t, int);
static void mleak_free(mcache_obj_t *);
static void mleak_sort_traces(void);
static void mleak_update_stats(void);

static mcl_slab_t *slab_get(void *);
static void slab_init(mcl_slab_t *, mbuf_class_t, u_int32_t,
    void *, void *, unsigned int, int, int);
static void slab_insert(mcl_slab_t *, mbuf_class_t);
static void slab_remove(mcl_slab_t *, mbuf_class_t);
static boolean_t slab_inrange(mcl_slab_t *, void *);
static void slab_nextptr_panic(mcl_slab_t *, void *);
static void slab_detach(mcl_slab_t *);
static boolean_t slab_is_detached(mcl_slab_t *);

static int m_copyback0(struct mbuf **, int, int, const void *, int, int);
static struct mbuf *m_split0(struct mbuf *, int, int, int);

/* flags for m_copyback0 */
#define	M_COPYBACK0_COPYBACK	0x0001	/* copyback from cp */
#define	M_COPYBACK0_PRESERVE	0x0002	/* preserve original data */
#define	M_COPYBACK0_COW		0x0004	/* do copy-on-write */
#define	M_COPYBACK0_EXTEND	0x0008	/* extend chain */

/*
 * This flag is set for all mbufs that come out of and into the composite
 * mbuf + cluster caches, i.e. MC_MBUF_CL and MC_MBUF_BIGCL.  mbufs that
 * are marked with such a flag have clusters attached to them, and will be
 * treated differently when they are freed; instead of being placed back
 * into the mbuf and cluster freelists, the composite mbuf + cluster objects
 * are placed back into the appropriate composite cache's freelist, and the
 * actual freeing is deferred until the composite objects are purged.  At
 * such a time, this flag will be cleared from the mbufs and the objects
 * will be freed into their own separate freelists.
 */
#define	EXTF_COMPOSITE	0x1

/*
 * This flag indicates that the external cluster is read-only, i.e. it is
 * or was referred to by more than one mbuf.  Once set, this flag is never
 * cleared.
 */
#define	EXTF_READONLY	0x2
#define	EXTF_MASK	(EXTF_COMPOSITE | EXTF_READONLY)

#define	MEXT_RFA(m)	((m)->m_ext.ext_refflags)
#define	MEXT_REF(m)	(MEXT_RFA(m)->refcnt)
#define	MEXT_FLAGS(m)	(MEXT_RFA(m)->flags)
#define	MBUF_IS_COMPOSITE(m)	\
	(MEXT_REF(m) == 0 && (MEXT_FLAGS(m) & EXTF_MASK) == EXTF_COMPOSITE)
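
/*
 * Illustrative sketch (not compiled): how a free path can use the
 * accessors above to tell a composite mbuf + cluster apart from a
 * standalone external cluster once its reference count drops to zero.
 * The real logic lives in m_free() and m_freem_list(); this fragment
 * only demonstrates the intended use of m_decref() and
 * MBUF_IS_COMPOSITE(), and the surrounding control flow is a sketch.
 */
#if 0
	if (m->m_flags & M_EXT) {
		u_int32_t refcnt = m_decref(m);
		if (refcnt == 0 && MBUF_IS_COMPOSITE(m)) {
			/* whole mbuf + cluster goes back to its composite cache */
		} else if (refcnt == 0) {
			/* last reference: cluster and mbuf are freed separately */
		}
	}
#endif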

/*
 * Macros used to verify the integrity of the mbuf.
 */
#define	_MCHECK(m) {							\
	if ((m)->m_type != MT_FREE) {					\
		if (mclaudit == NULL)					\
			panic("MCHECK: m_type=%d m=%p",			\
			    (u_int16_t)(m)->m_type, m);			\
		else							\
			mcl_audit_mcheck_panic(m);			\
	}								\
}

#define	MBUF_IN_MAP(addr)						\
	((void *)(addr) >= (void *)mbutl && (void *)(addr) < (void *)embutl)

#define	MRANGE(addr) {							\
	if (!MBUF_IN_MAP(addr))						\
		panic("MRANGE: address out of range 0x%p", addr);	\
}

/*
 * Macro version of mtod.
 */
#define	MTOD(m, t)	((t)((m)->m_data))

/*
 * Macros to obtain the (4KB) cluster index and base cluster address.
 */

#define	MTOBG(x)	(((char *)(x) - (char *)mbutl) >> MBIGCLSHIFT)
#define	BGTOM(x)	((union mbigcluster *)(mbutl + (x)))

/*
 * Macro to find the mbuf index relative to a base.
 */
#define	MCLIDX(c, m)	(((char *)(m) - (char *)(c)) >> MSIZESHIFT)

/*
 * Same thing for 2KB cluster index.
 */
#define	CLBGIDX(c, m)	(((char *)(m) - (char *)(c)) >> MCLSHIFT)

/*
 * Macros used during mbuf and cluster initialization.
 */
#define	MBUF_INIT(m, pkthdr, type) {					\
	_MCHECK(m);							\
	(m)->m_next = (m)->m_nextpkt = NULL;				\
	(m)->m_len = 0;							\
	(m)->m_type = type;						\
	if ((pkthdr) == 0) {						\
		(m)->m_data = (m)->m_dat;				\
		(m)->m_flags = 0;					\
	} else {							\
		(m)->m_data = (m)->m_pktdat;				\
		(m)->m_flags = M_PKTHDR;				\
		(m)->m_pkthdr.rcvif = NULL;				\
		(m)->m_pkthdr.len = 0;					\
		(m)->m_pkthdr.header = NULL;				\
		(m)->m_pkthdr.csum_flags = 0;				\
		(m)->m_pkthdr.csum_data = 0;				\
		(m)->m_pkthdr.tso_segsz = 0;				\
		(m)->m_pkthdr.vlan_tag = 0;				\
		(m)->m_pkthdr.socket_id = 0;				\
		(m)->m_pkthdr.vt_nrecs = 0;				\
		(m)->m_pkthdr.aux_flags = 0;				\
		m_tag_init(m);						\
		m_service_class_init(m);				\
	}								\
}

#define	MEXT_INIT(m, buf, size, free, arg, rfa, ref, flag) {		\
	(m)->m_data = (m)->m_ext.ext_buf = (buf);			\
	(m)->m_flags |= M_EXT;						\
	(m)->m_ext.ext_size = (size);					\
	(m)->m_ext.ext_free = (free);					\
	(m)->m_ext.ext_arg = (arg);					\
	(m)->m_ext.ext_refs.forward = (m)->m_ext.ext_refs.backward =	\
	    &(m)->m_ext.ext_refs;					\
	MEXT_RFA(m) = (rfa);						\
	MEXT_REF(m) = (ref);						\
	MEXT_FLAGS(m) = (flag);						\
}

#define	MBUF_CL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_CL), NULL, NULL, rfa, ref, flag)

#define	MBUF_BIGCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_BIGCL), m_bigfree, NULL, rfa, ref, flag)

#define	MBUF_16KCL_INIT(m, buf, rfa, ref, flag)	\
	MEXT_INIT(m, buf, m_maxsize(MC_16KCL), m_16kfree, NULL, rfa, ref, flag)

/*
 * Macro to convert BSD malloc sleep flag to mcache's
 */
#define	MSLEEPF(f)	((!((f) & M_DONTWAIT)) ? MCR_SLEEP : MCR_NOSLEEP)

/*
 * The structure that holds all mbuf class statistics exportable via sysctl.
 * Similar to the mbstat structure, the mb_stat structure is protected by the
 * global mbuf lock.  It contains additional information about the classes
 * that allows for a more accurate view of the state of the allocator.
 */
struct mb_stat *mb_stat;
struct omb_stat *omb_stat;	/* For backwards compatibility */

#define	MB_STAT_SIZE(n) \
	((size_t)(&((mb_stat_t *)0)->mbs_class[n]))
#define	OMB_STAT_SIZE(n) \
	((size_t)(&((struct omb_stat *)0)->mbs_class[n]))

/*
 * The legacy structure holding all of the mbuf allocation statistics.
 * The actual statistics used by the kernel are stored in the mbuf_table
 * instead, and are updated atomically while the global mbuf lock is held.
 * They are mirrored in mbstat to support legacy applications (e.g. netstat).
 * Unlike before, the kernel no longer relies on the contents of mbstat for
 * its operations (e.g. cluster expansion) because the structure is exposed
 * to the outside and could possibly be modified, therefore making it unsafe.
 * With the exception of the mbstat.m_mtypes array (see below), all of the
 * statistics are updated as they change.
 */
struct mbstat mbstat;

#define	MBSTAT_MTYPES_MAX \
	(sizeof (mbstat.m_mtypes) / sizeof (mbstat.m_mtypes[0]))

/*
 * Allocation statistics related to mbuf types (up to MT_MAX-1) are updated
 * atomically and stored in a per-CPU structure which is lock-free; this is
 * done in order to avoid writing to the global mbstat data structure which
 * would cause false sharing.  During a sysctl request for kern.ipc.mbstat,
 * the statistics across all CPUs will be converged into the mbstat.m_mtypes
 * array and returned to the application.  Any updates for types greater than
 * or equal to MT_MAX would be done atomically to the mbstat; this slows down
 * performance but is okay since the kernel uses only up to MT_MAX-1 while
 * anything beyond that (up to type 255) is considered a corner case.
 */
typedef struct {
	unsigned int cpu_mtypes[MT_MAX];
} __attribute__((aligned(CPU_CACHE_SIZE), packed)) mtypes_cpu_t;

typedef struct {
	mtypes_cpu_t	mbs_cpu[1];
} mbuf_mtypes_t;

static mbuf_mtypes_t *mbuf_mtypes;	/* per-CPU statistics */

#define	MBUF_MTYPES_SIZE(n) \
	((size_t)(&((mbuf_mtypes_t *)0)->mbs_cpu[n]))

#define	MTYPES_CPU(p) \
	((mtypes_cpu_t *)(void *)((char *)(p) + MBUF_MTYPES_SIZE(cpu_number())))

#define	mtype_stat_add(type, n) {					\
	if ((unsigned)(type) < MT_MAX) {				\
		mtypes_cpu_t *mbs = MTYPES_CPU(mbuf_mtypes);		\
		atomic_add_32(&mbs->cpu_mtypes[type], n);		\
	} else if ((unsigned)(type) < (unsigned)MBSTAT_MTYPES_MAX) {	\
		atomic_add_16((int16_t *)&mbstat.m_mtypes[type], n);	\
	}								\
}

#define	mtype_stat_sub(t, n)	mtype_stat_add(t, -(n))
#define	mtype_stat_inc(t)	mtype_stat_add(t, 1)
#define	mtype_stat_dec(t)	mtype_stat_sub(t, 1)

static void
mbuf_mtypes_sync(boolean_t locked)
{
	int m, n;
	mtypes_cpu_t mtc;

	if (locked)
		lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	bzero(&mtc, sizeof (mtc));
	for (m = 0; m < ncpu; m++) {
		mtypes_cpu_t *scp = &mbuf_mtypes->mbs_cpu[m];
		mtypes_cpu_t temp;

		bcopy(&scp->cpu_mtypes, &temp.cpu_mtypes,
		    sizeof (temp.cpu_mtypes));

		for (n = 0; n < MT_MAX; n++)
			mtc.cpu_mtypes[n] += temp.cpu_mtypes[n];
	}
	if (!locked)
		lck_mtx_lock(mbuf_mlock);
	for (n = 0; n < MT_MAX; n++)
		mbstat.m_mtypes[n] = mtc.cpu_mtypes[n];
	if (!locked)
		lck_mtx_unlock(mbuf_mlock);
}

static int
mbstat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	mbuf_mtypes_sync(FALSE);

	return (SYSCTL_OUT(req, &mbstat, sizeof (mbstat)));
}

static void
mbuf_stat_sync(void)
{
	mb_class_stat_t *sp;
	mcache_cpu_t *ccp;
	mcache_t *cp;
	int k, m, bktsize;

	lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED);

	for (k = 0; k < NELEM(mbuf_table); k++) {
		cp = m_cache(k);
		ccp = &cp->mc_cpu[0];
		bktsize = ccp->cc_bktsize;
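		/*
		 * Derive the cache state for this class and count the
		 * objects currently cached at the mcache layer: objects
		 * loaded in each CPU's buckets (cc_objs / cc_pobjs) plus
		 * the full buckets kept by the cache (mc_full), so that
		 * mbcl_active below ends up counting only objects held
		 * by clients.
		 */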
		sp = mbuf_table[k].mtbl_stats;

		if (cp->mc_flags & MCF_NOCPUCACHE)
			sp->mbcl_mc_state = MCS_DISABLED;
		else if (cp->mc_purge_cnt > 0)
			sp->mbcl_mc_state = MCS_PURGING;
		else if (bktsize == 0)
			sp->mbcl_mc_state = MCS_OFFLINE;
		else
			sp->mbcl_mc_state = MCS_ONLINE;

		sp->mbcl_mc_cached = 0;
		for (m = 0; m < ncpu; m++) {
			ccp = &cp->mc_cpu[m];
			if (ccp->cc_objs > 0)
				sp->mbcl_mc_cached += ccp->cc_objs;
			if (ccp->cc_pobjs > 0)
				sp->mbcl_mc_cached += ccp->cc_pobjs;
		}
		sp->mbcl_mc_cached += (cp->mc_full.bl_total * bktsize);
		sp->mbcl_active = sp->mbcl_total - sp->mbcl_mc_cached -
		    sp->mbcl_infree;

		sp->mbcl_mc_waiter_cnt = cp->mc_waiter_cnt;
		sp->mbcl_mc_wretry_cnt = cp->mc_wretry_cnt;
		sp->mbcl_mc_nwretry_cnt = cp->mc_nwretry_cnt;

		/* Calculate total count specific to each class */
		sp->mbcl_ctotal = sp->mbcl_total;
		switch (m_class(k)) {
		case MC_MBUF:
			/* Deduct mbufs used in composite caches */
			sp->mbcl_ctotal -= (m_total(MC_MBUF_CL) +
			    m_total(MC_MBUF_BIGCL));
			break;

		case MC_CL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_CL);
			break;

		case MC_BIGCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_BIGCL);
			break;

		case MC_16KCL:
			/* Deduct clusters used in composite cache */
			sp->mbcl_ctotal -= m_total(MC_MBUF_16KCL);
			break;

		default:
			break;
		}
	}
}

static int
mb_stat_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	void *statp;
	int k, statsz, proc64 = proc_is64bit(req->p);

	lck_mtx_lock(mbuf_mlock);
	mbuf_stat_sync();

	if (!proc64) {
		struct omb_class_stat *oc;
		struct mb_class_stat *c;

		omb_stat->mbs_cnt = mb_stat->mbs_cnt;
		oc = &omb_stat->mbs_class[0];
		c = &mb_stat->mbs_class[0];
		for (k = 0; k < omb_stat->mbs_cnt; k++, oc++, c++) {
			(void) snprintf(oc->mbcl_cname, sizeof (oc->mbcl_cname),
			    "%s", c->mbcl_cname);
			oc->mbcl_size = c->mbcl_size;
			oc->mbcl_total = c->mbcl_total;
			oc->mbcl_active = c->mbcl_active;
			oc->mbcl_infree = c->mbcl_infree;
			oc->mbcl_slab_cnt = c->mbcl_slab_cnt;
			oc->mbcl_alloc_cnt = c->mbcl_alloc_cnt;
			oc->mbcl_free_cnt = c->mbcl_free_cnt;
			oc->mbcl_notified = c->mbcl_notified;
			oc->mbcl_purge_cnt = c->mbcl_purge_cnt;
			oc->mbcl_fail_cnt = c->mbcl_fail_cnt;
			oc->mbcl_ctotal = c->mbcl_ctotal;
			oc->mbcl_mc_state = c->mbcl_mc_state;
			oc->mbcl_mc_cached = c->mbcl_mc_cached;
			oc->mbcl_mc_waiter_cnt = c->mbcl_mc_waiter_cnt;
			oc->mbcl_mc_wretry_cnt = c->mbcl_mc_wretry_cnt;
			oc->mbcl_mc_nwretry_cnt = c->mbcl_mc_nwretry_cnt;
		}
		statp = omb_stat;
		statsz = OMB_STAT_SIZE(NELEM(mbuf_table));
	} else {
		statp = mb_stat;
		statsz = MB_STAT_SIZE(NELEM(mbuf_table));
	}

	lck_mtx_unlock(mbuf_mlock);

	return (SYSCTL_OUT(req, statp, statsz));
}

static int
mleak_top_trace_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak)
		return (ENXIO);

	lck_mtx_lock(mleak_lock);
	mleak_update_stats();
	i = SYSCTL_OUT(req, mleak_stat, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES));
	lck_mtx_unlock(mleak_lock);

	return (i);
}
static int
mleak_table_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	int i = 0;

	/* Ensure leak tracing turned on */
	if (!mclfindleak || !mclexpleak)
		return (ENXIO);

	lck_mtx_lock(mleak_lock);
	i = SYSCTL_OUT(req, &mleak_table, sizeof (mleak_table));
	lck_mtx_unlock(mleak_lock);

	return (i);
}

static inline void
m_incref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old + 1;
		ASSERT(new != 0);
	} while (!OSCompareAndSwap(old, new, addr));

	/*
	 * If cluster is shared, mark it with (sticky) EXTF_READONLY;
	 * we don't clear the flag when the refcount goes back to 1
	 * to simplify code calling m_mclhasreference().
	 */
	if (new > 1 && !(MEXT_FLAGS(m) & EXTF_READONLY))
		(void) OSBitOrAtomic(EXTF_READONLY, &MEXT_FLAGS(m));
}

static inline u_int32_t
m_decref(struct mbuf *m)
{
	UInt32 old, new;
	volatile UInt32 *addr = (volatile UInt32 *)&MEXT_REF(m);

	do {
		old = *addr;
		new = old - 1;
		ASSERT(old != 0);
	} while (!OSCompareAndSwap(old, new, addr));

	return (new);
}

static void
mbuf_table_init(void)
{
	unsigned int b, c, s;
	int m;

	MALLOC(omb_stat, struct omb_stat *, OMB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(omb_stat != NULL);

	MALLOC(mb_stat, mb_stat_t *, MB_STAT_SIZE(NELEM(mbuf_table)),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(mb_stat != NULL);

	mb_stat->mbs_cnt = NELEM(mbuf_table);
	for (m = 0; m < NELEM(mbuf_table); m++)
		mbuf_table[m].mtbl_stats = &mb_stat->mbs_class[m];

#if CONFIG_MBUF_JUMBO
	/*
	 * Set aside 1/3 of the mbuf cluster map for jumbo clusters; we do
	 * this only on platforms where the jumbo cluster pool is enabled.
	 */
	njcl = nmbclusters / 3;
	njclbytes = M16KCLBYTES;
#endif /* CONFIG_MBUF_JUMBO */

	/*
	 * nclusters holds both the 2KB and 4KB pools, so ensure it's
	 * a multiple of 4KB clusters.
	 */
	nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
	if (njcl > 0) {
		/*
		 * Each jumbo cluster takes 8 2KB clusters, so make
		 * sure that the pool size is evenly divisible by 8;
		 * njcl is in 2KB unit, hence treated as such.
		 */
		njcl = P2ROUNDDOWN(nmbclusters - nclusters, 8);

		/* Update nclusters with rounded down value of njcl */
		nclusters = P2ROUNDDOWN(nmbclusters - njcl, NCLPBG);
	}

	/*
	 * njcl is valid only on platforms with 16KB jumbo clusters, where
	 * it is configured to 1/3 of the pool size.  On these platforms,
	 * the remainder is used for 2KB and 4KB clusters.  On platforms
	 * without 16KB jumbo clusters, the entire pool is used for both
	 * 2KB and 4KB clusters.  A 4KB cluster can either be split into
	 * 16 mbufs, or into 2 2KB clusters.
	 *
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *  | c | b |              s             |        njcl       |
	 *  +---+---+------------ ... -----------+------- ... -------+
	 *
	 * 1/32nd of the shared region is reserved for pure 2KB and 4KB
	 * clusters (1/64th each.)
	 */
	c = P2ROUNDDOWN((nclusters >> 6), 2);		/* in 2KB unit */
	b = P2ROUNDDOWN((nclusters >> (6 + NCLPBGSHIFT)), 2); /* in 4KB unit */
	s = nclusters - (c + (b << NCLPBGSHIFT));	/* in 2KB unit */

	/*
	 * 1/64th (c) is reserved for 2KB clusters.
	 */
	m_minlimit(MC_CL) = c;
	m_maxlimit(MC_CL) = s + c;			/* in 2KB unit */
	m_maxsize(MC_CL) = m_size(MC_CL) = MCLBYTES;
	(void) snprintf(m_cname(MC_CL), MAX_MBUF_CNAME, "cl");

	/*
	 * Another 1/64th (b) of the map is reserved for 4KB clusters.
	 * It cannot be turned into 2KB clusters or mbufs.
	 */
	m_minlimit(MC_BIGCL) = b;
	m_maxlimit(MC_BIGCL) = (s >> NCLPBGSHIFT) + b;	/* in 4KB unit */
	m_maxsize(MC_BIGCL) = m_size(MC_BIGCL) = MBIGCLBYTES;
	(void) snprintf(m_cname(MC_BIGCL), MAX_MBUF_CNAME, "bigcl");

	/*
	 * The remaining 31/32nds (s) are all-purpose (mbufs, 2KB, or 4KB).
	 */
	m_minlimit(MC_MBUF) = 0;
	m_maxlimit(MC_MBUF) = (s << NMBPCLSHIFT);	/* in mbuf unit */
	m_maxsize(MC_MBUF) = m_size(MC_MBUF) = MSIZE;
	(void) snprintf(m_cname(MC_MBUF), MAX_MBUF_CNAME, "mbuf");

	/*
	 * Set limits for the composite classes.
	 */
	m_minlimit(MC_MBUF_CL) = 0;
	m_maxlimit(MC_MBUF_CL) = m_maxlimit(MC_CL);
	m_maxsize(MC_MBUF_CL) = MCLBYTES;
	m_size(MC_MBUF_CL) = m_size(MC_MBUF) + m_size(MC_CL);
	(void) snprintf(m_cname(MC_MBUF_CL), MAX_MBUF_CNAME, "mbuf_cl");

	m_minlimit(MC_MBUF_BIGCL) = 0;
	m_maxlimit(MC_MBUF_BIGCL) = m_maxlimit(MC_BIGCL);
	m_maxsize(MC_MBUF_BIGCL) = MBIGCLBYTES;
	m_size(MC_MBUF_BIGCL) = m_size(MC_MBUF) + m_size(MC_BIGCL);
	(void) snprintf(m_cname(MC_MBUF_BIGCL), MAX_MBUF_CNAME, "mbuf_bigcl");

	/*
	 * And for jumbo classes.
	 */
	m_minlimit(MC_16KCL) = 0;
	m_maxlimit(MC_16KCL) = (njcl >> NCLPJCLSHIFT);	/* in 16KB unit */
	m_maxsize(MC_16KCL) = m_size(MC_16KCL) = M16KCLBYTES;
	(void) snprintf(m_cname(MC_16KCL), MAX_MBUF_CNAME, "16kcl");

	m_minlimit(MC_MBUF_16KCL) = 0;
	m_maxlimit(MC_MBUF_16KCL) = m_maxlimit(MC_16KCL);
	m_maxsize(MC_MBUF_16KCL) = M16KCLBYTES;
	m_size(MC_MBUF_16KCL) = m_size(MC_MBUF) + m_size(MC_16KCL);
	(void) snprintf(m_cname(MC_MBUF_16KCL), MAX_MBUF_CNAME, "mbuf_16kcl");
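
	/*
	 * Worked example (illustrative only, assuming the usual values
	 * MSIZE = 256, MCLBYTES = 2KB and MBIGCLBYTES = 4KB, i.e.
	 * NCLPBGSHIFT = 1 and NMBPCLSHIFT = 3): with nmbclusters = 32768
	 * (a 64MB pool) and njcl = 0,
	 *
	 *	nclusters = 32768
	 *	c = 32768 >> 6 = 512	(2KB units reserved for MC_CL)
	 *	b = 32768 >> 7 = 256	(4KB units reserved for MC_BIGCL)
	 *	s = 32768 - (512 + 512) = 31744	(general-purpose 2KB units)
	 *
	 * which yields m_maxlimit(MC_CL) = 32256, m_maxlimit(MC_BIGCL) =
	 * 16128 and m_maxlimit(MC_MBUF) = 253952.  The limits overlap on
	 * purpose: the general-purpose region (s) can back any of the
	 * three rudimentary classes.
	 */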

	/*
	 * Initialize the legacy mbstat structure.
	 */
	bzero(&mbstat, sizeof (mbstat));
	mbstat.m_msize = m_maxsize(MC_MBUF);
	mbstat.m_mclbytes = m_maxsize(MC_CL);
	mbstat.m_minclsize = MINCLSIZE;
	mbstat.m_mlen = MLEN;
	mbstat.m_mhlen = MHLEN;
	mbstat.m_bigmclbytes = m_maxsize(MC_BIGCL);
}

#if defined(__LP64__)
typedef struct ncl_tbl {
	uint64_t nt_maxmem;	/* memory (sane) size */
	uint32_t nt_mbpool;	/* mbuf pool size */
} ncl_tbl_t;

/* Non-server */
static ncl_tbl_t ncl_table[] = {
	{ (1ULL << GBSHIFT)	  /*  1 GB */,	(64 << MBSHIFT)	 /*  64 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */,	(96 << MBSHIFT)	 /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */,	(128 << MBSHIFT) /* 128 MB */ },
	{ 0, 0 }
};

/* Server */
static ncl_tbl_t ncl_table_srv[] = {
	{ (1ULL << GBSHIFT)	  /*  1 GB */,	(96 << MBSHIFT)	 /*  96 MB */ },
	{ (1ULL << (GBSHIFT + 2)) /*  4 GB */,	(128 << MBSHIFT) /* 128 MB */ },
	{ (1ULL << (GBSHIFT + 3)) /*  8 GB */,	(160 << MBSHIFT) /* 160 MB */ },
	{ (1ULL << (GBSHIFT + 4)) /* 16 GB */,	(192 << MBSHIFT) /* 192 MB */ },
	{ (1ULL << (GBSHIFT + 5)) /* 32 GB */,	(256 << MBSHIFT) /* 256 MB */ },
	{ (1ULL << (GBSHIFT + 6)) /* 64 GB */,	(384 << MBSHIFT) /* 384 MB */ },
	{ 0, 0 }
};
#endif /* __LP64__ */

__private_extern__ unsigned int
mbuf_default_ncl(int server, uint64_t mem)
{
#if !defined(__LP64__)
#pragma unused(server)
	unsigned int n;
	/*
	 * 32-bit kernel (default to 64MB of mbuf pool for >= 1GB RAM).
	 */
	if ((n = ((mem / 16) / MCLBYTES)) > 32768)
		n = 32768;
#else
	unsigned int n, i;
	ncl_tbl_t *tbl = (server ? ncl_table_srv : ncl_table);
	/*
	 * 64-bit kernel (mbuf pool size based on table).
	 */
	n = tbl[0].nt_mbpool;
	for (i = 0; tbl[i].nt_mbpool != 0; i++) {
		if (mem < tbl[i].nt_maxmem)
			break;
		n = tbl[i].nt_mbpool;
	}
	n >>= MCLSHIFT;
#endif /* !__LP64__ */
	return (n);
}

__private_extern__ void
mbinit(void)
{
	unsigned int m;
	unsigned int initmcl = 0;
	void *buf;
	thread_t thread = THREAD_NULL;

	/*
	 * These MBUF_ values must be equal to their private counterparts.
	 */
	_CASSERT(MBUF_EXT == M_EXT);
	_CASSERT(MBUF_PKTHDR == M_PKTHDR);
	_CASSERT(MBUF_EOR == M_EOR);
	_CASSERT(MBUF_LOOP == M_LOOP);
	_CASSERT(MBUF_BCAST == M_BCAST);
	_CASSERT(MBUF_MCAST == M_MCAST);
	_CASSERT(MBUF_FRAG == M_FRAG);
	_CASSERT(MBUF_FIRSTFRAG == M_FIRSTFRAG);
	_CASSERT(MBUF_LASTFRAG == M_LASTFRAG);
	_CASSERT(MBUF_PROMISC == M_PROMISC);
	_CASSERT(MBUF_HASFCS == M_HASFCS);

	_CASSERT(MBUF_TYPE_FREE == MT_FREE);
	_CASSERT(MBUF_TYPE_DATA == MT_DATA);
	_CASSERT(MBUF_TYPE_HEADER == MT_HEADER);
	_CASSERT(MBUF_TYPE_SOCKET == MT_SOCKET);
	_CASSERT(MBUF_TYPE_PCB == MT_PCB);
	_CASSERT(MBUF_TYPE_RTABLE == MT_RTABLE);
	_CASSERT(MBUF_TYPE_HTABLE == MT_HTABLE);
	_CASSERT(MBUF_TYPE_ATABLE == MT_ATABLE);
	_CASSERT(MBUF_TYPE_SONAME == MT_SONAME);
	_CASSERT(MBUF_TYPE_SOOPTS == MT_SOOPTS);
	_CASSERT(MBUF_TYPE_FTABLE == MT_FTABLE);
	_CASSERT(MBUF_TYPE_RIGHTS == MT_RIGHTS);
	_CASSERT(MBUF_TYPE_IFADDR == MT_IFADDR);
	_CASSERT(MBUF_TYPE_CONTROL == MT_CONTROL);
	_CASSERT(MBUF_TYPE_OOBDATA == MT_OOBDATA);

	_CASSERT(MBUF_TSO_IPV4 == CSUM_TSO_IPV4);
	_CASSERT(MBUF_TSO_IPV6 == CSUM_TSO_IPV6);
	_CASSERT(MBUF_CSUM_REQ_SUM16 == CSUM_TCP_SUM16);
	_CASSERT(MBUF_CSUM_TCP_SUM16 == MBUF_CSUM_REQ_SUM16);
	_CASSERT(MBUF_CSUM_REQ_IP == CSUM_IP);
	_CASSERT(MBUF_CSUM_REQ_TCP == CSUM_TCP);
	_CASSERT(MBUF_CSUM_REQ_UDP == CSUM_UDP);
	_CASSERT(MBUF_CSUM_REQ_TCPIPV6 == CSUM_TCPIPV6);
	_CASSERT(MBUF_CSUM_REQ_UDPIPV6 == CSUM_UDPIPV6);
	_CASSERT(MBUF_CSUM_DID_IP == CSUM_IP_CHECKED);
	_CASSERT(MBUF_CSUM_IP_GOOD == CSUM_IP_VALID);
	_CASSERT(MBUF_CSUM_DID_DATA == CSUM_DATA_VALID);
	_CASSERT(MBUF_CSUM_PSEUDO_HDR == CSUM_PSEUDO_HDR);

	_CASSERT(MBUF_WAITOK == M_WAIT);
	_CASSERT(MBUF_DONTWAIT == M_DONTWAIT);
	_CASSERT(MBUF_COPYALL == M_COPYALL);

	_CASSERT(MBUF_PKTAUXF_INET_RESOLVE_RTR == MAUXF_INET_RESOLVE_RTR);
	_CASSERT(MBUF_PKTAUXF_INET6_RESOLVE_RTR == MAUXF_INET6_RESOLVE_RTR);

	_CASSERT(MBUF_SC2TC(MBUF_SC_BK_SYS) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BK) == MBUF_TC_BK);
	_CASSERT(MBUF_SC2TC(MBUF_SC_BE) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RD) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_OAM) == MBUF_TC_BE);
	_CASSERT(MBUF_SC2TC(MBUF_SC_AV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_RV) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VI) == MBUF_TC_VI);
	_CASSERT(MBUF_SC2TC(MBUF_SC_VO) == MBUF_TC_VO);
	_CASSERT(MBUF_SC2TC(MBUF_SC_CTL) == MBUF_TC_VO);

	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BK) == SCVAL_BK);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_BE) == SCVAL_BE);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VI) == SCVAL_VI);
	_CASSERT(MBUF_TC2SCVAL(MBUF_TC_VO) == SCVAL_VO);

	if (nmbclusters == 0)
		nmbclusters = NMBCLUSTERS;

	/* This should be a sane (at least even) value by now */
	VERIFY(nmbclusters != 0 && !(nmbclusters & 0x1));

	/* Setup the mbuf table */
	mbuf_table_init();

	/* Global lock for common layer */
	mbuf_mlock_grp_attr = lck_grp_attr_alloc_init();
	mbuf_mlock_grp = lck_grp_alloc_init("mbuf", mbuf_mlock_grp_attr);
	mbuf_mlock_attr = lck_attr_alloc_init();
	lck_mtx_init(mbuf_mlock, mbuf_mlock_grp, mbuf_mlock_attr);

	/*
	 * Allocate cluster slabs table:
	 *
	 *	maxslabgrp = (N * 2048) / (1024 * 1024)
	 *
	 * Where N is nmbclusters rounded up to the nearest 512.
	 * This yields mcl_slabg_t units, each one representing 1 MB of memory.
	 */
	maxslabgrp =
	    (P2ROUNDUP(nmbclusters, (MBSIZE >> 11)) << MCLSHIFT) >> MBSHIFT;
	MALLOC(slabstbl, mcl_slabg_t **, maxslabgrp * sizeof (mcl_slabg_t *),
	    M_TEMP, M_WAITOK | M_ZERO);
	VERIFY(slabstbl != NULL);

	/*
	 * Allocate audit structures, if needed:
	 *
	 *	maxclaudit = (maxslabgrp * 1024 * 1024) / 4096
	 *
	 * This yields mcl_audit_t units, each one representing a page.
	 */
	PE_parse_boot_argn("mbuf_debug", &mbuf_debug, sizeof (mbuf_debug));
	mbuf_debug |= mcache_getflags();
	if (mbuf_debug & MCF_DEBUG) {
		maxclaudit = ((maxslabgrp << MBSHIFT) >> PGSHIFT);
		MALLOC(mclaudit, mcl_audit_t *, maxclaudit * sizeof (*mclaudit),
		    M_TEMP, M_WAITOK | M_ZERO);
		VERIFY(mclaudit != NULL);

		mcl_audit_con_cache = mcache_create("mcl_audit_contents",
		    AUDIT_CONTENTS_SIZE, 0, 0, MCR_SLEEP);
		VERIFY(mcl_audit_con_cache != NULL);
	}
	mclverify = (mbuf_debug & MCF_VERIFY);
	mcltrace = (mbuf_debug & MCF_TRACE);
	mclfindleak = !(mbuf_debug & MCF_NOLEAKLOG);
	mclexpleak = mclfindleak && (mbuf_debug & MCF_EXPLEAKLOG);

	/* Enable mbuf leak logging, with a lock to protect the tables */

	mleak_lock_grp_attr = lck_grp_attr_alloc_init();
	mleak_lock_grp = lck_grp_alloc_init("mleak_lock", mleak_lock_grp_attr);
	mleak_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(mleak_lock, mleak_lock_grp, mleak_lock_attr);

	mleak_activate();

	/* Calculate the number of pages assigned to the cluster pool */
	mcl_pages = (nmbclusters * MCLBYTES) / CLBYTES;
	MALLOC(mcl_paddr, ppnum_t *, mcl_pages * sizeof (ppnum_t),
	    M_TEMP, M_WAITOK);
	VERIFY(mcl_paddr != NULL);

	/* Register with the I/O Bus mapper */
	mcl_paddr_base = IOMapperIOVMAlloc(mcl_pages);
	bzero((char *)mcl_paddr, mcl_pages * sizeof (ppnum_t));

	embutl = (union mbigcluster *)
	    ((void *)((unsigned char *)mbutl + (nmbclusters * MCLBYTES)));
	VERIFY((((char *)embutl - (char *)mbutl) % MBIGCLBYTES) == 0);

	/* Prime up the freelist */
	PE_parse_boot_argn("initmcl", &initmcl, sizeof (initmcl));
	if (initmcl != 0) {
		initmcl >>= NCLPBGSHIFT;	/* become a 4K unit */
		if (initmcl > m_maxlimit(MC_BIGCL))
			initmcl = m_maxlimit(MC_BIGCL);
	}
	if (initmcl < m_minlimit(MC_BIGCL))
		initmcl = m_minlimit(MC_BIGCL);

	lck_mtx_lock(mbuf_mlock);

	/*
	 * For classes with non-zero minimum limits, populate their freelists
	 * so that m_total(class) is at least m_minlimit(class).
	 */
	VERIFY(m_total(MC_BIGCL) == 0 && m_minlimit(MC_BIGCL) != 0);
	freelist_populate(m_class(MC_BIGCL), initmcl, M_WAIT);
	VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL));
	freelist_init(m_class(MC_CL));

	for (m = 0; m < NELEM(mbuf_table); m++) {
		/* Make sure we didn't miss any */
		VERIFY(m_minlimit(m_class(m)) == 0 ||
		    m_total(m_class(m)) >= m_minlimit(m_class(m)));
	}

	lck_mtx_unlock(mbuf_mlock);

	(void) kernel_thread_start((thread_continue_t)mbuf_worker_thread_init,
	    NULL, &thread);
	thread_deallocate(thread);

	ref_cache = mcache_create("mext_ref", sizeof (struct ext_ref),
	    0, 0, MCR_SLEEP);

	/* Create the cache for each class */
	for (m = 0; m < NELEM(mbuf_table); m++) {
		void *allocfunc, *freefunc, *auditfunc, *logfunc;
		u_int32_t flags;

		flags = mbuf_debug;
		if (m_class(m) == MC_MBUF_CL || m_class(m) == MC_MBUF_BIGCL ||
		    m_class(m) == MC_MBUF_16KCL) {
			allocfunc = mbuf_cslab_alloc;
			freefunc = mbuf_cslab_free;
			auditfunc = mbuf_cslab_audit;
			logfunc = mleak_logger;
		} else {
			allocfunc = mbuf_slab_alloc;
			freefunc = mbuf_slab_free;
			auditfunc = mbuf_slab_audit;
			logfunc = mleak_logger;
		}

		/*
		 * Disable per-CPU caches for jumbo classes if there
		 * is no jumbo cluster pool available in the system.
		 * The cache itself is still created (but will never
		 * be populated) since it simplifies the code.
		 */
		if ((m_class(m) == MC_MBUF_16KCL || m_class(m) == MC_16KCL) &&
		    njcl == 0)
			flags |= MCF_NOCPUCACHE;

		if (!mclfindleak)
			flags |= MCF_NOLEAKLOG;

		m_cache(m) = mcache_create_ext(m_cname(m), m_maxsize(m),
		    allocfunc, freefunc, auditfunc, logfunc, mbuf_slab_notify,
		    (void *)(uintptr_t)m, flags, MCR_SLEEP);
	}

	/*
	 * Allocate structure for per-CPU statistics that's aligned
	 * on the CPU cache boundary; this code assumes that we never
	 * uninitialize this framework, since the original address
	 * before alignment is not saved.
	 */
	ncpu = ml_get_max_cpus();
	MALLOC(buf, void *, MBUF_MTYPES_SIZE(ncpu) + CPU_CACHE_SIZE,
	    M_TEMP, M_WAITOK);
	VERIFY(buf != NULL);

	mbuf_mtypes = (mbuf_mtypes_t *)P2ROUNDUP((intptr_t)buf, CPU_CACHE_SIZE);
	bzero(mbuf_mtypes, MBUF_MTYPES_SIZE(ncpu));

	/*
	 * Set the max limit on sb_max to be 1/16th of the size of
	 * memory allocated for mbuf clusters.
	 */
	high_sb_max = (nmbclusters << (MCLSHIFT - 4));
	if (high_sb_max < sb_max) {
		/* sb_max is too large for this configuration, scale it down */
		if (high_sb_max > (1 << MBSHIFT)) {
			/* We have at least 16 MB of mbuf pool */
			sb_max = high_sb_max;
		} else if ((nmbclusters << MCLSHIFT) > (1 << MBSHIFT)) {
			/*
			 * If we have more than 1 MB of mbuf pool, cap the
			 * size of the max sock buf at 1 MB.
			 */
			sb_max = high_sb_max = (1 << MBSHIFT);
		} else {
			sb_max = high_sb_max;
		}
	}

	/* allocate space for mbuf_dump_buf */
	MALLOC(mbuf_dump_buf, char *, MBUF_DUMP_BUF_SIZE, M_TEMP, M_WAITOK);
	VERIFY(mbuf_dump_buf != NULL);

	printf("mbinit: done [%d MB total pool size, (%d/%d) split]\n",
	    (nmbclusters << MCLSHIFT) >> MBSHIFT,
	    (nclusters << MCLSHIFT) >> MBSHIFT,
	    (njcl << MCLSHIFT) >> MBSHIFT);
}
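
/*
 * Illustrative sketch (not compiled): once mbinit() has run, consumers
 * obtain buffers through the public m_*() routines and macros, which in
 * turn draw from the per-CPU caches and slab layer set up above.  The
 * interfaces named here are real; the surrounding control flow and the
 * 'some_len' variable are hypothetical.
 */
#if 0
	struct mbuf *m;

	m = m_gethdr(M_DONTWAIT, MT_DATA);	/* mbuf only (MC_MBUF) */
	if (m != NULL && some_len > MHLEN)
		MCLGET(m, M_DONTWAIT);		/* attach a 2KB cluster (MC_CL) */
	if (m != NULL)
		m_freem(m);			/* objects return to their caches */
#endif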
1581 */ 1582static mcache_obj_t * 1583slab_alloc(mbuf_class_t class, int wait) 1584{ 1585 mcl_slab_t *sp; 1586 mcache_obj_t *buf; 1587 1588 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 1589 1590 VERIFY(class != MC_16KCL || njcl > 0); 1591 1592 /* This should always be NULL for us */ 1593 VERIFY(m_cobjlist(class) == NULL); 1594 1595 /* 1596 * Treat composite objects as having longer lifespan by using 1597 * a slab from the reverse direction, in hoping that this could 1598 * reduce the probability of fragmentation for slabs that hold 1599 * more than one buffer chunks (e.g. mbuf slabs). For other 1600 * slabs, this probably doesn't make much of a difference. 1601 */ 1602 if ((class == MC_MBUF || class == MC_CL) && (wait & MCR_COMP)) 1603 sp = (mcl_slab_t *)TAILQ_LAST(&m_slablist(class), mcl_slhead); 1604 else 1605 sp = (mcl_slab_t *)TAILQ_FIRST(&m_slablist(class)); 1606 1607 if (sp == NULL) { 1608 VERIFY(m_infree(class) == 0 && m_slab_cnt(class) == 0); 1609 /* The slab list for this class is empty */ 1610 return (NULL); 1611 } 1612 1613 VERIFY(m_infree(class) > 0); 1614 VERIFY(!slab_is_detached(sp)); 1615 VERIFY(sp->sl_class == class && 1616 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); 1617 buf = sp->sl_head; 1618 VERIFY(slab_inrange(sp, buf) && sp == slab_get(buf)); 1619 1620 if (class == MC_MBUF) { 1621 sp->sl_head = buf->obj_next; 1622 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NMBPBG - 1)); 1623 } else if (class == MC_CL) { 1624 sp->sl_head = buf->obj_next; 1625 VERIFY(sp->sl_head != NULL || sp->sl_refcnt == (NCLPBG - 1)); 1626 } else { 1627 sp->sl_head = NULL; 1628 } 1629 if (sp->sl_head != NULL && !slab_inrange(sp, sp->sl_head)) { 1630 slab_nextptr_panic(sp, sp->sl_head); 1631 /* In case sl_head is in the map but not in the slab */ 1632 VERIFY(slab_inrange(sp, sp->sl_head)); 1633 /* NOTREACHED */ 1634 } 1635 1636 /* Increment slab reference */ 1637 sp->sl_refcnt++; 1638 1639 if (mclaudit != NULL) { 1640 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); 1641 mca->mca_uflags = 0; 1642 /* Save contents on mbuf objects only */ 1643 if (class == MC_MBUF) 1644 mca->mca_uflags |= MB_SCVALID; 1645 } 1646 1647 if (class == MC_CL) { 1648 mbstat.m_clfree = (--m_infree(MC_CL)) + m_infree(MC_MBUF_CL); 1649 /* 1650 * A 2K cluster slab can have at most NCLPBG references. 1651 */ 1652 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NCLPBG && 1653 sp->sl_chunks == NCLPBG && 1654 sp->sl_len == m_maxsize(MC_BIGCL)); 1655 VERIFY(sp->sl_refcnt < NCLPBG || sp->sl_head == NULL); 1656 } else if (class == MC_BIGCL) { 1657 mbstat.m_bigclfree = (--m_infree(MC_BIGCL)) + 1658 m_infree(MC_MBUF_BIGCL); 1659 /* 1660 * A 4K cluster slab can have at most 1 reference. 1661 */ 1662 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && 1663 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); 1664 } else if (class == MC_16KCL) { 1665 mcl_slab_t *nsp; 1666 int k; 1667 1668 --m_infree(MC_16KCL); 1669 VERIFY(sp->sl_refcnt == 1 && sp->sl_chunks == 1 && 1670 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); 1671 /* 1672 * Increment 2nd-Nth slab reference, where N is NSLABSP16KB. 1673 * A 16KB big cluster takes NSLABSP16KB slabs, each having at 1674 * most 1 reference. 
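		 *
		 * As a worked example (assuming the 4KB big cluster layout
		 * used here, i.e. NSLABSP16KB == 4): the loop below walks
		 * slabs 2 through 4 of the 16KB buffer and raises each of
		 * their refcnts to 1, while only the first slab carries the
		 * length and chunk accounting for the whole cluster.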
1675 */ 1676 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { 1677 nsp = nsp->sl_next; 1678 /* Next slab must already be present */ 1679 VERIFY(nsp != NULL); 1680 nsp->sl_refcnt++; 1681 VERIFY(!slab_is_detached(nsp)); 1682 VERIFY(nsp->sl_class == MC_16KCL && 1683 nsp->sl_flags == (SLF_MAPPED | SLF_PARTIAL) && 1684 nsp->sl_refcnt == 1 && nsp->sl_chunks == 0 && 1685 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && 1686 nsp->sl_head == NULL); 1687 } 1688 } else { 1689 VERIFY(class == MC_MBUF); 1690 --m_infree(MC_MBUF); 1691 /* 1692 * If auditing is turned on, this check is 1693 * deferred until later in mbuf_slab_audit(). 1694 */ 1695 if (mclaudit == NULL) 1696 _MCHECK((struct mbuf *)buf); 1697 /* 1698 * Since we have incremented the reference count above, 1699 * an mbuf slab (formerly a 4KB cluster slab that was cut 1700 * up into mbufs) must have a reference count between 1 1701 * and NMBPBG at this point. 1702 */ 1703 VERIFY(sp->sl_refcnt >= 1 && sp->sl_refcnt <= NMBPBG && 1704 sp->sl_chunks == NMBPBG && 1705 sp->sl_len == m_maxsize(MC_BIGCL)); 1706 VERIFY(sp->sl_refcnt < NMBPBG || sp->sl_head == NULL); 1707 } 1708 1709 /* If empty, remove this slab from the class's freelist */ 1710 if (sp->sl_head == NULL) { 1711 VERIFY(class != MC_MBUF || sp->sl_refcnt == NMBPBG); 1712 VERIFY(class != MC_CL || sp->sl_refcnt == NCLPBG); 1713 slab_remove(sp, class); 1714 } 1715 1716 return (buf); 1717} 1718 1719/* 1720 * Place a slab of object(s) back into a class's slab list. 1721 */ 1722static void 1723slab_free(mbuf_class_t class, mcache_obj_t *buf) 1724{ 1725 mcl_slab_t *sp; 1726 1727 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 1728 1729 VERIFY(class != MC_16KCL || njcl > 0); 1730 VERIFY(buf->obj_next == NULL); 1731 sp = slab_get(buf); 1732 VERIFY(sp->sl_class == class && slab_inrange(sp, buf) && 1733 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); 1734 1735 /* Decrement slab reference */ 1736 sp->sl_refcnt--; 1737 1738 if (class == MC_CL) { 1739 VERIFY(IS_P2ALIGNED(buf, MCLBYTES)); 1740 /* 1741 * A slab that has been splitted for 2KB clusters can have 1742 * at most 1 outstanding reference at this point. 1743 */ 1744 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NCLPBG - 1) && 1745 sp->sl_chunks == NCLPBG && 1746 sp->sl_len == m_maxsize(MC_BIGCL)); 1747 VERIFY(sp->sl_refcnt < (NCLPBG - 1) || 1748 (slab_is_detached(sp) && sp->sl_head == NULL)); 1749 } else if (class == MC_BIGCL) { 1750 VERIFY(IS_P2ALIGNED(buf, MCLBYTES)); 1751 /* 1752 * A 4KB cluster slab can have at most 1 reference 1753 * which must be 0 at this point. 1754 */ 1755 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 && 1756 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); 1757 VERIFY(slab_is_detached(sp)); 1758 } else if (class == MC_16KCL) { 1759 mcl_slab_t *nsp; 1760 int k; 1761 /* 1762 * A 16KB cluster takes NSLABSP16KB slabs, all must 1763 * now have 0 reference. 
1764 */ 1765 VERIFY(IS_P2ALIGNED(buf, MBIGCLBYTES)); 1766 VERIFY(sp->sl_refcnt == 0 && sp->sl_chunks == 1 && 1767 sp->sl_len == m_maxsize(class) && sp->sl_head == NULL); 1768 VERIFY(slab_is_detached(sp)); 1769 for (nsp = sp, k = 1; k < NSLABSP16KB; k++) { 1770 nsp = nsp->sl_next; 1771 /* Next slab must already be present */ 1772 VERIFY(nsp != NULL); 1773 nsp->sl_refcnt--; 1774 VERIFY(slab_is_detached(nsp)); 1775 VERIFY(nsp->sl_class == MC_16KCL && 1776 (nsp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) && 1777 nsp->sl_refcnt == 0 && nsp->sl_chunks == 0 && 1778 nsp->sl_len == 0 && nsp->sl_base == sp->sl_base && 1779 nsp->sl_head == NULL); 1780 } 1781 } else { 1782 /* 1783 * A slab that has been splitted for mbufs has at most NMBPBG 1784 * reference counts. Since we have decremented one reference 1785 * above, it must now be between 0 and NMBPBG-1. 1786 */ 1787 VERIFY(class == MC_MBUF); 1788 VERIFY(sp->sl_refcnt >= 0 && sp->sl_refcnt <= (NMBPBG - 1) && 1789 sp->sl_chunks == NMBPBG && 1790 sp->sl_len == m_maxsize(MC_BIGCL)); 1791 VERIFY(sp->sl_refcnt < (NMBPBG - 1) || 1792 (slab_is_detached(sp) && sp->sl_head == NULL)); 1793 } 1794 1795 /* 1796 * When auditing is enabled, ensure that the buffer still 1797 * contains the free pattern. Otherwise it got corrupted 1798 * while at the CPU cache layer. 1799 */ 1800 if (mclaudit != NULL) { 1801 mcache_audit_t *mca = mcl_audit_buf2mca(class, buf); 1802 if (mclverify) { 1803 mcache_audit_free_verify(mca, buf, 0, m_maxsize(class)); 1804 } 1805 mca->mca_uflags &= ~MB_SCVALID; 1806 } 1807 1808 if (class == MC_CL) { 1809 mbstat.m_clfree = (++m_infree(MC_CL)) + m_infree(MC_MBUF_CL); 1810 buf->obj_next = sp->sl_head; 1811 } else if (class == MC_BIGCL) { 1812 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + 1813 m_infree(MC_MBUF_BIGCL); 1814 } else if (class == MC_16KCL) { 1815 ++m_infree(MC_16KCL); 1816 } else { 1817 ++m_infree(MC_MBUF); 1818 buf->obj_next = sp->sl_head; 1819 } 1820 sp->sl_head = buf; 1821 1822 /* 1823 * If a slab has been splitted to either one which holds 2KB clusters, 1824 * or one which holds mbufs, turn it back to one which holds a 4KB 1825 * cluster. 
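	 *
	 * A concrete illustration (assuming 256-byte mbufs and 2KB clusters
	 * carved from a 4KB page, i.e. NMBPBG == 16 and NCLPBG == 2): once
	 * all 16 mbufs, or both 2KB clusters, of a page are back on the
	 * freelist (sl_refcnt == 0) and the class is above its minimum, the
	 * page is pulled off that class's slab list below and reinitialized
	 * as a single MC_BIGCL slab.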
1826 */ 1827 if (class == MC_MBUF && sp->sl_refcnt == 0 && 1828 m_total(class) > m_minlimit(class) && 1829 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) { 1830 int i = NMBPBG; 1831 1832 m_total(MC_BIGCL)++; 1833 mbstat.m_bigclusters = m_total(MC_BIGCL); 1834 m_total(MC_MBUF) -= NMBPBG; 1835 mbstat.m_mbufs = m_total(MC_MBUF); 1836 m_infree(MC_MBUF) -= NMBPBG; 1837 mtype_stat_add(MT_FREE, -((unsigned)NMBPBG)); 1838 1839 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); 1840 VERIFY(m_total(MC_MBUF) >= m_minlimit(MC_MBUF)); 1841 1842 while (i--) { 1843 struct mbuf *m = sp->sl_head; 1844 VERIFY(m != NULL); 1845 sp->sl_head = m->m_next; 1846 m->m_next = NULL; 1847 } 1848 VERIFY(sp->sl_head == NULL); 1849 1850 /* Remove the slab from the mbuf class's slab list */ 1851 slab_remove(sp, class); 1852 1853 /* Reinitialize it as a 4KB cluster slab */ 1854 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base, 1855 sp->sl_len, 0, 1); 1856 1857 if (mclverify) { 1858 mcache_set_pattern(MCACHE_FREE_PATTERN, 1859 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL)); 1860 } 1861 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + 1862 m_infree(MC_MBUF_BIGCL); 1863 1864 VERIFY(slab_is_detached(sp)); 1865 /* And finally switch class */ 1866 class = MC_BIGCL; 1867 } else if (class == MC_CL && sp->sl_refcnt == 0 && 1868 m_total(class) > m_minlimit(class) && 1869 m_total(MC_BIGCL) < m_maxlimit(MC_BIGCL)) { 1870 int i = NCLPBG; 1871 1872 m_total(MC_BIGCL)++; 1873 mbstat.m_bigclusters = m_total(MC_BIGCL); 1874 m_total(MC_CL) -= NCLPBG; 1875 mbstat.m_clusters = m_total(MC_CL); 1876 m_infree(MC_CL) -= NCLPBG; 1877 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); 1878 VERIFY(m_total(MC_CL) >= m_minlimit(MC_CL)); 1879 1880 while (i--) { 1881 union mcluster *c = sp->sl_head; 1882 VERIFY(c != NULL); 1883 sp->sl_head = c->mcl_next; 1884 c->mcl_next = NULL; 1885 } 1886 VERIFY(sp->sl_head == NULL); 1887 1888 /* Remove the slab from the 2KB cluster class's slab list */ 1889 slab_remove(sp, class); 1890 1891 /* Reinitialize it as a 4KB cluster slab */ 1892 slab_init(sp, MC_BIGCL, sp->sl_flags, sp->sl_base, sp->sl_base, 1893 sp->sl_len, 0, 1); 1894 1895 if (mclverify) { 1896 mcache_set_pattern(MCACHE_FREE_PATTERN, 1897 (caddr_t)sp->sl_head, m_maxsize(MC_BIGCL)); 1898 } 1899 mbstat.m_bigclfree = (++m_infree(MC_BIGCL)) + 1900 m_infree(MC_MBUF_BIGCL); 1901 1902 VERIFY(slab_is_detached(sp)); 1903 /* And finally switch class */ 1904 class = MC_BIGCL; 1905 } 1906 1907 /* Reinsert the slab to the class's slab list */ 1908 if (slab_is_detached(sp)) 1909 slab_insert(sp, class); 1910} 1911 1912/* 1913 * Common allocator for rudimentary objects called by the CPU cache layer 1914 * during an allocation request whenever there is no available element in the 1915 * bucket layer. It returns one or more elements from the appropriate global 1916 * freelist. If the freelist is empty, it will attempt to populate it and 1917 * retry the allocation. 
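 *
 * This is the routine registered as the slab-layer allocfunc via
 * mcache_create_ext() in mbinit() above.  Roughly (a sketch; the variable
 * names are illustrative), mcache invokes it as
 *
 *	got = mbuf_slab_alloc((void *)(uintptr_t)class, &plist, num, wait);
 *
 * where plist tracks the tail of the caller's object list and the return
 * value is the number of objects actually linked in, which may be fewer
 * than num.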
1918 */ 1919static unsigned int 1920mbuf_slab_alloc(void *arg, mcache_obj_t ***plist, unsigned int num, int wait) 1921{ 1922 mbuf_class_t class = (mbuf_class_t)arg; 1923 unsigned int need = num; 1924 mcache_obj_t **list = *plist; 1925 1926 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); 1927 ASSERT(need > 0); 1928 1929 lck_mtx_lock(mbuf_mlock); 1930 1931 for (;;) { 1932 if ((*list = slab_alloc(class, wait)) != NULL) { 1933 (*list)->obj_next = NULL; 1934 list = *plist = &(*list)->obj_next; 1935 1936 if (--need == 0) { 1937 /* 1938 * If the number of elements in freelist has 1939 * dropped below low watermark, asynchronously 1940 * populate the freelist now rather than doing 1941 * it later when we run out of elements. 1942 */ 1943 if (!mbuf_cached_above(class, wait) && 1944 m_infree(class) < m_total(class) >> 5) { 1945 (void) freelist_populate(class, 1, 1946 M_DONTWAIT); 1947 } 1948 break; 1949 } 1950 } else { 1951 VERIFY(m_infree(class) == 0 || class == MC_CL); 1952 1953 (void) freelist_populate(class, 1, 1954 (wait & MCR_NOSLEEP) ? M_DONTWAIT : M_WAIT); 1955 1956 if (m_infree(class) > 0) 1957 continue; 1958 1959 /* Check if there's anything at the cache layer */ 1960 if (mbuf_cached_above(class, wait)) 1961 break; 1962 1963 /* watchdog checkpoint */ 1964 mbuf_watchdog(); 1965 1966 /* We have nothing and cannot block; give up */ 1967 if (wait & MCR_NOSLEEP) { 1968 if (!(wait & MCR_TRYHARD)) { 1969 m_fail_cnt(class)++; 1970 mbstat.m_drops++; 1971 break; 1972 } 1973 } 1974 1975 /* 1976 * If the freelist is still empty and the caller is 1977 * willing to be blocked, sleep on the wait channel 1978 * until an element is available. Otherwise, if 1979 * MCR_TRYHARD is set, do our best to satisfy the 1980 * request without having to go to sleep. 1981 */ 1982 if (mbuf_worker_ready && 1983 mbuf_sleep(class, need, wait)) 1984 break; 1985 1986 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 1987 } 1988 } 1989 1990 m_alloc_cnt(class) += num - need; 1991 lck_mtx_unlock(mbuf_mlock); 1992 1993 return (num - need); 1994} 1995 1996/* 1997 * Common de-allocator for rudimentary objects called by the CPU cache 1998 * layer when one or more elements need to be returned to the appropriate 1999 * global freelist. 2000 */ 2001static void 2002mbuf_slab_free(void *arg, mcache_obj_t *list, __unused int purged) 2003{ 2004 mbuf_class_t class = (mbuf_class_t)arg; 2005 mcache_obj_t *nlist; 2006 unsigned int num = 0; 2007 int w; 2008 2009 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); 2010 2011 lck_mtx_lock(mbuf_mlock); 2012 2013 for (;;) { 2014 nlist = list->obj_next; 2015 list->obj_next = NULL; 2016 slab_free(class, list); 2017 ++num; 2018 if ((list = nlist) == NULL) 2019 break; 2020 } 2021 m_free_cnt(class) += num; 2022 2023 if ((w = mb_waiters) > 0) 2024 mb_waiters = 0; 2025 2026 lck_mtx_unlock(mbuf_mlock); 2027 2028 if (w != 0) 2029 wakeup(mb_waitchan); 2030} 2031 2032/* 2033 * Common auditor for rudimentary objects called by the CPU cache layer 2034 * during an allocation or free request. For the former, this is called 2035 * after the objects are obtained from either the bucket or slab layer 2036 * and before they are returned to the caller. For the latter, this is 2037 * called immediately during free and before placing the objects into 2038 * the bucket or slab layer. 
2039 */ 2040static void 2041mbuf_slab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) 2042{ 2043 mbuf_class_t class = (mbuf_class_t)arg; 2044 mcache_audit_t *mca; 2045 2046 ASSERT(MBUF_CLASS_VALID(class) && !MBUF_CLASS_COMPOSITE(class)); 2047 2048 while (list != NULL) { 2049 lck_mtx_lock(mbuf_mlock); 2050 mca = mcl_audit_buf2mca(class, list); 2051 2052 /* Do the sanity checks */ 2053 if (class == MC_MBUF) { 2054 mcl_audit_mbuf(mca, list, FALSE, alloc); 2055 ASSERT(mca->mca_uflags & MB_SCVALID); 2056 } else { 2057 mcl_audit_cluster(mca, list, m_maxsize(class), 2058 alloc, TRUE); 2059 ASSERT(!(mca->mca_uflags & MB_SCVALID)); 2060 } 2061 /* Record this transaction */ 2062 if (mcltrace) 2063 mcache_buffer_log(mca, list, m_cache(class)); 2064 2065 if (alloc) 2066 mca->mca_uflags |= MB_INUSE; 2067 else 2068 mca->mca_uflags &= ~MB_INUSE; 2069 /* Unpair the object (unconditionally) */ 2070 mca->mca_uptr = NULL; 2071 lck_mtx_unlock(mbuf_mlock); 2072 2073 list = list->obj_next; 2074 } 2075} 2076 2077/* 2078 * Common notify routine for all caches. It is called by mcache when 2079 * one or more objects get freed. We use this indication to trigger 2080 * the wakeup of any sleeping threads so that they can retry their 2081 * allocation requests. 2082 */ 2083static void 2084mbuf_slab_notify(void *arg, u_int32_t reason) 2085{ 2086 mbuf_class_t class = (mbuf_class_t)arg; 2087 int w; 2088 2089 ASSERT(MBUF_CLASS_VALID(class)); 2090 2091 if (reason != MCN_RETRYALLOC) 2092 return; 2093 2094 lck_mtx_lock(mbuf_mlock); 2095 if ((w = mb_waiters) > 0) { 2096 m_notified(class)++; 2097 mb_waiters = 0; 2098 } 2099 lck_mtx_unlock(mbuf_mlock); 2100 2101 if (w != 0) 2102 wakeup(mb_waitchan); 2103} 2104 2105/* 2106 * Obtain object(s) from the composite class's freelist. 2107 */ 2108static unsigned int 2109cslab_alloc(mbuf_class_t class, mcache_obj_t ***plist, unsigned int num) 2110{ 2111 unsigned int need = num; 2112 mcl_slab_t *sp, *clsp, *nsp; 2113 struct mbuf *m; 2114 mcache_obj_t **list = *plist; 2115 void *cl; 2116 2117 VERIFY(need > 0); 2118 VERIFY(class != MC_MBUF_16KCL || njcl > 0); 2119 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2120 2121 /* Get what we can from the freelist */ 2122 while ((*list = m_cobjlist(class)) != NULL) { 2123 MRANGE(*list); 2124 2125 m = (struct mbuf *)*list; 2126 sp = slab_get(m); 2127 cl = m->m_ext.ext_buf; 2128 clsp = slab_get(cl); 2129 VERIFY(m->m_flags == M_EXT && cl != NULL); 2130 VERIFY(MEXT_RFA(m) != NULL && MBUF_IS_COMPOSITE(m)); 2131 2132 if (class == MC_MBUF_CL) { 2133 VERIFY(clsp->sl_refcnt >= 1 && 2134 clsp->sl_refcnt <= NCLPBG); 2135 } else { 2136 VERIFY(clsp->sl_refcnt == 1); 2137 } 2138 2139 if (class == MC_MBUF_16KCL) { 2140 int k; 2141 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { 2142 nsp = nsp->sl_next; 2143 /* Next slab must already be present */ 2144 VERIFY(nsp != NULL); 2145 VERIFY(nsp->sl_refcnt == 1); 2146 } 2147 } 2148 2149 if ((m_cobjlist(class) = (*list)->obj_next) != NULL && 2150 !MBUF_IN_MAP(m_cobjlist(class))) { 2151 slab_nextptr_panic(sp, m_cobjlist(class)); 2152 /* NOTREACHED */ 2153 } 2154 (*list)->obj_next = NULL; 2155 list = *plist = &(*list)->obj_next; 2156 2157 if (--need == 0) 2158 break; 2159 } 2160 m_infree(class) -= (num - need); 2161 2162 return (num - need); 2163} 2164 2165/* 2166 * Place object(s) back into a composite class's freelist. 
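 *
 * A sketch of the two modes, as implemented below: with 'purged' zero the
 * constructed mbuf + cluster objects are simply chained back onto
 * m_cobjlist(class); with 'purged' non-zero each object is torn apart and
 * its pieces go back to the underlying caches, e.g. for MC_MBUF_CL:
 *
 *	slab_free(MC_MBUF, o);			the mbuf itself
 *	slab_free(MC_CL, cl);			the attached 2KB cluster
 *	mcache_free_ext(ref_cache, ref_list);	the ext_ref structures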
2167 */ 2168static unsigned int 2169cslab_free(mbuf_class_t class, mcache_obj_t *list, int purged) 2170{ 2171 mcache_obj_t *o, *tail; 2172 unsigned int num = 0; 2173 struct mbuf *m, *ms; 2174 mcache_audit_t *mca = NULL; 2175 mcache_obj_t *ref_list = NULL; 2176 mcl_slab_t *clsp, *nsp; 2177 void *cl; 2178 mbuf_class_t cl_class; 2179 2180 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); 2181 VERIFY(class != MC_MBUF_16KCL || njcl > 0); 2182 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2183 2184 if (class == MC_MBUF_CL) { 2185 cl_class = MC_CL; 2186 } else if (class == MC_MBUF_BIGCL) { 2187 cl_class = MC_BIGCL; 2188 } else { 2189 VERIFY(class == MC_MBUF_16KCL); 2190 cl_class = MC_16KCL; 2191 } 2192 2193 o = tail = list; 2194 2195 while ((m = ms = (struct mbuf *)o) != NULL) { 2196 mcache_obj_t *rfa, *nexto = o->obj_next; 2197 2198 /* Do the mbuf sanity checks */ 2199 if (mclaudit != NULL) { 2200 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); 2201 if (mclverify) { 2202 mcache_audit_free_verify(mca, m, 0, 2203 m_maxsize(MC_MBUF)); 2204 } 2205 ms = (struct mbuf *)mca->mca_contents; 2206 } 2207 2208 /* Do the cluster sanity checks */ 2209 cl = ms->m_ext.ext_buf; 2210 clsp = slab_get(cl); 2211 if (mclverify) { 2212 size_t size = m_maxsize(cl_class); 2213 mcache_audit_free_verify(mcl_audit_buf2mca(cl_class, 2214 (mcache_obj_t *)cl), cl, 0, size); 2215 } 2216 VERIFY(ms->m_type == MT_FREE); 2217 VERIFY(ms->m_flags == M_EXT); 2218 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); 2219 if (cl_class == MC_CL) { 2220 VERIFY(clsp->sl_refcnt >= 1 && 2221 clsp->sl_refcnt <= NCLPBG); 2222 } else { 2223 VERIFY(clsp->sl_refcnt == 1); 2224 } 2225 if (cl_class == MC_16KCL) { 2226 int k; 2227 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { 2228 nsp = nsp->sl_next; 2229 /* Next slab must already be present */ 2230 VERIFY(nsp != NULL); 2231 VERIFY(nsp->sl_refcnt == 1); 2232 } 2233 } 2234 2235 /* 2236 * If we're asked to purge, restore the actual mbuf using 2237 * contents of the shadow structure (if auditing is enabled) 2238 * and clear EXTF_COMPOSITE flag from the mbuf, as we are 2239 * about to free it and the attached cluster into their caches. 
2240 */ 2241 if (purged) { 2242 /* Restore constructed mbuf fields */ 2243 if (mclaudit != NULL) 2244 mcl_audit_restore_mbuf(m, mca, TRUE); 2245 2246 MEXT_REF(m) = 0; 2247 MEXT_FLAGS(m) = 0; 2248 2249 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m); 2250 rfa->obj_next = ref_list; 2251 ref_list = rfa; 2252 MEXT_RFA(m) = NULL; 2253 2254 m->m_type = MT_FREE; 2255 m->m_flags = m->m_len = 0; 2256 m->m_next = m->m_nextpkt = NULL; 2257 2258 /* Save mbuf fields and make auditing happy */ 2259 if (mclaudit != NULL) 2260 mcl_audit_mbuf(mca, o, FALSE, FALSE); 2261 2262 VERIFY(m_total(class) > 0); 2263 m_total(class)--; 2264 2265 /* Free the mbuf */ 2266 o->obj_next = NULL; 2267 slab_free(MC_MBUF, o); 2268 2269 /* And free the cluster */ 2270 ((mcache_obj_t *)cl)->obj_next = NULL; 2271 if (class == MC_MBUF_CL) 2272 slab_free(MC_CL, cl); 2273 else if (class == MC_MBUF_BIGCL) 2274 slab_free(MC_BIGCL, cl); 2275 else 2276 slab_free(MC_16KCL, cl); 2277 } 2278 2279 ++num; 2280 tail = o; 2281 o = nexto; 2282 } 2283 2284 if (!purged) { 2285 tail->obj_next = m_cobjlist(class); 2286 m_cobjlist(class) = list; 2287 m_infree(class) += num; 2288 } else if (ref_list != NULL) { 2289 mcache_free_ext(ref_cache, ref_list); 2290 } 2291 2292 return (num); 2293} 2294 2295/* 2296 * Common allocator for composite objects called by the CPU cache layer 2297 * during an allocation request whenever there is no available element in 2298 * the bucket layer. It returns one or more composite elements from the 2299 * appropriate global freelist. If the freelist is empty, it will attempt 2300 * to obtain the rudimentary objects from their caches and construct them 2301 * into composite mbuf + cluster objects. 2302 */ 2303static unsigned int 2304mbuf_cslab_alloc(void *arg, mcache_obj_t ***plist, unsigned int needed, 2305 int wait) 2306{ 2307 mbuf_class_t class = (mbuf_class_t)arg; 2308 mbuf_class_t cl_class = 0; 2309 unsigned int num = 0, cnum = 0, want = needed; 2310 mcache_obj_t *ref_list = NULL; 2311 mcache_obj_t *mp_list = NULL; 2312 mcache_obj_t *clp_list = NULL; 2313 mcache_obj_t **list; 2314 struct ext_ref *rfa; 2315 struct mbuf *m; 2316 void *cl; 2317 2318 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); 2319 ASSERT(needed > 0); 2320 2321 VERIFY(class != MC_MBUF_16KCL || njcl > 0); 2322 2323 /* There should not be any slab for this class */ 2324 VERIFY(m_slab_cnt(class) == 0 && 2325 m_slablist(class).tqh_first == NULL && 2326 m_slablist(class).tqh_last == NULL); 2327 2328 lck_mtx_lock(mbuf_mlock); 2329 2330 /* Try using the freelist first */ 2331 num = cslab_alloc(class, plist, needed); 2332 list = *plist; 2333 if (num == needed) { 2334 m_alloc_cnt(class) += num; 2335 lck_mtx_unlock(mbuf_mlock); 2336 return (needed); 2337 } 2338 2339 lck_mtx_unlock(mbuf_mlock); 2340 2341 /* 2342 * We could not satisfy the request using the freelist alone; 2343 * allocate from the appropriate rudimentary caches and use 2344 * whatever we can get to construct the composite objects. 2345 */ 2346 needed -= num; 2347 2348 /* 2349 * Mark these allocation requests as coming from a composite cache. 2350 * Also, if the caller is willing to be blocked, mark the request 2351 * with MCR_FAILOK such that we don't end up sleeping at the mbuf 2352 * slab layer waiting for the individual object when one or more 2353 * of the already-constructed composite objects are available. 
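	 *
	 * For instance (an illustrative reading of the flags): a blocking
	 * MC_MBUF_CL request reaches the MC_MBUF and MC_CL slab layers as
	 * (wait | MCR_COMP | MCR_FAILOK), so a shortage there fails quickly
	 * and this thread can fall back to composites already sitting in
	 * the bucket layer rather than sleeping on the raw classes.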
2354 */ 2355 wait |= MCR_COMP; 2356 if (!(wait & MCR_NOSLEEP)) 2357 wait |= MCR_FAILOK; 2358 2359 /* allocate mbufs */ 2360 needed = mcache_alloc_ext(m_cache(MC_MBUF), &mp_list, needed, wait); 2361 if (needed == 0) { 2362 ASSERT(mp_list == NULL); 2363 goto fail; 2364 } 2365 2366 /* allocate clusters */ 2367 if (class == MC_MBUF_CL) { 2368 cl_class = MC_CL; 2369 } else if (class == MC_MBUF_BIGCL) { 2370 cl_class = MC_BIGCL; 2371 } else { 2372 VERIFY(class == MC_MBUF_16KCL); 2373 cl_class = MC_16KCL; 2374 } 2375 needed = mcache_alloc_ext(m_cache(cl_class), &clp_list, needed, wait); 2376 if (needed == 0) { 2377 ASSERT(clp_list == NULL); 2378 goto fail; 2379 } 2380 2381 needed = mcache_alloc_ext(ref_cache, &ref_list, needed, wait); 2382 if (needed == 0) { 2383 ASSERT(ref_list == NULL); 2384 goto fail; 2385 } 2386 2387 /* 2388 * By this time "needed" is MIN(mbuf, cluster, ref). Any left 2389 * overs will get freed accordingly before we return to caller. 2390 */ 2391 for (cnum = 0; cnum < needed; cnum++) { 2392 struct mbuf *ms; 2393 2394 m = ms = (struct mbuf *)mp_list; 2395 mp_list = mp_list->obj_next; 2396 2397 cl = clp_list; 2398 clp_list = clp_list->obj_next; 2399 ((mcache_obj_t *)cl)->obj_next = NULL; 2400 2401 rfa = (struct ext_ref *)ref_list; 2402 ref_list = ref_list->obj_next; 2403 ((mcache_obj_t *)(void *)rfa)->obj_next = NULL; 2404 2405 /* 2406 * If auditing is enabled, construct the shadow mbuf 2407 * in the audit structure instead of in the actual one. 2408 * mbuf_cslab_audit() will take care of restoring the 2409 * contents after the integrity check. 2410 */ 2411 if (mclaudit != NULL) { 2412 mcache_audit_t *mca, *cl_mca; 2413 2414 lck_mtx_lock(mbuf_mlock); 2415 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); 2416 ms = ((struct mbuf *)mca->mca_contents); 2417 cl_mca = mcl_audit_buf2mca(MC_CL, (mcache_obj_t *)cl); 2418 2419 /* 2420 * Pair them up. Note that this is done at the time 2421 * the mbuf+cluster objects are constructed. This 2422 * information should be treated as "best effort" 2423 * debugging hint since more than one mbufs can refer 2424 * to a cluster. In that case, the cluster might not 2425 * be freed along with the mbuf it was paired with. 2426 */ 2427 mca->mca_uptr = cl_mca; 2428 cl_mca->mca_uptr = mca; 2429 2430 ASSERT(mca->mca_uflags & MB_SCVALID); 2431 ASSERT(!(cl_mca->mca_uflags & MB_SCVALID)); 2432 lck_mtx_unlock(mbuf_mlock); 2433 2434 /* Technically, they are in the freelist */ 2435 if (mclverify) { 2436 size_t size; 2437 2438 mcache_set_pattern(MCACHE_FREE_PATTERN, m, 2439 m_maxsize(MC_MBUF)); 2440 2441 if (class == MC_MBUF_CL) 2442 size = m_maxsize(MC_CL); 2443 else if (class == MC_MBUF_BIGCL) 2444 size = m_maxsize(MC_BIGCL); 2445 else 2446 size = m_maxsize(MC_16KCL); 2447 2448 mcache_set_pattern(MCACHE_FREE_PATTERN, cl, 2449 size); 2450 } 2451 } 2452 2453 MBUF_INIT(ms, 0, MT_FREE); 2454 if (class == MC_MBUF_16KCL) { 2455 MBUF_16KCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); 2456 } else if (class == MC_MBUF_BIGCL) { 2457 MBUF_BIGCL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); 2458 } else { 2459 MBUF_CL_INIT(ms, cl, rfa, 0, EXTF_COMPOSITE); 2460 } 2461 VERIFY(ms->m_flags == M_EXT); 2462 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); 2463 2464 *list = (mcache_obj_t *)m; 2465 (*list)->obj_next = NULL; 2466 list = *plist = &(*list)->obj_next; 2467 } 2468 2469fail: 2470 /* 2471 * Free up what's left of the above. 
2472 */ 2473 if (mp_list != NULL) 2474 mcache_free_ext(m_cache(MC_MBUF), mp_list); 2475 if (clp_list != NULL) 2476 mcache_free_ext(m_cache(cl_class), clp_list); 2477 if (ref_list != NULL) 2478 mcache_free_ext(ref_cache, ref_list); 2479 2480 lck_mtx_lock(mbuf_mlock); 2481 if (num > 0 || cnum > 0) { 2482 m_total(class) += cnum; 2483 VERIFY(m_total(class) <= m_maxlimit(class)); 2484 m_alloc_cnt(class) += num + cnum; 2485 } 2486 if ((num + cnum) < want) 2487 m_fail_cnt(class) += (want - (num + cnum)); 2488 lck_mtx_unlock(mbuf_mlock); 2489 2490 return (num + cnum); 2491} 2492 2493/* 2494 * Common de-allocator for composite objects called by the CPU cache 2495 * layer when one or more elements need to be returned to the appropriate 2496 * global freelist. 2497 */ 2498static void 2499mbuf_cslab_free(void *arg, mcache_obj_t *list, int purged) 2500{ 2501 mbuf_class_t class = (mbuf_class_t)arg; 2502 unsigned int num; 2503 int w; 2504 2505 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); 2506 2507 lck_mtx_lock(mbuf_mlock); 2508 2509 num = cslab_free(class, list, purged); 2510 m_free_cnt(class) += num; 2511 2512 if ((w = mb_waiters) > 0) 2513 mb_waiters = 0; 2514 2515 lck_mtx_unlock(mbuf_mlock); 2516 2517 if (w != 0) 2518 wakeup(mb_waitchan); 2519} 2520 2521/* 2522 * Common auditor for composite objects called by the CPU cache layer 2523 * during an allocation or free request. For the former, this is called 2524 * after the objects are obtained from either the bucket or slab layer 2525 * and before they are returned to the caller. For the latter, this is 2526 * called immediately during free and before placing the objects into 2527 * the bucket or slab layer. 2528 */ 2529static void 2530mbuf_cslab_audit(void *arg, mcache_obj_t *list, boolean_t alloc) 2531{ 2532 mbuf_class_t class = (mbuf_class_t)arg; 2533 mcache_audit_t *mca; 2534 struct mbuf *m, *ms; 2535 mcl_slab_t *clsp, *nsp; 2536 size_t size; 2537 void *cl; 2538 2539 ASSERT(MBUF_CLASS_VALID(class) && MBUF_CLASS_COMPOSITE(class)); 2540 2541 while ((m = ms = (struct mbuf *)list) != NULL) { 2542 lck_mtx_lock(mbuf_mlock); 2543 /* Do the mbuf sanity checks and record its transaction */ 2544 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); 2545 mcl_audit_mbuf(mca, m, TRUE, alloc); 2546 if (mcltrace) 2547 mcache_buffer_log(mca, m, m_cache(class)); 2548 2549 if (alloc) 2550 mca->mca_uflags |= MB_COMP_INUSE; 2551 else 2552 mca->mca_uflags &= ~MB_COMP_INUSE; 2553 2554 /* 2555 * Use the shadow mbuf in the audit structure if we are 2556 * freeing, since the contents of the actual mbuf has been 2557 * pattern-filled by the above call to mcl_audit_mbuf(). 
2558 */ 2559 if (!alloc && mclverify) 2560 ms = (struct mbuf *)mca->mca_contents; 2561 2562 /* Do the cluster sanity checks and record its transaction */ 2563 cl = ms->m_ext.ext_buf; 2564 clsp = slab_get(cl); 2565 VERIFY(ms->m_flags == M_EXT && cl != NULL); 2566 VERIFY(MEXT_RFA(ms) != NULL && MBUF_IS_COMPOSITE(ms)); 2567 if (class == MC_MBUF_CL) 2568 VERIFY(clsp->sl_refcnt >= 1 && 2569 clsp->sl_refcnt <= NCLPBG); 2570 else 2571 VERIFY(clsp->sl_refcnt == 1); 2572 2573 if (class == MC_MBUF_16KCL) { 2574 int k; 2575 for (nsp = clsp, k = 1; k < NSLABSP16KB; k++) { 2576 nsp = nsp->sl_next; 2577 /* Next slab must already be present */ 2578 VERIFY(nsp != NULL); 2579 VERIFY(nsp->sl_refcnt == 1); 2580 } 2581 } 2582 2583 mca = mcl_audit_buf2mca(MC_CL, cl); 2584 if (class == MC_MBUF_CL) 2585 size = m_maxsize(MC_CL); 2586 else if (class == MC_MBUF_BIGCL) 2587 size = m_maxsize(MC_BIGCL); 2588 else 2589 size = m_maxsize(MC_16KCL); 2590 mcl_audit_cluster(mca, cl, size, alloc, FALSE); 2591 if (mcltrace) 2592 mcache_buffer_log(mca, cl, m_cache(class)); 2593 2594 if (alloc) 2595 mca->mca_uflags |= MB_COMP_INUSE; 2596 else 2597 mca->mca_uflags &= ~MB_COMP_INUSE; 2598 lck_mtx_unlock(mbuf_mlock); 2599 2600 list = list->obj_next; 2601 } 2602} 2603 2604/* 2605 * Allocate some number of mbuf clusters and place on cluster freelist. 2606 */ 2607static int 2608m_clalloc(const u_int32_t num, const int wait, const u_int32_t bufsize) 2609{ 2610 int i; 2611 vm_size_t size = 0; 2612 int numpages = 0, large_buffer = (bufsize == m_maxsize(MC_16KCL)); 2613 vm_offset_t page = 0; 2614 mcache_audit_t *mca_list = NULL; 2615 mcache_obj_t *con_list = NULL; 2616 mcl_slab_t *sp; 2617 2618 VERIFY(bufsize == m_maxsize(MC_BIGCL) || 2619 bufsize == m_maxsize(MC_16KCL)); 2620 2621 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2622 2623 /* 2624 * Multiple threads may attempt to populate the cluster map one 2625 * after another. Since we drop the lock below prior to acquiring 2626 * the physical page(s), our view of the cluster map may no longer 2627 * be accurate, and we could end up over-committing the pages beyond 2628 * the maximum allowed for each class. To prevent it, this entire 2629 * operation (including the page mapping) is serialized. 2630 */ 2631 while (mb_clalloc_busy) { 2632 mb_clalloc_waiters++; 2633 (void) msleep(mb_clalloc_waitchan, mbuf_mlock, 2634 (PZERO-1), "m_clalloc", NULL); 2635 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2636 } 2637 2638 /* We are busy now; tell everyone else to go away */ 2639 mb_clalloc_busy = TRUE; 2640 2641 /* 2642 * Honor the caller's wish to block or not block. We have a way 2643 * to grow the pool asynchronously using the mbuf worker thread. 2644 */ 2645 i = m_howmany(num, bufsize); 2646 if (i == 0 || (wait & M_DONTWAIT)) 2647 goto out; 2648 2649 lck_mtx_unlock(mbuf_mlock); 2650 2651 size = round_page(i * bufsize); 2652 page = kmem_mb_alloc(mb_map, size, large_buffer); 2653 2654 /* 2655 * If we did ask for "n" 16KB physically contiguous chunks 2656 * and didn't get them, then please try again without this 2657 * restriction. 
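	 *
	 * In effect the fallback ladder below is (a summary, not new logic):
	 *	1. 16KB requests retry kmem_mb_alloc() without the physically
	 *	   contiguous restriction;
	 *	2. failed 4KB requests retry once more for a single NBPG page;
	 *	3. anything still unsatisfied reacquires the lock and bails
	 *	   out through the "out" label.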
2658 */ 2659 if (large_buffer && page == 0) 2660 page = kmem_mb_alloc(mb_map, size, 0); 2661 2662 if (page == 0) { 2663 if (bufsize == m_maxsize(MC_BIGCL)) { 2664 /* Try for 1 page if failed, only 4KB request */ 2665 size = NBPG; 2666 page = kmem_mb_alloc(mb_map, size, 0); 2667 } 2668 2669 if (page == 0) { 2670 lck_mtx_lock(mbuf_mlock); 2671 goto out; 2672 } 2673 } 2674 2675 VERIFY(IS_P2ALIGNED(page, NBPG)); 2676 numpages = size / NBPG; 2677 2678 /* If auditing is enabled, allocate the audit structures now */ 2679 if (mclaudit != NULL) { 2680 int needed; 2681 2682 /* 2683 * Yes, I realize this is a waste of memory for clusters 2684 * that never get transformed into mbufs, as we may end 2685 * up with NMBPBG-1 unused audit structures per cluster. 2686 * But doing so tremendously simplifies the allocation 2687 * strategy, since at this point we are not holding the 2688 * mbuf lock and the caller is okay to be blocked. 2689 */ 2690 if (bufsize == m_maxsize(MC_BIGCL)) { 2691 needed = numpages * NMBPBG; 2692 2693 i = mcache_alloc_ext(mcl_audit_con_cache, 2694 &con_list, needed, MCR_SLEEP); 2695 2696 VERIFY(con_list != NULL && i == needed); 2697 } else { 2698 needed = numpages / NSLABSP16KB; 2699 } 2700 2701 i = mcache_alloc_ext(mcache_audit_cache, 2702 (mcache_obj_t **)&mca_list, needed, MCR_SLEEP); 2703 2704 VERIFY(mca_list != NULL && i == needed); 2705 } 2706 2707 lck_mtx_lock(mbuf_mlock); 2708 2709 for (i = 0; i < numpages; i++, page += NBPG) { 2710 ppnum_t offset = ((char *)page - (char *)mbutl) / NBPG; 2711 ppnum_t new_page = pmap_find_phys(kernel_pmap, page); 2712 2713 /* 2714 * In the case of no mapper being available the following 2715 * code noops and returns the input page; if there is a 2716 * mapper the appropriate I/O page is returned. 2717 */ 2718 VERIFY(offset < mcl_pages); 2719 if (mcl_paddr_base) { 2720 bzero((void *)(uintptr_t) page, page_size); 2721 new_page = IOMapperInsertPage(mcl_paddr_base, offset, new_page); 2722 } 2723 mcl_paddr[offset] = new_page << PGSHIFT; 2724 2725 /* Pattern-fill this fresh page */ 2726 if (mclverify) { 2727 mcache_set_pattern(MCACHE_FREE_PATTERN, 2728 (caddr_t)page, NBPG); 2729 } 2730 if (bufsize == m_maxsize(MC_BIGCL)) { 2731 union mbigcluster *mbc = (union mbigcluster *)page; 2732 2733 /* One for the entire page */ 2734 sp = slab_get(mbc); 2735 if (mclaudit != NULL) { 2736 mcl_audit_init(mbc, &mca_list, &con_list, 2737 AUDIT_CONTENTS_SIZE, NMBPBG); 2738 } 2739 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); 2740 slab_init(sp, MC_BIGCL, SLF_MAPPED, 2741 mbc, mbc, bufsize, 0, 1); 2742 2743 /* Insert this slab */ 2744 slab_insert(sp, MC_BIGCL); 2745 2746 /* Update stats now since slab_get() drops the lock */ 2747 mbstat.m_bigclfree = ++m_infree(MC_BIGCL) + 2748 m_infree(MC_MBUF_BIGCL); 2749 mbstat.m_bigclusters = ++m_total(MC_BIGCL); 2750 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); 2751 } else if ((i % NSLABSP16KB) == 0) { 2752 union m16kcluster *m16kcl = (union m16kcluster *)page; 2753 mcl_slab_t *nsp; 2754 int k; 2755 2756 VERIFY(njcl > 0); 2757 /* One for the entire 16KB */ 2758 sp = slab_get(m16kcl); 2759 if (mclaudit != NULL) 2760 mcl_audit_init(m16kcl, &mca_list, NULL, 0, 1); 2761 2762 VERIFY(sp->sl_refcnt == 0 && sp->sl_flags == 0); 2763 slab_init(sp, MC_16KCL, SLF_MAPPED, 2764 m16kcl, m16kcl, bufsize, 0, 1); 2765 2766 /* 2767 * 2nd-Nth page's slab is part of the first one, 2768 * where N is NSLABSP16KB. 
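			 *
			 * With 4KB pages that means four mcl_slab_t entries
			 * per 16KB cluster; an illustrative view of the
			 * result (the first entry was initialized above, the
			 * loop below fills in the rest):
			 *
			 *	slab 0:    MC_16KCL, SLF_MAPPED, 16KB, 1 chunk
			 *	slab 1..3: MC_16KCL, SLF_MAPPED | SLF_PARTIAL,
			 *	           zero length, zero chunks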
2769 */ 2770 for (k = 1; k < NSLABSP16KB; k++) { 2771 nsp = slab_get(((union mbigcluster *)page) + k); 2772 VERIFY(nsp->sl_refcnt == 0 && 2773 nsp->sl_flags == 0); 2774 slab_init(nsp, MC_16KCL, 2775 SLF_MAPPED | SLF_PARTIAL, 2776 m16kcl, NULL, 0, 0, 0); 2777 } 2778 2779 /* Insert this slab */ 2780 slab_insert(sp, MC_16KCL); 2781 2782 /* Update stats now since slab_get() drops the lock */ 2783 m_infree(MC_16KCL)++; 2784 m_total(MC_16KCL)++; 2785 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); 2786 } 2787 } 2788 VERIFY(mca_list == NULL && con_list == NULL); 2789 2790 /* We're done; let others enter */ 2791 mb_clalloc_busy = FALSE; 2792 if (mb_clalloc_waiters > 0) { 2793 mb_clalloc_waiters = 0; 2794 wakeup(mb_clalloc_waitchan); 2795 } 2796 2797 if (bufsize == m_maxsize(MC_BIGCL)) 2798 return (numpages); 2799 2800 VERIFY(bufsize == m_maxsize(MC_16KCL)); 2801 return (numpages / NSLABSP16KB); 2802 2803out: 2804 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2805 2806 /* We're done; let others enter */ 2807 mb_clalloc_busy = FALSE; 2808 if (mb_clalloc_waiters > 0) { 2809 mb_clalloc_waiters = 0; 2810 wakeup(mb_clalloc_waitchan); 2811 } 2812 2813 /* 2814 * When non-blocking we kick a thread if we have to grow the 2815 * pool or if the number of free clusters is less than requested. 2816 */ 2817 if (bufsize == m_maxsize(MC_BIGCL)) { 2818 if (i > 0) { 2819 /* 2820 * Remember total number of 4KB clusters needed 2821 * at this time. 2822 */ 2823 i += m_total(MC_BIGCL); 2824 if (i > mbuf_expand_big) { 2825 mbuf_expand_big = i; 2826 if (mbuf_worker_ready) 2827 wakeup((caddr_t)&mbuf_worker_run); 2828 } 2829 } 2830 2831 if (m_infree(MC_BIGCL) >= num) 2832 return (1); 2833 } else { 2834 if (i > 0) { 2835 /* 2836 * Remember total number of 16KB clusters needed 2837 * at this time. 2838 */ 2839 i += m_total(MC_16KCL); 2840 if (i > mbuf_expand_16k) { 2841 mbuf_expand_16k = i; 2842 if (mbuf_worker_ready) 2843 wakeup((caddr_t)&mbuf_worker_run); 2844 } 2845 } 2846 2847 if (m_infree(MC_16KCL) >= num) 2848 return (1); 2849 } 2850 return (0); 2851} 2852 2853/* 2854 * Populate the global freelist of the corresponding buffer class. 2855 */ 2856static int 2857freelist_populate(mbuf_class_t class, unsigned int num, int wait) 2858{ 2859 mcache_obj_t *o = NULL; 2860 int i, numpages = 0, count; 2861 2862 VERIFY(class == MC_MBUF || class == MC_CL || class == MC_BIGCL || 2863 class == MC_16KCL); 2864 2865 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2866 2867 switch (class) { 2868 case MC_MBUF: 2869 case MC_CL: 2870 case MC_BIGCL: 2871 numpages = (num * m_size(class) + NBPG - 1) / NBPG; 2872 i = m_clalloc(numpages, wait, m_maxsize(MC_BIGCL)); 2873 2874 /* Respect the 4KB clusters minimum limit */ 2875 if (m_total(MC_BIGCL) == m_maxlimit(MC_BIGCL) && 2876 m_infree(MC_BIGCL) <= m_minlimit(MC_BIGCL)) { 2877 if (class != MC_BIGCL || (wait & MCR_COMP)) 2878 return (0); 2879 } 2880 if (class == MC_BIGCL) 2881 return (i != 0); 2882 break; 2883 2884 case MC_16KCL: 2885 return (m_clalloc(num, wait, m_maxsize(class)) != 0); 2886 /* NOTREACHED */ 2887 2888 default: 2889 VERIFY(0); 2890 /* NOTREACHED */ 2891 } 2892 2893 VERIFY(class == MC_MBUF || class == MC_CL); 2894 2895 /* how many objects will we cut the page into? */ 2896 int numobj = (class == MC_MBUF ? 
NMBPBG : NCLPBG); 2897 2898 for (count = 0; count < numpages; count++) { 2899 2900 /* respect totals, minlimit, maxlimit */ 2901 if (m_total(MC_BIGCL) <= m_minlimit(MC_BIGCL) || 2902 m_total(class) >= m_maxlimit(class)) 2903 break; 2904 2905 if ((o = slab_alloc(MC_BIGCL, wait)) == NULL) 2906 break; 2907 2908 struct mbuf *m = (struct mbuf *)o; 2909 union mcluster *c = (union mcluster *)o; 2910 mcl_slab_t *sp = slab_get(o); 2911 mcache_audit_t *mca = NULL; 2912 2913 VERIFY(slab_is_detached(sp) && 2914 (sp->sl_flags & (SLF_MAPPED | SLF_PARTIAL)) == SLF_MAPPED); 2915 2916 /* 2917 * Make sure that the cluster is unmolested 2918 * while in freelist 2919 */ 2920 if (mclverify) { 2921 mca = mcl_audit_buf2mca(MC_BIGCL, o); 2922 mcache_audit_free_verify(mca, o, 0, 2923 m_maxsize(MC_BIGCL)); 2924 } 2925 2926 /* Reinitialize it as an mbuf or 2K slab */ 2927 slab_init(sp, class, sp->sl_flags, 2928 sp->sl_base, NULL, sp->sl_len, 0, numobj); 2929 2930 VERIFY(o == (mcache_obj_t *)sp->sl_base); 2931 VERIFY(sp->sl_head == NULL); 2932 2933 VERIFY(m_total(MC_BIGCL) > 0); 2934 m_total(MC_BIGCL)--; 2935 mbstat.m_bigclusters = m_total(MC_BIGCL); 2936 2937 m_total(class) += numobj; 2938 m_infree(class) += numobj; 2939 2940 VERIFY(m_total(MC_BIGCL) >= m_minlimit(MC_BIGCL)); 2941 VERIFY(m_total(class) <= m_maxlimit(class)); 2942 2943 i = numobj; 2944 if (class == MC_MBUF) { 2945 mbstat.m_mbufs = m_total(MC_MBUF); 2946 mtype_stat_add(MT_FREE, NMBPBG); 2947 while (i--) { 2948 /* 2949 * If auditing is enabled, construct the 2950 * shadow mbuf in the audit structure 2951 * instead of the actual one. 2952 * mbuf_slab_audit() will take care of 2953 * restoring the contents after the 2954 * integrity check. 2955 */ 2956 if (mclaudit != NULL) { 2957 struct mbuf *ms; 2958 mca = mcl_audit_buf2mca(MC_MBUF, 2959 (mcache_obj_t *)m); 2960 ms = ((struct mbuf *) 2961 mca->mca_contents); 2962 ms->m_type = MT_FREE; 2963 } else { 2964 m->m_type = MT_FREE; 2965 } 2966 m->m_next = sp->sl_head; 2967 sp->sl_head = (void *)m++; 2968 } 2969 } else { /* MC_CL */ 2970 mbstat.m_clfree = 2971 m_infree(MC_CL) + m_infree(MC_MBUF_CL); 2972 mbstat.m_clusters = m_total(MC_CL); 2973 while (i--) { 2974 c->mcl_next = sp->sl_head; 2975 sp->sl_head = (void *)c++; 2976 } 2977 } 2978 2979 /* Insert into the mbuf or 2k slab list */ 2980 slab_insert(sp, class); 2981 2982 if ((i = mb_waiters) > 0) 2983 mb_waiters = 0; 2984 if (i != 0) 2985 wakeup(mb_waitchan); 2986 } 2987 return (count != 0); 2988} 2989 2990/* 2991 * For each class, initialize the freelist to hold m_minlimit() objects. 2992 */ 2993static void 2994freelist_init(mbuf_class_t class) 2995{ 2996 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 2997 2998 VERIFY(class == MC_CL || class == MC_BIGCL); 2999 VERIFY(m_total(class) == 0); 3000 VERIFY(m_minlimit(class) > 0); 3001 3002 while (m_total(class) < m_minlimit(class)) 3003 (void) freelist_populate(class, m_minlimit(class), M_WAIT); 3004 3005 VERIFY(m_total(class) >= m_minlimit(class)); 3006} 3007 3008/* 3009 * (Inaccurately) check if it might be worth a trip back to the 3010 * mcache layer due the availability of objects there. We'll 3011 * end up back here if there's nothing up there. 
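 *
 * For example, a raw MC_CL request tagged with MCR_COMP also treats a
 * non-empty MC_MBUF_CL bucket as "cached above", since those composites
 * can be broken back down into raw 2KB clusters (see mbuf_steal() and
 * cslab_free()).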
3012 */ 3013static boolean_t 3014mbuf_cached_above(mbuf_class_t class, int wait) 3015{ 3016 switch (class) { 3017 case MC_MBUF: 3018 if (wait & MCR_COMP) 3019 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL)) || 3020 !mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL))); 3021 break; 3022 3023 case MC_CL: 3024 if (wait & MCR_COMP) 3025 return (!mcache_bkt_isempty(m_cache(MC_MBUF_CL))); 3026 break; 3027 3028 case MC_BIGCL: 3029 if (wait & MCR_COMP) 3030 return (!mcache_bkt_isempty(m_cache(MC_MBUF_BIGCL))); 3031 break; 3032 3033 case MC_16KCL: 3034 if (wait & MCR_COMP) 3035 return (!mcache_bkt_isempty(m_cache(MC_MBUF_16KCL))); 3036 break; 3037 3038 case MC_MBUF_CL: 3039 case MC_MBUF_BIGCL: 3040 case MC_MBUF_16KCL: 3041 break; 3042 3043 default: 3044 VERIFY(0); 3045 /* NOTREACHED */ 3046 } 3047 3048 return (!mcache_bkt_isempty(m_cache(class))); 3049} 3050 3051/* 3052 * If possible, convert constructed objects to raw ones. 3053 */ 3054static boolean_t 3055mbuf_steal(mbuf_class_t class, unsigned int num) 3056{ 3057 mcache_obj_t *top = NULL; 3058 mcache_obj_t **list = ⊤ 3059 unsigned int tot = 0; 3060 3061 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 3062 3063 switch (class) { 3064 case MC_MBUF: 3065 case MC_CL: 3066 case MC_BIGCL: 3067 case MC_16KCL: 3068 return (FALSE); 3069 3070 case MC_MBUF_CL: 3071 case MC_MBUF_BIGCL: 3072 case MC_MBUF_16KCL: 3073 /* Get the required number of constructed objects if possible */ 3074 if (m_infree(class) > m_minlimit(class)) { 3075 tot = cslab_alloc(class, &list, 3076 MIN(num, m_infree(class))); 3077 } 3078 3079 /* And destroy them to get back the raw objects */ 3080 if (top != NULL) 3081 (void) cslab_free(class, top, 1); 3082 break; 3083 3084 default: 3085 VERIFY(0); 3086 /* NOTREACHED */ 3087 } 3088 3089 return (tot == num); 3090} 3091 3092static void 3093m_reclaim(mbuf_class_t class, unsigned int num, boolean_t comp) 3094{ 3095 int m, bmap = 0; 3096 3097 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 3098 3099 VERIFY(m_total(MC_CL) <= m_maxlimit(MC_CL)); 3100 VERIFY(m_total(MC_BIGCL) <= m_maxlimit(MC_BIGCL)); 3101 VERIFY(m_total(MC_16KCL) <= m_maxlimit(MC_16KCL)); 3102 3103 /* 3104 * This logic can be made smarter; for now, simply mark 3105 * all other related classes as potential victims. 3106 */ 3107 switch (class) { 3108 case MC_MBUF: 3109 m_wantpurge(MC_CL)++; 3110 m_wantpurge(MC_BIGCL)++; 3111 m_wantpurge(MC_MBUF_CL)++; 3112 m_wantpurge(MC_MBUF_BIGCL)++; 3113 break; 3114 3115 case MC_CL: 3116 m_wantpurge(MC_MBUF)++; 3117 m_wantpurge(MC_BIGCL)++; 3118 m_wantpurge(MC_MBUF_BIGCL)++; 3119 if (!comp) 3120 m_wantpurge(MC_MBUF_CL)++; 3121 break; 3122 3123 case MC_BIGCL: 3124 m_wantpurge(MC_MBUF)++; 3125 m_wantpurge(MC_CL)++; 3126 m_wantpurge(MC_MBUF_CL)++; 3127 if (!comp) 3128 m_wantpurge(MC_MBUF_BIGCL)++; 3129 break; 3130 3131 case MC_16KCL: 3132 if (!comp) 3133 m_wantpurge(MC_MBUF_16KCL)++; 3134 break; 3135 3136 default: 3137 VERIFY(0); 3138 /* NOTREACHED */ 3139 } 3140 3141 /* 3142 * Run through each marked class and check if we really need to 3143 * purge (and therefore temporarily disable) the per-CPU caches 3144 * layer used by the class. If so, remember the classes since 3145 * we are going to drop the lock below prior to purging. 3146 */ 3147 for (m = 0; m < NELEM(mbuf_table); m++) { 3148 if (m_wantpurge(m) > 0) { 3149 m_wantpurge(m) = 0; 3150 /* 3151 * Try hard to steal the required number of objects 3152 * from the freelist of other mbuf classes. 
Only 3153 * purge and disable the per-CPU caches layer when 3154 * we don't have enough; it's the last resort. 3155 */ 3156 if (!mbuf_steal(m, num)) 3157 bmap |= (1 << m); 3158 } 3159 } 3160 3161 lck_mtx_unlock(mbuf_mlock); 3162 3163 if (bmap != 0) { 3164 /* drain is performed in pfslowtimo(), to avoid deadlocks */ 3165 do_reclaim = 1; 3166 3167 /* Sigh; we have no other choices but to ask mcache to purge */ 3168 for (m = 0; m < NELEM(mbuf_table); m++) { 3169 if ((bmap & (1 << m)) && 3170 mcache_purge_cache(m_cache(m))) { 3171 lck_mtx_lock(mbuf_mlock); 3172 m_purge_cnt(m)++; 3173 mbstat.m_drain++; 3174 lck_mtx_unlock(mbuf_mlock); 3175 } 3176 } 3177 } else { 3178 /* 3179 * Request mcache to reap extra elements from all of its caches; 3180 * note that all reaps are serialized and happen only at a fixed 3181 * interval. 3182 */ 3183 mcache_reap(); 3184 } 3185 lck_mtx_lock(mbuf_mlock); 3186} 3187 3188static inline struct mbuf * 3189m_get_common(int wait, short type, int hdr) 3190{ 3191 struct mbuf *m; 3192 int mcflags = MSLEEPF(wait); 3193 3194 /* Is this due to a non-blocking retry? If so, then try harder */ 3195 if (mcflags & MCR_NOSLEEP) 3196 mcflags |= MCR_TRYHARD; 3197 3198 m = mcache_alloc(m_cache(MC_MBUF), mcflags); 3199 if (m != NULL) { 3200 MBUF_INIT(m, hdr, type); 3201 mtype_stat_inc(type); 3202 mtype_stat_dec(MT_FREE); 3203#if CONFIG_MACF_NET 3204 if (hdr && mac_init_mbuf(m, wait) != 0) { 3205 m_free(m); 3206 return (NULL); 3207 } 3208#endif /* MAC_NET */ 3209 } 3210 return (m); 3211} 3212 3213/* 3214 * Space allocation routines; these are also available as macros 3215 * for critical paths. 3216 */ 3217#define _M_GET(wait, type) m_get_common(wait, type, 0) 3218#define _M_GETHDR(wait, type) m_get_common(wait, type, 1) 3219#define _M_RETRY(wait, type) _M_GET(wait, type) 3220#define _M_RETRYHDR(wait, type) _M_GETHDR(wait, type) 3221#define _MGET(m, how, type) ((m) = _M_GET(how, type)) 3222#define _MGETHDR(m, how, type) ((m) = _M_GETHDR(how, type)) 3223 3224struct mbuf * 3225m_get(int wait, int type) 3226{ 3227 return (_M_GET(wait, type)); 3228} 3229 3230struct mbuf * 3231m_gethdr(int wait, int type) 3232{ 3233 return (_M_GETHDR(wait, type)); 3234} 3235 3236struct mbuf * 3237m_retry(int wait, int type) 3238{ 3239 return (_M_RETRY(wait, type)); 3240} 3241 3242struct mbuf * 3243m_retryhdr(int wait, int type) 3244{ 3245 return (_M_RETRYHDR(wait, type)); 3246} 3247 3248struct mbuf * 3249m_getclr(int wait, int type) 3250{ 3251 struct mbuf *m; 3252 3253 _MGET(m, wait, type); 3254 if (m != NULL) 3255 bzero(MTOD(m, caddr_t), MLEN); 3256 return (m); 3257} 3258 3259struct mbuf * 3260m_free(struct mbuf *m) 3261{ 3262 struct mbuf *n = m->m_next; 3263 3264 if (m->m_type == MT_FREE) 3265 panic("m_free: freeing an already freed mbuf"); 3266 3267 /* Free the aux data and tags if there is any */ 3268 if (m->m_flags & M_PKTHDR) { 3269 m_tag_delete_chain(m, NULL); 3270 } 3271 3272 if (m->m_flags & M_EXT) { 3273 u_int32_t refcnt; 3274 u_int32_t composite; 3275 3276 refcnt = m_decref(m); 3277 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); 3278 if (refcnt == 0 && !composite) { 3279 if (m->m_ext.ext_free == NULL) { 3280 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); 3281 } else if (m->m_ext.ext_free == m_bigfree) { 3282 mcache_free(m_cache(MC_BIGCL), 3283 m->m_ext.ext_buf); 3284 } else if (m->m_ext.ext_free == m_16kfree) { 3285 mcache_free(m_cache(MC_16KCL), 3286 m->m_ext.ext_buf); 3287 } else { 3288 (*(m->m_ext.ext_free))(m->m_ext.ext_buf, 3289 m->m_ext.ext_size, m->m_ext.ext_arg); 3290 } 3291 
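			/*
			 * Last reference on a non-composite cluster: the
			 * backing buffer was released above, so return the
			 * ext_ref bookkeeping structure to ref_cache as well
			 * before the mbuf itself is freed further down.
			 */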
mcache_free(ref_cache, MEXT_RFA(m)); 3292 MEXT_RFA(m) = NULL; 3293 } else if (refcnt == 0 && composite) { 3294 VERIFY(m->m_type != MT_FREE); 3295 3296 mtype_stat_dec(m->m_type); 3297 mtype_stat_inc(MT_FREE); 3298 3299 m->m_type = MT_FREE; 3300 m->m_flags = M_EXT; 3301 m->m_len = 0; 3302 m->m_next = m->m_nextpkt = NULL; 3303 3304 MEXT_FLAGS(m) &= ~EXTF_READONLY; 3305 3306 /* "Free" into the intermediate cache */ 3307 if (m->m_ext.ext_free == NULL) { 3308 mcache_free(m_cache(MC_MBUF_CL), m); 3309 } else if (m->m_ext.ext_free == m_bigfree) { 3310 mcache_free(m_cache(MC_MBUF_BIGCL), m); 3311 } else { 3312 VERIFY(m->m_ext.ext_free == m_16kfree); 3313 mcache_free(m_cache(MC_MBUF_16KCL), m); 3314 } 3315 return (n); 3316 } 3317 } 3318 3319 if (m->m_type != MT_FREE) { 3320 mtype_stat_dec(m->m_type); 3321 mtype_stat_inc(MT_FREE); 3322 } 3323 3324 m->m_type = MT_FREE; 3325 m->m_flags = m->m_len = 0; 3326 m->m_next = m->m_nextpkt = NULL; 3327 3328 mcache_free(m_cache(MC_MBUF), m); 3329 3330 return (n); 3331} 3332 3333__private_extern__ struct mbuf * 3334m_clattach(struct mbuf *m, int type, caddr_t extbuf, 3335 void (*extfree)(caddr_t, u_int, caddr_t), u_int extsize, caddr_t extarg, 3336 int wait) 3337{ 3338 struct ext_ref *rfa = NULL; 3339 3340 if (m == NULL && (m = _M_GETHDR(wait, type)) == NULL) 3341 return (NULL); 3342 3343 if (m->m_flags & M_EXT) { 3344 u_int32_t refcnt; 3345 u_int32_t composite; 3346 3347 refcnt = m_decref(m); 3348 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); 3349 if (refcnt == 0 && !composite) { 3350 if (m->m_ext.ext_free == NULL) { 3351 mcache_free(m_cache(MC_CL), m->m_ext.ext_buf); 3352 } else if (m->m_ext.ext_free == m_bigfree) { 3353 mcache_free(m_cache(MC_BIGCL), 3354 m->m_ext.ext_buf); 3355 } else if (m->m_ext.ext_free == m_16kfree) { 3356 mcache_free(m_cache(MC_16KCL), 3357 m->m_ext.ext_buf); 3358 } else { 3359 (*(m->m_ext.ext_free))(m->m_ext.ext_buf, 3360 m->m_ext.ext_size, m->m_ext.ext_arg); 3361 } 3362 /* Re-use the reference structure */ 3363 rfa = MEXT_RFA(m); 3364 } else if (refcnt == 0 && composite) { 3365 VERIFY(m->m_type != MT_FREE); 3366 3367 mtype_stat_dec(m->m_type); 3368 mtype_stat_inc(MT_FREE); 3369 3370 m->m_type = MT_FREE; 3371 m->m_flags = M_EXT; 3372 m->m_len = 0; 3373 m->m_next = m->m_nextpkt = NULL; 3374 3375 MEXT_FLAGS(m) &= ~EXTF_READONLY; 3376 3377 /* "Free" into the intermediate cache */ 3378 if (m->m_ext.ext_free == NULL) { 3379 mcache_free(m_cache(MC_MBUF_CL), m); 3380 } else if (m->m_ext.ext_free == m_bigfree) { 3381 mcache_free(m_cache(MC_MBUF_BIGCL), m); 3382 } else { 3383 VERIFY(m->m_ext.ext_free == m_16kfree); 3384 mcache_free(m_cache(MC_MBUF_16KCL), m); 3385 } 3386 /* 3387 * Allocate a new mbuf, since we didn't divorce 3388 * the composite mbuf + cluster pair above. 3389 */ 3390 if ((m = _M_GETHDR(wait, type)) == NULL) 3391 return (NULL); 3392 } 3393 } 3394 3395 if (rfa == NULL && 3396 (rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) { 3397 m_free(m); 3398 return (NULL); 3399 } 3400 3401 MEXT_INIT(m, extbuf, extsize, extfree, extarg, rfa, 1, 0); 3402 3403 return (m); 3404} 3405 3406/* 3407 * Perform `fast' allocation mbuf clusters from a cache of recently-freed 3408 * clusters. (If the cache is empty, new clusters are allocated en-masse.) 3409 */ 3410struct mbuf * 3411m_getcl(int wait, int type, int flags) 3412{ 3413 struct mbuf *m; 3414 int mcflags = MSLEEPF(wait); 3415 int hdr = (flags & M_PKTHDR); 3416 3417 /* Is this due to a non-blocking retry? 
If so, then try harder */ 3418 if (mcflags & MCR_NOSLEEP) 3419 mcflags |= MCR_TRYHARD; 3420 3421 m = mcache_alloc(m_cache(MC_MBUF_CL), mcflags); 3422 if (m != NULL) { 3423 u_int32_t flag; 3424 struct ext_ref *rfa; 3425 void *cl; 3426 3427 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); 3428 cl = m->m_ext.ext_buf; 3429 rfa = MEXT_RFA(m); 3430 3431 ASSERT(cl != NULL && rfa != NULL); 3432 VERIFY(MBUF_IS_COMPOSITE(m) && m->m_ext.ext_free == NULL); 3433 3434 flag = MEXT_FLAGS(m); 3435 3436 MBUF_INIT(m, hdr, type); 3437 MBUF_CL_INIT(m, cl, rfa, 1, flag); 3438 3439 mtype_stat_inc(type); 3440 mtype_stat_dec(MT_FREE); 3441#if CONFIG_MACF_NET 3442 if (hdr && mac_init_mbuf(m, wait) != 0) { 3443 m_freem(m); 3444 return (NULL); 3445 } 3446#endif /* MAC_NET */ 3447 } 3448 return (m); 3449} 3450 3451/* m_mclget() add an mbuf cluster to a normal mbuf */ 3452struct mbuf * 3453m_mclget(struct mbuf *m, int wait) 3454{ 3455 struct ext_ref *rfa; 3456 3457 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) 3458 return (m); 3459 3460 m->m_ext.ext_buf = m_mclalloc(wait); 3461 if (m->m_ext.ext_buf != NULL) { 3462 MBUF_CL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); 3463 } else { 3464 mcache_free(ref_cache, rfa); 3465 } 3466 return (m); 3467} 3468 3469/* Allocate an mbuf cluster */ 3470caddr_t 3471m_mclalloc(int wait) 3472{ 3473 int mcflags = MSLEEPF(wait); 3474 3475 /* Is this due to a non-blocking retry? If so, then try harder */ 3476 if (mcflags & MCR_NOSLEEP) 3477 mcflags |= MCR_TRYHARD; 3478 3479 return (mcache_alloc(m_cache(MC_CL), mcflags)); 3480} 3481 3482/* Free an mbuf cluster */ 3483void 3484m_mclfree(caddr_t p) 3485{ 3486 mcache_free(m_cache(MC_CL), p); 3487} 3488 3489/* 3490 * mcl_hasreference() checks if a cluster of an mbuf is referenced by 3491 * another mbuf; see comments in m_incref() regarding EXTF_READONLY. 3492 */ 3493int 3494m_mclhasreference(struct mbuf *m) 3495{ 3496 if (!(m->m_flags & M_EXT)) 3497 return (0); 3498 3499 ASSERT(MEXT_RFA(m) != NULL); 3500 3501 return ((MEXT_FLAGS(m) & EXTF_READONLY) ? 1 : 0); 3502} 3503 3504__private_extern__ caddr_t 3505m_bigalloc(int wait) 3506{ 3507 int mcflags = MSLEEPF(wait); 3508 3509 /* Is this due to a non-blocking retry? If so, then try harder */ 3510 if (mcflags & MCR_NOSLEEP) 3511 mcflags |= MCR_TRYHARD; 3512 3513 return (mcache_alloc(m_cache(MC_BIGCL), mcflags)); 3514} 3515 3516__private_extern__ void 3517m_bigfree(caddr_t p, __unused u_int size, __unused caddr_t arg) 3518{ 3519 mcache_free(m_cache(MC_BIGCL), p); 3520} 3521 3522/* m_mbigget() add an 4KB mbuf cluster to a normal mbuf */ 3523__private_extern__ struct mbuf * 3524m_mbigget(struct mbuf *m, int wait) 3525{ 3526 struct ext_ref *rfa; 3527 3528 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) 3529 return (m); 3530 3531 m->m_ext.ext_buf = m_bigalloc(wait); 3532 if (m->m_ext.ext_buf != NULL) { 3533 MBUF_BIGCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); 3534 } else { 3535 mcache_free(ref_cache, rfa); 3536 } 3537 return (m); 3538} 3539 3540__private_extern__ caddr_t 3541m_16kalloc(int wait) 3542{ 3543 int mcflags = MSLEEPF(wait); 3544 3545 /* Is this due to a non-blocking retry? 
If so, then try harder */ 3546 if (mcflags & MCR_NOSLEEP) 3547 mcflags |= MCR_TRYHARD; 3548 3549 return (mcache_alloc(m_cache(MC_16KCL), mcflags)); 3550} 3551 3552__private_extern__ void 3553m_16kfree(caddr_t p, __unused u_int size, __unused caddr_t arg) 3554{ 3555 mcache_free(m_cache(MC_16KCL), p); 3556} 3557 3558/* m_m16kget() add a 16KB mbuf cluster to a normal mbuf */ 3559__private_extern__ struct mbuf * 3560m_m16kget(struct mbuf *m, int wait) 3561{ 3562 struct ext_ref *rfa; 3563 3564 if ((rfa = mcache_alloc(ref_cache, MSLEEPF(wait))) == NULL) 3565 return (m); 3566 3567 m->m_ext.ext_buf = m_16kalloc(wait); 3568 if (m->m_ext.ext_buf != NULL) { 3569 MBUF_16KCL_INIT(m, m->m_ext.ext_buf, rfa, 1, 0); 3570 } else { 3571 mcache_free(ref_cache, rfa); 3572 } 3573 return (m); 3574} 3575 3576/* 3577 * "Move" mbuf pkthdr from "from" to "to". 3578 * "from" must have M_PKTHDR set, and "to" must be empty. 3579 */ 3580void 3581m_copy_pkthdr(struct mbuf *to, struct mbuf *from) 3582{ 3583 /* We will be taking over the tags of 'to' */ 3584 if (to->m_flags & M_PKTHDR) 3585 m_tag_delete_chain(to, NULL); 3586 to->m_pkthdr = from->m_pkthdr; /* especially tags */ 3587 m_tag_init(from); /* purge tags from src */ 3588 m_service_class_init(from); /* reset svc class from src */ 3589 from->m_pkthdr.aux_flags = 0; /* clear aux flags from src */ 3590 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); 3591 if ((to->m_flags & M_EXT) == 0) 3592 to->m_data = to->m_pktdat; 3593} 3594 3595/* 3596 * Duplicate "from"'s mbuf pkthdr in "to". 3597 * "from" must have M_PKTHDR set, and "to" must be empty. 3598 * In particular, this does a deep copy of the packet tags. 3599 */ 3600static int 3601m_dup_pkthdr(struct mbuf *to, struct mbuf *from, int how) 3602{ 3603 if (to->m_flags & M_PKTHDR) 3604 m_tag_delete_chain(to, NULL); 3605 to->m_flags = (from->m_flags & M_COPYFLAGS) | (to->m_flags & M_EXT); 3606 if ((to->m_flags & M_EXT) == 0) 3607 to->m_data = to->m_pktdat; 3608 to->m_pkthdr = from->m_pkthdr; 3609 m_tag_init(to); 3610 return (m_tag_copy_chain(to, from, how)); 3611} 3612 3613void 3614m_copy_pftag(struct mbuf *to, struct mbuf *from) 3615{ 3616 to->m_pkthdr.pf_mtag = from->m_pkthdr.pf_mtag; 3617 to->m_pkthdr.pf_mtag.pftag_hdr = NULL; 3618 to->m_pkthdr.pf_mtag.pftag_flags &= ~(PF_TAG_HDR_INET|PF_TAG_HDR_INET6); 3619} 3620 3621/* 3622 * Return a list of mbuf hdrs that point to clusters. Try for num_needed; 3623 * if wantall is not set, return whatever number were available. Set up the 3624 * first num_with_pkthdrs with mbuf hdrs configured as packet headers; these 3625 * are chained on the m_nextpkt field. Any packets requested beyond this 3626 * are chained onto the last packet header's m_next field. The size of 3627 * the cluster is controlled by the parameter bufsize. 3628 */ 3629__private_extern__ struct mbuf * 3630m_getpackets_internal(unsigned int *num_needed, int num_with_pkthdrs, 3631 int wait, int wantall, size_t bufsize) 3632{ 3633 struct mbuf *m; 3634 struct mbuf **np, *top; 3635 unsigned int pnum, needed = *num_needed; 3636 mcache_obj_t *mp_list = NULL; 3637 int mcflags = MSLEEPF(wait); 3638 u_int32_t flag; 3639 struct ext_ref *rfa; 3640 mcache_t *cp; 3641 void *cl; 3642 3643 ASSERT(bufsize == m_maxsize(MC_CL) || 3644 bufsize == m_maxsize(MC_BIGCL) || 3645 bufsize == m_maxsize(MC_16KCL)); 3646 3647 /* 3648 * Caller must first check for njcl because this 3649 * routine is internal and not exposed/used via KPI. 
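	 *
	 * An illustrative call for the jumbo case (a sketch only; the counts
	 * are made up):
	 *
	 *	if (njcl > 0) {
	 *		unsigned int n = 32;
	 *		top = m_getpackets_internal(&n, 8, M_WAIT, 0,
	 *		    m_maxsize(MC_16KCL));
	 *	}
	 *
	 * where, on return, n holds the number of packets actually built and
	 * the first 8 of them carry packet headers.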
3650 */ 3651 VERIFY(bufsize != m_maxsize(MC_16KCL) || njcl > 0); 3652 3653 top = NULL; 3654 np = &top; 3655 pnum = 0; 3656 3657 /* 3658 * The caller doesn't want all the requested buffers; only some. 3659 * Try hard to get what we can, but don't block. This effectively 3660 * overrides MCR_SLEEP, since this thread will not go to sleep 3661 * if we can't get all the buffers. 3662 */ 3663 if (!wantall || (mcflags & MCR_NOSLEEP)) 3664 mcflags |= MCR_TRYHARD; 3665 3666 /* Allocate the composite mbuf + cluster elements from the cache */ 3667 if (bufsize == m_maxsize(MC_CL)) 3668 cp = m_cache(MC_MBUF_CL); 3669 else if (bufsize == m_maxsize(MC_BIGCL)) 3670 cp = m_cache(MC_MBUF_BIGCL); 3671 else 3672 cp = m_cache(MC_MBUF_16KCL); 3673 needed = mcache_alloc_ext(cp, &mp_list, needed, mcflags); 3674 3675 for (pnum = 0; pnum < needed; pnum++) { 3676 m = (struct mbuf *)mp_list; 3677 mp_list = mp_list->obj_next; 3678 3679 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); 3680 cl = m->m_ext.ext_buf; 3681 rfa = MEXT_RFA(m); 3682 3683 ASSERT(cl != NULL && rfa != NULL); 3684 VERIFY(MBUF_IS_COMPOSITE(m)); 3685 3686 flag = MEXT_FLAGS(m); 3687 3688 MBUF_INIT(m, num_with_pkthdrs, MT_DATA); 3689 if (bufsize == m_maxsize(MC_16KCL)) { 3690 MBUF_16KCL_INIT(m, cl, rfa, 1, flag); 3691 } else if (bufsize == m_maxsize(MC_BIGCL)) { 3692 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag); 3693 } else { 3694 MBUF_CL_INIT(m, cl, rfa, 1, flag); 3695 } 3696 3697 if (num_with_pkthdrs > 0) { 3698 --num_with_pkthdrs; 3699#if CONFIG_MACF_NET 3700 if (mac_mbuf_label_init(m, wait) != 0) { 3701 m_freem(m); 3702 break; 3703 } 3704#endif /* MAC_NET */ 3705 } 3706 3707 *np = m; 3708 if (num_with_pkthdrs > 0) 3709 np = &m->m_nextpkt; 3710 else 3711 np = &m->m_next; 3712 } 3713 ASSERT(pnum != *num_needed || mp_list == NULL); 3714 if (mp_list != NULL) 3715 mcache_free_ext(cp, mp_list); 3716 3717 if (pnum > 0) { 3718 mtype_stat_add(MT_DATA, pnum); 3719 mtype_stat_sub(MT_FREE, pnum); 3720 } 3721 3722 if (wantall && (pnum != *num_needed)) { 3723 if (top != NULL) 3724 m_freem_list(top); 3725 return (NULL); 3726 } 3727 3728 if (pnum > *num_needed) { 3729 printf("%s: File a radar related to <rdar://10146739>. \ 3730 needed = %u, pnum = %u, num_needed = %u \n", 3731 __func__, needed, pnum, *num_needed); 3732 } 3733 3734 *num_needed = pnum; 3735 return (top); 3736} 3737 3738/* 3739 * Return a list of mbufs linked by m_nextpkt. Try for numlist, and if 3740 * wantall is not set, return whatever number was available. The size of 3741 * each mbuf in the list is controlled by the parameter packetlen. Each 3742 * mbuf of the list may have a chain of mbufs linked by m_next. Each mbuf 3743 * in the chain is called a segment. If maxsegments is not null and the 3744 * value pointed to is not null, this specifies the maximum number of segments 3745 * for a chain of mbufs. If maxsegments is zero or the value pointed to 3746 * is zero the caller does not have any restriction on the number of segments. 3747 * The actual number of segments of a mbuf chain is returned in the value 3748 * pointed to by maxsegments.
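 *
 * An illustrative, non-normative usage sketch (the variable names and
 * sizes below are hypothetical, not taken from an actual caller):
 *
 *	unsigned int cnt = 32;
 *	struct mbuf *list;
 *
 *	list = m_allocpacket_internal(&cnt, 1500, NULL, M_DONTWAIT, 0, 0);
 *
 * With wantall and wantsize both 0 the routine picks the cluster size
 * itself and returns whatever chains it could build without blocking;
 * on return, cnt reflects the number of chains actually obtained and
 * list (possibly NULL) links them through m_nextpkt.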
3749 */ 3750__private_extern__ struct mbuf * 3751m_allocpacket_internal(unsigned int *numlist, size_t packetlen, 3752 unsigned int *maxsegments, int wait, int wantall, size_t wantsize) 3753{ 3754 struct mbuf **np, *top, *first = NULL; 3755 size_t bufsize, r_bufsize; 3756 unsigned int num = 0; 3757 unsigned int nsegs = 0; 3758 unsigned int needed, resid; 3759 int mcflags = MSLEEPF(wait); 3760 mcache_obj_t *mp_list = NULL, *rmp_list = NULL; 3761 mcache_t *cp = NULL, *rcp = NULL; 3762 3763 if (*numlist == 0) 3764 return (NULL); 3765 3766 top = NULL; 3767 np = ⊤ 3768 3769 if (wantsize == 0) { 3770 if (packetlen <= MINCLSIZE) { 3771 bufsize = packetlen; 3772 } else if (packetlen > m_maxsize(MC_CL)) { 3773 /* Use 4KB if jumbo cluster pool isn't available */ 3774 if (packetlen <= m_maxsize(MC_BIGCL) || njcl == 0) 3775 bufsize = m_maxsize(MC_BIGCL); 3776 else 3777 bufsize = m_maxsize(MC_16KCL); 3778 } else { 3779 bufsize = m_maxsize(MC_CL); 3780 } 3781 } else if (wantsize == m_maxsize(MC_CL) || 3782 wantsize == m_maxsize(MC_BIGCL) || 3783 (wantsize == m_maxsize(MC_16KCL) && njcl > 0)) { 3784 bufsize = wantsize; 3785 } else { 3786 return (NULL); 3787 } 3788 3789 if (bufsize <= MHLEN) { 3790 nsegs = 1; 3791 } else if (bufsize <= MINCLSIZE) { 3792 if (maxsegments != NULL && *maxsegments == 1) { 3793 bufsize = m_maxsize(MC_CL); 3794 nsegs = 1; 3795 } else { 3796 nsegs = 2; 3797 } 3798 } else if (bufsize == m_maxsize(MC_16KCL)) { 3799 VERIFY(njcl > 0); 3800 nsegs = ((packetlen - 1) >> (PGSHIFT + 2)) + 1; 3801 } else if (bufsize == m_maxsize(MC_BIGCL)) { 3802 nsegs = ((packetlen - 1) >> PGSHIFT) + 1; 3803 } else { 3804 nsegs = ((packetlen - 1) >> MCLSHIFT) + 1; 3805 } 3806 if (maxsegments != NULL) { 3807 if (*maxsegments && nsegs > *maxsegments) { 3808 *maxsegments = nsegs; 3809 return (NULL); 3810 } 3811 *maxsegments = nsegs; 3812 } 3813 3814 /* 3815 * The caller doesn't want all the requested buffers; only some. 3816 * Try hard to get what we can, but don't block. This effectively 3817 * overrides MCR_SLEEP, since this thread will not go to sleep 3818 * if we can't get all the buffers. 3819 */ 3820 if (!wantall || (mcflags & MCR_NOSLEEP)) 3821 mcflags |= MCR_TRYHARD; 3822 3823 /* 3824 * Simple case where all elements in the lists/chains are mbufs. 3825 * Unless bufsize is greater than MHLEN, each segment chain is made 3826 * up of exactly 1 mbuf. Otherwise, each segment chain is made up 3827 * of 2 mbufs; the second one is used for the residual data, i.e. 3828 * the remaining data that cannot fit into the first mbuf. 3829 */ 3830 if (bufsize <= MINCLSIZE) { 3831 /* Allocate the elements in one shot from the mbuf cache */ 3832 ASSERT(bufsize <= MHLEN || nsegs == 2); 3833 cp = m_cache(MC_MBUF); 3834 needed = mcache_alloc_ext(cp, &mp_list, 3835 (*numlist) * nsegs, mcflags); 3836 3837 /* 3838 * The number of elements must be even if we are to use an 3839 * mbuf (instead of a cluster) to store the residual data. 3840 * If we couldn't allocate the requested number of mbufs, 3841 * trim the number down (if it's odd) in order to avoid 3842 * creating a partial segment chain. 
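 *
 * For example, if each segment chain needs two mbufs (bufsize > MHLEN,
 * so nsegs == 2) and the cache returns only 7 of the mbufs requested,
 * the count is trimmed to 6 so that every chain gets its full pair and
 * no partial chain is created.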
3843 */ 3844 if (bufsize > MHLEN && (needed & 0x1)) 3845 needed--; 3846 3847 while (num < needed) { 3848 struct mbuf *m; 3849 3850 m = (struct mbuf *)mp_list; 3851 mp_list = mp_list->obj_next; 3852 ASSERT(m != NULL); 3853 3854 MBUF_INIT(m, 1, MT_DATA); 3855#if CONFIG_MACF_NET 3856 if (mac_init_mbuf(m, wait) != 0) { 3857 m_free(m); 3858 break; 3859 } 3860#endif /* MAC_NET */ 3861 num++; 3862 if (bufsize > MHLEN) { 3863 /* A second mbuf for this segment chain */ 3864 m->m_next = (struct mbuf *)mp_list; 3865 mp_list = mp_list->obj_next; 3866 ASSERT(m->m_next != NULL); 3867 3868 MBUF_INIT(m->m_next, 0, MT_DATA); 3869 num++; 3870 } 3871 *np = m; 3872 np = &m->m_nextpkt; 3873 } 3874 ASSERT(num != *numlist || mp_list == NULL); 3875 3876 if (num > 0) { 3877 mtype_stat_add(MT_DATA, num); 3878 mtype_stat_sub(MT_FREE, num); 3879 } 3880 num /= nsegs; 3881 3882 /* We've got them all; return to caller */ 3883 if (num == *numlist) 3884 return (top); 3885 3886 goto fail; 3887 } 3888 3889 /* 3890 * Complex cases where elements are made up of one or more composite 3891 * mbufs + cluster, depending on packetlen. Each N-segment chain can 3892 * be illustrated as follows: 3893 * 3894 * [mbuf + cluster 1] [mbuf + cluster 2] ... [mbuf + cluster N] 3895 * 3896 * Every composite mbuf + cluster element comes from the intermediate 3897 * cache (either MC_MBUF_CL or MC_MBUF_BIGCL). For space efficiency, 3898 * the last composite element will come from the MC_MBUF_CL cache, 3899 * unless the residual data is larger than 2KB where we use the 3900 * big cluster composite cache (MC_MBUF_BIGCL) instead. Residual 3901 * data is defined as extra data beyond the first element that cannot 3902 * fit into the previous element, i.e. there is no residual data if 3903 * the chain only has 1 segment. 3904 */ 3905 r_bufsize = bufsize; 3906 resid = packetlen > bufsize ? packetlen % bufsize : 0; 3907 if (resid > 0) { 3908 /* There is residual data; figure out the cluster size */ 3909 if (wantsize == 0 && packetlen > MINCLSIZE) { 3910 /* 3911 * Caller didn't request that all of the segments 3912 * in the chain use the same cluster size; use the 3913 * smaller of the cluster sizes. 3914 */ 3915 if (njcl > 0 && resid > m_maxsize(MC_BIGCL)) 3916 r_bufsize = m_maxsize(MC_16KCL); 3917 else if (resid > m_maxsize(MC_CL)) 3918 r_bufsize = m_maxsize(MC_BIGCL); 3919 else 3920 r_bufsize = m_maxsize(MC_CL); 3921 } else { 3922 /* Use the same cluster size as the other segments */ 3923 resid = 0; 3924 } 3925 } 3926 3927 needed = *numlist; 3928 if (resid > 0) { 3929 /* 3930 * Attempt to allocate composite mbuf + cluster elements for 3931 * the residual data in each chain; record the number of such 3932 * elements that can be allocated so that we know how many 3933 * segment chains we can afford to create. 3934 */ 3935 if (r_bufsize <= m_maxsize(MC_CL)) 3936 rcp = m_cache(MC_MBUF_CL); 3937 else if (r_bufsize <= m_maxsize(MC_BIGCL)) 3938 rcp = m_cache(MC_MBUF_BIGCL); 3939 else 3940 rcp = m_cache(MC_MBUF_16KCL); 3941 needed = mcache_alloc_ext(rcp, &rmp_list, *numlist, mcflags); 3942 3943 if (needed == 0) 3944 goto fail; 3945 3946 /* This is temporarily reduced for calculation */ 3947 ASSERT(nsegs > 1); 3948 nsegs--; 3949 } 3950 3951 /* 3952 * Attempt to allocate the rest of the composite mbuf + cluster 3953 * elements for the number of segment chains that we need. 
3954 */ 3955 if (bufsize <= m_maxsize(MC_CL)) 3956 cp = m_cache(MC_MBUF_CL); 3957 else if (bufsize <= m_maxsize(MC_BIGCL)) 3958 cp = m_cache(MC_MBUF_BIGCL); 3959 else 3960 cp = m_cache(MC_MBUF_16KCL); 3961 needed = mcache_alloc_ext(cp, &mp_list, needed * nsegs, mcflags); 3962 3963 /* Round it down to avoid creating a partial segment chain */ 3964 needed = (needed / nsegs) * nsegs; 3965 if (needed == 0) 3966 goto fail; 3967 3968 if (resid > 0) { 3969 /* 3970 * We're about to construct the chain(s); take into account 3971 * the number of segments we have created above to hold the 3972 * residual data for each chain, as well as restore the 3973 * original count of segments per chain. 3974 */ 3975 ASSERT(nsegs > 0); 3976 needed += needed / nsegs; 3977 nsegs++; 3978 } 3979 3980 for (;;) { 3981 struct mbuf *m; 3982 u_int32_t flag; 3983 struct ext_ref *rfa; 3984 void *cl; 3985 int pkthdr; 3986 3987 ++num; 3988 if (nsegs == 1 || (num % nsegs) != 0 || resid == 0) { 3989 m = (struct mbuf *)mp_list; 3990 mp_list = mp_list->obj_next; 3991 } else { 3992 m = (struct mbuf *)rmp_list; 3993 rmp_list = rmp_list->obj_next; 3994 } 3995 ASSERT(m != NULL); 3996 VERIFY(m->m_type == MT_FREE && m->m_flags == M_EXT); 3997 VERIFY(m->m_ext.ext_free == NULL || 3998 m->m_ext.ext_free == m_bigfree || 3999 m->m_ext.ext_free == m_16kfree); 4000 4001 cl = m->m_ext.ext_buf; 4002 rfa = MEXT_RFA(m); 4003 4004 ASSERT(cl != NULL && rfa != NULL); 4005 VERIFY(MBUF_IS_COMPOSITE(m)); 4006 4007 flag = MEXT_FLAGS(m); 4008 4009 pkthdr = (nsegs == 1 || (num % nsegs) == 1); 4010 if (pkthdr) 4011 first = m; 4012 MBUF_INIT(m, pkthdr, MT_DATA); 4013 if (m->m_ext.ext_free == m_16kfree) { 4014 MBUF_16KCL_INIT(m, cl, rfa, 1, flag); 4015 } else if (m->m_ext.ext_free == m_bigfree) { 4016 MBUF_BIGCL_INIT(m, cl, rfa, 1, flag); 4017 } else { 4018 MBUF_CL_INIT(m, cl, rfa, 1, flag); 4019 } 4020#if CONFIG_MACF_NET 4021 if (pkthdr && mac_init_mbuf(m, wait) != 0) { 4022 --num; 4023 m_freem(m); 4024 break; 4025 } 4026#endif /* MAC_NET */ 4027 4028 *np = m; 4029 if ((num % nsegs) == 0) 4030 np = &first->m_nextpkt; 4031 else 4032 np = &m->m_next; 4033 4034 if (num == needed) 4035 break; 4036 } 4037 4038 if (num > 0) { 4039 mtype_stat_add(MT_DATA, num); 4040 mtype_stat_sub(MT_FREE, num); 4041 } 4042 4043 num /= nsegs; 4044 4045 /* We've got them all; return to caller */ 4046 if (num == *numlist) { 4047 ASSERT(mp_list == NULL && rmp_list == NULL); 4048 return (top); 4049 } 4050 4051fail: 4052 /* Free up what's left of the above */ 4053 if (mp_list != NULL) 4054 mcache_free_ext(cp, mp_list); 4055 if (rmp_list != NULL) 4056 mcache_free_ext(rcp, rmp_list); 4057 if (wantall && top != NULL) { 4058 m_freem(top); 4059 return (NULL); 4060 } 4061 *numlist = num; 4062 return (top); 4063} 4064 4065/* 4066 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated 4067 * packets on receive ring. 4068 */ 4069__private_extern__ struct mbuf * 4070m_getpacket_how(int wait) 4071{ 4072 unsigned int num_needed = 1; 4073 4074 return (m_getpackets_internal(&num_needed, 1, wait, 1, 4075 m_maxsize(MC_CL))); 4076} 4077 4078/* 4079 * Best effort to get a mbuf cluster + pkthdr. Used by drivers to allocated 4080 * packets on receive ring. 4081 */ 4082struct mbuf * 4083m_getpacket(void) 4084{ 4085 unsigned int num_needed = 1; 4086 4087 return (m_getpackets_internal(&num_needed, 1, M_WAIT, 1, 4088 m_maxsize(MC_CL))); 4089} 4090 4091/* 4092 * Return a list of mbuf hdrs that point to clusters. 
Try for num_needed; 4093 * if this can't be met, return whatever number were available. Set up the 4094 * first num_with_pkthdrs with mbuf hdrs configured as packet headers. These 4095 * are chained on the m_nextpkt field. Any packets requested beyond this are 4096 * chained onto the last packet header's m_next field. 4097 */ 4098struct mbuf * 4099m_getpackets(int num_needed, int num_with_pkthdrs, int how) 4100{ 4101 unsigned int n = num_needed; 4102 4103 return (m_getpackets_internal(&n, num_with_pkthdrs, how, 0, 4104 m_maxsize(MC_CL))); 4105} 4106 4107/* 4108 * Return a list of mbuf hdrs set up as packet hdrs chained together 4109 * on the m_nextpkt field 4110 */ 4111struct mbuf * 4112m_getpackethdrs(int num_needed, int how) 4113{ 4114 struct mbuf *m; 4115 struct mbuf **np, *top; 4116 4117 top = NULL; 4118 np = ⊤ 4119 4120 while (num_needed--) { 4121 m = _M_RETRYHDR(how, MT_DATA); 4122 if (m == NULL) 4123 break; 4124 4125 *np = m; 4126 np = &m->m_nextpkt; 4127 } 4128 4129 return (top); 4130} 4131 4132/* 4133 * Free an mbuf list (m_nextpkt) while following m_next. Returns the count 4134 * for mbufs packets freed. Used by the drivers. 4135 */ 4136int 4137m_freem_list(struct mbuf *m) 4138{ 4139 struct mbuf *nextpkt; 4140 mcache_obj_t *mp_list = NULL; 4141 mcache_obj_t *mcl_list = NULL; 4142 mcache_obj_t *mbc_list = NULL; 4143 mcache_obj_t *m16k_list = NULL; 4144 mcache_obj_t *m_mcl_list = NULL; 4145 mcache_obj_t *m_mbc_list = NULL; 4146 mcache_obj_t *m_m16k_list = NULL; 4147 mcache_obj_t *ref_list = NULL; 4148 int pktcount = 0; 4149 int mt_free = 0, mt_data = 0, mt_header = 0, mt_soname = 0, mt_tag = 0; 4150 4151 while (m != NULL) { 4152 pktcount++; 4153 4154 nextpkt = m->m_nextpkt; 4155 m->m_nextpkt = NULL; 4156 4157 while (m != NULL) { 4158 struct mbuf *next = m->m_next; 4159 mcache_obj_t *o, *rfa; 4160 u_int32_t refcnt, composite; 4161 4162 if (m->m_type == MT_FREE) 4163 panic("m_free: freeing an already freed mbuf"); 4164 4165 if (m->m_type != MT_FREE) 4166 mt_free++; 4167 4168 if (m->m_flags & M_PKTHDR) { 4169 m_tag_delete_chain(m, NULL); 4170 } 4171 4172 if (!(m->m_flags & M_EXT)) 4173 goto simple_free; 4174 4175 o = (mcache_obj_t *)(void *)m->m_ext.ext_buf; 4176 refcnt = m_decref(m); 4177 composite = (MEXT_FLAGS(m) & EXTF_COMPOSITE); 4178 if (refcnt == 0 && !composite) { 4179 if (m->m_ext.ext_free == NULL) { 4180 o->obj_next = mcl_list; 4181 mcl_list = o; 4182 } else if (m->m_ext.ext_free == m_bigfree) { 4183 o->obj_next = mbc_list; 4184 mbc_list = o; 4185 } else if (m->m_ext.ext_free == m_16kfree) { 4186 o->obj_next = m16k_list; 4187 m16k_list = o; 4188 } else { 4189 (*(m->m_ext.ext_free))((caddr_t)o, 4190 m->m_ext.ext_size, 4191 m->m_ext.ext_arg); 4192 } 4193 rfa = (mcache_obj_t *)(void *)MEXT_RFA(m); 4194 rfa->obj_next = ref_list; 4195 ref_list = rfa; 4196 MEXT_RFA(m) = NULL; 4197 } else if (refcnt == 0 && composite) { 4198 VERIFY(m->m_type != MT_FREE); 4199 /* 4200 * Amortize the costs of atomic operations 4201 * by doing them at the end, if possible. 
4202 */ 4203 if (m->m_type == MT_DATA) 4204 mt_data++; 4205 else if (m->m_type == MT_HEADER) 4206 mt_header++; 4207 else if (m->m_type == MT_SONAME) 4208 mt_soname++; 4209 else if (m->m_type == MT_TAG) 4210 mt_tag++; 4211 else 4212 mtype_stat_dec(m->m_type); 4213 4214 m->m_type = MT_FREE; 4215 m->m_flags = M_EXT; 4216 m->m_len = 0; 4217 m->m_next = m->m_nextpkt = NULL; 4218 4219 MEXT_FLAGS(m) &= ~EXTF_READONLY; 4220 4221 /* "Free" into the intermediate cache */ 4222 o = (mcache_obj_t *)m; 4223 if (m->m_ext.ext_free == NULL) { 4224 o->obj_next = m_mcl_list; 4225 m_mcl_list = o; 4226 } else if (m->m_ext.ext_free == m_bigfree) { 4227 o->obj_next = m_mbc_list; 4228 m_mbc_list = o; 4229 } else { 4230 VERIFY(m->m_ext.ext_free == m_16kfree); 4231 o->obj_next = m_m16k_list; 4232 m_m16k_list = o; 4233 } 4234 m = next; 4235 continue; 4236 } 4237simple_free: 4238 /* 4239 * Amortize the costs of atomic operations 4240 * by doing them at the end, if possible. 4241 */ 4242 if (m->m_type == MT_DATA) 4243 mt_data++; 4244 else if (m->m_type == MT_HEADER) 4245 mt_header++; 4246 else if (m->m_type == MT_SONAME) 4247 mt_soname++; 4248 else if (m->m_type == MT_TAG) 4249 mt_tag++; 4250 else if (m->m_type != MT_FREE) 4251 mtype_stat_dec(m->m_type); 4252 4253 m->m_type = MT_FREE; 4254 m->m_flags = m->m_len = 0; 4255 m->m_next = m->m_nextpkt = NULL; 4256 4257 ((mcache_obj_t *)m)->obj_next = mp_list; 4258 mp_list = (mcache_obj_t *)m; 4259 4260 m = next; 4261 } 4262 4263 m = nextpkt; 4264 } 4265 4266 if (mt_free > 0) 4267 mtype_stat_add(MT_FREE, mt_free); 4268 if (mt_data > 0) 4269 mtype_stat_sub(MT_DATA, mt_data); 4270 if (mt_header > 0) 4271 mtype_stat_sub(MT_HEADER, mt_header); 4272 if (mt_soname > 0) 4273 mtype_stat_sub(MT_SONAME, mt_soname); 4274 if (mt_tag > 0) 4275 mtype_stat_sub(MT_TAG, mt_tag); 4276 4277 if (mp_list != NULL) 4278 mcache_free_ext(m_cache(MC_MBUF), mp_list); 4279 if (mcl_list != NULL) 4280 mcache_free_ext(m_cache(MC_CL), mcl_list); 4281 if (mbc_list != NULL) 4282 mcache_free_ext(m_cache(MC_BIGCL), mbc_list); 4283 if (m16k_list != NULL) 4284 mcache_free_ext(m_cache(MC_16KCL), m16k_list); 4285 if (m_mcl_list != NULL) 4286 mcache_free_ext(m_cache(MC_MBUF_CL), m_mcl_list); 4287 if (m_mbc_list != NULL) 4288 mcache_free_ext(m_cache(MC_MBUF_BIGCL), m_mbc_list); 4289 if (m_m16k_list != NULL) 4290 mcache_free_ext(m_cache(MC_MBUF_16KCL), m_m16k_list); 4291 if (ref_list != NULL) 4292 mcache_free_ext(ref_cache, ref_list); 4293 4294 return (pktcount); 4295} 4296 4297void 4298m_freem(struct mbuf *m) 4299{ 4300 while (m != NULL) 4301 m = m_free(m); 4302} 4303 4304/* 4305 * Mbuffer utility routines. 4306 */ 4307 4308/* 4309 * Compute the amount of space available before the current start 4310 * of data in an mbuf. 4311 */ 4312int 4313m_leadingspace(struct mbuf *m) 4314{ 4315 if (m->m_flags & M_EXT) { 4316 if (MCLHASREFERENCE(m)) 4317 return (0); 4318 return (m->m_data - m->m_ext.ext_buf); 4319 } 4320 if (m->m_flags & M_PKTHDR) 4321 return (m->m_data - m->m_pktdat); 4322 return (m->m_data - m->m_dat); 4323} 4324 4325/* 4326 * Compute the amount of space available after the end of data in an mbuf. 4327 */ 4328int 4329m_trailingspace(struct mbuf *m) 4330{ 4331 if (m->m_flags & M_EXT) { 4332 if (MCLHASREFERENCE(m)) 4333 return (0); 4334 return (m->m_ext.ext_buf + m->m_ext.ext_size - 4335 (m->m_data + m->m_len)); 4336 } 4337 return (&m->m_dat[MLEN] - (m->m_data + m->m_len)); 4338} 4339 4340/* 4341 * Lesser-used path for M_PREPEND: allocate new mbuf to prepend to chain, 4342 * copy junk along. 
Does not adjust packet header length. 4343 */ 4344struct mbuf * 4345m_prepend(struct mbuf *m, int len, int how) 4346{ 4347 struct mbuf *mn; 4348 4349 _MGET(mn, how, m->m_type); 4350 if (mn == NULL) { 4351 m_freem(m); 4352 return (NULL); 4353 } 4354 if (m->m_flags & M_PKTHDR) { 4355 M_COPY_PKTHDR(mn, m); 4356 m->m_flags &= ~M_PKTHDR; 4357 } 4358 mn->m_next = m; 4359 m = mn; 4360 if (len < MHLEN) 4361 MH_ALIGN(m, len); 4362 m->m_len = len; 4363 return (m); 4364} 4365 4366/* 4367 * Replacement for old M_PREPEND macro: allocate new mbuf to prepend to 4368 * chain, copy junk along, and adjust length. 4369 */ 4370struct mbuf * 4371m_prepend_2(struct mbuf *m, int len, int how) 4372{ 4373 if (M_LEADINGSPACE(m) >= len) { 4374 m->m_data -= len; 4375 m->m_len += len; 4376 } else { 4377 m = m_prepend(m, len, how); 4378 } 4379 if ((m) && (m->m_flags & M_PKTHDR)) 4380 m->m_pkthdr.len += len; 4381 return (m); 4382} 4383 4384/* 4385 * Make a copy of an mbuf chain starting "off0" bytes from the beginning, 4386 * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. 4387 * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. 4388 */ 4389int MCFail; 4390 4391struct mbuf * 4392m_copym(struct mbuf *m, int off0, int len, int wait) 4393{ 4394 struct mbuf *n, *mhdr = NULL, **np; 4395 int off = off0; 4396 struct mbuf *top; 4397 int copyhdr = 0; 4398 4399 if (off < 0 || len < 0) 4400 panic("m_copym: invalid offset %d or len %d", off, len); 4401 4402 if (off == 0 && (m->m_flags & M_PKTHDR)) { 4403 mhdr = m; 4404 copyhdr = 1; 4405 } 4406 4407 while (off >= m->m_len) { 4408 if (m->m_next == NULL) 4409 panic("m_copym: invalid mbuf chain"); 4410 off -= m->m_len; 4411 m = m->m_next; 4412 } 4413 np = ⊤ 4414 top = NULL; 4415 4416 while (len > 0) { 4417 if (m == NULL) { 4418 if (len != M_COPYALL) 4419 panic("m_copym: len != M_COPYALL"); 4420 break; 4421 } 4422 4423 n = _M_RETRY(wait, m->m_type); 4424 *np = n; 4425 4426 if (n == NULL) 4427 goto nospace; 4428 4429 if (copyhdr != 0) { 4430 M_COPY_PKTHDR(n, mhdr); 4431 if (len == M_COPYALL) 4432 n->m_pkthdr.len -= off0; 4433 else 4434 n->m_pkthdr.len = len; 4435 copyhdr = 0; 4436 } 4437 if (len == M_COPYALL) { 4438 if (MIN(len, (m->m_len - off)) == len) { 4439 printf("m->m_len %d - off %d = %d, %d\n", 4440 m->m_len, off, m->m_len - off, 4441 MIN(len, (m->m_len - off))); 4442 } 4443 } 4444 n->m_len = MIN(len, (m->m_len - off)); 4445 if (n->m_len == M_COPYALL) { 4446 printf("n->m_len == M_COPYALL, fixing\n"); 4447 n->m_len = MHLEN; 4448 } 4449 if (m->m_flags & M_EXT) { 4450 n->m_ext = m->m_ext; 4451 m_incref(m); 4452 n->m_data = m->m_data + off; 4453 n->m_flags |= M_EXT; 4454 } else { 4455 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t), 4456 (unsigned)n->m_len); 4457 } 4458 if (len != M_COPYALL) 4459 len -= n->m_len; 4460 off = 0; 4461 m = m->m_next; 4462 np = &n->m_next; 4463 } 4464 4465 if (top == NULL) 4466 MCFail++; 4467 4468 return (top); 4469nospace: 4470 4471 m_freem(top); 4472 MCFail++; 4473 return (NULL); 4474} 4475 4476/* 4477 * Equivalent to m_copym except that all necessary mbuf hdrs are allocated 4478 * within this routine also, the last mbuf and offset accessed are passed 4479 * out and can be passed back in to avoid having to rescan the entire mbuf 4480 * list (normally hung off of the socket) 4481 */ 4482struct mbuf * 4483m_copym_with_hdrs(struct mbuf *m, int off0, int len0, int wait, 4484 struct mbuf **m_lastm, int *m_off) 4485{ 4486 struct mbuf *n, **np = NULL; 4487 int off = off0, len = len0; 4488 struct mbuf *top = NULL; 4489 int 
mcflags = MSLEEPF(wait); 4490 int copyhdr = 0; 4491 int type = 0; 4492 mcache_obj_t *list = NULL; 4493 int needed = 0; 4494 4495 if (off == 0 && (m->m_flags & M_PKTHDR)) 4496 copyhdr = 1; 4497 4498 if (*m_lastm != NULL) { 4499 m = *m_lastm; 4500 off = *m_off; 4501 } else { 4502 while (off >= m->m_len) { 4503 off -= m->m_len; 4504 m = m->m_next; 4505 } 4506 } 4507 4508 n = m; 4509 while (len > 0) { 4510 needed++; 4511 ASSERT(n != NULL); 4512 len -= MIN(len, (n->m_len - ((needed == 1) ? off : 0))); 4513 n = n->m_next; 4514 } 4515 needed++; 4516 len = len0; 4517 4518 /* 4519 * If the caller doesn't want to be put to sleep, mark it with 4520 * MCR_TRYHARD so that we may reclaim buffers from other places 4521 * before giving up. 4522 */ 4523 if (mcflags & MCR_NOSLEEP) 4524 mcflags |= MCR_TRYHARD; 4525 4526 if (mcache_alloc_ext(m_cache(MC_MBUF), &list, needed, 4527 mcflags) != needed) 4528 goto nospace; 4529 4530 needed = 0; 4531 while (len > 0) { 4532 n = (struct mbuf *)list; 4533 list = list->obj_next; 4534 ASSERT(n != NULL && m != NULL); 4535 4536 type = (top == NULL) ? MT_HEADER : m->m_type; 4537 MBUF_INIT(n, (top == NULL), type); 4538#if CONFIG_MACF_NET 4539 if (top == NULL && mac_mbuf_label_init(n, wait) != 0) { 4540 mtype_stat_inc(MT_HEADER); 4541 mtype_stat_dec(MT_FREE); 4542 m_free(n); 4543 goto nospace; 4544 } 4545#endif /* MAC_NET */ 4546 4547 if (top == NULL) { 4548 top = n; 4549 np = &top->m_next; 4550 continue; 4551 } else { 4552 needed++; 4553 *np = n; 4554 } 4555 4556 if (copyhdr) { 4557 M_COPY_PKTHDR(n, m); 4558 n->m_pkthdr.len = len; 4559 copyhdr = 0; 4560 } 4561 n->m_len = MIN(len, (m->m_len - off)); 4562 4563 if (m->m_flags & M_EXT) { 4564 n->m_ext = m->m_ext; 4565 m_incref(m); 4566 n->m_data = m->m_data + off; 4567 n->m_flags |= M_EXT; 4568 } else { 4569 bcopy(MTOD(m, caddr_t)+off, MTOD(n, caddr_t), 4570 (unsigned)n->m_len); 4571 } 4572 len -= n->m_len; 4573 4574 if (len == 0) { 4575 if ((off + n->m_len) == m->m_len) { 4576 *m_lastm = m->m_next; 4577 *m_off = 0; 4578 } else { 4579 *m_lastm = m; 4580 *m_off = off + n->m_len; 4581 } 4582 break; 4583 } 4584 off = 0; 4585 m = m->m_next; 4586 np = &n->m_next; 4587 } 4588 4589 mtype_stat_inc(MT_HEADER); 4590 mtype_stat_add(type, needed); 4591 mtype_stat_sub(MT_FREE, needed + 1); 4592 4593 ASSERT(list == NULL); 4594 return (top); 4595 4596nospace: 4597 if (list != NULL) 4598 mcache_free_ext(m_cache(MC_MBUF), list); 4599 if (top != NULL) 4600 m_freem(top); 4601 MCFail++; 4602 return (NULL); 4603} 4604 4605/* 4606 * Copy data from an mbuf chain starting "off" bytes from the beginning, 4607 * continuing for "len" bytes, into the indicated buffer. 4608 */ 4609void 4610m_copydata(struct mbuf *m, int off, int len, void *vp) 4611{ 4612 unsigned count; 4613 char *cp = vp; 4614 4615 if (off < 0 || len < 0) 4616 panic("m_copydata: invalid offset %d or len %d", off, len); 4617 4618 while (off > 0) { 4619 if (m == NULL) 4620 panic("m_copydata: invalid mbuf chain"); 4621 if (off < m->m_len) 4622 break; 4623 off -= m->m_len; 4624 m = m->m_next; 4625 } 4626 while (len > 0) { 4627 if (m == NULL) 4628 panic("m_copydata: invalid mbuf chain"); 4629 count = MIN(m->m_len - off, len); 4630 bcopy(MTOD(m, caddr_t) + off, cp, count); 4631 len -= count; 4632 cp += count; 4633 off = 0; 4634 m = m->m_next; 4635 } 4636} 4637 4638/* 4639 * Concatenate mbuf chain n to m. Both chains must be of the same type 4640 * (e.g. MT_DATA). Any m_pkthdr is not updated. 
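 *
 * A minimal illustrative sketch (m1 and m2 are hypothetical chains of
 * the same type); because the packet header is not updated, a caller
 * holding an M_PKTHDR chain is expected to fix up the length afterwards:
 *
 *	m_cat(m1, m2);
 *	if (m1->m_flags & M_PKTHDR)
 *		m1->m_pkthdr.len = m_length2(m1, NULL);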
4641 */ 4642void 4643m_cat(struct mbuf *m, struct mbuf *n) 4644{ 4645 while (m->m_next) 4646 m = m->m_next; 4647 while (n) { 4648 if ((m->m_flags & M_EXT) || 4649 m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { 4650 /* just join the two chains */ 4651 m->m_next = n; 4652 return; 4653 } 4654 /* splat the data from one into the other */ 4655 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len, 4656 (u_int)n->m_len); 4657 m->m_len += n->m_len; 4658 n = m_free(n); 4659 } 4660} 4661 4662void 4663m_adj(struct mbuf *mp, int req_len) 4664{ 4665 int len = req_len; 4666 struct mbuf *m; 4667 int count; 4668 4669 if ((m = mp) == NULL) 4670 return; 4671 if (len >= 0) { 4672 /* 4673 * Trim from head. 4674 */ 4675 while (m != NULL && len > 0) { 4676 if (m->m_len <= len) { 4677 len -= m->m_len; 4678 m->m_len = 0; 4679 m = m->m_next; 4680 } else { 4681 m->m_len -= len; 4682 m->m_data += len; 4683 len = 0; 4684 } 4685 } 4686 m = mp; 4687 if (m->m_flags & M_PKTHDR) 4688 m->m_pkthdr.len -= (req_len - len); 4689 } else { 4690 /* 4691 * Trim from tail. Scan the mbuf chain, 4692 * calculating its length and finding the last mbuf. 4693 * If the adjustment only affects this mbuf, then just 4694 * adjust and return. Otherwise, rescan and truncate 4695 * after the remaining size. 4696 */ 4697 len = -len; 4698 count = 0; 4699 for (;;) { 4700 count += m->m_len; 4701 if (m->m_next == (struct mbuf *)0) 4702 break; 4703 m = m->m_next; 4704 } 4705 if (m->m_len >= len) { 4706 m->m_len -= len; 4707 m = mp; 4708 if (m->m_flags & M_PKTHDR) 4709 m->m_pkthdr.len -= len; 4710 return; 4711 } 4712 count -= len; 4713 if (count < 0) 4714 count = 0; 4715 /* 4716 * Correct length for chain is "count". 4717 * Find the mbuf with last data, adjust its length, 4718 * and toss data from remaining mbufs on chain. 4719 */ 4720 m = mp; 4721 if (m->m_flags & M_PKTHDR) 4722 m->m_pkthdr.len = count; 4723 for (; m; m = m->m_next) { 4724 if (m->m_len >= count) { 4725 m->m_len = count; 4726 break; 4727 } 4728 count -= m->m_len; 4729 } 4730 while ((m = m->m_next)) 4731 m->m_len = 0; 4732 } 4733} 4734 4735/* 4736 * Rearange an mbuf chain so that len bytes are contiguous 4737 * and in the data area of an mbuf (so that mtod and dtom 4738 * will work for a structure of size len). Returns the resulting 4739 * mbuf chain on success, frees it and returns null on failure. 4740 * If there is room, it will add up to max_protohdr-len extra bytes to the 4741 * contiguous region in an attempt to avoid being called next time. 4742 */ 4743int MPFail; 4744 4745struct mbuf * 4746m_pullup(struct mbuf *n, int len) 4747{ 4748 struct mbuf *m; 4749 int count; 4750 int space; 4751 4752 /* 4753 * If first mbuf has no cluster, and has room for len bytes 4754 * without shifting current data, pullup into it, 4755 * otherwise allocate a new mbuf to prepend to the chain. 
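 *
 * Callers typically rely on the chain being freed here on failure; a
 * common, illustrative idiom (hdrlen is a hypothetical header length):
 *
 *	if ((m = m_pullup(m, hdrlen)) == NULL)
 *		return;
 *
 * after which the first hdrlen bytes are contiguous in the lead mbuf.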
4756 */ 4757 if ((n->m_flags & M_EXT) == 0 && 4758 n->m_data + len < &n->m_dat[MLEN] && n->m_next) { 4759 if (n->m_len >= len) 4760 return (n); 4761 m = n; 4762 n = n->m_next; 4763 len -= m->m_len; 4764 } else { 4765 if (len > MHLEN) 4766 goto bad; 4767 _MGET(m, M_DONTWAIT, n->m_type); 4768 if (m == 0) 4769 goto bad; 4770 m->m_len = 0; 4771 if (n->m_flags & M_PKTHDR) { 4772 M_COPY_PKTHDR(m, n); 4773 n->m_flags &= ~M_PKTHDR; 4774 } 4775 } 4776 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 4777 do { 4778 count = MIN(MIN(MAX(len, max_protohdr), space), n->m_len); 4779 bcopy(MTOD(n, caddr_t), MTOD(m, caddr_t) + m->m_len, 4780 (unsigned)count); 4781 len -= count; 4782 m->m_len += count; 4783 n->m_len -= count; 4784 space -= count; 4785 if (n->m_len) 4786 n->m_data += count; 4787 else 4788 n = m_free(n); 4789 } while (len > 0 && n); 4790 if (len > 0) { 4791 (void) m_free(m); 4792 goto bad; 4793 } 4794 m->m_next = n; 4795 return (m); 4796bad: 4797 m_freem(n); 4798 MPFail++; 4799 return (0); 4800} 4801 4802/* 4803 * Like m_pullup(), except a new mbuf is always allocated, and we allow 4804 * the amount of empty space before the data in the new mbuf to be specified 4805 * (in the event that the caller expects to prepend later). 4806 */ 4807__private_extern__ int MSFail = 0; 4808 4809__private_extern__ struct mbuf * 4810m_copyup(struct mbuf *n, int len, int dstoff) 4811{ 4812 struct mbuf *m; 4813 int count, space; 4814 4815 if (len > (MHLEN - dstoff)) 4816 goto bad; 4817 MGET(m, M_DONTWAIT, n->m_type); 4818 if (m == NULL) 4819 goto bad; 4820 m->m_len = 0; 4821 if (n->m_flags & M_PKTHDR) { 4822 m_copy_pkthdr(m, n); 4823 n->m_flags &= ~M_PKTHDR; 4824 } 4825 m->m_data += dstoff; 4826 space = &m->m_dat[MLEN] - (m->m_data + m->m_len); 4827 do { 4828 count = min(min(max(len, max_protohdr), space), n->m_len); 4829 memcpy(mtod(m, caddr_t) + m->m_len, mtod(n, caddr_t), 4830 (unsigned)count); 4831 len -= count; 4832 m->m_len += count; 4833 n->m_len -= count; 4834 space -= count; 4835 if (n->m_len) 4836 n->m_data += count; 4837 else 4838 n = m_free(n); 4839 } while (len > 0 && n); 4840 if (len > 0) { 4841 (void) m_free(m); 4842 goto bad; 4843 } 4844 m->m_next = n; 4845 return (m); 4846bad: 4847 m_freem(n); 4848 MSFail++; 4849 return (NULL); 4850} 4851 4852/* 4853 * Partition an mbuf chain in two pieces, returning the tail -- 4854 * all but the first len0 bytes. In case of failure, it returns NULL and 4855 * attempts to restore the chain to its original state. 
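 *
 * An illustrative sketch (hypothetical caller splitting off everything
 * past an 8-byte header):
 *
 *	struct mbuf *tail = m_split(m, 8, M_DONTWAIT);
 *
 * On success m is truncated to the first 8 bytes and tail carries the
 * rest (including an adjusted packet header if m had one); on failure
 * tail is NULL and, per the comment above, m is left as it was.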
4856 */ 4857struct mbuf * 4858m_split(struct mbuf *m0, int len0, int wait) 4859{ 4860 return (m_split0(m0, len0, wait, 1)); 4861} 4862 4863static struct mbuf * 4864m_split0(struct mbuf *m0, int len0, int wait, int copyhdr) 4865{ 4866 struct mbuf *m, *n; 4867 unsigned len = len0, remain; 4868 4869 for (m = m0; m && len > m->m_len; m = m->m_next) 4870 len -= m->m_len; 4871 if (m == NULL) 4872 return (NULL); 4873 remain = m->m_len - len; 4874 if (copyhdr && (m0->m_flags & M_PKTHDR)) { 4875 _MGETHDR(n, wait, m0->m_type); 4876 if (n == NULL) 4877 return (NULL); 4878 n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; 4879 n->m_pkthdr.len = m0->m_pkthdr.len - len0; 4880 m0->m_pkthdr.len = len0; 4881 if (m->m_flags & M_EXT) 4882 goto extpacket; 4883 if (remain > MHLEN) { 4884 /* m can't be the lead packet */ 4885 MH_ALIGN(n, 0); 4886 n->m_next = m_split(m, len, wait); 4887 if (n->m_next == NULL) { 4888 (void) m_free(n); 4889 return (NULL); 4890 } else 4891 return (n); 4892 } else 4893 MH_ALIGN(n, remain); 4894 } else if (remain == 0) { 4895 n = m->m_next; 4896 m->m_next = NULL; 4897 return (n); 4898 } else { 4899 _MGET(n, wait, m->m_type); 4900 if (n == NULL) 4901 return (NULL); 4902 M_ALIGN(n, remain); 4903 } 4904extpacket: 4905 if (m->m_flags & M_EXT) { 4906 n->m_flags |= M_EXT; 4907 n->m_ext = m->m_ext; 4908 m_incref(m); 4909 n->m_data = m->m_data + len; 4910 } else { 4911 bcopy(MTOD(m, caddr_t) + len, MTOD(n, caddr_t), remain); 4912 } 4913 n->m_len = remain; 4914 m->m_len = len; 4915 n->m_next = m->m_next; 4916 m->m_next = NULL; 4917 return (n); 4918} 4919 4920/* 4921 * Routine to copy from device local memory into mbufs. 4922 */ 4923struct mbuf * 4924m_devget(char *buf, int totlen, int off0, struct ifnet *ifp, 4925 void (*copy)(const void *, void *, size_t)) 4926{ 4927 struct mbuf *m; 4928 struct mbuf *top = NULL, **mp = ⊤ 4929 int off = off0, len; 4930 char *cp; 4931 char *epkt; 4932 4933 cp = buf; 4934 epkt = cp + totlen; 4935 if (off) { 4936 /* 4937 * If 'off' is non-zero, packet is trailer-encapsulated, 4938 * so we have to skip the type and length fields. 4939 */ 4940 cp += off + 2 * sizeof (u_int16_t); 4941 totlen -= 2 * sizeof (u_int16_t); 4942 } 4943 _MGETHDR(m, M_DONTWAIT, MT_DATA); 4944 if (m == NULL) 4945 return (NULL); 4946 m->m_pkthdr.rcvif = ifp; 4947 m->m_pkthdr.len = totlen; 4948 m->m_len = MHLEN; 4949 4950 while (totlen > 0) { 4951 if (top != NULL) { 4952 _MGET(m, M_DONTWAIT, MT_DATA); 4953 if (m == NULL) { 4954 m_freem(top); 4955 return (NULL); 4956 } 4957 m->m_len = MLEN; 4958 } 4959 len = MIN(totlen, epkt - cp); 4960 if (len >= MINCLSIZE) { 4961 MCLGET(m, M_DONTWAIT); 4962 if (m->m_flags & M_EXT) { 4963 m->m_len = len = MIN(len, m_maxsize(MC_CL)); 4964 } else { 4965 /* give up when it's out of cluster mbufs */ 4966 if (top != NULL) 4967 m_freem(top); 4968 m_freem(m); 4969 return (NULL); 4970 } 4971 } else { 4972 /* 4973 * Place initial small packet/header at end of mbuf. 4974 */ 4975 if (len < m->m_len) { 4976 if (top == NULL && 4977 len + max_linkhdr <= m->m_len) 4978 m->m_data += max_linkhdr; 4979 m->m_len = len; 4980 } else { 4981 len = m->m_len; 4982 } 4983 } 4984 if (copy) 4985 copy(cp, MTOD(m, caddr_t), (unsigned)len); 4986 else 4987 bcopy(cp, MTOD(m, caddr_t), (unsigned)len); 4988 cp += len; 4989 *mp = m; 4990 mp = &m->m_next; 4991 totlen -= len; 4992 if (cp == epkt) 4993 cp = buf; 4994 } 4995 return (top); 4996} 4997 4998#ifndef MBUF_GROWTH_NORMAL_THRESH 4999#define MBUF_GROWTH_NORMAL_THRESH 25 5000#endif 5001 5002/* 5003 * Cluster freelist allocation check. 
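 *
 * Given the number of clusters wanted (num) and the cluster size in
 * question (a big or 16K cluster, per the VERIFY below), the return
 * value is the number of additional clusters the pool should grow by;
 * a return value of 0 means no growth is needed.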
5004 */ 5005static int 5006m_howmany(int num, size_t bufsize) 5007{ 5008 int i = 0, j = 0; 5009 u_int32_t m_mbclusters, m_clusters, m_bigclusters, m_16kclusters; 5010 u_int32_t m_mbfree, m_clfree, m_bigclfree, m_16kclfree; 5011 u_int32_t sumclusters, freeclusters; 5012 u_int32_t percent_pool, percent_kmem; 5013 u_int32_t mb_growth, mb_growth_thresh; 5014 5015 VERIFY(bufsize == m_maxsize(MC_BIGCL) || 5016 bufsize == m_maxsize(MC_16KCL)); 5017 5018 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 5019 5020 /* Numbers in 2K cluster units */ 5021 m_mbclusters = m_total(MC_MBUF) >> NMBPCLSHIFT; 5022 m_clusters = m_total(MC_CL); 5023 m_bigclusters = m_total(MC_BIGCL) << NCLPBGSHIFT; 5024 m_16kclusters = m_total(MC_16KCL); 5025 sumclusters = m_mbclusters + m_clusters + m_bigclusters; 5026 5027 m_mbfree = m_infree(MC_MBUF) >> NMBPCLSHIFT; 5028 m_clfree = m_infree(MC_CL); 5029 m_bigclfree = m_infree(MC_BIGCL) << NCLPBGSHIFT; 5030 m_16kclfree = m_infree(MC_16KCL); 5031 freeclusters = m_mbfree + m_clfree + m_bigclfree; 5032 5033 /* Bail if we've maxed out the mbuf memory map */ 5034 if ((bufsize == m_maxsize(MC_BIGCL) && sumclusters >= nclusters) || 5035 (njcl > 0 && bufsize == m_maxsize(MC_16KCL) && 5036 (m_16kclusters << NCLPJCLSHIFT) >= njcl)) { 5037 return (0); 5038 } 5039 5040 if (bufsize == m_maxsize(MC_BIGCL)) { 5041 /* Under minimum */ 5042 if (m_bigclusters < m_minlimit(MC_BIGCL)) 5043 return (m_minlimit(MC_BIGCL) - m_bigclusters); 5044 5045 percent_pool = 5046 ((sumclusters - freeclusters) * 100) / sumclusters; 5047 percent_kmem = (sumclusters * 100) / nclusters; 5048 5049 /* 5050 * If a light/normal user, grow conservatively (75%) 5051 * If a heavy user, grow aggressively (50%) 5052 */ 5053 if (percent_kmem < MBUF_GROWTH_NORMAL_THRESH) 5054 mb_growth = MB_GROWTH_NORMAL; 5055 else 5056 mb_growth = MB_GROWTH_AGGRESSIVE; 5057 5058 if (percent_kmem < 5) { 5059 /* For initial allocations */ 5060 i = num; 5061 } else { 5062 /* Return if >= MBIGCL_LOWAT clusters available */ 5063 if (m_infree(MC_BIGCL) >= MBIGCL_LOWAT && 5064 m_total(MC_BIGCL) >= 5065 MBIGCL_LOWAT + m_minlimit(MC_BIGCL)) 5066 return (0); 5067 5068 /* Ensure at least num clusters are accessible */ 5069 if (num >= m_infree(MC_BIGCL)) 5070 i = num - m_infree(MC_BIGCL); 5071 if (num > m_total(MC_BIGCL) - m_minlimit(MC_BIGCL)) 5072 j = num - (m_total(MC_BIGCL) - 5073 m_minlimit(MC_BIGCL)); 5074 5075 i = MAX(i, j); 5076 5077 /* 5078 * Grow pool if percent_pool > 75 (normal growth) 5079 * or percent_pool > 50 (aggressive growth). 
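 *
 * The threshold below follows directly from the growth shift: a shift
 * of 2 gives 100 - (100 / (1 << 2)) == 75, and a shift of 1 gives
 * 100 - (100 / (1 << 1)) == 50, matching the percentages above.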
5080 */ 5081 mb_growth_thresh = 100 - (100 / (1 << mb_growth)); 5082 if (percent_pool > mb_growth_thresh) 5083 j = ((sumclusters + num) >> mb_growth) - 5084 freeclusters; 5085 i = MAX(i, j); 5086 } 5087 5088 /* Check to ensure we didn't go over limits */ 5089 if (i + m_bigclusters >= m_maxlimit(MC_BIGCL)) 5090 i = m_maxlimit(MC_BIGCL) - m_bigclusters; 5091 if ((i << 1) + sumclusters >= nclusters) 5092 i = (nclusters - sumclusters) >> 1; 5093 VERIFY((m_total(MC_BIGCL) + i) <= m_maxlimit(MC_BIGCL)); 5094 VERIFY(sumclusters + (i << 1) <= nclusters); 5095 5096 } else { /* 16K CL */ 5097 VERIFY(njcl > 0); 5098 /* Under minimum */ 5099 if (m_16kclusters < MIN16KCL) 5100 return (MIN16KCL - m_16kclusters); 5101 if (m_16kclfree >= M16KCL_LOWAT) 5102 return (0); 5103 5104 /* Ensure at least num clusters are available */ 5105 if (num >= m_16kclfree) 5106 i = num - m_16kclfree; 5107 5108 /* Always grow 16KCL pool aggressively */ 5109 if (((m_16kclusters + num) >> 1) > m_16kclfree) 5110 j = ((m_16kclusters + num) >> 1) - m_16kclfree; 5111 i = MAX(i, j); 5112 5113 /* Check to ensure we don't go over limit */ 5114 if (i + m_16kclusters >= m_maxlimit(MC_16KCL)) 5115 i = m_maxlimit(MC_16KCL) - m_16kclusters; 5116 VERIFY((m_total(MC_16KCL) + i) <= m_maxlimit(MC_16KCL)); 5117 } 5118 return (i); 5119} 5120/* 5121 * Return the number of bytes in the mbuf chain, m. 5122 */ 5123unsigned int 5124m_length(struct mbuf *m) 5125{ 5126 struct mbuf *m0; 5127 unsigned int pktlen; 5128 5129 if (m->m_flags & M_PKTHDR) 5130 return (m->m_pkthdr.len); 5131 5132 pktlen = 0; 5133 for (m0 = m; m0 != NULL; m0 = m0->m_next) 5134 pktlen += m0->m_len; 5135 return (pktlen); 5136} 5137 5138/* 5139 * Copy data from a buffer back into the indicated mbuf chain, 5140 * starting "off" bytes from the beginning, extending the mbuf 5141 * chain if necessary. 5142 */ 5143void 5144m_copyback(struct mbuf *m0, int off, int len, const void *cp) 5145{ 5146#if DEBUG 5147 struct mbuf *origm = m0; 5148 int error; 5149#endif /* DEBUG */ 5150 5151 if (m0 == NULL) 5152 return; 5153 5154#if DEBUG 5155 error = 5156#endif /* DEBUG */ 5157 m_copyback0(&m0, off, len, cp, 5158 M_COPYBACK0_COPYBACK | M_COPYBACK0_EXTEND, M_DONTWAIT); 5159 5160#if DEBUG 5161 if (error != 0 || (m0 != NULL && origm != m0)) 5162 panic("m_copyback"); 5163#endif /* DEBUG */ 5164} 5165 5166struct mbuf * 5167m_copyback_cow(struct mbuf *m0, int off, int len, const void *cp, int how) 5168{ 5169 int error; 5170 5171 /* don't support chain expansion */ 5172 VERIFY(off + len <= m_length(m0)); 5173 5174 error = m_copyback0(&m0, off, len, cp, 5175 M_COPYBACK0_COPYBACK | M_COPYBACK0_COW, how); 5176 if (error) { 5177 /* 5178 * no way to recover from partial success. 5179 * just free the chain. 5180 */ 5181 m_freem(m0); 5182 return (NULL); 5183 } 5184 return (m0); 5185} 5186 5187/* 5188 * m_makewritable: ensure the specified range writable. 
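 *
 * An illustrative sketch (hypothetical caller making the first 20 bytes
 * of a chain safe to modify in place before rewriting a header):
 *
 *	int error;
 *
 *	error = m_makewritable(&m, 0, 20, M_DONTWAIT);
 *
 * A nonzero return (ENOBUFS) means the range could not be made
 * writable; on success the head of the chain may have been replaced,
 * which is why a pointer to the caller's mbuf pointer is passed in.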
5189 */ 5190int 5191m_makewritable(struct mbuf **mp, int off, int len, int how) 5192{ 5193 int error; 5194#if DEBUG 5195 struct mbuf *n; 5196 int origlen, reslen; 5197 5198 origlen = m_length(*mp); 5199#endif /* DEBUG */ 5200 5201#if 0 /* M_COPYALL is large enough */ 5202 if (len == M_COPYALL) 5203 len = m_length(*mp) - off; /* XXX */ 5204#endif 5205 5206 error = m_copyback0(mp, off, len, NULL, 5207 M_COPYBACK0_PRESERVE | M_COPYBACK0_COW, how); 5208 5209#if DEBUG 5210 reslen = 0; 5211 for (n = *mp; n; n = n->m_next) 5212 reslen += n->m_len; 5213 if (origlen != reslen) 5214 panic("m_makewritable: length changed"); 5215 if (((*mp)->m_flags & M_PKTHDR) && reslen != (*mp)->m_pkthdr.len) 5216 panic("m_makewritable: inconsist"); 5217#endif /* DEBUG */ 5218 5219 return (error); 5220} 5221 5222static int 5223m_copyback0(struct mbuf **mp0, int off, int len, const void *vp, int flags, 5224 int how) 5225{ 5226 int mlen; 5227 struct mbuf *m, *n; 5228 struct mbuf **mp; 5229 int totlen = 0; 5230 const char *cp = vp; 5231 5232 VERIFY(mp0 != NULL); 5233 VERIFY(*mp0 != NULL); 5234 VERIFY((flags & M_COPYBACK0_PRESERVE) == 0 || cp == NULL); 5235 VERIFY((flags & M_COPYBACK0_COPYBACK) == 0 || cp != NULL); 5236 5237 /* 5238 * we don't bother to update "totlen" in the case of M_COPYBACK0_COW, 5239 * assuming that M_COPYBACK0_EXTEND and M_COPYBACK0_COW are exclusive. 5240 */ 5241 5242 VERIFY((~flags & (M_COPYBACK0_EXTEND|M_COPYBACK0_COW)) != 0); 5243 5244 mp = mp0; 5245 m = *mp; 5246 while (off > (mlen = m->m_len)) { 5247 off -= mlen; 5248 totlen += mlen; 5249 if (m->m_next == NULL) { 5250 int tspace; 5251extend: 5252 if (!(flags & M_COPYBACK0_EXTEND)) 5253 goto out; 5254 5255 /* 5256 * try to make some space at the end of "m". 5257 */ 5258 5259 mlen = m->m_len; 5260 if (off + len >= MINCLSIZE && 5261 !(m->m_flags & M_EXT) && m->m_len == 0) { 5262 MCLGET(m, how); 5263 } 5264 tspace = M_TRAILINGSPACE(m); 5265 if (tspace > 0) { 5266 tspace = MIN(tspace, off + len); 5267 VERIFY(tspace > 0); 5268 bzero(mtod(m, char *) + m->m_len, 5269 MIN(off, tspace)); 5270 m->m_len += tspace; 5271 off += mlen; 5272 totlen -= mlen; 5273 continue; 5274 } 5275 5276 /* 5277 * need to allocate an mbuf. 5278 */ 5279 5280 if (off + len >= MINCLSIZE) { 5281 n = m_getcl(how, m->m_type, 0); 5282 } else { 5283 n = _M_GET(how, m->m_type); 5284 } 5285 if (n == NULL) { 5286 goto out; 5287 } 5288 n->m_len = 0; 5289 n->m_len = MIN(M_TRAILINGSPACE(n), off + len); 5290 bzero(mtod(n, char *), MIN(n->m_len, off)); 5291 m->m_next = n; 5292 } 5293 mp = &m->m_next; 5294 m = m->m_next; 5295 } 5296 while (len > 0) { 5297 mlen = m->m_len - off; 5298 if (mlen != 0 && m_mclhasreference(m)) { 5299 char *datap; 5300 int eatlen; 5301 5302 /* 5303 * this mbuf is read-only. 5304 * allocate a new writable mbuf and try again. 5305 */ 5306 5307#if defined(DIAGNOSTIC) 5308 if (!(flags & M_COPYBACK0_COW)) 5309 panic("m_copyback0: read-only"); 5310#endif /* defined(DIAGNOSTIC) */ 5311 5312 /* 5313 * if we're going to write into the middle of 5314 * a mbuf, split it first. 5315 */ 5316 if (off > 0 && len < mlen) { 5317 n = m_split0(m, off, how, 0); 5318 if (n == NULL) 5319 goto enobufs; 5320 m->m_next = n; 5321 mp = &m->m_next; 5322 m = n; 5323 off = 0; 5324 continue; 5325 } 5326 5327 /* 5328 * XXX TODO coalesce into the trailingspace of 5329 * the previous mbuf when possible. 5330 */ 5331 5332 /* 5333 * allocate a new mbuf. copy packet header if needed. 
5334 */ 5335 n = _M_GET(how, m->m_type); 5336 if (n == NULL) 5337 goto enobufs; 5338 if (off == 0 && (m->m_flags & M_PKTHDR)) { 5339 M_COPY_PKTHDR(n, m); 5340 n->m_len = MHLEN; 5341 } else { 5342 if (len >= MINCLSIZE) 5343 MCLGET(n, M_DONTWAIT); 5344 n->m_len = 5345 (n->m_flags & M_EXT) ? MCLBYTES : MLEN; 5346 } 5347 if (n->m_len > len) 5348 n->m_len = len; 5349 5350 /* 5351 * free the region which has been overwritten. 5352 * copying data from old mbufs if requested. 5353 */ 5354 if (flags & M_COPYBACK0_PRESERVE) 5355 datap = mtod(n, char *); 5356 else 5357 datap = NULL; 5358 eatlen = n->m_len; 5359 VERIFY(off == 0 || eatlen >= mlen); 5360 if (off > 0) { 5361 VERIFY(len >= mlen); 5362 m->m_len = off; 5363 m->m_next = n; 5364 if (datap) { 5365 m_copydata(m, off, mlen, datap); 5366 datap += mlen; 5367 } 5368 eatlen -= mlen; 5369 mp = &m->m_next; 5370 m = m->m_next; 5371 } 5372 while (m != NULL && m_mclhasreference(m) && 5373 n->m_type == m->m_type && eatlen > 0) { 5374 mlen = MIN(eatlen, m->m_len); 5375 if (datap) { 5376 m_copydata(m, 0, mlen, datap); 5377 datap += mlen; 5378 } 5379 m->m_data += mlen; 5380 m->m_len -= mlen; 5381 eatlen -= mlen; 5382 if (m->m_len == 0) 5383 *mp = m = m_free(m); 5384 } 5385 if (eatlen > 0) 5386 n->m_len -= eatlen; 5387 n->m_next = m; 5388 *mp = m = n; 5389 continue; 5390 } 5391 mlen = MIN(mlen, len); 5392 if (flags & M_COPYBACK0_COPYBACK) { 5393 bcopy(cp, mtod(m, caddr_t) + off, (unsigned)mlen); 5394 cp += mlen; 5395 } 5396 len -= mlen; 5397 mlen += off; 5398 off = 0; 5399 totlen += mlen; 5400 if (len == 0) 5401 break; 5402 if (m->m_next == NULL) { 5403 goto extend; 5404 } 5405 mp = &m->m_next; 5406 m = m->m_next; 5407 } 5408out: 5409 if (((m = *mp0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) { 5410 VERIFY(flags & M_COPYBACK0_EXTEND); 5411 m->m_pkthdr.len = totlen; 5412 } 5413 5414 return (0); 5415 5416enobufs: 5417 return (ENOBUFS); 5418} 5419 5420char * 5421mcl_to_paddr(char *addr) 5422{ 5423 vm_offset_t base_phys; 5424 5425 if (!MBUF_IN_MAP(addr)) 5426 return (NULL); 5427 base_phys = mcl_paddr[(addr - (char *)mbutl) >> PGSHIFT]; 5428 5429 if (base_phys == 0) 5430 return (NULL); 5431 return ((char *)((uintptr_t)base_phys | ((uintptr_t)addr & PGOFSET))); 5432} 5433 5434/* 5435 * Dup the mbuf chain passed in. The whole thing. No cute additional cruft. 5436 * And really copy the thing. That way, we don't "precompute" checksums 5437 * for unsuspecting consumers. Assumption: m->m_nextpkt == 0. Trick: for 5438 * small packets, don't dup into a cluster. That way received packets 5439 * don't take up too much room in the sockbuf (cf. sbspace()). 5440 */ 5441int MDFail; 5442 5443struct mbuf * 5444m_dup(struct mbuf *m, int how) 5445{ 5446 struct mbuf *n, **np; 5447 struct mbuf *top; 5448 int copyhdr = 0; 5449 5450 np = ⊤ 5451 top = NULL; 5452 if (m->m_flags & M_PKTHDR) 5453 copyhdr = 1; 5454 5455 /* 5456 * Quick check: if we have one mbuf and its data fits in an 5457 * mbuf with packet header, just copy and go. 5458 */ 5459 if (m->m_next == NULL) { 5460 /* Then just move the data into an mbuf and be done... 
*/ 5461 if (copyhdr) { 5462 if (m->m_pkthdr.len <= MHLEN && m->m_len <= MHLEN) { 5463 if ((n = _M_GETHDR(how, m->m_type)) == NULL) 5464 return (NULL); 5465 n->m_len = m->m_len; 5466 m_dup_pkthdr(n, m, how); 5467 bcopy(m->m_data, n->m_data, m->m_len); 5468 return (n); 5469 } 5470 } else if (m->m_len <= MLEN) { 5471 if ((n = _M_GET(how, m->m_type)) == NULL) 5472 return (NULL); 5473 bcopy(m->m_data, n->m_data, m->m_len); 5474 n->m_len = m->m_len; 5475 return (n); 5476 } 5477 } 5478 while (m != NULL) { 5479#if BLUE_DEBUG 5480 kprintf("<%x: %x, %x, %x\n", m, m->m_flags, m->m_len, 5481 m->m_data); 5482#endif 5483 if (copyhdr) 5484 n = _M_GETHDR(how, m->m_type); 5485 else 5486 n = _M_GET(how, m->m_type); 5487 if (n == NULL) 5488 goto nospace; 5489 if (m->m_flags & M_EXT) { 5490 if (m->m_len <= m_maxsize(MC_CL)) 5491 MCLGET(n, how); 5492 else if (m->m_len <= m_maxsize(MC_BIGCL)) 5493 n = m_mbigget(n, how); 5494 else if (m->m_len <= m_maxsize(MC_16KCL) && njcl > 0) 5495 n = m_m16kget(n, how); 5496 if (!(n->m_flags & M_EXT)) { 5497 (void) m_free(n); 5498 goto nospace; 5499 } 5500 } 5501 *np = n; 5502 if (copyhdr) { 5503 /* Don't use M_COPY_PKTHDR: preserve m_data */ 5504 m_dup_pkthdr(n, m, how); 5505 copyhdr = 0; 5506 if (!(n->m_flags & M_EXT)) 5507 n->m_data = n->m_pktdat; 5508 } 5509 n->m_len = m->m_len; 5510 /* 5511 * Get the dup on the same bdry as the original 5512 * Assume that the two mbufs have the same offset to data area 5513 * (up to word boundaries) 5514 */ 5515 bcopy(MTOD(m, caddr_t), MTOD(n, caddr_t), (unsigned)n->m_len); 5516 m = m->m_next; 5517 np = &n->m_next; 5518#if BLUE_DEBUG 5519 kprintf(">%x: %x, %x, %x\n", n, n->m_flags, n->m_len, 5520 n->m_data); 5521#endif 5522 } 5523 5524 if (top == NULL) 5525 MDFail++; 5526 return (top); 5527 5528nospace: 5529 m_freem(top); 5530 MDFail++; 5531 return (NULL); 5532} 5533 5534#define MBUF_MULTIPAGES(m) \ 5535 (((m)->m_flags & M_EXT) && \ 5536 ((IS_P2ALIGNED((m)->m_data, NBPG) && (m)->m_len > NBPG) || \ 5537 (!IS_P2ALIGNED((m)->m_data, NBPG) && \ 5538 P2ROUNDUP((m)->m_data, NBPG) < ((uintptr_t)(m)->m_data + (m)->m_len)))) 5539 5540static struct mbuf * 5541m_expand(struct mbuf *m, struct mbuf **last) 5542{ 5543 struct mbuf *top = NULL; 5544 struct mbuf **nm = ⊤ 5545 uintptr_t data0, data; 5546 unsigned int len0, len; 5547 5548 VERIFY(MBUF_MULTIPAGES(m)); 5549 VERIFY(m->m_next == NULL); 5550 data0 = (uintptr_t)m->m_data; 5551 len0 = m->m_len; 5552 *last = top; 5553 5554 for (;;) { 5555 struct mbuf *n; 5556 5557 data = data0; 5558 if (IS_P2ALIGNED(data, NBPG) && len0 > NBPG) 5559 len = NBPG; 5560 else if (!IS_P2ALIGNED(data, NBPG) && 5561 P2ROUNDUP(data, NBPG) < (data + len0)) 5562 len = P2ROUNDUP(data, NBPG) - data; 5563 else 5564 len = len0; 5565 5566 VERIFY(len > 0); 5567 VERIFY(m->m_flags & M_EXT); 5568 m->m_data = (void *)data; 5569 m->m_len = len; 5570 5571 *nm = *last = m; 5572 nm = &m->m_next; 5573 m->m_next = NULL; 5574 5575 data0 += len; 5576 len0 -= len; 5577 if (len0 == 0) 5578 break; 5579 5580 n = _M_RETRY(M_DONTWAIT, MT_DATA); 5581 if (n == NULL) { 5582 m_freem(top); 5583 top = *last = NULL; 5584 break; 5585 } 5586 5587 n->m_ext = m->m_ext; 5588 m_incref(m); 5589 n->m_flags |= M_EXT; 5590 m = n; 5591 } 5592 return (top); 5593} 5594 5595struct mbuf * 5596m_normalize(struct mbuf *m) 5597{ 5598 struct mbuf *top = NULL; 5599 struct mbuf **nm = ⊤ 5600 boolean_t expanded = FALSE; 5601 5602 while (m != NULL) { 5603 struct mbuf *n; 5604 5605 n = m->m_next; 5606 m->m_next = NULL; 5607 5608 /* Does the data cross one or more page boundaries? 
*/ 5609 if (MBUF_MULTIPAGES(m)) { 5610 struct mbuf *last; 5611 if ((m = m_expand(m, &last)) == NULL) { 5612 m_freem(n); 5613 m_freem(top); 5614 top = NULL; 5615 break; 5616 } 5617 *nm = m; 5618 nm = &last->m_next; 5619 expanded = TRUE; 5620 } else { 5621 *nm = m; 5622 nm = &m->m_next; 5623 } 5624 m = n; 5625 } 5626 if (expanded) 5627 atomic_add_32(&mb_normalized, 1); 5628 return (top); 5629} 5630 5631/* 5632 * Append the specified data to the indicated mbuf chain, 5633 * Extend the mbuf chain if the new data does not fit in 5634 * existing space. 5635 * 5636 * Return 1 if able to complete the job; otherwise 0. 5637 */ 5638int 5639m_append(struct mbuf *m0, int len, caddr_t cp) 5640{ 5641 struct mbuf *m, *n; 5642 int remainder, space; 5643 5644 for (m = m0; m->m_next != NULL; m = m->m_next) 5645 ; 5646 remainder = len; 5647 space = M_TRAILINGSPACE(m); 5648 if (space > 0) { 5649 /* 5650 * Copy into available space. 5651 */ 5652 if (space > remainder) 5653 space = remainder; 5654 bcopy(cp, mtod(m, caddr_t) + m->m_len, space); 5655 m->m_len += space; 5656 cp += space, remainder -= space; 5657 } 5658 while (remainder > 0) { 5659 /* 5660 * Allocate a new mbuf; could check space 5661 * and allocate a cluster instead. 5662 */ 5663 n = m_get(M_WAITOK, m->m_type); 5664 if (n == NULL) 5665 break; 5666 n->m_len = min(MLEN, remainder); 5667 bcopy(cp, mtod(n, caddr_t), n->m_len); 5668 cp += n->m_len; 5669 remainder -= n->m_len; 5670 m->m_next = n; 5671 m = n; 5672 } 5673 if (m0->m_flags & M_PKTHDR) 5674 m0->m_pkthdr.len += len - remainder; 5675 return (remainder == 0); 5676} 5677 5678struct mbuf * 5679m_last(struct mbuf *m) 5680{ 5681 while (m->m_next != NULL) 5682 m = m->m_next; 5683 return (m); 5684} 5685 5686unsigned int 5687m_fixhdr(struct mbuf *m0) 5688{ 5689 u_int len; 5690 5691 len = m_length2(m0, NULL); 5692 m0->m_pkthdr.len = len; 5693 return (len); 5694} 5695 5696unsigned int 5697m_length2(struct mbuf *m0, struct mbuf **last) 5698{ 5699 struct mbuf *m; 5700 u_int len; 5701 5702 len = 0; 5703 for (m = m0; m != NULL; m = m->m_next) { 5704 len += m->m_len; 5705 if (m->m_next == NULL) 5706 break; 5707 } 5708 if (last != NULL) 5709 *last = m; 5710 return (len); 5711} 5712 5713/* 5714 * Defragment a mbuf chain, returning the shortest possible chain of mbufs 5715 * and clusters. If allocation fails and this cannot be completed, NULL will 5716 * be returned, but the passed in chain will be unchanged. Upon success, 5717 * the original chain will be freed, and the new chain will be returned. 5718 * 5719 * If a non-packet header is passed in, the original mbuf (chain?) will 5720 * be returned unharmed. 5721 * 5722 * If offset is specfied, the first mbuf in the chain will have a leading 5723 * space of the amount stated by the "off" parameter. 5724 * 5725 * This routine requires that the m_pkthdr.header field of the original 5726 * mbuf chain is cleared by the caller. 
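 *
 * An illustrative sketch (hypothetical caller compacting a chain it
 * owns, remembering that the original is freed on success and left
 * untouched on failure):
 *
 *	struct mbuf *t;
 *
 *	m0->m_pkthdr.header = NULL;
 *	if ((t = m_defrag(m0, M_DONTWAIT)) != NULL)
 *		m0 = t;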
5727 */ 5728struct mbuf * 5729m_defrag_offset(struct mbuf *m0, u_int32_t off, int how) 5730{ 5731 struct mbuf *m_new = NULL, *m_final = NULL; 5732 int progress = 0, length, pktlen; 5733 5734 if (!(m0->m_flags & M_PKTHDR)) 5735 return (m0); 5736 5737 VERIFY(off < MHLEN); 5738 m_fixhdr(m0); /* Needed sanity check */ 5739 5740 pktlen = m0->m_pkthdr.len + off; 5741 if (pktlen > MHLEN) 5742 m_final = m_getcl(how, MT_DATA, M_PKTHDR); 5743 else 5744 m_final = m_gethdr(how, MT_DATA); 5745 5746 if (m_final == NULL) 5747 goto nospace; 5748 5749 if (off > 0) { 5750 pktlen -= off; 5751 m_final->m_len -= off; 5752 m_final->m_data += off; 5753 } 5754 5755 /* 5756 * Caller must have handled the contents pointed to by this 5757 * pointer before coming here, as otherwise it will point to 5758 * the original mbuf which will get freed upon success. 5759 */ 5760 VERIFY(m0->m_pkthdr.header == NULL); 5761 5762 if (m_dup_pkthdr(m_final, m0, how) == 0) 5763 goto nospace; 5764 5765 m_new = m_final; 5766 5767 while (progress < pktlen) { 5768 length = pktlen - progress; 5769 if (length > MCLBYTES) 5770 length = MCLBYTES; 5771 5772 if (m_new == NULL) { 5773 if (length > MLEN) 5774 m_new = m_getcl(how, MT_DATA, 0); 5775 else 5776 m_new = m_get(how, MT_DATA); 5777 if (m_new == NULL) 5778 goto nospace; 5779 } 5780 5781 m_copydata(m0, progress, length, mtod(m_new, caddr_t)); 5782 progress += length; 5783 m_new->m_len = length; 5784 if (m_new != m_final) 5785 m_cat(m_final, m_new); 5786 m_new = NULL; 5787 } 5788 m_freem(m0); 5789 m0 = m_final; 5790 return (m0); 5791nospace: 5792 if (m_final) 5793 m_freem(m_final); 5794 return (NULL); 5795} 5796 5797struct mbuf * 5798m_defrag(struct mbuf *m0, int how) 5799{ 5800 return (m_defrag_offset(m0, 0, how)); 5801} 5802 5803void 5804m_mchtype(struct mbuf *m, int t) 5805{ 5806 mtype_stat_inc(t); 5807 mtype_stat_dec(m->m_type); 5808 (m)->m_type = t; 5809} 5810 5811void * 5812m_mtod(struct mbuf *m) 5813{ 5814 return (MTOD(m, void *)); 5815} 5816 5817struct mbuf * 5818m_dtom(void *x) 5819{ 5820 return ((struct mbuf *)((uintptr_t)(x) & ~(MSIZE-1))); 5821} 5822 5823void 5824m_mcheck(struct mbuf *m) 5825{ 5826 _MCHECK(m); 5827} 5828 5829/* 5830 * Return a pointer to mbuf/offset of location in mbuf chain. 5831 */ 5832struct mbuf * 5833m_getptr(struct mbuf *m, int loc, int *off) 5834{ 5835 5836 while (loc >= 0) { 5837 /* Normal end of search. */ 5838 if (m->m_len > loc) { 5839 *off = loc; 5840 return (m); 5841 } else { 5842 loc -= m->m_len; 5843 if (m->m_next == NULL) { 5844 if (loc == 0) { 5845 /* Point at the end of valid data. */ 5846 *off = m->m_len; 5847 return (m); 5848 } 5849 return (NULL); 5850 } 5851 m = m->m_next; 5852 } 5853 } 5854 return (NULL); 5855} 5856 5857/* 5858 * Inform the corresponding mcache(s) that there's a waiter below. 5859 */ 5860static void 5861mbuf_waiter_inc(mbuf_class_t class, boolean_t comp) 5862{ 5863 mcache_waiter_inc(m_cache(class)); 5864 if (comp) { 5865 if (class == MC_CL) { 5866 mcache_waiter_inc(m_cache(MC_MBUF_CL)); 5867 } else if (class == MC_BIGCL) { 5868 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL)); 5869 } else if (class == MC_16KCL) { 5870 mcache_waiter_inc(m_cache(MC_MBUF_16KCL)); 5871 } else { 5872 mcache_waiter_inc(m_cache(MC_MBUF_CL)); 5873 mcache_waiter_inc(m_cache(MC_MBUF_BIGCL)); 5874 } 5875 } 5876} 5877 5878/* 5879 * Inform the corresponding mcache(s) that there's no more waiter below. 
5880 */ 5881static void 5882mbuf_waiter_dec(mbuf_class_t class, boolean_t comp) 5883{ 5884 mcache_waiter_dec(m_cache(class)); 5885 if (comp) { 5886 if (class == MC_CL) { 5887 mcache_waiter_dec(m_cache(MC_MBUF_CL)); 5888 } else if (class == MC_BIGCL) { 5889 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL)); 5890 } else if (class == MC_16KCL) { 5891 mcache_waiter_dec(m_cache(MC_MBUF_16KCL)); 5892 } else { 5893 mcache_waiter_dec(m_cache(MC_MBUF_CL)); 5894 mcache_waiter_dec(m_cache(MC_MBUF_BIGCL)); 5895 } 5896 } 5897} 5898 5899/* 5900 * Called during slab (blocking and non-blocking) allocation. If there 5901 * is at least one waiter, and the time since the first waiter is blocked 5902 * is greater than the watchdog timeout, panic the system. 5903 */ 5904static void 5905mbuf_watchdog(void) 5906{ 5907 struct timeval now; 5908 unsigned int since; 5909 5910 if (mb_waiters == 0 || !mb_watchdog) 5911 return; 5912 5913 microuptime(&now); 5914 since = now.tv_sec - mb_wdtstart.tv_sec; 5915 if (since >= MB_WDT_MAXTIME) { 5916 panic_plain("%s: %d waiters stuck for %u secs\n%s", __func__, 5917 mb_waiters, since, mbuf_dump()); 5918 /* NOTREACHED */ 5919 } 5920} 5921 5922/* 5923 * Called during blocking allocation. Returns TRUE if one or more objects 5924 * are available at the per-CPU caches layer and that allocation should be 5925 * retried at that level. 5926 */ 5927static boolean_t 5928mbuf_sleep(mbuf_class_t class, unsigned int num, int wait) 5929{ 5930 boolean_t mcache_retry = FALSE; 5931 5932 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 5933 5934 /* Check if there's anything at the cache layer */ 5935 if (mbuf_cached_above(class, wait)) { 5936 mcache_retry = TRUE; 5937 goto done; 5938 } 5939 5940 /* Nothing? Then try hard to get it from somewhere */ 5941 m_reclaim(class, num, (wait & MCR_COMP)); 5942 5943 /* We tried hard and got something? */ 5944 if (m_infree(class) > 0) { 5945 mbstat.m_wait++; 5946 goto done; 5947 } else if (mbuf_cached_above(class, wait)) { 5948 mbstat.m_wait++; 5949 mcache_retry = TRUE; 5950 goto done; 5951 } else if (wait & MCR_TRYHARD) { 5952 mcache_retry = TRUE; 5953 goto done; 5954 } 5955 5956 /* 5957 * There's really nothing for us right now; inform the 5958 * cache(s) that there is a waiter below and go to sleep. 5959 */ 5960 mbuf_waiter_inc(class, (wait & MCR_COMP)); 5961 5962 VERIFY(!(wait & MCR_NOSLEEP)); 5963 5964 /* 5965 * If this is the first waiter, arm the watchdog timer. Otherwise 5966 * check if we need to panic the system due to watchdog timeout. 
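	 * (mb_wdtstart thus records the time at which the first waiter
	 * began blocking; mbuf_watchdog() above panics once that first
	 * waiter has been stuck for MB_WDT_MAXTIME seconds or more.)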
5967 */ 5968 if (mb_waiters == 0) 5969 microuptime(&mb_wdtstart); 5970 else 5971 mbuf_watchdog(); 5972 5973 mb_waiters++; 5974 (void) msleep(mb_waitchan, mbuf_mlock, (PZERO-1), m_cname(class), NULL); 5975 5976 /* We are now up; stop getting notified until next round */ 5977 mbuf_waiter_dec(class, (wait & MCR_COMP)); 5978 5979 /* We waited and got something */ 5980 if (m_infree(class) > 0) { 5981 mbstat.m_wait++; 5982 goto done; 5983 } else if (mbuf_cached_above(class, wait)) { 5984 mbstat.m_wait++; 5985 mcache_retry = TRUE; 5986 } 5987done: 5988 return (mcache_retry); 5989} 5990 5991static void 5992mbuf_worker_thread(void) 5993{ 5994 int mbuf_expand; 5995 5996 while (1) { 5997 lck_mtx_lock(mbuf_mlock); 5998 5999 mbuf_expand = 0; 6000 if (mbuf_expand_mcl) { 6001 int n; 6002 6003 /* Adjust to current number of cluster in use */ 6004 n = mbuf_expand_mcl - 6005 (m_total(MC_CL) - m_infree(MC_CL)); 6006 if ((n + m_total(MC_CL)) > m_maxlimit(MC_CL)) 6007 n = m_maxlimit(MC_CL) - m_total(MC_CL); 6008 mbuf_expand_mcl = 0; 6009 6010 if (n > 0 && freelist_populate(MC_CL, n, M_WAIT) > 0) 6011 mbuf_expand++; 6012 } 6013 if (mbuf_expand_big) { 6014 int n; 6015 6016 /* Adjust to current number of 4 KB cluster in use */ 6017 n = mbuf_expand_big - 6018 (m_total(MC_BIGCL) - m_infree(MC_BIGCL)); 6019 if ((n + m_total(MC_BIGCL)) > m_maxlimit(MC_BIGCL)) 6020 n = m_maxlimit(MC_BIGCL) - m_total(MC_BIGCL); 6021 mbuf_expand_big = 0; 6022 6023 if (n > 0 && freelist_populate(MC_BIGCL, n, M_WAIT) > 0) 6024 mbuf_expand++; 6025 } 6026 if (mbuf_expand_16k) { 6027 int n; 6028 6029 /* Adjust to current number of 16 KB cluster in use */ 6030 n = mbuf_expand_16k - 6031 (m_total(MC_16KCL) - m_infree(MC_16KCL)); 6032 if ((n + m_total(MC_16KCL)) > m_maxlimit(MC_16KCL)) 6033 n = m_maxlimit(MC_16KCL) - m_total(MC_16KCL); 6034 mbuf_expand_16k = 0; 6035 6036 if (n > 0) 6037 (void) freelist_populate(MC_16KCL, n, M_WAIT); 6038 } 6039 6040 /* 6041 * Because we can run out of memory before filling the mbuf 6042 * map, we should not allocate more clusters than they are 6043 * mbufs -- otherwise we could have a large number of useless 6044 * clusters allocated. 6045 */ 6046 if (mbuf_expand) { 6047 while (m_total(MC_MBUF) < 6048 (m_total(MC_BIGCL) + m_total(MC_CL))) { 6049 if (freelist_populate(MC_MBUF, 1, M_WAIT) == 0) 6050 break; 6051 } 6052 } 6053 6054 lck_mtx_unlock(mbuf_mlock); 6055 6056 assert_wait(&mbuf_worker_run, THREAD_UNINT); 6057 (void) thread_block((thread_continue_t)mbuf_worker_thread); 6058 } 6059} 6060 6061static void 6062mbuf_worker_thread_init(void) 6063{ 6064 mbuf_worker_ready++; 6065 mbuf_worker_thread(); 6066} 6067 6068static mcl_slab_t * 6069slab_get(void *buf) 6070{ 6071 mcl_slabg_t *slg; 6072 unsigned int ix, k; 6073 6074 lck_mtx_assert(mbuf_mlock, LCK_MTX_ASSERT_OWNED); 6075 6076 VERIFY(MBUF_IN_MAP(buf)); 6077 ix = ((char *)buf - (char *)mbutl) >> MBSHIFT; 6078 VERIFY(ix < maxslabgrp); 6079 6080 if ((slg = slabstbl[ix]) == NULL) { 6081 /* 6082 * In the current implementation, we never shrink the memory 6083 * pool (hence the cluster map); if we attempt to reallocate 6084 * a cluster group when it's already allocated, panic since 6085 * this is a sign of a memory corruption (slabstbl[ix] got 6086 * nullified). This also means that there shouldn't be any 6087 * hole in the kernel sub-map for the mbuf pool. 
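	 * (Consequently a NULL slabstbl[ix] can only mean that this is
	 * the first time this cluster group is being set up; groups are
	 * created consecutively with no holes, which is why slabgrp is
	 * simply incremented below.)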
6088 */ 6089 ++slabgrp; 6090 VERIFY(ix < slabgrp); 6091 /* 6092 * Slabs expansion can only be done single threaded; when 6093 * we get here, it must be as a result of m_clalloc() which 6094 * is serialized and therefore mb_clalloc_busy must be set. 6095 */ 6096 VERIFY(mb_clalloc_busy); 6097 lck_mtx_unlock(mbuf_mlock); 6098 6099 /* This is a new buffer; create the slabs group for it */ 6100 MALLOC(slg, mcl_slabg_t *, sizeof (*slg), M_TEMP, 6101 M_WAITOK | M_ZERO); 6102 VERIFY(slg != NULL); 6103 6104 lck_mtx_lock(mbuf_mlock); 6105 /* 6106 * No other thread could have gone into m_clalloc() after 6107 * we dropped the lock above, so verify that it's true. 6108 */ 6109 VERIFY(mb_clalloc_busy); 6110 6111 slabstbl[ix] = slg; 6112 6113 /* Chain each slab in the group to its forward neighbor */ 6114 for (k = 1; k < NSLABSPMB; k++) 6115 slg->slg_slab[k - 1].sl_next = &slg->slg_slab[k]; 6116 VERIFY(slg->slg_slab[NSLABSPMB - 1].sl_next == NULL); 6117 6118 /* And chain the last slab in the previous group to this */ 6119 if (ix > 0) { 6120 VERIFY(slabstbl[ix - 1]-> 6121 slg_slab[NSLABSPMB - 1].sl_next == NULL); 6122 slabstbl[ix - 1]->slg_slab[NSLABSPMB - 1].sl_next = 6123 &slg->slg_slab[0]; 6124 } 6125 } 6126 6127 ix = MTOBG(buf) % NSLABSPMB; 6128 VERIFY(ix < NSLABSPMB); 6129 6130 return (&slg->slg_slab[ix]); 6131} 6132 6133static void 6134slab_init(mcl_slab_t *sp, mbuf_class_t class, u_int32_t flags, 6135 void *base, void *head, unsigned int len, int refcnt, int chunks) 6136{ 6137 sp->sl_class = class; 6138 sp->sl_flags = flags; 6139 sp->sl_base = base; 6140 sp->sl_head = head; 6141 sp->sl_len = len; 6142 sp->sl_refcnt = refcnt; 6143 sp->sl_chunks = chunks; 6144 slab_detach(sp); 6145} 6146 6147static void 6148slab_insert(mcl_slab_t *sp, mbuf_class_t class) 6149{ 6150 VERIFY(slab_is_detached(sp)); 6151 m_slab_cnt(class)++; 6152 TAILQ_INSERT_TAIL(&m_slablist(class), sp, sl_link); 6153 sp->sl_flags &= ~SLF_DETACHED; 6154 if (class == MC_16KCL) { 6155 int k; 6156 for (k = 1; k < NSLABSP16KB; k++) { 6157 sp = sp->sl_next; 6158 /* Next slab must already be present */ 6159 VERIFY(sp != NULL); 6160 VERIFY(slab_is_detached(sp)); 6161 sp->sl_flags &= ~SLF_DETACHED; 6162 } 6163 } 6164} 6165 6166static void 6167slab_remove(mcl_slab_t *sp, mbuf_class_t class) 6168{ 6169 VERIFY(!slab_is_detached(sp)); 6170 VERIFY(m_slab_cnt(class) > 0); 6171 m_slab_cnt(class)--; 6172 TAILQ_REMOVE(&m_slablist(class), sp, sl_link); 6173 slab_detach(sp); 6174 if (class == MC_16KCL) { 6175 int k; 6176 for (k = 1; k < NSLABSP16KB; k++) { 6177 sp = sp->sl_next; 6178 /* Next slab must already be present */ 6179 VERIFY(sp != NULL); 6180 VERIFY(!slab_is_detached(sp)); 6181 slab_detach(sp); 6182 } 6183 } 6184} 6185 6186static boolean_t 6187slab_inrange(mcl_slab_t *sp, void *buf) 6188{ 6189 return ((uintptr_t)buf >= (uintptr_t)sp->sl_base && 6190 (uintptr_t)buf < ((uintptr_t)sp->sl_base + sp->sl_len)); 6191} 6192 6193#undef panic 6194 6195static void 6196slab_nextptr_panic(mcl_slab_t *sp, void *addr) 6197{ 6198 int i; 6199 unsigned int chunk_len = sp->sl_len / sp->sl_chunks; 6200 uintptr_t buf = (uintptr_t)sp->sl_base; 6201 6202 for (i = 0; i < sp->sl_chunks; i++, buf += chunk_len) { 6203 void *next = ((mcache_obj_t *)buf)->obj_next; 6204 if (next != addr) 6205 continue; 6206 if (!mclverify) { 6207 if (next != NULL && !MBUF_IN_MAP(next)) { 6208 mcache_t *cp = m_cache(sp->sl_class); 6209 panic("%s: %s buffer %p in slab %p modified " 6210 "after free at offset 0: %p out of range " 6211 "[%p-%p)\n", __func__, cp->mc_name, 6212 (void *)buf, sp, 
next, mbutl, embutl); 6213 /* NOTREACHED */ 6214 } 6215 } else { 6216 mcache_audit_t *mca = mcl_audit_buf2mca(sp->sl_class, 6217 (mcache_obj_t *)buf); 6218 mcl_audit_verify_nextptr(next, mca); 6219 } 6220 } 6221} 6222 6223static void 6224slab_detach(mcl_slab_t *sp) 6225{ 6226 sp->sl_link.tqe_next = (mcl_slab_t *)-1; 6227 sp->sl_link.tqe_prev = (mcl_slab_t **)-1; 6228 sp->sl_flags |= SLF_DETACHED; 6229} 6230 6231static boolean_t 6232slab_is_detached(mcl_slab_t *sp) 6233{ 6234 return ((intptr_t)sp->sl_link.tqe_next == -1 && 6235 (intptr_t)sp->sl_link.tqe_prev == -1 && 6236 (sp->sl_flags & SLF_DETACHED)); 6237} 6238 6239static void 6240mcl_audit_init(void *buf, mcache_audit_t **mca_list, 6241 mcache_obj_t **con_list, size_t con_size, unsigned int num) 6242{ 6243 mcache_audit_t *mca, *mca_tail; 6244 mcache_obj_t *con = NULL; 6245 boolean_t save_contents = (con_list != NULL); 6246 unsigned int i, ix; 6247 6248 ASSERT(num <= NMBPBG); 6249 ASSERT(con_list == NULL || con_size != 0); 6250 6251 ix = MTOBG(buf); 6252 VERIFY(ix < maxclaudit); 6253 6254 /* Make sure we haven't been here before */ 6255 for (i = 0; i < NMBPBG; i++) 6256 VERIFY(mclaudit[ix].cl_audit[i] == NULL); 6257 6258 mca = mca_tail = *mca_list; 6259 if (save_contents) 6260 con = *con_list; 6261 6262 for (i = 0; i < num; i++) { 6263 mcache_audit_t *next; 6264 6265 next = mca->mca_next; 6266 bzero(mca, sizeof (*mca)); 6267 mca->mca_next = next; 6268 mclaudit[ix].cl_audit[i] = mca; 6269 6270 /* Attach the contents buffer if requested */ 6271 if (save_contents) { 6272 VERIFY(con != NULL); 6273 mca->mca_contents_size = con_size; 6274 mca->mca_contents = con; 6275 con = con->obj_next; 6276 bzero(mca->mca_contents, mca->mca_contents_size); 6277 } 6278 6279 mca_tail = mca; 6280 mca = mca->mca_next; 6281 } 6282 6283 if (save_contents) 6284 *con_list = con; 6285 6286 *mca_list = mca_tail->mca_next; 6287 mca_tail->mca_next = NULL; 6288} 6289 6290/* 6291 * Given an address of a buffer (mbuf/2KB/4KB/16KB), return 6292 * the corresponding audit structure for that buffer. 6293 */ 6294static mcache_audit_t * 6295mcl_audit_buf2mca(mbuf_class_t class, mcache_obj_t *o) 6296{ 6297 mcache_audit_t *mca = NULL; 6298 int ix = MTOBG(o); 6299 6300 VERIFY(ix < maxclaudit); 6301 VERIFY(IS_P2ALIGNED(o, MIN(m_maxsize(class), NBPG))); 6302 6303 switch (class) { 6304 case MC_MBUF: 6305 /* 6306 * For the mbuf case, find the index of the page 6307 * used by the mbuf and use that index to locate the 6308 * base address of the page. Then find out the 6309 * mbuf index relative to the page base and use 6310 * it to locate the audit structure. 6311 */ 6312 VERIFY(MCLIDX(BGTOM(ix), o) < (int)NMBPBG); 6313 mca = mclaudit[ix].cl_audit[MCLIDX(BGTOM(ix), o)]; 6314 break; 6315 6316 case MC_CL: 6317 /* 6318 * Same thing as above, but for 2KB clusters in a page. 6319 */ 6320 VERIFY(CLBGIDX(BGTOM(ix), o) < (int)NCLPBG); 6321 mca = mclaudit[ix].cl_audit[CLBGIDX(BGTOM(ix), o)]; 6322 break; 6323 6324 case MC_BIGCL: 6325 case MC_16KCL: 6326 /* 6327 * Same as above, but only return the first element. 
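		 * (Big and 16KB clusters are page-aligned and start at the
		 * beginning of their page group, so only cl_audit[0] is
		 * ever used for these classes.)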
6328 */ 6329 mca = mclaudit[ix].cl_audit[0]; 6330 break; 6331 6332 default: 6333 VERIFY(0); 6334 /* NOTREACHED */ 6335 } 6336 6337 return (mca); 6338} 6339 6340static void 6341mcl_audit_mbuf(mcache_audit_t *mca, void *addr, boolean_t composite, 6342 boolean_t alloc) 6343{ 6344 struct mbuf *m = addr; 6345 mcache_obj_t *next = ((mcache_obj_t *)m)->obj_next; 6346 6347 VERIFY(mca->mca_contents != NULL && 6348 mca->mca_contents_size == AUDIT_CONTENTS_SIZE); 6349 6350 if (mclverify) 6351 mcl_audit_verify_nextptr(next, mca); 6352 6353 if (!alloc) { 6354 /* Save constructed mbuf fields */ 6355 mcl_audit_save_mbuf(m, mca); 6356 if (mclverify) { 6357 mcache_set_pattern(MCACHE_FREE_PATTERN, m, 6358 m_maxsize(MC_MBUF)); 6359 } 6360 ((mcache_obj_t *)m)->obj_next = next; 6361 return; 6362 } 6363 6364 /* Check if the buffer has been corrupted while in freelist */ 6365 if (mclverify) { 6366 mcache_audit_free_verify_set(mca, addr, 0, m_maxsize(MC_MBUF)); 6367 } 6368 /* Restore constructed mbuf fields */ 6369 mcl_audit_restore_mbuf(m, mca, composite); 6370} 6371 6372static void 6373mcl_audit_restore_mbuf(struct mbuf *m, mcache_audit_t *mca, boolean_t composite) 6374{ 6375 struct mbuf *ms = (struct mbuf *)mca->mca_contents; 6376 6377 if (composite) { 6378 struct mbuf *next = m->m_next; 6379 VERIFY(ms->m_flags == M_EXT && MEXT_RFA(ms) != NULL && 6380 MBUF_IS_COMPOSITE(ms)); 6381 /* 6382 * We could have hand-picked the mbuf fields and restore 6383 * them individually, but that will be a maintenance 6384 * headache. Instead, restore everything that was saved; 6385 * the mbuf layer will recheck and reinitialize anyway. 6386 */ 6387 bcopy(ms, m, mca->mca_contents_size); 6388 m->m_next = next; 6389 } else { 6390 /* 6391 * For a regular mbuf (no cluster attached) there's nothing 6392 * to restore other than the type field, which is expected 6393 * to be MT_FREE. 
6394 */ 6395 m->m_type = ms->m_type; 6396 } 6397 _MCHECK(m); 6398} 6399 6400static void 6401mcl_audit_save_mbuf(struct mbuf *m, mcache_audit_t *mca) 6402{ 6403 _MCHECK(m); 6404 bcopy(m, mca->mca_contents, mca->mca_contents_size); 6405} 6406 6407static void 6408mcl_audit_cluster(mcache_audit_t *mca, void *addr, size_t size, boolean_t alloc, 6409 boolean_t save_next) 6410{ 6411 mcache_obj_t *next = ((mcache_obj_t *)addr)->obj_next; 6412 6413 if (!alloc) { 6414 if (mclverify) { 6415 mcache_set_pattern(MCACHE_FREE_PATTERN, addr, size); 6416 } 6417 if (save_next) { 6418 mcl_audit_verify_nextptr(next, mca); 6419 ((mcache_obj_t *)addr)->obj_next = next; 6420 } 6421 } else if (mclverify) { 6422 /* Check if the buffer has been corrupted while in freelist */ 6423 mcl_audit_verify_nextptr(next, mca); 6424 mcache_audit_free_verify_set(mca, addr, 0, size); 6425 } 6426} 6427 6428static void 6429mcl_audit_mcheck_panic(struct mbuf *m) 6430{ 6431 mcache_audit_t *mca; 6432 6433 MRANGE(m); 6434 mca = mcl_audit_buf2mca(MC_MBUF, (mcache_obj_t *)m); 6435 6436 panic("mcl_audit: freed mbuf %p with type 0x%x (instead of 0x%x)\n%s\n", 6437 m, (u_int16_t)m->m_type, MT_FREE, mcache_dump_mca(mca)); 6438 /* NOTREACHED */ 6439} 6440 6441static void 6442mcl_audit_verify_nextptr(void *next, mcache_audit_t *mca) 6443{ 6444 if (next != NULL && !MBUF_IN_MAP(next) && 6445 (next != (void *)MCACHE_FREE_PATTERN || !mclverify)) { 6446 panic("mcl_audit: buffer %p modified after free at offset 0: " 6447 "%p out of range [%p-%p)\n%s\n", 6448 mca->mca_addr, next, mbutl, embutl, mcache_dump_mca(mca)); 6449 /* NOTREACHED */ 6450 } 6451} 6452 6453/* This function turns on mbuf leak detection */ 6454static void 6455mleak_activate(void) 6456{ 6457 mleak_table.mleak_sample_factor = MLEAK_SAMPLE_FACTOR; 6458 PE_parse_boot_argn("mleak_sample_factor", 6459 &mleak_table.mleak_sample_factor, 6460 sizeof (mleak_table.mleak_sample_factor)); 6461 6462 if (mleak_table.mleak_sample_factor == 0) 6463 mclfindleak = 0; 6464 6465 if (mclfindleak == 0) 6466 return; 6467 6468 vm_size_t alloc_size = 6469 mleak_alloc_buckets * sizeof (struct mallocation); 6470 vm_size_t trace_size = mleak_trace_buckets * sizeof (struct mtrace); 6471 6472 MALLOC(mleak_allocations, struct mallocation *, alloc_size, 6473 M_TEMP, M_WAITOK | M_ZERO); 6474 VERIFY(mleak_allocations != NULL); 6475 6476 MALLOC(mleak_traces, struct mtrace *, trace_size, 6477 M_TEMP, M_WAITOK | M_ZERO); 6478 VERIFY(mleak_traces != NULL); 6479 6480 MALLOC(mleak_stat, mleak_stat_t *, MLEAK_STAT_SIZE(MLEAK_NUM_TRACES), 6481 M_TEMP, M_WAITOK | M_ZERO); 6482 VERIFY(mleak_stat != NULL); 6483 mleak_stat->ml_cnt = MLEAK_NUM_TRACES; 6484#ifdef __LP64__ 6485 mleak_stat->ml_isaddr64 = 1; 6486#endif /* __LP64__ */ 6487} 6488 6489static void 6490mleak_logger(u_int32_t num, mcache_obj_t *addr, boolean_t alloc) 6491{ 6492 int temp; 6493 6494 if (mclfindleak == 0) 6495 return; 6496 6497 if (!alloc) 6498 return (mleak_free(addr)); 6499 6500 temp = atomic_add_32_ov(&mleak_table.mleak_capture, 1); 6501 6502 if ((temp % mleak_table.mleak_sample_factor) == 0 && addr != NULL) { 6503 uintptr_t bt[MLEAK_STACK_DEPTH]; 6504 int logged = fastbacktrace(bt, MLEAK_STACK_DEPTH); 6505 mleak_log(bt, addr, logged, num); 6506 } 6507} 6508 6509/* 6510 * This function records the allocation in the mleak_allocations table 6511 * and the backtrace in the mleak_traces table; if allocation slot is in use, 6512 * replace old allocation with new one if the trace slot is in use, return 6513 * (or increment refcount if same trace). 
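 *
 * More precisely: if the allocation bucket is already occupied by an
 * entry recorded against this same trace, the call returns without doing
 * more.  If the trace bucket holds a different backtrace (a hash
 * collision) the call also bails out; if it holds the same backtrace its
 * refcount is incremented, and if it is free the new backtrace is
 * recorded in it.  Finally the allocation record is stored, overwriting
 * any previous occupant, since only a sampled subset of allocations is
 * kept anyway.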
6514 */ 6515static boolean_t 6516mleak_log(uintptr_t *bt, mcache_obj_t *addr, uint32_t depth, int num) 6517{ 6518 struct mallocation *allocation; 6519 struct mtrace *trace; 6520 uint32_t trace_index; 6521 6522 /* Quit if someone else modifying the tables */ 6523 if (!lck_mtx_try_lock_spin(mleak_lock)) { 6524 mleak_table.total_conflicts++; 6525 return (FALSE); 6526 } 6527 6528 allocation = &mleak_allocations[hashaddr((uintptr_t)addr, 6529 mleak_alloc_buckets)]; 6530 trace_index = hashbacktrace(bt, depth, mleak_trace_buckets); 6531 trace = &mleak_traces[trace_index]; 6532 6533 VERIFY(allocation <= &mleak_allocations[mleak_alloc_buckets - 1]); 6534 VERIFY(trace <= &mleak_traces[mleak_trace_buckets - 1]); 6535 6536 allocation->hitcount++; 6537 trace->hitcount++; 6538 6539 /* 6540 * If the allocation bucket we want is occupied 6541 * and the occupier has the same trace, just bail. 6542 */ 6543 if (allocation->element != NULL && 6544 trace_index == allocation->trace_index) { 6545 mleak_table.alloc_collisions++; 6546 lck_mtx_unlock(mleak_lock); 6547 return (TRUE); 6548 } 6549 6550 /* 6551 * Store the backtrace in the traces array; 6552 * Size of zero = trace bucket is free. 6553 */ 6554 if (trace->allocs > 0 && 6555 bcmp(trace->addr, bt, (depth * sizeof (uintptr_t))) != 0) { 6556 /* Different, unique trace, but the same hash! Bail out. */ 6557 trace->collisions++; 6558 mleak_table.trace_collisions++; 6559 lck_mtx_unlock(mleak_lock); 6560 return (TRUE); 6561 } else if (trace->allocs > 0) { 6562 /* Same trace, already added, so increment refcount */ 6563 trace->allocs++; 6564 } else { 6565 /* Found an unused trace bucket, so record the trace here */ 6566 if (trace->depth != 0) { 6567 /* this slot previously used but not currently in use */ 6568 mleak_table.trace_overwrites++; 6569 } 6570 mleak_table.trace_recorded++; 6571 trace->allocs = 1; 6572 memcpy(trace->addr, bt, (depth * sizeof (uintptr_t))); 6573 trace->depth = depth; 6574 trace->collisions = 0; 6575 } 6576 6577 /* Step 2: Store the allocation record in the allocations array */ 6578 if (allocation->element != NULL) { 6579 /* 6580 * Replace an existing allocation. No need to preserve 6581 * because only a subset of the allocations are being 6582 * recorded anyway. 
6583 */ 6584 mleak_table.alloc_collisions++; 6585 } else if (allocation->trace_index != 0) { 6586 mleak_table.alloc_overwrites++; 6587 } 6588 allocation->element = addr; 6589 allocation->trace_index = trace_index; 6590 allocation->count = num; 6591 mleak_table.alloc_recorded++; 6592 mleak_table.outstanding_allocs++; 6593 6594 lck_mtx_unlock(mleak_lock); 6595 return (TRUE); 6596} 6597 6598static void 6599mleak_free(mcache_obj_t *addr) 6600{ 6601 while (addr != NULL) { 6602 struct mallocation *allocation = &mleak_allocations 6603 [hashaddr((uintptr_t)addr, mleak_alloc_buckets)]; 6604 6605 if (allocation->element == addr && 6606 allocation->trace_index < mleak_trace_buckets) { 6607 lck_mtx_lock_spin(mleak_lock); 6608 if (allocation->element == addr && 6609 allocation->trace_index < mleak_trace_buckets) { 6610 struct mtrace *trace; 6611 trace = &mleak_traces[allocation->trace_index]; 6612 /* allocs = 0 means trace bucket is unused */ 6613 if (trace->allocs > 0) 6614 trace->allocs--; 6615 if (trace->allocs == 0) 6616 trace->depth = 0; 6617 /* NULL element means alloc bucket is unused */ 6618 allocation->element = NULL; 6619 mleak_table.outstanding_allocs--; 6620 } 6621 lck_mtx_unlock(mleak_lock); 6622 } 6623 addr = addr->obj_next; 6624 } 6625} 6626 6627static void 6628mleak_sort_traces() 6629{ 6630 int i, j, k; 6631 struct mtrace *swap; 6632 6633 for(i = 0; i < MLEAK_NUM_TRACES; i++) 6634 mleak_top_trace[i] = NULL; 6635 6636 for(i = 0, j = 0; j < MLEAK_NUM_TRACES && i < mleak_trace_buckets; i++) 6637 { 6638 if (mleak_traces[i].allocs <= 0) 6639 continue; 6640 6641 mleak_top_trace[j] = &mleak_traces[i]; 6642 for (k = j; k > 0; k--) { 6643 if (mleak_top_trace[k]->allocs <= 6644 mleak_top_trace[k-1]->allocs) 6645 break; 6646 6647 swap = mleak_top_trace[k-1]; 6648 mleak_top_trace[k-1] = mleak_top_trace[k]; 6649 mleak_top_trace[k] = swap; 6650 } 6651 j++; 6652 } 6653 6654 j--; 6655 for(; i < mleak_trace_buckets; i++) { 6656 if (mleak_traces[i].allocs <= mleak_top_trace[j]->allocs) 6657 continue; 6658 6659 mleak_top_trace[j] = &mleak_traces[i]; 6660 6661 for (k = j; k > 0; k--) { 6662 if (mleak_top_trace[k]->allocs <= 6663 mleak_top_trace[k-1]->allocs) 6664 break; 6665 6666 swap = mleak_top_trace[k-1]; 6667 mleak_top_trace[k-1] = mleak_top_trace[k]; 6668 mleak_top_trace[k] = swap; 6669 } 6670 } 6671} 6672 6673static void 6674mleak_update_stats() 6675{ 6676 mleak_trace_stat_t *mltr; 6677 int i; 6678 6679 VERIFY(mleak_stat != NULL); 6680#ifdef __LP64__ 6681 VERIFY(mleak_stat->ml_isaddr64); 6682#else 6683 VERIFY(!mleak_stat->ml_isaddr64); 6684#endif /* !__LP64__ */ 6685 VERIFY(mleak_stat->ml_cnt == MLEAK_NUM_TRACES); 6686 6687 mleak_sort_traces(); 6688 6689 mltr = &mleak_stat->ml_trace[0]; 6690 bzero(mltr, sizeof (*mltr) * MLEAK_NUM_TRACES); 6691 for (i = 0; i < MLEAK_NUM_TRACES; i++) { 6692 int j; 6693 6694 if (mleak_top_trace[i] == NULL || 6695 mleak_top_trace[i]->allocs == 0) 6696 continue; 6697 6698 mltr->mltr_collisions = mleak_top_trace[i]->collisions; 6699 mltr->mltr_hitcount = mleak_top_trace[i]->hitcount; 6700 mltr->mltr_allocs = mleak_top_trace[i]->allocs; 6701 mltr->mltr_depth = mleak_top_trace[i]->depth; 6702 6703 VERIFY(mltr->mltr_depth <= MLEAK_STACK_DEPTH); 6704 for (j = 0; j < mltr->mltr_depth; j++) 6705 mltr->mltr_addr[j] = mleak_top_trace[i]->addr[j]; 6706 6707 mltr++; 6708 } 6709} 6710 6711static struct mbtypes { 6712 int mt_type; 6713 const char *mt_name; 6714} mbtypes[] = { 6715 { MT_DATA, "data" }, 6716 { MT_OOBDATA, "oob data" }, 6717 { MT_CONTROL, "ancillary data" }, 6718 { 
MT_HEADER, "packet headers" }, 6719 { MT_SOCKET, "socket structures" }, 6720 { MT_PCB, "protocol control blocks" }, 6721 { MT_RTABLE, "routing table entries" }, 6722 { MT_HTABLE, "IMP host table entries" }, 6723 { MT_ATABLE, "address resolution tables" }, 6724 { MT_FTABLE, "fragment reassembly queue headers" }, 6725 { MT_SONAME, "socket names and addresses" }, 6726 { MT_SOOPTS, "socket options" }, 6727 { MT_RIGHTS, "access rights" }, 6728 { MT_IFADDR, "interface addresses" }, 6729 { MT_TAG, "packet tags" }, 6730 { 0, NULL } 6731}; 6732 6733#define MBUF_DUMP_BUF_CHK() { \ 6734 clen -= k; \ 6735 if (clen < 1) \ 6736 goto done; \ 6737 c += k; \ 6738} 6739 6740static char * 6741mbuf_dump(void) 6742{ 6743 unsigned long totmem = 0, totfree = 0, totmbufs, totused, totpct; 6744 u_int32_t m_mbufs = 0, m_clfree = 0, m_bigclfree = 0; 6745 u_int32_t m_mbufclfree = 0, m_mbufbigclfree = 0; 6746 u_int32_t m_16kclusters = 0, m_16kclfree = 0, m_mbuf16kclfree = 0; 6747 int nmbtypes = sizeof (mbstat.m_mtypes) / sizeof (short); 6748 uint8_t seen[256]; 6749 struct mbtypes *mp; 6750 mb_class_stat_t *sp; 6751 mleak_trace_stat_t *mltr; 6752 char *c = mbuf_dump_buf; 6753 int i, k, clen = MBUF_DUMP_BUF_SIZE; 6754 6755 mbuf_dump_buf[0] = '\0'; 6756 6757 /* synchronize all statistics in the mbuf table */ 6758 mbuf_stat_sync(); 6759 mbuf_mtypes_sync(TRUE); 6760 6761 sp = &mb_stat->mbs_class[0]; 6762 for (i = 0; i < mb_stat->mbs_cnt; i++, sp++) { 6763 u_int32_t mem; 6764 6765 if (m_class(i) == MC_MBUF) { 6766 m_mbufs = sp->mbcl_active; 6767 } else if (m_class(i) == MC_CL) { 6768 m_clfree = sp->mbcl_total - sp->mbcl_active; 6769 } else if (m_class(i) == MC_BIGCL) { 6770 m_bigclfree = sp->mbcl_total - sp->mbcl_active; 6771 } else if (njcl > 0 && m_class(i) == MC_16KCL) { 6772 m_16kclfree = sp->mbcl_total - sp->mbcl_active; 6773 m_16kclusters = sp->mbcl_total; 6774 } else if (m_class(i) == MC_MBUF_CL) { 6775 m_mbufclfree = sp->mbcl_total - sp->mbcl_active; 6776 } else if (m_class(i) == MC_MBUF_BIGCL) { 6777 m_mbufbigclfree = sp->mbcl_total - sp->mbcl_active; 6778 } else if (njcl > 0 && m_class(i) == MC_MBUF_16KCL) { 6779 m_mbuf16kclfree = sp->mbcl_total - sp->mbcl_active; 6780 } 6781 6782 mem = sp->mbcl_ctotal * sp->mbcl_size; 6783 totmem += mem; 6784 totfree += (sp->mbcl_mc_cached + sp->mbcl_infree) * 6785 sp->mbcl_size; 6786 6787 } 6788 6789 /* adjust free counts to include composite caches */ 6790 m_clfree += m_mbufclfree; 6791 m_bigclfree += m_mbufbigclfree; 6792 m_16kclfree += m_mbuf16kclfree; 6793 6794 totmbufs = 0; 6795 for (mp = mbtypes; mp->mt_name != NULL; mp++) 6796 totmbufs += mbstat.m_mtypes[mp->mt_type]; 6797 if (totmbufs > m_mbufs) 6798 totmbufs = m_mbufs; 6799 k = snprintf(c, clen, "%lu/%u mbufs in use:\n", totmbufs, m_mbufs); 6800 MBUF_DUMP_BUF_CHK(); 6801 6802 bzero(&seen, sizeof (seen)); 6803 for (mp = mbtypes; mp->mt_name != NULL; mp++) { 6804 if (mbstat.m_mtypes[mp->mt_type] != 0) { 6805 seen[mp->mt_type] = 1; 6806 k = snprintf(c, clen, "\t%u mbufs allocated to %s\n", 6807 mbstat.m_mtypes[mp->mt_type], mp->mt_name); 6808 MBUF_DUMP_BUF_CHK(); 6809 } 6810 } 6811 seen[MT_FREE] = 1; 6812 for (i = 0; i < nmbtypes; i++) 6813 if (!seen[i] && mbstat.m_mtypes[i] != 0) { 6814 k = snprintf(c, clen, "\t%u mbufs allocated to " 6815 "<mbuf type %d>\n", mbstat.m_mtypes[i], i); 6816 MBUF_DUMP_BUF_CHK(); 6817 } 6818 if ((m_mbufs - totmbufs) > 0) { 6819 k = snprintf(c, clen, "\t%lu mbufs allocated to caches\n", 6820 m_mbufs - totmbufs); 6821 MBUF_DUMP_BUF_CHK(); 6822 } 6823 k = snprintf(c, clen, "%u/%u mbuf 2KB 
clusters in use\n" 6824 "%u/%u mbuf 4KB clusters in use\n", 6825 (unsigned int)(mbstat.m_clusters - m_clfree), 6826 (unsigned int)mbstat.m_clusters, 6827 (unsigned int)(mbstat.m_bigclusters - m_bigclfree), 6828 (unsigned int)mbstat.m_bigclusters); 6829 MBUF_DUMP_BUF_CHK(); 6830 6831 if (njcl > 0) { 6832 k = snprintf(c, clen, "%u/%u mbuf %uKB clusters in use\n", 6833 m_16kclusters - m_16kclfree, m_16kclusters, 6834 njclbytes / 1024); 6835 MBUF_DUMP_BUF_CHK(); 6836 } 6837 totused = totmem - totfree; 6838 if (totmem == 0) { 6839 totpct = 0; 6840 } else if (totused < (ULONG_MAX / 100)) { 6841 totpct = (totused * 100) / totmem; 6842 } else { 6843 u_long totmem1 = totmem / 100; 6844 u_long totused1 = totused / 100; 6845 totpct = (totused1 * 100) / totmem1; 6846 } 6847 k = snprintf(c, clen, "%lu KB allocated to network (approx. %lu%% " 6848 "in use)\n", totmem / 1024, totpct); 6849 MBUF_DUMP_BUF_CHK(); 6850 6851 /* mbuf leak detection statistics */ 6852 mleak_update_stats(); 6853 6854 k = snprintf(c, clen, "\nmbuf leak detection table:\n"); 6855 MBUF_DUMP_BUF_CHK(); 6856 k = snprintf(c, clen, "\ttotal captured: %u (one per %u)\n", 6857 mleak_table.mleak_capture / mleak_table.mleak_sample_factor, 6858 mleak_table.mleak_sample_factor); 6859 MBUF_DUMP_BUF_CHK(); 6860 k = snprintf(c, clen, "\ttotal allocs outstanding: %llu\n", 6861 mleak_table.outstanding_allocs); 6862 MBUF_DUMP_BUF_CHK(); 6863 k = snprintf(c, clen, "\tnew hash recorded: %llu allocs, %llu traces\n", 6864 mleak_table.alloc_recorded, mleak_table.trace_recorded); 6865 MBUF_DUMP_BUF_CHK(); 6866 k = snprintf(c, clen, "\thash collisions: %llu allocs, %llu traces\n", 6867 mleak_table.alloc_collisions, mleak_table.trace_collisions); 6868 MBUF_DUMP_BUF_CHK(); 6869 k = snprintf(c, clen, "\toverwrites: %llu allocs, %llu traces\n", 6870 mleak_table.alloc_overwrites, mleak_table.trace_overwrites); 6871 MBUF_DUMP_BUF_CHK(); 6872 k = snprintf(c, clen, "\tlock conflicts: %llu\n\n", 6873 mleak_table.total_conflicts); 6874 MBUF_DUMP_BUF_CHK(); 6875 6876 k = snprintf(c, clen, "top %d outstanding traces:\n", 6877 mleak_stat->ml_cnt); 6878 MBUF_DUMP_BUF_CHK(); 6879 for (i = 0; i < mleak_stat->ml_cnt; i++) { 6880 mltr = &mleak_stat->ml_trace[i]; 6881 k = snprintf(c, clen, "[%d] %llu outstanding alloc(s), " 6882 "%llu hit(s), %llu collision(s)\n", (i + 1), 6883 mltr->mltr_allocs, mltr->mltr_hitcount, 6884 mltr->mltr_collisions); 6885 MBUF_DUMP_BUF_CHK(); 6886 } 6887 6888 if (mleak_stat->ml_isaddr64) 6889 k = snprintf(c, clen, MB_LEAK_HDR_64); 6890 else 6891 k = snprintf(c, clen, MB_LEAK_HDR_32); 6892 MBUF_DUMP_BUF_CHK(); 6893 6894 for (i = 0; i < MLEAK_STACK_DEPTH; i++) { 6895 int j; 6896 k = snprintf(c, clen, "%2d: ", (i + 1)); 6897 MBUF_DUMP_BUF_CHK(); 6898 for (j = 0; j < mleak_stat->ml_cnt; j++) { 6899 mltr = &mleak_stat->ml_trace[j]; 6900 if (i < mltr->mltr_depth) { 6901 if (mleak_stat->ml_isaddr64) { 6902 k = snprintf(c, clen, "0x%0llx ", 6903 mltr->mltr_addr[i]); 6904 } else { 6905 k = snprintf(c, clen, 6906 "0x%08x ", 6907 (u_int32_t)mltr->mltr_addr[i]); 6908 } 6909 } else { 6910 if (mleak_stat->ml_isaddr64) 6911 k = snprintf(c, clen, 6912 MB_LEAK_SPACING_64); 6913 else 6914 k = snprintf(c, clen, 6915 MB_LEAK_SPACING_32); 6916 } 6917 MBUF_DUMP_BUF_CHK(); 6918 } 6919 k = snprintf(c, clen, "\n"); 6920 MBUF_DUMP_BUF_CHK(); 6921 } 6922done: 6923 return (mbuf_dump_buf); 6924} 6925 6926#undef MBUF_DUMP_BUF_CHK 6927 6928SYSCTL_DECL(_kern_ipc); 6929SYSCTL_PROC(_kern_ipc, KIPC_MBSTAT, mbstat, 6930 CTLFLAG_RD | CTLFLAG_LOCKED, 6931 0, 0, mbstat_sysctl, 
"S,mbstat", ""); 6932SYSCTL_PROC(_kern_ipc, OID_AUTO, mb_stat, 6933 CTLFLAG_RD | CTLFLAG_LOCKED, 6934 0, 0, mb_stat_sysctl, "S,mb_stat", ""); 6935SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_top_trace, 6936 CTLFLAG_RD | CTLFLAG_LOCKED, 6937 0, 0, mleak_top_trace_sysctl, "S,mb_top_trace", ""); 6938SYSCTL_PROC(_kern_ipc, OID_AUTO, mleak_table, 6939 CTLFLAG_RD | CTLFLAG_LOCKED, 6940 0, 0, mleak_table_sysctl, "S,mleak_table", ""); 6941SYSCTL_INT(_kern_ipc, OID_AUTO, mleak_sample_factor, 6942 CTLFLAG_RW | CTLFLAG_LOCKED, &mleak_table.mleak_sample_factor, 0, ""); 6943SYSCTL_INT(_kern_ipc, OID_AUTO, mb_normalized, 6944 CTLFLAG_RD | CTLFLAG_LOCKED, &mb_normalized, 0, ""); 6945SYSCTL_INT(_kern_ipc, OID_AUTO, mb_watchdog, 6946 CTLFLAG_RW | CTLFLAG_LOCKED, &mb_watchdog, 0, ""); 6947