/*
 * Boot-time performance cache.
 *
 * We implement a shim layer for a block device which can be advised ahead
 * of time of the likely read pattern for the block device.
 *
 * Armed with a sorted version of this read pattern (the "playlist"), we
 * cluster the reads into a private cache, taking advantage of the
 * sequential I/O pattern. Incoming read requests are then satisfied from
 * the cache (if possible), or by forwarding to the original strategy routine.
 *
 * We also record the pattern of incoming read requests (the "history
 * list"), in order to provide for adaptation to changes in behaviour by
 * the calling layer.
 *
 * The hit rate of the cache is also measured, and caching will be disabled
 * preemptively if the hit rate is too low.
 *
 * The cache is controlled by a sysctl interface.  Typical usage is as
 * follows:
 *
 * - Load a sorted read pattern.
 * - Wait for the operation(s) of interest to be performed.
 * - Fetch the new (unsorted) read pattern.
 *
 * Loading a read pattern initialises the cache; upon successful loading
 * the cache is active and readahead is being performed.  After fetching the
 * recorded read pattern, the cache remains around until it is jettisoned
 * due to memory pressure or because the hit rate falls below a desirable level.
 *
 * Limitations:
 *
 * - only one copy of the cache may be active at a time
 */

/*
 * Build options.
 */
/*#define BC_DEBUG*/
/*#define BC_EXTRA_DEBUG*/

/*
 * Emulate readahead, don't actually perform it.
 */
/*#define EMULATE_ONLY*/

/*
 * Don't collect request history.
 */
/*#define NO_HISTORY*/

/*
 * Don't use history cluster chains (seems to be a malloc problem with these
 * enabled)
 */
/*#define STATIC_HISTORY*/

/*
 * Only cache for the volumes on the root disk
 */
#define ROOT_DISK_ONLY

/*
 * Keep a log of read history to aid in debugging.
 */
/*#define READ_HISTORY_BUFFER 1000*/
/*#define READ_HISTORY_ALL*/

/*
 * Ignore the batches on playlist entries.
 */
/*#define IGNORE_BATCH */

/*
 * Tunable parameters.
 */

/*
 * Minimum required physical memory before BootCache will
 * activate.  Insufficient physical memory leads to paging
 * activity during boot, which is not repeatable and can't
 * be distinguished from the repeatable read patterns.
 *
 * Size in megabytes.
 */
static int BC_minimum_mem_size = 256;

/*
 * Number of seconds to wait in BC_strategy for readahead to catch up
 * with our request.  Tuning issues:
 *  - Waiting too long may hold up the system behind this I/O.
 *  - Bypassing an I/O too early will reduce the hit rate and adversely
 *    affect performance on slow media.
 */
#define BC_READAHEAD_WAIT_DEFAULT	10
#define BC_READAHEAD_WAIT_CDROM		45
static int BC_wait_for_readahead = BC_READAHEAD_WAIT_DEFAULT;

/*
 * Cache hit ratio monitoring.
 *
 * The time overhead of a miss in the cache is approx 1/5000th of the
 * time it takes to satisfy an IO from a HDD. If our hit rate falls
 * below this, we're no longer providing a performance win and the
 * cache will be jettisoned.
 *
 * Make sure to keep the cache around for at least an hour
 * if the user hasn't logged in yet, though.
 */
static int BC_chit_max_num_IOs_since_last_hit = 10000;
static int BC_chit_prelogin_timeout_seconds = 3600;

/*
 * History recording timeout.
 *
 * If the history recording is not turned off, the history buffer
 * will grow to its maximum size. This timeout will kill the history
 * recording after the specified interval (in hz).
 *
 * User DVDs don't issue a "BootCacheControl stop", so they depend
 * on the history recording timeout.
 */
static int BC_history_timeout = 240 * 100;

/*
 * Trace macros
 */
#ifndef DBG_BOOTCACHE
#define DBG_BOOTCACHE	7
#endif

#define	DBG_BC_TAG					(1 << 0)
#define	DBG_BC_BATCH				(1 << 1)

#define	DBG_BC_IO_HIT				(1 << 2)
#define	DBG_BC_IO_HIT_STALLED		(1 << 3)
#define	DBG_BC_IO_MISS				(1 << 4)
#define	DBG_BC_IO_MISS_CUT_THROUGH	(1 << 5)
#define	DBG_BC_PLAYBACK_IO			(1 << 6)

static int dbg_tag_count = 0;

#ifdef BC_DEBUG
# define MACH_DEBUG
# define MACH_ASSERT 1
# define message(fmt, args...)	\
do { \
	microtime(&debug_currenttime); \
	timersub(&debug_currenttime, &debug_starttime, &debug_currenttime); \
	lck_mtx_lock(&debug_printlock); \
	printf("BootCache: %5d.%03d %24s[%4d]: " fmt "\n", (u_int)(debug_currenttime.tv_sec), (u_int)(debug_currenttime.tv_usec / 1000), __FUNCTION__, __LINE__, ##args); \
	lck_mtx_unlock(&debug_printlock); \
} while (0)

# define debug(fmt, args...)	message("*** " fmt, ##args)
extern void Debugger(char *);
#else
# define debug(fmt, args...)	do {} while (0)
# define message(fmt, args...)	printf("BootCache: " fmt "\n" , ##args)
#endif

#ifdef BC_EXTRA_DEBUG
# define xdebug(fmt, args...)	message("+++ " fmt, ##args)
#else
# define xdebug(fmt, args...)	do {} while (0)
#endif

#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/kdebug.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/types.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/ubc.h>
#include <sys/uio.h>

#include <uuid/uuid.h>

#include <mach/kmod.h>
#include <mach/mach_types.h>
#include <mach/vm_param.h>	/* for max_mem */

#include <vm/vm_kern.h>

#include <libkern/libkern.h>
#include <libkern/OSAtomic.h>

#include <kern/assert.h>
#include <kern/host.h>
#include <kern/kalloc.h>
#include <kern/thread_call.h>
#include <kern/locks.h>

#include <IOKit/storage/IOMediaBSDClient.h>

#include <pexpert/pexpert.h>

#include "BootCache.h"

#ifdef BC_DEBUG
static struct timeval debug_starttime;
static struct timeval debug_currenttime;
static lck_mtx_t debug_printlock;
#endif

#define BC_MALLOC(_size, _type, _flags) ((ssize_t)(_size) < 0) ? NULL : _MALLOC((_size), (_type), (_flags))

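/*
 * The (ssize_t) cast above is an overflow guard: a size computed as, say,
 * (count * sizeof(elem)) that wrapped past SSIZE_MAX casts to a negative
 * value, so BC_MALLOC yields NULL rather than handing a bogus size to
 * _MALLOC.  For example (illustrative only):
 *
 *	struct BC_cache_extent **p;
 *	p = BC_MALLOC(num_extents * sizeof(*p), M_TEMP, M_WAITOK | M_ZERO);
 *	if (p == NULL)
 *		goto bypass;	/* treat it like any other allocation failure */
 */
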
/**************************************
 * Structures for the readahead cache *
 **************************************/

/*
 * A cache extent.
 *
 * Each extent represents a contiguous range of disk blocks which are
 * held in the cache. All values in bytes.
 */
struct BC_cache_extent {
	lck_mtx_t   ce_lock;
	u_int64_t   ce_diskoffset;   /* physical offset on device */
	u_int64_t   ce_length;       /* data length */
	off_t       ce_cacheoffset;  /* offset into cache backing store */
	u_int16_t   ce_batch;        /* batch number */
	u_int16_t   ce_flags;        /* flags */
#define CE_ABORTED  (1 << 0)     /* extent no longer has data */
#define CE_IODONE   (1 << 1)     /* extent has been read */
#define CE_LOWPRI   (1 << 2)     /* extent is low priority */
#define CE_SHARED   (1 << 3)     /* extent is in the shared cache */
	int         ce_mount_idx;    /* the index of the mount for this extent */
	u_int32_t  *ce_blockmap;     /* track valid blocks for this extent */
};

/* A mount
 *
 * The volume to which an extent refers
 */
struct BC_cache_mount {
	lck_rw_t                 cm_lock;      /* lock guards instance variables, extents have their own locks */
	uuid_t                   cm_uuid;      /* this mount's uuid */
	int                      cm_nextents;  /* the number of extents that reference this mount */
	struct BC_cache_extent **cm_pextents;  /* a list of pointers to extents which reference
	                                          this mount, sorted by block address */
	int                      cm_state;     /* this mount's state */
#define CM_SETUP	(0)		/* fields above are valid. Not mounted yet */
#define CM_READY	(1)		/* every field is valid. Mounted and ready to go */
#define CM_ABORTED	(2)		/* mount failed for some reason */

	/* fields below filled in when we detect that the volume has mounted */
	dev_t                    cm_dev;            /* the device for this mount */
	struct buf              *cm_bp;             /* struct buf allocated in the reader thread for this device */
	u_int64_t                cm_throttle_mask;  /* the throttle mask to use in throttle APIs for this mount */
	u_int64_t                cm_blocksize;      /* keep 64-bit for promotion */
	u_int64_t                cm_devsize;        /* range checking */
	u_int64_t                cm_maxread;        /* largest read the dev will support */
	struct BC_cache_disk    *cm_disk;           /* the mount's physical disk */
};

/* A physical disk
 *
 * The disk may contain several mounts
 * and will have one readahead thread
 */
struct BC_cache_disk {
	lck_mtx_t   cd_lock;
	u_int64_t   cd_disk_id;     /* The throttle mask, used as an identifier */
	int         cd_disk_num;    /* used for stat batching */
	int         cd_flags;       /* flags */
#define CD_HAS_THREAD		(1 << 0)	/* there is a readahead thread in progress for this disk */
#define CD_ISSUE_LOWPRI		(1 << 1)	/* the readahead thread is reading in low-priority extents */
#define CD_IS_SSD		(1 << 2)	/* this is a solid state disk */
	int         cd_nmounts;     /* The number of mounts that reference this disk */
	int         cd_batch;       /* What playback batch this disk is on */
};

/************************************
 * Structures for history recording *
 ************************************/

/* see BC_history_entry and BC_history_mount in BootCache.h */

/*
 * Wrapper around BC_history_mount so we
 * can keep track of the device
 */
struct BC_history_mount_device {
	u_int64_t                        hmd_disk_id;
	struct BC_history_mount_device  *hmd_next;
	uint32_t                         hmd_is_ssd;
	dev_t                            hmd_dev;
	int                              hmd_blocksize;
	struct BC_history_mount          hmd_mount;
};

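/*
 * How the pieces above relate (a rough sketch, inferred from the fields):
 *
 *	BC_cache_disk  <--  BC_cache_mount  <--  BC_cache_extent
 *	 (cd_nmounts)        (cm_disk,            (ce_mount_idx indexes
 *	                      cm_pextents[])       the mount array)
 *
 * One physical disk may back several mounts and owns the single readahead
 * thread; each mount keeps a sorted array of pointers to its extents, and
 * each extent points back at its mount by index rather than by pointer.
 */
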
/*
 * A history entry list cluster.
 */
struct BC_history_entry_cluster {
	struct BC_history_entry_cluster *hec_link;
	int                              hec_nentries;	/* number of entries in list */
	struct BC_history_entry          hec_data[0];
};

#ifdef STATIC_HISTORY
# define BC_HISTORY_ALLOC	128000
# define BC_HISTORY_ENTRIES						\
	(((BC_HISTORY_ALLOC - sizeof(struct BC_history_entry_cluster)) /	\
	  sizeof(struct BC_history_entry)) - 1)
# define BC_HISTORY_MAXCLUSTERS	1
#else
# define BC_HISTORY_ALLOC	16000
# define BC_HISTORY_ENTRIES						\
	(((BC_HISTORY_ALLOC - sizeof(struct BC_history_entry_cluster)) /	\
	  sizeof(struct BC_history_entry)) - 1)
# define BC_HISTORY_MAXCLUSTERS	((BC_MAXENTRIES / BC_HISTORY_ENTRIES) + 1)
#endif


/*****************************************
 * To be able to debug live read history *
 *****************************************/
#ifdef READ_HISTORY_BUFFER
struct BC_read_history_entry {
	struct BC_cache_extent *rh_extent;	/* extent this read is for */
	u_int64_t               rh_blkno;	/* offset of actual read */
	u_int64_t               rh_bcount;	/* length in bytes */
	int                     rh_result;
# define BC_RHISTORY_OK		0
# define BC_RHISTORY_FAIL	1
};
#endif


/*
 * The cache control structure.
 */
struct BC_cache_control {
	lck_grp_t  *c_lckgrp;

	/* the device we are covering */
	struct bdevsw	*c_devsw;

	/* saved switch table routines */
	strategy_fcn_t   *c_strategy;
	open_close_fcn_t *c_close;
	lck_mtx_t         c_handlers_lock;

	u_int64_t	c_cachesize;		/* total number of bytes in cache */

	u_int64_t	c_root_disk_id;		/* the throttle mask of the root disk, used as an id for the physical device */
						/* This needs to be updated to handle multiple physical volumes backing a mount */

	/*
	 * The mounts we're tracking
	 */
	int                    c_nmounts;	/* number of mounts in the array below */
	struct BC_cache_mount *c_mounts;	/* the array of mounts the cache extents refer to */

	/*
	 * Extents, in optimal read order.
	 *
	 * Each extent tracks the pages it owns in the buffer.
	 * It is assumed that PAGE_SIZE is a multiple of cm_blocksize for each mount.
	 *
	 * Each extent refers to one of the mounts, above, by index into c_mounts
	 */
	int                      c_nextentlists; /* number of extent lists we have */
	int                     *c_nextents;     /* number of extents in a given extent list */
	struct BC_cache_extent **c_extentlists;  /* an array of extent lists in the cache */

	int         c_batch_count;	/* number of extent batches */
	vnode_t     c_vp;
	uint32_t    c_vid;
	/* rw lock protects the above fields through c_nmounts, but not their contents.
	 * Shared for general use of the fields,
	 * Exclusive for cache initialization or cleanup.
	 * Each mount/extent has its own lock for modifying its contents.
	 */
	lck_rw_t    c_cache_lock;


	/* history list, in reverse order */
	int                               c_history_num_clusters;
	struct BC_history_entry_cluster  *c_history;
	struct BC_history_mount_device   *c_history_mounts;
	uint64_t                          c_history_size;
	/* lock protects the above history fields and their contents */
	lck_rw_t    c_history_lock;

	/* fields below are accessed atomically */

	/* flags */
	UInt32      c_flags;
#define	BC_FLAG_SETUP			(1 << 0)	/* cache setup properly during mount */
#define	BC_FLAG_CACHEACTIVE		(1 << 1)	/* cache is active, owns memory */
#define	BC_FLAG_HISTORYACTIVE	(1 << 2)	/* currently recording history */
#define	BC_FLAG_HTRUNCATED		(1 << 3)	/* history list truncated */
#define	BC_FLAG_SHUTDOWN		(1 << 4)	/* readahead shut down */
	SInt32      c_strategycalls;	/* count of busy strategy calls in the cache */

	uint32_t  c_readahead_throttles_cutthroughs;

	uint32_t  c_num_ios_since_last_hit;	/* The number of cache misses since the last hit */

	uint32_t  c_num_reader_threads;		/* The number of reader threads active */
	lck_mtx_t c_reader_threads_lock;	/* protects c_num_reader_threads */

	/* statistics */
	int c_take_detailed_stats;
	struct BC_statistics c_stats;

	/* time-keeping */
	struct timeval c_loadtime;
	struct timeval c_cache_starttime;
	struct timeval c_history_starttime;

#ifdef READ_HISTORY_BUFFER
	int                            c_rhistory_idx;
	struct BC_read_history_entry  *c_rhistory;
#endif
};

/*
 * The number of blocks per page; assumes that a page
 * will always be >= the size of a disk block.
 */
#define CB_PAGEBLOCKS(cm)	(PAGE_SIZE / (cm)->cm_blocksize)

/*
 * Convert block offset to page offset and vice versa.
 */
#define CB_BLOCK_TO_PAGE(cm, block)	((block) / CB_PAGEBLOCKS(cm))
#define CB_PAGE_TO_BLOCK(cm, page)	((page) * CB_PAGEBLOCKS(cm))
#define CB_BLOCK_TO_BYTE(cm, block)	((block) * (cm)->cm_blocksize)
#define HB_BLOCK_TO_BYTE(hmd, block)	((block) * (hmd)->hmd_blocksize)
#define CB_BYTE_TO_BLOCK(cm, byte)	((byte) / (cm)->cm_blocksize)

/*
 * The size of an addressable field in the block map.
 * This reflects the u_int32_t type for the blockmap type.
 */
#define CB_MAPFIELDBITS		32
#define CB_MAPFIELDBYTES	4

/*
 * The index into an array of addressable fields, and the bit within
 * the addressable field corresponding to an index in the map.
 */
#define CB_MAPIDX(x)	((x) / CB_MAPFIELDBITS)
#define CB_MAPBIT(x)	(1 << ((x) % CB_MAPFIELDBITS))

/*
 * Blockmap management. Must be called with extent lock held.
 */
#define CB_BLOCK_PRESENT(ce, block) \
	((ce)->ce_blockmap != NULL && ((ce)->ce_blockmap[CB_MAPIDX(block)] & CB_MAPBIT(block)))
#define CB_MARK_BLOCK_PRESENT(ce, block) \
	((ce)->ce_blockmap[CB_MAPIDX(block)] |= CB_MAPBIT(block))
#define CB_MARK_BLOCK_VACANT(ce, block) \
	((ce)->ce_blockmap[CB_MAPIDX(block)] &= ~CB_MAPBIT(block))

/*
 * Determine whether a given page is vacant (all blocks are gone).
 * This takes advantage of the expectation that a page's blocks
 * will never cross the boundary of a field in the blockmap.
 */
#define CB_PAGE_VACANT(cm, ce, page) \
	(!((ce)->ce_blockmap[CB_MAPIDX(CB_PAGE_TO_BLOCK(cm, page))] & \
	   (((1 << CB_PAGEBLOCKS(cm)) - 1) << (CB_PAGE_TO_BLOCK(cm, page) % \
	    CB_MAPFIELDBITS))))

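/*
 * Worked example for the macros above (illustrative; assumes PAGE_SIZE is
 * 4096 and cm_blocksize is 512, so CB_PAGEBLOCKS(cm) == 8):
 *
 *	block 19:  CB_MAPIDX(19) == 0, CB_MAPBIT(19) == (1 << 19),
 *	           CB_BLOCK_TO_PAGE(cm, 19) == 2
 *
 *	page 2:    CB_PAGE_TO_BLOCK(cm, 2) == 16, so CB_PAGE_VACANT() tests
 *	           bits 16..23 (mask 0x00FF0000) of ce_blockmap[0]; the page
 *	           can be freed only once all eight of its blocks are vacant.
 */
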
/* Maximum size of the boot cache */
#define BC_MAX_SIZE (max_mem / 2)

/*
 * Sanity macro, frees and zeroes a pointer if it is not already zeroed.
 */
#define _FREE_ZERO(p, c)	\
	do {			\
		if (p != NULL) {	\
			_FREE(p, c);	\
			p = NULL;	\
		}			\
	} while (0);

/*
 * Synchronization macros
 */
#define	LOCK_HISTORY_R()	lck_rw_lock_shared(&BC_cache->c_history_lock)
#define	UNLOCK_HISTORY_R()	lck_rw_unlock_shared(&BC_cache->c_history_lock)
#define	LOCK_HISTORY_W()	lck_rw_lock_exclusive(&BC_cache->c_history_lock)
#define	UNLOCK_HISTORY_W()	lck_rw_unlock_exclusive(&BC_cache->c_history_lock)

#define	LOCK_EXTENT(e)		lck_mtx_lock(&(e)->ce_lock)
#define	UNLOCK_EXTENT(e)	lck_mtx_unlock(&(e)->ce_lock)

/* mount locks should only be held while also holding the cache lock */
#define	LOCK_MOUNT_R(m)		lck_rw_lock_shared(&(m)->cm_lock)
#define	UNLOCK_MOUNT_R(m)	lck_rw_unlock_shared(&(m)->cm_lock)
#define	LOCK_MOUNT_W(m)		lck_rw_lock_exclusive(&(m)->cm_lock)
#define	UNLOCK_MOUNT_W(m)	lck_rw_unlock_exclusive(&(m)->cm_lock)
#define	LOCK_MOUNT_W_TO_R(m)	lck_rw_lock_exclusive_to_shared(&(m)->cm_lock)
#define	LOCK_MOUNT_R_TO_W(m)	lck_rw_lock_shared_to_exclusive(&(m)->cm_lock)

#define	LOCK_DISK(d)		lck_mtx_lock(&(d)->cd_lock)
#define	UNLOCK_DISK(d)		lck_mtx_unlock(&(d)->cd_lock)

#define	LOCK_READERS()		lck_mtx_lock(&BC_cache->c_reader_threads_lock)
#define	UNLOCK_READERS()	lck_mtx_unlock(&BC_cache->c_reader_threads_lock)

#define	LOCK_HANDLERS()		lck_mtx_lock(&BC_cache->c_handlers_lock)
#define	UNLOCK_HANDLERS()	lck_mtx_unlock(&BC_cache->c_handlers_lock)

#define	LOCK_CACHE_R()		lck_rw_lock_shared(&BC_cache->c_cache_lock)
#define	UNLOCK_CACHE_R()	lck_rw_unlock_shared(&BC_cache->c_cache_lock)
#define	LOCK_CACHE_W()		lck_rw_lock_exclusive(&BC_cache->c_cache_lock)
#define	UNLOCK_CACHE_W()	lck_rw_unlock_exclusive(&BC_cache->c_cache_lock)
#define	LOCK_CACHE_TRY_W()	lck_rw_try_lock(&BC_cache->c_cache_lock, LCK_RW_TYPE_EXCLUSIVE)
#define	LOCK_CACHE_TRY_R()	lck_rw_try_lock(&BC_cache->c_cache_lock, LCK_RW_TYPE_SHARED)
#define	LOCK_CACHE_W_TO_R()	lck_rw_lock_exclusive_to_shared(&BC_cache->c_cache_lock)
#define	LOCK_CACHE_R_TO_W()	lck_rw_lock_shared_to_exclusive(&BC_cache->c_cache_lock)

#define BC_ADD_STAT(stat, inc)	OSAddAtomic((inc), ((SInt32 *)&BC_cache->c_stats.ss_##stat))

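/*
 * For example, BC_ADD_STAT(strategy_calls, 1) expands to an atomic add on
 * BC_cache->c_stats.ss_strategy_calls; the ss_ prefix on the statistics
 * fields is supplied by the token pasting above.
 */
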
/*
 * Only one instance of the cache is supported.
 */
static struct BC_cache_control BC_cache_softc;
static struct BC_cache_control *BC_cache = &BC_cache_softc;

/*
 * We support preloaded cache data by checking for a Mach-O
 * segment/section which can be populated with a playlist by the
 * linker.  This is particularly useful in the CD mastering process,
 * as reading the playlist from CD is very slow and rebuilding the
 * kext at mastering time is not practical.
 */
extern kmod_info_t kmod_info;
static struct BC_playlist_header *BC_preloaded_playlist;
static int	BC_preloaded_playlist_size;

static int	BC_check_intersection(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length);
static int	BC_find_cache_mount(dev_t dev);
static struct BC_cache_extent** BC_find_extent(struct BC_cache_mount* cm, u_int64_t offset, u_int64_t length, int contained, int* pnum_extents);
static int	BC_discard_bytes(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length);
static int	BC_handle_discards(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length);
static int	BC_blocks_present(struct BC_cache_extent *ce, int base, int nblk);
static void	BC_reader_thread(void *param0, wait_result_t param1);
static int	BC_strategy(struct buf *bp);
static int	BC_close(dev_t dev, int flags, int devtype, struct proc *p);
static int	BC_terminate_readahead(void);
static int	BC_terminate_cache(void);
static int	BC_terminate_history(void);
static void	BC_terminate_cache_async(void);
static void	BC_check_handlers(void);
static void	BC_next_valid_range(struct BC_cache_mount *cm, struct BC_cache_extent *ce, uint32_t from,
								uint32_t *nextpage, uint32_t *nextoffset, uint32_t *nextlength);
static int	BC_setup_disk(struct BC_cache_disk *cd, u_int64_t disk_id, int is_ssd);
static void	BC_teardown_disk(struct BC_cache_disk *cd);
static void	BC_mount_available(struct BC_cache_mount *cm);
static int	BC_setup_mount(struct BC_cache_mount *cm, struct BC_playlist_mount* pm);
static int	BC_fill_in_mount(struct BC_cache_mount *cm, mount_t mount, vfs_context_t context);
static void	BC_teardown_mount(struct BC_cache_mount *ce);
static int	BC_setup_extent(struct BC_cache_extent *ce, const struct BC_playlist_entry *pe, int batch_adjustment, int cache_mount_idx);
static void	BC_teardown_extent(struct BC_cache_extent *ce);
static int	BC_copyin_playlist(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries);
static int	BC_init_cache(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries, int record_history);
static void	BC_update_mounts(void);
static void	BC_timeout_history(void *);
static struct BC_history_mount_device * BC_get_history_mount_device(dev_t dev, int* pindex);
static int	BC_add_history(daddr64_t blkno, u_int64_t length, int pid, int hit, int write, int tag, int shared, dev_t dev);
static int	BC_size_history_mounts(void);
static int	BC_size_history_entries(void);
static int	BC_copyout_history_mounts(user_addr_t uptr);
static int	BC_copyout_history_entries(user_addr_t uptr);
static void	BC_discard_history(void);
static void	BC_auto_start(void);
static int	BC_setup(void);

static int	fill_in_bc_cache_mounts(mount_t mount, void* arg);
static int	check_for_new_mount_itr(mount_t mount, void* arg);
static int	extents_status(struct BC_cache_extent **pce, int nextents);
static void	wait_for_extent(struct BC_cache_extent *ce, struct timeval starttime);

#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
static int compare_cache_extents(const void *vfirst, const void *vsecond);
//FIXME: qsort not available in kexts
extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
#endif

/*
 * Sysctl interface.
 */
static int	BC_sysctl SYSCTL_HANDLER_ARGS;

SYSCTL_PROC(_kern, OID_AUTO, BootCache, CTLFLAG_RW | CTLFLAG_LOCKED,
			0, 0, BC_sysctl,
			"S,BC_command",
			"BootCache command interface");

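/*
 * For reference, userspace drives this interface roughly as sketched below
 * (a minimal sketch, assuming the BC_command structure and opcode constants
 * declared in BootCache.h; the names used here are illustrative, not exact):
 *
 *	struct BC_command bc;
 *	memset(&bc, 0, sizeof(bc));
 *	bc.bc_opcode = BC_OP_START;	// hand in a sorted playlist
 *	// ... point bc at the playlist mounts/entries buffers ...
 *	sysctlbyname("kern.BootCache", NULL, NULL, &bc, sizeof(bc));
 *
 *	// later, a "fetch history" opcode retrieves the recorded read
 *	// pattern, which is what BootCacheControl uses to rebuild the playlist.
 */
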
/*
 * Netboot exports this hook so that we can determine
 * whether we were netbooted.
 */
extern int	netboot_root(void);

/*
 * bsd_init exports this hook so that we can register for notification
 * once the root filesystem is mounted.
 */
extern void (* mountroot_post_hook)(void);
extern void (* unmountroot_pre_hook)(void);

/*
 * Mach VM-specific functions.
 */
static int	BC_alloc_pagebuffer();
static void	BC_free_pagebuffer();
static void	BC_free_page(struct BC_cache_extent *ce, int page);

/* Functions not prototyped elsewhere */
int	ubc_range_op(vnode_t, off_t, off_t, int, int *);	/* in sys/ubc_internal.h */

/*
 * Mach-o functions.
 */
#define MACH_KERNEL 1
#include <mach-o/loader.h>
extern void *getsectdatafromheader(struct mach_header *, char *, char *, int *);

/*
 * Set or clear a flag atomically and return the previous value of that bit.
 */
static inline UInt32
BC_set_flag(UInt32 flag)
{
	return flag & OSBitOrAtomic(flag, &BC_cache->c_flags);
}

static inline UInt32
BC_clear_flag(UInt32 flag)
{
	return flag & OSBitAndAtomic(~(flag), &BC_cache->c_flags);
}

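/*
 * Since OSBitOrAtomic/OSBitAndAtomic return the pre-update value, the result
 * is nonzero iff the bit was already set.  A typical (illustrative) use:
 *
 *	if (!BC_set_flag(BC_FLAG_SHUTDOWN)) {
 *		// we are the caller that flipped the flag; do one-time work
 *	}
 */
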
/*
 * Return a user-readable string for a given uuid
 *
 * Returns a pointer to a static buffer, which is
 * racy, so this should only be used for debugging purposes
 */
static inline const char* uuid_string(uuid_t uuid)
{
	/* Racy, but used for debug output so who cares */
	static uuid_string_t uuidString;
	uuid_unparse(uuid, uuidString);
	return (char*)uuidString;
}

/*
 * Test whether a given byte range on disk intersects with an extent's range.
 * This function assumes the ranges are within the same mount.
 */
static inline int
BC_check_intersection(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length)
{
	if ((offset < (ce->ce_diskoffset + ce->ce_length)) &&
	    ((offset + length) > ce->ce_diskoffset))
		return 1;
	return 0;
}

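/*
 * The test above treats both ranges as half-open intervals, so touching
 * ranges do not intersect.  For example (illustrative): an extent covering
 * [0x1000, 0x1400) intersects a read at offset 0x13FF of length 2, but not
 * a read starting at offset 0x1400.
 */
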
/*
 * Find one of our cache mounts with the given device, if it has been seen this boot
 *
 * Called with the cache read lock held
 *
 * Returns the index of the mount, or -1 if it is not found
 */
static int
BC_find_cache_mount(dev_t dev)
{
	int i;

	if (dev == nulldev())
		return(-1);

	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE) || (BC_cache->c_mounts == NULL))
		return(-1);


	for (i = 0; i < BC_cache->c_nmounts; i++) {
		if (BC_cache->c_mounts[i].cm_state == CM_READY) {
			if (BC_cache->c_mounts[i].cm_dev == dev) {
				return i;
			}
		}
	}

	return(-1);
}

/* The new throttling implementation needs to be able to check
 * whether an IO is in the boot cache before issuing the IO
 */
static int BC_cache_contains_block(dev_t device, u_int64_t blkno) {
	struct BC_cache_mount * cm = NULL;
	struct BC_cache_extent **pce, *ce;

	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) {
		return 0;
	}

	LOCK_CACHE_R();

	int cm_idx = BC_find_cache_mount(device);
	if (cm_idx == -1) {
		UNLOCK_CACHE_R();
		return 0;
	}

	cm = BC_cache->c_mounts + cm_idx;

	LOCK_MOUNT_R(cm);

	if (cm->cm_state != CM_READY) {
		UNLOCK_MOUNT_R(cm);
		UNLOCK_CACHE_R();
		return 0;
	}

	u_int64_t disk_offset = CB_BLOCK_TO_BYTE(cm, blkno);
	u_int64_t length = CB_BLOCK_TO_BYTE(cm, 1) /* one block */;

	pce = BC_find_extent(cm, disk_offset, length, 0, NULL);

	if (pce == NULL || *pce == NULL) {
		UNLOCK_MOUNT_R(cm);
		UNLOCK_CACHE_R();
		return 0;
	}

	ce = *pce;

	UNLOCK_MOUNT_R(cm);

	if (ce->ce_flags & CE_ABORTED) {
		UNLOCK_CACHE_R();
		return 0;
	}

	if (ce->ce_flags & CE_LOWPRI &&
		!(ce->ce_flags & CE_IODONE)) {
		UNLOCK_CACHE_R();
		return 0;
	}

	UNLOCK_CACHE_R();
	return 1;
}

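/*
 * Note that a block belonging to a low-priority extent is reported as "not
 * in the cache" until its extent has actually been read (CE_IODONE);
 * presumably this keeps the throttling layer from deferring to a readahead
 * that may not complete for some time.
 */
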
/*
 * Find a cache extent containing or intersecting the offset/length.
 *
 * We need to be able to find both containing and intersecting
 * extents.  Rather than handle each separately, we take advantage
 * of the fact that any containing extent will also be an intersecting
 * extent.
 *
 * Thus, a search for a containing extent should be a search for an
 * intersecting extent followed by a test for containment.
 *
 * Containment may include multiple extents. The returned extent will
 * be the first of these extents (lowest offset), but the next extents
 * in the array may also be required to contain the requested range.
 * Upon success, *pnum_extents contains the number of extents required
 * to contain the range.
 *
 * Returns a pointer to the matching entry in the mount's extent list
 * or NULL if no match is found.
 *
 * Called with the cache mount read lock held
 */
static struct BC_cache_extent **
BC_find_extent(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length, int contained, int *pnum_extents)
{
	struct BC_cache_extent **p, **base, **last, **next, **prev;	/* pointers into the mount's extent index list */
	size_t lim;

	/* invariants */
	assert(length > 0);

	base = cm->cm_pextents;
	last = base + cm->cm_nextents - 1;

	/*
	 * If the cache is inactive, exit.
	 */
	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE) || (base == NULL)) {
		return(NULL);
	}

	/*
	 * Range-check against the cache first.
	 *
	 * Note that we check for possible intersection, rather than
	 * containment.
	 */
	if (((offset + length) <= (*base)->ce_diskoffset) ||
	    (offset >= ((*last)->ce_diskoffset + (*last)->ce_length))) {
		return(NULL);
	}

	/*
	 * Perform a binary search for an extent intersecting
	 * the offset and length.
	 *
	 * This code is based on bsearch(3), and the description following
	 * is taken directly from it.
	 *
	 * The code below is a bit sneaky.  After a comparison fails, we
	 * divide the work in half by moving either left or right. If lim
	 * is odd, moving left simply involves halving lim: e.g., when lim
	 * is 5 we look at item 2, so we change lim to 2 so that we will
	 * look at items 0 & 1.  If lim is even, the same applies.  If lim
	 * is odd, moving right again involves halving lim, this time moving
	 * the base up one item past p: e.g., when lim is 5 we change base
	 * to item 3 and make lim 2 so that we will look at items 3 and 4.
	 * If lim is even, however, we have to shrink it by one before
	 * halving: e.g., when lim is 4, we still looked at item 2, so we
	 * have to make lim 3, then halve, obtaining 1, so that we will only
	 * look at item 3.
	 */
	for (lim = cm->cm_nextents; lim != 0; lim >>= 1) {
		p = base + (lim >> 1);

		if (BC_check_intersection((*p), offset, length)) {

			if (contained == 0)
				return(p);

			if (pnum_extents) {
				*pnum_extents = 1;
			}

			/*
			 * We are checking for containment, verify that here.
			 * Extents may be adjacent, so include neighboring
			 * extents in the containment check
			 */

			/* Check the high bound */
			if ((offset + length) > ((*p)->ce_diskoffset + (*p)->ce_length)) {

				/* check higher extents to see if they contain the high end of this range */
				prev = p;
				for (next = prev + 1;
					 next <= last;
					 next++) {

					/* extents should never overlap */
					assert(((*prev)->ce_diskoffset + (*prev)->ce_length) <= (*next)->ce_diskoffset);

					if (((*prev)->ce_diskoffset + (*prev)->ce_length) < (*next)->ce_diskoffset) {
						/* extents are not adjacent */
						if (BC_cache->c_take_detailed_stats) {
							xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx,%lld (missed last %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*p)->ce_diskoffset, ((*prev)->ce_diskoffset + (*prev)->ce_length), (offset + length) - ((*prev)->ce_diskoffset + (*prev)->ce_length));
						}
						return(NULL);
					}

					if (pnum_extents) {
						(*pnum_extents)++;
					}

					if ((offset + length) <= ((*next)->ce_diskoffset + (*next)->ce_length)) {
						/* we contain the high end of the range, break so we check the low end below */
						break;
					}

					prev = next;
				}

				if (next > last) {
					/* ran off the end of the extent list */
					if (BC_cache->c_take_detailed_stats) {
						xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx,%lld (missed last %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*p)->ce_diskoffset, ((*prev)->ce_diskoffset + (*prev)->ce_length), (offset + length) - ((*prev)->ce_diskoffset + (*prev)->ce_length));
					}
					return (NULL);
				}
			}

			/* Check the low bound */
			if (offset < (*p)->ce_diskoffset) {

				/* check lower extents to see if they contain the low end of this range */
				next = p;
				for (prev = next - 1;
					 prev >= base;
					 prev--) {

					/* extents should never overlap */
					assert(((*prev)->ce_diskoffset + (*prev)->ce_length) <= (*next)->ce_diskoffset);

					if (((*prev)->ce_diskoffset + (*prev)->ce_length) < (*next)->ce_diskoffset) {
						/* extents are not adjacent */
						if (BC_cache->c_take_detailed_stats) {
							xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx:%lld (missed first %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*next)->ce_diskoffset, ((*p)->ce_diskoffset + (*p)->ce_length), (*next)->ce_diskoffset - offset);
						}
						return(NULL);
					}

					if (pnum_extents) {
						(*pnum_extents)++;
					}

					if (offset >= (*prev)->ce_diskoffset) {
						/* we contain the low end of the range (we checked high end above) */
						return(prev);
					}

					next = prev;
				}

				assert(prev < base); /* only break condition */

				if (prev < base) {
					/* ran off the end of the extent list */
					if (BC_cache->c_take_detailed_stats) {
						xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx:%lld (missed first %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*next)->ce_diskoffset, ((*p)->ce_diskoffset + (*p)->ce_length), (*next)->ce_diskoffset - offset);
					}
					return (NULL);
				}
			}

			return(p);
		}

		if (offset > (*p)->ce_diskoffset) {	/* move right */
			base = p + 1;
			lim--;
		}					/* else move left */
	}
	/* found nothing */
	return(NULL);
}

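/*
 * For illustration, BC_strategy (below) uses the "contained" mode to decide
 * whether an incoming read can be fully satisfied from the cache:
 *
 *	int num_extents = 0;
 *	pce = BC_find_extent(cm, disk_offset, nbytes, 1, &num_extents);
 *	if (pce == NULL)
 *		// miss: forward to the original strategy routine
 *	// otherwise pce[0] .. pce[num_extents - 1] together cover the request
 *
 * whereas BC_cache_contains_block (above) passes contained == 0 because a
 * single intersecting extent is enough to answer its question.
 */
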
/*
 * Discard data from the cache and free pages which are no longer required.
 *
 * This should be locked against anyone testing for !vacant pages.
 *
 * We return the number of bytes we marked unused. If the value is
 * zero, the offset/length did not overlap this extent.
 *
 * Called with extent lock held.
 */
static int
BC_discard_bytes(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length)
{
	struct BC_cache_mount *cm;
	u_int64_t estart, eend, dstart, dend, nblk, base, page;
	int i, discarded;

	/* invariants */
	assert(length > 0);
#ifdef BC_DEBUG
	lck_mtx_assert(&ce->ce_lock, LCK_MTX_ASSERT_OWNED);
#endif

	/*
	 * Extent has been terminated: blockmap may no longer be valid.
	 */
	if (ce->ce_flags & CE_ABORTED ||
		ce->ce_length == 0)
		return 0;

	cm = BC_cache->c_mounts + ce->ce_mount_idx;

	/*
	 * Constrain offset and length to fit this extent, return immediately if
	 * there is no overlap.
	 */
	dstart = offset;			/* convert to start/end format */
	dend = offset + length;
	assert(dend > dstart);
	estart = ce->ce_diskoffset;
	eend = ce->ce_diskoffset + ce->ce_length;
	assert(eend > estart);

	if (dend <= estart)			/* check for overlap */
		return(0);
	if (eend <= dstart)
		return(0);

	if (dstart < estart)			/* clip to extent */
		dstart = estart;
	if (dend > eend)
		dend = eend;

	/*
	 * Convert length in bytes to a block count.
	 */
	nblk = CB_BYTE_TO_BLOCK(cm, dend - dstart);

	/*
	 * Mark blocks vacant.  For each vacated block, check whether the page
	 * holding it can be freed.
	 *
	 * Note that this could be optimised to reduce the number of
	 * page-freeable checks, but the logic would be more complex.
	 */
	base = CB_BYTE_TO_BLOCK(cm, dstart - ce->ce_diskoffset);
	assert(base >= 0);
	discarded = 0;
	assert((base + nblk - 1) < howmany(ce->ce_length, cm->cm_blocksize));
	for (i = 0; i < nblk; i++) {

		/* this is a no-op if the block is already gone */
		if (CB_BLOCK_PRESENT(ce, base + i)) {

			/* mark the block as vacant */
			CB_MARK_BLOCK_VACANT(ce, base + i);
			discarded++;

			page = CB_BLOCK_TO_PAGE(cm, base + i);
			if (CB_PAGE_VACANT(cm, ce, page))
				BC_free_page(ce, (int) page);
		}
	}
	return(discarded * cm->cm_blocksize);
}

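/*
 * The byte count returned above is what callers feed into the statistics,
 * e.g. (as the reader thread does on a failed read; illustrative):
 *
 *	count = BC_discard_bytes(ce, ce->ce_diskoffset + nextoffset, nextlength);
 *	BC_ADD_STAT(read_errors_bytes, count);
 */
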
/*
 * Blocks in an extent can be invalidated, e.g. by a write, before the
 * reader thread fills the extent. Search for the next range of viable blocks
 * in the extent, starting from offset 'fromoffset'.
 *
 * Values are returned by out parameters:
 *	'nextpage' takes the page containing the start of the next run, or
 *	-1 if no more runs are to be found.
 *	'nextoffset' takes the offset into that page that the run begins.
 *	'nextlength' takes the length in bytes of the range, capped at maxread.
 *
 * In other words, for this blockmap, if fromoffset = 3 * blocksize:
 *
 *	1 1 0 0 0 1 1 1 1 1 1 1 1 1 0 0
 *
 * then:
 *	*nextpage   = 0
 *	*nextoffset = 5 * blocksize
 *	*nextlength = 9 * blocksize (subject to the cm_maxread cap)
 *
 * Called with the extent lock held
 */
static void
BC_next_valid_range(struct BC_cache_mount *cm, struct BC_cache_extent *ce, uint32_t fromoffset,
					uint32_t *nextpage, uint32_t *nextoffset, uint32_t *nextlength)
{
	int maxblks, i, nextblk = 0;
	int found = 0;

	maxblks = howmany(ce->ce_length, cm->cm_blocksize);
	i = CB_BYTE_TO_BLOCK(cm, fromoffset);
	/* scan for the next range of valid blocks in the extent */
	for (; i < maxblks; i++) {
		if (CB_BLOCK_PRESENT(ce, i)) {
			if (found == 0) {
				nextblk = i;
			}
			found++;
		} else {
			if (found != 0)
				break;
		}
	}

	if (found) {
		/* found a valid range, so convert to (page, offset, length) */
		*nextpage = CB_BLOCK_TO_PAGE(cm, nextblk);
		*nextoffset = CB_BLOCK_TO_BYTE(cm, nextblk) % PAGE_SIZE;
		*nextlength = MIN(CB_BLOCK_TO_BYTE(cm, found), cm->cm_maxread);
	} else {
		*nextpage = -1;
	}

	return;
}

/*
 * Test for the presence of a range of blocks in the cache.
 *
 * Called with the extent lock held.
 */
static int
BC_blocks_present(struct BC_cache_extent *ce, int base, int nblk)
{
	int	blk;

	assert(base >= 0);
	assert((base + nblk) <= howmany(ce->ce_length, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize));

	for (blk = 0; blk < nblk; blk++) {

		if (!CB_BLOCK_PRESENT(ce, base + blk)) {
			return(0);
		}
	}
	return(1);
}

/*
 * Readahead thread.
 */
static void
BC_reader_thread(void *param0, wait_result_t param1)
{
	struct BC_cache_disk *cd = NULL;
	struct BC_cache_mount *cm = NULL;
	struct BC_cache_extent *ce = NULL;
	int count, num_mounts, ce_idx, cel_idx;
	upl_t	upl;
	kern_return_t kret;
	struct timeval batchstart, batchend;
	int error = 0;
	int issuing_lowpriority_extents = 0;

	/* We can access the BC_cache_disk passed in freely without locks
	 * since disks are only freed up in terminate cache after it has
	 * guaranteed that no reader threads are running
	 */
	cd = (struct BC_cache_disk*) param0;

	num_mounts = cd->cd_nmounts;

	if (BC_cache->c_flags & BC_FLAG_SHUTDOWN)
		goto out;

	for (;;) {

		if (issuing_lowpriority_extents) {
			debug("disk %d starting low priority batch", cd->cd_disk_num);
		} else {
			debug("disk %d starting batch %d", cd->cd_disk_num, cd->cd_batch);
			KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_BATCH) |
								  DBG_FUNC_START,
								  cd->cd_batch, cd->cd_disk_num, 0, 0, 0);
		}
		microtime(&batchstart);

		LOCK_CACHE_R();
		for (cel_idx = 0;
			 cel_idx < BC_cache->c_nextentlists;
			 cel_idx++) {

			/* iterate over extents to populate */
			for (ce_idx = 0;
				 ce_idx < BC_cache->c_nextents[cel_idx];
				 ce_idx++) {

				ce = BC_cache->c_extentlists[cel_idx] + ce_idx;

				cm = BC_cache->c_mounts + ce->ce_mount_idx;

				if (cm->cm_state != CM_READY) {
					continue;
				}

				if (cm->cm_disk != cd){
					continue;
				}

				/* Only read extents marked for this batch or earlier.
				 * All low-priority IO is read in one batch, however.
				 */
				if (ce->ce_batch > cd->cd_batch && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
					continue;
				}

				/* Check if already done or aborted */
				if (ce->ce_flags & (CE_ABORTED | CE_IODONE)) {
					continue;
				}

				/* Check if this extent is low priority and we're not yet reading in low-priority extents */
				if (!(cd->cd_flags & CD_ISSUE_LOWPRI) &&
					(ce->ce_flags & CE_LOWPRI)) {
					continue;
				}

				/* Shouldn't happen */
				if ((cd->cd_flags & CD_ISSUE_LOWPRI) &&
					!(ce->ce_flags & CE_LOWPRI)) {
					debug("Saw a regular extent while issuing throttled IO");
				}

				LOCK_EXTENT(ce);

				/* Check again now that we hold the lock */
				if (ce->ce_flags & (CE_ABORTED | CE_IODONE)) {
					UNLOCK_EXTENT(ce);
					continue;
				}

				/* loop reading to fill this extent */
				uint32_t fromoffset = 0;

				LOCK_MOUNT_R(cm);

				if (cm->cm_state != CM_READY) {
					/* the mount was aborted. Whoever aborted it handles aborting the extents as well */
					UNLOCK_MOUNT_R(cm);
					UNLOCK_EXTENT(ce);
					continue;
				}

				for (;;) {
					uint32_t nextpage;
					uint32_t nextoffset;
					uint32_t nextlength;

					/* requested shutdown */
					if (BC_cache->c_flags & BC_FLAG_SHUTDOWN) {
						UNLOCK_MOUNT_R(cm);
						goto out;
					}


					/*
					 * Find the next set of blocks that haven't been invalidated
					 * for this extent.
					 */
					BC_next_valid_range(cm, ce, fromoffset, &nextpage, &nextoffset, &nextlength);
					/* no more blocks to be read */
					if (nextpage == -1)
						break;

					/* set up fromoffset to read the next segment of the extent */
					fromoffset = (nextpage * PAGE_SIZE) + nextoffset + nextlength;

					kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
					if (kret != KERN_SUCCESS) {
						UNLOCK_MOUNT_R(cm);
						message("reader thread: vnode_getwithvid failed - %d\n", kret);
						goto out;
					}

					kret = ubc_create_upl(BC_cache->c_vp,
										  ce->ce_cacheoffset + (nextpage * PAGE_SIZE),
										  (int) roundup(nextoffset + nextlength, PAGE_SIZE),
										  &upl,
										  NULL,
										  UPL_SET_LITE|UPL_FILE_IO);
					if (kret != KERN_SUCCESS) {
						UNLOCK_MOUNT_R(cm);
						message("ubc_create_upl returned %d\n", kret);
						(void) vnode_put(BC_cache->c_vp);
						goto out;
					}

					/* cm_bp is only accessed from the reader thread,
					 * and there's only one reader thread for a given mount,
					 * so we don't need the write lock to modify it */

					/* set buf to fill the requested subset of the upl */
					buf_setblkno(cm->cm_bp, CB_BYTE_TO_BLOCK(cm, ce->ce_diskoffset + nextoffset) + CB_PAGE_TO_BLOCK(cm, nextpage));
					buf_setcount(cm->cm_bp, (unsigned int) nextlength);
					buf_setupl(cm->cm_bp, upl, (unsigned int) nextoffset);
					buf_setresid(cm->cm_bp, buf_count(cm->cm_bp));	/* ask for residual indication */
					buf_reset(cm->cm_bp, B_READ);

					/* If this is regular readahead, make sure any throttled IOs are throttled by the readahead thread rdar://8592635
					 * If this is low-priority readahead, make sure this thread will be throttled appropriately later rdar://8734858
					 */
					throttle_info_handle_t throttle_info_handle;
					if ((error = throttle_info_ref_by_mask(cm->cm_throttle_mask, &throttle_info_handle)) == 0) {
						throttle_info_update_by_mask(throttle_info_handle, 0x0);
						throttle_info_rel_by_mask(throttle_info_handle);
					} else {
						debug("Unable to update throttle info for mount %s: %d", uuid_string(cm->cm_uuid), error);
					}

					BC_ADD_STAT(initiated_reads, 1);
					if (cd->cd_disk_num < STAT_DISKMAX) {
						if (ce->ce_flags & CE_LOWPRI) {
							BC_ADD_STAT(disk_initiated_reads_lowpri[cd->cd_disk_num], 1);
						} else {
							BC_ADD_STAT(disk_initiated_reads[cd->cd_disk_num], 1);
						}
					}

					/* give the buf to the underlying strategy routine */
					KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, DKIO_READ) | DBG_FUNC_NONE,
										  (uintptr_t) buf_kernel_addrperm_addr(cm->cm_bp), buf_device(cm->cm_bp), (long)buf_blkno(cm->cm_bp), buf_count(cm->cm_bp), 0);
					KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_PLAYBACK_IO) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(cm->cm_bp), 0, 0, 0, 0);

					BC_cache->c_strategy(cm->cm_bp);

					/* wait for the bio to complete */
					buf_biowait(cm->cm_bp);

					kret = ubc_upl_commit(upl);
					(void) vnode_put(BC_cache->c_vp);

					/*
					 * If the read or commit returned an error, invalidate the bytes
					 * covered by the read (on a residual, we could avoid invalidating
					 * bytes that are claimed to be read as a minor optimisation, but we do
					 * not expect errors as a matter of course).
					 */
					if (kret != KERN_SUCCESS || buf_error(cm->cm_bp) || (buf_resid(cm->cm_bp) != 0)) {
						debug("read error: extent %lu %lu/%lu "
							  "(error buf %ld/%ld flags %08x resid %d)",
							  (unsigned long) ce_idx,
							  (unsigned long)ce->ce_diskoffset, (unsigned long)ce->ce_length,
							  (long)buf_blkno(cm->cm_bp), (long)buf_count(cm->cm_bp),
							  buf_flags(cm->cm_bp), buf_resid(cm->cm_bp));

						count = BC_discard_bytes(ce, ce->ce_diskoffset + nextoffset, nextlength);
						debug("read error: discarded %d bytes", count);
						BC_ADD_STAT(read_errors, 1);
						BC_ADD_STAT(read_errors_bytes, count);
					}
#ifdef READ_HISTORY_BUFFER
					if (BC_cache->c_rhistory && BC_cache->c_rhistory_idx < READ_HISTORY_BUFFER) {
						BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_extent = ce;
						BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_blkno = buf_blkno(cm->cm_bp);
						BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_bcount = buf_count(cm->cm_bp);
						if (buf_error(cm->cm_bp) || (buf_resid(cm->cm_bp) != 0)) {
							BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_result = BC_RHISTORY_FAIL;
							BC_cache->c_rhistory_idx++;
						} else {
# ifdef READ_HISTORY_ALL
							BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_result = BC_RHISTORY_OK;
							BC_cache->c_rhistory_idx++;
# endif
						}
					}
#endif

					/* I'd like to throttle the thread here, but I'm holding the extent and cache locks, so it's throttled below */
				}

				/* update stats */
				BC_ADD_STAT(read_blocks, (u_int) CB_BYTE_TO_BLOCK(cm, ce->ce_length));
				BC_ADD_STAT(read_bytes,  (u_int) ce->ce_length);
				if (ce->ce_flags & CE_LOWPRI) {
					BC_ADD_STAT(read_bytes_lowpri, (u_int) ce->ce_length);
					if (cd->cd_disk_num < STAT_DISKMAX) {
						BC_ADD_STAT(batch_bytes_lowpri[cd->cd_disk_num], (u_int) ce->ce_length);
					}
				} else {
					if (cd->cd_disk_num < STAT_DISKMAX) {
						BC_ADD_STAT(batch_bytes[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)], (u_int) ce->ce_length);
						if (ce->ce_batch < cd->cd_batch) {
							BC_ADD_STAT(batch_late_bytes[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)], ce->ce_length);
						}
					}
				}
				if (ce->ce_flags & CE_SHARED) {
					BC_ADD_STAT(shared_bytes, (u_int) ce->ce_length);
				}
				UNLOCK_MOUNT_R(cm);

				/*
				 * Wake up anyone wanting this extent, as it is now ready for
				 * use.
				 */
				ce->ce_flags |= CE_IODONE;
				UNLOCK_EXTENT(ce);
				wakeup(ce);

				/* Let anyone waiting for the write lock to take hold */
				UNLOCK_CACHE_R();

				/* Take this opportunity of locklessness to throttle the reader thread, if necessary */
				if (issuing_lowpriority_extents) {
					throttle_lowpri_io((ce->ce_flags & CE_LOWPRI) ? 1 : 0);
				}

				LOCK_CACHE_R();
				if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
					/* New playlist arrived, go back to issuing regular priority extents */
					debug("New extents, interrupting low priority readahead");
					break;
				}

				if (cel_idx >= BC_cache->c_nextentlists ||
					ce_idx >= BC_cache->c_nextents[cel_idx]) {
					/* cache shrunk while we weren't looking.
					 * Not a hard error, but we're done with this batch at least
					 */
					break;
				}
			}
			ce = NULL;
			if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
				/* New playlist arrived, go back to issuing regular priority extents */
				break;
			}
		}
		UNLOCK_CACHE_R();

		microtime(&batchend);
		timersub(&batchend, &batchstart, &batchend);
		if (cd->cd_disk_num < STAT_DISKMAX) {
			if (issuing_lowpriority_extents) {
				BC_cache->c_stats.ss_batch_time_lowpri[cd->cd_disk_num] +=
					(u_int) batchend.tv_sec * 1000 +
					(u_int) batchend.tv_usec / 1000;
			} else {
				/* Measure times for the first several batches separately */
				BC_cache->c_stats.ss_batch_time[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)] +=
					(u_int) batchend.tv_sec * 1000 +
					(u_int) batchend.tv_usec / 1000;
			}
		}
		if (issuing_lowpriority_extents) {
			// debug("disk %d low priority batch done", cd->cd_disk_num);
		} else {
			// debug("disk %d batch %d done", cd->cd_disk_num, cd->cd_batch);
			KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_BATCH) |
								  DBG_FUNC_END,
								  cd->cd_batch, cd->cd_disk_num, 0, 0, 0);
		}


		LOCK_DISK(cd);
		if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
			/* New playlist arrived, go back to issuing regular extents */
			issuing_lowpriority_extents = 0;
			throttle_set_thread_io_policy(IOPOL_NORMAL);
			cd->cd_batch = 0;
		} else if (cd->cd_batch + 1 < BC_cache->c_batch_count && !issuing_lowpriority_extents) {
			cd->cd_batch++;
		} else {
			if (num_mounts == cd->cd_nmounts) {
				/* no new mounts appeared */
				if ( !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
					/* go through the playlist again, issuing any low-priority extents we have */
					cd->cd_flags |= CD_ISSUE_LOWPRI;
					throttle_set_thread_io_policy(IOPOL_THROTTLE);
					issuing_lowpriority_extents = 1;
				} else {
					/* we're done */
					cd->cd_flags &= (~CD_HAS_THREAD);
					cd->cd_batch = 0;
					UNLOCK_DISK(cd);
					goto out;
				}
			} else {
				/* New mounts appeared. Run through again */
				cd->cd_batch = 0;
				debug("disk %d more mounts detected (%d total)", cd->cd_disk_num, cd->cd_nmounts);
			}
		}
		num_mounts = cd->cd_nmounts;
		UNLOCK_DISK(cd);
	}

out:
	/*
	 * If ce != NULL we have bailed out, but we cannot free memory beyond
	 * the bailout point as it may have been read in a previous batch and
	 * there may be a sleeping strategy routine assuming that it will
	 * remain valid.  Since we are typically killed immediately before a
	 * complete cache termination, this does not represent a significant
	 * problem.
	 *
	 * However, to prevent readers blocking waiting for readahead to
	 * complete for extents that will never be read, we mark all the
	 * extents we have given up on as aborted.
	 */
	if (ce != NULL) {
		UNLOCK_EXTENT(ce);

		/* the only errors that get us here are global, so mark the cache as shut down */
		LOCK_DISK(cd);
		BC_set_flag(BC_FLAG_SHUTDOWN);
		cd->cd_flags &= (~CD_HAS_THREAD);
		UNLOCK_DISK(cd);

		int tmpcel_idx, tmpce_idx;
		for (tmpcel_idx = 0;
			 tmpcel_idx < BC_cache->c_nextentlists;
			 tmpcel_idx++) {
			for (tmpce_idx = 0;
				 tmpce_idx < BC_cache->c_nextents[tmpcel_idx];
				 tmpce_idx++) {
				ce = BC_cache->c_extentlists[tmpcel_idx] + tmpce_idx;
				LOCK_EXTENT(ce);
				/* abort any extent that hasn't been satisfied */
				if (!(ce->ce_flags & CE_IODONE)) {
					ce->ce_flags |= CE_ABORTED;
					/* wake up anyone asleep on this extent */
					UNLOCK_EXTENT(ce);
					wakeup(ce);
				} else {
					UNLOCK_EXTENT(ce);
				}
			}
		}
		UNLOCK_CACHE_R();
	}


	debug("disk %d done", cd->cd_disk_num);

	/* wake up someone that might be waiting for this reader thread to exit */
	wakeup(&cd->cd_flags);

	LOCK_READERS();
	/* wake up someone that might be waiting for all the reader threads to exit */
	BC_cache->c_num_reader_threads--;
	if (BC_cache->c_num_reader_threads == 0) {
		wakeup(&BC_cache->c_num_reader_threads);
	}
	UNLOCK_READERS();

}

/*
 * Handle an incoming close request.
 */
static int
BC_close(dev_t dev, int flags, int devtype, struct proc *p)
{
	struct BC_cache_mount *cm;
	int i, cm_idx;

	/* Mark our cache_mount for this device as invalid */

	LOCK_CACHE_R();
	if (BC_cache->c_flags & BC_FLAG_CACHEACTIVE) {

		cm_idx = BC_find_cache_mount(dev);
		if (-1 != cm_idx) {
			cm = BC_cache->c_mounts + cm_idx;
			debug("Tearing down closing mount %s with dev 0x%x", uuid_string(cm->cm_uuid), dev);
			LOCK_MOUNT_R(cm);
			if (cm->cm_state != CM_ABORTED) {
				/*
				 * Mark all extents as aborted. This will notify any sleepers in the
				 * strategy routine that the extent won't have data for them.
				 *
				 * Note that we shouldn't try to take an extent lock while holding
				 * the exclusive mount lock or we risk deadlock with the reader thread
				 */
				for (i = 0; i < cm->cm_nextents; i++) {
					LOCK_EXTENT(cm->cm_pextents[i]);
					BC_teardown_extent(cm->cm_pextents[i]);
					UNLOCK_EXTENT(cm->cm_pextents[i]);
					wakeup(cm->cm_pextents[i]);
				}
				if (! LOCK_MOUNT_R_TO_W(cm)) {
					/* If there is someone waiting for the exclusive lock on this mount,
					 * we want to grab it before they can so they don't waste time.
					 * If we fail to take the exclusive lock it means someone
					 * else is also trying to upgrade the mount lock. We don't keep any
					 * state, so we can simply take the write lock again.
					 */
					LOCK_MOUNT_W(cm);
					if (cm->cm_state == CM_ABORTED) {
						// Mount is aborted, no more work necessary
						UNLOCK_MOUNT_W(cm);
						goto out;
					}
				}
				BC_teardown_mount(cm);
				UNLOCK_MOUNT_W(cm);
			} else {
				UNLOCK_MOUNT_R(cm);
			}
		}
	}

out:
	UNLOCK_CACHE_R();

	return BC_cache->c_close(dev, flags, devtype, p);
}

/*
 * Returns CE_ABORTED if any of the extents in the array are aborted
 * Returns CE_LOWPRI if any of the extents in the array are low priority and not done
 * Returns CE_IODONE if all the extents in the array are done
 * Returns 0 otherwise
 */
static int extents_status(struct BC_cache_extent **pce, int nextents) {
	int ce_idx;
	int ret = CE_IODONE;

	for (ce_idx = 0;
		 ce_idx < nextents;
		 ce_idx++) {

		if (pce[ce_idx]->ce_flags & CE_ABORTED) {
			return CE_ABORTED;
		}

		if (! (pce[ce_idx]->ce_flags & CE_IODONE)) {
			if (pce[ce_idx]->ce_flags & CE_LOWPRI) {
				ret = CE_LOWPRI;
			} else {
				if (ret == CE_IODONE) {
					ret = 0;
				}
			}
		}
	}
	return ret;
}

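/*
 * BC_strategy (below) keys off these return values: CE_ABORTED means bypass
 * immediately, CE_IODONE means the data is ready to copy out, and CE_LOWPRI
 * means the extents exist but are not worth waiting for.  An illustrative
 * caller:
 *
 *	status = extents_status(pcontaining_extents, num_extents);
 *	if (status & CE_ABORTED)
 *		goto bypass;
 */
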
/*
 * Waits for the given extent to be filled or aborted
 *
 * Should only be called with the extent lock held.
 * NO OTHER LOCKS SHOULD BE HELD INCLUDING THE CACHE LOCK
 *
 * We are guaranteed that the extent won't disappear when we release
 * the cache lock by the cleanup code waiting for BC_cache->c_strategycalls
 * to reach 0, and we're counted in there.
 */
static void wait_for_extent(struct BC_cache_extent *ce, struct timeval starttime) {
	struct timeval time;
	struct timespec timeout;
	microtime(&time);
	timersub(&time,
			 &starttime,
			 &time);
	if (time.tv_sec < BC_wait_for_readahead) {
		timeout.tv_sec = BC_wait_for_readahead - time.tv_sec;
		if (time.tv_usec == 0) {
			timeout.tv_nsec = 0;
		} else {
			timeout.tv_sec--;
			timeout.tv_nsec = NSEC_PER_USEC * (USEC_PER_SEC - time.tv_usec);
		}
		msleep(ce, &ce->ce_lock, PRIBIO, "BC_strategy", &timeout);
	}
}

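/*
 * Worked example of the timeout arithmetic above (illustrative): with the
 * default BC_wait_for_readahead of 10 seconds and 3.25s already elapsed
 * (time = {3, 250000}), the remaining budget is 10 - 3 = 7s; the non-zero
 * microseconds then borrow one second, giving a timeout of
 * {tv_sec = 6, tv_nsec = 750000000}, i.e. 6.75 seconds.
 */
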
/*
 * Handle an incoming IO request.
 */
static int
BC_strategy(struct buf *bp)
{
	struct BC_cache_extent *ce = NULL, **pce, **pcontaining_extents = NULL;
	int num_extents;
	uio_t uio = NULL;
	int nbytes, resid, discards = 0;
	struct timeval blocked_start_time, blocked_end_time;
	daddr64_t blkno;
	caddr_t buf_map_p, buf_extent_p;
	off_t disk_offset;
	kern_return_t kret;
	int cm_idx = -1, ce_idx;
	dev_t dev;
	int32_t bufflags;
	int during_cache = 0, during_io = 0, take_detailed_stats = 0, cache_hit = 0;
	int is_root_disk, did_block, status;
	int is_shared = 0, is_swap = 0;
	int should_throttle = 0;
	int is_ssd = 0;
	int is_stolen = 0;
	int unfilled = 0;
	vnode_t vp = NULL;
	int pid = 0;
	int dont_cache = 0;
	int throttle_tier = 0;
	bufattr_t bap = NULL;

	assert(bp != NULL);

	blkno = buf_blkno(bp);
	nbytes = buf_count(bp);
	bufflags = buf_flags(bp);
	bap = buf_attr(bp);
	dev = buf_device(bp);
	vp = buf_vnode(bp);

	if (bap) {
		throttle_tier = bufattr_throttled(bap);
	}

	/*
	 * If the buf doesn't have a vnode for some reason, pretend
	 * we never saw the request at all.
	 */
	if (dev == nulldev()) {
		BC_cache->c_strategy(bp);
		BC_ADD_STAT(strategy_unknown, 1);
		BC_ADD_STAT(strategy_unknown_bytes, nbytes);
		return 0;
	}

	if (vp && vnode_isswap(vp)) {
		is_swap = 1;
		goto bypass;
	}

	if (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE) {
		pid = proc_selfpid();

		if (vp) {
			is_shared = vnode_isdyldsharedcache(vp);
		}
	}

	if (vp && (vnode_israge(vp))) {
		dont_cache = 1;
	}

	/*
	 * If the cache is not active, bypass the request.  We may
	 * not be able to fill it, but we still want to record it.
	 */
	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) {
		goto bypass;
	}

	/*
	 * In order to prevent the cache cleanup code from racing with us
	 * when we sleep, we track the number of strategy calls in flight.
	 */
	OSAddAtomic(1, &BC_cache->c_strategycalls);
	LOCK_CACHE_R();
	during_cache = 1; /* we're included in the in_flight count */

	/* Get cache mount asap for use in case we bypass */
	cm_idx = BC_find_cache_mount(dev);

	if (cm_idx != -1) {
		disk_offset = CB_BLOCK_TO_BYTE(BC_cache->c_mounts + cm_idx, blkno);
	}

	if (BC_cache->c_take_detailed_stats) {
		take_detailed_stats = 1;
		BC_ADD_STAT(strategy_calls, 1);
		if (throttle_tier) {
			BC_ADD_STAT(strategy_throttled, 1);
		}
#ifdef BC_DEBUG
//		if (dont_cache) {
//			char procname[128];
//			proc_selfname(procname, sizeof(procname));
//			const char* filename = vp ? vnode_getname(vp) : NULL;
//			debug("not recording %s%s from app %s for file %s (disk block 0x%llx) which is%s throttled", (vp && vnode_israge(vp)) ? "rapid age " : "", (bufflags & B_READ) ? "read" : "write", procname, filename?:"unknown", blkno, (bufflags & B_THROTTLED_IO) ? "" : " not");
"" : " not"); 1754// if (filename) { 1755// vnode_putname(filename); 1756// } 1757// } 1758#endif 1759 } 1760 1761 /* 1762 * if we haven't seen this mount before, pass it off 1763 * 1764 * If this is a mount we want to cache, we'll kick it off when 1765 * adding to our history 1766 */ 1767 if (cm_idx == -1) { 1768 /* don't have a mount for this IO */ 1769 if (take_detailed_stats) 1770 BC_ADD_STAT(strategy_noncached_mount, 1); 1771 goto bypass; 1772 } 1773 1774 /* Get the info from the mount we need */ 1775 LOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1776 1777 if (BC_cache->c_mounts[cm_idx].cm_disk && 1778 (BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_HAS_THREAD) && 1779 !(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_ISSUE_LOWPRI)) { 1780 during_io = 1; /* for statistics */ 1781 if (take_detailed_stats) { 1782 BC_ADD_STAT(strategy_duringio, 1); 1783 } 1784 } 1785 1786 if (BC_cache->c_mounts[cm_idx].cm_state != CM_READY) { 1787 /* the mount has been aborted, treat it like a missing mount */ 1788 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1789 cm_idx = -1; 1790 1791 if (take_detailed_stats) 1792 BC_ADD_STAT(strategy_noncached_mount, 1); 1793 goto bypass; 1794 } 1795 1796 /* if it's not a read, pass it off */ 1797 if ( !(bufflags & B_READ)) { 1798 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1799 if (take_detailed_stats) 1800 BC_ADD_STAT(strategy_nonread, 1); 1801 goto bypass; 1802 } 1803 1804 if ((nbytes % BC_cache->c_mounts[cm_idx].cm_blocksize) != 0) { 1805 debug("IO with non-blocksize multiple bytes (%d %% %lld = %lld)", nbytes, BC_cache->c_mounts[cm_idx].cm_blocksize, (nbytes % BC_cache->c_mounts[cm_idx].cm_blocksize)); 1806 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1807 if (take_detailed_stats) 1808 BC_ADD_STAT(strategy_nonblocksize, 1); 1809 goto bypass; 1810 } 1811 1812 if (take_detailed_stats) { 1813 BC_ADD_STAT(extent_lookups, 1); 1814 BC_ADD_STAT(requested_bytes, nbytes); 1815 if (cm_idx < STAT_MOUNTMAX) { 1816 BC_ADD_STAT(requested_bytes_m[cm_idx], nbytes); 1817 } 1818 } 1819 1820 /* 1821 * Look for a cache extent containing this request. 1822 */ 1823 num_extents = 0; 1824 pce = BC_find_extent(BC_cache->c_mounts + cm_idx, disk_offset, nbytes, 1, &num_extents); 1825 if (pce == NULL) { 1826#ifdef BC_EXTRA_DEBUG 1827 if (take_detailed_stats && !dont_cache) { 1828 char procname[128]; 1829 proc_selfname(procname, sizeof(procname)); 1830 const char* filename = vp ? 
vnode_getname(vp) : NULL; 1831 if (num_extents > 0) { 1832 xdebug("Missed IO from app %s for file %s (disk offset 0x%llx) (intersected, though)", procname, filename?:"unknown", blkno); 1833 } else { 1834 xdebug("Missed IO from app %s for file %s (disk offset 0x%llx)", procname, filename?:"unknown", blkno); 1835 } 1836 if (filename) { 1837 vnode_putname(filename); 1838 } 1839 } 1840#endif 1841 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1842 goto bypass; 1843 } 1844 1845 assert(num_extents > 0); 1846 1847 cache_hit = 1; 1848 1849 if (take_detailed_stats) { 1850 BC_ADD_STAT(extent_hits, 1); 1851 if (during_io) 1852 BC_ADD_STAT(hit_duringio, 1); 1853 if (num_extents > 1) 1854 BC_ADD_STAT(hit_multiple, 1); 1855 } 1856 1857 pcontaining_extents = BC_MALLOC(num_extents * sizeof(*pcontaining_extents), M_TEMP, M_WAITOK | M_ZERO); 1858 if (pcontaining_extents == NULL) { 1859 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1860 if (take_detailed_stats) 1861 BC_ADD_STAT(hit_failure, 1); 1862 goto bypass; 1863 } 1864 memcpy(pcontaining_extents, pce, num_extents * sizeof(*pcontaining_extents)); 1865 1866 is_ssd = (BC_cache->c_mounts[cm_idx].cm_disk && 1867 (BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_IS_SSD)); 1868 1869 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 1870 1871 /* if any extent is aborted, bypass immediately */ 1872 status = extents_status(pcontaining_extents, num_extents); 1873 if (status & CE_ABORTED) { 1874 if (take_detailed_stats) 1875 BC_ADD_STAT(hit_aborted, 1); 1876 goto bypass; 1877 } 1878 1879#ifndef EMULATE_ONLY 1880 1881 did_block = 0; 1882 if (! (status & CE_IODONE)) { 1883 1884 if (is_ssd) { 1885 /* Don't delay IOs on SSD since cut-throughs aren't harmful */ 1886 unfilled = 1; 1887 goto bypass; 1888 } 1889 1890 if (status & CE_LOWPRI) { 1891 /* Don't wait for low priority extents */ 1892 if (take_detailed_stats) 1893 BC_ADD_STAT(strategy_unfilled_lowpri, 1); 1894 cache_hit = 0; 1895 goto bypass; 1896 } 1897 1898 /* 1899 * wait for all the extents to finish 1900 */ 1901 microtime(&blocked_start_time); 1902 for (ce_idx = 0; 1903 ce_idx < num_extents; 1904 ce_idx++) { 1905 1906 ce = pcontaining_extents[ce_idx]; 1907 LOCK_EXTENT(ce); 1908 if (! (ce->ce_flags & (CE_ABORTED | CE_IODONE))) { 1909 did_block = 1; 1910 UNLOCK_CACHE_R(); 1911 /* 1912 * We need to release the cache lock since we will msleep in wait_for_extent. 1913 * If we don't release the cache read lock and someone wants to add a new playlist, 1914 * they go to grab the cache write lock, and everybody stalls for the msleep timeout. 1915 * 1916 * We are guaranteed that the cache won't disappear during this time since 1917 * the cleanup code waits for all the strategy calls (this function) to complete 1918 * by checking BC_cache->c_strategycalls, which we incremented above. 1919 * 1920 * The cache may have grown behind our backs, however. Mount pointers are not valid, 1921 * but mount indexes still are. Extent pointers are still valid since the extent list 1922 * is not modified (this is a requirement to be able to msleep with the extent's lock). 1923 */ 1924 wait_for_extent(ce, blocked_start_time); 1925 1926 /* Cache lock must be grabbed without holding any locks to avoid deadlock rdar://8626772 */ 1927 UNLOCK_EXTENT(ce); 1928 LOCK_CACHE_R(); 1929 LOCK_EXTENT(ce); 1930 1931 if (! 
(ce->ce_flags & (CE_ABORTED | CE_IODONE))) { 1932 /* strategy call timed out waiting for the extent */ 1933 if (take_detailed_stats) 1934 BC_ADD_STAT(strategy_timedout, 1); 1935#ifdef BC_DEBUG 1936 microtime(&blocked_end_time); 1937 timersub(&blocked_end_time, 1938 &blocked_start_time, 1939 &blocked_end_time); 1940 char procname[128]; 1941 proc_selfname(procname, sizeof(procname)); 1942 const char* filename = vp ? vnode_getname(vp) : NULL; 1943 debug("Strategy routine timed out app %s waiting on file %s (disk offset 0x%llx) after %lu.%03d seconds", procname, filename?:"unknown", disk_offset, blocked_end_time.tv_sec, blocked_end_time.tv_usec / 1000); 1944 if (filename) { 1945 vnode_putname(filename); 1946 } 1947#endif 1948 goto bypass; 1949 } 1950 UNLOCK_EXTENT(ce); 1951 ce = NULL; 1952 1953 /* Check every time we sleep so we can avoid more sleeping if unnecessary */ 1954 status = extents_status(pcontaining_extents, num_extents); 1955 if (status & CE_ABORTED) { 1956 if (take_detailed_stats) 1957 BC_ADD_STAT(hit_aborted, 1); 1958 goto bypass; 1959 } else if (status & CE_IODONE) { 1960 break; 1961 } 1962 } else { 1963 UNLOCK_EXTENT(ce); 1964 ce = NULL; 1965 } 1966 } 1967 if (take_detailed_stats && did_block) { 1968 BC_ADD_STAT(strategy_blocked, 1); 1969 microtime(&blocked_end_time); 1970 timersub(&blocked_end_time, 1971 &blocked_start_time, 1972 &blocked_end_time); 1973 int ms_blocked = (blocked_end_time.tv_sec * 1000) + (blocked_end_time.tv_usec / (1000)); 1974#ifdef BC_DEBUG 1975 char procname[128]; 1976 proc_selfname(procname, sizeof(procname)); 1977 const char* filename = vp ? vnode_getname(vp) : NULL; 1978 debug("Strategy routine blocked app %s waiting on file %s (disk offset 0x%llx) for %lu.%03d seconds", procname, filename?:"unknown", disk_offset, blocked_end_time.tv_sec, blocked_end_time.tv_usec / 1000); 1979 if (filename) { 1980 vnode_putname(filename); 1981 } 1982#endif 1983 BC_ADD_STAT(strategy_time_blocked, ms_blocked); 1984 if (BC_cache->c_stats.ss_strategy_time_longest_blocked < ms_blocked) { 1985 BC_cache->c_stats.ss_strategy_time_longest_blocked = ms_blocked; 1986 } 1987 } 1988 } 1989 1990 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, (did_block ? DBG_BC_IO_HIT_STALLED : DBG_BC_IO_HIT)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp), 0, 0, 0, 0); 1991 1992 /* 1993 * buf_map will do its best to provide access to this 1994 * buffer via a kernel accessible address 1995 * if it fails, we can still fall through. 1996 */ 1997 if (buf_map(bp, &buf_map_p)) { 1998 /* can't map, let someone else worry about it */ 1999 if (take_detailed_stats) 2000 BC_ADD_STAT(hit_failure, 1); 2001 goto bypass; 2002 } 2003 buf_extent_p = buf_map_p; 2004 2005 kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid); 2006 if (kret != KERN_SUCCESS) { 2007 debug("vnode_getwithvid failed - %d", kret); 2008 if (take_detailed_stats) 2009 BC_ADD_STAT(hit_failure, 1); 2010 buf_unmap(bp); 2011 goto bypass; 2012 } 2013 2014 /* 2015 * fill in bp from the extents 2016 */ 2017 for (ce_idx = 0; 2018 ce_idx < num_extents; 2019 ce_idx++) { 2020 u_int64_t nbytes_thisextent; 2021 u_int64_t diskoffset_thisextent; 2022 off_t cacheoffset_thisextent; 2023 2024 ce = pcontaining_extents[ce_idx]; 2025 LOCK_EXTENT(ce); 2026 2027 /* 2028 * Check that the extent wasn't aborted while we were unlocked. 
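/*
 * Illustrative sketch, not part of the original kext: the wait loop above is
 * an instance of a common pattern -- drop the outer (cache) lock before
 * sleeping on the inner (extent) condition, re-acquire both locks in the
 * original order afterwards, and re-validate the predicate because the world
 * may have changed while we slept. A pthread model of the same shape; all
 * names (outer_lock, extent_done, ...) are hypothetical.
 */
#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t outer_lock = PTHREAD_RWLOCK_INITIALIZER; /* ~ cache lock  */
static pthread_mutex_t  inner_lock = PTHREAD_MUTEX_INITIALIZER;  /* ~ extent lock */
static pthread_cond_t   inner_cond = PTHREAD_COND_INITIALIZER;
static bool extent_done;    /* ~ CE_IODONE  */
static bool extent_aborted; /* ~ CE_ABORTED */

/* Returns true if the data became available, false if it was aborted. */
static bool
wait_for_inner(void)
{
	bool ok;

	pthread_rwlock_rdlock(&outer_lock);
	pthread_mutex_lock(&inner_lock);
	while (!extent_done && !extent_aborted) {
		/* 1. drop the outer lock so writers are not stalled while we sleep */
		pthread_rwlock_unlock(&outer_lock);

		/* 2. sleep; the condvar atomically releases/reacquires inner_lock */
		pthread_cond_wait(&inner_cond, &inner_lock);

		/* 3. re-take the locks in outer-then-inner order to avoid deadlock */
		pthread_mutex_unlock(&inner_lock);
		pthread_rwlock_rdlock(&outer_lock);
		pthread_mutex_lock(&inner_lock);

		/* 4. the loop condition re-checks the predicate after every wakeup */
	}
	ok = extent_done && !extent_aborted;
	pthread_mutex_unlock(&inner_lock);
	pthread_rwlock_unlock(&outer_lock);
	return ok;
}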
2029 */ 2030 if (ce->ce_flags & CE_ABORTED) { 2031 if (take_detailed_stats) 2032 BC_ADD_STAT(hit_aborted, 1); 2033 vnode_put(BC_cache->c_vp); 2034 buf_unmap(bp); 2035 goto bypass; 2036 } 2037 2038 assert(ce->ce_flags & CE_IODONE); 2039 2040 diskoffset_thisextent = MAX(ce->ce_diskoffset, disk_offset); 2041 nbytes_thisextent = MIN(disk_offset + nbytes, ce->ce_diskoffset + ce->ce_length) - diskoffset_thisextent; 2042 cacheoffset_thisextent = ce->ce_cacheoffset + (diskoffset_thisextent - ce->ce_diskoffset); 2043 2044 /* range check against extent */ 2045 assert(diskoffset_thisextent < ce->ce_diskoffset + ce->ce_length); 2046 assert(diskoffset_thisextent >= ce->ce_diskoffset); 2047 assert(nbytes_thisextent <= (ce->ce_diskoffset + ce->ce_length) - diskoffset_thisextent); 2048 assert(cacheoffset_thisextent < ce->ce_cacheoffset + ce->ce_length); 2049 assert(cacheoffset_thisextent >= ce->ce_cacheoffset); 2050 assert(nbytes_thisextent <= (ce->ce_cacheoffset + ce->ce_length) - cacheoffset_thisextent); 2051 2052 /* range check against buf_t */ 2053 assert(diskoffset_thisextent < disk_offset + nbytes); 2054 assert(diskoffset_thisextent >= disk_offset); 2055 assert(nbytes_thisextent <= (disk_offset + nbytes) - diskoffset_thisextent); 2056 2057 /* check our buf_map pointer */ 2058 assert(buf_extent_p - buf_map_p == diskoffset_thisextent - disk_offset); /* didn't skip any bytes */ 2059 assert(buf_map_p + nbytes >= buf_extent_p + nbytes_thisextent); /* not overflowing the buffer */ 2060 2061 /* Make sure we're including the entire buffer requested */ 2062 assert(ce_idx > 0 || disk_offset == diskoffset_thisextent); /* include the first byte */ 2063 assert(ce_idx < (num_extents - 1) || disk_offset + nbytes == diskoffset_thisextent + nbytes_thisextent); /* include the last byte */ 2064 2065 /* 2066 * Check that the requested blocks are in the buffer. 
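/*
 * Illustrative sketch, not part of the original kext: the per-extent
 * arithmetic above is an interval intersection plus a translation into the
 * cache backing store. The helper below models it for one extent; the struct
 * and function names are hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

struct extent_model {
	uint64_t disk_off;   /* ~ ce_diskoffset  */
	uint64_t length;     /* ~ ce_length      */
	uint64_t cache_off;  /* ~ ce_cacheoffset */
};

/*
 * Intersect the request [req_off, req_off + req_len) with the extent and
 * report where the overlapping bytes live in the cache. Returns the number
 * of overlapping bytes, or 0 if the ranges do not intersect.
 */
static uint64_t
overlap_in_cache(const struct extent_model *e, uint64_t req_off, uint64_t req_len,
                 uint64_t *cache_off_out)
{
	uint64_t start   = req_off > e->disk_off ? req_off : e->disk_off;
	uint64_t req_end = req_off + req_len;
	uint64_t ext_end = e->disk_off + e->length;
	uint64_t end     = req_end < ext_end ? req_end : ext_end;

	if (end <= start)
		return 0;
	*cache_off_out = e->cache_off + (start - e->disk_off);
	return end - start;
}

int
main(void)
{
	struct extent_model e = { .disk_off = 4096, .length = 8192, .cache_off = 65536 };
	uint64_t cache_off, n;

	/* a 16 KiB request starting at 0 overlaps the extent for 8192 bytes */
	n = overlap_in_cache(&e, 0, 16384, &cache_off);
	printf("%llu bytes at cache offset %llu\n",
	       (unsigned long long)n, (unsigned long long)cache_off);
	return 0;
}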
2067 */ 2068 if (!BC_blocks_present(ce, 2069 CB_BYTE_TO_BLOCK(BC_cache->c_mounts + cm_idx, diskoffset_thisextent - ce->ce_diskoffset), 2070 CB_BYTE_TO_BLOCK(BC_cache->c_mounts + cm_idx, nbytes_thisextent))) { 2071 if (take_detailed_stats) 2072 BC_ADD_STAT(hit_blkmissing, 1); 2073 vnode_put(BC_cache->c_vp); 2074 buf_unmap(bp); 2075 goto bypass; 2076 } 2077 2078 resid = nbytes_thisextent; 2079 uio = uio_create(1, cacheoffset_thisextent, UIO_SYSSPACE, UIO_READ); 2080 if (uio == NULL) { 2081 debug("couldn't allocate uio"); 2082 if (take_detailed_stats) 2083 BC_ADD_STAT(hit_failure, 1); 2084 vnode_put(BC_cache->c_vp); 2085 buf_unmap(bp); 2086 goto bypass; 2087 } 2088 2089 kret = uio_addiov(uio, CAST_USER_ADDR_T(buf_extent_p), nbytes_thisextent); 2090 if (kret != KERN_SUCCESS) { 2091 debug("couldn't add iov to uio - %d", kret); 2092 if (take_detailed_stats) 2093 BC_ADD_STAT(hit_failure, 1); 2094 vnode_put(BC_cache->c_vp); 2095 buf_unmap(bp); 2096 goto bypass; 2097 } 2098 2099 kret = cluster_copy_ubc_data(BC_cache->c_vp, uio, &resid, 0); 2100 if (kret != KERN_SUCCESS) { 2101 debug("couldn't copy ubc data - %d", kret); 2102 if (take_detailed_stats) 2103 BC_ADD_STAT(hit_failure, 1); 2104 vnode_put(BC_cache->c_vp); 2105 buf_unmap(bp); 2106 goto bypass; 2107 } 2108 2109 if (resid != 0) { 2110 /* blocks have been stolen for pageout or contiguous memory */ 2111 if (take_detailed_stats) 2112 BC_ADD_STAT(hit_stolen, 1); 2113 vnode_put(BC_cache->c_vp); 2114 buf_unmap(bp); 2115 is_stolen = 1; 2116 goto bypass; 2117 } 2118 2119 buf_extent_p += nbytes_thisextent; 2120 2121 /* discard blocks we have touched */ 2122 discards += BC_discard_bytes(ce, disk_offset, nbytes_thisextent); 2123 2124 UNLOCK_EXTENT(ce); 2125 ce = NULL; 2126 2127 /* copy was successful */ 2128 uio_free(uio); 2129 uio = NULL; 2130 } 2131 2132 vnode_put(BC_cache->c_vp); 2133 2134 buf_setresid(bp, 0); 2135 2136 /* buf_unmap will take care of all cases */ 2137 buf_unmap(bp); 2138 2139 /* complete the request */ 2140 buf_biodone(bp); 2141 2142 /* can't dereference bp after a buf_biodone has been issued */ 2143 2144#else //ifndef EMULATE_ONLY 2145 (void) kret; 2146 (void) p; 2147 (void) resid; 2148 2149 /* discard blocks we have touched */ 2150 for (ce_idx = 0; 2151 ce_idx < num_extents; 2152 ce_idx++) { 2153 ce = pcontaining_extents[ce_idx]; 2154 LOCK_EXTENT(ce); 2155 discards += BC_discard_bytes(ce, disk_offset, nbytes); 2156 UNLOCK_EXTENT(ce); 2157 ce = NULL; 2158 } 2159 2160 /* send the IO to the disk, emulate hit in statistics */ 2161 BC_cache->c_strategy(bp); 2162 2163#endif //ifndef EMULATE_ONLY else 2164 2165 if (take_detailed_stats) { 2166 BC_ADD_STAT(hit_bytes, discards); 2167 if (cm_idx < STAT_MOUNTMAX) { 2168 BC_ADD_STAT(hit_bytes_m[cm_idx], discards); 2169 } 2170 if (dont_cache) { 2171 BC_ADD_STAT(strategy_hit_nocache, 1); 2172 BC_ADD_STAT(hit_nocache_bytes, discards); 2173 } 2174 if (is_shared) { 2175 BC_ADD_STAT(hit_shared_bytes, discards); 2176 } 2177 } else { 2178 BC_ADD_STAT(hit_bytes_afterhistory, discards); 2179 } 2180 2181 BC_cache->c_num_ios_since_last_hit = 0; 2182 2183 /* we are not busy anymore */ 2184 OSAddAtomic(-1, &BC_cache->c_strategycalls); 2185 UNLOCK_CACHE_R(); 2186 2187 _FREE_ZERO(pcontaining_extents, M_TEMP); 2188 2189 if (! 
dont_cache) { 2190 /* record successful fulfilment (may block) */ 2191 BC_add_history(blkno, nbytes, pid, 1, 0, 0, is_shared, dev); 2192 } 2193 2194 /* 2195 * spec_strategy wants to know if the read has been 2196 * satisfied by the boot cache in order to avoid 2197 * throttling the thread unnecessarily. spec_strategy 2198 * will check for the special return value 0xcafefeed 2199 * to indicate that the read was satisfied by the 2200 * cache. 2201 */ 2202 2203#define IO_SATISFIED_BY_CACHE ((int)0xcafefeed) 2204 return IO_SATISFIED_BY_CACHE; 2205 2206bypass: 2207 if (ce != NULL) 2208 UNLOCK_EXTENT(ce); 2209 if (uio != NULL) 2210 uio_free(uio); 2211 _FREE_ZERO(pcontaining_extents, M_TEMP); 2212 2213 /* pass the request on */ 2214 BC_cache->c_strategy(bp); 2215 2216 /* 2217 * can't dereference bp after c_strategy has been issued 2218 * or else we race with buf_biodone 2219 */ 2220 void *bp_void = (void *)bp; // for use in ktrace 2221 bp = NULL; 2222 2223 /* not really "bypassed" if the cache is not active */ 2224 if (during_cache) { 2225 if (cm_idx != -1) { 2226 LOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 2227 if (BC_cache->c_mounts[cm_idx].cm_state == CM_READY) { 2228 discards += BC_handle_discards(BC_cache->c_mounts + cm_idx, disk_offset, nbytes); 2229 } 2230 2231 /* Check if we should throttle this IO */ 2232 if (BC_cache->c_readahead_throttles_cutthroughs && 2233 !is_swap && 2234 BC_cache->c_mounts[cm_idx].cm_disk && 2235 (BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_HAS_THREAD) && 2236 !(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_ISSUE_LOWPRI) && 2237 !(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_IS_SSD)) { 2238 /* We're currently issuing readahead for this disk. 2239 * Throttle this IO so we don't cut-through the readahead so much. 2240 */ 2241 should_throttle = 1; 2242 } 2243 2244 UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx); 2245 } 2246 if (take_detailed_stats) { 2247 BC_ADD_STAT(strategy_bypassed, 1); 2248 if (during_io) { 2249 BC_ADD_STAT(strategy_bypass_duringio, 1); 2250 } 2251 if (dont_cache) { 2252 BC_ADD_STAT(strategy_bypass_nocache, 1); 2253 BC_ADD_STAT(bypass_nocache_bytes, nbytes); 2254 if (during_io) { 2255 BC_ADD_STAT(strategy_bypass_duringio_nocache, 1); 2256 } 2257 } 2258 if (cache_hit) { 2259 BC_ADD_STAT(error_discards, discards); 2260 } else if (dont_cache) { 2261 BC_ADD_STAT(bypass_nocache_discards, discards); 2262 } else if (bufflags & B_READ) { 2263 BC_ADD_STAT(read_discards, discards); 2264 } else { 2265 BC_ADD_STAT(write_discards, discards); 2266 } 2267 } else { 2268 BC_ADD_STAT(lost_bytes_afterhistory, discards); 2269 } 2270 } 2271 2272 if (during_cache) { 2273 OSAddAtomic(-1, &BC_cache->c_strategycalls); 2274 UNLOCK_CACHE_R(); 2275 } 2276 2277 if (! is_swap) { 2278 if (! dont_cache) { 2279 is_root_disk = BC_add_history(blkno, nbytes, pid, 0, ((bufflags & B_READ) ? 0 : 1), 0, is_shared, dev); 2280 2281 if (take_detailed_stats && during_io && is_root_disk) { 2282 if (cache_hit) { 2283 if (unfilled) { 2284 BC_ADD_STAT(strategy_bypass_duringio_unfilled, 1); 2285 } else { 2286 BC_ADD_STAT(strategy_bypass_duringio_rootdisk_failure, 1); 2287 } 2288 } else if (bufflags & B_READ) { 2289 BC_ADD_STAT(strategy_bypass_duringio_rootdisk_read, 1); 2290 } else { 2291 BC_ADD_STAT(strategy_bypass_duringio_rootdisk_nonread, 1); 2292 } 2293 } 2294 } 2295 } 2296 2297 /* 2298 * Check to make sure that we're not missing the cache 2299 * too often. If we are, we're no longer providing a 2300 * performance win and the best thing would be to give 2301 * up. 
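/*
 * Illustrative sketch, not part of the original kext: the sentinel return
 * values let the layer above distinguish "served from cache" (and, further
 * down in this routine, "please throttle") from a plain 0. The caller below
 * is hypothetical -- the real consumer is spec_strategy in the kernel, which
 * is not part of this file.
 */
#include <stdio.h>

#define SKETCH_SATISFIED_BY_CACHE  ((int)0xcafefeed)   /* mirrors IO_SATISFIED_BY_CACHE  */
#define SKETCH_SHOULD_BE_THROTTLED ((int)0xcafebeef)   /* mirrors IO_SHOULD_BE_THROTTLED */

static void
hypothetical_caller(int strategy_ret)
{
	if (strategy_ret == SKETCH_SATISFIED_BY_CACHE)
		printf("hit: data already copied, skip throttling\n");
	else if (strategy_ret == SKETCH_SHOULD_BE_THROTTLED)
		printf("miss during readahead: throttle this thread\n");
	else
		printf("miss: normal I/O path\n");
}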
2302 * 2303 * Don't count throttled IOs since those aren't time 2304 * critical. (We don't want to jettison the cache just 2305 * because spotlight is indexing) 2306 */ 2307 2308 /* if this is a read, and we do have an active cache, and the read isn't throttled */ 2309 if (during_cache) { 2310 (void) is_stolen; 2311 if (is_swap /*|| is_stolen*/) { //rdar://10651288&10658086 seeing stolen pages early during boot 2312 if (is_swap) { 2313 debug("detected %s swap file, jettisoning cache", (bufflags & B_READ) ? "read from" : "write to"); 2314 } else { 2315 debug("detected stolen page, jettisoning cache"); 2316 } 2317 //rdar://9858070 Do this asynchronously to avoid deadlocks 2318 BC_terminate_cache_async(); 2319 } else if ((bufflags & B_READ) && 2320 !(throttle_tier)) { 2321 2322 struct timeval current_time; 2323 if (BC_cache->c_stats.ss_history_num_recordings < 2) { 2324 microtime(¤t_time); 2325 timersub(¤t_time, 2326 &BC_cache->c_loadtime, 2327 ¤t_time); 2328 } 2329 /* Don't start counting misses until we've started login or hit our prelogin timeout */ 2330 if (BC_cache->c_stats.ss_history_num_recordings >= 2 || current_time.tv_sec > BC_chit_prelogin_timeout_seconds) { 2331 2332 /* increase number of IOs since last hit */ 2333 OSAddAtomic(1, &BC_cache->c_num_ios_since_last_hit); 2334 2335 if (BC_cache->c_num_ios_since_last_hit >= BC_chit_max_num_IOs_since_last_hit) { 2336 /* 2337 * The hit rate is poor, shut the cache down. 2338 */ 2339 debug("hit rate below threshold (0 hits in the last %u lookups), jettisoning cache", 2340 BC_cache->c_num_ios_since_last_hit); 2341 //rdar://9858070 Do this asynchronously to avoid deadlocks 2342 BC_terminate_cache_async(); 2343 } 2344 } 2345 } 2346 } 2347 2348 if (is_swap && (! (BC_cache->c_flags & BC_FLAG_SHUTDOWN))) { 2349 /* if we're swapping, stop readahead */ 2350 debug("Detected %s swap file. Terminating readahead", (bufflags & B_READ) ? "read from" : "write to"); 2351 BC_set_flag(BC_FLAG_SHUTDOWN); 2352 } 2353 2354 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, (should_throttle ? DBG_BC_IO_MISS_CUT_THROUGH : DBG_BC_IO_MISS)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp_void), 0, 0, 0, 0); 2355 2356 if (should_throttle && throttle_tier < IOPOL_THROTTLE) { 2357 /* 2358 * We need to indicate to spec_strategy that we want to 2359 * throttle this IO to avoid cutting through readahead 2360 * too much. spec_strategy will check for the special 2361 * return value 0xcafebeef to indicate that the IO 2362 * should be throttled. 2363 */ 2364 2365 BC_ADD_STAT(strategy_forced_throttled, 1); 2366 2367#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef) 2368 return IO_SHOULD_BE_THROTTLED; 2369 } 2370 2371 return 0; 2372} 2373 2374/* 2375 * Handle a block range that needs to be discarded. 2376 * 2377 * Called with the cache mount read lock held 2378 * 2379 * Returns the number of bytes discarded 2380 */ 2381static int 2382BC_handle_discards(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length) 2383{ 2384 struct BC_cache_extent **pce, **p; 2385 int count, total_discards; 2386 2387 total_discards = 0; 2388 2389 /* 2390 * Look for an extent that we overlap. 2391 */ 2392 if ((pce = BC_find_extent(cm, offset, length, 0, NULL)) == NULL) 2393 return 0; /* doesn't affect us */ 2394 2395 /* 2396 * Discard bytes in the matched extent. 2397 */ 2398 LOCK_EXTENT(*pce); 2399 count = BC_discard_bytes((*pce), offset, length); 2400 UNLOCK_EXTENT(*pce); 2401 total_discards += count; 2402 2403 /* 2404 * Scan adjacent extents for possible overlap and discard there as well. 
2405 */ 2406 p = pce - 1; 2407 while (p >= cm->cm_pextents && 2408 BC_check_intersection((*p), offset, length)) { 2409 LOCK_EXTENT(*p); 2410 count = BC_discard_bytes((*p), offset, length); 2411 UNLOCK_EXTENT(*p); 2412 if (count == 0) 2413 break; 2414 total_discards += count; 2415 p--; 2416 } 2417 p = pce + 1; 2418 while (p < (cm->cm_pextents + cm->cm_nextents) && 2419 BC_check_intersection((*p), offset, length)) { 2420 LOCK_EXTENT(*p); 2421 count = BC_discard_bytes((*p), offset, length); 2422 UNLOCK_EXTENT(*p); 2423 if (count == 0) 2424 break; 2425 total_discards += count; 2426 p++; 2427 } 2428 2429 return total_discards; 2430} 2431 2432/* 2433 * Shut down readahead operations. 2434 */ 2435static int 2436BC_terminate_readahead(void) 2437{ 2438 int error; 2439 struct timespec timeout; 2440 timeout.tv_sec = 10; 2441 timeout.tv_nsec = 0; 2442 2443 /* 2444 * Signal the readahead thread to terminate, and wait for 2445 * it to comply. If this takes > 10 seconds, give up. 2446 */ 2447 BC_set_flag(BC_FLAG_SHUTDOWN); 2448 2449 /* 2450 * If readahead is still in progress, we have to shut it down 2451 * cleanly. This is an expensive process, but since readahead 2452 * will normally complete long before the reads it tries to serve 2453 * complete, it will typically only be invoked when the playlist 2454 * is out of synch and the cache hitrate falls below the acceptable 2455 * threshold. 2456 * 2457 * Note that if readahead is aborted, the reader thread will mark 2458 * the aborted extents and wake up any strategy callers waiting 2459 * on them, so we don't have to worry about them here. 2460 */ 2461 LOCK_READERS(); 2462 while (BC_cache->c_num_reader_threads > 0) { 2463 debug("terminating active readahead"); 2464 2465 error = msleep(&BC_cache->c_num_reader_threads, &BC_cache->c_reader_threads_lock, PRIBIO, "BC_terminate_readahead", &timeout); 2466 if (error == EWOULDBLOCK) { 2467 UNLOCK_READERS(); 2468 2469 message("timed out waiting for I/O to stop"); 2470 if (BC_cache->c_num_reader_threads == 0) { 2471 debug("but I/O has stopped!"); 2472 } 2473#ifdef BC_DEBUG 2474 Debugger("I/O Kit wedged on us"); 2475#endif 2476 /* 2477 * It might be nice to free all the pages that 2478 * aren't actually referenced by the outstanding 2479 * region, since otherwise we may be camped on a 2480 * very large amount of physical memory. 2481 * 2482 * Ideally though, this will never happen. 2483 */ 2484 return(EBUSY); /* really EWEDGED */ 2485 } 2486 } 2487 UNLOCK_READERS(); 2488 2489 return(0); 2490} 2491 2492static void 2493BC_terminate_cache_thread(void *param0, wait_result_t param1) 2494{ 2495 BC_terminate_cache(); 2496} 2497 2498/* 2499 * Start up an auxilliary thread to stop the cache so we avoid potential deadlocks 2500 */ 2501static void 2502BC_terminate_cache_async(void) 2503{ 2504 if (! (BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) { 2505 return; 2506 } 2507 2508 int error; 2509 thread_t rthread; 2510 2511 debug("Kicking off thread to terminate cache"); 2512 if ((error = kernel_thread_start(BC_terminate_cache_thread, NULL, &rthread)) == KERN_SUCCESS) { 2513 thread_deallocate(rthread); 2514 } else { 2515 message("Unable to start thread to terminate cache"); 2516 } 2517} 2518 2519/* 2520 * Terminate the cache. 2521 * 2522 * This prevents any further requests from being satisfied from the cache 2523 * and releases all the resources owned by it. 
2524 * 2525 * Must be called with no locks held 2526 */ 2527static int 2528BC_terminate_cache(void) 2529{ 2530 int retry, cm_idx, j, ce_idx, cel_idx; 2531 2532 BC_terminate_readahead(); 2533 2534 /* can't shut down if readahead is still active */ 2535 if (BC_cache->c_num_reader_threads > 0) { 2536 debug("cannot terminate cache while readahead is in progress"); 2537 return(EBUSY); 2538 } 2539 2540 LOCK_CACHE_R(); 2541 2542 LOCK_HANDLERS(); 2543 if (!BC_clear_flag(BC_FLAG_CACHEACTIVE)) { 2544 /* can't shut down if we're not active */ 2545 debug("cannot terminate cache when not active"); 2546 UNLOCK_HANDLERS(); 2547 UNLOCK_CACHE_R(); 2548 return(ENXIO); 2549 } 2550 2551 bootcache_contains_block = NULL; 2552 2553 debug("terminating cache..."); 2554 2555 /* if we're no longer recording history also, disconnect our strategy routine */ 2556 BC_check_handlers(); 2557 UNLOCK_HANDLERS(); 2558 2559 /* 2560 * Mark all extents as FREED. This will notify any sleepers in the 2561 * strategy routine that the extent won't have data for them. 2562 */ 2563 for (cel_idx = 0; 2564 cel_idx < BC_cache->c_nextentlists; 2565 cel_idx++) { 2566 for (ce_idx = 0; 2567 ce_idx < BC_cache->c_nextents[cel_idx]; 2568 ce_idx++) { 2569 struct BC_cache_extent *ce = BC_cache->c_extentlists[cel_idx] + ce_idx; 2570 LOCK_EXTENT(ce); 2571 /* 2572 * Track unused bytes 2573 */ 2574 if (ce->ce_blockmap != NULL && BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize != 0) { 2575 for (j = 0; j < howmany(ce->ce_length, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize); j++) { 2576 if (CB_BLOCK_PRESENT(ce, j)) 2577 BC_ADD_STAT(spurious_bytes, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize); 2578 } 2579 } 2580 2581 BC_teardown_extent(ce); 2582 UNLOCK_EXTENT(ce); 2583 wakeup(ce); 2584 } 2585 } 2586 2587 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 2588 struct BC_cache_mount *cm = BC_cache->c_mounts + cm_idx; 2589 LOCK_MOUNT_W(cm); 2590 BC_teardown_mount(cm); 2591 UNLOCK_MOUNT_W(cm); 2592 } 2593 2594 2595 /* 2596 * It is possible that one or more callers are asleep in the 2597 * strategy routine (eg. if readahead was terminated early, 2598 * or if we are called off the timeout). 2599 * Check the count of callers in the strategy code, and sleep 2600 * until there are none left (or we time out here). Note that 2601 * by clearing BC_FLAG_CACHEACTIVE above we prevent any new 2602 * strategy callers from touching the cache, so the count 2603 * must eventually drain to zero. 2604 */ 2605 retry = 0; 2606 while (BC_cache->c_strategycalls > 0) { 2607 tsleep(BC_cache, PRIBIO, "BC_terminate_cache", 10); 2608 if (retry++ > 50) { 2609 message("could not terminate cache, timed out with %d caller%s in BC_strategy", 2610 (int) BC_cache->c_strategycalls, 2611 BC_cache->c_strategycalls == 1 ? "" : "s"); 2612 UNLOCK_CACHE_R(); 2613 return(EBUSY); /* again really EWEDGED */ 2614 } 2615 } 2616 2617 if (! LOCK_CACHE_R_TO_W()) { 2618 /* We shouldn't get here. This is the only LOCK_CACHE_R_TO_W call, 2619 * so this implies someone is terminating the cache in parallel with us, 2620 * but we check for exclusivity by clearing BC_FLAG_CACHEACTIVE. 
2621 */ 2622 message("Unable to upgrade cache lock to free resources"); 2623 return ENXIO; 2624 } 2625 2626 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 2627 struct BC_cache_mount *cm = BC_cache->c_mounts + cm_idx; 2628 if (cm->cm_disk != NULL) { 2629 for (j = cm_idx + 1; j < BC_cache->c_nmounts; j++) { 2630 if (BC_cache->c_mounts[j].cm_disk == cm->cm_disk) { 2631 BC_cache->c_mounts[j].cm_disk = NULL; 2632 } 2633 } 2634 BC_teardown_disk(cm->cm_disk); 2635 lck_mtx_destroy(&cm->cm_disk->cd_lock, BC_cache->c_lckgrp); 2636 _FREE_ZERO(cm->cm_disk, M_TEMP); 2637 } 2638 } 2639 2640 BC_free_pagebuffer(); 2641 BC_cache->c_cachesize = 0; 2642 2643 /* free memory held by extents and mounts */ 2644 for (cel_idx = 0; 2645 cel_idx < BC_cache->c_nextentlists; 2646 cel_idx++) { 2647 for (ce_idx = 0; 2648 ce_idx < BC_cache->c_nextents[cel_idx]; 2649 ce_idx++) { 2650 lck_mtx_destroy(&BC_cache->c_extentlists[cel_idx][ce_idx].ce_lock, BC_cache->c_lckgrp); 2651 } 2652 _FREE_ZERO(BC_cache->c_extentlists[cel_idx], M_TEMP); 2653 } 2654 _FREE_ZERO(BC_cache->c_extentlists, M_TEMP); 2655 _FREE_ZERO(BC_cache->c_nextents, M_TEMP); 2656 BC_cache->c_nextentlists = 0; 2657 2658 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 2659 lck_rw_destroy(&BC_cache->c_mounts[cm_idx].cm_lock, BC_cache->c_lckgrp); 2660 } 2661 _FREE_ZERO(BC_cache->c_mounts, M_TEMP); 2662 BC_cache->c_nmounts = 0; 2663 2664 UNLOCK_CACHE_W(); 2665 2666 /* record stop time */ 2667 struct timeval endtime; 2668 microtime(&endtime); 2669 timersub(&endtime, 2670 &BC_cache->c_cache_starttime, 2671 &endtime); 2672 BC_cache->c_stats.ss_cache_time += (u_int) endtime.tv_sec * 1000 + (u_int) endtime.tv_usec / 1000; 2673 return(0); 2674} 2675 2676/* 2677 * Terminate history recording. 2678 * 2679 * This stops us recording any further history events. 2680 */ 2681static int 2682BC_terminate_history(void) 2683{ 2684 struct BC_history_mount_device *hmd; 2685 LOCK_HANDLERS(); 2686 if (!BC_clear_flag(BC_FLAG_HISTORYACTIVE)) { 2687 /* can't shut down if we're not active */ 2688 debug("cannot terminate history when not active"); 2689 UNLOCK_HANDLERS(); 2690 return(ENXIO); 2691 } 2692 2693 debug("terminating history collection..."); 2694 2695 /* if the cache is no longer active also, disconnect our strategy routine */ 2696 BC_check_handlers(); 2697 UNLOCK_HANDLERS(); 2698 2699 /* 2700 * Kill the timeout handler; we don't want it going off 2701 * now. 2702 */ 2703 untimeout(BC_timeout_history, NULL); 2704 2705 if (BC_cache->c_take_detailed_stats) { 2706 BC_cache->c_stats.ss_history_mount_no_uuid = 0; 2707 for (hmd = BC_cache->c_history_mounts; hmd != NULL; hmd = hmd->hmd_next) 2708 if (uuid_is_null(hmd->hmd_mount.hm_uuid)) 2709 BC_ADD_STAT(history_mount_no_uuid, 1); 2710 } 2711 2712 /* record stop time */ 2713 if (BC_cache->c_take_detailed_stats) { 2714 struct timeval endtime; 2715 /* record stop time */ 2716 microtime(&endtime); 2717 timersub(&endtime, 2718 &BC_cache->c_history_starttime, 2719 &endtime); 2720 BC_ADD_STAT(history_time, (u_int) endtime.tv_sec * 1000 + (u_int) endtime.tv_usec / 1000); 2721 } 2722 2723 BC_cache->c_take_detailed_stats = 0; 2724 2725 return(0); 2726} 2727 2728/* 2729 * Check our strategy handlers and make sure they are set/cleared depending on our current state. 2730 * 2731 * Should be called after changing BC_FLAG_CACHEACTIVE or BC_FLAG_HISTORYACTIVE. 2732 * Called with the handlers lock held. 
2733 */ 2734static void 2735BC_check_handlers(void) 2736{ 2737 if (BC_cache->c_devsw == NULL || 2738 BC_cache->c_strategy == NULL || 2739 BC_cache->c_close == NULL) { 2740 debug("Cannot check device handlers: cache not setup properly"); 2741 return; 2742 } 2743 2744 /* if the cache or history recording is active, make sure we've captured the appropriate device handler routines */ 2745 if ((BC_cache->c_flags & BC_FLAG_CACHEACTIVE) || 2746 (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) { 2747 2748 if (BC_cache->c_devsw->d_strategy != (strategy_fcn_t*) BC_strategy || 2749 BC_cache->c_devsw->d_close != BC_close) { 2750 2751 debug("connecting handlers..."); 2752 2753 /* connect the strategy and close routines */ 2754 BC_cache->c_devsw->d_strategy = (strategy_fcn_t*) BC_strategy; 2755 BC_cache->c_devsw->d_close = BC_close; 2756 } 2757 } else { 2758 if (BC_cache->c_devsw->d_strategy != BC_cache->c_strategy || 2759 BC_cache->c_devsw->d_close != BC_cache->c_close) { 2760 2761 debug("disconnecting handlers..."); 2762 2763 /* disconnect the strategy and close routines */ 2764 BC_cache->c_devsw->d_strategy = BC_cache->c_strategy; 2765 BC_cache->c_devsw->d_close = BC_cache->c_close; 2766 } 2767 } 2768} 2769 2770/* 2771 * Setup a cache extent with the parameters given by the playlist entry 2772 * 2773 * Returns 0 on success 2774 */ 2775static int 2776BC_setup_extent(struct BC_cache_extent *ce, const struct BC_playlist_entry *pe, int batch_adjustment, int cache_mount_idx) 2777{ 2778 lck_mtx_init(&ce->ce_lock, BC_cache->c_lckgrp, 2779 LCK_ATTR_NULL); 2780 ce->ce_diskoffset = pe->pe_offset; 2781 ce->ce_mount_idx = cache_mount_idx; 2782 ce->ce_length = pe->pe_length; 2783#ifdef IGNORE_BATCH 2784 ce->ce_batch = 0; 2785#else 2786 ce->ce_batch = pe->pe_batch; 2787#endif 2788 ce->ce_batch += batch_adjustment; 2789 if (ce->ce_batch > BC_MAXBATCHES) { 2790 ce->ce_batch = BC_MAXBATCHES; /* cap batch number */ 2791 } 2792 ce->ce_cacheoffset = 0; 2793 ce->ce_blockmap = NULL; 2794 ce->ce_flags = 0; 2795 if (pe->pe_flags & BC_PE_LOWPRIORITY) { 2796 ce->ce_flags |= CE_LOWPRI; 2797 } 2798 if (pe->pe_flags & BC_PE_SHARED) { 2799 ce->ce_flags |= CE_SHARED; 2800 } 2801 2802 /* track highest batch number for this playlist */ 2803 if (ce->ce_batch >= BC_cache->c_batch_count) { 2804 BC_cache->c_batch_count = ce->ce_batch + 1; 2805 // debug("Largest batch is now %d from extent with disk offset %llu", BC_cache->c_batch_count, ce->ce_diskoffset); 2806 } 2807 2808 return 0; 2809} 2810 2811/* 2812 * The blocksize is initialised from the first playlist read, the statistics 2813 * structure, or it can be pre-set by the caller. Once set, only playlists with 2814 * matching sizes can be read/written. 
2815 */ 2816#define BLOCK_ROUNDDOWN(cm, x) (((x) / (cm)->cm_blocksize) * (cm)->cm_blocksize) 2817#define BLOCK_ROUNDUP(cm, x) ((((x) + ((cm)->cm_blocksize - 1)) / (cm)->cm_blocksize) * (cm)->cm_blocksize) 2818/* 2819 * optional for power-of-two blocksize roundup: 2820 * (((x) + ((cm)->cm_blocksize - 1)) & (~((cm)->cm_blocksize - 1))) 2821 */ 2822 2823/* 2824 * Fill in a cache extent now that its mount has been filled in 2825 * 2826 * Called with the extent lock held 2827 * 2828 * Returns 0 on success 2829 */ 2830static int 2831BC_fill_in_extent(struct BC_cache_extent *ce) 2832{ 2833 int numblocks, roundsize, i; 2834 u_int64_t end; 2835 2836 if (ce->ce_flags & CE_ABORTED || 2837 ce->ce_length == 0) { 2838 return 1; 2839 } 2840 2841 struct BC_cache_mount *cm = BC_cache->c_mounts + ce->ce_mount_idx; 2842 2843 int blocksize = cm->cm_blocksize; 2844 2845 if (0 == blocksize) { 2846 return 1; 2847 } 2848 2849 roundsize = roundup(ce->ce_length, PAGE_SIZE); 2850 2851 /* make sure we're on block boundaries */ 2852 end = ce->ce_diskoffset + roundsize; 2853 ce->ce_diskoffset = BLOCK_ROUNDDOWN(cm, ce->ce_diskoffset); 2854 ce->ce_length = BLOCK_ROUNDUP(cm, end) - ce->ce_diskoffset; 2855 2856 /* make sure we didn't grow our pagebuffer size since is's already been allocated */ 2857 if (roundup(ce->ce_length, PAGE_SIZE) > roundsize) { 2858 debug("Clipped extent %llu by a page", ce->ce_diskoffset); 2859 BC_ADD_STAT(extents_clipped, 1); 2860 ce->ce_length = roundsize; 2861 } 2862 2863 numblocks = howmany(ce->ce_length, blocksize); 2864 2865 ce->ce_blockmap = BC_MALLOC(howmany(numblocks, (CB_MAPFIELDBITS / CB_MAPFIELDBYTES)), 2866 M_TEMP, M_WAITOK | M_ZERO); 2867 if (!ce->ce_blockmap) { 2868 message("can't allocate extent blockmap for %d blocks of %d bytes", numblocks, howmany(numblocks, (CB_MAPFIELDBITS / CB_MAPFIELDBYTES))); 2869 return 1; 2870 } 2871 2872 for (i = 0; i < howmany(ce->ce_length, blocksize); i++) { 2873 CB_MARK_BLOCK_PRESENT((ce), i); 2874 } 2875 2876 return 0; 2877} 2878 2879static void 2880BC_teardown_extent(struct BC_cache_extent *ce) 2881{ 2882 ce->ce_flags |= CE_ABORTED; 2883 _FREE_ZERO(ce->ce_blockmap, M_TEMP); 2884} 2885 2886/* 2887 * Setup a cache disk. 2888 * Returns 0 on success 2889 */ 2890static int 2891BC_setup_disk(struct BC_cache_disk *cd, u_int64_t disk_id, int is_ssd) 2892{ 2893 static int next_disk_num; 2894 lck_mtx_init(&cd->cd_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 2895 cd->cd_disk_id = disk_id; 2896 cd->cd_disk_num = next_disk_num++; 2897 cd->cd_flags = 0; 2898 cd->cd_nmounts = 0; 2899 cd->cd_batch = 0; 2900 2901 if (is_ssd) { 2902 cd->cd_flags |= CD_IS_SSD; 2903 } 2904 2905 debug("Setup disk 0x%llx as disk num %d%s", disk_id, cd->cd_disk_num, (cd->cd_flags & CD_IS_SSD) ? " (ssd)" : ""); 2906 2907 return (0); 2908} 2909 2910/* 2911 * Teardown a cache disk. 2912 */ 2913static void 2914BC_teardown_disk(struct BC_cache_disk *cd) 2915{ 2916 /* Nothing to do */ 2917} 2918 2919 2920/* 2921 * Check if a mount has become available for readahead 2922 * 2923 * If so, make sure the reader thread picks up this mount's IOs. 2924 * If there is no reader thread for the mount's disk, kick one off. 
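/*
 * Illustrative sketch, not part of the original kext: the alignment in
 * BC_fill_in_extent() above first page-rounds the original length (that is
 * the space already reserved in the page buffer), then snaps the extent to
 * block boundaries, and finally clips it back if the block rounding would
 * need more pages than were reserved. Standalone model with hypothetical
 * names and a worked example:
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ 4096u

static uint64_t round_down(uint64_t x, uint64_t a) { return (x / a) * a; }
static uint64_t round_up(uint64_t x, uint64_t a)   { return ((x + a - 1) / a) * a; }

static void
align_extent(uint64_t *off, uint64_t *len, uint64_t blocksize)
{
	uint64_t reserved = round_up(*len, PAGE_SZ);  /* page buffer already sized */
	uint64_t end = *off + reserved;

	*off = round_down(*off, blocksize);
	*len = round_up(end, blocksize) - *off;

	/* don't grow past what was reserved in the page buffer */
	if (round_up(*len, PAGE_SZ) > reserved)
		*len = reserved;
}

int
main(void)
{
	uint64_t off = 10000, len = 3000;

	align_extent(&off, &len, 512);
	/* prints: offset 9728 length 4096 (clipped back to one reserved page) */
	printf("offset %llu length %llu\n",
	       (unsigned long long)off, (unsigned long long)len);
	return 0;
}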
2925 * 2926 * Called with the cache mount read lock held 2927 */ 2928static void 2929BC_mount_available(struct BC_cache_mount *cm) 2930{ 2931#ifdef EMULATE_ONLY 2932 int i; 2933 for (i = 0; i < cm->cm_nextents; i++) 2934 cm->cm_pextents[i]->ce_flags |= CE_IODONE; 2935#else 2936 int i, error; 2937 thread_t rthread; 2938 if (cm->cm_state != CM_READY) { 2939 /* not ready */ 2940 return; 2941 } 2942 2943 struct BC_cache_disk *cd = cm->cm_disk; 2944 LOCK_DISK(cd); 2945 cd->cd_nmounts++; 2946 LOCK_READERS(); 2947 if (!(BC_cache->c_flags & BC_FLAG_SHUTDOWN)) { 2948 if (cd->cd_flags & CD_HAS_THREAD) { 2949 /* Make sure the thread is not issuing lowpriority IO */ 2950 if (cd->cd_flags & CD_ISSUE_LOWPRI) { 2951 debug("Interrupting low-priority thread for new mount %s", uuid_string(cm->cm_uuid)); 2952 cd->cd_flags &= (~CD_ISSUE_LOWPRI); 2953 /* TODO: Unthrottle the readahead thread here rather than waiting out its current throttle delay */ 2954 } 2955 UNLOCK_READERS(); 2956 UNLOCK_DISK(cd); 2957 return; 2958 } 2959 2960 /* Make sure we issue regular IOs for the new playlist in case we were issuing low-priority previously */ 2961 cd->cd_flags &= (~CD_ISSUE_LOWPRI); 2962 2963 debug("Kicking off reader thread for disk %d for mount %s", cd->cd_disk_num, uuid_string(cm->cm_uuid)); 2964 if ((error = kernel_thread_start(BC_reader_thread, cd, &rthread)) == KERN_SUCCESS) { 2965 thread_deallocate(rthread); 2966 BC_cache->c_num_reader_threads++; 2967 BC_ADD_STAT(readahead_threads, 1); 2968 cd->cd_flags |= CD_HAS_THREAD; 2969 UNLOCK_READERS(); 2970 UNLOCK_DISK(cd); 2971 return; 2972 } 2973 2974 message("Unable to start reader thread for disk %d: %d", cd->cd_disk_num, error); 2975 } 2976 UNLOCK_READERS(); 2977 UNLOCK_DISK(cd); 2978 2979 /* 2980 * Getting here indicates some failure. 2981 * 2982 * Mark all extents as aborted. This will notify any sleepers in the 2983 * strategy routine that the extent won't have data for them. 2984 */ 2985 for (i = 0; i < cm->cm_nextents; i++) { 2986 LOCK_EXTENT(cm->cm_pextents[i]); 2987 BC_teardown_extent(cm->cm_pextents[i]); 2988 UNLOCK_EXTENT(cm->cm_pextents[i]); 2989 wakeup(cm->cm_pextents[i]); 2990 } 2991#endif 2992} 2993 2994/* 2995 * Setup a cache mount from the playlist mount. 2996 * 2997 * Allocates cm_pextents large enough to hold pm->pm_nentries extents, 2998 * but leaves cm_nextents 0 since the array hasn't been initialized. 2999 * 3000 * Returns 0 on success 3001 */ 3002static int 3003BC_setup_mount(struct BC_cache_mount *cm, struct BC_playlist_mount* pm) 3004{ 3005 int error = 0; 3006 3007 lck_rw_init(&cm->cm_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 3008 uuid_copy(cm->cm_uuid, pm->pm_uuid); 3009 3010 /* These will be set once we've detected that this volume has been mounted */ 3011 cm->cm_dev = nulldev(); 3012 cm->cm_bp = NULL; 3013 cm->cm_devsize = 0; 3014 cm->cm_maxread = 0; 3015 cm->cm_disk = NULL; 3016 cm->cm_blocksize = 0; 3017 cm->cm_nextents = 0; 3018 3019 /* Allocate the sorted extent array. 3020 * We'll fill it in while we setup the extents. 
3021 */ 3022 if (pm->pm_nentries == 0) { 3023 message("Playlist incuded mount with 0 entries"); 3024 error = EINVAL; 3025 goto out; 3026 } 3027 3028 cm->cm_pextents = BC_MALLOC(pm->pm_nentries * sizeof(*cm->cm_pextents), M_TEMP, M_WAITOK | M_ZERO); 3029 if (cm->cm_pextents == NULL) { 3030 message("can't allocate mount's extent array (%d entries)", pm->pm_nentries); 3031 error = ENOMEM; 3032 goto out; 3033 } 3034 3035 cm->cm_state = CM_SETUP; 3036 3037out: 3038 if (error) { 3039 cm->cm_state = CM_ABORTED; 3040 _FREE_ZERO(cm->cm_pextents, M_TEMP); 3041 lck_rw_destroy(&cm->cm_lock, BC_cache->c_lckgrp); 3042 } 3043 return error; 3044} 3045 3046/* 3047 * Fill in the rest of the cache mount given the matching mount structure. 3048 * 3049 * Called with the cache mount write lock held 3050 * 3051 * Returns 0 if the mount was successfully filled in and cm_state will be CM_READY 3052 * Returns non-0 on failure or it the mount was already filled in 3053 */ 3054static int 3055BC_fill_in_mount(struct BC_cache_mount *cm, mount_t mount, vfs_context_t context) 3056{ 3057 uint64_t blkcount, max_byte_count, max_segment_count, max_segment_byte_count, max_block_count; 3058 uint32_t blksize, is_ssd; 3059 int error, mount_idx, i; 3060 u_int64_t disk_id; 3061 struct BC_cache_disk *cd; 3062 vnode_t devvp = NULLVP; 3063 3064 error = 0; 3065 3066 if (CM_SETUP != cm->cm_state) { 3067 return EINVAL; 3068 } 3069 3070 cm->cm_throttle_mask = vfs_throttle_mask(mount); 3071 disk_id = cm->cm_throttle_mask; /* using the throttle mask as a way to identify the physical disk */ 3072 /* debug("Got throttle mask %llx for mount %s", cm->cm_throttle_mask, uuid_string(cm->cm_uuid)); */ 3073 3074 devvp = vfs_devvp(mount); 3075 if (devvp == NULLVP) { 3076 message("mount %s does not have a vnode", uuid_string(cm->cm_uuid)); 3077 error = EINVAL; 3078 goto out; 3079 } 3080 3081#ifdef ROOT_DISK_ONLY 3082 if (devvp == rootvp) { 3083 BC_cache->c_root_disk_id = disk_id; 3084 if (BC_cache->c_root_disk_id == 0) { 3085 message("Root disk is 0"); 3086 } else { 3087 debug("Root disk (via cache) is 0x%llx", BC_cache->c_root_disk_id); 3088 } 3089 } else if (0 == BC_cache->c_root_disk_id) { 3090 error = EAGAIN; /* we haven't seen the root mount yet, try again later */ 3091 goto out; 3092 3093 //rdar://11653286 disk image volumes (FileVault 1) are messing with this check, so we're going back to != rather than !( & ) 3094 } else if (BC_cache->c_root_disk_id != disk_id) { 3095 debug("mount %s (disk 0x%llx) is not on the root disk (disk 0x%llx)", uuid_string(cm->cm_uuid), disk_id, BC_cache->c_root_disk_id); 3096 error = ENODEV; 3097 goto out; 3098 } 3099#endif 3100 3101 /* See if we already have a cache_disk for this disk */ 3102 for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) { 3103 if (BC_cache->c_mounts + mount_idx == cm) continue; 3104 3105 cd = BC_cache->c_mounts[mount_idx].cm_disk; 3106 3107 /* 3108 * We're not handling mounts backed by multiple disks as gracefull as we should. 3109 * Previously, this was cd->cd_disk_id == disk_id, so we had a thread per disk combination 3110 * meaning reader threads may span multiple disks and disks may have multiple reader threads. 3111 * We've only ever supported the root disk, however, so this wasn't a problem, it just missed 3112 * cases where you'd have other volumes on one of the disks you're booting from. 3113 * 3114 * Now, since we are checking for cd->cd_disk_id & disk_id, we at least include all mounts that 3115 * are on disks that the root mount uses. 
We still only have one reader thread, but we don't support 3116 * playback on composite disks, so that's not a problem yet. See rdar://10081513 3117 * 3118 * This assumes that the root mount (or, at least the mount we care most about) will always appear first 3119 * 3120 */ 3121 if (cd && (cd->cd_disk_id & disk_id)) { 3122 cm->cm_disk = cd; 3123 break; 3124 } 3125 } 3126 3127 /* If we don't already have a cache_disk for this disk, allocate one */ 3128 if (cm->cm_disk == NULL) { 3129 cd = BC_MALLOC(sizeof(*cd), M_TEMP, M_WAITOK | M_ZERO); 3130 if (cd == NULL) { 3131 message("can't allocate memory for cache disk"); 3132 error = ENOMEM; 3133 goto out; 3134 } 3135 3136 if (VNOP_IOCTL(devvp, /* vnode */ 3137 DKIOCISSOLIDSTATE, /* cmd */ 3138 (caddr_t)&is_ssd, /* data */ 3139 0, 3140 context)) /* context */ 3141 { 3142 message("can't determine if disk is a solid state disk for mount %s", uuid_string(cm->cm_uuid)); 3143 is_ssd = 0; 3144 } 3145 3146 if ((error = BC_setup_disk(cd, disk_id, is_ssd)) != 0) { 3147 _FREE_ZERO(cd, M_TEMP); 3148 message("cache disk setup failed: %d", error); 3149 goto out; 3150 } 3151 cm->cm_disk = cd; 3152 } 3153 3154 3155 if (VNOP_IOCTL(devvp, 3156 DKIOCGETBLOCKCOUNT, 3157 (caddr_t)&blkcount, 3158 0, 3159 context) 3160 || blkcount == 0) 3161 { 3162 message("can't determine device size, not checking"); 3163 blkcount = 0; 3164 } 3165 3166 if (VNOP_IOCTL(devvp, 3167 DKIOCGETBLOCKSIZE, 3168 (caddr_t)&blksize, 3169 0, 3170 context) 3171 || blksize == 0) 3172 { 3173 message("can't determine device block size for mount %s, defaulting to 512 bytes", uuid_string(cm->cm_uuid)); 3174 blksize = 512; 3175 } 3176 3177 if (PAGE_SIZE % blksize != 0) { 3178 message("PAGE_SIZE (%d) is not a multiple of block size (%d) for mount %s", PAGE_SIZE, blksize, uuid_string(cm->cm_uuid)); 3179 error = EINVAL; 3180 goto out; 3181 } 3182 3183 cm->cm_blocksize = blksize; 3184 cm->cm_devsize = blksize * blkcount; 3185 3186 /* max read size isn't larger than the max UPL size */ 3187 cm->cm_maxread = ubc_upl_maxbufsize(); 3188 3189 /* maxread = min ( maxread, MAXBYTECOUNTREAD ) */ 3190 if (0 == VNOP_IOCTL(devvp, 3191 DKIOCGETMAXBYTECOUNTREAD, 3192 (caddr_t)&max_byte_count, 3193 0, 3194 context)) { 3195 if (cm->cm_maxread > max_byte_count && max_byte_count > 0) { 3196 cm->cm_maxread = max_byte_count; 3197 debug("MAXBYTECOUNTREAD is %#llx", max_byte_count); 3198 } 3199 } 3200 3201 /* maxread = min ( maxread, MAXBLOCKCOUNTREAD * BLOCKSIZE ) */ 3202 if (0 == VNOP_IOCTL(devvp, 3203 DKIOCGETMAXBLOCKCOUNTREAD, 3204 (caddr_t)&max_block_count, 3205 0, 3206 context)) { 3207 if (cm->cm_maxread > max_block_count * cm->cm_blocksize && max_block_count > 0) { 3208 cm->cm_maxread = max_block_count * cm->cm_blocksize; 3209 debug("MAXBLOCKCOUNTREAD is %#llx, BLOCKSIZE is %#llx, (multiplied %#llx)", max_block_count, cm->cm_blocksize, max_block_count * cm->cm_blocksize, cm->cm_maxread); 3210 } 3211 } 3212 3213 /* maxread = min ( maxread, MAXSEGMENTCOUNTREAD * min (MAXSEGMENTBYTECOUNTREAD, PAGE_SIZE ) ) */ 3214 if (0 == VNOP_IOCTL(devvp, 3215 DKIOCGETMAXSEGMENTCOUNTREAD, 3216 (caddr_t)&max_segment_count, 3217 0, 3218 context)) { 3219 3220 if (max_segment_count > 0) { 3221 3222 if (0 == VNOP_IOCTL(devvp, 3223 DKIOCGETMAXSEGMENTBYTECOUNTREAD, 3224 (caddr_t)&max_segment_byte_count, 3225 0, 3226 context)) { 3227 //rdar://13835534 Limit max_segment_byte_count to PAGE_SIZE because some drives don't handle the spec correctly 3228 if (max_segment_byte_count > PAGE_SIZE || max_segment_byte_count == 0) { 3229 
debug("MAXSEGMENTBYTECOUNTREAD is %#llx, limiting to PAGE_SIZE %#x", max_segment_byte_count, PAGE_SIZE); 3230 max_segment_byte_count = PAGE_SIZE; 3231 } 3232 } else { 3233 debug("Unable to get MAXSEGMENTBYTECOUNTREAD, assuming PAGE_SIZE %#x", PAGE_SIZE); 3234 max_segment_byte_count = PAGE_SIZE; 3235 } 3236 3237 if (cm->cm_maxread > max_segment_count * max_segment_byte_count) { 3238 cm->cm_maxread = max_segment_count * max_segment_byte_count; 3239 debug("MAXSEGMENTCOUNTREAD is %#llx, MAXSEGMENTBYTECOUNTREAD is %#llx, (multiplied %#llx)", max_segment_count, max_segment_byte_count, max_segment_count * max_segment_byte_count); 3240 } 3241 } 3242 } 3243 3244 /* maxread = min ( maxread, MAX_UPL_TRANSFER * PAGE_SIZE ) */ 3245 if (cm->cm_maxread > MAX_UPL_TRANSFER * PAGE_SIZE) { 3246 cm->cm_maxread = MAX_UPL_TRANSFER * PAGE_SIZE; 3247 debug("MAX_UPL_TRANSFER is %#x, PAGE_SIZE is %#x, (multiplied %#x)", MAX_UPL_TRANSFER, PAGE_SIZE, MAX_UPL_TRANSFER * PAGE_SIZE); 3248 } 3249 3250 /* maxread = max ( maxread, MAXPHYS ) */ 3251 if (cm->cm_maxread < MAXPHYS) { 3252 debug("can't determine device read size for mount %s; using default", uuid_string(cm->cm_uuid)); 3253 cm->cm_maxread = MAXPHYS; 3254 } 3255 3256 /* make sure maxread is a multiple of the block size */ 3257 if (cm->cm_maxread % cm->cm_blocksize != 0) { 3258 debug("Mount max IO size (%llu) not a multiple of block size (%llu)", cm->cm_maxread, cm->cm_blocksize); 3259 cm->cm_maxread -= cm->cm_maxread % cm->cm_blocksize; 3260 } 3261 3262 cm->cm_bp = buf_alloc(devvp); 3263 if (cm->cm_bp == NULL) { 3264 message("can't allocate buf"); 3265 error = ENOMEM; 3266 goto out; 3267 } 3268 3269 cm->cm_dev = vnode_specrdev(devvp); 3270 if (cm->cm_dev == nulldev()) { 3271 message("mount %s does not have a device", uuid_string(cm->cm_uuid)); 3272 error = EINVAL; 3273 goto out; 3274 } 3275 3276 for (i = 0; i < cm->cm_nextents; i++) { 3277 LOCK_EXTENT(cm->cm_pextents[i]); 3278 if (0 != BC_fill_in_extent(cm->cm_pextents[i])) { 3279 BC_teardown_extent(cm->cm_pextents[i]); 3280 UNLOCK_EXTENT(cm->cm_pextents[i]); 3281 wakeup(cm->cm_pextents[i]); 3282 } else { 3283 UNLOCK_EXTENT(cm->cm_pextents[i]); 3284 } 3285 } 3286 3287 cm->cm_state = CM_READY; 3288 3289 debug("Found new cache mount %s disk %d, %d extents", uuid_string(cm->cm_uuid), cm->cm_disk->cd_disk_num, cm->cm_nextents); 3290 3291out: 3292 if (error && error != EAGAIN) { 3293 /* 3294 * Mark all extents as aborted. This will notify any sleepers in the 3295 * strategy routine that the extent won't have data for them. 3296 * 3297 * Note that its fine to take the extent lock here since no one will 3298 * have extent lock and try to take the mount lock if the mount isn't CM_READY 3299 */ 3300 for (i = 0; i < cm->cm_nextents; i++) { 3301 LOCK_EXTENT(cm->cm_pextents[i]); 3302 BC_teardown_extent(cm->cm_pextents[i]); 3303 UNLOCK_EXTENT(cm->cm_pextents[i]); 3304 wakeup(cm->cm_pextents[i]); 3305 } 3306 3307 BC_teardown_mount(cm); 3308 } 3309 if (devvp != NULLVP) { 3310 vnode_put(devvp); 3311 } 3312 return error; 3313} 3314 3315/* 3316 * Called with the cache mount write lock held 3317 */ 3318static void 3319BC_teardown_mount(struct BC_cache_mount *cm) 3320{ 3321 if (cm->cm_bp) { 3322 buf_free(cm->cm_bp); 3323 cm->cm_bp = NULL; 3324 } 3325 cm->cm_nextents = 0; 3326 _FREE_ZERO(cm->cm_pextents, M_TEMP); 3327 cm->cm_state = CM_ABORTED; 3328} 3329 3330/* 3331 * Check to see which mounts have been mounted and finish setting them up. 
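/*
 * Illustrative sketch, not part of the original kext: the cm_maxread
 * computation above is a chain of clamps -- a min() against each device
 * limit that is actually reported, a floor at MAXPHYS, and a final
 * round-down to a whole number of device blocks. The helper below models the
 * same reduction; the struct of limits and every value in it are
 * hypothetical (the real values come from the DKIOC* ioctls above).
 */
#include <stdint.h>

struct io_limits {
	uint64_t upl_max;            /* ~ ubc_upl_maxbufsize()                      */
	uint64_t max_byte_count;     /* ~ DKIOCGETMAXBYTECOUNTREAD (0 = unreported) */
	uint64_t max_block_count;    /* ~ DKIOCGETMAXBLOCKCOUNTREAD                 */
	uint64_t max_segment_count;  /* ~ DKIOCGETMAXSEGMENTCOUNTREAD               */
	uint64_t max_segment_bytes;  /* already capped at PAGE_SIZE, see above      */
	uint64_t upl_transfer_max;   /* ~ MAX_UPL_TRANSFER * PAGE_SIZE              */
	uint64_t maxphys;            /* ~ MAXPHYS                                   */
};

static uint64_t
compute_maxread(const struct io_limits *l, uint64_t blocksize)
{
	uint64_t maxread = l->upl_max;

	if (l->max_byte_count && maxread > l->max_byte_count)
		maxread = l->max_byte_count;
	if (l->max_block_count && maxread > l->max_block_count * blocksize)
		maxread = l->max_block_count * blocksize;
	if (l->max_segment_count && maxread > l->max_segment_count * l->max_segment_bytes)
		maxread = l->max_segment_count * l->max_segment_bytes;
	if (maxread > l->upl_transfer_max)
		maxread = l->upl_transfer_max;

	if (maxread < l->maxphys)              /* never go below the kernel default */
		maxread = l->maxphys;

	return maxread - (maxread % blocksize);     /* whole blocks only */
}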
3332 * 3333 * No extent/mount locks required since we have a write lock on the cache 3334 * 3335 * Called with the cache write lock held 3336 */ 3337static int fill_in_bc_cache_mounts(mount_t mount, void* arg) 3338{ 3339 int mount_idx, error, i; 3340 struct vfs_attr attr; 3341 vfs_context_t context; 3342 int* go_again = (int*) arg; 3343 3344 VFSATTR_INIT(&attr); 3345 VFSATTR_WANTED(&attr, f_uuid); 3346 context = vfs_context_create(NULL); 3347 error = vfs_getattr(mount, &attr, context); 3348 if ((0 != error) || (! VFSATTR_IS_SUPPORTED(&attr, f_uuid))) { 3349#ifdef BC_DEBUG 3350 char name[MFSNAMELEN]; 3351 vfs_name(mount, name); 3352 if (strncmp("devfs", name, sizeof("devfs")) != 0) { 3353 debug("Found mount %s for IO without a UUID: error %d", name, error); 3354 } 3355#endif 3356 vfs_context_rele(context); 3357 return VFS_RETURNED; 3358 } else { 3359 // debug("Found mount for IO with UUID %s", uuid_string(attr.f_uuid)); 3360 3361 for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) { 3362 struct BC_cache_mount *cm = BC_cache->c_mounts + mount_idx; 3363 int match = 0; 3364 if (CM_SETUP == cm->cm_state) { 3365 3366 if (0 == uuid_compare(cm->cm_uuid, attr.f_uuid)) { 3367 match = 1; 3368 } 3369 3370 /* a null uuid indicates we want to match the root volume, no matter what the uuid (8350414) */ 3371 if ((!match) && uuid_is_null(cm->cm_uuid)) { 3372 vnode_t devvp = vfs_devvp(mount); 3373 if (vnode_specrdev(devvp) == rootdev) { 3374 uuid_copy(cm->cm_uuid, attr.f_uuid); 3375 match = 1; 3376 debug("Found root mount %s", uuid_string(cm->cm_uuid)); 3377 } 3378 vnode_put(devvp); 3379 } 3380 3381 if (match) { 3382 /* Found a matching mount */ 3383 3384 /* Locking here isn't necessary since we're only called while holding the cache write lock 3385 * and no one holds the mount locks without also holding the cache lock 3386 */ 3387 if (BC_fill_in_mount(cm, mount, context) == EAGAIN) { 3388 *go_again = 1; 3389 } 3390 vfs_context_rele(context); 3391 3392 /* Check to see if we have any more mounts to fill in */ 3393 for (i = 0; i < BC_cache->c_nmounts; i++) { 3394 if (CM_SETUP == BC_cache->c_mounts[i].cm_state) { 3395 /* have more to fill in, keep iterating */ 3396 return VFS_RETURNED; 3397 } 3398 } 3399 3400 return VFS_RETURNED_DONE; 3401 } 3402 } 3403 } 3404 } 3405 3406 vfs_context_rele(context); 3407 3408 /* Check to see if we have any more mounts to fill in */ 3409 for (i = 0; i < BC_cache->c_nmounts; i++) { 3410 if (CM_SETUP == BC_cache->c_mounts[i].cm_state) { 3411 /* have more to fill in, keep iterating */ 3412 return VFS_RETURNED; 3413 } 3414 } 3415 3416 debug("No new mounts filled in fill_in_bc_cache_mounts"); 3417 return VFS_RETURNED_DONE; 3418 3419} 3420 3421#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET 3422/* 3423 * Sort the extent list. 3424 */ 3425static int 3426compare_cache_extents(const void *vfirst, const void *vsecond) 3427{ 3428 const struct BC_cache_extent *first, *second; 3429 3430 first = (const struct BC_cache_extent *)vfirst; 3431 second = (const struct BC_cache_extent *)vsecond; 3432 3433 // Sort by volume first, then by logical block address 3434 int mount_comparison = first->ce_mount_idx - second->ce_mount_idx; 3435 if (mount_comparison != 0) 3436 return((mount_comparison < 0) ? -1 : 1); 3437 3438 if (first->ce_diskoffset == second->ce_diskoffset) 3439 return(0); 3440 return((first->ce_diskoffset < second->ce_diskoffset) ? -1 : 1); 3441} 3442#endif 3443 3444/* 3445 * Fetch the playlist from userspace or the Mach-O segment where it 3446 * was preloaded. 
Add it to the cache and readahead in batches after 3447 * the current cache, if any. 3448 * 3449 * Returns 0 on success. In failure cases, the cache is not modified. 3450 * 3451 * Called with the cache write lock held. 3452 */ 3453static int 3454BC_copyin_playlist(size_t mounts_size, user_addr_t mounts_buf, size_t entries_size, user_addr_t entries_buf) 3455{ 3456 int nplaylist_mounts, nplaylist_entries; 3457 struct BC_playlist_mount *playlist_mounts = NULL, *pm; 3458 struct BC_playlist_entry *playlist_entries = NULL, *pe; 3459 struct BC_cache_mount *cm, *old_cm; 3460 struct BC_cache_extent *ce; 3461 3462 int ncache_mounts = 0, ncache_extents = 0, nnew_mounts = 0; 3463 struct BC_cache_mount *cache_mounts = NULL; 3464 struct BC_cache_extent *cache_extents = NULL; 3465 int *pmidx_to_cmidx = NULL; 3466 int *next_old_extent_idx = NULL; 3467 3468 int *cache_nextents = NULL; 3469 struct BC_cache_extent **cache_extents_list = NULL; 3470 3471 int old_batch_count; 3472 u_int64_t old_cache_size; 3473 3474 int error, pm_idx, cm_idx, pe_idx, ce_idx, cel_idx, pm_idx2, max_entries; 3475 off_t p; 3476 u_int64_t size; 3477 int actual, remaining_entries; 3478 3479 int clipped_extents = 0, enveloped_extents = 0; 3480 3481 playlist_mounts = NULL; 3482 playlist_entries = NULL; 3483 3484#ifdef BC_DEBUG 3485 struct timeval start_time; 3486 microtime(&start_time); 3487#endif 3488 3489 old_batch_count = BC_cache->c_batch_count; 3490 old_cache_size = BC_cache->c_cachesize; 3491 3492 nplaylist_mounts = (int) mounts_size / sizeof(*playlist_mounts); 3493 nplaylist_entries = (int) entries_size / sizeof(*playlist_entries); 3494 3495 if (BC_cache->c_stats.ss_total_extents > 0) { 3496 debug("adding %u extents to existing cache of %u extents", nplaylist_entries, BC_cache->c_stats.ss_total_extents); 3497 } else { 3498 debug("setting up first cache of %d extents", nplaylist_entries); 3499 } 3500 3501 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 3502 debug("Mount %s has %d extents", uuid_string(BC_cache->c_mounts[cm_idx].cm_uuid), BC_cache->c_mounts[cm_idx].cm_nextents); 3503 } 3504 3505 /* 3506 * Mount array 3507 */ 3508 if (BC_preloaded_playlist) { 3509 playlist_mounts = CAST_DOWN(struct BC_playlist_mount *, mounts_buf); 3510 } else { 3511 playlist_mounts = BC_MALLOC(mounts_size, M_TEMP, M_WAITOK); 3512 if (playlist_mounts == NULL) { 3513 message("can't allocate unpack mount buffer of %ld bytes", mounts_size); 3514 error = ENOMEM; 3515 goto out; 3516 } 3517 if ((error = copyin(mounts_buf, playlist_mounts, mounts_size)) != 0) { 3518 message("copyin %ld bytes from 0x%llx to %p failed", mounts_size, mounts_buf, playlist_mounts); 3519 goto out; 3520 } 3521 3522 /* if BC_preloaded_playlist is non-NULL, playlist_mounts must be freed */ 3523 } 3524 3525 /* map from the playlist mount index to the corresponding cache mount index */ 3526 pmidx_to_cmidx = BC_MALLOC(nplaylist_mounts * sizeof(*pmidx_to_cmidx), M_TEMP, M_WAITOK); 3527 if (pmidx_to_cmidx == NULL) { 3528 message("can't allocate playlist index to cache index reference array for %d mounts", nplaylist_mounts); 3529 error = ENOMEM; 3530 goto out; 3531 } 3532 3533 /* In order to merge the mount's current (sorted) extent list with the new (sorted) 3534 * extent list from the new playlist we first grow the extent list allocation to fit 3535 * both arrays, then merge the two arrays into the new allocation. 
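/*
 * Illustrative sketch, not part of the original kext: the comment above
 * describes a classic two-way merge -- the mount's current extent list and
 * the new playlist's extents are both sorted by disk offset, so they can be
 * combined into the freshly grown array in one pass. The merge itself
 * happens further down in this function; the generic model below uses
 * hypothetical names and ignores the overlap clipping the real code also
 * performs.
 */
#include <stddef.h>
#include <stdint.h>

struct ext_ref { uint64_t disk_off; /* ... other extent fields ... */ };

static void
merge_sorted_extents(struct ext_ref **dst,
                     struct ext_ref **olds, size_t nold,
                     struct ext_ref **news, size_t nnew)
{
	size_t i = 0, j = 0, k = 0;

	while (i < nold && j < nnew)
		dst[k++] = (olds[i]->disk_off <= news[j]->disk_off) ? olds[i++] : news[j++];
	while (i < nold)
		dst[k++] = olds[i++];
	while (j < nnew)
		dst[k++] = news[j++];
}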
3536 */ 3537 3538 if (BC_cache->c_nmounts > 0) { 3539 /* next_old_extent_idx saves the index of the next unmerged extent in the original mount's extent list */ 3540 next_old_extent_idx = BC_MALLOC(BC_cache->c_nmounts * sizeof(*next_old_extent_idx), M_TEMP, M_WAITOK | M_ZERO); 3541 if (next_old_extent_idx == NULL) { 3542 message("can't allocate index array for %d mounts", BC_cache->c_nmounts); 3543 error = ENOMEM; 3544 goto out; 3545 } 3546 3547 /* 0-filled since all the index start at 0 */ 3548 } 3549 3550 nnew_mounts = nplaylist_mounts; 3551 3552 /* determine how many mounts are new and how many we already have */ 3553 for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) { 3554 pmidx_to_cmidx[pm_idx] = -1; /* invalid by default */ 3555 3556 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 3557 if (0 == uuid_compare(BC_cache->c_mounts[cm_idx].cm_uuid, playlist_mounts[pm_idx].pm_uuid)) { 3558 /* already have a record of this mount, won't need a new spot for it */ 3559 pmidx_to_cmidx[pm_idx] = cm_idx; 3560 nnew_mounts--; 3561 break; 3562 } 3563 } 3564 3565 /* check to make sure there aren't any duplicate mounts within the playlist */ 3566 for (pm_idx2 = pm_idx + 1; pm_idx2 < nplaylist_mounts; pm_idx2++) { 3567 if (0 == uuid_compare(playlist_mounts[pm_idx2].pm_uuid, playlist_mounts[pm_idx].pm_uuid)) { 3568 message("Playlist had duplicate mount %s", uuid_string(playlist_mounts[pm_idx2].pm_uuid)); 3569 error = EINVAL; 3570 goto out; 3571 } 3572 } 3573 } 3574 3575 /* cache_mounts is the array of mounts that will replace the one currently in the cache */ 3576 cache_mounts = BC_MALLOC((BC_cache->c_nmounts + nnew_mounts) * sizeof(*cache_mounts), M_TEMP, M_WAITOK | M_ZERO); 3577 if (cache_mounts == NULL) { 3578 message("can't allocate memory for %d mounts", (BC_cache->c_nmounts + nnew_mounts)); 3579 error = ENOMEM; 3580 goto out; 3581 } 3582 memcpy(cache_mounts, BC_cache->c_mounts, (BC_cache->c_nmounts * sizeof(*BC_cache->c_mounts))); 3583 3584 /* ncache_mounts is the current number of valid mounts in the mount array */ 3585 ncache_mounts = BC_cache->c_nmounts; 3586 3587 for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) { 3588 if (pmidx_to_cmidx[pm_idx] != -1) { 3589 /* We already have a record for this mount */ 3590 3591 if (playlist_mounts[pm_idx].pm_nentries == 0) { 3592 message("Playlist incuded mount with 0 entries"); 3593 error = EINVAL; 3594 goto out; 3595 } 3596 3597 cm_idx = pmidx_to_cmidx[pm_idx]; 3598 3599 assert(cm_idx < BC_cache->c_nmounts); 3600 3601 /* grow the mount's extent array to fit the new extents (we'll fill in the new extents below) */ 3602 cache_mounts[cm_idx].cm_pextents = BC_MALLOC((BC_cache->c_mounts[cm_idx].cm_nextents + playlist_mounts[pm_idx].pm_nentries) * sizeof(*cache_mounts[cm_idx].cm_pextents), M_TEMP, M_WAITOK | M_ZERO); 3603 if (cache_mounts[cm_idx].cm_pextents == NULL) { 3604 message("can't allocate mount's extent array (%d entries)", (BC_cache->c_mounts[cm_idx].cm_nextents + playlist_mounts[pm_idx].pm_nentries)); 3605 error = ENOMEM; 3606 goto out; 3607 } 3608 3609 /* don't free the old extent list yet, the real BC_cache's mount still points to it and we may yet fail */ 3610 3611 /* cm_nextents is the number of valid extents in this mount's extent list. 
It starts out as 0 and we fill it in below */ 3612 cache_mounts[cm_idx].cm_nextents = 0; 3613 } else { 3614 /* New mount for the cache */ 3615 3616 if ((error = BC_setup_mount(cache_mounts + ncache_mounts, playlist_mounts + pm_idx)) != 0) { 3617 goto out; 3618 } 3619 3620 pmidx_to_cmidx[pm_idx] = ncache_mounts; 3621 ncache_mounts++; 3622 } 3623 } 3624 3625 /* 3626 * Extent array 3627 * 3628 * The existing extent array cannot move since a strategy routine may be 3629 * in the middle of an msleep with one of the extent locks. So, we allocate 3630 * a new array for the new extents. 3631 * 3632 * Add one extent list to our arrays of extent lists 3633 */ 3634 3635 /* cache_nextents is an array of ints, each indicating the number of extents in the list in cache_extents_list at the same index */ 3636 cache_nextents = BC_MALLOC((BC_cache->c_nextentlists + 1) * sizeof(*BC_cache->c_nextents), M_TEMP, M_WAITOK | M_ZERO); 3637 if (cache_nextents == NULL) { 3638 message("can't allocate memory for %d extent list sizes", nplaylist_entries); 3639 error = ENOMEM; 3640 goto out; 3641 } 3642 memcpy(cache_nextents, BC_cache->c_nextents, BC_cache->c_nextentlists * sizeof(*BC_cache->c_nextents)); 3643 3644 /* cache_extents_list is the array of extent lists. The extent list at the last index is for the new playlist's cache */ 3645 cache_extents_list = BC_MALLOC((BC_cache->c_nextentlists + 1) * sizeof(*BC_cache->c_extentlists), M_TEMP, M_WAITOK | M_ZERO); 3646 if (cache_extents_list == NULL) { 3647 message("can't allocate memory for %d extent lists", nplaylist_entries); 3648 error = ENOMEM; 3649 goto out; 3650 } 3651 memcpy(cache_extents_list, BC_cache->c_extentlists, BC_cache->c_nextentlists * sizeof(*BC_cache->c_extentlists)); 3652 3653 /* The extent list for this new playlist's cache */ 3654 cache_extents = BC_MALLOC(nplaylist_entries * sizeof(*cache_extents), M_TEMP, M_WAITOK | M_ZERO); 3655 if (cache_extents == NULL) { 3656 message("can't allocate memory for %d extents", nplaylist_entries); 3657 error = ENOMEM; 3658 goto out; 3659 } 3660 3661 /* TODO: iterate over our history and remove any blocks we've already seen from the new cache */ 3662 3663 /* Fill in the new extents. 3664 * 3665 * We just tack the new playlist onto the end of any cache we currently have as a new batch. 3666 * 3667 * At this point, we assume that any additional caches should be in additional batches just as 3668 * if there was a single recording with a tag separating the new extents. If we did decide to 3669 * merge, it would still be hard since that would require reordering the extent list and 3670 * BC_strategy assumes that the cache extents never move (see the msleep in wait_for_extent). 3671 * 3672 * We also don't coalesce the new extents into the old (and lower batch) extents. 3673 * This would be a bit of work, we assume we want a new batch anyway, and it's rare that 3674 * a new cache comes in while an old cache is still in readahead. So, we don't bother. 3675 * 3676 * We do clip any overlap with the new cache from the old cache, however. 3677 * 3678 * We don't clip any overlap with the history from the new extents. 3679 * The history list isn't ordered, so we don't have a fast way to compare the ranges. It's also 3680 * expected that the overlap is minimal. If we stopped recording history at some point, we don't 3681 * have this info anyway. So, we don't bother. 
3682 */ 3683 3684 /* size is the size in bytes of the cache, used to allocate our page buffer */ 3685 size = 0; 3686 3687 if (BC_preloaded_playlist) { 3688 /* 3689 * Unpack the static control entry array into the extent array. 3690 */ 3691 playlist_entries = CAST_DOWN(struct BC_playlist_entry *, entries_buf); 3692 } else { 3693 /* 3694 * Since the playlist control entry structure is not the same as the 3695 * extent structure we use, we need to copy the control entries in 3696 * and unpack them. 3697 */ 3698 playlist_entries = BC_MALLOC(BC_PLC_CHUNK * sizeof(*playlist_entries), 3699 M_TEMP, M_WAITOK); 3700 if (playlist_entries == NULL) { 3701 message("can't allocate unpack buffer for %d entries", BC_PLC_CHUNK); 3702 error = ENOMEM; 3703 goto out; 3704 } 3705 } 3706 3707 remaining_entries = nplaylist_entries; 3708 while (remaining_entries > 0) { 3709 3710 if (BC_preloaded_playlist) { 3711 actual = remaining_entries; 3712 } else { 3713 actual = min(remaining_entries, BC_PLC_CHUNK); 3714 if ((error = copyin(entries_buf, playlist_entries, 3715 actual * sizeof(struct BC_playlist_entry))) != 0) { 3716 message("copyin from 0x%llx to %p failed", entries_buf, playlist_entries); 3717 goto out; 3718 } 3719 } 3720 3721 /* unpack into our array */ 3722 for (pe_idx = 0; pe_idx < actual; pe_idx++) { 3723 pe = playlist_entries + pe_idx; 3724 if (pe->pe_length == 0) { 3725 debug("Bad Playlist: entry %d has 0 length", (nplaylist_entries - remaining_entries) + pe_idx); 3726 error = EINVAL; 3727 goto out; 3728 } 3729 pm_idx = pe->pe_mount_idx; 3730 3731 if (pm_idx >= nplaylist_mounts || pm_idx < 0) { 3732 message("Bad playlist: entry %d referenced non-existent mount index %d", (nplaylist_entries - remaining_entries) + pe_idx, pm_idx); 3733 error = EINVAL; 3734 goto out; 3735 } 3736 3737 pm = playlist_mounts + pm_idx; 3738 cm_idx = pmidx_to_cmidx[pm_idx]; 3739 cm = cache_mounts + cm_idx; 3740 ce = cache_extents + ncache_extents; 3741 3742 /* The size of the extent list is the number of playlist entries + the number of old cache extents */ 3743 max_entries = pm->pm_nentries + ((cm_idx < BC_cache->c_nmounts) ? BC_cache->c_mounts[cm_idx].cm_nextents : 0); 3744 3745 if (cm->cm_nextents >= max_entries) { 3746 message("Bad playlist: more entries existed than the mount %s claimed (%d)", uuid_string(pm->pm_uuid), pm->pm_nentries); 3747 error = EINVAL; 3748 goto out; 3749 } 3750 3751 if ((error = BC_setup_extent(ce, pe, old_batch_count, cm_idx)) != 0) { 3752 goto out; 3753 } 3754 3755 /* Merge the new extent with the old extents for this mount. The new extent may get clipped. */ 3756#ifdef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET 3757 if (cm_idx < BC_cache->c_nmounts) { 3758 old_cm = BC_cache->c_mounts + cm_idx; 3759 /* Copy any lower or equal extents from the existing playlist down to the low end of the array */ 3760 while (next_old_extent_idx[cm_idx] < old_cm->cm_nextents && 3761 old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset <= ce->ce_diskoffset) { 3762 cm->cm_pextents[cm->cm_nextents] = old_cm->cm_pextents[next_old_extent_idx[cm_idx]]; 3763 cm->cm_nextents++; 3764 next_old_extent_idx[cm_idx]++; 3765 } 3766 3767 /* check for overlap with the next extent in the old list and clip the new extent */ 3768 if (next_old_extent_idx[cm_idx] < old_cm->cm_nextents) { 3769 3770 /* FIXME: rdar://9153031 If the new extent extends past the end of the old extent, we're losing caching! 
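				 * For example (illustrative numbers only): if the old extent covers
				 * bytes 1000-2000 of the mount and the new extent covers 500-3000,
				 * the clipping below shortens the new extent to 500-1000, so the
				 * range 2000-3000 ends up in neither extent and is never cached.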
*/ 3771 if (ce->ce_diskoffset + ce->ce_length > (old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset + old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_length)) { 3772 debug("!!! Entry %d (0x%llx:%lld) is getting clipped too much because of previous entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset, old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_length); 3773 } 3774 3775 u_int64_t max_offset = old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset; 3776 if (max_offset < ce->ce_diskoffset + ce->ce_length) { 3777 if (max_offset <= ce->ce_diskoffset) { 3778 ce->ce_length = 0; 3779 } else { 3780 ce->ce_length = (max_offset - ce->ce_diskoffset); 3781 } 3782 } 3783 } 3784 } 3785 3786 /* check for intersection with the next lower extent in the list and clip the new extent */ 3787 if (cm->cm_nextents > 0) { 3788 u_int64_t min_offset = cm->cm_pextents[cm->cm_nextents - 1]->ce_diskoffset + cm->cm_pextents[cm->cm_nextents - 1]->ce_length; 3789 if (min_offset > ce->ce_diskoffset) { 3790 3791 /* Check if this extent is overlapping with an extent from the same playlist */ 3792 if (cm->cm_pextents[cm->cm_nextents - 1] >= cache_extents && 3793 cm->cm_pextents[cm->cm_nextents - 1] < cache_extents + ncache_extents) { 3794 message("Bad playlist: entry %d (0x%llx:%lld) overlapped with previous entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), cm->cm_pextents[cm->cm_nextents - 1]->ce_diskoffset, cm->cm_pextents[cm->cm_nextents - 1]->ce_length); 3795 error = EINVAL; 3796 goto out; 3797 } 3798 3799 if (min_offset >= ce->ce_diskoffset + ce->ce_length) { 3800 ce->ce_length = 0; 3801 } else { 3802 ce->ce_length -= (min_offset - ce->ce_diskoffset); 3803 ce->ce_diskoffset = min_offset; 3804 } 3805 } 3806 } 3807 3808 if (ce->ce_length != pe->pe_length) { 3809 clipped_extents++; 3810 } 3811 if (ce->ce_length == 0) { 3812 /* new extent is entirely captured in extents from the old cache, throw it out */ 3813 enveloped_extents++; 3814 BC_teardown_extent(ce); 3815 lck_mtx_destroy(&ce->ce_lock, BC_cache->c_lckgrp); 3816 3817 /* continue without incrementing ncache_extents so we reuse this extent index */ 3818 continue; 3819 } 3820 size += roundup(ce->ce_length, PAGE_SIZE); 3821#endif 3822 3823 cm->cm_pextents[cm->cm_nextents] = ce; 3824 cm->cm_nextents++; 3825 ncache_extents++; 3826 } 3827 remaining_entries -= actual; 3828 entries_buf += actual * sizeof(struct BC_playlist_entry); 3829 } 3830 3831 /* Fill in any remaining extents from the original cache that are still sitting at the high end of the extent list */ 3832 for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) { 3833 cm_idx = pmidx_to_cmidx[pm_idx]; 3834 cm = cache_mounts + cm_idx; 3835 pm = playlist_mounts + pm_idx; 3836 3837 if (cm->cm_nextents == 0) { 3838 message("Bad playlist: No entries existed for mount %s (claimed %d)", uuid_string(pm->pm_uuid), pm->pm_nentries); 3839 error = EINVAL; 3840 goto out; 3841 } 3842 3843 if (cm_idx < BC_cache->c_nmounts) { 3844 if (next_old_extent_idx[cm_idx] < BC_cache->c_mounts[cm_idx].cm_nextents) { 3845 /* There are more extents in the old extent array, copy them over */ 3846 memcpy(cm->cm_pextents + cm->cm_nextents, BC_cache->c_mounts[cm_idx].cm_pextents + next_old_extent_idx[cm_idx], sizeof(*cm->cm_pextents) * (BC_cache->c_mounts[cm_idx].cm_nextents - next_old_extent_idx[cm_idx])); 3847 cm->cm_nextents += 
BC_cache->c_mounts[cm_idx].cm_nextents - next_old_extent_idx[cm_idx];
			}
		}

	}

#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
	/* We need an ordered list of extents for each mount, but we were given
	 * the extents in a different ordering, so we need to sort the mount's
	 * extent list here
	 */
	for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
		cm = cache_mounts + cm_idx;

		/* If this is a new mount or a mount with a new extent list, it needs sorting */
		if (cm_idx >= BC_cache->c_nmounts ||
			cm->cm_pextents != BC_cache->c_mounts[cm_idx].cm_pextents) {

			qsort(cm->cm_pextents,
				  cm->cm_nextents,
				  sizeof(*cm->cm_pextents),
				  compare_cache_extents);

			/* Clip any overlap of the new extents with the old extents */
			for (ce_idx = 0; ce_idx < cm->cm_nextents; ce_idx++) {

				/* Only look at new extents, old extents were already clipped when they were added and can't be clipped further */
				if (cm->cm_pextents[ce_idx] >= cache_extents &&
					cm->cm_pextents[ce_idx] < cache_extents + ncache_extents) {

					ce = cm->cm_pextents[ce_idx]; /* the new extent we may need to clip */

					u_int64_t old_length = ce->ce_length;

					/* Compare against the previous (lower) extent */
					if (ce_idx > 0) {
						u_int64_t min_offset = cm->cm_pextents[ce_idx - 1]->ce_diskoffset + cm->cm_pextents[ce_idx - 1]->ce_length;
						if (min_offset > ce->ce_diskoffset) {
							if (min_offset >= ce->ce_diskoffset + ce->ce_length) {
								ce->ce_length = 0;
							} else {
								ce->ce_length -= (min_offset - ce->ce_diskoffset);
								ce->ce_diskoffset = min_offset;
							}
						}
					}

					/* Compare against the next (higher) extent */
					if (ce_idx < cm->cm_nextents - 1) {
						u_int64_t max_offset = cm->cm_pextents[ce_idx + 1]->ce_diskoffset;
						if (max_offset < ce->ce_diskoffset + ce->ce_length) {

							/* Check if this extent is overlapping with an extent from the same playlist */
							if (cm->cm_pextents[ce_idx + 1] >= cache_extents &&
								cm->cm_pextents[ce_idx + 1] < cache_extents + ncache_extents) {
								message("Bad playlist: extent %d (0x%llx:%lld) overlapped with next extent for mount %s (0x%llx:%lld)", ce_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), cm->cm_pextents[ce_idx + 1]->ce_diskoffset, cm->cm_pextents[ce_idx + 1]->ce_length);
								error = EINVAL;
								goto out;
							}

							if (max_offset <= ce->ce_diskoffset) {
								ce->ce_length = 0;
							} else {
								ce->ce_length = (max_offset - ce->ce_diskoffset);
							}
						}
					}

					if (ce->ce_length != old_length) {
						clipped_extents++;
						if (ce->ce_length == 0) {
							/* extent was enveloped, remove this extent from the mount's extent list */
							if ((ce_idx + 1) < cm->cm_nextents) {
								memmove(cm->cm_pextents + ce_idx, cm->cm_pextents + (ce_idx + 1), (cm->cm_nextents - (ce_idx + 1)) * sizeof(*cm->cm_pextents));
							}
							cm->cm_nextents--;
							ce_idx--;
							enveloped_extents++;
						}
					}
				}
			}
		}
	}

	for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
		size += roundup(cache_extents[ce_idx].ce_length, PAGE_SIZE);
	}

#endif

	if (clipped_extents > 0) {
		debug("%d extents were clipped, %d of which were completely enveloped", clipped_extents, enveloped_extents);
	}

	if (ncache_extents == 0) {
		debug("No extents added, not updating cache");
		error = EALREADY;
		goto out;
	}

	if (!
BC_preloaded_playlist) { 3947 _FREE_ZERO(playlist_entries, M_TEMP); 3948 _FREE_ZERO(playlist_mounts, M_TEMP); 3949 } 3950 3951 if (ncache_extents < nplaylist_entries) { 3952 debug("%d extents added out of %d", ncache_extents, nplaylist_entries); 3953 } 3954 for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) { 3955 debug("Mount %s now has %d extents", uuid_string(cache_mounts[cm_idx].cm_uuid), cache_mounts[cm_idx].cm_nextents); 3956 } 3957 3958 3959 /* 3960 * In order to simplify allocation of the block and page maps, we round 3961 * the size up to a multiple of CB_MAPFIELDBITS pages. This means we 3962 * may waste as many as 31 pages in the worst case. 3963 */ 3964 size = roundup(size, PAGE_SIZE * CB_MAPFIELDBITS); 3965 3966 /* 3967 * Verify that the cache is not larger than 50% of physical memory 3968 * to avoid forcing pageout activity. 3969 * 3970 * max_mem and size are in bytes. All calculations are done in page 3971 * units to avoid overflow/sign issues. 3972 * 3973 * If the cache is too large, we discard it and go to record-only mode. 3974 * This lets us survive the case where the playlist is corrupted or 3975 * manually made too large, but the "real" bootstrap footprint 3976 * would be OK. 3977 */ 3978 BC_cache->c_cachesize += size; 3979 if (BC_cache->c_cachesize > BC_MAX_SIZE) { 3980 message("cache size (%lu bytes) too large for physical memory (%llu bytes), capping at %llu bytes", 3981 (unsigned long)BC_cache->c_cachesize, max_mem, BC_MAX_SIZE); 3982 BC_cache->c_cachesize = BC_MAX_SIZE; 3983 } 3984 3985 /* 3986 * Allocate/grow the pagebuffer. 3987 */ 3988 if (BC_alloc_pagebuffer() != 0) { 3989 message("can't allocate %lld bytes for cache buffer", BC_cache->c_cachesize); 3990 error = ENOMEM; 3991 goto out; 3992 } 3993 3994 /* 3995 * Fix up the extent cache offsets. 
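	 * Each new extent is assigned a page-aligned ce_cacheoffset, laid out
	 * sequentially in the pagebuffer starting where the previous extent list
	 * ended; any extent that would extend past c_cachesize (or has zero
	 * length) is torn down below rather than cached.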
3996 */ 3997 p = 0; 3998 if (BC_cache->c_nextentlists > 0) { 3999 /* start at the offset of the end of the previous cache list */ 4000 for (cel_idx = BC_cache->c_nextentlists - 1; cel_idx >= 0; cel_idx--) { 4001 if (BC_cache->c_nextents[cel_idx] > 0) { 4002 struct BC_cache_extent *last_ce = BC_cache->c_extentlists[cel_idx] + BC_cache->c_nextents[cel_idx] - 1; 4003 p = last_ce->ce_cacheoffset + roundup(last_ce->ce_length, PAGE_SIZE); 4004 break; 4005 } 4006 } 4007 } 4008 for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) { 4009 cache_extents[ce_idx].ce_cacheoffset = p; 4010 p += roundup(cache_extents[ce_idx].ce_length, PAGE_SIZE); 4011 4012 /* Abort any extents that extend past our cache size */ 4013 if (p >= BC_cache->c_cachesize || cache_extents[ce_idx].ce_length == 0) { 4014 BC_teardown_extent(cache_extents + ce_idx); 4015 } 4016 } 4017 4018 /* We've successfully integrated the new playlist with our cache, copy it into place */ 4019 4020 /* free up old cache's allocations that we're replacing */ 4021 if (BC_cache->c_nmounts > 0) { 4022 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 4023 /* If we have a new extent array for this mount, free the old one */ 4024 if (BC_cache->c_mounts[cm_idx].cm_pextents != cache_mounts[cm_idx].cm_pextents) { 4025 _FREE_ZERO(BC_cache->c_mounts[cm_idx].cm_pextents, M_TEMP); 4026 } 4027 } 4028 _FREE_ZERO(BC_cache->c_mounts, M_TEMP) 4029 } 4030 BC_cache->c_mounts = cache_mounts; 4031 BC_cache->c_nmounts = ncache_mounts; 4032 4033 if (BC_cache->c_nextentlists > 0) { 4034 _FREE_ZERO(BC_cache->c_extentlists, M_TEMP) 4035 _FREE_ZERO(BC_cache->c_nextents, M_TEMP) 4036 } 4037 4038 cache_nextents[BC_cache->c_nextentlists] = ncache_extents; 4039 cache_extents_list[BC_cache->c_nextentlists] = cache_extents; 4040 4041 BC_cache->c_nextentlists++; 4042 4043 BC_cache->c_nextents = cache_nextents; 4044 BC_cache->c_extentlists = cache_extents_list; 4045 4046 BC_cache->c_stats.ss_total_mounts = ncache_mounts; 4047 BC_cache->c_stats.ss_total_extents += ncache_extents; 4048 4049 /* fill (or tear down) the extents for all the mounts we've already seen */ 4050 for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) { 4051 ce = cache_extents + ce_idx; 4052 if (CM_SETUP != BC_cache->c_mounts[ce->ce_mount_idx].cm_state) { 4053 LOCK_EXTENT(ce); 4054 if (CM_READY == BC_cache->c_mounts[ce->ce_mount_idx].cm_state) { 4055 if (0 == BC_fill_in_extent(ce)) { 4056 UNLOCK_EXTENT(ce); 4057 continue; 4058 } 4059 } 4060 /* either the mount is aborted or fill_in_extent failed */ 4061 BC_teardown_extent(ce); 4062 UNLOCK_EXTENT(ce); 4063 } 4064 } 4065 4066 /* Check to see if we have any more mounts to fill in */ 4067 for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) { 4068 if (CM_SETUP == BC_cache->c_mounts[cm_idx].cm_state) { 4069 /* have at least one mount to fill in, iterate all of them */ 4070 int go_again = 0; 4071 vfs_iterate(0, fill_in_bc_cache_mounts, &go_again); 4072 if (go_again) { 4073 debug("Missing root mount during last iteration, going again"); 4074 vfs_iterate(0, fill_in_bc_cache_mounts, &go_again); 4075 } 4076 break; 4077 } 4078 } 4079 4080 error = 0; 4081 4082 /* all done */ 4083out: 4084 if (! 
BC_preloaded_playlist) {
		_FREE_ZERO(playlist_entries, M_TEMP);
		_FREE_ZERO(playlist_mounts, M_TEMP);
	}
	_FREE_ZERO(pmidx_to_cmidx, M_TEMP);
	_FREE_ZERO(next_old_extent_idx, M_TEMP);
	if (error != 0) {
		if (error != EALREADY) {
			debug("cache setup failed, aborting");
		}

		/* If it was possible to fail after growing the page buffer, we'd need a way to shrink it back here */

		if (BC_cache->c_extentlists == NULL) {
			/* If this is our first cache, clear stats and our page buffer */
			BC_cache->c_stats.ss_total_extents = 0;
			BC_cache->c_stats.ss_total_mounts = 0;
			if (BC_cache->c_vp != NULL) {
				BC_free_pagebuffer();
			}
		}

		BC_cache->c_cachesize = old_cache_size;
		BC_cache->c_batch_count = old_batch_count;

		/* free up new cache's allocations that aren't part of the old cache */
		if (cache_extents) {
			for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
				BC_teardown_extent(cache_extents + ce_idx);
				lck_mtx_destroy(&cache_extents[ce_idx].ce_lock, BC_cache->c_lckgrp);
			}
			_FREE_ZERO(cache_extents, M_TEMP);
		}
		_FREE_ZERO(cache_nextents, M_TEMP);
		_FREE_ZERO(cache_extents_list, M_TEMP);

		if (cache_mounts) {
			for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
				if (cm_idx >= BC_cache->c_nmounts) {
					/* newly allocated mounts, teardown completely */
					BC_teardown_mount(cache_mounts + cm_idx);
					lck_rw_destroy(&cache_mounts[cm_idx].cm_lock, BC_cache->c_lckgrp);
				} else if (BC_cache->c_mounts[cm_idx].cm_pextents != cache_mounts[cm_idx].cm_pextents) {
					/* old mounts with a new extent list */
					_FREE_ZERO(cache_mounts[cm_idx].cm_pextents, M_TEMP);
				}
			}
			_FREE_ZERO(cache_mounts, M_TEMP);
		}
	}

#ifdef BC_DEBUG
	struct timeval end_time;
	microtime(&end_time);
	timersub(&end_time, &start_time, &end_time);
	debug("BC_copyin_playlist took %ldus", end_time.tv_sec * 1000000 + end_time.tv_usec);
#endif

	return(error);
}

/*
 * Initialise the cache. Fetch the playlist from userspace and
 * find and attach to our block device.
 */
static int
BC_init_cache(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries, int record_history)
{
	int error = 0, copyin_error, mount_idx;
	struct timeval current_time;

	/* Everything should be set, or nothing should be set */
	if ((mounts_size == 0 || mounts == 0 || entries_size == 0 || entries == 0) &&
		(mounts_size != 0 || mounts != 0 || entries_size != 0 || entries != 0)) {
		debug("Invalid cache parameters: %ld, %lld, %ld, %lld", mounts_size, mounts, entries_size, entries);
		return(EINVAL);
	}

	if (! (BC_cache->c_flags & BC_FLAG_SETUP)) {
		debug("Cache not yet setup");
		return(ENXIO);
	}

	microtime(&current_time);

	/*
	 * Start recording history.
4172 * 4173 * Only one history recording may be active at a time 4174 */ 4175 if (record_history) { 4176 LOCK_HANDLERS(); 4177 if (BC_set_flag(BC_FLAG_HISTORYACTIVE)) { 4178 debug("history recording already started, only one playlist will be returned"); 4179 BC_ADD_STAT(history_num_recordings, 1); 4180 UNLOCK_HANDLERS(); 4181 } else { 4182 debug("starting history recording"); 4183 4184 /* start the timeout handler */ 4185 timeout(BC_timeout_history, NULL, BC_history_timeout); 4186 4187 /* always take stats for the first two recordings (boot + login), even if we don't have a playlist */ 4188 if (BC_cache->c_stats.ss_history_num_recordings < 2) 4189 BC_cache->c_take_detailed_stats = 1; 4190 4191 BC_cache->c_history_starttime = current_time; 4192 BC_cache->c_history_size = 0; 4193 BC_cache->c_history_num_clusters = 0; 4194 BC_ADD_STAT(history_num_recordings, 1); 4195 BC_clear_flag(BC_FLAG_HTRUNCATED); 4196 4197 /* 4198 * Take over the strategy routine for our device; we are now 4199 * recording read operations. 4200 */ 4201 BC_check_handlers(); 4202 UNLOCK_HANDLERS(); 4203 4204 BC_update_mounts(); 4205 4206#ifndef NO_HISTORY 4207# ifdef STATIC_HISTORY 4208 /* initialise the history buffer */ 4209 if ((BC_cache->c_history = BC_MALLOC(BC_HISTORY_ALLOC, M_TEMP, M_WAITOK)) == NULL) { 4210 message("can't allocate %d bytes for static history buffer", BC_HISTORY_ALLOC); 4211 BC_clear_flag(BC_FLAG_HISTORYACTIVE); 4212 error = ENOMEM; 4213 goto out; 4214 } 4215 4216 bzero(BC_cache->c_history, BC_HISTORY_ALLOC); 4217# endif 4218#endif 4219 4220 } 4221 } 4222 4223 /* 4224 * If we have playlist data, fetch it. 4225 */ 4226 if (entries_size > 0) { 4227 LOCK_CACHE_W(); 4228 4229 if (BC_cache->c_flags & BC_FLAG_SHUTDOWN) { 4230 debug("cache already shutdown"); 4231 error = ENXIO; 4232 UNLOCK_CACHE_W(); 4233 goto out; 4234 } 4235 4236 /* we're playing back a cache (or at least trying to), 4237 * record detailed statistics until we stop recording history 4238 */ 4239 BC_cache->c_take_detailed_stats = 1; 4240 4241 copyin_error = BC_copyin_playlist(mounts_size, mounts, entries_size, entries); 4242 if (copyin_error != 0 && copyin_error != EALREADY) { 4243 message("can't load playlist: %d", copyin_error); 4244 UNLOCK_CACHE_W(); 4245 error = copyin_error; 4246 goto out; 4247 } 4248 4249 /* Always throttle cut-through IOs. 4250 * I tried throttling only after login had started, but 4251 * then users who log in quickly see a very long delay 4252 * after entering their password before apps come up. 4253 * I'd prefer delay the time until the login screen appears 4254 * and provide a fast-as-possible login. 
		 * rdar://8592401
		 */
		BC_cache->c_readahead_throttles_cutthroughs = 1;

		LOCK_HANDLERS();
		if (BC_set_flag(BC_FLAG_CACHEACTIVE)) {
			/* cache is already running, we're adding to it */
			UNLOCK_HANDLERS();
		} else {
			/* first cache we've created this boot */

#ifdef READ_HISTORY_BUFFER
			/* initialise the read history buffer */
			if (NULL == BC_cache->c_rhistory) {
				if ((BC_cache->c_rhistory = BC_MALLOC(READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry),
													  M_TEMP, M_WAITOK)) == NULL) {
					message("can't allocate %d bytes for read history buffer",
							READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry));
				}
			}
			if (BC_cache->c_rhistory) {
				bzero(BC_cache->c_rhistory, READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry));
			}
#endif

			BC_cache->c_cache_starttime = current_time;

			/*
			 * Take over the strategy routine for our device; we are now
			 * filling read operations from the cache.
			 *
			 * This must be done before we kick off reader threads so that
			 * we catch any writes that occur on blocks in our cache.
			 * Otherwise a write could come in unnoticed on a block we've
			 * already read in, and the cache would return stale data when
			 * that block is subsequently requested.
			 */
			BC_check_handlers();
			UNLOCK_HANDLERS();

			bootcache_contains_block = BC_cache_contains_block;
		}

		LOCK_CACHE_W_TO_R();
		if (copyin_error != EALREADY) {
			/*
			 * Start the reader threads.
			 */
			for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
				if (BC_cache->c_mounts[mount_idx].cm_state == CM_READY) {
					LOCK_MOUNT_R(BC_cache->c_mounts + mount_idx);
					BC_mount_available(BC_cache->c_mounts + mount_idx);
					UNLOCK_MOUNT_R(BC_cache->c_mounts + mount_idx);
				}
			}
		}

		UNLOCK_CACHE_R();

		/* cache is now active */

	}

	if (BC_cache->c_stats.ss_preload_time == 0) {
		timersub(&current_time,
				 &BC_cache->c_loadtime,
				 &current_time);
		BC_cache->c_stats.ss_preload_time = (u_int) current_time.tv_sec * 1000 + (u_int) current_time.tv_usec / 1000;
	}

out:
	return(error);
}

/*
 * Timeout on collection.
 *
 * We haven't heard from the userland component for too long;
 * stop recording history, but keep the current history
 * around in case it shows up.
 */
static void
BC_timeout_history(void *junk)
{
	/* stop recording history */
	debug("history recording timed out");
	BC_terminate_history();
}

/*
 * Check for new mounts. Called with no locks held.
 */
static int check_for_new_mount_itr(mount_t mount, void* arg) {
	struct vfs_attr attr;
	vfs_context_t context;
	int error, mount_idx;
	int* go_again = (int*) arg;

	struct BC_history_mount_device *hmd = NULL;
	vnode_t devvp = NULLVP;

	LOCK_HISTORY_R();
	if (!
(BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) { 4358 /* don't do anything if we've stopped recording history */ 4359 UNLOCK_HISTORY_R(); 4360 return VFS_RETURNED_DONE; 4361 } 4362 4363#ifdef BC_DEBUG 4364 char name[MFSNAMELEN]; 4365 vfs_name(mount, name); 4366#endif 4367 4368 devvp = vfs_devvp(mount); 4369 if (devvp == NULLVP) { 4370 //debug("Mount %s (%p)'s vnode is NULL", name, mount); 4371 UNLOCK_HISTORY_R(); 4372 goto out; 4373 } 4374 4375 dev_t mount_dev = vnode_specrdev(devvp); 4376 //debug("Mount %s (%p)'s vnode %p's device is 0x%x", name, mount, devvp, mount_dev); 4377 4378 /* 4379 * A new mount appeared, but unfortunately we don't know which one. 4380 * So, for every mount we have to check through our list of history_mounts 4381 * to see if we already have info for it. 4382 */ 4383 hmd = BC_get_history_mount_device(mount_dev, NULL); 4384 if (hmd == NULL || (! uuid_is_null(hmd->hmd_mount.hm_uuid))) { 4385 /* Unable to allocate new space for a mount, or 4386 * we already have info about this mount 4387 */ 4388 UNLOCK_HISTORY_R(); 4389 goto out; 4390 } 4391 4392 /* 4393 * We don't yet have info about this mount. It's new! 4394 */ 4395 4396 hmd->hmd_blocksize = vfs_devblocksize(mount); 4397 4398 VFSATTR_INIT(&attr); 4399 VFSATTR_WANTED(&attr, f_uuid); 4400 context = vfs_context_create(NULL); 4401 error = vfs_getattr(mount, &attr, context); 4402 4403 if ((0 != error) || (! VFSATTR_IS_SUPPORTED(&attr, f_uuid))) { 4404 vfs_context_rele(context); 4405#ifdef BC_DEBUG 4406 if (strncmp("devfs", name, sizeof("devfs")) != 0) { 4407 debug("Found mount %s without a UUID: error %d", name, error); 4408 } 4409#endif 4410 UNLOCK_HISTORY_R(); 4411 goto out; 4412 } 4413 4414 hmd->hmd_disk_id = vfs_throttle_mask(mount); 4415 4416 if (VNOP_IOCTL(devvp, /* vnode */ 4417 DKIOCISSOLIDSTATE, /* cmd */ 4418 (caddr_t)&hmd->hmd_is_ssd, /* data */ 4419 0, 4420 context)) /* context */ 4421 { 4422 debug("can't determine if physical disk for history mount %s is an ssd", uuid_string(hmd->hmd_mount.hm_uuid)); 4423 hmd->hmd_is_ssd = 0; 4424 } 4425 4426#ifdef ROOT_DISK_ONLY 4427 if (devvp == rootvp) { 4428 if (BC_cache->c_root_disk_id == 0) { 4429 BC_cache->c_root_disk_id = hmd->hmd_disk_id; 4430 if (BC_cache->c_root_disk_id == 0) { 4431 message("Root disk is 0"); 4432 } else { 4433 debug("Root disk (via history) is 0x%llx", BC_cache->c_root_disk_id); 4434 } 4435 } else if (BC_cache->c_root_disk_id != hmd->hmd_disk_id) { 4436 debug("Root disk 0x%llx doesn't match that found by history 0x%llx", BC_cache->c_root_disk_id, hmd->hmd_disk_id); 4437 } 4438 } 4439#endif 4440 4441 4442 /* Found info for our mount */ 4443 debug("Found new historic mount after %d IOs: %s, dev 0x%x, disk 0x%llx, blk %d%s", 4444 hmd->hmd_mount.hm_nentries, 4445 uuid_string(attr.f_uuid), 4446 hmd->hmd_dev, 4447 hmd->hmd_disk_id, 4448 hmd->hmd_blocksize, 4449 hmd->hmd_is_ssd ? 
", ssd" : ""); 4450 4451 if (hmd->hmd_blocksize == 0) { 4452 hmd->hmd_blocksize = 512; 4453 } 4454 hmd->hmd_mount.hm_nentries = 0; 4455 4456 uuid_copy(hmd->hmd_mount.hm_uuid, attr.f_uuid); 4457 4458 UNLOCK_HISTORY_R(); 4459 4460 if (BC_cache->c_flags & BC_FLAG_CACHEACTIVE) { 4461 LOCK_CACHE_R(); 4462 /* Check if we have a playlist mount that matches this mount and set it up */ 4463 for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) { 4464 struct BC_cache_mount *cm = BC_cache->c_mounts + mount_idx; 4465 if (CM_SETUP == cm->cm_state && 0 == uuid_compare(cm->cm_uuid, attr.f_uuid)) { 4466 /* Found a matching unfilled mount */ 4467 4468 LOCK_MOUNT_W(cm); 4469 if ((error = BC_fill_in_mount(cm, mount, context)) == 0) { 4470 LOCK_MOUNT_W_TO_R(cm); 4471 /* Kick off another reader thread if this is a new physical disk */ 4472 BC_mount_available(cm); 4473 UNLOCK_MOUNT_R(cm); 4474 } else { 4475 if (error == EAGAIN) { 4476 *go_again = 1; 4477 } 4478 UNLOCK_MOUNT_W(cm); 4479 } 4480 4481 break; 4482 } 4483 } 4484 UNLOCK_CACHE_R(); 4485 } 4486 4487 vfs_context_rele(context); 4488 4489out: 4490 if (devvp != NULLVP) { 4491 vnode_put(devvp); 4492 } 4493 return VFS_RETURNED; 4494} 4495 4496/* 4497 * Check for new or disappearing mounts 4498 */ 4499static void BC_update_mounts(void) { 4500 4501 /* don't do anything if we've stopped recording history */ 4502 if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) { 4503 return; 4504 } 4505 4506 int go_again = 0; 4507 vfs_iterate(0, check_for_new_mount_itr, &go_again); 4508 if (go_again) { 4509 debug("Missing root mount during last iteration, going again"); 4510 vfs_iterate(0, check_for_new_mount_itr, &go_again); 4511 } 4512} 4513 4514/* 4515 * Find the mount that corresponds to this IO, 4516 * or the first empty slot. 4517 * 4518 * Called with the history read lock 4519 */ 4520static struct BC_history_mount_device * BC_get_history_mount_device(dev_t dev, int* pindex) { 4521 struct BC_history_mount_device *tmphmd = NULL, **phmd, *ret = NULL; 4522 int mount_idx = 0; 4523 4524 for (phmd = &BC_cache->c_history_mounts; 4525 (*phmd) == NULL || (*phmd)->hmd_dev != dev; 4526 phmd = &(*phmd)->hmd_next) { 4527 4528 if (*phmd == NULL) { 4529 if (tmphmd == NULL) { /* we may have an allocation from previous iterations */ 4530 tmphmd = kalloc(sizeof(struct BC_history_mount_device)); 4531 if (tmphmd == NULL) { 4532 message("could not allocate %lu bytes for history mount device", 4533 sizeof(struct BC_history_mount_device)); 4534 BC_set_flag(BC_FLAG_HTRUNCATED); 4535 goto out; 4536 } 4537 4538 /* claim the new entry */ 4539 tmphmd->hmd_next = NULL; 4540 tmphmd->hmd_mount.hm_nentries = 0; 4541 uuid_clear(tmphmd->hmd_mount.hm_uuid); 4542 tmphmd->hmd_disk_id = 0; 4543 tmphmd->hmd_blocksize = 0; 4544 tmphmd->hmd_dev = dev; 4545 } 4546 4547 if (OSCompareAndSwapPtr(NULL, tmphmd, phmd)) { 4548 tmphmd = NULL; 4549 if (BC_cache->c_take_detailed_stats) { 4550 BC_ADD_STAT(history_mounts, 1); 4551 } 4552 break; 4553 } 4554 4555 /* someone else added a mount before we could, check if its the one we want */ 4556 if ((*phmd)->hmd_dev == dev) break; 4557 4558 } 4559 mount_idx++; 4560 } 4561 4562 if (pindex) 4563 *pindex = mount_idx; 4564 ret = *phmd; 4565out: 4566 if (tmphmd) 4567 kfree(tmphmd, sizeof(struct BC_history_mount_device)); 4568 4569 return ret; 4570} 4571 4572/* 4573 * Record an incoming IO in the history list. 4574 * 4575 * Returns non-0 if this IO was on the root disk. 4576 * 4577 * Note that this function is not reentrant. 
4578 */ 4579static int 4580BC_add_history(daddr64_t blkno, u_int64_t length, int pid, int hit, int write, int tag, int shared, dev_t dev) 4581{ 4582 u_int64_t offset; 4583 struct BC_history_entry_cluster *hec, *tmphec = NULL; 4584 struct BC_history_mount_device *hmd = NULL; 4585 struct BC_history_entry *he; 4586 int mount_idx, entry_idx; 4587 int is_root_disk = 0; 4588 4589 /* don't do anything if we've stopped recording history */ 4590 if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) 4591 return 0; 4592 4593 LOCK_HISTORY_R(); 4594 4595 /* check again, with lock */ 4596 if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) 4597 goto out; 4598 4599 /* count IOs during boot (first recording) */ 4600 if (BC_cache->c_take_detailed_stats) { 4601 if (! tag) { 4602 if (! write) { 4603 BC_ADD_STAT(history_reads, 1); 4604 } else { 4605 BC_ADD_STAT(history_writes, 1); 4606 } 4607 } 4608 } 4609 4610 /* don't do anything if the history list has been truncated */ 4611 if (!tag && BC_cache->c_flags & BC_FLAG_HTRUNCATED) 4612 goto out; 4613 4614 /* 4615 * In order to avoid recording a playlist larger than we will 4616 * allow to be played back, if the total byte count for recorded 4617 * reads reaches 50% of the physical memory in the system, we 4618 * start ignoring reads. 4619 * This is preferable to dropping entries from the end of the 4620 * playlist as we copy it in, since it will result in a clean 4621 * cessation of caching at a single point, rather than 4622 * introducing sporadic cache misses throughout the cache's run. 4623 */ 4624 if (!tag && (BC_cache->c_history_size + length) > BC_MAX_SIZE) { 4625 debug("History recording too large, capping at %lluMB", BC_MAX_SIZE / (1024 * 1024)); 4626 BC_set_flag(BC_FLAG_HTRUNCATED); 4627 goto out; 4628 } 4629 4630#ifdef NO_HISTORY 4631 BC_set_flag(BC_FLAG_HTRUNCATED); 4632 debug("history disabled, truncating"); 4633 goto out; 4634#endif 4635 4636 mount_idx = 0; 4637 offset = 0; 4638 if (! tag) { 4639 4640 hmd = BC_get_history_mount_device(dev, &mount_idx); 4641 if (hmd == NULL) goto out; 4642 4643 if (uuid_is_null(hmd->hmd_mount.hm_uuid)) { 4644 /* Couldn't identify the mount 4645 * 4646 * Before the mount has appeared we don't want to 4647 * include any of this device's IOs in the history 4648 * since these IOs will be issued next boot before 4649 * we're able to start this mount's part of the 4650 * playlist. 4651 */ 4652 4653 /* Keep track of the number of IOs we've seen until the mount appears */ 4654 OSIncrementAtomic(&hmd->hmd_mount.hm_nentries); 4655 4656 if (BC_cache->c_take_detailed_stats) { 4657 BC_ADD_STAT(history_unknown, 1); 4658 BC_ADD_STAT(history_unknown_bytes, length); 4659 } 4660 goto out; 4661 } 4662 4663 offset = HB_BLOCK_TO_BYTE(hmd, blkno); 4664 4665 /* Only track IOs on the root disk. 4666 * 4667 * We don't expect IOs on other physical disks 4668 * to be part of the critical path to boot/login 4669 * and we certainly don't expect the other disks 4670 * to be contested enough to make a cache worthwhile 4671 */ 4672 4673 if (hmd->hmd_disk_id != BC_cache->c_root_disk_id) { 4674#ifdef ROOT_DISK_ONLY 4675 goto out; 4676#endif 4677 } else { 4678 is_root_disk = 1; 4679 } 4680 4681 } 4682 4683 /* 4684 * Get the current entry cluster. 
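	 *
	 * Entries are claimed without a cluster lock: we reserve a slot by
	 * atomically incrementing hec_nentries; if the head cluster is missing or
	 * already full, we give back the over-claimed slot, allocate a fresh
	 * cluster with its first entry pre-claimed, and try to publish it as the
	 * new list head with OSCompareAndSwapPtr, retrying if another thread
	 * raced us.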
4685 */ 4686 while ((hec = BC_cache->c_history) == NULL || 4687 (entry_idx = OSAddAtomic(1, &hec->hec_nentries)) >= BC_HISTORY_ENTRIES) { 4688 if (hec) 4689 OSAddAtomic(-1, &hec->hec_nentries); 4690 4691 /* need a new cluster */ 4692 4693 /* limit the number of clusters we will allocate */ 4694 if (BC_cache->c_history_num_clusters + 1 >= BC_HISTORY_MAXCLUSTERS) { 4695 message("too many history clusters (%d, limit %ld)", 4696 BC_cache->c_history_num_clusters, 4697 (long)BC_HISTORY_MAXCLUSTERS); 4698 BC_set_flag(BC_FLAG_HTRUNCATED); 4699 goto out; 4700 } 4701 4702 if (tmphec == NULL) { /* we may have an allocation from previous iterations */ 4703 tmphec = kalloc(BC_HISTORY_ALLOC); 4704 if (tmphec == NULL) { 4705 message("could not allocate %d bytes for history cluster", 4706 BC_HISTORY_ALLOC); 4707 BC_set_flag(BC_FLAG_HTRUNCATED); 4708 goto out; 4709 } 4710 4711 /* claim the first entry of the new cluster*/ 4712 tmphec->hec_nentries = 1; 4713 tmphec->hec_link = hec; 4714 } 4715 4716 if (OSCompareAndSwapPtr(hec, tmphec, &BC_cache->c_history)) { 4717 hec = tmphec; 4718 entry_idx = 0; 4719 tmphec = NULL; 4720 OSAddAtomic(1, &BC_cache->c_history_num_clusters); 4721 break; 4722 } 4723 } 4724 if (tmphec != NULL) 4725 kfree(tmphec, BC_HISTORY_ALLOC); 4726 4727 /* 4728 * We've claimed entry entry_idx of cluster hec, fill it in. 4729 */ 4730 he = &(hec->hec_data[entry_idx]); 4731 assert(he >= &hec->hec_data[0]); 4732 assert(he < &hec->hec_data[BC_HISTORY_ENTRIES]); 4733 he->he_offset = offset; 4734 he->he_length = length; 4735 he->he_pid = pid; 4736 he->he_mount_idx = mount_idx; 4737 he->he_flags = 0x0; 4738 if (hit) he->he_flags |= BC_HE_HIT; 4739 if (write) he->he_flags |= BC_HE_WRITE; 4740 if (tag) he->he_flags |= BC_HE_TAG; 4741 if (shared) he->he_flags |= BC_HE_SHARED; 4742 if (hmd) 4743 OSIncrementAtomic(&hmd->hmd_mount.hm_nentries); 4744 4745 if (!write) { 4746 OSAddAtomic64(length, &BC_cache->c_history_size); 4747 if (BC_cache->c_take_detailed_stats) { 4748 BC_ADD_STAT(history_bytes, length); 4749 BC_ADD_STAT(history_entries, 1); 4750 } 4751 } 4752out: 4753 UNLOCK_HISTORY_R(); 4754 return is_root_disk; 4755} 4756 4757/* 4758 * Return the size of the history buffer. 4759 */ 4760static int 4761BC_size_history_mounts(void) 4762{ 4763 struct BC_history_mount_device *hmd; 4764 int nmounts; 4765 4766 /* walk the list of mount clusters and count the number of mounts */ 4767 nmounts = 0; 4768 for (hmd = BC_cache->c_history_mounts; hmd != NULL; hmd = hmd->hmd_next) 4769 nmounts ++; 4770 4771 return(nmounts * sizeof(struct BC_history_mount)); 4772} 4773 4774/* 4775 * Return the size of the history buffer. 4776 */ 4777static int 4778BC_size_history_entries(void) 4779{ 4780 struct BC_history_entry_cluster *hec; 4781 int nentries, cluster_entries; 4782 4783 /* walk the list of clusters and count the number of entries */ 4784 nentries = 0; 4785 for (hec = BC_cache->c_history; hec != NULL; hec = hec->hec_link) { 4786 cluster_entries = hec->hec_nentries; 4787 if (cluster_entries > BC_HISTORY_ENTRIES) { 4788 cluster_entries = BC_HISTORY_ENTRIES; 4789 } 4790 4791 nentries += cluster_entries; 4792 } 4793 4794 return(nentries * sizeof(struct BC_history_entry)); 4795} 4796 4797/* 4798 * Copy out either the history buffer size, or the entire history buffer. 4799 * 4800 * Note that the cache must be shut down before calling this function in 4801 * order to avoid a race around the entry count/buffer size. 
4802 */ 4803static int 4804BC_copyout_history_mounts(user_addr_t uptr) 4805{ 4806 struct BC_history_mount_device *hmd; 4807 int error; 4808 4809 assert(uptr != 0); 4810 4811 /* 4812 * Copy out the mounts 4813 */ 4814 for (hmd = BC_cache->c_history_mounts; 4815 hmd != NULL; 4816 hmd = hmd->hmd_next) { 4817 4818 /* copy the cluster out */ 4819 if ((error = copyout(&(hmd->hmd_mount), uptr, 4820 sizeof(struct BC_history_mount))) != 0) 4821 return(error); 4822 uptr += sizeof(struct BC_history_mount); 4823 4824 } 4825 return(0); 4826} 4827 4828/* 4829 * Copy out either the history buffer size, or the entire history buffer. 4830 * 4831 * Note that the cache must be shut down before calling this function in 4832 * order to avoid a race around the entry count/buffer size. 4833 */ 4834static int 4835BC_copyout_history_entries(user_addr_t uptr) 4836{ 4837 struct BC_history_entry_cluster *hec, *ohec; 4838 int error, cluster_entries; 4839 4840 assert(uptr != 0); 4841 assert(BC_cache->c_history); 4842 4843 /* 4844 * Copying the history entires out is a little messy due to the fact that the 4845 * clusters are on the list backwards, but we want the entries in order. 4846 */ 4847 ohec = NULL; 4848 if (BC_cache->c_history) { 4849 for (;;) { 4850 /* scan the list until we find the last one we haven't copied */ 4851 hec = BC_cache->c_history; 4852 while (hec->hec_link != ohec) 4853 hec = hec->hec_link; 4854 ohec = hec; 4855 4856 cluster_entries = hec->hec_nentries; 4857 if (cluster_entries > BC_HISTORY_ENTRIES) { 4858 cluster_entries = BC_HISTORY_ENTRIES; 4859 } 4860 4861 /* copy the cluster out */ 4862 if ((error = copyout(hec->hec_data, uptr, 4863 cluster_entries * sizeof(struct BC_history_entry))) != 0) 4864 return(error); 4865 uptr += cluster_entries * sizeof(struct BC_history_entry); 4866 4867 /* if this was the last cluster, all done */ 4868 if (hec == BC_cache->c_history) 4869 break; 4870 } 4871 } 4872 return(0); 4873} 4874 4875/* 4876 * Discard the history list. 4877 */ 4878static void 4879BC_discard_history(void) 4880{ 4881 struct BC_history_mount_device *hmd; 4882 struct BC_history_entry_cluster *hec; 4883 4884 while (BC_cache->c_history_mounts != NULL) { 4885 hmd = BC_cache->c_history_mounts; 4886 BC_cache->c_history_mounts = hmd->hmd_next; 4887 kfree(hmd, sizeof(struct BC_history_mount_device)); 4888 } 4889 4890 while (BC_cache->c_history != NULL) { 4891 hec = BC_cache->c_history; 4892 BC_cache->c_history = hec->hec_link; 4893 kfree(hec, BC_HISTORY_ALLOC); 4894 } 4895} 4896 4897/* 4898 * Setup for the root mounted device 4899 */ 4900static int 4901BC_setup(void) 4902{ 4903 unsigned int boot_arg; 4904 4905 /* 4906 * If we have less than the minimum supported physical memory, 4907 * bail immediately. With too little memory, the cache will 4908 * become polluted with nondeterministic pagein activity, leaving 4909 * large amounts of wired memory around which further exacerbates 4910 * the problem. 4911 */ 4912 if ((max_mem / (1024 * 1024)) < BC_minimum_mem_size) { 4913 debug("insufficient physical memory (%dMB < %dMB required)", 4914 (int) (max_mem / (1024 * 1024)), BC_minimum_mem_size); 4915 return(ENOMEM); 4916 } 4917 4918 /* 4919 * Find our block device. 4920 * 4921 * Note that in the netbooted case, we are loaded but should not 4922 * actually run, unless we're explicitly overridden. 4923 * 4924 * Note also that if the override is set, we'd better have a valid 4925 * rootdev... 
4926 */ 4927 if (netboot_root() && 4928 (!PE_parse_boot_argn("BootCacheOverride", &boot_arg, sizeof(boot_arg)) || 4929 (boot_arg != 1))) { /* not interesting */ 4930 return(ENXIO); 4931 } 4932 4933 debug("Major of rootdev is %d", major(rootdev)); 4934 4935 BC_cache->c_devsw = &bdevsw[major(rootdev)]; 4936 assert(BC_cache->c_devsw != NULL); 4937 4938 /* Make sure we don't clobber the device strategy routine */ 4939 if ((BC_cache->c_devsw->d_strategy == (strategy_fcn_t*)BC_strategy) || 4940 (BC_cache->c_devsw->d_close == BC_close)) { 4941 debug("cache was not terminated properly previously"); 4942 return(ENXIO); 4943 } 4944 4945 BC_cache->c_strategy = BC_cache->c_devsw->d_strategy; 4946 BC_cache->c_close = BC_cache->c_devsw->d_close; 4947 if (BC_cache->c_strategy == NULL || 4948 BC_cache->c_close == NULL) { /* not configured */ 4949 return(ENXIO); 4950 } 4951 4952 BC_set_flag(BC_FLAG_SETUP); 4953 4954 return(0); 4955} 4956 4957 4958/* 4959 * Called from the root-mount hook. 4960 */ 4961static void 4962BC_auto_start(void) 4963{ 4964 int error; 4965 user_addr_t pm, pe; 4966 4967 if ((error = BC_setup()) != 0) { 4968 debug("BootCache setup failed: %d", error); 4969 return; 4970 } 4971 4972 pm = CAST_USER_ADDR_T((uintptr_t)BC_preloaded_playlist + sizeof(struct BC_playlist_header)); 4973 pe = pm + (BC_preloaded_playlist->ph_nmounts * sizeof(struct BC_playlist_mount)); 4974 4975 /* Initialize the cache. 4976 * 4977 * auto-start is for DVDs or other read-only media, so don't bother recording history 4978 */ 4979 error = BC_init_cache(BC_preloaded_playlist->ph_nmounts * sizeof(struct BC_playlist_mount), pm, 4980 BC_preloaded_playlist->ph_nentries * sizeof(struct BC_playlist_entry), pe, 4981 0 /* don't record history */); 4982 if (error != 0) 4983 debug("BootCache autostart failed: %d", error); 4984} 4985 4986/* 4987 * Called from the root-unmount hook. 4988 */ 4989static void 4990BC_unmount_hook(void) 4991{ 4992 debug("device unmounted"); 4993 4994 /* 4995 * If the cache is running, stop it. If it's already stopped 4996 * (it may have stopped itself), that's OK. 4997 */ 4998 BC_terminate_cache(); 4999 BC_terminate_history(); 5000 5001} 5002 5003 5004/* 5005 * Handle the command sysctl. 5006 */ 5007static int 5008BC_sysctl SYSCTL_HANDLER_ARGS 5009{ 5010 struct BC_command bc; 5011 int error; 5012 5013 /* get the commande structure and validate */ 5014 if ((error = SYSCTL_IN(req, &bc, sizeof(bc))) != 0) { 5015 debug("couldn't get command"); 5016 return(error); 5017 } 5018 if (bc.bc_magic != BC_MAGIC) { 5019 debug("bad command magic"); 5020 return(EINVAL); 5021 } 5022 5023 switch (bc.bc_opcode) { 5024 case BC_OP_START: 5025 debug("BC_OP_START(%d mounts, %d extents)", (int)(bc.bc_data1_size / sizeof(struct BC_playlist_mount)), (int)(bc.bc_data2_size / sizeof(struct BC_playlist_entry))); 5026 if (BC_preloaded_playlist) { 5027 error = EINVAL; 5028 break; 5029 } 5030 error = BC_init_cache(bc.bc_data1_size, (user_addr_t)bc.bc_data1, bc.bc_data2_size, (user_addr_t)bc.bc_data2, 1 /* record history */); 5031 if (error != 0) { 5032 message("cache init failed"); 5033 } 5034 break; 5035 5036 case BC_OP_STOP: 5037 debug("BC_OP_STOP"); 5038 5039 /* 5040 * If we're recording history, stop it. If it's already stopped 5041 * (it may have stopped itself), that's OK. 
5042 */ 5043 BC_terminate_history(); 5044 5045 /* return the size of the history buffer */ 5046 LOCK_HISTORY_W(); 5047 bc.bc_data1_size = BC_size_history_mounts(); 5048 bc.bc_data2_size = BC_size_history_entries(); 5049 if ((error = SYSCTL_OUT(req, &bc, sizeof(bc))) != 0) 5050 debug("could not return history size"); 5051 UNLOCK_HISTORY_W(); 5052 5053 break; 5054 5055 case BC_OP_MOUNT: 5056 /* debug("BC_OP_MOUNT"); */ 5057 5058 /* 5059 * A notification that mounts have changed. 5060 */ 5061 BC_update_mounts(); 5062 break; 5063 5064 case BC_OP_HISTORY: 5065 debug("BC_OP_HISTORY"); 5066 /* if there's a user buffer, verify the size and copy out */ 5067 LOCK_HISTORY_W(); 5068 if (bc.bc_data1 != 0 && bc.bc_data2 != 0) { 5069 if (bc.bc_data1_size < BC_size_history_mounts() || 5070 bc.bc_data2_size < BC_size_history_entries()) { 5071 debug("supplied history buffer too small"); 5072 error = EINVAL; 5073 UNLOCK_HISTORY_W(); 5074 break; 5075 } 5076 if ((error = BC_copyout_history_mounts(bc.bc_data1)) != 0) { 5077 debug("could not copy out history mounts: %d", error); 5078 } 5079 if ((error = BC_copyout_history_entries(bc.bc_data2)) != 0) { 5080 debug("could not copy out history entries: %d", error); 5081 } 5082 } 5083 /* 5084 * Release the last of the memory we own. 5085 */ 5086 BC_discard_history(); 5087 UNLOCK_HISTORY_W(); 5088 break; 5089 5090 case BC_OP_JETTISON: 5091 debug("BC_OP_JETTISON"); 5092 5093 /* 5094 * Jettison the cache. If it's already jettisoned 5095 * (it may have jettisoned itself), that's OK. 5096 */ 5097 BC_terminate_cache(); 5098 break; 5099 5100 case BC_OP_STATS: 5101 debug("BC_OP_STATS"); 5102 /* check buffer size and copy out */ 5103 if (bc.bc_data1_size != sizeof(BC_cache->c_stats)) { 5104 debug("stats structure wrong size"); 5105 error = ENOMEM; 5106 } else { 5107 BC_cache->c_stats.ss_cache_flags = BC_cache->c_flags; 5108 if ((error = copyout(&BC_cache->c_stats, bc.bc_data1, bc.bc_data1_size)) != 0) 5109 debug("could not copy out statistics"); 5110 } 5111 break; 5112 5113 case BC_OP_TAG: 5114 debug("BC_OP_TAG"); 5115 KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_TAG) | 5116 DBG_FUNC_NONE, proc_selfppid(), dbg_tag_count, 5117 0, 0, 0); 5118 BC_add_history(0, 0, 0, 0, 0, 1, 0, 0); 5119 dbg_tag_count++; 5120 break; 5121 5122 case BC_OP_TEST: 5123 debug("BC_OP_TEST"); 5124 5125 if (! (BC_cache->c_flags & BC_FLAG_SETUP)) { 5126 error = ENODEV; 5127 } 5128 break; 5129 5130 default: 5131 error = EINVAL; 5132 } 5133 return(error); 5134} 5135 5136 5137/* 5138 * Initialise the block of pages we use to back our data. 
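 *
 * The cache contents are backed by a file (BC_BOOT_BACKINGFILE): we open or
 * create its vnode and use ubc_setsize() to grow the associated UBC object to
 * c_cachesize, so cached data lives in that vnode's unified buffer cache and
 * individual pages can later be dumped via ubc_range_op() (see BC_free_page
 * and BC_free_pagebuffer below).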
5139 * 5140 */ 5141static int 5142BC_alloc_pagebuffer() 5143{ 5144 kern_return_t kret; 5145 int ret; 5146 5147 if (BC_cache->c_vp == NULL) { 5148 5149 kret = vnode_open(BC_BOOT_BACKINGFILE, (O_CREAT | O_NOFOLLOW | FREAD), 5150 0400, 0, &BC_cache->c_vp, vfs_context_current()); 5151 if (kret != KERN_SUCCESS) { 5152 debug("vnode_open failed - %d", kret); 5153 return -1; 5154 } 5155 5156 BC_cache->c_vid = vnode_vid(BC_cache->c_vp); 5157 5158 } else { 5159 ret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid); 5160 if (ret != 0) { 5161 debug("vnode_getwithvid failed - %d", ret); 5162 return -1; 5163 } 5164 } 5165 5166 assert(BC_cache->c_cachesize != 0); 5167 kret = ubc_setsize(BC_cache->c_vp, BC_cache->c_cachesize); 5168 if (kret != 1) { 5169 debug("ubc_setsize failed - %d", kret); 5170 (void) vnode_put(BC_cache->c_vp); 5171 return -1; 5172 } 5173 5174 (void) vnode_put(BC_cache->c_vp); 5175 5176 return(0); 5177} 5178 5179/* 5180 * Release all resources associated with our pagebuffer. 5181 */ 5182static void 5183BC_free_pagebuffer(void) 5184{ 5185 kern_return_t kret; 5186 if (BC_cache->c_vp != NULL) { 5187 /* Can handle a vnode without ubc info */ 5188 kret = vnode_getwithref(BC_cache->c_vp); 5189 if (kret != KERN_SUCCESS) { 5190 debug("vnode_getwithref failed - %d", kret); 5191 return; 5192 } 5193 5194 kret = ubc_range_op(BC_cache->c_vp, 0, BC_cache->c_cachesize, 5195 UPL_ROP_DUMP, NULL); 5196 if (kret != KERN_SUCCESS) { 5197 debug("ubc_range_op failed - %d", kret); 5198 } 5199 5200#if 0 5201 /* need this interface to be exported... will eliminate need 5202 * for the ubc_range_op above */ 5203 kret = vnode_delete(BC_BOOT_BACKINGFILE, vfs_context_current()); 5204 if (kret) { 5205 debug("vnode_delete failed - %d", error); 5206 } 5207#endif 5208 kret = vnode_close(BC_cache->c_vp, 0, vfs_context_current()); 5209 if (kret != KERN_SUCCESS) { 5210 debug("vnode_close failed - %d", kret); 5211 } 5212 5213 BC_cache->c_vp = NULL; 5214 } 5215} 5216 5217/* 5218 * Release one page from the pagebuffer. 5219 */ 5220static void 5221BC_free_page(struct BC_cache_extent *ce, int page) 5222{ 5223 off_t vpoffset; 5224 kern_return_t kret; 5225 5226 vpoffset = (page * PAGE_SIZE) + ce->ce_cacheoffset; 5227 kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid); 5228 if (kret != KERN_SUCCESS) { 5229 debug("vnode_getwithvid failed - %d", kret); 5230 return; 5231 } 5232 5233 kret = ubc_range_op(BC_cache->c_vp, vpoffset, vpoffset + PAGE_SIZE, 5234 UPL_ROP_DUMP, NULL); 5235 if (kret != KERN_SUCCESS) { 5236 debug("ubc_range_op failed - %d", kret); 5237 } 5238 5239 (void) vnode_put(BC_cache->c_vp); 5240} 5241 5242void 5243BC_hook(void) 5244{ 5245 int error; 5246 5247 if ((error = BC_setup()) != 0) { 5248 debug("BootCache setup failed: %d", error); 5249 } 5250} 5251 5252/* 5253 * Cache start/stop functions. 
5254 */ 5255void 5256BC_load(void) 5257{ 5258 struct mach_header *mh; 5259 long xsize; 5260 int has64bitness, error; 5261 size_t sysctlsize = sizeof(int); 5262 char *plsection = "playlist"; 5263 5264#ifdef BC_DEBUG 5265 microtime(&debug_starttime); 5266#endif 5267 5268 /* register our control interface */ 5269 sysctl_register_oid(&sysctl__kern_BootCache); 5270 5271 /* clear the control structure */ 5272 bzero(BC_cache, sizeof(*BC_cache)); 5273 5274 BC_cache->c_lckgrp = lck_grp_alloc_init("BootCache", LCK_GRP_ATTR_NULL); 5275 lck_rw_init(&BC_cache->c_history_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 5276 lck_rw_init(&BC_cache->c_cache_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 5277 lck_mtx_init(&BC_cache->c_handlers_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 5278 lck_mtx_init(&BC_cache->c_reader_threads_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 5279#ifdef BC_DEBUG 5280 lck_mtx_init(&debug_printlock, BC_cache->c_lckgrp, LCK_ATTR_NULL); 5281#endif 5282 /* 5283 * Find the preload section and its size. If it's not empty 5284 * we have a preloaded playlist, so prepare for an early 5285 * start. 5286 */ 5287 if (kmod_info.info_version != KMOD_INFO_VERSION) { 5288 message("incompatible kmod_info versions"); 5289 return; 5290 } 5291 mh = (struct mach_header *)kmod_info.address; 5292 5293 /* 5294 * If booting on a 32-bit only machine, use the "playlist32" section. 5295 * Otherwise, fall through to the default "playlist" section. 5296 */ 5297 error = sysctlbyname("hw.cpu64bit_capable", &has64bitness, &sysctlsize, 5298 NULL, 0); 5299 if (error == 0 && has64bitness == 0) { 5300 plsection = "playlist32"; 5301 } 5302 BC_preloaded_playlist = getsectdatafromheader(mh, "BootCache", 5303 plsection, &BC_preloaded_playlist_size); 5304 debug("preload section %s: %d @ %p", 5305 plsection, BC_preloaded_playlist_size, BC_preloaded_playlist); 5306 5307 if (BC_preloaded_playlist != NULL) { 5308 if (BC_preloaded_playlist_size < sizeof(struct BC_playlist_header)) { 5309 message("preloaded playlist too small"); 5310 return; 5311 } 5312 if (BC_preloaded_playlist->ph_magic != PH_MAGIC) { 5313 message("preloaded playlist has invalid magic (%x, expected %x)", 5314 BC_preloaded_playlist->ph_magic, PH_MAGIC); 5315 return; 5316 } 5317 xsize = sizeof(struct BC_playlist_header) + 5318 (BC_preloaded_playlist->ph_nmounts * sizeof(struct BC_playlist_mount)) + 5319 (BC_preloaded_playlist->ph_nentries * sizeof(struct BC_playlist_entry)); 5320 if (xsize > BC_preloaded_playlist_size) { 5321 message("preloaded playlist too small (%d, expected at least %ld)", 5322 BC_preloaded_playlist_size, xsize); 5323 return; 5324 } 5325 BC_wait_for_readahead = BC_READAHEAD_WAIT_CDROM; 5326 mountroot_post_hook = BC_auto_start; 5327 unmountroot_pre_hook = BC_unmount_hook; 5328 5329 debug("waiting for root mount..."); 5330 } else { 5331 /* no preload, normal operation */ 5332 BC_preloaded_playlist = NULL; 5333 mountroot_post_hook = BC_hook; 5334 debug("waiting for playlist..."); 5335 } 5336 5337 microtime(&BC_cache->c_loadtime); 5338} 5339 5340int 5341BC_unload(void) 5342{ 5343 int error; 5344 5345 debug("preparing to unload..."); 5346 unmountroot_pre_hook = NULL; 5347 5348 /* 5349 * If the cache is running, stop it. If it's already stopped 5350 * (it may have stopped itself), that's OK. 
5351 */ 5352 BC_terminate_history(); 5353 if ((error = BC_terminate_cache()) != 0) { 5354 if (error != ENXIO) 5355 goto out; 5356 } 5357 LOCK_HISTORY_W(); 5358 BC_discard_history(); 5359 UNLOCK_HISTORY_W(); 5360 5361 lck_rw_destroy(&BC_cache->c_history_lock, BC_cache->c_lckgrp); 5362 lck_rw_destroy(&BC_cache->c_cache_lock, BC_cache->c_lckgrp); 5363 lck_mtx_destroy(&BC_cache->c_handlers_lock, BC_cache->c_lckgrp); 5364 lck_mtx_destroy(&BC_cache->c_reader_threads_lock, BC_cache->c_lckgrp); 5365 lck_grp_free(BC_cache->c_lckgrp); 5366 5367 sysctl_unregister_oid(&sysctl__kern_BootCache); 5368 error = 0; 5369 5370out: 5371 return(error); 5372} 5373
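
/*
 * Illustrative userspace usage (not compiled into the kext).
 *
 * The sysctl handler above is normally driven by the BootCacheControl tool.
 * The fragment below is only a minimal sketch of how a client could ask the
 * kext to stop history recording and learn how large the history buffers are;
 * it assumes the struct BC_command / BC_OP_* definitions exported by
 * BootCache.h and assumes the OID registered above is reachable by the name
 * "kern.BootCache".
 *
 *	#include <string.h>
 *	#include <sys/sysctl.h>
 *	#include "BootCache.h"
 *
 *	static int
 *	bc_stop_history(size_t *mounts_size, size_t *entries_size)
 *	{
 *		struct BC_command bc;
 *		size_t len = sizeof(bc);
 *
 *		memset(&bc, 0, sizeof(bc));
 *		bc.bc_magic = BC_MAGIC;
 *		bc.bc_opcode = BC_OP_STOP;
 *
 *		// BC_OP_STOP halts history recording and returns the sizes of the
 *		// history mount and entry buffers in bc_data1_size/bc_data2_size;
 *		// a follow-up BC_OP_HISTORY call would use them to size its buffers.
 *		if (sysctlbyname("kern.BootCache", &bc, &len, &bc, sizeof(bc)) != 0)
 *			return -1;
 *
 *		*mounts_size = bc.bc_data1_size;
 *		*entries_size = bc.bc_data2_size;
 *		return 0;
 *	}
 */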