1/*
2 * Boot-time performance cache.
3 *
4 * We implement a shim layer for a block device which can be advised ahead
5 * of time of the likely read pattern for the block device.
6 *
7 * Armed with a sorted version of this read pattern (the "playlist"), we
8 * cluster the reads into a private cache, taking advantage of the
9 * sequential I/O pattern. Incoming read requests are then satisfied from
10 * the cache (if possible), or by forwarding to the original strategy routine.
11 *
12 * We also record the pattern of incoming read requests (the "history
13 * list"), in order to provide for adaptation to changes in behaviour by
14 * the calling layer.
15 *
16 * The hit rate of the cache is also measured, and caching will be disabled
17 * preemptively if the hit rate is too low.
18 *
19 * The cache is controlled by a sysctl interface.  Typical usage is as
20 * follows:
21 *
22 * - Load a sorted read pattern.
23 * - Wait for the operation(s) of interest to be performed.
24 * - Fetch the new (unsorted) read pattern.
25 *
 * Loading a read pattern initialises the cache; upon successful loading
 * the cache is active and readahead is being performed.  After fetching the
 * recorded read pattern, the cache remains around until it is jettisoned
 * due to memory pressure or until the hit rate falls below a desirable level.
30 *
31 * Limitations:
32 *
33 * - only one copy of the cache may be active at a time
34 */
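
/*
 * Illustrative sketch (not compiled): driving the sysctl interface from
 * user space.  The command structure and opcodes live in BootCache.h and
 * are normally exercised via the BootCacheControl tool; the function and
 * variable names below are for illustration only.
 */
#if 0
#include <sys/sysctl.h>
#include "BootCache.h"

static int
bc_send_command(struct BC_command *bc, size_t bc_size)
{
	/* "kern.BootCache" is the SYSCTL_PROC registered by this file ("S,BC_command") */
	return sysctlbyname("kern.BootCache", NULL, NULL, bc, bc_size);
}
#endif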
35
36/*
37 * Build options.
38 */
39/*#define BC_DEBUG*/
40/*#define BC_EXTRA_DEBUG*/
41
42/*
43 * Emulate readahead, don't actually perform it.
44 */
45/*#define EMULATE_ONLY*/
46
47/*
48 * Don't collect request history.
49 */
50/*#define NO_HISTORY*/
51
52/*
 * Don't use history cluster chains (there appears to be a malloc problem
 * when they are enabled)
55 */
56/*#define STATIC_HISTORY*/
57
58/*
59 * Only cache for the volumes on the root disk
60 */
61#define ROOT_DISK_ONLY
62
63/*
64 * Keep a log of read history to aid in debugging.
65 */
66/*#define READ_HISTORY_BUFFER	1000*/
67/*#define READ_HISTORY_ALL*/
68
69/*
70 * Ignore the batches on playlist entries.
71 */
72/*#define IGNORE_BATCH */
73
74/*
75 * Tunable parameters.
76 */
77
78/*
79 * Minimum required physical memory before BootCache will
80 * activate.  Insufficient physical memory leads to paging
81 * activity during boot, which is not repeatable and can't
82 * be distinguished from the repeatable read patterns.
83 *
84 * Size in megabytes.
85 */
86static int BC_minimum_mem_size = 256;
87
88/*
89 * Number of seconds to wait in BC_strategy for readahead to catch up
 * with our request.  Tuning issues:
 *  - Waiting too long may hold up the system behind this I/O.
92 *  - Bypassing an I/O too early will reduce the hit rate and adversely
93 *    affect performance on slow media.
94 */
95#define BC_READAHEAD_WAIT_DEFAULT	10
96#define BC_READAHEAD_WAIT_CDROM		45
97static int BC_wait_for_readahead = BC_READAHEAD_WAIT_DEFAULT;
98
99/*
100 * Cache hit ratio monitoring.
101 *
102 * The time overhead of a miss in the cache is approx 1/5000th of the
103 * time it takes to satisfy an IO from a HDD. If our hit rate falls
104 * below this, we're no longer providing a performance win and the
105 * cache will be jettisoned.
106 *
107 * Make sure to keep the cache around for at least an hour
108 * if the user hasn't logged in yet, though.
109 */
110static int BC_chit_max_num_IOs_since_last_hit = 10000;
111static int BC_chit_prelogin_timeout_seconds = 3600;
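
/*
 * Illustrative sketch (not compiled): roughly how these tunables are meant
 * to be applied.  The real policy lives in the strategy and cleanup paths
 * later in this file; this only shows the shape of the check.
 */
#if 0
	/* every miss bumps the counter, a hit resets it (see c_num_ios_since_last_hit) */
	if (BC_cache->c_num_ios_since_last_hit > BC_chit_max_num_IOs_since_last_hit &&
	    past_prelogin_grace_period /* placeholder for the pre-login timeout above */) {
		/* a run of misses this long means the cache is no longer a win */
		BC_terminate_cache_async();
	}
#endif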
112
113/*
114 * History recording timeout.
115 *
116 * If the history recording is not turned off, the history buffer
117 * will grow to its maximum size. This timeout will kill the history
118 * recording after the specified interval (in hz).
119 *
120 * User DVDs don't issue a "BootCacheControl stop", so they depend
121 * on the history recording timeout.
122 */
123static int BC_history_timeout = 240 * 100;
124
125/*
126 * Trace macros
127 */
128#ifndef DBG_BOOTCACHE
129#define DBG_BOOTCACHE	7
130#endif
131
132#define	DBG_BC_TAG					(1 << 0)
133#define	DBG_BC_BATCH				(1 << 1)
134
135#define DBG_BC_IO_HIT				(1 << 2)
136#define DBG_BC_IO_HIT_STALLED		(1 << 3)
137#define DBG_BC_IO_MISS				(1 << 4)
138#define DBG_BC_IO_MISS_CUT_THROUGH	(1 << 5)
139#define DBG_BC_PLAYBACK_IO			(1 << 6)
140
141static int dbg_tag_count = 0;
142
143#ifdef BC_DEBUG
144# define MACH_DEBUG
145# define MACH_ASSERT 1
146# define message(fmt, args...)	\
147do { \
148	microtime(&debug_currenttime); \
149	timersub(&debug_currenttime, &debug_starttime, &debug_currenttime); \
150	lck_mtx_lock(&debug_printlock); \
151	printf("BootCache: %5d.%03d %24s[%4d]: " fmt "\n", (u_int)(debug_currenttime.tv_sec), (u_int)(debug_currenttime.tv_usec / 1000), __FUNCTION__, __LINE__, ##args); \
152	lck_mtx_unlock(&debug_printlock); \
153} while (0)
154
155# define debug(fmt, args...)	message("*** " fmt, ##args)
156extern void Debugger(char *);
157#else
158# define debug(fmt, args...)	do {} while (0)
159# define message(fmt, args...)	printf("BootCache: " fmt "\n" , ##args)
160#endif
161
162#ifdef BC_EXTRA_DEBUG
163# define xdebug(fmt, args...)	message("+++ " fmt, ##args)
164#else
165# define xdebug(fmt, args...)	do {} while (0)
166#endif
167
168#include <sys/buf.h>
169#include <sys/conf.h>
170#include <sys/fcntl.h>
171#include <sys/kdebug.h>
172#include <sys/kernel.h>
173#include <sys/malloc.h>
174#include <sys/param.h>
175#include <sys/proc.h>
176#include <sys/types.h>
177#include <sys/sysctl.h>
178#include <sys/systm.h>
179#include <sys/vnode.h>
180#include <sys/mount.h>
181#include <sys/ubc.h>
182#include <sys/uio.h>
183
184#include <uuid/uuid.h>
185
186#include <mach/kmod.h>
187#include <mach/mach_types.h>
188#include <mach/vm_param.h>	/* for max_mem */
189
190#include <vm/vm_kern.h>
191
192#include <libkern/libkern.h>
193#include <libkern/OSAtomic.h>
194
195#include <kern/assert.h>
196#include <kern/host.h>
197#include <kern/kalloc.h>
198#include <kern/thread_call.h>
199#include <kern/locks.h>
200
201#include <IOKit/storage/IOMediaBSDClient.h>
202
203#include <pexpert/pexpert.h>
204
205#include "BootCache.h"
206
207#ifdef BC_DEBUG
208static struct timeval debug_starttime;
209static struct timeval debug_currenttime;
210static lck_mtx_t debug_printlock;
211#endif
212
#define BC_MALLOC(_size, _type, _flags) (((ssize_t)(_size) < 0) ? NULL : _MALLOC((_size), (_type), (_flags)))
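
/*
 * Illustrative sketch (not compiled): the cast in BC_MALLOC guards against
 * size computations that exceed SSIZE_MAX (e.g. from a corrupt entry count);
 * such a size reads as negative once cast, and the macro yields NULL instead
 * of calling _MALLOC.  Variable names here are for illustration only.
 */
#if 0
	/* a huge (corrupt) size reads as a negative ssize_t, so this yields NULL */
	entries = BC_MALLOC(nentries * sizeof(struct BC_playlist_entry), M_TEMP, M_WAITOK);
#endif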
214
215/**************************************
216 * Structures for the readahead cache *
217 **************************************/
218
219/*
220 * A cache extent.
221 *
222 * Each extent represents a contiguous range of disk blocks which are
223 * held in the cache.  All values in bytes.
224 */
225struct BC_cache_extent {
226	lck_mtx_t   ce_lock;
227	u_int64_t   ce_diskoffset;  /* physical offset on device */
228	u_int64_t   ce_length;      /* data length */
229	off_t       ce_cacheoffset;	/* offset into cache backing store */
230	u_int16_t   ce_batch;       /* batch number */
231	u_int16_t   ce_flags;       /* flags */
232#define CE_ABORTED  (1 << 0)    /* extent no longer has data */
233#define CE_IODONE   (1 << 1)    /* extent has been read */
234#define CE_LOWPRI   (1 << 2)    /* extent is low priority */
235#define CE_SHARED   (1 << 3)    /* extent is in the shared cache */
236	int         ce_mount_idx;   /* the index of the mount for this extent */
237	u_int32_t  *ce_blockmap;    /* track valid blocks for this extent */
238};
239
240/* A mount
241 *
242 * The volume to which an extent refers
243 */
244struct BC_cache_mount {
245	lck_rw_t                 cm_lock;      /* lock guards instance variables, extents have their own locks */
246	uuid_t                   cm_uuid;      /* this mount's uuid */
247	int                      cm_nextents;  /* the number of extents that reference this mount */
248	struct BC_cache_extent **cm_pextents;  /* a list of pointers to extents which reference
249											this mount, sorted by block address */
250	int                      cm_state;     /* this mount's state */
251#define CM_SETUP	(0)                    /* fields above are valid. Not mounted yet */
252#define CM_READY	(1)                    /* every field is valid. Mounted and ready to go */
253#define CM_ABORTED	(2)                    /* mount failed for some reason */
254
255	/* fields below filled in when we detect that the volume has mounted */
256	dev_t                    cm_dev;       /* the device for this mount */
257	struct buf              *cm_bp;        /* struct buf allocated in the reader thread for this device */
258	u_int64_t                cm_throttle_mask;/* the throttle mask to use in throttle APIs for this mount */
259	u_int64_t                cm_blocksize; /* keep 64-bit for promotion */
260	u_int64_t                cm_devsize;   /* range checking */
261	u_int64_t                cm_maxread;   /* largest read the dev will support */
262	struct BC_cache_disk*    cm_disk;      /* the mount's physical disk */
263};
264
265/* A physical disk
266 *
267 * The disk may contain several mounts
268 * and will have one readahead thread
269 */
270struct BC_cache_disk {
271	lck_mtx_t cd_lock;
272	u_int64_t cd_disk_id;    /* The throttle mask, used as an identifier */
273	int       cd_disk_num; /* used for stat batching */
274	int       cd_flags;      /* flags */
275#define CD_HAS_THREAD   (1 << 0)  /* there is a readahead thread in progress for this disk */
276#define CD_ISSUE_LOWPRI (1 << 1)  /* the readahead thread is reading in low-priority extents */
277#define CD_IS_SSD       (1 << 2)  /* this is a solid state disk */
278	int       cd_nmounts;    /* The number of mounts that reference this disk */
279	int       cd_batch;      /* What playback batch this disk is on */
280};
281
282/************************************
283 * Structures for history recording *
284 ************************************/
285
286/* see BC_history_entry and BC_history_mount in BootCache.h */
287
288/*
289 * Wrapper around BC_history_mount so we
290 * can keep track of the device
291 */
292struct BC_history_mount_device {
293	u_int64_t                       hmd_disk_id;
294	struct BC_history_mount_device* hmd_next;
295	uint32_t                        hmd_is_ssd;
296	dev_t                           hmd_dev;
297	int                             hmd_blocksize;
298	struct BC_history_mount         hmd_mount;
299};
300
301/*
302 * A history entry list cluster.
303 */
304struct BC_history_entry_cluster {
305	struct BC_history_entry_cluster *hec_link;
306	int                              hec_nentries;	/* number of entries in list */
307	struct BC_history_entry          hec_data[0];
308};
309
310#ifdef STATIC_HISTORY
311# define BC_HISTORY_ALLOC	128000
312# define BC_HISTORY_ENTRIES						\
313(((BC_HISTORY_ALLOC - sizeof(struct BC_history_entry_cluster)) /	\
314sizeof(struct BC_history_entry)) - 1)
315# define BC_HISTORY_MAXCLUSTERS	1
316#else
317# define BC_HISTORY_ALLOC	16000
318# define BC_HISTORY_ENTRIES						\
319(((BC_HISTORY_ALLOC - sizeof(struct BC_history_entry_cluster)) /	\
320sizeof(struct BC_history_entry)) - 1)
321# define BC_HISTORY_MAXCLUSTERS	((BC_MAXENTRIES / BC_HISTORY_ENTRIES) + 1)
322#endif
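
/*
 * Illustrative sketch (not compiled): how a history entry cluster sized by
 * the macros above might be allocated and chained.  The real allocation
 * path (and its locking) is in the history-recording code later in this
 * file; the malloc type and flags here are assumptions.
 */
#if 0
	struct BC_history_entry_cluster *hec;

	hec = BC_MALLOC(BC_HISTORY_ALLOC, M_TEMP, M_WAITOK | M_ZERO);
	if (hec != NULL) {
		hec->hec_nentries = 0;               /* fills up to BC_HISTORY_ENTRIES */
		hec->hec_link = BC_cache->c_history; /* the history list is kept in reverse order */
		BC_cache->c_history = hec;
	}
#endif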
323
324
325/*****************************************
326 * To be able to debug live read history *
327 *****************************************/
328#ifdef READ_HISTORY_BUFFER
329struct BC_read_history_entry {
330	struct BC_cache_extent	*rh_extent;	/* extent this read is for */
331	u_int64_t		rh_blkno;	/* offset of actual read */
332	u_int64_t		rh_bcount;	/* length in bytes */
333	int			rh_result;
334# define BC_RHISTORY_OK		0
335# define BC_RHISTORY_FAIL	1
336};
337#endif
338
339
340/*
341 * The cache control structure.
342 */
343struct BC_cache_control {
344	lck_grp_t  *c_lckgrp;
345
346	/* the device we are covering */
347	struct bdevsw	*c_devsw;
348
349	/* saved switch table routines */
350	strategy_fcn_t   *c_strategy;
351	open_close_fcn_t *c_close;
352	lck_mtx_t         c_handlers_lock;
353
354	u_int64_t	c_cachesize;		/* total number of bytes in cache */
355
356	u_int64_t   c_root_disk_id;     /* the throttle mask of the root disk, used as an id for the physical device */
357	                                /* This needs to be updated to handle multiple physical volumes backing a mount */
358
359	/*
360	 * The mounts we're tracking
361	 */
362	int                     c_nmounts; /* number of mounts in the array below */
363	struct BC_cache_mount  *c_mounts;  /* the array of mounts the cache extents refer to */
364
365	/*
366	 * Extents, in optimal read order.
367	 *
368	 * Each extent tracks the pages it owns in the buffer.
369	 * It is assumed that PAGE_SIZE is a multiple of cm_blocksize for each mount.
370	 *
371	 * Each extent refers to one of the mounts, above, by index into c_mounts
372	 */
373	int                      c_nextentlists; /* number of extent lists we have */
374	int                     *c_nextents;     /* number of extents in a given extent list */
375	struct BC_cache_extent **c_extentlists;      /* an array of extent lists in the cache */
376
377	int         c_batch_count;		/* number of extent batches */
378	vnode_t     c_vp;
379	uint32_t    c_vid;
380	/* rw lock protects the above fields through c_nmounts, but not their contents.
381	 * Shared for general use of the fields,
382	 * Exclusive for cache initialization or cleanup.
383	 * Each mount/extent has its own lock for modifying its contents.
384	 */
385	lck_rw_t    c_cache_lock;
386
387
388	/* history list, in reverse order */
389	int                              c_history_num_clusters;
390	struct BC_history_entry_cluster *c_history;
391	struct BC_history_mount_device  *c_history_mounts;
392	uint64_t                         c_history_size;
393	/* lock protects the above history fields and their contents */
394	lck_rw_t   c_history_lock;
395
396	/* fields below are accessed atomically */
397
398	/* flags */
399	UInt32		c_flags;
400#define	BC_FLAG_SETUP			(1 << 0)	/* cache setup properly during mount */
401#define	BC_FLAG_CACHEACTIVE		(1 << 1)	/* cache is active, owns memory */
402#define	BC_FLAG_HISTORYACTIVE	(1 << 2)	/* currently recording history */
403#define	BC_FLAG_HTRUNCATED		(1 << 3)	/* history list truncated */
404#define	BC_FLAG_SHUTDOWN		(1 << 4)	/* readahead shut down */
405	SInt32		c_strategycalls;	/* count of busy strategy calls in the cache */
406
407	uint32_t  c_readahead_throttles_cutthroughs;
408
409	uint32_t  c_num_ios_since_last_hit; /* The number of cache misses since the last hit */
410
411	uint32_t  c_num_reader_threads;     /* The number of reader threads active */
412	lck_mtx_t c_reader_threads_lock;    /* protects c_num_reader_threads */
413
414	/* statistics */
415	int c_take_detailed_stats;
416	struct BC_statistics c_stats;
417
418	/* time-keeping */
419	struct timeval c_loadtime;
420	struct timeval c_cache_starttime;
421	struct timeval c_history_starttime;
422
423#ifdef READ_HISTORY_BUFFER
424	int		c_rhistory_idx;
425	struct BC_read_history_entry *c_rhistory;
426#endif
427};
428
429/*
430 * The number of blocks per page; assumes that a page
431 * will always be >= the size of a disk block.
432 */
433#define CB_PAGEBLOCKS(cm)			(PAGE_SIZE / (cm)->cm_blocksize)
434
435/*
436 * Convert block offset to page offset and vice versa.
437 */
438#define CB_BLOCK_TO_PAGE(cm, block)		((block) / CB_PAGEBLOCKS(cm))
439#define CB_PAGE_TO_BLOCK(cm, page)		((page)  * CB_PAGEBLOCKS(cm))
440#define CB_BLOCK_TO_BYTE(cm, block)		((block) * (cm)->cm_blocksize)
441#define HB_BLOCK_TO_BYTE(hmd, block)	((block) * (hmd)->hmd_blocksize)
442#define CB_BYTE_TO_BLOCK(cm, byte)		((byte)  / (cm)->cm_blocksize)
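
/*
 * Worked example, assuming cm_blocksize == 512 and PAGE_SIZE == 4096:
 *   CB_PAGEBLOCKS(cm)           = 4096 / 512  = 8 blocks per page
 *   CB_BLOCK_TO_PAGE(cm, 20)    = 20 / 8      = page 2
 *   CB_PAGE_TO_BLOCK(cm, 2)     = 2 * 8       = block 16
 *   CB_BLOCK_TO_BYTE(cm, 20)    = 20 * 512    = 10240 bytes
 *   CB_BYTE_TO_BLOCK(cm, 10240) = 10240 / 512 = block 20
 */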
443
444/*
445 * The size of an addressable field in the block map.
 * This reflects the u_int32_t type used for the blockmap.
447 */
448#define CB_MAPFIELDBITS			32
449#define CB_MAPFIELDBYTES		4
450
451/*
452 * The index into an array of addressable fields, and the bit within
453 * the addressable field corresponding to an index in the map.
454 */
455#define CB_MAPIDX(x)			((x) / CB_MAPFIELDBITS)
456#define CB_MAPBIT(x)			(1 << ((x) % CB_MAPFIELDBITS))
457
458/*
459 * Blockmap management. Must be called with extent lock held.
460 */
461#define CB_BLOCK_PRESENT(ce, block) \
462((ce)->ce_blockmap != NULL && ((ce)->ce_blockmap[CB_MAPIDX(block)] &   CB_MAPBIT(block)))
463#define CB_MARK_BLOCK_PRESENT(ce, block) \
464((ce)->ce_blockmap[CB_MAPIDX(block)] |=  CB_MAPBIT(block))
465#define CB_MARK_BLOCK_VACANT(ce, block) \
466((ce)->ce_blockmap[CB_MAPIDX(block)] &= ~CB_MAPBIT(block))
467
468/*
469 * Determine whether a given page is vacant (all blocks are gone).
470 * This takes advantage of the expectation that a page's blocks
471 * will never cross the boundary of a field in the blockmap.
472 */
473#define CB_PAGE_VACANT(cm, ce, page)					\
474(!((ce)->ce_blockmap[CB_MAPIDX(CB_PAGE_TO_BLOCK(cm, page))] &	\
475(((1 << CB_PAGEBLOCKS(cm)) - 1) << (CB_PAGE_TO_BLOCK(cm, page) %	\
476CB_MAPFIELDBITS))))
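
/*
 * Worked example: for block index 37,
 *   CB_MAPIDX(37) = 37 / 32        = 1        (second u_int32_t field)
 *   CB_MAPBIT(37) = 1 << (37 % 32) = 1 << 5
 * so CB_BLOCK_PRESENT tests bit 5 of ce_blockmap[1].  With 8 blocks per
 * page (512-byte blocks, 4K pages), CB_PAGE_VACANT masks the 8 contiguous
 * bits covering that page (page 2 covers blocks 16..23, so the mask is
 * 0xFF << 16), and BC_free_page is only called once all of them are vacant.
 */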
477
478/* Maximum size of the boot cache */
479#define BC_MAX_SIZE (max_mem / 2)
480
481/*
482 * Sanity macro, frees and zeroes a pointer if it is not already zeroed.
483 */
#define _FREE_ZERO(p, c)						\
	do {								\
		if (p != NULL) {					\
			_FREE(p, c);					\
			p = NULL;					\
		}							\
	} while (0);
491
492/*
493 * Synchronization macros
494 */
495#define LOCK_HISTORY_R()		lck_rw_lock_shared(&BC_cache->c_history_lock)
496#define UNLOCK_HISTORY_R()		lck_rw_unlock_shared(&BC_cache->c_history_lock)
497#define LOCK_HISTORY_W()		lck_rw_lock_exclusive(&BC_cache->c_history_lock)
498#define UNLOCK_HISTORY_W()		lck_rw_unlock_exclusive(&BC_cache->c_history_lock)
499
500#define LOCK_EXTENT(e)			lck_mtx_lock(&(e)->ce_lock)
501#define UNLOCK_EXTENT(e)		lck_mtx_unlock(&(e)->ce_lock)
502
503/* mount locks should only be held while also holding the cache lock */
504#define LOCK_MOUNT_R(m)			lck_rw_lock_shared(&(m)->cm_lock)
505#define UNLOCK_MOUNT_R(m)		lck_rw_unlock_shared(&(m)->cm_lock)
506#define LOCK_MOUNT_W(m)			lck_rw_lock_exclusive(&(m)->cm_lock)
507#define UNLOCK_MOUNT_W(m)		lck_rw_unlock_exclusive(&(m)->cm_lock)
508#define LOCK_MOUNT_W_TO_R(m)	lck_rw_lock_exclusive_to_shared(&(m)->cm_lock)
509#define LOCK_MOUNT_R_TO_W(m)	lck_rw_lock_shared_to_exclusive(&(m)->cm_lock)
510
511#define LOCK_DISK(d)			lck_mtx_lock(&(d)->cd_lock)
512#define UNLOCK_DISK(d)			lck_mtx_unlock(&(d)->cd_lock)
513
514#define LOCK_READERS()			lck_mtx_lock(&BC_cache->c_reader_threads_lock)
515#define UNLOCK_READERS()		lck_mtx_unlock(&BC_cache->c_reader_threads_lock)
516
517#define LOCK_HANDLERS()			lck_mtx_lock(&BC_cache->c_handlers_lock)
518#define UNLOCK_HANDLERS()		lck_mtx_unlock(&BC_cache->c_handlers_lock)
519
520#define LOCK_CACHE_R()			lck_rw_lock_shared(&BC_cache->c_cache_lock)
521#define UNLOCK_CACHE_R()		lck_rw_unlock_shared(&BC_cache->c_cache_lock)
522#define LOCK_CACHE_W()			lck_rw_lock_exclusive(&BC_cache->c_cache_lock)
523#define UNLOCK_CACHE_W()		lck_rw_unlock_exclusive(&BC_cache->c_cache_lock)
524#define LOCK_CACHE_TRY_W()		lck_rw_try_lock(&BC_cache->c_cache_lock, LCK_RW_TYPE_EXCLUSIVE)
525#define LOCK_CACHE_TRY_R()		lck_rw_try_lock(&BC_cache->c_cache_lock, LCK_RW_TYPE_SHARED)
526#define LOCK_CACHE_W_TO_R()		lck_rw_lock_exclusive_to_shared(&BC_cache->c_cache_lock)
527#define LOCK_CACHE_R_TO_W()		lck_rw_lock_shared_to_exclusive(&BC_cache->c_cache_lock)
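
/*
 * Illustrative sketch (not compiled): mount locks are only taken while the
 * cache lock is held (see the note above), so a typical lookup mirrors
 * BC_cache_contains_block() below: cache lock first, then the mount lock.
 */
#if 0
	LOCK_CACHE_R();
	cm_idx = BC_find_cache_mount(dev);	/* requires the cache lock to be held */
	if (cm_idx != -1) {
		cm = BC_cache->c_mounts + cm_idx;
		LOCK_MOUNT_R(cm);
		/* ... look up extents belonging to this mount ... */
		UNLOCK_MOUNT_R(cm);
	}
	UNLOCK_CACHE_R();
#endif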
528
529#define BC_ADD_STAT(stat, inc)	OSAddAtomic((inc), ((SInt32 *)&BC_cache->c_stats.ss_##stat))
530
531/*
532 * Only one instance of the cache is supported.
533 */
534static struct BC_cache_control BC_cache_softc;
535static struct BC_cache_control *BC_cache = &BC_cache_softc;
536
537/*
538 * We support preloaded cache data by checking for a Mach-O
539 * segment/section which can be populated with a playlist by the
540 * linker.  This is particularly useful in the CD mastering process,
541 * as reading the playlist from CD is very slow and rebuilding the
542 * kext at mastering time is not practical.
543 */
544extern kmod_info_t kmod_info;
545static struct BC_playlist_header *BC_preloaded_playlist;
546static int	BC_preloaded_playlist_size;
547
548static int	BC_check_intersection(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length);
549static int	BC_find_cache_mount(dev_t dev);
550static struct BC_cache_extent**	BC_find_extent(struct BC_cache_mount* cm, u_int64_t offset, u_int64_t length, int contained, int* pnum_extents);
551static int	BC_discard_bytes(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length);
552static int	BC_handle_discards(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length);
553static int	BC_blocks_present(struct BC_cache_extent *ce, int base, int nblk);
554static void	BC_reader_thread(void *param0, wait_result_t param1);
555static int	BC_strategy(struct buf *bp);
556static int	BC_close(dev_t dev, int flags, int devtype, struct proc *p);
557static int	BC_terminate_readahead(void);
558static int	BC_terminate_cache(void);
559static int	BC_terminate_history(void);
560static void	BC_terminate_cache_async(void);
561static void	BC_check_handlers(void);
562static void	BC_next_valid_range(struct BC_cache_mount *cm, struct BC_cache_extent *ce, uint32_t from,
563								uint32_t *nextpage, uint32_t *nextoffset, uint32_t *nextlength);
564static int	BC_setup_disk(struct BC_cache_disk *cd, u_int64_t disk_id, int is_ssd);
565static void	BC_teardown_disk(struct BC_cache_disk *cd);
566static void	BC_mount_available(struct BC_cache_mount *cm);
567static int	BC_setup_mount(struct BC_cache_mount *cm, struct BC_playlist_mount* pm);
568static int	BC_fill_in_mount(struct BC_cache_mount *cm, mount_t mount, vfs_context_t context);
569static void	BC_teardown_mount(struct BC_cache_mount *ce);
570static int	BC_setup_extent(struct BC_cache_extent *ce, const struct BC_playlist_entry *pe, int batch_adjustment, int cache_mount_idx);
571static void	BC_teardown_extent(struct BC_cache_extent *ce);
572static int	BC_copyin_playlist(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries);
573static int	BC_init_cache(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries, int record_history);
574static void BC_update_mounts(void);
575static void	BC_timeout_history(void *);
576static struct BC_history_mount_device * BC_get_history_mount_device(dev_t dev, int* pindex);
577static int	BC_add_history(daddr64_t blkno, u_int64_t length, int pid, int hit, int write, int tag, int shared, dev_t dev);
578static int	BC_size_history_mounts(void);
579static int	BC_size_history_entries(void);
580static int	BC_copyout_history_mounts(user_addr_t uptr);
581static int	BC_copyout_history_entries(user_addr_t uptr);
582static void	BC_discard_history(void);
583static void	BC_auto_start(void);
584static int	BC_setup(void);
585
586static int	fill_in_bc_cache_mounts(mount_t mount, void* arg);
587static int	check_for_new_mount_itr(mount_t mount, void* arg);
static int	extents_status(struct BC_cache_extent **pce, int nextents);
static void	wait_for_extent(struct BC_cache_extent *ce, struct timeval starttime);
590
591#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
592static int  compare_cache_extents(const void *vfirst, const void *vsecond);
593//FIXME: qsort not available in kexts
594extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
595#endif
596
597/*
598 * Sysctl interface.
599 */
600static int	BC_sysctl SYSCTL_HANDLER_ARGS;
601
602SYSCTL_PROC(_kern, OID_AUTO, BootCache, CTLFLAG_RW | CTLFLAG_LOCKED,
603			0, 0, BC_sysctl,
604			"S,BC_command",
605			"BootCache command interface");
606
607/*
608 * Netboot exports this hook so that we can determine
609 * whether we were netbooted.
610 */
611extern int	netboot_root(void);
612
613/*
614 * bsd_init exports this hook so that we can register for notification
615 * once the root filesystem is mounted.
616 */
617extern void (* mountroot_post_hook)(void);
618extern void (* unmountroot_pre_hook)(void);
619
620/*
621 * Mach VM-specific functions.
622 */
623static int	BC_alloc_pagebuffer();
624static void	BC_free_pagebuffer();
625static void	BC_free_page(struct BC_cache_extent *ce, int page);
626
627/* Functions not prototyped elsewhere */
628int     	ubc_range_op(vnode_t, off_t, off_t, int, int *); /* in sys/ubc_internal.h */
629
630/*
631 * Mach-o functions.
632 */
633#define MACH_KERNEL 1
634#include <mach-o/loader.h>
635extern void *getsectdatafromheader(struct mach_header *, char *, char *, int *);
636
637/*
638 * Set or clear a flag atomically and return the previous value of that bit.
639 */
640static inline UInt32
641BC_set_flag(UInt32 flag)
642{
643	return flag & OSBitOrAtomic(flag, &BC_cache->c_flags);
644}
645
646static inline UInt32
647BC_clear_flag(UInt32 flag)
648{
649	return flag & OSBitAndAtomic(~(flag), &BC_cache->c_flags);
650}
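
/*
 * Illustrative sketch (not compiled): because these helpers return the
 * previous value of the bit, a caller can set a flag and learn whether it
 * was already set in one atomic step, e.g. to make shutdown idempotent.
 */
#if 0
	if (!BC_set_flag(BC_FLAG_SHUTDOWN)) {
		/* the flag was clear before this call, so we own the shutdown work */
	}
#endif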
651
652/*
653 * Return a user-readable string for a given uuid
654 *
655 * Returns a pointer to a static buffer, which is
656 * racy, so this should only be used for debugging purposes
657 */
658static inline const char* uuid_string(uuid_t uuid)
659{
660	/* Racy, but used for debug output so who cares */
661	static uuid_string_t uuidString;
662	uuid_unparse(uuid, uuidString);
663	return (char*)uuidString;
664}
665
666/*
667 * Test whether a given byte range on disk intersects with an extent's range.
668 * This function assumes the ranges are within the same mount.
669 */
670static inline int
671BC_check_intersection(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length)
672{
673	if ((offset < (ce->ce_diskoffset + ce->ce_length)) &&
674		((offset + length) > ce->ce_diskoffset))
675		return 1;
676	return 0;
677}
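
/*
 * Worked example: an extent covering [0x1000, 0x2000) intersects a request
 * at offset 0x1800 of length 0x1000 (the request's range [0x1800, 0x2800)
 * overlaps the tail of the extent), but not a request starting at offset
 * 0x2000, because the ranges are treated as half-open and merely touch.
 */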
678
679/*
680 * Find one of our cache mounts with the given device, if it has been seen this boot
681 *
682 * Called with the cache read lock held
683 *
684 * Returns the index of the mount, or -1 if it is not found
685 */
686static int
687BC_find_cache_mount(dev_t dev)
688{
689	int i;
690
691	if (dev == nulldev())
692		return(-1);
693
694	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE) || (BC_cache->c_mounts == NULL))
695		return(-1);
696
697
698	for (i = 0; i < BC_cache->c_nmounts; i++) {
699		if (BC_cache->c_mounts[i].cm_state == CM_READY) {
700			if (BC_cache->c_mounts[i].cm_dev == dev) {
701				return i;
702			}
703		}
704	}
705
706	return(-1);
707}
708
709/* The new throttling implementation needs to be able to check
710 * whether an IO is in the boot cache before issuing the IO
711 */
712static int BC_cache_contains_block(dev_t device, u_int64_t blkno) {
713	struct BC_cache_mount * cm = NULL;
714	struct BC_cache_extent **pce, *ce;
715
716	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) {
717		return 0;
718	}
719
720	LOCK_CACHE_R();
721
722	int cm_idx = BC_find_cache_mount(device);
723	if (cm_idx == -1) {
724		UNLOCK_CACHE_R();
725		return 0;
726	}
727
728	cm = BC_cache->c_mounts + cm_idx;
729
730	LOCK_MOUNT_R(cm);
731
732	if (cm->cm_state != CM_READY) {
733		UNLOCK_MOUNT_R(cm);
734		UNLOCK_CACHE_R();
735		return 0;
736	}
737
738	u_int64_t disk_offset = CB_BLOCK_TO_BYTE(cm, blkno);
739	u_int64_t length = CB_BLOCK_TO_BYTE(cm, 1) /* one block */;
740
741	pce = BC_find_extent(cm, disk_offset, length, 0, NULL);
742
743	if (pce == NULL || *pce == NULL) {
744		UNLOCK_MOUNT_R(cm);
745		UNLOCK_CACHE_R();
746		return 0;
747	}
748
749	ce = *pce;
750
751	UNLOCK_MOUNT_R(cm);
752
753	if (ce->ce_flags & CE_ABORTED) {
754		UNLOCK_CACHE_R();
755		return 0;
756	}
757
758	if (ce->ce_flags & CE_LOWPRI &&
759		!(ce->ce_flags & CE_IODONE)) {
760		UNLOCK_CACHE_R();
761		return 0;
762	}
763
764	UNLOCK_CACHE_R();
765	return 1;
766}
767
768/*
769 * Find a cache extent containing or intersecting the offset/length.
770 *
771 * We need to be able to find both containing and intersecting
 * extents.  Rather than handle each separately, we take advantage
773 * of the fact that any containing extent will also be an intersecting
774 * extent.
775 *
776 * Thus, a search for a containing extent should be a search for an
777 * intersecting extent followed by a test for containment.
778 *
779 * Containment may include multiple extents. The returned extent will
780 * be the first of these extents (lowest offset), but the next extents
781 * in the array may also be required to contain the requested range.
782 * Upon success, *pnum_extents contains the number of extents required
783 * to contain the range.
784 *
785 * Returns a pointer to a matching the entry in the mount's extent list
786 * or NULL if no match is found.
787 *
788 * Called with the cache mount read lock held
789 */
790static struct BC_cache_extent **
791BC_find_extent(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length, int contained, int *pnum_extents)
792{
793	struct BC_cache_extent **p, **base, **last, **next, **prev; /* pointers into the mount's extent index list */
794	size_t lim;
795
796	/* invariants */
797	assert(length > 0);
798
799	base = cm->cm_pextents;
800	last = base + cm->cm_nextents - 1;
801
802	/*
803	 * If the cache is inactive, exit.
804	 */
805	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE) || (base == NULL)) {
806		return(NULL);
807	}
808
809	/*
810	 * Range-check against the cache first.
811	 *
812	 * Note that we check for possible intersection, rather than
813	 * containment.
814	 */
815	if (((offset + length) <= (*base)->ce_diskoffset) ||
816	    (offset >= ((*last)->ce_diskoffset + (*last)->ce_length))) {
817		return(NULL);
818	}
819
820	/*
821	 * Perform a binary search for an extent intersecting
822	 * the offset and length.
823	 *
824	 * This code is based on bsearch(3), and the description following
825	 * is taken directly from it.
826	 *
827	 * The code below is a bit sneaky.  After a comparison fails, we
828	 * divide the work in half by moving either left or right. If lim
829	 * is odd, moving left simply involves halving lim: e.g., when lim
830	 * is 5 we look at item 2, so we change lim to 2 so that we will
831	 * look at items 0 & 1.  If lim is even, the same applies.  If lim
 * is odd, moving right again involves halving lim, this time moving
833	 * the base up one item past p: e.g., when lim is 5 we change base
834	 * to item 3 and make lim 2 so that we will look at items 3 and 4.
835	 * If lim is even, however, we have to shrink it by one before
836	 * halving: e.g., when lim is 4, we still looked at item 2, so we
837	 * have to make lim 3, then halve, obtaining 1, so that we will only
838	 * look at item 3.
839	 */
840	for (lim = cm->cm_nextents; lim != 0; lim >>= 1) {
841		p = base + (lim >> 1);
842
843		if (BC_check_intersection((*p), offset, length)) {
844
845			if (contained == 0)
846				return(p);
847
848			if (pnum_extents) {
849				*pnum_extents = 1;
850			}
851
852			/*
853			 * We are checking for containment, verify that here.
854			 * Extents may be adjacent, so include neighboring
855			 * extents in the containment check
856			 */
857
858			/* Check the high bound */
859			if ((offset + length) > ((*p)->ce_diskoffset + (*p)->ce_length)) {
860
861				/* check higher extents to see if they contain the high end of this range */
862				prev = p;
863				for (next = prev + 1;
864					 next <= last;
865					 next++) {
866
867					/* extents should never overlap */
868					assert(((*prev)->ce_diskoffset + (*prev)->ce_length) <= (*next)->ce_diskoffset);
869
870					if (((*prev)->ce_diskoffset + (*prev)->ce_length) < (*next)->ce_diskoffset) {
871						/* extents are not adjacent */
872						if (BC_cache->c_take_detailed_stats) {
873							xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx,%lld (missed last %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*p)->ce_diskoffset, ((*prev)->ce_diskoffset + (*prev)->ce_length), (offset + length) - ((*prev)->ce_diskoffset + (*prev)->ce_length));
874						}
875						return(NULL);
876					}
877
878					if (pnum_extents) {
879						(*pnum_extents)++;
880					}
881
882					if ((offset + length) <= ((*next)->ce_diskoffset + (*next)->ce_length)) {
883						/* we contain the high end of the range, break so we check the low end below */
884						break;
885					}
886
887					prev = next;
888				}
889
890				if (next > last) {
891					/* ran off the end of the extent list */
892					if (BC_cache->c_take_detailed_stats) {
893						xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx,%lld (missed last %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*p)->ce_diskoffset, ((*prev)->ce_diskoffset + (*prev)->ce_length), (offset + length) - ((*prev)->ce_diskoffset + (*prev)->ce_length));
894					}
895					return (NULL);
896				}
897			}
898
899			/* Check the low bound */
900			if (offset < (*p)->ce_diskoffset) {
901
902				/* check lower extents to see if they contain the low end of this range */
903				next = p;
904				for (prev = next - 1;
905					 prev >= base;
906					 prev--) {
907
908					/* extents should never overlap */
909					assert(((*prev)->ce_diskoffset + (*prev)->ce_length) <= (*next)->ce_diskoffset);
910
911					if (((*prev)->ce_diskoffset + (*prev)->ce_length) < (*next)->ce_diskoffset) {
912						/* extents are not adjacent */
913						if (BC_cache->c_take_detailed_stats) {
914							xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx:%lld (missed first %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*next)->ce_diskoffset, ((*p)->ce_diskoffset + (*p)->ce_length), (*next)->ce_diskoffset - offset);
915						}
916						return(NULL);
917					}
918
919					if (pnum_extents) {
920						(*pnum_extents)++;
921					}
922
923					if (offset >= (*prev)->ce_diskoffset) {
924						/* we contain the low end of the range (we checked high end above) */
925						return(prev);
926					}
927
928					next = prev;
929				}
930
931				assert(prev < base); /* only break condition */
932
933				if (prev < base) {
934					/* ran off the end of the extent list */
935					if (BC_cache->c_take_detailed_stats) {
936						xdebug("Read 0x%llx:%lld on mount %s intersected, but not contained by %d-extent cache range 0x%llx:%lld (missed first %lld bytes)", offset, length, uuid_string(cm->cm_uuid), *pnum_extents, (*next)->ce_diskoffset, ((*p)->ce_diskoffset + (*p)->ce_length), (*next)->ce_diskoffset - offset);
937					}
938					return (NULL);
939				}
940			}
941
942			return(p);
943		}
944
945		if (offset > (*p)->ce_diskoffset) {	/* move right */
946			base = p + 1;
947			lim--;
948		}				/* else move left */
949	}
950	/* found nothing */
951	return(NULL);
952}
953
954/*
955 * Discard data from the cache and free pages which are no longer required.
956 *
957 * This should be locked against anyone testing for !vacant pages.
958 *
 * We return the number of bytes we marked unused. If the value is
 * zero, the offset/length did not overlap this extent.
961 *
962 * Called with extent lock held.
963 */
964static int
965BC_discard_bytes(struct BC_cache_extent *ce, u_int64_t offset, u_int64_t length)
966{
967	struct BC_cache_mount *cm;
968	u_int64_t estart, eend, dstart, dend, nblk, base, page;
969	int i, discarded;
970
971	/* invariants */
972	assert(length > 0);
973#ifdef BC_DEBUG
974	lck_mtx_assert(&ce->ce_lock, LCK_MTX_ASSERT_OWNED);
975#endif
976
977	/*
978	 * Extent has been terminated: blockmap may no longer be valid.
979	 */
980	if (ce->ce_flags & CE_ABORTED ||
981		ce->ce_length == 0)
982		return 0;
983
984	cm = BC_cache->c_mounts + ce->ce_mount_idx;
985
986	/*
987	 * Constrain offset and length to fit this extent, return immediately if
988	 * there is no overlap.
989	 */
990	dstart = offset;			/* convert to start/end format */
991	dend = offset + length;
992	assert(dend > dstart);
993	estart = ce->ce_diskoffset;
994	eend = ce->ce_diskoffset + ce->ce_length;
995	assert(eend > estart);
996
997	if (dend <= estart)			/* check for overlap */
998		return(0);
999	if (eend <= dstart)
1000		return(0);
1001
1002	if (dstart < estart)			/* clip to extent */
1003		dstart = estart;
1004	if (dend > eend)
1005		dend = eend;
1006
1007	/*
1008	 * Convert length in bytes to a block count.
1009	 */
1010	nblk = CB_BYTE_TO_BLOCK(cm, dend - dstart);
1011
1012	/*
1013	 * Mark blocks vacant.  For each vacated block, check whether the page
1014	 * holding it can be freed.
1015	 *
1016	 * Note that this could be optimised to reduce the number of
1017	 * page-freeable checks, but the logic would be more complex.
1018	 */
1019	base = CB_BYTE_TO_BLOCK(cm, dstart - ce->ce_diskoffset);
1020	assert(base >= 0);
1021	discarded = 0;
1022	assert((base + nblk - 1) < howmany(ce->ce_length, cm->cm_blocksize));
1023	for (i = 0; i < nblk; i++) {
1024
1025		/* this is a no-op if the block is already gone */
1026		if (CB_BLOCK_PRESENT(ce, base + i)) {
1027
1028			/* mark the block as vacant */
1029			CB_MARK_BLOCK_VACANT(ce, base + i);
1030			discarded++;
1031
1032			page = CB_BLOCK_TO_PAGE(cm, base + i);
1033			if (CB_PAGE_VACANT(cm, ce, page))
1034				BC_free_page(ce, (int) page);
1035		}
1036	}
1037	return(discarded * cm->cm_blocksize);
1038}
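
/*
 * Worked example (512-byte blocks): for an extent with ce_diskoffset
 * 0x10000 and ce_length 0x4000, a discard of offset 0xF000, length 0x2000
 * clips to [0x10000, 0x11000), i.e. base block 0 and nblk 8.  Only those 8
 * blocks are marked vacant, and BC_free_page runs only for pages whose
 * blocks are now all vacant.
 */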
1039
1040/*
1041 * Blocks in an extent can be invalidated, e.g. by a write, before the
1042 * reader thread fills the extent. Search for the next range of viable blocks
1043 * in the extent, starting from offset 'fromoffset'.
1044 *
1045 * Values are returned by out parameters:
1046 * 	'nextpage' takes the page containing the start of the next run, or
1047 * 		-1 if no more runs are to be found.
1048 * 	'nextoffset' takes the offset into that page that the run begins.
1049 * 	'nextlength' takes the length in bytes of the range, capped at maxread.
1050 *
1051 * In other words, for this blockmap, if
1052 * 	    *fromoffset = 3 * blocksize,
1053 * 	1 1 0 0 0 1 1 1   1 1 1 1 1 1 0 0
1054 * 	*nextpage = 0
1055 * 	          *nextoffset = 5 * blocksize
1056 * 	                            *nextlength = 6 * blocksize
1057 *
1058 * Called with the extent lock held
1059 */
1060static void
1061BC_next_valid_range(struct BC_cache_mount *cm, struct BC_cache_extent *ce, uint32_t fromoffset,
1062					uint32_t *nextpage, uint32_t *nextoffset, uint32_t *nextlength)
1063{
1064	int maxblks, i, nextblk = 0;
1065	int found = 0;
1066
1067	maxblks = howmany(ce->ce_length, cm->cm_blocksize);
1068	i = CB_BYTE_TO_BLOCK(cm, fromoffset);
1069	/* scan for the next range of valid blocks in the extent */
1070	for (; i < maxblks; i++) {
1071		if (CB_BLOCK_PRESENT(ce, i)) {
1072			if (found == 0) {
1073				nextblk = i;
1074			}
1075			found++;
1076		} else {
1077			if (found != 0)
1078				break;
1079		}
1080	}
1081
1082	if (found) {
1083		/* found a valid range, so convert to (page, offset, length) */
1084		*nextpage = CB_BLOCK_TO_PAGE(cm, nextblk);
1085		*nextoffset = CB_BLOCK_TO_BYTE(cm, nextblk) % PAGE_SIZE;
1086		*nextlength = MIN(CB_BLOCK_TO_BYTE(cm, found), cm->cm_maxread);
1087	} else {
1088		*nextpage = -1;
1089	}
1090
1091	return;
1092}
1093
1094/*
1095 * Test for the presence of a range of blocks in the cache.
1096 *
1097 * Called with the extent lock held.
1098 */
1099static int
1100BC_blocks_present(struct BC_cache_extent *ce, int base, int nblk)
1101{
1102	int	blk;
1103
1104	assert(base >= 0);
1105	assert((base + nblk) <= howmany(ce->ce_length, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize));
1106
1107	for (blk = 0; blk < nblk; blk++) {
1108
1109		if (!CB_BLOCK_PRESENT(ce, base + blk)) {
1110			return(0);
1111		}
1112	}
1113	return(1);
1114}
1115
1116/*
1117 * Readahead thread.
1118 */
1119static void
1120BC_reader_thread(void *param0, wait_result_t param1)
1121{
1122	struct BC_cache_disk *cd = NULL;
1123	struct BC_cache_mount *cm = NULL;
1124	struct BC_cache_extent *ce = NULL;
1125	int count, num_mounts, ce_idx, cel_idx;
1126	upl_t	upl;
1127	kern_return_t kret;
1128	struct timeval batchstart, batchend;
1129	int error = 0;
1130	int issuing_lowpriority_extents = 0;
1131
	/* We can access the BC_cache_disk passed in freely without locks
	 * since disks are only freed during cache termination, after it has
	 * guaranteed that no reader threads are running
	 */
1136	cd = (struct BC_cache_disk*) param0;
1137
1138	num_mounts = cd->cd_nmounts;
1139
1140	if (BC_cache->c_flags & BC_FLAG_SHUTDOWN)
1141		goto out;
1142
1143	for (;;) {
1144
1145		if (issuing_lowpriority_extents) {
1146			debug("disk %d starting low priority batch", cd->cd_disk_num);
1147		} else {
1148			debug("disk %d starting batch %d", cd->cd_disk_num, cd->cd_batch);
1149			KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_BATCH) |
1150								  DBG_FUNC_START,
1151								  cd->cd_batch, cd->cd_disk_num, 0, 0, 0);
1152		}
1153		microtime(&batchstart);
1154
1155		LOCK_CACHE_R();
1156		for (cel_idx = 0;
1157			 cel_idx < BC_cache->c_nextentlists;
1158			 cel_idx++) {
1159
1160			/* iterate over extents to populate */
1161			for (ce_idx = 0;
1162				 ce_idx < BC_cache->c_nextents[cel_idx];
1163				 ce_idx++) {
1164
1165				ce = BC_cache->c_extentlists[cel_idx] + ce_idx;
1166
1167				cm = BC_cache->c_mounts + ce->ce_mount_idx;
1168
1169				if (cm->cm_state != CM_READY) {
1170					continue;
1171				}
1172
1173				if (cm->cm_disk != cd){
1174					continue;
1175				}
1176
1177				/* Only read extents marked for this batch or earlier.
1178				 * All low-priority IO is read in one batch, however.
1179				 */
1180				if (ce->ce_batch > cd->cd_batch && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
1181					continue;
1182				}
1183
1184				/* Check if already done or aborted */
1185				if (ce->ce_flags & (CE_ABORTED | CE_IODONE)) {
1186					continue;
1187				}
1188
1189				/* Check if this extent is low priority and we're not yet reading in low-priority extents */
1190				if (!(cd->cd_flags & CD_ISSUE_LOWPRI) &&
1191					(ce->ce_flags & CE_LOWPRI)) {
1192					continue;
1193				}
1194
1195				/* Shouldn't happen */
1196				if((cd->cd_flags & CD_ISSUE_LOWPRI) &&
1197				   !(ce->ce_flags & CE_LOWPRI)) {
1198					debug("Saw a regular extent while issuing throttled IO");
1199				}
1200
1201				LOCK_EXTENT(ce);
1202
				/* Check again, now with the lock held */
1204				if (ce->ce_flags & (CE_ABORTED | CE_IODONE)) {
1205					UNLOCK_EXTENT(ce);
1206					continue;
1207				}
1208
1209				/* loop reading to fill this extent */
1210				uint32_t fromoffset = 0;
1211
1212				LOCK_MOUNT_R(cm);
1213
1214				if (cm->cm_state != CM_READY) {
1215					/* the mount was aborted. Whoever aborted it handles aborting the extents as well */
1216					UNLOCK_MOUNT_R(cm);
1217					UNLOCK_EXTENT(ce);
1218					continue;
1219				}
1220
1221				for (;;) {
1222					uint32_t nextpage;
1223					uint32_t nextoffset;
1224					uint32_t nextlength;
1225
1226					/* requested shutdown */
1227					if (BC_cache->c_flags & BC_FLAG_SHUTDOWN) {
1228						UNLOCK_MOUNT_R(cm);
1229						goto out;
1230					}
1231
1232
1233					/*
1234					 * Find the next set of blocks that haven't been invalidated
1235					 * for this extent.
1236					 */
1237					BC_next_valid_range(cm, ce, fromoffset, &nextpage, &nextoffset, &nextlength);
1238					/* no more blocks to be read */
1239					if (nextpage == -1)
1240						break;
1241
1242					/* set up fromoffset to read the next segment of the extent */
1243					fromoffset = (nextpage * PAGE_SIZE) + nextoffset + nextlength;
1244
1245					kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
1246					if (kret != KERN_SUCCESS) {
1247						UNLOCK_MOUNT_R(cm);
1248						message("reader thread: vnode_getwithvid failed - %d\n", kret);
1249						goto out;
1250					}
1251
1252					kret = ubc_create_upl(BC_cache->c_vp,
1253										  ce->ce_cacheoffset + (nextpage * PAGE_SIZE),
1254										  (int) roundup(nextoffset + nextlength, PAGE_SIZE),
1255										  &upl,
1256										  NULL,
1257										  UPL_SET_LITE|UPL_FILE_IO);
1258					if (kret != KERN_SUCCESS) {
1259						UNLOCK_MOUNT_R(cm);
1260						message("ubc_create_upl returned %d\n", kret);
1261						(void) vnode_put(BC_cache->c_vp);
1262						goto out;
1263					}
1264
1265					/* cm_bp is only accessed from the reader thread,
1266					 * and there's only one reader thread for a given mount,
1267					 * so we don't need the write lock to modify it */
1268
1269					/* set buf to fill the requested subset of the upl */
1270					buf_setblkno(cm->cm_bp, CB_BYTE_TO_BLOCK(cm, ce->ce_diskoffset + nextoffset) + CB_PAGE_TO_BLOCK(cm, nextpage));
1271					buf_setcount(cm->cm_bp, (unsigned int) nextlength);
1272					buf_setupl(cm->cm_bp, upl, (unsigned int) nextoffset);
1273					buf_setresid(cm->cm_bp, buf_count(cm->cm_bp));	/* ask for residual indication */
1274					buf_reset(cm->cm_bp, B_READ);
1275
					/* If this is regular readahead, make sure any throttled IO is throttled by the readahead thread rdar://8592635
1277					 * If this is low-priority readahead, make sure this thread will be throttled appropriately later rdar://8734858
1278					 */
1279					throttle_info_handle_t throttle_info_handle;
1280					if ((error = throttle_info_ref_by_mask(cm->cm_throttle_mask, &throttle_info_handle)) == 0) {
1281						throttle_info_update_by_mask(throttle_info_handle, 0x0);
1282						throttle_info_rel_by_mask(throttle_info_handle);
1283					} else {
1284						debug("Unable to update throttle info for mount %s: %d", uuid_string(cm->cm_uuid), error);
1285					}
1286
1287					BC_ADD_STAT(initiated_reads, 1);
1288					if (cd->cd_disk_num < STAT_DISKMAX) {
1289						if (ce->ce_flags & CE_LOWPRI) {
1290							BC_ADD_STAT(disk_initiated_reads_lowpri[cd->cd_disk_num], 1);
1291						} else {
1292							BC_ADD_STAT(disk_initiated_reads[cd->cd_disk_num], 1);
1293						}
1294					}
1295
1296					/* give the buf to the underlying strategy routine */
1297					KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_DKRW, DKIO_READ) | DBG_FUNC_NONE,
1298										  (uintptr_t) buf_kernel_addrperm_addr(cm->cm_bp), buf_device(cm->cm_bp), (long)buf_blkno(cm->cm_bp), buf_count(cm->cm_bp), 0);
1299					KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_PLAYBACK_IO) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(cm->cm_bp), 0, 0, 0, 0);
1300
1301					BC_cache->c_strategy(cm->cm_bp);
1302
1303					/* wait for the bio to complete */
1304					buf_biowait(cm->cm_bp);
1305
1306					kret = ubc_upl_commit(upl);
1307					(void) vnode_put(BC_cache->c_vp);
1308
1309					/*
1310					 * If the read or commit returned an error, invalidate the bytes
1311					 * covered by the read (on a residual, we could avoid invalidating
1312					 * bytes that are claimed to be read as a minor optimisation, but we do
1313					 * not expect errors as a matter of course).
1314					 */
1315					if (kret != KERN_SUCCESS || buf_error(cm->cm_bp) || (buf_resid(cm->cm_bp) != 0)) {
1316						debug("read error: extent %lu %lu/%lu "
1317							  "(error buf %ld/%ld flags %08x resid %d)",
1318							  (unsigned long) ce_idx,
1319							  (unsigned long)ce->ce_diskoffset, (unsigned long)ce->ce_length,
1320							  (long)buf_blkno(cm->cm_bp), (long)buf_count(cm->cm_bp),
1321							  buf_flags(cm->cm_bp), buf_resid(cm->cm_bp));
1322
1323						count = BC_discard_bytes(ce, ce->ce_diskoffset + nextoffset, nextlength);
1324						debug("read error: discarded %d bytes", count);
1325						BC_ADD_STAT(read_errors, 1);
1326						BC_ADD_STAT(read_errors_bytes, count);
1327					}
1328#ifdef READ_HISTORY_BUFFER
1329					if (BC_cache->c_rhistory && BC_cache->c_rhistory_idx < READ_HISTORY_BUFFER) {
1330						BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_extent = ce;
1331						BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_blkno = buf_blkno(cm->cm_bp);
1332						BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_bcount = buf_count(cm->cm_bp);
1333						if (buf_error(cm->cm_bp) || (buf_resid(cm->cm_bp) != 0)) {
1334							BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_result = BC_RHISTORY_FAIL;
1335							BC_cache->c_rhistory_idx++;
1336						} else {
1337# ifdef READ_HISTORY_ALL
1338							BC_cache->c_rhistory[BC_cache->c_rhistory_idx].rh_result = BC_RHISTORY_OK;
1339							BC_cache->c_rhistory_idx++;
1340# endif
1341						}
1342					}
1343#endif
1344
1345					/* I'd like to throttle the thread here, but I'm holding the extent and cache locks, so it's throttled below */
1346				}
1347
1348				/* update stats */
1349				BC_ADD_STAT(read_blocks, (u_int) CB_BYTE_TO_BLOCK(cm, ce->ce_length));
1350				BC_ADD_STAT(read_bytes,  (u_int) ce->ce_length);
1351				if (ce->ce_flags & CE_LOWPRI) {
1352					BC_ADD_STAT(read_bytes_lowpri,  (u_int) ce->ce_length);
1353					if (cd->cd_disk_num < STAT_DISKMAX) {
1354						BC_ADD_STAT(batch_bytes_lowpri[cd->cd_disk_num],  (u_int) ce->ce_length);
1355					}
1356				} else {
1357					if (cd->cd_disk_num < STAT_DISKMAX) {
1358						BC_ADD_STAT(batch_bytes[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)],  (u_int) ce->ce_length);
1359						if (ce->ce_batch < cd->cd_batch) {
1360							BC_ADD_STAT(batch_late_bytes[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)], ce->ce_length);
1361						}
1362					}
1363				}
1364				if (ce->ce_flags & CE_SHARED) {
1365					BC_ADD_STAT(shared_bytes, (u_int) ce->ce_length);
1366				}
1367				UNLOCK_MOUNT_R(cm);
1368
1369				/*
1370				 * Wake up anyone wanting this extent, as it is now ready for
1371				 * use.
1372				 */
1373				ce->ce_flags |= CE_IODONE;
1374				UNLOCK_EXTENT(ce);
1375				wakeup(ce);
1376
1377				/* Let anyone waiting for the write lock to take hold */
1378				UNLOCK_CACHE_R();
1379
1380				/* Take this opportunity of locklessness to throttle the reader thread, if necessary */
1381				if (issuing_lowpriority_extents) {
1382					throttle_lowpri_io((ce->ce_flags & CE_LOWPRI) ? 1 : 0);
1383				}
1384
1385				LOCK_CACHE_R();
1386				if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
1387					/* New playlist arrived, go back to issuing regular priority extents */
1388					debug("New extents, interrupting low priority readahead");
1389					break;
1390				}
1391
1392				if (cel_idx >= BC_cache->c_nextentlists ||
1393					ce_idx  >= BC_cache->c_nextents[cel_idx]) {
1394					/* cache shrunk while we weren't looking.
1395					 * Not a hard error, but we're done with this batch at least
1396					 */
1397					break;
1398				}
1399			}
1400			ce = NULL;
1401			if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
1402				/* New playlist arrived, go back to issuing regular priority extents */
1403				break;
1404			}
1405		}
1406		UNLOCK_CACHE_R();
1407
1408		microtime(&batchend);
1409		timersub(&batchend, &batchstart, &batchend);
1410		if (cd->cd_disk_num < STAT_DISKMAX) {
1411			if (issuing_lowpriority_extents) {
1412				BC_cache->c_stats.ss_batch_time_lowpri[cd->cd_disk_num] +=
1413				(u_int) batchend.tv_sec * 1000 +
1414				(u_int) batchend.tv_usec / 1000;
1415			} else {
1416				/* Measure times for the first several batches separately */
1417				BC_cache->c_stats.ss_batch_time[cd->cd_disk_num][MIN(cd->cd_batch, STAT_BATCHMAX)] +=
1418				(u_int) batchend.tv_sec * 1000 +
1419				(u_int) batchend.tv_usec / 1000;
1420			}
1421		}
1422		if (issuing_lowpriority_extents) {
1423			// debug("disk %d low priority batch done", cd->cd_disk_num);
1424		} else {
1425			// debug("disk %d batch %d done", cd->cd_disk_num, cd->cd_batch);
1426			KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_BATCH) |
1427								  DBG_FUNC_END,
1428								  cd->cd_batch, cd->cd_disk_num, 0, 0, 0);
1429		}
1430
1431
1432		LOCK_DISK(cd);
1433		if (issuing_lowpriority_extents && !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
1434			/* New playlist arrived, go back to issuing regular extents */
1435			issuing_lowpriority_extents = 0;
1436			throttle_set_thread_io_policy(IOPOL_NORMAL);
1437			cd->cd_batch = 0;
1438		} else if (cd->cd_batch + 1 < BC_cache->c_batch_count && !issuing_lowpriority_extents) {
1439			cd->cd_batch++;
1440		} else {
1441			if (num_mounts == cd->cd_nmounts) {
1442				/* no new mounts appeared */
1443				if ( !(cd->cd_flags & CD_ISSUE_LOWPRI)) {
1444					/* go through the playlist again, issuing any low-priority extents we have */
1445					cd->cd_flags |= CD_ISSUE_LOWPRI;
1446					throttle_set_thread_io_policy(IOPOL_THROTTLE);
1447					issuing_lowpriority_extents = 1;
1448				} else {
1449					/* we're done */
1450					cd->cd_flags &= (~CD_HAS_THREAD);
1451					cd->cd_batch = 0;
1452					UNLOCK_DISK(cd);
1453					goto out;
1454				}
1455			} else {
1456				/* New mounts appeared. Run through again */
1457				cd->cd_batch = 0;
1458				debug("disk %d more mounts detected (%d total)", cd->cd_disk_num, cd->cd_nmounts);
1459			}
1460		}
1461		num_mounts = cd->cd_nmounts;
1462		UNLOCK_DISK(cd);
1463	}
1464
1465out:
1466	/*
1467	 * If ce != NULL we have bailed out, but we cannot free memory beyond
1468	 * the bailout point as it may have been read in a previous batch and
1469	 * there may be a sleeping strategy routine assuming that it will
1470	 * remain valid.  Since we are typically killed immediately before a
1471	 * complete cache termination, this does not represent a significant
1472	 * problem.
1473	 *
1474	 * However, to prevent readers blocking waiting for readahead to
1475	 * complete for extents that will never be read, we mark all the
1476	 * extents we have given up on as aborted.
1477	 */
1478	if (ce != NULL) {
1479		UNLOCK_EXTENT(ce);
1480
1481		/* the only errors that get us here are global, so mark the cache as shut down */
1482		LOCK_DISK(cd);
1483		BC_set_flag(BC_FLAG_SHUTDOWN);
1484		cd->cd_flags &= (~CD_HAS_THREAD);
1485		UNLOCK_DISK(cd);
1486
1487		int tmpcel_idx, tmpce_idx;
1488		for (tmpcel_idx = 0;
1489			 tmpcel_idx < BC_cache->c_nextentlists;
1490			 tmpcel_idx++) {
1491			for (tmpce_idx = 0;
1492				 tmpce_idx < BC_cache->c_nextents[tmpcel_idx];
1493				 tmpce_idx++) {
1494				ce = BC_cache->c_extentlists[tmpcel_idx] + tmpce_idx;
1495				LOCK_EXTENT(ce);
1496				/* abort any extent that hasn't been satisfied */
1497				if (!(ce->ce_flags & CE_IODONE)) {
1498					ce->ce_flags |= CE_ABORTED;
1499					/* wake up anyone asleep on this extent */
1500					UNLOCK_EXTENT(ce);
1501					wakeup(ce);
1502				} else {
1503					UNLOCK_EXTENT(ce);
1504				}
1505			}
1506		}
1507		UNLOCK_CACHE_R();
1508	}
1509
1510
1511	debug("disk %d done", cd->cd_disk_num);
1512
1513	/* wake up someone that might be waiting for this reader thread to exit */
1514	wakeup(&cd->cd_flags);
1515
1516	LOCK_READERS();
1517	/* wake up someone that might be waiting for all the reader threads to exit */
1518	BC_cache->c_num_reader_threads--;
1519	if (BC_cache->c_num_reader_threads == 0) {
1520		wakeup(&BC_cache->c_num_reader_threads);
1521	}
1522	UNLOCK_READERS();
1523
1524}
1525
1526/*
1527 * Handle an incoming close request.
1528 */
1529static int
1530BC_close(dev_t dev, int flags, int devtype, struct proc *p)
1531{
1532	struct BC_cache_mount *cm;
1533	int i, cm_idx;
1534
1535	/* Mark our cache_mount for this device as invalid */
1536
1537	LOCK_CACHE_R();
1538	if (BC_cache->c_flags & BC_FLAG_CACHEACTIVE) {
1539
1540		cm_idx = BC_find_cache_mount(dev);
1541		if (-1 != cm_idx) {
1542			cm = BC_cache->c_mounts + cm_idx;
1543			debug("Tearing down closing mount %s with dev 0x%x", uuid_string(cm->cm_uuid), dev);
1544			LOCK_MOUNT_R(cm);
1545			if (cm->cm_state != CM_ABORTED) {
1546				/*
1547				 * Mark all extents as aborted. This will notify any sleepers in the
1548				 * strategy routine that the extent won't have data for them.
1549				 *
1550				 * Note that we shouldn't try to take an extent lock while holding
1551				 * the exclusive mount lock or we risk deadlock with the reader thread
1552				 */
1553				for (i = 0; i < cm->cm_nextents; i++) {
1554					LOCK_EXTENT(cm->cm_pextents[i]);
1555					BC_teardown_extent(cm->cm_pextents[i]);
1556					UNLOCK_EXTENT(cm->cm_pextents[i]);
1557					wakeup(cm->cm_pextents[i]);
1558				}
1559				if (! LOCK_MOUNT_R_TO_W(cm)) {
1560					/* If there is someone waiting for the exclusive lock on this mount,
1561					 * we want to grab it before they can so they don't waste time.
1562					 * If we fail to take the exclusive lock it means someone
					 * else is also trying to upgrade the mount lock. We don't keep any
1564					 * state, so we can simply take the write lock again.
1565					 */
1566					LOCK_MOUNT_W(cm);
1567					if (cm->cm_state == CM_ABORTED) {
1568						// Mount is aborted, no more work necessary
1569						UNLOCK_MOUNT_W(cm);
1570						goto out;
1571					}
1572				}
1573				BC_teardown_mount(cm);
1574				UNLOCK_MOUNT_W(cm);
1575			} else {
1576				UNLOCK_MOUNT_R(cm);
1577			}
1578		}
1579	}
1580
1581out:
1582	UNLOCK_CACHE_R();
1583
1584	return BC_cache->c_close(dev, flags, devtype, p);
1585}
1586
1587/*
1588 * Returns CE_ABORTED if any of the extents in the array are aborted
1589 * Returns CE_LOWPRI  if any of the extents in the array are low priority and not done
1590 * Returns CE_IODONE  if all the extents in the array are done
1591 * Returns 0 otherwise
1592 */
1593static int extents_status(struct BC_cache_extent **pce, int nextents) {
1594	int ce_idx;
1595	int ret = CE_IODONE;
1596
1597	for (ce_idx = 0;
1598		 ce_idx < nextents;
1599		 ce_idx++) {
1600
1601		if (pce[ce_idx]->ce_flags & CE_ABORTED) {
1602			return CE_ABORTED;
1603		}
1604
1605		if (! (pce[ce_idx]->ce_flags & CE_IODONE)) {
1606			if (pce[ce_idx]->ce_flags & CE_LOWPRI) {
1607				ret = CE_LOWPRI;
1608			} else {
1609				if (ret == CE_IODONE) {
1610					ret = 0;
1611				}
1612			}
1613		}
1614	}
1615	return ret;
1616}
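
/*
 * Illustrative example of the precedence above (not executed): given three
 * extents where pce[0] has CE_IODONE set, pce[1] has CE_LOWPRI set but is
 * not yet done, and pce[2] has CE_ABORTED set, the scan returns CE_ABORTED
 * as soon as pce[2] is examined.  Were pce[2] done instead, the pending
 * low-priority extent would yield CE_LOWPRI, and with all three done the
 * result would be CE_IODONE.
 */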
1617
1618/*
1619 * Waits for the given extent to be filled or aborted
1620 *
1621 * Should only be called with the extent lock held.
1622 * NO OTHER LOCKS SHOULD BE HELD INCLUDING THE CACHE LOCK
1623 *
1624 * We are guaranteed that the extent won't disappear when we release
1625 * the cache lock by the cleanup code waiting for BC_cache->c_strategycalls
1626 * to reach 0, and we're counted in there.
1627 */
1628static void wait_for_extent(struct BC_cache_extent *ce, struct timeval starttime) {
1629	struct timeval time;
1630	struct timespec timeout;
1631	microtime(&time);
1632	timersub(&time,
1633			 &starttime,
1634			 &time);
1635	if (time.tv_sec < BC_wait_for_readahead) {
1636		timeout.tv_sec = BC_wait_for_readahead - time.tv_sec;
1637		if (time.tv_usec == 0) {
1638			timeout.tv_nsec = 0;
1639		} else {
1640			timeout.tv_sec--;
1641			timeout.tv_nsec = NSEC_PER_USEC * (USEC_PER_SEC - time.tv_usec);
1642		}
1643		msleep(ce, &ce->ce_lock, PRIBIO, "BC_strategy", &timeout);
1644	}
1645}
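
/*
 * Worked example of the timeout arithmetic above (illustrative only): with
 * BC_wait_for_readahead = 10 and 2.25 seconds already elapsed since
 * starttime, time = {2, 250000}, so the computed timeout is
 * {7, 750000000}, i.e. the remaining 7.75 seconds of the readahead wait
 * budget.
 */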
1646
1647/*
1648 * Handle an incoming IO request.
1649 */
1650static int
1651BC_strategy(struct buf *bp)
1652{
1653	struct BC_cache_extent *ce = NULL, **pce, **pcontaining_extents = NULL;
1654	int num_extents;
1655	uio_t uio = NULL;
1656	int nbytes, resid, discards = 0;
1657	struct timeval blocked_start_time, blocked_end_time;
1658	daddr64_t blkno;
1659	caddr_t buf_map_p, buf_extent_p;
1660	off_t disk_offset;
1661	kern_return_t kret;
1662	int cm_idx = -1, ce_idx;
1663	dev_t dev;
1664	int32_t bufflags;
1665	int during_cache = 0, during_io = 0, take_detailed_stats = 0, cache_hit = 0;
1666	int is_root_disk, did_block, status;
1667	int is_shared = 0, is_swap = 0;
1668	int should_throttle = 0;
1669	int is_ssd = 0;
1670	int is_stolen = 0;
1671	int unfilled = 0;
1672	vnode_t vp = NULL;
1673	int pid = 0;
1674	int dont_cache = 0;
1675	int throttle_tier = 0;
1676	bufattr_t bap = NULL;
1677
1678	assert(bp != NULL);
1679
1680	blkno = buf_blkno(bp);
1681	nbytes = buf_count(bp);
1682	bufflags = buf_flags(bp);
1683	bap = buf_attr(bp);
1684	dev = buf_device(bp);
1685	vp = buf_vnode(bp);
1686
1687	if (bap) {
1688		throttle_tier = bufattr_throttled(bap);
1689	}
1690
1691	/*
1692	 * If the buf doesn't have a vnode for some reason, pretend
1693	 * we never saw the request at all.
1694	 */
1695	if (dev == nulldev()) {
1696		BC_cache->c_strategy(bp);
1697		BC_ADD_STAT(strategy_unknown, 1);
1698		BC_ADD_STAT(strategy_unknown_bytes, nbytes);
1699		return 0;
1700	}
1701
1702	if (vp && vnode_isswap(vp)) {
1703		is_swap = 1;
1704		goto bypass;
1705	}
1706
1707	if (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE) {
1708		pid = proc_selfpid();
1709
1710		if (vp) {
1711			is_shared = vnode_isdyldsharedcache(vp);
1712		}
1713	}
1714
1715	if (vp && (vnode_israge(vp))) {
1716		dont_cache = 1;
1717	}
1718
1719	/*
1720	 * If the cache is not active, bypass the request.  We may
1721	 * not be able to fill it, but we still want to record it.
1722	 */
1723	if (!(BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) {
1724		goto bypass;
1725	}
1726
1727	/*
1728	 * In order to prevent the cache cleanup code from racing with us
1729	 * when we sleep, we track the number of strategy calls in flight.
1730	 */
1731	OSAddAtomic(1, &BC_cache->c_strategycalls);
1732	LOCK_CACHE_R();
1733	during_cache = 1; /* we're included in the in_flight count */
1734
1735	/* Get cache mount asap for use in case we bypass */
1736	cm_idx = BC_find_cache_mount(dev);
1737
1738	if (cm_idx != -1) {
1739		disk_offset = CB_BLOCK_TO_BYTE(BC_cache->c_mounts + cm_idx, blkno);
1740	}
1741
1742	if (BC_cache->c_take_detailed_stats) {
1743		take_detailed_stats = 1;
1744		BC_ADD_STAT(strategy_calls, 1);
1745		if (throttle_tier) {
1746			BC_ADD_STAT(strategy_throttled, 1);
1747		}
1748#ifdef BC_DEBUG
1749//		if (dont_cache) {
1750//			char procname[128];
1751//			proc_selfname(procname, sizeof(procname));
1752//			const char* filename = vp ? vnode_getname(vp) : NULL;
1753//			debug("not recording %s%s from app %s for file %s (disk block 0x%llx) which is%s throttled", (vp && vnode_israge(vp)) ? "rapid age " : "", (bufflags & B_READ) ? "read" : "write", procname, filename?:"unknown", blkno, (bufflags & B_THROTTLED_IO) ? "" : " not");
1754//			if (filename) {
1755//				vnode_putname(filename);
1756//			}
1757//		}
1758#endif
1759	}
1760
1761	/*
1762	 * if we haven't seen this mount before, pass it off
1763	 *
1764	 * If this is a mount we want to cache, we'll kick it off when
1765	 * adding to our history
1766	 */
1767	if (cm_idx == -1) {
1768		/* don't have a mount for this IO */
1769		if (take_detailed_stats)
1770			BC_ADD_STAT(strategy_noncached_mount, 1);
1771		goto bypass;
1772	}
1773
1774	/* Get the info from the mount we need */
1775	LOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1776
1777	if (BC_cache->c_mounts[cm_idx].cm_disk &&
1778		(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_HAS_THREAD) &&
1779		!(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_ISSUE_LOWPRI)) {
1780		during_io = 1; /* for statistics */
1781		if (take_detailed_stats) {
1782			BC_ADD_STAT(strategy_duringio, 1);
1783		}
1784	}
1785
1786	if (BC_cache->c_mounts[cm_idx].cm_state != CM_READY) {
1787		/* the mount has been aborted, treat it like a missing mount */
1788		UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1789		cm_idx = -1;
1790
1791		if (take_detailed_stats)
1792			BC_ADD_STAT(strategy_noncached_mount, 1);
1793		goto bypass;
1794	}
1795
1796	/* if it's not a read, pass it off */
1797	if ( !(bufflags & B_READ)) {
1798		UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1799		if (take_detailed_stats)
1800			BC_ADD_STAT(strategy_nonread, 1);
1801		goto bypass;
1802	}
1803
1804	if ((nbytes % BC_cache->c_mounts[cm_idx].cm_blocksize) != 0) {
1805		debug("IO with non-blocksize multiple bytes (%d %% %lld = %lld)", nbytes, BC_cache->c_mounts[cm_idx].cm_blocksize, (nbytes % BC_cache->c_mounts[cm_idx].cm_blocksize));
1806		UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1807		if (take_detailed_stats)
1808			BC_ADD_STAT(strategy_nonblocksize, 1);
1809		goto bypass;
1810	}
1811
1812	if (take_detailed_stats) {
1813		BC_ADD_STAT(extent_lookups, 1);
1814		BC_ADD_STAT(requested_bytes, nbytes);
1815		if (cm_idx < STAT_MOUNTMAX) {
1816			BC_ADD_STAT(requested_bytes_m[cm_idx], nbytes);
1817		}
1818	}
1819
1820	/*
1821	 * Look for a cache extent containing this request.
1822	 */
1823	num_extents = 0;
1824	pce = BC_find_extent(BC_cache->c_mounts + cm_idx, disk_offset, nbytes, 1, &num_extents);
1825	if (pce == NULL) {
1826#ifdef BC_EXTRA_DEBUG
1827		if (take_detailed_stats && !dont_cache) {
1828			char procname[128];
1829			proc_selfname(procname, sizeof(procname));
1830			const char* filename = vp ? vnode_getname(vp) : NULL;
1831			if (num_extents > 0) {
1832				xdebug("Missed IO from app %s for file %s (disk offset 0x%llx) (intersected, though)", procname, filename?:"unknown", blkno);
1833			} else {
1834				xdebug("Missed IO from app %s for file %s (disk offset 0x%llx)", procname, filename?:"unknown", blkno);
1835			}
1836			if (filename) {
1837				vnode_putname(filename);
1838			}
1839		}
1840#endif
1841		UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1842		goto bypass;
1843	}
1844
1845	assert(num_extents > 0);
1846
1847	cache_hit = 1;
1848
1849	if (take_detailed_stats) {
1850		BC_ADD_STAT(extent_hits, 1);
1851		if (during_io)
1852			BC_ADD_STAT(hit_duringio, 1);
1853		if (num_extents > 1)
1854			BC_ADD_STAT(hit_multiple, 1);
1855	}
1856
1857	pcontaining_extents = BC_MALLOC(num_extents * sizeof(*pcontaining_extents), M_TEMP, M_WAITOK | M_ZERO);
1858	if (pcontaining_extents == NULL) {
1859		UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1860		if (take_detailed_stats)
1861			BC_ADD_STAT(hit_failure, 1);
1862		goto bypass;
1863	}
1864	memcpy(pcontaining_extents, pce, num_extents * sizeof(*pcontaining_extents));
1865
1866	is_ssd = (BC_cache->c_mounts[cm_idx].cm_disk &&
1867			 (BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_IS_SSD));
1868
1869	UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
1870
1871	/* if any extent is aborted, bypass immediately */
1872	status = extents_status(pcontaining_extents, num_extents);
1873	if (status & CE_ABORTED) {
1874		if (take_detailed_stats)
1875			BC_ADD_STAT(hit_aborted, 1);
1876		goto bypass;
1877	}
1878
1879#ifndef EMULATE_ONLY
1880
1881	did_block = 0;
1882	if (! (status & CE_IODONE)) {
1883
1884		if (is_ssd) {
1885			/* Don't delay IOs on SSD since cut-throughs aren't harmful */
1886			unfilled = 1;
1887			goto bypass;
1888		}
1889
1890		if (status & CE_LOWPRI) {
1891			/* Don't wait for low priority extents */
1892			if (take_detailed_stats)
1893				BC_ADD_STAT(strategy_unfilled_lowpri, 1);
1894			cache_hit = 0;
1895			goto bypass;
1896		}
1897
1898		/*
1899		 * wait for all the extents to finish
1900		 */
1901		microtime(&blocked_start_time);
1902		for (ce_idx = 0;
1903			 ce_idx < num_extents;
1904			 ce_idx++) {
1905
1906			ce = pcontaining_extents[ce_idx];
1907			LOCK_EXTENT(ce);
1908			if (! (ce->ce_flags & (CE_ABORTED | CE_IODONE))) {
1909				did_block = 1;
1910				UNLOCK_CACHE_R();
1911				/*
1912				 * We need to release the cache lock since we will msleep in wait_for_extent.
1913				 * If we don't release the cache read lock and someone wants to add a new playlist,
1914				 * they go to grab the cache write lock, and everybody stalls for the msleep timeout.
1915				 *
1916				 * We are guaranteed that the cache won't disappear during this time since
1917				 * the cleanup code waits for all the strategy calls (this function) to complete
1918				 * by checking BC_cache->c_strategycalls, which we incremented above.
1919				 *
1920				 * The cache may have grown behind our backs, however. Mount pointers are not valid,
1921				 * but mount indexes still are. Extent pointers are still valid since the extent list
1922				 * is not modified (this is a requirement to be able to msleep with the extent's lock).
1923				 */
1924				wait_for_extent(ce, blocked_start_time);
1925
1926				/* Cache lock must be grabbed without holding any locks to avoid deadlock rdar://8626772 */
1927				UNLOCK_EXTENT(ce);
1928				LOCK_CACHE_R();
1929				LOCK_EXTENT(ce);
1930
1931				if (! (ce->ce_flags & (CE_ABORTED | CE_IODONE))) {
1932					/* strategy call timed out waiting for the extent */
1933					if (take_detailed_stats)
1934						BC_ADD_STAT(strategy_timedout, 1);
1935#ifdef BC_DEBUG
1936					microtime(&blocked_end_time);
1937					timersub(&blocked_end_time,
1938							 &blocked_start_time,
1939							 &blocked_end_time);
1940					char procname[128];
1941					proc_selfname(procname, sizeof(procname));
1942					const char* filename = vp ? vnode_getname(vp) : NULL;
1943					debug("Strategy routine timed out app %s waiting on file %s (disk offset 0x%llx) after %lu.%03d seconds", procname, filename?:"unknown", disk_offset, blocked_end_time.tv_sec, blocked_end_time.tv_usec / 1000);
1944					if (filename) {
1945						vnode_putname(filename);
1946					}
1947#endif
1948					goto bypass;
1949				}
1950				UNLOCK_EXTENT(ce);
1951				ce = NULL;
1952
1953				/* Check every time we sleep so we can avoid more sleeping if unnecessary */
1954				status = extents_status(pcontaining_extents, num_extents);
1955				if (status & CE_ABORTED) {
1956					if (take_detailed_stats)
1957						BC_ADD_STAT(hit_aborted, 1);
1958					goto bypass;
1959				} else if (status & CE_IODONE) {
1960					break;
1961				}
1962			} else {
1963				UNLOCK_EXTENT(ce);
1964				ce = NULL;
1965			}
1966		}
1967		if (take_detailed_stats && did_block) {
1968			BC_ADD_STAT(strategy_blocked, 1);
1969			microtime(&blocked_end_time);
1970			timersub(&blocked_end_time,
1971					 &blocked_start_time,
1972					 &blocked_end_time);
1973			int ms_blocked = (blocked_end_time.tv_sec * 1000) + (blocked_end_time.tv_usec / (1000));
1974#ifdef BC_DEBUG
1975			char procname[128];
1976			proc_selfname(procname, sizeof(procname));
1977			const char* filename = vp ? vnode_getname(vp) : NULL;
1978			debug("Strategy routine blocked app %s waiting on file %s (disk offset 0x%llx) for %lu.%03d seconds", procname, filename?:"unknown", disk_offset, blocked_end_time.tv_sec, blocked_end_time.tv_usec / 1000);
1979			if (filename) {
1980				vnode_putname(filename);
1981			}
1982#endif
1983			BC_ADD_STAT(strategy_time_blocked, ms_blocked);
1984			if (BC_cache->c_stats.ss_strategy_time_longest_blocked < ms_blocked) {
1985				BC_cache->c_stats.ss_strategy_time_longest_blocked = ms_blocked;
1986			}
1987		}
1988	}
1989
1990	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, (did_block ? DBG_BC_IO_HIT_STALLED : DBG_BC_IO_HIT)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp), 0, 0, 0, 0);
1991
1992	/*
1993	 * buf_map will do its best to provide access to this
1994	 * buffer via a kernel accessible address
1995	 * if it fails, we can still fall through.
1996	 */
1997	if (buf_map(bp, &buf_map_p)) {
1998		/* can't map, let someone else worry about it */
1999		if (take_detailed_stats)
2000			BC_ADD_STAT(hit_failure, 1);
2001		goto bypass;
2002	}
2003	buf_extent_p = buf_map_p;
2004
2005	kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
2006	if (kret != KERN_SUCCESS) {
2007		debug("vnode_getwithvid failed - %d", kret);
2008		if (take_detailed_stats)
2009			BC_ADD_STAT(hit_failure, 1);
2010		buf_unmap(bp);
2011		goto bypass;
2012	}
2013
2014	/*
2015	 * fill in bp from the extents
2016	 */
2017	for (ce_idx = 0;
2018		 ce_idx < num_extents;
2019		 ce_idx++) {
2020		u_int64_t nbytes_thisextent;
2021		u_int64_t diskoffset_thisextent;
2022		off_t     cacheoffset_thisextent;
2023
2024		ce = pcontaining_extents[ce_idx];
2025		LOCK_EXTENT(ce);
2026
2027		/*
2028		 * Check that the extent wasn't aborted while we were unlocked.
2029		 */
2030		if (ce->ce_flags & CE_ABORTED) {
2031			if (take_detailed_stats)
2032				BC_ADD_STAT(hit_aborted, 1);
2033			vnode_put(BC_cache->c_vp);
2034			buf_unmap(bp);
2035			goto bypass;
2036		}
2037
2038		assert(ce->ce_flags & CE_IODONE);
2039
2040		diskoffset_thisextent  = MAX(ce->ce_diskoffset, disk_offset);
2041		nbytes_thisextent      = MIN(disk_offset + nbytes, ce->ce_diskoffset + ce->ce_length) - diskoffset_thisextent;
2042		cacheoffset_thisextent = ce->ce_cacheoffset + (diskoffset_thisextent - ce->ce_diskoffset);
2043
2044		/* range check against extent */
2045		assert(diskoffset_thisextent  <  ce->ce_diskoffset + ce->ce_length);
2046		assert(diskoffset_thisextent  >= ce->ce_diskoffset);
2047		assert(nbytes_thisextent      <= (ce->ce_diskoffset + ce->ce_length) - diskoffset_thisextent);
2048		assert(cacheoffset_thisextent <  ce->ce_cacheoffset + ce->ce_length);
2049		assert(cacheoffset_thisextent >= ce->ce_cacheoffset);
2050		assert(nbytes_thisextent      <= (ce->ce_cacheoffset + ce->ce_length) - cacheoffset_thisextent);
2051
2052		/* range check against buf_t */
2053		assert(diskoffset_thisextent    <  disk_offset + nbytes);
2054		assert(diskoffset_thisextent    >= disk_offset);
2055		assert(nbytes_thisextent        <= (disk_offset + nbytes) - diskoffset_thisextent);
2056
2057		/* check our buf_map pointer */
2058		assert(buf_extent_p - buf_map_p == diskoffset_thisextent - disk_offset); /* didn't skip any bytes */
2059		assert(buf_map_p + nbytes       >= buf_extent_p + nbytes_thisextent); /* not overflowing the buffer */
2060
2061		/* Make sure we're including the entire buffer requested */
2062		assert(ce_idx > 0 || disk_offset == diskoffset_thisextent); /* include the first byte */
2063		assert(ce_idx < (num_extents - 1) || disk_offset + nbytes == diskoffset_thisextent + nbytes_thisextent); /* include the last byte */
2064
2065		/*
2066		 * Check that the requested blocks are in the buffer.
2067		 */
2068		if (!BC_blocks_present(ce,
2069							   CB_BYTE_TO_BLOCK(BC_cache->c_mounts + cm_idx, diskoffset_thisextent - ce->ce_diskoffset),
2070							   CB_BYTE_TO_BLOCK(BC_cache->c_mounts + cm_idx, nbytes_thisextent))) {
2071			if (take_detailed_stats)
2072				BC_ADD_STAT(hit_blkmissing, 1);
2073			vnode_put(BC_cache->c_vp);
2074			buf_unmap(bp);
2075			goto bypass;
2076		}
2077
2078		resid = nbytes_thisextent;
2079		uio = uio_create(1, cacheoffset_thisextent, UIO_SYSSPACE, UIO_READ);
2080		if (uio == NULL) {
2081			debug("couldn't allocate uio");
2082			if (take_detailed_stats)
2083				BC_ADD_STAT(hit_failure, 1);
2084			vnode_put(BC_cache->c_vp);
2085			buf_unmap(bp);
2086			goto bypass;
2087		}
2088
2089		kret = uio_addiov(uio, CAST_USER_ADDR_T(buf_extent_p), nbytes_thisextent);
2090		if (kret != KERN_SUCCESS) {
2091			debug("couldn't add iov to uio - %d", kret);
2092			if (take_detailed_stats)
2093				BC_ADD_STAT(hit_failure, 1);
2094			vnode_put(BC_cache->c_vp);
2095			buf_unmap(bp);
2096			goto bypass;
2097		}
2098
2099		kret = cluster_copy_ubc_data(BC_cache->c_vp, uio, &resid, 0);
2100		if (kret != KERN_SUCCESS) {
2101			debug("couldn't copy ubc data - %d", kret);
2102			if (take_detailed_stats)
2103				BC_ADD_STAT(hit_failure, 1);
2104			vnode_put(BC_cache->c_vp);
2105			buf_unmap(bp);
2106			goto bypass;
2107		}
2108
2109		if (resid != 0) {
2110			/* blocks have been stolen for pageout or contiguous memory */
2111			if (take_detailed_stats)
2112				BC_ADD_STAT(hit_stolen, 1);
2113			vnode_put(BC_cache->c_vp);
2114			buf_unmap(bp);
2115			is_stolen = 1;
2116			goto bypass;
2117		}
2118
2119		buf_extent_p += nbytes_thisextent;
2120
2121		/* discard blocks we have touched */
2122		discards += BC_discard_bytes(ce, disk_offset, nbytes_thisextent);
2123
2124		UNLOCK_EXTENT(ce);
2125		ce = NULL;
2126
2127		/* copy was successful */
2128		uio_free(uio);
2129		uio = NULL;
2130	}
2131
2132	vnode_put(BC_cache->c_vp);
2133
2134	buf_setresid(bp, 0);
2135
2136	/* buf_unmap will take care of all cases */
2137	buf_unmap(bp);
2138
2139	/* complete the request */
2140	buf_biodone(bp);
2141
2142	/* can't dereference bp after a buf_biodone has been issued */
2143
2144#else //ifndef EMULATE_ONLY
2145	(void) kret;
2147	(void) resid;
2148
2149	/* discard blocks we have touched */
2150	for (ce_idx = 0;
2151		 ce_idx < num_extents;
2152		 ce_idx++) {
2153		ce = pcontaining_extents[ce_idx];
2154		LOCK_EXTENT(ce);
2155		discards += BC_discard_bytes(ce, disk_offset, nbytes);
2156		UNLOCK_EXTENT(ce);
2157		ce = NULL;
2158	}
2159
2160	/* send the IO to the disk, emulate hit in statistics */
2161	BC_cache->c_strategy(bp);
2162
2163#endif //ifndef EMULATE_ONLY else
2164
2165	if (take_detailed_stats) {
2166		BC_ADD_STAT(hit_bytes, discards);
2167		if (cm_idx < STAT_MOUNTMAX) {
2168			BC_ADD_STAT(hit_bytes_m[cm_idx], discards);
2169		}
2170		if (dont_cache) {
2171			BC_ADD_STAT(strategy_hit_nocache, 1);
2172			BC_ADD_STAT(hit_nocache_bytes, discards);
2173		}
2174		if (is_shared) {
2175			BC_ADD_STAT(hit_shared_bytes, discards);
2176		}
2177	} else {
2178		BC_ADD_STAT(hit_bytes_afterhistory, discards);
2179	}
2180
2181	BC_cache->c_num_ios_since_last_hit = 0;
2182
2183	/* we are not busy anymore */
2184	OSAddAtomic(-1, &BC_cache->c_strategycalls);
2185	UNLOCK_CACHE_R();
2186
2187	_FREE_ZERO(pcontaining_extents, M_TEMP);
2188
2189	if (! dont_cache) {
2190		/* record successful fulfilment (may block) */
2191		BC_add_history(blkno, nbytes, pid, 1, 0, 0, is_shared, dev);
2192	}
2193
2194	/*
2195	 * spec_strategy wants to know if the read has been
2196	 * satisfied by the boot cache in order to avoid
2197	 * throttling the thread unnecessarily. spec_strategy
2198	 * will check for the special return value 0xcafefeed
2199	 * to indicate that the read was satisfied by the
2200	 * cache.
2201	 */
2202
2203#define IO_SATISFIED_BY_CACHE ((int)0xcafefeed)
2204	return IO_SATISFIED_BY_CACHE;
2205
2206bypass:
2207	if (ce != NULL)
2208		UNLOCK_EXTENT(ce);
2209	if (uio != NULL)
2210		uio_free(uio);
2211	_FREE_ZERO(pcontaining_extents, M_TEMP);
2212
2213	/* pass the request on */
2214	BC_cache->c_strategy(bp);
2215
2216	/*
2217	 * can't dereference bp after c_strategy has been issued
2218	 * or else we race with buf_biodone
2219	 */
2220	void *bp_void = (void *)bp; // for use in ktrace
2221	bp = NULL;
2222
2223	/* not really "bypassed" if the cache is not active */
2224	if (during_cache) {
2225		if (cm_idx != -1) {
2226			LOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
2227			if (BC_cache->c_mounts[cm_idx].cm_state == CM_READY) {
2228				discards += BC_handle_discards(BC_cache->c_mounts + cm_idx, disk_offset, nbytes);
2229			}
2230
2231			/* Check if we should throttle this IO */
2232			if (BC_cache->c_readahead_throttles_cutthroughs &&
2233				!is_swap &&
2234				BC_cache->c_mounts[cm_idx].cm_disk &&
2235				(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_HAS_THREAD) &&
2236				!(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_ISSUE_LOWPRI) &&
2237				!(BC_cache->c_mounts[cm_idx].cm_disk->cd_flags & CD_IS_SSD)) {
2238				/* We're currently issuing readahead for this disk.
2239				 * Throttle this IO so we don't cut-through the readahead so much.
2240				 */
2241				should_throttle = 1;
2242			}
2243
2244			UNLOCK_MOUNT_R(BC_cache->c_mounts + cm_idx);
2245		}
2246		if (take_detailed_stats) {
2247			BC_ADD_STAT(strategy_bypassed, 1);
2248			if (during_io) {
2249				BC_ADD_STAT(strategy_bypass_duringio, 1);
2250			}
2251			if (dont_cache) {
2252				BC_ADD_STAT(strategy_bypass_nocache, 1);
2253				BC_ADD_STAT(bypass_nocache_bytes, nbytes);
2254				if (during_io) {
2255					BC_ADD_STAT(strategy_bypass_duringio_nocache, 1);
2256				}
2257			}
2258			if (cache_hit) {
2259				BC_ADD_STAT(error_discards, discards);
2260			} else if (dont_cache) {
2261				BC_ADD_STAT(bypass_nocache_discards, discards);
2262			} else if (bufflags & B_READ) {
2263				BC_ADD_STAT(read_discards, discards);
2264			} else {
2265				BC_ADD_STAT(write_discards, discards);
2266			}
2267		} else {
2268			BC_ADD_STAT(lost_bytes_afterhistory, discards);
2269		}
2270	}
2271
2272	if (during_cache) {
2273		OSAddAtomic(-1, &BC_cache->c_strategycalls);
2274		UNLOCK_CACHE_R();
2275	}
2276
2277	if (! is_swap) {
2278		if (! dont_cache) {
2279			is_root_disk = BC_add_history(blkno, nbytes, pid, 0, ((bufflags & B_READ) ? 0 : 1), 0, is_shared, dev);
2280
2281			if (take_detailed_stats && during_io && is_root_disk) {
2282				if (cache_hit) {
2283					if (unfilled) {
2284						BC_ADD_STAT(strategy_bypass_duringio_unfilled, 1);
2285					} else {
2286						BC_ADD_STAT(strategy_bypass_duringio_rootdisk_failure, 1);
2287					}
2288				} else if (bufflags & B_READ) {
2289					BC_ADD_STAT(strategy_bypass_duringio_rootdisk_read, 1);
2290				} else {
2291					BC_ADD_STAT(strategy_bypass_duringio_rootdisk_nonread, 1);
2292				}
2293			}
2294		}
2295	}
2296
2297	/*
2298	 * Check to make sure that we're not missing the cache
2299	 * too often.  If we are, we're no longer providing a
2300	 * performance win and the best thing would be to give
2301	 * up.
2302	 *
2303	 * Don't count throttled IOs since those aren't time
2304	 * critical. (We don't want to jettison the cache just
2305	 * because spotlight is indexing)
2306	 */
2307
2308	/* if this is a read, and we do have an active cache, and the read isn't throttled */
2309	if (during_cache) {
2310		(void) is_stolen;
2311		if (is_swap /*|| is_stolen*/) {  //rdar://10651288&10658086 seeing stolen pages early during boot
2312			if (is_swap) {
2313				debug("detected %s swap file, jettisoning cache", (bufflags & B_READ) ? "read from" : "write to");
2314			} else {
2315				debug("detected stolen page, jettisoning cache");
2316			}
2317			//rdar://9858070 Do this asynchronously to avoid deadlocks
2318			BC_terminate_cache_async();
2319		} else if ((bufflags & B_READ) &&
2320				   !(throttle_tier)) {
2321
2322			struct timeval current_time;
2323			if (BC_cache->c_stats.ss_history_num_recordings < 2) {
2324				microtime(&current_time);
2325				timersub(&current_time,
2326						 &BC_cache->c_loadtime,
2327						 &current_time);
2328			}
2329			/* Don't start counting misses until we've started login or hit our prelogin timeout */
2330			if (BC_cache->c_stats.ss_history_num_recordings >= 2 || current_time.tv_sec > BC_chit_prelogin_timeout_seconds) {
2331
2332				/* increase number of IOs since last hit */
2333				OSAddAtomic(1, &BC_cache->c_num_ios_since_last_hit);
2334
2335				if (BC_cache->c_num_ios_since_last_hit >= BC_chit_max_num_IOs_since_last_hit) {
2336					/*
2337					 * The hit rate is poor, shut the cache down.
2338					 */
2339					debug("hit rate below threshold (0 hits in the last %u lookups), jettisoning cache",
2340						  BC_cache->c_num_ios_since_last_hit);
2341					//rdar://9858070 Do this asynchronously to avoid deadlocks
2342					BC_terminate_cache_async();
2343				}
2344			}
2345		}
2346	}
2347
2348	if (is_swap && (! (BC_cache->c_flags & BC_FLAG_SHUTDOWN))) {
2349		/* if we're swapping, stop readahead */
2350		debug("Detected %s swap file. Terminating readahead", (bufflags & B_READ) ? "read from" : "write to");
2351		BC_set_flag(BC_FLAG_SHUTDOWN);
2352	}
2353
2354	KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, (should_throttle ? DBG_BC_IO_MISS_CUT_THROUGH : DBG_BC_IO_MISS)) | DBG_FUNC_NONE, buf_kernel_addrperm_addr(bp_void), 0, 0, 0, 0);
2355
2356	if (should_throttle && throttle_tier < IOPOL_THROTTLE) {
2357		/*
2358		 * We need to indicate to spec_strategy that we want to
2359		 * throttle this IO to avoid cutting through readahead
2360		 * too much. spec_strategy will check for the special
2361		 * return value 0xcafebeef to indicate that the IO
2362		 * should be throttled.
2363		 */
2364
2365		BC_ADD_STAT(strategy_forced_throttled, 1);
2366
2367#define IO_SHOULD_BE_THROTTLED ((int)0xcafebeef)
2368		return IO_SHOULD_BE_THROTTLED;
2369	}
2370
2371	return 0;
2372}
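
/*
 * Return-value summary for BC_strategy (per the comments above):
 *   IO_SATISFIED_BY_CACHE  (0xcafefeed) - the read was filled from the
 *                                         cache; spec_strategy skips
 *                                         throttling this thread.
 *   IO_SHOULD_BE_THROTTLED (0xcafebeef) - a cut-through IO was issued
 *                                         while readahead is active and
 *                                         should be throttled.
 *   0                                   - the request was passed through
 *                                         to the original strategy routine.
 */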
2373
2374/*
2375 * Handle a block range that needs to be discarded.
2376 *
2377 * Called with the cache mount read lock held
2378 *
2379 * Returns the number of bytes discarded
2380 */
2381static int
2382BC_handle_discards(struct BC_cache_mount *cm, u_int64_t offset, u_int64_t length)
2383{
2384	struct BC_cache_extent **pce, **p;
2385	int count, total_discards;
2386
2387	total_discards = 0;
2388
2389	/*
2390	 * Look for an extent that we overlap.
2391	 */
2392	if ((pce = BC_find_extent(cm, offset, length, 0, NULL)) == NULL)
2393		return 0;		/* doesn't affect us */
2394
2395	/*
2396	 * Discard bytes in the matched extent.
2397	 */
2398	LOCK_EXTENT(*pce);
2399	count = BC_discard_bytes((*pce), offset, length);
2400	UNLOCK_EXTENT(*pce);
2401	total_discards += count;
2402
2403	/*
2404	 * Scan adjacent extents for possible overlap and discard there as well.
2405	 */
2406	p = pce - 1;
2407	while (p >= cm->cm_pextents &&
2408		   BC_check_intersection((*p), offset, length)) {
2409		LOCK_EXTENT(*p);
2410		count = BC_discard_bytes((*p), offset, length);
2411		UNLOCK_EXTENT(*p);
2412		if (count == 0)
2413			break;
2414		total_discards += count;
2415		p--;
2416	}
2417	p = pce + 1;
2418	while (p < (cm->cm_pextents + cm->cm_nextents) &&
2419		   BC_check_intersection((*p), offset, length)) {
2420		LOCK_EXTENT(*p);
2421		count = BC_discard_bytes((*p), offset, length);
2422		UNLOCK_EXTENT(*p);
2423		if (count == 0)
2424			break;
2425		total_discards += count;
2426		p++;
2427	}
2428
2429	return total_discards;
2430}
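
/*
 * Illustrative example (hypothetical offsets, extents sorted by disk
 * offset): with extents A [0x1000,0x3000), B [0x3000,0x5000) and
 * C [0x5000,0x7000), and a discard covering [0x2000,0x6000),
 * BC_find_extent may return B; the backward scan then discards the tail
 * of A and the forward scan the head of C, each direction stopping as
 * soon as an extent contributes no discarded bytes.
 */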
2431
2432/*
2433 * Shut down readahead operations.
2434 */
2435static int
2436BC_terminate_readahead(void)
2437{
2438	int error;
2439	struct timespec timeout;
2440	timeout.tv_sec = 10;
2441	timeout.tv_nsec = 0;
2442
2443	/*
2444	 * Signal the readahead thread to terminate, and wait for
2445	 * it to comply.  If this takes > 10 seconds, give up.
2446	 */
2447	BC_set_flag(BC_FLAG_SHUTDOWN);
2448
2449	/*
2450	 * If readahead is still in progress, we have to shut it down
2451	 * cleanly.  This is an expensive process, but since readahead
2452	 * will normally complete long before the reads it tries to serve
2453	 * complete, it will typically only be invoked when the playlist
2454	 * is out of sync and the cache hit rate falls below the acceptable
2455	 * threshold.
2456	 *
2457	 * Note that if readahead is aborted, the reader thread will mark
2458	 * the aborted extents and wake up any strategy callers waiting
2459	 * on them, so we don't have to worry about them here.
2460	 */
2461	LOCK_READERS();
2462	while (BC_cache->c_num_reader_threads > 0) {
2463		debug("terminating active readahead");
2464
2465		error = msleep(&BC_cache->c_num_reader_threads, &BC_cache->c_reader_threads_lock, PRIBIO, "BC_terminate_readahead", &timeout);
2466		if (error == EWOULDBLOCK) {
2467			UNLOCK_READERS();
2468
2469			message("timed out waiting for I/O to stop");
2470			if (BC_cache->c_num_reader_threads == 0) {
2471				debug("but I/O has stopped!");
2472			}
2473#ifdef BC_DEBUG
2474 			Debugger("I/O Kit wedged on us");
2475#endif
2476			/*
2477			 * It might be nice to free all the pages that
2478			 * aren't actually referenced by the outstanding
2479			 * region, since otherwise we may be camped on a
2480			 * very large amount of physical memory.
2481			 *
2482			 * Ideally though, this will never happen.
2483			 */
2484			return(EBUSY);	/* really EWEDGED */
2485		}
2486	}
2487	UNLOCK_READERS();
2488
2489	return(0);
2490}
2491
2492static void
2493BC_terminate_cache_thread(void *param0, wait_result_t param1)
2494{
2495	BC_terminate_cache();
2496}
2497
2498/*
2499 * Start up an auxiliary thread to stop the cache so we avoid potential deadlocks
2500 */
2501static void
2502BC_terminate_cache_async(void)
2503{
2504	if (! (BC_cache->c_flags & BC_FLAG_CACHEACTIVE)) {
2505		return;
2506	}
2507
2508	int error;
2509	thread_t rthread;
2510
2511	debug("Kicking off thread to terminate cache");
2512	if ((error = kernel_thread_start(BC_terminate_cache_thread, NULL, &rthread)) == KERN_SUCCESS) {
2513		thread_deallocate(rthread);
2514	} else {
2515		message("Unable to start thread to terminate cache");
2516	}
2517}
2518
2519/*
2520 * Terminate the cache.
2521 *
2522 * This prevents any further requests from being satisfied from the cache
2523 * and releases all the resources owned by it.
2524 *
2525 * Must be called with no locks held
2526 */
2527static int
2528BC_terminate_cache(void)
2529{
2530	int retry, cm_idx, j, ce_idx, cel_idx;
2531
2532	BC_terminate_readahead();
2533
2534	/* can't shut down if readahead is still active */
2535	if (BC_cache->c_num_reader_threads > 0) {
2536		debug("cannot terminate cache while readahead is in progress");
2537		return(EBUSY);
2538	}
2539
2540	LOCK_CACHE_R();
2541
2542	LOCK_HANDLERS();
2543	if (!BC_clear_flag(BC_FLAG_CACHEACTIVE)) {
2544		/* can't shut down if we're not active */
2545		debug("cannot terminate cache when not active");
2546		UNLOCK_HANDLERS();
2547		UNLOCK_CACHE_R();
2548		return(ENXIO);
2549	}
2550
2551	bootcache_contains_block = NULL;
2552
2553	debug("terminating cache...");
2554
2555	/* if we're no longer recording history also, disconnect our strategy routine */
2556	BC_check_handlers();
2557	UNLOCK_HANDLERS();
2558
2559	/*
2560	 * Mark all extents as aborted. This will notify any sleepers in the
2561	 * strategy routine that the extent won't have data for them.
2562	 */
2563	for (cel_idx = 0;
2564		 cel_idx < BC_cache->c_nextentlists;
2565		 cel_idx++) {
2566		for (ce_idx = 0;
2567			 ce_idx < BC_cache->c_nextents[cel_idx];
2568			 ce_idx++) {
2569			struct BC_cache_extent *ce = BC_cache->c_extentlists[cel_idx] + ce_idx;
2570			LOCK_EXTENT(ce);
2571			/*
2572			 * Track unused bytes
2573			 */
2574			if (ce->ce_blockmap != NULL && BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize != 0) {
2575				for (j = 0; j < howmany(ce->ce_length, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize); j++) {
2576					if (CB_BLOCK_PRESENT(ce, j))
2577						BC_ADD_STAT(spurious_bytes, BC_cache->c_mounts[ce->ce_mount_idx].cm_blocksize);
2578				}
2579			}
2580
2581			BC_teardown_extent(ce);
2582			UNLOCK_EXTENT(ce);
2583			wakeup(ce);
2584		}
2585	}
2586
2587	for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
2588		struct BC_cache_mount *cm = BC_cache->c_mounts + cm_idx;
2589		LOCK_MOUNT_W(cm);
2590		BC_teardown_mount(cm);
2591		UNLOCK_MOUNT_W(cm);
2592	}
2593
2594
2595	/*
2596	 * It is possible that one or more callers are asleep in the
2597	 * strategy routine (e.g. if readahead was terminated early,
2598	 * or if we are called off the timeout).
2599	 * Check the count of callers in the strategy code, and sleep
2600	 * until there are none left (or we time out here).  Note that
2601	 * by clearing BC_FLAG_CACHEACTIVE above we prevent any new
2602	 * strategy callers from touching the cache, so the count
2603	 * must eventually drain to zero.
2604	 */
2605	retry = 0;
2606	while (BC_cache->c_strategycalls > 0) {
2607		tsleep(BC_cache, PRIBIO, "BC_terminate_cache", 10);
2608		if (retry++ > 50) {
2609			message("could not terminate cache, timed out with %d caller%s in BC_strategy",
2610					(int) BC_cache->c_strategycalls,
2611					BC_cache->c_strategycalls == 1 ? "" : "s");
2612			UNLOCK_CACHE_R();
2613			return(EBUSY);	/* again really EWEDGED */
2614		}
2615	}
2616
2617	if (! LOCK_CACHE_R_TO_W()) {
2618		/* We shouldn't get here. This is the only LOCK_CACHE_R_TO_W call,
2619		 * so failing implies someone is terminating the cache in parallel with us,
2620		 * yet clearing BC_FLAG_CACHEACTIVE above should have made us the sole owner.
2621		 */
2622		message("Unable to upgrade cache lock to free resources");
2623		return ENXIO;
2624	}
2625
2626	for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
2627		struct BC_cache_mount *cm = BC_cache->c_mounts + cm_idx;
2628		if (cm->cm_disk != NULL) {
2629			for (j = cm_idx + 1; j < BC_cache->c_nmounts; j++) {
2630				if (BC_cache->c_mounts[j].cm_disk == cm->cm_disk) {
2631					BC_cache->c_mounts[j].cm_disk = NULL;
2632				}
2633			}
2634			BC_teardown_disk(cm->cm_disk);
2635			lck_mtx_destroy(&cm->cm_disk->cd_lock, BC_cache->c_lckgrp);
2636			_FREE_ZERO(cm->cm_disk, M_TEMP);
2637		}
2638	}
2639
2640	BC_free_pagebuffer();
2641	BC_cache->c_cachesize = 0;
2642
2643	/* free memory held by extents and mounts */
2644	for (cel_idx = 0;
2645		 cel_idx < BC_cache->c_nextentlists;
2646		 cel_idx++) {
2647		for (ce_idx = 0;
2648			 ce_idx < BC_cache->c_nextents[cel_idx];
2649			 ce_idx++) {
2650			lck_mtx_destroy(&BC_cache->c_extentlists[cel_idx][ce_idx].ce_lock, BC_cache->c_lckgrp);
2651		}
2652		_FREE_ZERO(BC_cache->c_extentlists[cel_idx], M_TEMP);
2653	}
2654	_FREE_ZERO(BC_cache->c_extentlists, M_TEMP);
2655	_FREE_ZERO(BC_cache->c_nextents, M_TEMP);
2656	BC_cache->c_nextentlists = 0;
2657
2658	for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
2659		lck_rw_destroy(&BC_cache->c_mounts[cm_idx].cm_lock, BC_cache->c_lckgrp);
2660	}
2661	_FREE_ZERO(BC_cache->c_mounts, M_TEMP);
2662	BC_cache->c_nmounts = 0;
2663
2664	UNLOCK_CACHE_W();
2665
2666	/* record stop time */
2667	struct timeval endtime;
2668	microtime(&endtime);
2669	timersub(&endtime,
2670			 &BC_cache->c_cache_starttime,
2671			 &endtime);
2672	BC_cache->c_stats.ss_cache_time += (u_int) endtime.tv_sec * 1000 + (u_int) endtime.tv_usec / 1000;
2673	return(0);
2674}
2675
2676/*
2677 * Terminate history recording.
2678 *
2679 * This stops us recording any further history events.
2680 */
2681static int
2682BC_terminate_history(void)
2683{
2684	struct BC_history_mount_device  *hmd;
2685	LOCK_HANDLERS();
2686	if (!BC_clear_flag(BC_FLAG_HISTORYACTIVE)) {
2687		/* can't shut down if we're not active */
2688		debug("cannot terminate history when not active");
2689		UNLOCK_HANDLERS();
2690		return(ENXIO);
2691	}
2692
2693	debug("terminating history collection...");
2694
2695	/* if the cache is no longer active also, disconnect our strategy routine */
2696	BC_check_handlers();
2697	UNLOCK_HANDLERS();
2698
2699	/*
2700	 * Kill the timeout handler; we don't want it going off
2701	 * now.
2702	 */
2703	untimeout(BC_timeout_history, NULL);
2704
2705	if (BC_cache->c_take_detailed_stats) {
2706		BC_cache->c_stats.ss_history_mount_no_uuid = 0;
2707		for (hmd = BC_cache->c_history_mounts; hmd != NULL; hmd = hmd->hmd_next)
2708			if (uuid_is_null(hmd->hmd_mount.hm_uuid))
2709				BC_ADD_STAT(history_mount_no_uuid, 1);
2710	}
2711
2712	/* record stop time */
2713	if (BC_cache->c_take_detailed_stats) {
2714		struct timeval endtime;
2715		/* record stop time */
2716		microtime(&endtime);
2717		timersub(&endtime,
2718				 &BC_cache->c_history_starttime,
2719				 &endtime);
2720		BC_ADD_STAT(history_time, (u_int) endtime.tv_sec * 1000 + (u_int) endtime.tv_usec / 1000);
2721	}
2722
2723	BC_cache->c_take_detailed_stats = 0;
2724
2725	return(0);
2726}
2727
2728/*
2729 * Check our strategy handlers and make sure they are set/cleared depending on our current state.
2730 *
2731 * Should be called after changing BC_FLAG_CACHEACTIVE or BC_FLAG_HISTORYACTIVE.
2732 * Called with the handlers lock held.
2733 */
2734static void
2735BC_check_handlers(void)
2736{
2737	if (BC_cache->c_devsw == NULL ||
2738		BC_cache->c_strategy == NULL ||
2739		BC_cache->c_close == NULL) {
2740		debug("Cannot check device handlers: cache not setup properly");
2741		return;
2742	}
2743
2744	/* if the cache or history recording is active, make sure we've captured the appropriate device handler routines */
2745	if ((BC_cache->c_flags & BC_FLAG_CACHEACTIVE) ||
2746		(BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) {
2747
2748		if (BC_cache->c_devsw->d_strategy != (strategy_fcn_t*) BC_strategy ||
2749			BC_cache->c_devsw->d_close    != BC_close) {
2750
2751			debug("connecting handlers...");
2752
2753			/* connect the strategy and close routines */
2754			BC_cache->c_devsw->d_strategy = (strategy_fcn_t*) BC_strategy;
2755			BC_cache->c_devsw->d_close    = BC_close;
2756		}
2757	} else {
2758		if (BC_cache->c_devsw->d_strategy != BC_cache->c_strategy ||
2759			BC_cache->c_devsw->d_close    != BC_cache->c_close) {
2760
2761			debug("disconnecting handlers...");
2762
2763			/* disconnect the strategy and close routines */
2764			BC_cache->c_devsw->d_strategy = BC_cache->c_strategy;
2765			BC_cache->c_devsw->d_close    = BC_cache->c_close;
2766		}
2767	}
2768}
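
/*
 * In short: while either BC_FLAG_CACHEACTIVE or BC_FLAG_HISTORYACTIVE is
 * set, the devsw strategy/close entries point at BC_strategy/BC_close;
 * once both flags are clear, the original handlers captured at setup time
 * are restored.
 */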
2769
2770/*
2771 * Setup a cache extent with the parameters given by the playlist entry
2772 *
2773 * Returns 0 on success
2774 */
2775static int
2776BC_setup_extent(struct BC_cache_extent *ce, const struct BC_playlist_entry *pe, int batch_adjustment, int cache_mount_idx)
2777{
2778	lck_mtx_init(&ce->ce_lock, BC_cache->c_lckgrp,
2779				 LCK_ATTR_NULL);
2780	ce->ce_diskoffset = pe->pe_offset;
2781	ce->ce_mount_idx = cache_mount_idx;
2782	ce->ce_length = pe->pe_length;
2783#ifdef IGNORE_BATCH
2784	ce->ce_batch = 0;
2785#else
2786	ce->ce_batch = pe->pe_batch;
2787#endif
2788	ce->ce_batch += batch_adjustment;
2789	if (ce->ce_batch > BC_MAXBATCHES) {
2790		ce->ce_batch = BC_MAXBATCHES; /* cap batch number */
2791	}
2792	ce->ce_cacheoffset = 0;
2793	ce->ce_blockmap = NULL;
2794	ce->ce_flags = 0;
2795	if (pe->pe_flags & BC_PE_LOWPRIORITY) {
2796		ce->ce_flags |= CE_LOWPRI;
2797	}
2798	if (pe->pe_flags & BC_PE_SHARED) {
2799		ce->ce_flags |= CE_SHARED;
2800	}
2801
2802	/* track highest batch number for this playlist */
2803	if (ce->ce_batch >= BC_cache->c_batch_count) {
2804		BC_cache->c_batch_count = ce->ce_batch + 1;
2805		// debug("Largest batch is now %d from extent with disk offset %llu", BC_cache->c_batch_count, ce->ce_diskoffset);
2806	}
2807
2808	return 0;
2809}
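
/*
 * Example (illustrative, with IGNORE_BATCH undefined): a playlist entry
 * with pe_batch = 2 loaded with batch_adjustment = 1 yields ce_batch = 3
 * (capped at BC_MAXBATCHES), and c_batch_count grows to 4 if this is the
 * highest batch seen so far.
 */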
2810
2811/*
2812 * The blocksize is initialised from the first playlist read, the statistics
2813 * structure, or it can be pre-set by the caller.  Once set, only playlists with
2814 * matching sizes can be read/written.
2815 */
2816#define	BLOCK_ROUNDDOWN(cm, x)	(((x) / (cm)->cm_blocksize) * (cm)->cm_blocksize)
2817#define BLOCK_ROUNDUP(cm, x)	((((x) + ((cm)->cm_blocksize - 1)) / (cm)->cm_blocksize) * (cm)->cm_blocksize)
2818/*
2819 * optional for power-of-two blocksize roundup:
2820 * (((x) + ((cm)->cm_blocksize - 1)) & (~((cm)->cm_blocksize - 1)))
2821 */
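
/*
 * Example with cm_blocksize = 512: BLOCK_ROUNDDOWN(cm, 1000) = 512 and
 * BLOCK_ROUNDUP(cm, 1000) = 1024.  When the blocksize is a power of two,
 * the masking form above produces the same results.
 */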
2822
2823/*
2824 * Fill in a cache extent now that its mount has been filled in
2825 *
2826 * Called with the extent lock held
2827 *
2828 * Returns 0 on success
2829 */
2830static int
2831BC_fill_in_extent(struct BC_cache_extent *ce)
2832{
2833	int numblocks, roundsize, i;
2834	u_int64_t end;
2835
2836	if (ce->ce_flags & CE_ABORTED ||
2837		ce->ce_length == 0) {
2838		return 1;
2839	}
2840
2841	struct BC_cache_mount *cm = BC_cache->c_mounts + ce->ce_mount_idx;
2842
2843	int blocksize = cm->cm_blocksize;
2844
2845	if (0 == blocksize) {
2846		return 1;
2847	}
2848
2849	roundsize = roundup(ce->ce_length, PAGE_SIZE);
2850
2851	/* make sure we're on block boundaries */
2852	end = ce->ce_diskoffset + roundsize;
2853	ce->ce_diskoffset = BLOCK_ROUNDDOWN(cm, ce->ce_diskoffset);
2854	ce->ce_length = BLOCK_ROUNDUP(cm, end) - ce->ce_diskoffset;
2855
2856	/* make sure we didn't grow our pagebuffer size since it's already been allocated */
2857	if (roundup(ce->ce_length, PAGE_SIZE) > roundsize) {
2858		debug("Clipped extent %llu by a page", ce->ce_diskoffset);
2859		BC_ADD_STAT(extents_clipped, 1);
2860		ce->ce_length = roundsize;
2861	}
2862
2863	numblocks = howmany(ce->ce_length, blocksize);
2864
2865	ce->ce_blockmap = BC_MALLOC(howmany(numblocks, (CB_MAPFIELDBITS / CB_MAPFIELDBYTES)),
2866							  M_TEMP, M_WAITOK | M_ZERO);
2867	if (!ce->ce_blockmap) {
2868		message("can't allocate extent blockmap for %d blocks of %d bytes", numblocks, howmany(numblocks, (CB_MAPFIELDBITS / CB_MAPFIELDBYTES)));
2869		return 1;
2870	}
2871
2872	for (i = 0; i < howmany(ce->ce_length, blocksize); i++) {
2873		CB_MARK_BLOCK_PRESENT((ce), i);
2874	}
2875
2876	return 0;
2877}
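
/*
 * Worked example of the rounding above (illustrative, assuming 4 KB blocks
 * and a 4 KB PAGE_SIZE): an extent with ce_diskoffset 0x1234 and ce_length
 * 0x3000 gives roundsize 0x3000; rounding to block boundaries yields
 * ce_diskoffset 0x1000 and ce_length 0x4000, which exceeds the pagebuffer
 * reservation and is therefore clipped back to 0x3000, i.e. a three-block
 * blockmap.
 */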
2878
2879static void
2880BC_teardown_extent(struct BC_cache_extent *ce)
2881{
2882	ce->ce_flags |= CE_ABORTED;
2883	_FREE_ZERO(ce->ce_blockmap, M_TEMP);
2884}
2885
2886/*
2887 * Setup a cache disk.
2888 * Returns 0 on success
2889 */
2890static int
2891BC_setup_disk(struct BC_cache_disk *cd, u_int64_t disk_id, int is_ssd)
2892{
2893	static int next_disk_num;
2894	lck_mtx_init(&cd->cd_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
2895	cd->cd_disk_id = disk_id;
2896	cd->cd_disk_num = next_disk_num++;
2897	cd->cd_flags = 0;
2898	cd->cd_nmounts = 0;
2899	cd->cd_batch = 0;
2900
2901	if (is_ssd) {
2902		cd->cd_flags |= CD_IS_SSD;
2903	}
2904
2905	debug("Setup disk 0x%llx as disk num %d%s", disk_id, cd->cd_disk_num, (cd->cd_flags & CD_IS_SSD) ? " (ssd)" : "");
2906
2907	return (0);
2908}
2909
2910/*
2911 * Teardown a cache disk.
2912 */
2913static void
2914BC_teardown_disk(struct BC_cache_disk *cd)
2915{
2916	/* Nothing to do */
2917}
2918
2919
2920/*
2921 * Check if a mount has become available for readahead
2922 *
2923 * If so, make sure the reader thread picks up this mount's IOs.
2924 * If there is no reader thread for the mount's disk, kick one off.
2925 *
2926 * Called with the cache mount read lock held
2927 */
2928static void
2929BC_mount_available(struct BC_cache_mount *cm)
2930{
2931#ifdef EMULATE_ONLY
2932	int i;
2933	for (i = 0; i < cm->cm_nextents; i++)
2934			cm->cm_pextents[i]->ce_flags |= CE_IODONE;
2935#else
2936	int i, error;
2937	thread_t rthread;
2938	if (cm->cm_state != CM_READY) {
2939		/* not ready */
2940		return;
2941	}
2942
2943	struct BC_cache_disk *cd = cm->cm_disk;
2944	LOCK_DISK(cd);
2945	cd->cd_nmounts++;
2946	LOCK_READERS();
2947	if (!(BC_cache->c_flags & BC_FLAG_SHUTDOWN)) {
2948		if (cd->cd_flags & CD_HAS_THREAD) {
2949			/* Make sure the thread is not issuing lowpriority IO */
2950			if (cd->cd_flags & CD_ISSUE_LOWPRI) {
2951				debug("Interrupting low-priority thread for new mount %s", uuid_string(cm->cm_uuid));
2952				cd->cd_flags &= (~CD_ISSUE_LOWPRI);
2953				/* TODO: Unthrottle the readahead thread here rather than waiting out its current throttle delay */
2954			}
2955			UNLOCK_READERS();
2956			UNLOCK_DISK(cd);
2957			return;
2958		}
2959
2960		/* Make sure we issue regular IOs for the new playlist in case we were issuing low-priority previously */
2961		cd->cd_flags &= (~CD_ISSUE_LOWPRI);
2962
2963		debug("Kicking off reader thread for disk %d for mount %s", cd->cd_disk_num, uuid_string(cm->cm_uuid));
2964		if ((error = kernel_thread_start(BC_reader_thread, cd, &rthread)) == KERN_SUCCESS) {
2965			thread_deallocate(rthread);
2966			BC_cache->c_num_reader_threads++;
2967			BC_ADD_STAT(readahead_threads, 1);
2968			cd->cd_flags |= CD_HAS_THREAD;
2969			UNLOCK_READERS();
2970			UNLOCK_DISK(cd);
2971			return;
2972		}
2973
2974		message("Unable to start reader thread for disk %d: %d", cd->cd_disk_num, error);
2975	}
2976	UNLOCK_READERS();
2977	UNLOCK_DISK(cd);
2978
2979	/*
2980	 * Getting here indicates some failure.
2981	 *
2982	 * Mark all extents as aborted. This will notify any sleepers in the
2983	 * strategy routine that the extent won't have data for them.
2984	 */
2985	for (i = 0; i < cm->cm_nextents; i++) {
2986		LOCK_EXTENT(cm->cm_pextents[i]);
2987		BC_teardown_extent(cm->cm_pextents[i]);
2988		UNLOCK_EXTENT(cm->cm_pextents[i]);
2989		wakeup(cm->cm_pextents[i]);
2990	}
2991#endif
2992}
2993
2994/*
2995 * Setup a cache mount from the playlist mount.
2996 *
2997 * Allocates cm_pextents large enough to hold pm->pm_nentries extents,
2998 * but leaves cm_nextents 0 since the array hasn't been initialized.
2999 *
3000 * Returns 0 on success
3001 */
3002static int
3003BC_setup_mount(struct BC_cache_mount *cm, struct BC_playlist_mount* pm)
3004{
3005	int error = 0;
3006
3007	lck_rw_init(&cm->cm_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
3008	uuid_copy(cm->cm_uuid, pm->pm_uuid);
3009
3010	/* These will be set once we've detected that this volume has been mounted */
3011	cm->cm_dev = nulldev();
3012	cm->cm_bp = NULL;
3013	cm->cm_devsize = 0;
3014	cm->cm_maxread = 0;
3015	cm->cm_disk = NULL;
3016	cm->cm_blocksize = 0;
3017	cm->cm_nextents = 0;
3018
3019	/* Allocate the sorted extent array.
3020	 * We'll fill it in while we setup the extents.
3021	 */
3022	if (pm->pm_nentries == 0) {
3023		message("Playlist included mount with 0 entries");
3024		error = EINVAL;
3025		goto out;
3026	}
3027
3028	cm->cm_pextents = BC_MALLOC(pm->pm_nentries * sizeof(*cm->cm_pextents), M_TEMP, M_WAITOK | M_ZERO);
3029	if (cm->cm_pextents == NULL) {
3030		message("can't allocate mount's extent array (%d entries)", pm->pm_nentries);
3031		error = ENOMEM;
3032		goto out;
3033	}
3034
3035	cm->cm_state = CM_SETUP;
3036
3037out:
3038	if (error) {
3039		cm->cm_state = CM_ABORTED;
3040		_FREE_ZERO(cm->cm_pextents, M_TEMP);
3041		lck_rw_destroy(&cm->cm_lock, BC_cache->c_lckgrp);
3042	}
3043	return error;
3044}
3045
3046/*
3047 * Fill in the rest of the cache mount given the matching mount structure.
3048 *
3049 * Called with the cache mount write lock held
3050 *
3051 * Returns 0 if the mount was successfully filled in and cm_state will be CM_READY
3052 * Returns non-0 on failure or if the mount was already filled in
3053 */
3054static int
3055BC_fill_in_mount(struct BC_cache_mount *cm, mount_t mount, vfs_context_t context)
3056{
3057	uint64_t blkcount, max_byte_count, max_segment_count, max_segment_byte_count, max_block_count;
3058	uint32_t blksize, is_ssd;
3059	int error, mount_idx, i;
3060	u_int64_t disk_id;
3061	struct BC_cache_disk *cd;
3062	vnode_t devvp = NULLVP;
3063
3064	error = 0;
3065
3066	if (CM_SETUP != cm->cm_state) {
3067		return EINVAL;
3068	}
3069
3070	cm->cm_throttle_mask = vfs_throttle_mask(mount);
3071	disk_id = cm->cm_throttle_mask; /* using the throttle mask as a way to identify the physical disk */
3072	/* debug("Got throttle mask %llx for mount %s", cm->cm_throttle_mask, uuid_string(cm->cm_uuid)); */
3073
3074	devvp = vfs_devvp(mount);
3075	if (devvp == NULLVP) {
3076		message("mount %s does not have a vnode", uuid_string(cm->cm_uuid));
3077		error = EINVAL;
3078		goto out;
3079	}
3080
3081#ifdef ROOT_DISK_ONLY
3082	if (devvp == rootvp) {
3083		BC_cache->c_root_disk_id = disk_id;
3084		if (BC_cache->c_root_disk_id == 0) {
3085			message("Root disk is 0");
3086		} else {
3087			debug("Root disk (via cache) is 0x%llx", BC_cache->c_root_disk_id);
3088		}
3089	} else if (0 == BC_cache->c_root_disk_id) {
3090		error = EAGAIN; /* we haven't seen the root mount yet, try again later */
3091		goto out;
3092
3093	//rdar://11653286 disk image volumes (FileVault 1) are messing with this check, so we're going back to != rather than !( & )
3094	} else if (BC_cache->c_root_disk_id != disk_id) {
3095		debug("mount %s (disk 0x%llx) is not on the root disk (disk 0x%llx)", uuid_string(cm->cm_uuid), disk_id, BC_cache->c_root_disk_id);
3096		error = ENODEV;
3097		goto out;
3098	}
3099#endif
3100
3101	/* See if we already have a cache_disk for this disk */
3102	for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
3103		if (BC_cache->c_mounts + mount_idx == cm) continue;
3104
3105		cd = BC_cache->c_mounts[mount_idx].cm_disk;
3106
3107		/*
3108		 * We're not handling mounts backed by multiple disks as gracefully as we should.
3109		 * Previously, this was cd->cd_disk_id == disk_id, so we had a thread per disk combination
3110		 * meaning reader threads may span multiple disks and disks may have multiple reader threads.
3111		 * We've only ever supported the root disk, however, so this wasn't a problem, it just missed
3112		 * cases where you'd have other volumes on one of the disks you're booting from.
3113		 *
3114		 * Now, since we are checking for cd->cd_disk_id & disk_id, we at least include all mounts that
3115		 * are on disks that the root mount uses. We still only have one reader thread, but we don't support
3116		 * playback on composite disks, so that's not a problem yet. See rdar://10081513
3117		 *
3118		 * This assumes that the root mount (or, at least the mount we care most about) will always appear first
3119		 *
3120		 */
3121		if (cd && (cd->cd_disk_id & disk_id)) {
3122			cm->cm_disk = cd;
3123			break;
3124		}
3125	}
3126
3127	/* If we don't already have a cache_disk for this disk, allocate one */
3128	if (cm->cm_disk == NULL) {
3129		cd = BC_MALLOC(sizeof(*cd), M_TEMP, M_WAITOK | M_ZERO);
3130		if (cd == NULL) {
3131			message("can't allocate memory for cache disk");
3132			error = ENOMEM;
3133			goto out;
3134		}
3135
3136		if (VNOP_IOCTL(devvp,       /* vnode */
3137					   DKIOCISSOLIDSTATE,    /* cmd */
3138					   (caddr_t)&is_ssd, /* data */
3139					   0,
3140					   context))           /* context */
3141		{
3142			message("can't determine if disk is a solid state disk for mount %s", uuid_string(cm->cm_uuid));
3143			is_ssd = 0;
3144		}
3145
3146		if ((error = BC_setup_disk(cd, disk_id, is_ssd)) != 0) {
3147			_FREE_ZERO(cd, M_TEMP);
3148			message("cache disk setup failed: %d", error);
3149			goto out;
3150		}
3151		cm->cm_disk = cd;
3152	}
3153
3154
3155	if (VNOP_IOCTL(devvp,
3156				   DKIOCGETBLOCKCOUNT,
3157				   (caddr_t)&blkcount,
3158				   0,
3159				   context)
3160		|| blkcount == 0)
3161	{
3162		message("can't determine device size, not checking");
3163		blkcount = 0;
3164	}
3165
3166	if (VNOP_IOCTL(devvp,
3167				   DKIOCGETBLOCKSIZE,
3168				   (caddr_t)&blksize,
3169				   0,
3170				   context)
3171		|| blksize == 0)
3172	{
3173		message("can't determine device block size for mount %s, defaulting to 512 bytes", uuid_string(cm->cm_uuid));
3174		blksize = 512;
3175	}
3176
3177	if (PAGE_SIZE % blksize != 0) {
3178		message("PAGE_SIZE (%d) is not a multiple of block size (%d) for mount %s", PAGE_SIZE, blksize, uuid_string(cm->cm_uuid));
3179		error = EINVAL;
3180		goto out;
3181	}
3182
3183	cm->cm_blocksize = blksize;
3184	cm->cm_devsize = blksize * blkcount;
3185
3186	/* max read size isn't larger than the max UPL size */
3187	cm->cm_maxread = ubc_upl_maxbufsize();
3188
3189	/* maxread = min ( maxread, MAXBYTECOUNTREAD ) */
3190	if (0 == VNOP_IOCTL(devvp,
3191						DKIOCGETMAXBYTECOUNTREAD,
3192						(caddr_t)&max_byte_count,
3193						0,
3194						context)) {
3195		if (cm->cm_maxread > max_byte_count && max_byte_count > 0) {
3196			cm->cm_maxread = max_byte_count;
3197			debug("MAXBYTECOUNTREAD is %#llx", max_byte_count);
3198		}
3199	}
3200
3201	/* maxread = min ( maxread, MAXBLOCKCOUNTREAD *  BLOCKSIZE ) */
3202	if (0 == VNOP_IOCTL(devvp,
3203						DKIOCGETMAXBLOCKCOUNTREAD,
3204						(caddr_t)&max_block_count,
3205						0,
3206						context)) {
3207		if (cm->cm_maxread > max_block_count * cm->cm_blocksize && max_block_count > 0) {
3208			cm->cm_maxread = max_block_count * cm->cm_blocksize;
3209			debug("MAXBLOCKCOUNTREAD is %#llx, BLOCKSIZE is %#llx, (multiplied %#llx)", max_block_count, cm->cm_blocksize, max_block_count * cm->cm_blocksize);
3210		}
3211	}
3212
3213	/* maxread = min ( maxread, MAXSEGMENTCOUNTREAD * min (MAXSEGMENTBYTECOUNTREAD, PAGE_SIZE ) ) */
3214	if (0 == VNOP_IOCTL(devvp,
3215						DKIOCGETMAXSEGMENTCOUNTREAD,
3216						(caddr_t)&max_segment_count,
3217						0,
3218						context)) {
3219
3220		if (max_segment_count > 0) {
3221
3222			if (0 == VNOP_IOCTL(devvp,
3223								DKIOCGETMAXSEGMENTBYTECOUNTREAD,
3224								(caddr_t)&max_segment_byte_count,
3225								0,
3226								context)) {
3227				//rdar://13835534 Limit max_segment_byte_count to PAGE_SIZE because some drives don't handle the spec correctly
3228				if (max_segment_byte_count > PAGE_SIZE || max_segment_byte_count == 0) {
3229					debug("MAXSEGMENTBYTECOUNTREAD is %#llx, limiting to PAGE_SIZE %#x", max_segment_byte_count, PAGE_SIZE);
3230					max_segment_byte_count = PAGE_SIZE;
3231				}
3232			} else {
3233				debug("Unable to get MAXSEGMENTBYTECOUNTREAD, assuming PAGE_SIZE %#x", PAGE_SIZE);
3234				max_segment_byte_count = PAGE_SIZE;
3235			}
3236
3237			if (cm->cm_maxread > max_segment_count * max_segment_byte_count) {
3238				cm->cm_maxread = max_segment_count * max_segment_byte_count;
3239				debug("MAXSEGMENTCOUNTREAD is %#llx, MAXSEGMENTBYTECOUNTREAD is %#llx, (multiplied %#llx)", max_segment_count, max_segment_byte_count, max_segment_count * max_segment_byte_count);
3240			}
3241		}
3242	}
3243
3244	/* maxread = min ( maxread, MAX_UPL_TRANSFER * PAGE_SIZE ) */
3245	if (cm->cm_maxread > MAX_UPL_TRANSFER * PAGE_SIZE) {
3246		cm->cm_maxread = MAX_UPL_TRANSFER * PAGE_SIZE;
3247		debug("MAX_UPL_TRANSFER is %#x, PAGE_SIZE is %#x, (multiplied %#x)", MAX_UPL_TRANSFER, PAGE_SIZE, MAX_UPL_TRANSFER * PAGE_SIZE);
3248	}
3249
3250	/* maxread = max ( maxread, MAXPHYS ) */
3251	if (cm->cm_maxread < MAXPHYS) {
3252		debug("can't determine device read size for mount %s; using default", uuid_string(cm->cm_uuid));
3253		cm->cm_maxread = MAXPHYS;
3254	}
3255
3256	/* make sure maxread is a multiple of the block size */
3257	if (cm->cm_maxread % cm->cm_blocksize != 0) {
3258		debug("Mount max IO size (%llu) not a multiple of block size (%llu)", cm->cm_maxread, cm->cm_blocksize);
3259		cm->cm_maxread -= cm->cm_maxread % cm->cm_blocksize;
3260	}
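
	/* At this point cm_maxread is the UPL maximum, further limited by the
	 * device's MAXBYTECOUNTREAD, MAXBLOCKCOUNTREAD and MAXSEGMENTCOUNTREAD
	 * constraints, capped at MAX_UPL_TRANSFER pages, floored at MAXPHYS,
	 * and trimmed down to a multiple of the mount's block size.
	 */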
3261
3262	cm->cm_bp = buf_alloc(devvp);
3263	if (cm->cm_bp == NULL) {
3264		message("can't allocate buf");
3265		error = ENOMEM;
3266		goto out;
3267	}
3268
3269	cm->cm_dev = vnode_specrdev(devvp);
3270	if (cm->cm_dev == nulldev()) {
3271		message("mount %s does not have a device", uuid_string(cm->cm_uuid));
3272		error = EINVAL;
3273		goto out;
3274	}
3275
3276	for (i = 0; i < cm->cm_nextents; i++) {
3277		LOCK_EXTENT(cm->cm_pextents[i]);
3278		if (0 != BC_fill_in_extent(cm->cm_pextents[i])) {
3279			BC_teardown_extent(cm->cm_pextents[i]);
3280			UNLOCK_EXTENT(cm->cm_pextents[i]);
3281			wakeup(cm->cm_pextents[i]);
3282		} else {
3283			UNLOCK_EXTENT(cm->cm_pextents[i]);
3284		}
3285	}
3286
3287	cm->cm_state = CM_READY;
3288
3289	debug("Found new cache mount %s disk %d, %d extents", uuid_string(cm->cm_uuid), cm->cm_disk->cd_disk_num, cm->cm_nextents);
3290
3291out:
3292	if (error && error != EAGAIN) {
3293		/*
3294		 * Mark all extents as aborted. This will notify any sleepers in the
3295		 * strategy routine that the extent won't have data for them.
3296		 *
3297		 * Note that its fine to take the extent lock here since no one will
3298		 * have extent lock and try to take the mount lock if the mount isn't CM_READY
3299		 */
3300		for (i = 0; i < cm->cm_nextents; i++) {
3301			LOCK_EXTENT(cm->cm_pextents[i]);
3302			BC_teardown_extent(cm->cm_pextents[i]);
3303			UNLOCK_EXTENT(cm->cm_pextents[i]);
3304			wakeup(cm->cm_pextents[i]);
3305		}
3306
3307		BC_teardown_mount(cm);
3308	}
3309	if (devvp != NULLVP) {
3310		vnode_put(devvp);
3311	}
3312	return error;
3313}
3314
3315/*
3316 * Called with the cache mount write lock held
3317 */
3318static void
3319BC_teardown_mount(struct BC_cache_mount *cm)
3320{
3321	if (cm->cm_bp) {
3322		buf_free(cm->cm_bp);
3323		cm->cm_bp = NULL;
3324	}
3325	cm->cm_nextents = 0;
3326	_FREE_ZERO(cm->cm_pextents, M_TEMP);
3327	cm->cm_state = CM_ABORTED;
3328}
3329
3330/*
3331 * Check to see which mounts have been mounted and finish setting them up.
3332 *
3333 * No extent/mount locks required since we have a write lock on the cache
3334 *
3335 * Called with the cache write lock held
3336 */
3337static int fill_in_bc_cache_mounts(mount_t mount, void* arg)
3338{
3339	int mount_idx, error, i;
3340	struct vfs_attr attr;
3341	vfs_context_t context;
3342	int* go_again = (int*) arg;
3343
3344	VFSATTR_INIT(&attr);
3345	VFSATTR_WANTED(&attr, f_uuid);
3346	context = vfs_context_create(NULL);
3347	error = vfs_getattr(mount, &attr, context);
3348	if ((0 != error) || (! VFSATTR_IS_SUPPORTED(&attr, f_uuid))) {
3349#ifdef BC_DEBUG
3350		char name[MFSNAMELEN];
3351		vfs_name(mount, name);
3352		if (strncmp("devfs", name, sizeof("devfs")) != 0) {
3353			debug("Found mount %s for IO without a UUID: error %d", name, error);
3354		}
3355#endif
3356		vfs_context_rele(context);
3357		return VFS_RETURNED;
3358	} else {
3359		// debug("Found mount for IO with UUID %s", uuid_string(attr.f_uuid));
3360
3361		for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
3362			struct BC_cache_mount *cm = BC_cache->c_mounts + mount_idx;
3363			int match = 0;
3364			if (CM_SETUP == cm->cm_state) {
3365
3366				if (0 == uuid_compare(cm->cm_uuid, attr.f_uuid)) {
3367					match = 1;
3368				}
3369
3370				/* a null uuid indicates we want to match the root volume, no matter what the uuid (8350414) */
3371				if ((!match) && uuid_is_null(cm->cm_uuid)) {
3372					vnode_t devvp = vfs_devvp(mount);
3373					if (vnode_specrdev(devvp) == rootdev) {
3374						uuid_copy(cm->cm_uuid, attr.f_uuid);
3375						match = 1;
3376						debug("Found root mount %s", uuid_string(cm->cm_uuid));
3377					}
3378					vnode_put(devvp);
3379				}
3380
3381				if (match) {
3382					/* Found a matching mount */
3383
3384					/* Locking here isn't necessary since we're only called while holding the cache write lock
3385					 * and no one holds the mount locks without also holding the cache lock
3386					 */
3387					if (BC_fill_in_mount(cm, mount, context) == EAGAIN) {
3388						*go_again = 1;
3389					}
3390					vfs_context_rele(context);
3391
3392					/* Check to see if we have any more mounts to fill in */
3393					for (i = 0; i < BC_cache->c_nmounts; i++) {
3394						if (CM_SETUP == BC_cache->c_mounts[i].cm_state) {
3395							/* have more to fill in, keep iterating */
3396							return VFS_RETURNED;
3397						}
3398					}
3399
3400					return VFS_RETURNED_DONE;
3401				}
3402			}
3403		}
3404	}
3405
3406	vfs_context_rele(context);
3407
3408	/* Check to see if we have any more mounts to fill in */
3409	for (i = 0; i < BC_cache->c_nmounts; i++) {
3410		if (CM_SETUP == BC_cache->c_mounts[i].cm_state) {
3411			/* have more to fill in, keep iterating */
3412			return VFS_RETURNED;
3413		}
3414	}
3415
3416	debug("No new mounts filled in fill_in_bc_cache_mounts");
3417	return VFS_RETURNED_DONE;
3418
3419}
3420
3421#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
3422/*
3423 * Sort the extent list.
3424 */
3425static int
3426compare_cache_extents(const void *vfirst, const void *vsecond)
3427{
3428	const struct BC_cache_extent *first, *second;
3429
3430	first  = (const struct BC_cache_extent *)vfirst;
3431	second = (const struct BC_cache_extent *)vsecond;
3432
	// Sort by mount index first, then by disk offset
3434	int mount_comparison = first->ce_mount_idx - second->ce_mount_idx;
3435	if (mount_comparison != 0)
3436		return((mount_comparison < 0) ? -1 : 1);
3437
3438	if (first->ce_diskoffset == second->ce_diskoffset)
3439		return(0);
3440	return((first->ce_diskoffset < second->ce_diskoffset) ? -1 : 1);
3441}
3442#endif
3443
3444/*
3445 * Fetch the playlist from userspace or the Mach-O segment where it
3446 * was preloaded. Add it to the cache and readahead in batches after
3447 * the current cache, if any.
3448 *
3449 * Returns 0 on success. In failure cases, the cache is not modified.
3450 *
3451 * Called with the cache write lock held.
3452 */
3453static int
3454BC_copyin_playlist(size_t mounts_size, user_addr_t mounts_buf, size_t entries_size, user_addr_t entries_buf)
3455{
3456	int nplaylist_mounts, nplaylist_entries;
3457	struct BC_playlist_mount *playlist_mounts = NULL, *pm;
3458	struct BC_playlist_entry *playlist_entries = NULL, *pe;
3459	struct BC_cache_mount  *cm, *old_cm;
3460	struct BC_cache_extent *ce;
3461
3462	int ncache_mounts = 0, ncache_extents = 0, nnew_mounts = 0;
3463	struct BC_cache_mount  *cache_mounts = NULL;
3464	struct BC_cache_extent *cache_extents = NULL;
3465	int *pmidx_to_cmidx = NULL;
3466	int *next_old_extent_idx = NULL;
3467
3468	int *cache_nextents = NULL;
3469	struct BC_cache_extent **cache_extents_list = NULL;
3470
3471	int old_batch_count;
3472	u_int64_t old_cache_size;
3473
3474	int error, pm_idx, cm_idx, pe_idx, ce_idx, cel_idx, pm_idx2, max_entries;
3475	off_t p;
3476	u_int64_t size;
3477	int actual, remaining_entries;
3478
3479	int clipped_extents = 0, enveloped_extents = 0;
3480
3481	playlist_mounts = NULL;
3482	playlist_entries = NULL;
3483
3484#ifdef BC_DEBUG
3485	struct timeval start_time;
3486	microtime(&start_time);
3487#endif
3488
3489	old_batch_count = BC_cache->c_batch_count;
3490	old_cache_size = BC_cache->c_cachesize;
3491
3492	nplaylist_mounts = (int) mounts_size / sizeof(*playlist_mounts);
3493	nplaylist_entries = (int) entries_size / sizeof(*playlist_entries);
3494
3495	if (BC_cache->c_stats.ss_total_extents > 0) {
3496		debug("adding %u extents to existing cache of %u extents", nplaylist_entries, BC_cache->c_stats.ss_total_extents);
3497	} else {
3498		debug("setting up first cache of %d extents", nplaylist_entries);
3499	}
3500
3501	for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
3502		debug("Mount %s has %d extents", uuid_string(BC_cache->c_mounts[cm_idx].cm_uuid), BC_cache->c_mounts[cm_idx].cm_nextents);
3503	}
3504
3505	/*
3506	 * Mount array
3507	 */
3508	if (BC_preloaded_playlist) {
3509		playlist_mounts = CAST_DOWN(struct BC_playlist_mount *, mounts_buf);
3510	} else {
3511		playlist_mounts = BC_MALLOC(mounts_size, M_TEMP, M_WAITOK);
3512		if (playlist_mounts == NULL) {
3513			message("can't allocate unpack mount buffer of %ld bytes", mounts_size);
3514			error = ENOMEM;
3515			goto out;
3516		}
3517		if ((error = copyin(mounts_buf, playlist_mounts, mounts_size)) != 0) {
3518			message("copyin %ld bytes from 0x%llx to %p failed", mounts_size, mounts_buf, playlist_mounts);
3519			goto out;
3520		}
3521
		/* since BC_preloaded_playlist is NULL here, playlist_mounts was allocated above and must be freed */
3523	}
3524
3525	/* map from the playlist mount index to the corresponding cache mount index */
3526	pmidx_to_cmidx = BC_MALLOC(nplaylist_mounts * sizeof(*pmidx_to_cmidx), M_TEMP, M_WAITOK);
3527	if (pmidx_to_cmidx == NULL) {
3528		message("can't allocate playlist index to cache index reference array for %d mounts", nplaylist_mounts);
3529		error = ENOMEM;
3530		goto out;
3531	}
3532
3533	/* In order to merge the mount's current (sorted) extent list with the new (sorted)
3534	 * extent list from the new playlist we first grow the extent list allocation to fit
3535	 * both arrays, then merge the two arrays into the new allocation.
3536	 */
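	/*
	 * For illustration (hypothetical offsets): if a mount's current extent
	 * list holds extents at disk offsets {0x0000, 0x8000} and the new
	 * playlist adds extents at {0x4000, 0xC000}, the merged per-mount list
	 * becomes {0x0000, 0x4000, 0x8000, 0xC000}, still sorted by disk offset,
	 * with any overlapping ranges in the new extents clipped below.
	 */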
3537
3538	if (BC_cache->c_nmounts > 0) {
3539		/* next_old_extent_idx saves the index of the next unmerged extent in the original mount's extent list */
3540		next_old_extent_idx = BC_MALLOC(BC_cache->c_nmounts * sizeof(*next_old_extent_idx), M_TEMP, M_WAITOK | M_ZERO);
3541		if (next_old_extent_idx == NULL) {
3542			message("can't allocate index array for %d mounts", BC_cache->c_nmounts);
3543			error = ENOMEM;
3544			goto out;
3545		}
3546
		/* zero-filled since all the indices start at 0 */
3548	}
3549
3550	nnew_mounts = nplaylist_mounts;
3551
3552	/* determine how many mounts are new and how many we already have */
3553	for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) {
3554		pmidx_to_cmidx[pm_idx] = -1; /* invalid by default */
3555
3556		for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
3557			if (0 == uuid_compare(BC_cache->c_mounts[cm_idx].cm_uuid, playlist_mounts[pm_idx].pm_uuid)) {
3558				/* already have a record of this mount, won't need a new spot for it */
3559				pmidx_to_cmidx[pm_idx] = cm_idx;
3560				nnew_mounts--;
3561				break;
3562			}
3563		}
3564
3565		/* check to make sure there aren't any duplicate mounts within the playlist */
3566		for (pm_idx2 = pm_idx + 1; pm_idx2 < nplaylist_mounts; pm_idx2++) {
3567			if (0 == uuid_compare(playlist_mounts[pm_idx2].pm_uuid, playlist_mounts[pm_idx].pm_uuid)) {
3568				message("Playlist had duplicate mount %s", uuid_string(playlist_mounts[pm_idx2].pm_uuid));
3569				error = EINVAL;
3570				goto out;
3571			}
3572		}
3573	}
3574
3575	/* cache_mounts is the array of mounts that will replace the one currently in the cache */
3576	cache_mounts = BC_MALLOC((BC_cache->c_nmounts + nnew_mounts) * sizeof(*cache_mounts), M_TEMP, M_WAITOK | M_ZERO);
3577	if (cache_mounts == NULL) {
3578		message("can't allocate memory for %d mounts", (BC_cache->c_nmounts + nnew_mounts));
3579		error = ENOMEM;
3580		goto out;
3581	}
3582	memcpy(cache_mounts, BC_cache->c_mounts, (BC_cache->c_nmounts * sizeof(*BC_cache->c_mounts)));
3583
3584	/* ncache_mounts is the current number of valid mounts in the mount array */
3585	ncache_mounts = BC_cache->c_nmounts;
3586
3587	for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) {
3588		if (pmidx_to_cmidx[pm_idx] != -1) {
3589			/* We already have a record for this mount */
3590
3591			if (playlist_mounts[pm_idx].pm_nentries == 0) {
				message("Playlist included mount with 0 entries");
3593				error = EINVAL;
3594				goto out;
3595			}
3596
3597			cm_idx = pmidx_to_cmidx[pm_idx];
3598
3599			assert(cm_idx < BC_cache->c_nmounts);
3600
3601			/* grow the mount's extent array to fit the new extents (we'll fill in the new extents below) */
3602			cache_mounts[cm_idx].cm_pextents = BC_MALLOC((BC_cache->c_mounts[cm_idx].cm_nextents + playlist_mounts[pm_idx].pm_nentries) * sizeof(*cache_mounts[cm_idx].cm_pextents), M_TEMP, M_WAITOK | M_ZERO);
3603			if (cache_mounts[cm_idx].cm_pextents == NULL) {
3604				message("can't allocate mount's extent array (%d entries)", (BC_cache->c_mounts[cm_idx].cm_nextents + playlist_mounts[pm_idx].pm_nentries));
3605				error = ENOMEM;
3606				goto out;
3607			}
3608
3609			/* don't free the old extent list yet, the real BC_cache's mount still points to it and we may yet fail */
3610
3611			/* cm_nextents is the number of valid extents in this mount's extent list. It starts out as 0 and we fill it in below */
3612			cache_mounts[cm_idx].cm_nextents = 0;
3613		} else {
3614			/* New mount for the cache */
3615
3616			if ((error = BC_setup_mount(cache_mounts + ncache_mounts, playlist_mounts + pm_idx)) != 0) {
3617				goto out;
3618			}
3619
3620			pmidx_to_cmidx[pm_idx] = ncache_mounts;
3621			ncache_mounts++;
3622		}
3623	}
3624
3625	/*
3626	 * Extent array
3627	 *
3628	 * The existing extent array cannot move since a strategy routine may be
3629	 * in the middle of an msleep with one of the extent locks. So, we allocate
3630	 * a new array for the new extents.
3631	 *
3632	 * Add one extent list to our arrays of extent lists
3633	 */
3634
3635	/* cache_nextents is an array of ints, each indicating the number of extents in the list in cache_extents_list at the same index */
3636	cache_nextents = BC_MALLOC((BC_cache->c_nextentlists + 1) * sizeof(*BC_cache->c_nextents), M_TEMP, M_WAITOK | M_ZERO);
3637	if (cache_nextents == NULL) {
		message("can't allocate memory for %d extent list sizes", BC_cache->c_nextentlists + 1);
3639		error = ENOMEM;
3640		goto out;
3641	}
3642	memcpy(cache_nextents, BC_cache->c_nextents, BC_cache->c_nextentlists * sizeof(*BC_cache->c_nextents));
3643
3644	/* cache_extents_list is the array of extent lists. The extent list at the last index is for the new playlist's cache */
3645	cache_extents_list  = BC_MALLOC((BC_cache->c_nextentlists + 1) * sizeof(*BC_cache->c_extentlists),  M_TEMP, M_WAITOK | M_ZERO);
3646	if (cache_extents_list == NULL) {
		message("can't allocate memory for %d extent lists", BC_cache->c_nextentlists + 1);
3648		error = ENOMEM;
3649		goto out;
3650	}
3651	memcpy(cache_extents_list,  BC_cache->c_extentlists,  BC_cache->c_nextentlists * sizeof(*BC_cache->c_extentlists));
3652
3653	/* The extent list for this new playlist's cache */
3654	cache_extents = BC_MALLOC(nplaylist_entries * sizeof(*cache_extents), M_TEMP, M_WAITOK | M_ZERO);
3655	if (cache_extents == NULL) {
3656		message("can't allocate memory for %d extents", nplaylist_entries);
3657		error = ENOMEM;
3658		goto out;
3659	}
3660
3661	/* TODO: iterate over our history and remove any blocks we've already seen from the new cache */
3662
3663	/* Fill in the new extents.
3664	 *
3665	 * We just tack the new playlist onto the end of any cache we currently have as a new batch.
3666	 *
3667	 * At this point, we assume that any additional caches should be in additional batches just as
3668	 * if there was a single recording with a tag separating the new extents. If we did decide to
3669	 * merge, it would still be hard since that would require reordering the extent list and
3670	 * BC_strategy assumes that the cache extents never move (see the msleep in wait_for_extent).
3671	 *
3672	 * We also don't coalesce the new extents into the old (and lower batch) extents.
3673	 * This would be a bit of work, we assume we want a new batch anyway, and it's rare that
3674	 * a new cache comes in while an old cache is still in readahead. So, we don't bother.
3675	 *
3676	 * We do clip any overlap with the new cache from the old cache, however.
3677	 *
3678	 * We don't clip any overlap with the history from the new extents.
3679	 * The history list isn't ordered, so we don't have a fast way to compare the ranges. It's also
3680	 * expected that the overlap is minimal. If we stopped recording history at some point, we don't
3681	 * have this info anyway. So, we don't bother.
3682	 */
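	/*
	 * Clipping example (hypothetical offsets): if the old cache already
	 * holds extents 0x0000-0x4000 and 0x8000-0xA000 on a mount and the new
	 * playlist asks for 0x3000-0x9000, the new extent is clipped to
	 * 0x4000-0x8000.  A new extent of 0x1000-0x3000 would be entirely
	 * enveloped by the old cache and dropped.
	 */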
3683
3684	/* size is the size in bytes of the cache, used to allocate our page buffer */
3685	size = 0;
3686
3687	if (BC_preloaded_playlist) {
3688		/*
3689		 * Unpack the static control entry array into the extent array.
3690		 */
3691		playlist_entries = CAST_DOWN(struct BC_playlist_entry *, entries_buf);
3692	} else {
3693		/*
3694		 * Since the playlist control entry structure is not the same as the
3695		 * extent structure we use, we need to copy the control entries in
3696		 * and unpack them.
3697		 */
3698		playlist_entries = BC_MALLOC(BC_PLC_CHUNK * sizeof(*playlist_entries),
3699								   M_TEMP, M_WAITOK);
3700		if (playlist_entries == NULL) {
3701			message("can't allocate unpack buffer for %d entries", BC_PLC_CHUNK);
3702			error = ENOMEM;
3703			goto out;
3704		}
3705	}
3706
3707	remaining_entries = nplaylist_entries;
3708	while (remaining_entries > 0) {
3709
3710		if (BC_preloaded_playlist) {
3711			actual = remaining_entries;
3712		} else {
3713			actual = min(remaining_entries, BC_PLC_CHUNK);
3714			if ((error = copyin(entries_buf, playlist_entries,
3715								actual * sizeof(struct BC_playlist_entry))) != 0) {
3716				message("copyin from 0x%llx to %p failed", entries_buf, playlist_entries);
3717				goto out;
3718			}
3719		}
3720
3721		/* unpack into our array */
3722		for (pe_idx = 0; pe_idx < actual; pe_idx++) {
3723			pe = playlist_entries + pe_idx;
3724			if (pe->pe_length == 0) {
3725				debug("Bad Playlist: entry %d has 0 length", (nplaylist_entries - remaining_entries) + pe_idx);
3726				error = EINVAL;
3727				goto out;
3728			}
3729			pm_idx = pe->pe_mount_idx;
3730
3731			if (pm_idx >= nplaylist_mounts || pm_idx < 0) {
3732				message("Bad playlist: entry %d referenced non-existent mount index %d", (nplaylist_entries - remaining_entries) + pe_idx, pm_idx);
3733				error = EINVAL;
3734				goto out;
3735			}
3736
3737			pm = playlist_mounts + pm_idx;
3738			cm_idx = pmidx_to_cmidx[pm_idx];
3739			cm = cache_mounts + cm_idx;
3740			ce = cache_extents + ncache_extents;
3741
3742			/* The size of the extent list is the number of playlist entries + the number of old cache extents */
3743			max_entries = pm->pm_nentries + ((cm_idx < BC_cache->c_nmounts) ? BC_cache->c_mounts[cm_idx].cm_nextents : 0);
3744
3745			if (cm->cm_nextents >= max_entries) {
3746				message("Bad playlist: more entries existed than the mount %s claimed (%d)", uuid_string(pm->pm_uuid), pm->pm_nentries);
3747				error = EINVAL;
3748				goto out;
3749			}
3750
3751			if ((error = BC_setup_extent(ce, pe, old_batch_count, cm_idx)) != 0) {
3752				goto out;
3753			}
3754
3755			/* Merge the new extent with the old extents for this mount. The new extent may get clipped. */
3756#ifdef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
3757			if (cm_idx < BC_cache->c_nmounts) {
3758				old_cm = BC_cache->c_mounts + cm_idx;
3759				/* Copy any lower or equal extents from the existing playlist down to the low end of the array */
3760				while (next_old_extent_idx[cm_idx] < old_cm->cm_nextents &&
3761					   old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset <= ce->ce_diskoffset) {
3762					cm->cm_pextents[cm->cm_nextents] = old_cm->cm_pextents[next_old_extent_idx[cm_idx]];
3763					cm->cm_nextents++;
3764					next_old_extent_idx[cm_idx]++;
3765				}
3766
3767				/* check for overlap with the next extent in the old list and clip the new extent */
3768				if (next_old_extent_idx[cm_idx] < old_cm->cm_nextents) {
3769
3770					/* FIXME: rdar://9153031 If the new extent extends past the end of the old extent, we're losing caching! */
3771					if (ce->ce_diskoffset + ce->ce_length > (old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset + old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_length)) {
3772						debug("!!! Entry %d (0x%llx:%lld) is getting clipped too much because of previous entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset, old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_length);
3773					}
3774
3775					u_int64_t max_offset = old_cm->cm_pextents[next_old_extent_idx[cm_idx]]->ce_diskoffset;
3776					if (max_offset < ce->ce_diskoffset + ce->ce_length) {
3777						if (max_offset <= ce->ce_diskoffset) {
3778							ce->ce_length = 0;
3779						} else {
3780							ce->ce_length = (max_offset - ce->ce_diskoffset);
3781						}
3782					}
3783				}
3784			}
3785
3786			/* check for intersection with the next lower extent in the list and clip the new extent */
3787			if (cm->cm_nextents > 0) {
3788				u_int64_t min_offset = cm->cm_pextents[cm->cm_nextents - 1]->ce_diskoffset + cm->cm_pextents[cm->cm_nextents - 1]->ce_length;
3789				if (min_offset > ce->ce_diskoffset) {
3790
3791					/* Check if this extent is overlapping with an extent from the same playlist */
3792					if (cm->cm_pextents[cm->cm_nextents - 1] >= cache_extents &&
3793						cm->cm_pextents[cm->cm_nextents - 1] < cache_extents + ncache_extents) {
3794						message("Bad playlist: entry %d (0x%llx:%lld) overlapped with previous entry for mount %s (0x%llx:%lld)", pe_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), cm->cm_pextents[cm->cm_nextents - 1]->ce_diskoffset, cm->cm_pextents[cm->cm_nextents - 1]->ce_length);
3795						error = EINVAL;
3796						goto out;
3797					}
3798
3799					if (min_offset >= ce->ce_diskoffset + ce->ce_length) {
3800						ce->ce_length = 0;
3801					} else {
3802						ce->ce_length -= (min_offset - ce->ce_diskoffset);
3803						ce->ce_diskoffset = min_offset;
3804					}
3805				}
3806			}
3807
3808			if (ce->ce_length != pe->pe_length) {
3809				clipped_extents++;
3810			}
3811			if (ce->ce_length == 0) {
3812				/* new extent is entirely captured in extents from the old cache, throw it out */
3813				enveloped_extents++;
3814				BC_teardown_extent(ce);
3815				lck_mtx_destroy(&ce->ce_lock, BC_cache->c_lckgrp);
3816
3817				/* continue without incrementing ncache_extents so we reuse this extent index */
3818				continue;
3819			}
3820			size += roundup(ce->ce_length, PAGE_SIZE);
3821#endif
3822
3823			cm->cm_pextents[cm->cm_nextents] = ce;
3824			cm->cm_nextents++;
3825			ncache_extents++;
3826		}
3827		remaining_entries -= actual;
3828		entries_buf += actual * sizeof(struct BC_playlist_entry);
3829	}
3830
3831	/* Fill in any remaining extents from the original cache that are still sitting at the high end of the extent list */
3832	for (pm_idx = 0; pm_idx < nplaylist_mounts; pm_idx++) {
3833		cm_idx = pmidx_to_cmidx[pm_idx];
3834		cm = cache_mounts + cm_idx;
3835		pm = playlist_mounts + pm_idx;
3836
3837		if (cm->cm_nextents == 0) {
3838			message("Bad playlist: No entries existed for mount %s (claimed %d)", uuid_string(pm->pm_uuid), pm->pm_nentries);
3839			error = EINVAL;
3840			goto out;
3841		}
3842
3843		if (cm_idx < BC_cache->c_nmounts) {
3844			if (next_old_extent_idx[cm_idx] < BC_cache->c_mounts[cm_idx].cm_nextents) {
3845				/* There are more extents in the old extent array, copy them over */
3846				memcpy(cm->cm_pextents + cm->cm_nextents, BC_cache->c_mounts[cm_idx].cm_pextents + next_old_extent_idx[cm_idx], sizeof(*cm->cm_pextents) * (BC_cache->c_mounts[cm_idx].cm_nextents - next_old_extent_idx[cm_idx]));
3847				cm->cm_nextents += BC_cache->c_mounts[cm_idx].cm_nextents - next_old_extent_idx[cm_idx];
3848			}
3849		}
3850
3851	}
3852
3853#ifndef BOOTCACHE_ENTRIES_SORTED_BY_DISK_OFFSET
3854	/* We need an ordered list of extents for each mount, but we were given
3855	 * the extents in a different ordering, so we need to sort the mount's
3856	 * extent list here
3857	 */
3858	for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
3859		cm = cache_mounts + cm_idx;
3860
3861		/* If this is a new mount or a mount with a new extent list, it needs sorting */
3862		if (cm_idx >= BC_cache->c_nmounts ||
3863			cm->cm_pextents != BC_cache->c_mounts[cm_idx].cm_pextents) {
3864
3865			qsort(cm->cm_pextents,
3866				  cm->cm_nextents,
3867				  sizeof(*cm->cm_pextents),
3868				  compare_cache_extents);
3869
			/* Clip any overlap between the new extents and the old extents */
3871			for (ce_idx = 0; ce_idx < cm->cm_nextents; ce_idx++) {
3872
3873				/* Only look at new extents, old extents were already clipped when they were added and can't be clipped further */
3874				if (cm->cm_pextents[ce_idx] >= cache_extents &&
3875					cm->cm_pextents[ce_idx] < cache_extents + ncache_extents) {
3876
					ce = cm->cm_pextents[ce_idx];
					u_int64_t old_length = ce->ce_length;
3878
3879					/* Compare against the previous (lower) extent */
3880					if (ce_idx > 0) {
3881						u_int64_t min_offset = cm->cm_pextents[ce_idx - 1]->ce_diskoffset + cm->cm_pextents[ce_idx - 1]->ce_length;
3882						if (min_offset > ce->ce_diskoffset) {
3883							if (min_offset >= ce->ce_diskoffset + ce->ce_length) {
3884								ce->ce_length = 0;
3885							} else {
3886								ce->ce_length -= (min_offset - ce->ce_diskoffset);
3887								ce->ce_diskoffset = min_offset;
3888							}
3889						}
3890					}
3891
3892					/* Compare against the next (higher) extent */
					if (ce_idx < cm->cm_nextents - 1) {
3894						u_int64_t max_offset = cm->cm_pextents[ce_idx + 1]->ce_diskoffset;
3895						if (max_offset < ce->ce_diskoffset + ce->ce_length) {
3896
3897							/* Check if this extent is overlapping with an extent from the same playlist */
3898							if (cm->cm_pextents[ce_idx + 1] >= cache_extents &&
3899								cm->cm_pextents[ce_idx + 1] < cache_extents + ncache_extents) {
								message("Bad playlist: entry %d (0x%llx:%lld) overlapped with next entry for mount %s (0x%llx:%lld)", ce_idx, ce->ce_diskoffset, ce->ce_length, uuid_string(cm->cm_uuid), cm->cm_pextents[ce_idx + 1]->ce_diskoffset, cm->cm_pextents[ce_idx + 1]->ce_length);
3901								error = EINVAL;
3902								goto out;
3903							}
3904
3905							if (max_offset <= ce->ce_diskoffset) {
3906								ce->ce_length = 0;
3907							} else {
3908								ce->ce_length = (max_offset - ce->ce_diskoffset);
3909							}
3910						}
3911					}
3912
3913					if (ce->ce_length != old_length) {
3914						clipped_extents++;
3915						if (ce->ce_length == 0) {
3916							/* extent was enveloped, remove this extent from the mount's extent list */
3917							if ((ce_idx + 1) < cm->cm_nextents) {
								memmove(cm->cm_pextents + ce_idx, cm->cm_pextents + (ce_idx + 1), (cm->cm_nextents - (ce_idx + 1)) * sizeof(*cm->cm_pextents));
3919							}
3920							cm->cm_nextents--;
3921							ce_idx--;
3922							enveloped_extents++;
3923						}
3924					}
3925				}
3926			}
3927		}
3928	}
3929
3930	for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
		size += roundup(cache_extents[ce_idx].ce_length, PAGE_SIZE);
3932	}
3933
3934#endif
3935
3936	if (clipped_extents > 0) {
3937		debug("%d extents were clipped, %d of which were completely enveloped", clipped_extents, enveloped_extents);
3938	}
3939
3940	if (ncache_extents == 0) {
3941		debug("No extents added, not updating cache");
3942		error = EALREADY;
3943		goto out;
3944	}
3945
3946	if (! BC_preloaded_playlist) {
3947		_FREE_ZERO(playlist_entries, M_TEMP);
3948		_FREE_ZERO(playlist_mounts, M_TEMP);
3949	}
3950
3951	if (ncache_extents < nplaylist_entries) {
3952		debug("%d extents added out of %d", ncache_extents, nplaylist_entries);
3953	}
3954	for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
3955		debug("Mount %s now has %d extents", uuid_string(cache_mounts[cm_idx].cm_uuid), cache_mounts[cm_idx].cm_nextents);
3956	}
3957
3958
3959	/*
3960	 * In order to simplify allocation of the block and page maps, we round
3961	 * the size up to a multiple of CB_MAPFIELDBITS pages.  This means we
3962	 * may waste as many as 31 pages in the worst case.
3963	 */
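	/*
	 * For example (assuming 4KB pages and CB_MAPFIELDBITS of 32, matching
	 * the 31-page worst case above), the rounding unit is 128KB, so a
	 * 1,000,000-byte cache would be rounded up to 1,048,576 bytes.
	 */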
3964	size = roundup(size, PAGE_SIZE * CB_MAPFIELDBITS);
3965
3966	/*
3967	 * Verify that the cache is not larger than 50% of physical memory
3968	 * to avoid forcing pageout activity.
3969	 *
	 * max_mem and size are in bytes.
	 *
	 * If the cache is too large, we cap it at BC_MAX_SIZE.  This lets us
	 * survive the case where the playlist is corrupted or manually made
	 * too large, but the "real" bootstrap footprint would be OK.
3977	 */
3978	BC_cache->c_cachesize += size;
3979	if (BC_cache->c_cachesize > BC_MAX_SIZE) {
3980		message("cache size (%lu bytes) too large for physical memory (%llu bytes), capping at %llu bytes",
3981				(unsigned long)BC_cache->c_cachesize, max_mem, BC_MAX_SIZE);
3982		BC_cache->c_cachesize = BC_MAX_SIZE;
3983	}
3984
3985	/*
3986	 * Allocate/grow the pagebuffer.
3987	 */
3988	if (BC_alloc_pagebuffer() != 0) {
3989		message("can't allocate %lld bytes for cache buffer", BC_cache->c_cachesize);
3990		error = ENOMEM;
3991		goto out;
3992	}
3993
3994	/*
3995	 * Fix up the extent cache offsets.
3996	 */
3997	p = 0;
3998	if (BC_cache->c_nextentlists > 0) {
3999		/* start at the offset of the end of the previous cache list */
4000		for (cel_idx = BC_cache->c_nextentlists - 1; cel_idx >= 0; cel_idx--) {
4001			if (BC_cache->c_nextents[cel_idx] > 0) {
4002				struct BC_cache_extent *last_ce = BC_cache->c_extentlists[cel_idx] + BC_cache->c_nextents[cel_idx] - 1;
4003				p = last_ce->ce_cacheoffset + roundup(last_ce->ce_length, PAGE_SIZE);
4004				break;
4005			}
4006		}
4007	}
4008	for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
4009		cache_extents[ce_idx].ce_cacheoffset = p;
4010		p += roundup(cache_extents[ce_idx].ce_length, PAGE_SIZE);
4011
4012		/* Abort any extents that extend past our cache size */
4013		if (p >= BC_cache->c_cachesize || cache_extents[ce_idx].ce_length == 0) {
4014			BC_teardown_extent(cache_extents + ce_idx);
4015		}
4016	}
4017
4018	/* We've successfully integrated the new playlist with our cache, copy it into place */
4019
4020	/* free up old cache's allocations that we're replacing */
4021	if (BC_cache->c_nmounts > 0) {
4022		for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
4023			/* If we have a new extent array for this mount, free the old one */
4024			if (BC_cache->c_mounts[cm_idx].cm_pextents != cache_mounts[cm_idx].cm_pextents) {
4025				_FREE_ZERO(BC_cache->c_mounts[cm_idx].cm_pextents, M_TEMP);
4026			}
4027		}
		_FREE_ZERO(BC_cache->c_mounts, M_TEMP);
4029	}
4030	BC_cache->c_mounts = cache_mounts;
4031	BC_cache->c_nmounts = ncache_mounts;
4032
4033	if (BC_cache->c_nextentlists > 0) {
		_FREE_ZERO(BC_cache->c_extentlists, M_TEMP);
		_FREE_ZERO(BC_cache->c_nextents, M_TEMP);
4036	}
4037
4038	cache_nextents[BC_cache->c_nextentlists] = ncache_extents;
4039	cache_extents_list[BC_cache->c_nextentlists] = cache_extents;
4040
4041	BC_cache->c_nextentlists++;
4042
4043	BC_cache->c_nextents = cache_nextents;
4044	BC_cache->c_extentlists = cache_extents_list;
4045
4046	BC_cache->c_stats.ss_total_mounts = ncache_mounts;
4047	BC_cache->c_stats.ss_total_extents += ncache_extents;
4048
4049	/* fill (or tear down) the extents for all the mounts we've already seen */
4050	for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
4051		ce = cache_extents + ce_idx;
4052		if (CM_SETUP != BC_cache->c_mounts[ce->ce_mount_idx].cm_state) {
4053			LOCK_EXTENT(ce);
4054			if (CM_READY == BC_cache->c_mounts[ce->ce_mount_idx].cm_state) {
4055				if (0 == BC_fill_in_extent(ce)) {
4056					UNLOCK_EXTENT(ce);
4057					continue;
4058				}
4059			}
4060			/* either the mount is aborted or fill_in_extent failed */
4061			BC_teardown_extent(ce);
4062			UNLOCK_EXTENT(ce);
4063		}
4064	}
4065
4066	/* Check to see if we have any more mounts to fill in */
4067	for (cm_idx = 0; cm_idx < BC_cache->c_nmounts; cm_idx++) {
4068		if (CM_SETUP == BC_cache->c_mounts[cm_idx].cm_state) {
4069			/* have at least one mount to fill in, iterate all of them */
4070			int go_again = 0;
4071			vfs_iterate(0, fill_in_bc_cache_mounts, &go_again);
4072			if (go_again) {
4073				debug("Missing root mount during last iteration, going again");
4074				vfs_iterate(0, fill_in_bc_cache_mounts, &go_again);
4075			}
4076			break;
4077		}
4078	}
4079
4080	error = 0;
4081
4082	/* all done */
4083out:
4084	if (! BC_preloaded_playlist) {
4085		_FREE_ZERO(playlist_entries, M_TEMP);
4086		_FREE_ZERO(playlist_mounts, M_TEMP);
4087	}
4088	_FREE_ZERO(pmidx_to_cmidx, M_TEMP);
4089	_FREE_ZERO(next_old_extent_idx, M_TEMP);
4090	if (error != 0) {
4091		if (error != EALREADY) {
4092			debug("cache setup failed, aborting");
4093		}
4094
4095		/* If it was possible to fail after growing the page buffer, we'd need a way to shrink it back here */
4096
4097		if (BC_cache->c_extentlists == NULL) {
4098			/* If this is our first cache, clear stats and our page buffer */
4099			BC_cache->c_stats.ss_total_extents = 0;
4100			BC_cache->c_stats.ss_total_mounts = 0;
4101			if (BC_cache->c_vp != NULL) {
4102				BC_free_pagebuffer();
4103			}
4104		}
4105
4106		BC_cache->c_cachesize = old_cache_size;
4107		BC_cache->c_batch_count = old_batch_count;
4108
4109		/* free up new cache's allocations that aren't part of the old cache */
4110		if (cache_extents) {
4111			for (ce_idx = 0; ce_idx < ncache_extents; ce_idx++) {
4112				BC_teardown_extent(cache_extents + ce_idx);
4113				lck_mtx_destroy(&cache_extents[ce_idx].ce_lock, BC_cache->c_lckgrp);
4114			}
4115			_FREE_ZERO(cache_extents, M_TEMP);
4116		}
4117		_FREE_ZERO(cache_nextents, M_TEMP);
4118		_FREE_ZERO(cache_extents_list, M_TEMP);
4119
4120		if (cache_mounts) {
4121			for (cm_idx = 0; cm_idx < ncache_mounts; cm_idx++) {
4122				if (cm_idx >= BC_cache->c_nmounts) {
4123					/* newly allocated mounts, teardown completely */
4124					BC_teardown_mount(cache_mounts + cm_idx);
4125					lck_rw_destroy(&cache_mounts[cm_idx].cm_lock, BC_cache->c_lckgrp);
4126				} else if (BC_cache->c_mounts[cm_idx].cm_pextents != cache_mounts[cm_idx].cm_pextents) {
4127					/* old mounts with a new extent list */
4128					_FREE_ZERO(cache_mounts[cm_idx].cm_pextents, M_TEMP);
4129				}
4130			}
4131			_FREE_ZERO(cache_mounts, M_TEMP);
4132		}
4133	}
4134
4135#ifdef BC_DEBUG
4136	struct timeval end_time;
4137	microtime(&end_time);
4138	timersub(&end_time, &start_time, &end_time);
4139	debug("BC_copyin_playlist took %ldus", end_time.tv_sec * 1000000 + end_time.tv_usec);
4140#endif
4141
4142	return(error);
4143}
4144
4145/*
4146 * Initialise the cache.  Fetch the playlist from userspace and
4147 * find and attach to our block device.
4148 */
4149static int
4150BC_init_cache(size_t mounts_size, user_addr_t mounts, size_t entries_size, user_addr_t entries, int record_history)
4151{
4152	int error = 0, copyin_error, mount_idx;
4153	struct timeval current_time;
4154
4155	/* Everything should be set, or nothing should be set */
4156	if ((mounts_size == 0 || mounts == 0 || entries_size == 0 || entries == 0) &&
4157		(mounts_size != 0 || mounts != 0 || entries_size != 0 || entries != 0)) {
4158		debug("Invalid cache parameters: %ld, %lld, %ld, %lld", mounts_size, mounts, entries_size, entries);
4159		return(EINVAL);
4160	}
4161
4162
4163	if (! (BC_cache->c_flags & BC_FLAG_SETUP)) {
4164		debug("Cache not yet setup");
4165		return(ENXIO);
4166	}
4167
4168	microtime(&current_time);
4169
4170	/*
4171	 * Start recording history.
4172	 *
4173	 * Only one history recording may be active at a time
4174	 */
4175	if (record_history) {
4176		LOCK_HANDLERS();
4177		if (BC_set_flag(BC_FLAG_HISTORYACTIVE)) {
4178			debug("history recording already started, only one playlist will be returned");
4179			BC_ADD_STAT(history_num_recordings, 1);
4180			UNLOCK_HANDLERS();
4181		} else {
4182			debug("starting history recording");
4183
4184			/* start the timeout handler */
4185			timeout(BC_timeout_history, NULL, BC_history_timeout);
4186
4187			/* always take stats for the first two recordings (boot + login), even if we don't have a playlist */
4188			if (BC_cache->c_stats.ss_history_num_recordings < 2)
4189				BC_cache->c_take_detailed_stats = 1;
4190
4191			BC_cache->c_history_starttime = current_time;
4192			BC_cache->c_history_size = 0;
4193			BC_cache->c_history_num_clusters = 0;
4194			BC_ADD_STAT(history_num_recordings, 1);
4195			BC_clear_flag(BC_FLAG_HTRUNCATED);
4196
4197			/*
4198			 * Take over the strategy routine for our device; we are now
4199			 * recording read operations.
4200			 */
4201			BC_check_handlers();
4202			UNLOCK_HANDLERS();
4203
4204			BC_update_mounts();
4205
4206#ifndef NO_HISTORY
4207# ifdef STATIC_HISTORY
4208			/* initialise the history buffer */
4209			if ((BC_cache->c_history = BC_MALLOC(BC_HISTORY_ALLOC, M_TEMP, M_WAITOK)) == NULL) {
4210				message("can't allocate %d bytes for static history buffer", BC_HISTORY_ALLOC);
4211				BC_clear_flag(BC_FLAG_HISTORYACTIVE);
4212				error = ENOMEM;
4213				goto out;
4214			}
4215
4216			bzero(BC_cache->c_history, BC_HISTORY_ALLOC);
4217# endif
4218#endif
4219
4220		}
4221	}
4222
4223	/*
4224	 * If we have playlist data, fetch it.
4225	 */
4226	if (entries_size > 0) {
4227		LOCK_CACHE_W();
4228
4229		if (BC_cache->c_flags & BC_FLAG_SHUTDOWN) {
4230			debug("cache already shutdown");
4231			error = ENXIO;
4232			UNLOCK_CACHE_W();
4233			goto out;
4234		}
4235
4236		/* we're playing back a cache (or at least trying to),
4237		 * record detailed statistics until we stop recording history
4238		 */
4239		BC_cache->c_take_detailed_stats = 1;
4240
4241		copyin_error = BC_copyin_playlist(mounts_size, mounts, entries_size, entries);
4242		if (copyin_error != 0 && copyin_error != EALREADY) {
4243			message("can't load playlist: %d", copyin_error);
4244			UNLOCK_CACHE_W();
4245			error = copyin_error;
4246			goto out;
4247		}
4248
4249		/* Always throttle cut-through IOs.
4250		 * I tried throttling only after login had started, but
4251		 * then users who log in quickly see a very long delay
4252		 * after entering their password before apps come up.
		 * I'd rather delay the appearance of the login screen
		 * and provide as fast a login as possible.
4255		 * rdar://8592401
4256		 */
4257		BC_cache->c_readahead_throttles_cutthroughs = 1;
4258
4259		LOCK_HANDLERS();
4260		if (BC_set_flag(BC_FLAG_CACHEACTIVE)) {
4261			/* cache is already running, we're adding to it */
4262			UNLOCK_HANDLERS();
4263		} else {
4264			/* first cache we've created this boot */
4265
4266#ifdef READ_HISTORY_BUFFER
4267			/* initialise the read history buffer */
4268			if (NULL == BC_cache->c_rhistory) {
4269				if ((BC_cache->c_rhistory = BC_MALLOC(READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry),
4270													M_TEMP, M_WAITOK)) == NULL) {
4271					message("can't allocate %d bytes for read history buffer",
4272							READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry));
4273				}
4274			}
4275			if (BC_cache->c_rhistory) {
4276				bzero(BC_cache->c_rhistory, READ_HISTORY_BUFFER * sizeof(struct BC_read_history_entry));
4277			}
4278#endif
4279
4280			BC_cache->c_cache_starttime = current_time;
4281
4282			/*
4283			 * Take over the strategy routine for our device; we are now
4284			 * filling read operations from the cache.
4285			 *
			 * This must be done before we kick off reader threads to ensure
			 * that we catch any writes that occur on blocks in our cache.
			 * Otherwise a write could come in unnoticed on a block we've
			 * already read in, and the cache would return stale data when
			 * that block is subsequently requested.
4291			 */
4292			BC_check_handlers();
4293			UNLOCK_HANDLERS();
4294
4295			bootcache_contains_block = BC_cache_contains_block;
4296		}
4297
4298		LOCK_CACHE_W_TO_R();
4299		if (copyin_error != EALREADY) {
4300			/*
4301			 * Start the reader threads.
4302			 */
4303			for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
4304				if (BC_cache->c_mounts[mount_idx].cm_state == CM_READY) {
4305					LOCK_MOUNT_R(BC_cache->c_mounts + mount_idx);
4306					BC_mount_available(BC_cache->c_mounts + mount_idx);
4307					UNLOCK_MOUNT_R(BC_cache->c_mounts + mount_idx);
4308				}
4309			}
4310		}
4311
4312		UNLOCK_CACHE_R();
4313
4314		/* cache is now active */
4315
4316	}
4317
4318	if (BC_cache->c_stats.ss_preload_time == 0) {
4319		timersub(&current_time,
4320				 &BC_cache->c_loadtime,
4321				 &current_time);
4322		BC_cache->c_stats.ss_preload_time = (u_int) current_time.tv_sec * 1000 + (u_int) current_time.tv_usec / 1000;
4323	}
4324
4325out:
4326	return(error);
4327}
4328
4329/*
4330 * Timeout on collection.
4331 *
4332 * We haven't heard from the userland component for too long;
4333 * stop recording history, but keep the current history
4334 * around in case they show up.
4335 */
4336static void
4337BC_timeout_history(void *junk)
4338{
4339	/* stop recording history */
4340	debug("history recording timed out");
4341	BC_terminate_history();
4342}
4343
4344/*
4345 * Check for new mounts. Called with no locks held
4346 */
4347static int check_for_new_mount_itr(mount_t mount, void* arg) {
4348	struct vfs_attr attr;
4349	vfs_context_t context;
4350	int error, mount_idx;
4351	int* go_again = (int*) arg;
4352
4353	struct BC_history_mount_device *hmd = NULL;
4354	vnode_t devvp = NULLVP;
4355
4356	LOCK_HISTORY_R();
4357	if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) {
4358		/* don't do anything if we've stopped recording history */
4359		UNLOCK_HISTORY_R();
4360		return VFS_RETURNED_DONE;
4361	}
4362
4363#ifdef BC_DEBUG
4364	char name[MFSNAMELEN];
4365	vfs_name(mount, name);
4366#endif
4367
4368	devvp = vfs_devvp(mount);
4369	if (devvp == NULLVP) {
4370		//debug("Mount %s (%p)'s vnode is NULL", name, mount);
4371		UNLOCK_HISTORY_R();
4372		goto out;
4373	}
4374
4375	dev_t mount_dev = vnode_specrdev(devvp);
4376	//debug("Mount %s (%p)'s vnode %p's device is 0x%x", name, mount, devvp, mount_dev);
4377
4378	/*
4379	 * A new mount appeared, but unfortunately we don't know which one.
4380	 * So, for every mount we have to check through our list of history_mounts
4381	 * to see if we already have info for it.
4382	 */
4383	hmd = BC_get_history_mount_device(mount_dev, NULL);
4384	if (hmd == NULL || (! uuid_is_null(hmd->hmd_mount.hm_uuid))) {
4385		/* Unable to allocate new space for a mount, or
4386		 * we already have info about this mount
4387		 */
4388		UNLOCK_HISTORY_R();
4389		goto out;
4390	}
4391
4392	/*
4393	 * We don't yet have info about this mount. It's new!
4394	 */
4395
4396	hmd->hmd_blocksize = vfs_devblocksize(mount);
4397
4398	VFSATTR_INIT(&attr);
4399	VFSATTR_WANTED(&attr, f_uuid);
4400	context = vfs_context_create(NULL);
4401	error = vfs_getattr(mount, &attr, context);
4402
4403	if ((0 != error) || (! VFSATTR_IS_SUPPORTED(&attr, f_uuid))) {
4404		vfs_context_rele(context);
4405#ifdef BC_DEBUG
4406		if (strncmp("devfs", name, sizeof("devfs")) != 0) {
4407			debug("Found mount %s without a UUID: error %d", name, error);
4408		}
4409#endif
4410		UNLOCK_HISTORY_R();
4411		goto out;
4412	}
4413
4414	hmd->hmd_disk_id = vfs_throttle_mask(mount);
4415
4416	if (VNOP_IOCTL(devvp,              /* vnode */
4417				   DKIOCISSOLIDSTATE,    /* cmd */
4418				   (caddr_t)&hmd->hmd_is_ssd, /* data */
4419				   0,
4420				   context))           /* context */
4421	{
4422		debug("can't determine if physical disk for history mount %s is an ssd", uuid_string(hmd->hmd_mount.hm_uuid));
4423		hmd->hmd_is_ssd = 0;
4424	}
4425
4426#ifdef ROOT_DISK_ONLY
4427	if (devvp == rootvp) {
4428		if (BC_cache->c_root_disk_id == 0) {
4429			BC_cache->c_root_disk_id = hmd->hmd_disk_id;
4430			if (BC_cache->c_root_disk_id == 0) {
4431				message("Root disk is 0");
4432			} else {
4433				debug("Root disk (via history) is 0x%llx", BC_cache->c_root_disk_id);
4434			}
4435		} else if (BC_cache->c_root_disk_id != hmd->hmd_disk_id) {
4436			debug("Root disk 0x%llx doesn't match that found by history 0x%llx", BC_cache->c_root_disk_id, hmd->hmd_disk_id);
4437		}
4438	}
4439#endif
4440
4441
4442	/* Found info for our mount */
4443	debug("Found new historic mount after %d IOs: %s, dev 0x%x, disk 0x%llx, blk %d%s",
4444		  hmd->hmd_mount.hm_nentries,
4445		  uuid_string(attr.f_uuid),
4446		  hmd->hmd_dev,
4447		  hmd->hmd_disk_id,
4448		  hmd->hmd_blocksize,
4449		  hmd->hmd_is_ssd ? ", ssd" : "");
4450
4451	if (hmd->hmd_blocksize == 0) {
4452		hmd->hmd_blocksize = 512;
4453	}
4454	hmd->hmd_mount.hm_nentries = 0;
4455
4456	uuid_copy(hmd->hmd_mount.hm_uuid, attr.f_uuid);
4457
4458	UNLOCK_HISTORY_R();
4459
4460	if (BC_cache->c_flags & BC_FLAG_CACHEACTIVE) {
4461		LOCK_CACHE_R();
4462		/* Check if we have a playlist mount that matches this mount and set it up */
4463		for (mount_idx = 0; mount_idx < BC_cache->c_nmounts; mount_idx++) {
4464			struct BC_cache_mount *cm = BC_cache->c_mounts + mount_idx;
4465			if (CM_SETUP == cm->cm_state && 0 == uuid_compare(cm->cm_uuid, attr.f_uuid)) {
4466				/* Found a matching unfilled mount */
4467
4468				LOCK_MOUNT_W(cm);
4469				if ((error = BC_fill_in_mount(cm, mount, context)) == 0) {
4470					LOCK_MOUNT_W_TO_R(cm);
4471					/* Kick off another reader thread if this is a new physical disk */
4472					BC_mount_available(cm);
4473					UNLOCK_MOUNT_R(cm);
4474				} else {
4475					if (error == EAGAIN) {
4476						*go_again = 1;
4477					}
4478					UNLOCK_MOUNT_W(cm);
4479				}
4480
4481				break;
4482			}
4483		}
4484		UNLOCK_CACHE_R();
4485	}
4486
4487	vfs_context_rele(context);
4488
4489out:
4490	if (devvp != NULLVP) {
4491		vnode_put(devvp);
4492	}
4493	return VFS_RETURNED;
4494}
4495
4496/*
4497 * Check for new or disappearing mounts
4498 */
4499static void BC_update_mounts(void) {
4500
4501	/* don't do anything if we've stopped recording history */
4502	if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE)) {
4503		return;
4504	}
4505
4506	int go_again = 0;
4507	vfs_iterate(0, check_for_new_mount_itr, &go_again);
4508	if (go_again) {
4509		debug("Missing root mount during last iteration, going again");
4510		vfs_iterate(0, check_for_new_mount_itr, &go_again);
4511	}
4512}
4513
4514/*
4515 * Find the mount that corresponds to this IO,
4516 * or the first empty slot.
4517 *
4518 * Called with the history read lock
4519 */
4520static struct BC_history_mount_device * BC_get_history_mount_device(dev_t dev, int* pindex) {
4521	struct BC_history_mount_device *tmphmd = NULL, **phmd, *ret = NULL;
4522	int mount_idx = 0;
4523
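	/*
	 * Walk the list looking for an entry matching this device.  If we fall
	 * off the end, atomically append a freshly-allocated entry with
	 * OSCompareAndSwapPtr; if another thread appends first, keep scanning
	 * (reusing our allocation on later iterations) until we find or insert
	 * a matching entry.
	 */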
4524	for (phmd = &BC_cache->c_history_mounts;
4525		 (*phmd) == NULL || (*phmd)->hmd_dev != dev;
4526		 phmd = &(*phmd)->hmd_next) {
4527
4528		if (*phmd == NULL) {
4529			if (tmphmd == NULL) { /* we may have an allocation from previous iterations */
4530				tmphmd = kalloc(sizeof(struct BC_history_mount_device));
4531				if (tmphmd == NULL) {
4532					message("could not allocate %lu bytes for history mount device",
4533							sizeof(struct BC_history_mount_device));
4534					BC_set_flag(BC_FLAG_HTRUNCATED);
4535					goto out;
4536				}
4537
4538				/* claim the new entry */
4539				tmphmd->hmd_next = NULL;
4540				tmphmd->hmd_mount.hm_nentries = 0;
4541				uuid_clear(tmphmd->hmd_mount.hm_uuid);
4542				tmphmd->hmd_disk_id = 0;
4543				tmphmd->hmd_blocksize = 0;
4544				tmphmd->hmd_dev = dev;
4545			}
4546
4547			if (OSCompareAndSwapPtr(NULL, tmphmd, phmd)) {
4548				tmphmd = NULL;
4549				if (BC_cache->c_take_detailed_stats) {
4550					BC_ADD_STAT(history_mounts, 1);
4551				}
4552				break;
4553			}
4554
			/* someone else added a mount before we could, check if it's the one we want */
4556			if ((*phmd)->hmd_dev == dev) break;
4557
4558		}
4559		mount_idx++;
4560	}
4561
4562	if (pindex)
4563		*pindex = mount_idx;
4564	ret = *phmd;
4565out:
4566	if (tmphmd)
4567		kfree(tmphmd, sizeof(struct BC_history_mount_device));
4568
4569	return ret;
4570}
4571
4572/*
4573 * Record an incoming IO in the history list.
4574 *
4575 * Returns non-0 if this IO was on the root disk.
4576 *
4577 * Note that this function is not reentrant.
4578 */
4579static int
4580BC_add_history(daddr64_t blkno, u_int64_t length, int pid, int hit, int write, int tag, int shared, dev_t dev)
4581{
4582	u_int64_t offset;
4583	struct BC_history_entry_cluster *hec, *tmphec = NULL;
4584	struct BC_history_mount_device *hmd = NULL;
4585	struct BC_history_entry *he;
4586	int mount_idx, entry_idx;
4587	int is_root_disk = 0;
4588
4589	/* don't do anything if we've stopped recording history */
4590	if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE))
4591		return 0;
4592
4593	LOCK_HISTORY_R();
4594
4595	/* check again, with lock */
4596	if (! (BC_cache->c_flags & BC_FLAG_HISTORYACTIVE))
4597		goto out;
4598
4599	/* count IOs during boot (first recording) */
4600	if (BC_cache->c_take_detailed_stats) {
4601		if (! tag) {
4602			if (! write) {
4603				BC_ADD_STAT(history_reads, 1);
4604			} else {
4605				BC_ADD_STAT(history_writes, 1);
4606			}
4607		}
4608	}
4609
4610	/* don't do anything if the history list has been truncated */
4611	if (!tag && BC_cache->c_flags & BC_FLAG_HTRUNCATED)
4612		goto out;
4613
4614	/*
4615	 * In order to avoid recording a playlist larger than we will
4616	 * allow to be played back, if the total byte count for recorded
4617	 * reads reaches 50% of the physical memory in the system, we
4618	 * start ignoring reads.
4619	 * This is preferable to dropping entries from the end of the
4620	 * playlist as we copy it in, since it will result in a clean
4621	 * cessation of caching at a single point, rather than
4622	 * introducing sporadic cache misses throughout the cache's run.
4623	 */
4624	if (!tag && (BC_cache->c_history_size + length) > BC_MAX_SIZE) {
4625		debug("History recording too large, capping at %lluMB", BC_MAX_SIZE / (1024 * 1024));
4626		BC_set_flag(BC_FLAG_HTRUNCATED);
4627		goto out;
4628	}
4629
4630#ifdef NO_HISTORY
4631	BC_set_flag(BC_FLAG_HTRUNCATED);
4632	debug("history disabled, truncating");
4633	goto out;
4634#endif
4635
4636	mount_idx = 0;
4637	offset = 0;
4638	if (! tag) {
4639
4640		hmd = BC_get_history_mount_device(dev, &mount_idx);
4641		if (hmd == NULL) goto out;
4642
4643		if (uuid_is_null(hmd->hmd_mount.hm_uuid)) {
4644			/* Couldn't identify the mount
4645			 *
4646			 * Before the mount has appeared we don't want to
4647			 * include any of this device's IOs in the history
4648			 * since these IOs will be issued next boot before
4649			 * we're able to start this mount's part of the
4650			 * playlist.
4651			 */
4652
4653			/* Keep track of the number of IOs we've seen until the mount appears */
4654			OSIncrementAtomic(&hmd->hmd_mount.hm_nentries);
4655
4656			if (BC_cache->c_take_detailed_stats) {
4657				BC_ADD_STAT(history_unknown, 1);
4658				BC_ADD_STAT(history_unknown_bytes, length);
4659			}
4660			goto out;
4661		}
4662
4663		offset = HB_BLOCK_TO_BYTE(hmd, blkno);
4664
4665		/* Only track IOs on the root disk.
4666		 *
4667		 * We don't expect IOs on other physical disks
4668		 * to be part of the critical path to boot/login
4669		 * and we certainly don't expect the other disks
4670		 * to be contested enough to make a cache worthwhile
4671		 */
4672
4673		if (hmd->hmd_disk_id != BC_cache->c_root_disk_id) {
4674#ifdef ROOT_DISK_ONLY
4675			goto out;
4676#endif
4677		} else {
4678			is_root_disk = 1;
4679		}
4680
4681	}
4682
4683	/*
4684	 * Get the current entry cluster.
4685	 */
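	/*
	 * Lock-free slot claim: atomically bump hec_nentries to reserve an index
	 * in the current cluster.  If there is no cluster or it's full, back the
	 * count out, allocate a new cluster and try to CAS it onto the head of
	 * the list; if another thread installs a cluster first, retry with
	 * theirs and keep our allocation for the next attempt.
	 */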
4686	while ((hec = BC_cache->c_history) == NULL ||
4687		   (entry_idx = OSAddAtomic(1, &hec->hec_nentries)) >= BC_HISTORY_ENTRIES) {
4688		if (hec)
4689			OSAddAtomic(-1, &hec->hec_nentries);
4690
4691		/* need a new cluster */
4692
4693		/* limit the number of clusters we will allocate */
4694		if (BC_cache->c_history_num_clusters + 1 >= BC_HISTORY_MAXCLUSTERS) {
4695			message("too many history clusters (%d, limit %ld)",
4696					BC_cache->c_history_num_clusters,
4697					(long)BC_HISTORY_MAXCLUSTERS);
4698			BC_set_flag(BC_FLAG_HTRUNCATED);
4699			goto out;
4700		}
4701
4702		if (tmphec == NULL) { /* we may have an allocation from previous iterations */
4703			tmphec = kalloc(BC_HISTORY_ALLOC);
4704			if (tmphec == NULL) {
4705				message("could not allocate %d bytes for history cluster",
4706						BC_HISTORY_ALLOC);
4707				BC_set_flag(BC_FLAG_HTRUNCATED);
4708				goto out;
4709			}
4710
			/* claim the first entry of the new cluster */
4712			tmphec->hec_nentries = 1;
4713			tmphec->hec_link = hec;
4714		}
4715
4716		if (OSCompareAndSwapPtr(hec, tmphec, &BC_cache->c_history)) {
4717			hec = tmphec;
4718			entry_idx = 0;
4719			tmphec = NULL;
4720			OSAddAtomic(1, &BC_cache->c_history_num_clusters);
4721			break;
4722		}
4723	}
4724	if (tmphec != NULL)
4725		kfree(tmphec, BC_HISTORY_ALLOC);
4726
4727	/*
4728	 * We've claimed entry entry_idx of cluster hec, fill it in.
4729	 */
4730	he = &(hec->hec_data[entry_idx]);
4731	assert(he >= &hec->hec_data[0]);
4732	assert(he < &hec->hec_data[BC_HISTORY_ENTRIES]);
4733	he->he_offset    = offset;
4734	he->he_length    = length;
4735	he->he_pid       = pid;
4736	he->he_mount_idx = mount_idx;
4737	he->he_flags     = 0x0;
4738	if (hit)    he->he_flags |= BC_HE_HIT;
4739	if (write)  he->he_flags |= BC_HE_WRITE;
4740	if (tag)    he->he_flags |= BC_HE_TAG;
4741	if (shared) he->he_flags |= BC_HE_SHARED;
4742	if (hmd)
4743		OSIncrementAtomic(&hmd->hmd_mount.hm_nentries);
4744
4745	if (!write) {
4746		OSAddAtomic64(length, &BC_cache->c_history_size);
4747		if (BC_cache->c_take_detailed_stats) {
4748			BC_ADD_STAT(history_bytes, length);
4749			BC_ADD_STAT(history_entries, 1);
4750		}
4751	}
4752out:
4753	UNLOCK_HISTORY_R();
4754	return is_root_disk;
4755}
4756
4757/*
 * Return the size of the history mount list, in bytes.
4759 */
4760static int
4761BC_size_history_mounts(void)
4762{
4763	struct BC_history_mount_device *hmd;
4764	int nmounts;
4765
4766	/* walk the list of mount clusters and count the number of mounts */
4767	nmounts = 0;
4768	for (hmd = BC_cache->c_history_mounts; hmd != NULL; hmd = hmd->hmd_next)
4769		nmounts ++;
4770
4771	return(nmounts * sizeof(struct BC_history_mount));
4772}
4773
4774/*
 * Return the size of the history entry list, in bytes.
4776 */
4777static int
4778BC_size_history_entries(void)
4779{
4780	struct BC_history_entry_cluster *hec;
4781	int nentries, cluster_entries;
4782
4783	/* walk the list of clusters and count the number of entries */
4784	nentries = 0;
4785	for (hec = BC_cache->c_history; hec != NULL; hec = hec->hec_link) {
4786		cluster_entries = hec->hec_nentries;
4787		if (cluster_entries > BC_HISTORY_ENTRIES) {
4788			cluster_entries = BC_HISTORY_ENTRIES;
4789		}
4790
4791		nentries += cluster_entries;
4792	}
4793
4794	return(nentries * sizeof(struct BC_history_entry));
4795}
4796
4797/*
 * Copy out the history mount list.
4799 *
4800 * Note that the cache must be shut down before calling this function in
4801 * order to avoid a race around the entry count/buffer size.
4802 */
4803static int
4804BC_copyout_history_mounts(user_addr_t uptr)
4805{
4806	struct BC_history_mount_device *hmd;
4807	int error;
4808
4809	assert(uptr != 0);
4810
4811	/*
4812	 * Copy out the mounts
4813	 */
4814	for (hmd = BC_cache->c_history_mounts;
4815		 hmd != NULL;
4816		 hmd = hmd->hmd_next) {
4817
4818		/* copy the cluster out */
4819		if ((error = copyout(&(hmd->hmd_mount), uptr,
4820							 sizeof(struct BC_history_mount))) != 0)
4821			return(error);
4822		uptr += sizeof(struct BC_history_mount);
4823
4824	}
4825	return(0);
4826}
4827
4828/*
 * Copy out the history entries.
4830 *
4831 * Note that the cache must be shut down before calling this function in
4832 * order to avoid a race around the entry count/buffer size.
4833 */
4834static int
4835BC_copyout_history_entries(user_addr_t uptr)
4836{
4837	struct BC_history_entry_cluster *hec, *ohec;
4838	int error, cluster_entries;
4839
4840	assert(uptr != 0);
4841	assert(BC_cache->c_history);
4842
4843	/*
	 * Copying the history entries out is a little messy because the
4845	 * clusters are on the list backwards, but we want the entries in order.
4846	 */
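	/*
	 * The cluster list is newest-first, so to emit entries oldest-first we
	 * repeatedly scan from the head for the deepest cluster we haven't
	 * copied yet, copy it out, and stop once we've copied the head cluster.
	 */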
4847	ohec = NULL;
4848	if (BC_cache->c_history) {
4849		for (;;) {
4850			/* scan the list until we find the last one we haven't copied */
4851			hec = BC_cache->c_history;
4852			while (hec->hec_link != ohec)
4853				hec = hec->hec_link;
4854			ohec = hec;
4855
4856			cluster_entries = hec->hec_nentries;
4857			if (cluster_entries > BC_HISTORY_ENTRIES) {
4858				cluster_entries = BC_HISTORY_ENTRIES;
4859			}
4860
4861			/* copy the cluster out */
4862			if ((error = copyout(hec->hec_data, uptr,
4863								 cluster_entries * sizeof(struct BC_history_entry))) != 0)
4864				return(error);
4865			uptr += cluster_entries * sizeof(struct BC_history_entry);
4866
4867			/* if this was the last cluster, all done */
4868			if (hec == BC_cache->c_history)
4869				break;
4870		}
4871	}
4872	return(0);
4873}
4874
4875/*
4876 * Discard the history list.
4877 */
4878static void
4879BC_discard_history(void)
4880{
4881	struct BC_history_mount_device  *hmd;
4882	struct BC_history_entry_cluster *hec;
4883
4884	while (BC_cache->c_history_mounts != NULL) {
4885		hmd = BC_cache->c_history_mounts;
4886		BC_cache->c_history_mounts = hmd->hmd_next;
4887		kfree(hmd, sizeof(struct BC_history_mount_device));
4888	}
4889
4890	while (BC_cache->c_history != NULL) {
4891		hec = BC_cache->c_history;
4892		BC_cache->c_history = hec->hec_link;
4893		kfree(hec, BC_HISTORY_ALLOC);
4894	}
4895}
4896
4897/*
4898 * Setup for the root mounted device
4899 */
4900static int
4901BC_setup(void)
4902{
4903	unsigned int boot_arg;
4904
4905	/*
4906	 * If we have less than the minimum supported physical memory,
4907	 * bail immediately.  With too little memory, the cache will
4908	 * become polluted with nondeterministic pagein activity, leaving
4909	 * large amounts of wired memory around which further exacerbates
4910	 * the problem.
4911	 */
4912	if ((max_mem / (1024 * 1024)) < BC_minimum_mem_size) {
4913		debug("insufficient physical memory (%dMB < %dMB required)",
4914			  (int) (max_mem / (1024 * 1024)), BC_minimum_mem_size);
4915		return(ENOMEM);
4916	}
4917
4918	/*
4919	 * Find our block device.
4920	 *
4921	 * Note that in the netbooted case, we are loaded but should not
4922	 * actually run, unless we're explicitly overridden.
4923	 *
4924	 * Note also that if the override is set, we'd better have a valid
4925	 * rootdev...
4926	 */
4927	if (netboot_root() &&
4928		(!PE_parse_boot_argn("BootCacheOverride", &boot_arg, sizeof(boot_arg)) ||
4929		 (boot_arg != 1))) {                          /* not interesting */
		return(ENXIO);
	}
4932
4933	debug("Major of rootdev is %d", major(rootdev));
4934
4935	BC_cache->c_devsw = &bdevsw[major(rootdev)];
4936	assert(BC_cache->c_devsw != NULL);
4937
4938	/* Make sure we don't clobber the device strategy routine */
4939	if ((BC_cache->c_devsw->d_strategy == (strategy_fcn_t*)BC_strategy) ||
4940		(BC_cache->c_devsw->d_close    == BC_close)) {
4941		debug("cache was not terminated properly previously");
4942		return(ENXIO);
4943	}
4944
4945	BC_cache->c_strategy = BC_cache->c_devsw->d_strategy;
4946	BC_cache->c_close    = BC_cache->c_devsw->d_close;
4947	if (BC_cache->c_strategy == NULL ||
4948		BC_cache->c_close    == NULL) {	/* not configured */
4949		return(ENXIO);
4950	}
4951
4952	BC_set_flag(BC_FLAG_SETUP);
4953
4954	return(0);
4955}
4956
4957
4958/*
4959 * Called from the root-mount hook.
4960 */
4961static void
4962BC_auto_start(void)
4963{
4964	int error;
4965	user_addr_t pm, pe;
4966
4967	if ((error = BC_setup()) != 0) {
4968		debug("BootCache setup failed: %d", error);
4969		return;
4970	}
4971
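	/*
	 * The preloaded playlist section is laid out as a BC_playlist_header
	 * followed immediately by ph_nmounts BC_playlist_mount records and
	 * then ph_nentries BC_playlist_entry records; pm and pe point at
	 * those two arrays.
	 */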
	pm = CAST_USER_ADDR_T((uintptr_t)BC_preloaded_playlist + sizeof(struct BC_playlist_header));
	pe = pm + (BC_preloaded_playlist->ph_nmounts * sizeof(struct BC_playlist_mount));

	/*
	 * Initialize the cache.
	 *
	 * Auto-start is for DVDs or other read-only media, so don't bother
	 * recording history.
	 */
	error = BC_init_cache(BC_preloaded_playlist->ph_nmounts  * sizeof(struct BC_playlist_mount), pm,
						  BC_preloaded_playlist->ph_nentries * sizeof(struct BC_playlist_entry), pe,
						  0 /* don't record history */);
	if (error != 0)
		debug("BootCache autostart failed: %d", error);
}

/*
 * Called from the root-unmount hook.
 */
static void
BC_unmount_hook(void)
{
	debug("device unmounted");

	/*
	 * If the cache or history recording is still running, stop it.  If
	 * it has already stopped (it may have stopped itself), that's OK.
	 */
	BC_terminate_cache();
	BC_terminate_history();
}


/*
 * Handle the command sysctl.
 */
static int
BC_sysctl SYSCTL_HANDLER_ARGS
{
	struct BC_command bc;
	int error;

	/* get the command structure and validate it */
	if ((error = SYSCTL_IN(req, &bc, sizeof(bc))) != 0) {
		debug("couldn't get command");
		return(error);
	}
	if (bc.bc_magic != BC_MAGIC) {
		debug("bad command magic");
		return(EINVAL);
	}

	switch (bc.bc_opcode) {
		case BC_OP_START:
			debug("BC_OP_START(%d mounts, %d extents)", (int)(bc.bc_data1_size / sizeof(struct BC_playlist_mount)), (int)(bc.bc_data2_size / sizeof(struct BC_playlist_entry)));
			if (BC_preloaded_playlist) {
				error = EINVAL;
				break;
			}
			error = BC_init_cache(bc.bc_data1_size, (user_addr_t)bc.bc_data1, bc.bc_data2_size, (user_addr_t)bc.bc_data2, 1 /* record history */);
			if (error != 0) {
				message("cache init failed");
			}
			break;

		case BC_OP_STOP:
			debug("BC_OP_STOP");

			/*
			 * If we're recording history, stop it.  If it's already stopped
			 * (it may have stopped itself), that's OK.
			 */
			BC_terminate_history();

			/* return the sizes of the history buffers */
			LOCK_HISTORY_W();
			bc.bc_data1_size = BC_size_history_mounts();
			bc.bc_data2_size = BC_size_history_entries();
			if ((error = SYSCTL_OUT(req, &bc, sizeof(bc))) != 0)
				debug("could not return history size");
			UNLOCK_HISTORY_W();

			break;

		case BC_OP_MOUNT:
			/* debug("BC_OP_MOUNT"); */

			/*
			 * A notification that mounts have changed.
			 */
			BC_update_mounts();
			break;

		case BC_OP_HISTORY:
			debug("BC_OP_HISTORY");
			/* if there's a user buffer, verify the size and copy out */
			LOCK_HISTORY_W();
			if (bc.bc_data1 != 0 && bc.bc_data2 != 0) {
				if (bc.bc_data1_size < BC_size_history_mounts() ||
					bc.bc_data2_size < BC_size_history_entries()) {
					debug("supplied history buffer too small");
					error = EINVAL;
					UNLOCK_HISTORY_W();
					break;
				}
				if ((error = BC_copyout_history_mounts(bc.bc_data1)) != 0) {
					debug("could not copy out history mounts: %d", error);
				}
				if ((error = BC_copyout_history_entries(bc.bc_data2)) != 0) {
					debug("could not copy out history entries: %d", error);
				}
			}
			/*
			 * Release the last of the memory we own.
			 */
			BC_discard_history();
			UNLOCK_HISTORY_W();
			break;

		case BC_OP_JETTISON:
			debug("BC_OP_JETTISON");

			/*
			 * Jettison the cache. If it's already jettisoned
			 * (it may have jettisoned itself), that's OK.
			 */
			BC_terminate_cache();
			break;

		case BC_OP_STATS:
			debug("BC_OP_STATS");
			/* check buffer size and copy out */
			if (bc.bc_data1_size != sizeof(BC_cache->c_stats)) {
				debug("stats structure wrong size");
				error = ENOMEM;
			} else {
				BC_cache->c_stats.ss_cache_flags = BC_cache->c_flags;
				if ((error = copyout(&BC_cache->c_stats, bc.bc_data1, bc.bc_data1_size)) != 0)
					debug("could not copy out statistics");
			}
			break;

		case BC_OP_TAG:
			debug("BC_OP_TAG");
			KERNEL_DEBUG_CONSTANT(FSDBG_CODE(DBG_BOOTCACHE, DBG_BC_TAG) |
								  DBG_FUNC_NONE, proc_selfppid(), dbg_tag_count,
								  0, 0, 0);
			BC_add_history(0, 0, 0, 0, 0, 1, 0, 0);
			dbg_tag_count++;
			break;

		case BC_OP_TEST:
			debug("BC_OP_TEST");

			if (! (BC_cache->c_flags & BC_FLAG_SETUP)) {
				error = ENODEV;
			}
			break;

		default:
			error = EINVAL;
	}
	return(error);
}
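
/*
 * Example (illustrative sketch only, not part of this kext): a userspace
 * controller could drive the handler above roughly as follows, assuming
 * struct BC_command and the BC_OP_* opcodes are available from a shared
 * header and that the OID registered in BC_load() is reachable by name as
 * "kern.BootCache":
 *
 *	struct BC_command bc;
 *	size_t olen = sizeof(bc);
 *
 *	bzero(&bc, sizeof(bc));
 *	bc.bc_magic  = BC_MAGIC;
 *	bc.bc_opcode = BC_OP_STOP;
 *	sysctlbyname("kern.BootCache", &bc, &olen, &bc, sizeof(bc));
 *	(bc.bc_data1_size and bc.bc_data2_size now hold the history sizes)
 *
 *	bc.bc_opcode = BC_OP_HISTORY;
 *	bc.bc_data1  = (uintptr_t)malloc(bc.bc_data1_size);
 *	bc.bc_data2  = (uintptr_t)malloc(bc.bc_data2_size);
 *	sysctlbyname("kern.BootCache", NULL, NULL, &bc, sizeof(bc));
 *	(the mount and entry history has been copied into the two buffers)
 */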


/*
 * Initialise the block of pages we use to back our data.
 */
static int
BC_alloc_pagebuffer(void)
{
	kern_return_t kret;
	int ret;

	if (BC_cache->c_vp == NULL) {

		kret = vnode_open(BC_BOOT_BACKINGFILE, (O_CREAT | O_NOFOLLOW | FREAD),
						  0400, 0, &BC_cache->c_vp, vfs_context_current());
		if (kret != KERN_SUCCESS) {
			debug("vnode_open failed - %d", kret);
			return -1;
		}

		BC_cache->c_vid = vnode_vid(BC_cache->c_vp);

	} else {
		ret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
		if (ret != 0) {
			debug("vnode_getwithvid failed - %d", ret);
			return -1;
		}
	}

	assert(BC_cache->c_cachesize != 0);
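	/* ubc_setsize() reports success as 1 and failure as 0, not KERN_SUCCESS */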
	kret = ubc_setsize(BC_cache->c_vp, BC_cache->c_cachesize);
	if (kret != 1) {
		debug("ubc_setsize failed - %d", kret);
		(void) vnode_put(BC_cache->c_vp);
		return -1;
	}

	(void) vnode_put(BC_cache->c_vp);

	return(0);
}

/*
 * Release all resources associated with our pagebuffer.
 */
static void
BC_free_pagebuffer(void)
{
	kern_return_t kret;
	if (BC_cache->c_vp != NULL) {
		/* Can handle a vnode without ubc info */
		kret = vnode_getwithref(BC_cache->c_vp);
		if (kret != KERN_SUCCESS) {
			debug("vnode_getwithref failed - %d", kret);
			return;
		}

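		/*
		 * UPL_ROP_DUMP discards any resident pages in the range, which
		 * releases the cache's backing pages even though the backing
		 * file itself is not deleted (see the disabled vnode_delete
		 * call below).
		 */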
		kret = ubc_range_op(BC_cache->c_vp, 0, BC_cache->c_cachesize,
							UPL_ROP_DUMP, NULL);
		if (kret != KERN_SUCCESS) {
			debug("ubc_range_op failed - %d", kret);
		}

#if 0
		/* need this interface to be exported... will eliminate need
		 * for the ubc_range_op above */
		kret = vnode_delete(BC_BOOT_BACKINGFILE, vfs_context_current());
		if (kret) {
5205			debug("vnode_delete failed - %d", error);
		}
#endif
		kret = vnode_close(BC_cache->c_vp, 0, vfs_context_current());
		if (kret != KERN_SUCCESS) {
			debug("vnode_close failed - %d", kret);
		}

		BC_cache->c_vp = NULL;
	}
}

/*
 * Release one page from the pagebuffer.
 */
static void
BC_free_page(struct BC_cache_extent *ce, int page)
{
	off_t vpoffset;
	kern_return_t kret;

	vpoffset = (page * PAGE_SIZE) + ce->ce_cacheoffset;
	kret = vnode_getwithvid(BC_cache->c_vp, BC_cache->c_vid);
	if (kret != KERN_SUCCESS) {
		debug("vnode_getwithvid failed - %d", kret);
		return;
	}

	kret = ubc_range_op(BC_cache->c_vp, vpoffset, vpoffset + PAGE_SIZE,
						UPL_ROP_DUMP, NULL);
	if (kret != KERN_SUCCESS) {
		debug("ubc_range_op failed - %d", kret);
	}

	(void) vnode_put(BC_cache->c_vp);
}

void
BC_hook(void)
{
	int error;

	if ((error = BC_setup()) != 0) {
		debug("BootCache setup failed: %d", error);
	}
}

/*
 * Cache start/stop functions.
 */
void
BC_load(void)
{
	struct mach_header	*mh;
	long			xsize;
	int			has64bitness, error;
	size_t			sysctlsize = sizeof(int);
	char			*plsection = "playlist";

#ifdef BC_DEBUG
	microtime(&debug_starttime);
#endif

	/* register our control interface */
	sysctl_register_oid(&sysctl__kern_BootCache);

	/* clear the control structure */
	bzero(BC_cache, sizeof(*BC_cache));

	BC_cache->c_lckgrp = lck_grp_alloc_init("BootCache", LCK_GRP_ATTR_NULL);
	lck_rw_init(&BC_cache->c_history_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
	lck_rw_init(&BC_cache->c_cache_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
	lck_mtx_init(&BC_cache->c_handlers_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
	lck_mtx_init(&BC_cache->c_reader_threads_lock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
#ifdef BC_DEBUG
	lck_mtx_init(&debug_printlock, BC_cache->c_lckgrp, LCK_ATTR_NULL);
#endif
	/*
	 * Find the preload section and its size.  If it's not empty
	 * we have a preloaded playlist, so prepare for an early
	 * start.
	 */
	if (kmod_info.info_version != KMOD_INFO_VERSION) {
		message("incompatible kmod_info versions");
		return;
	}
	mh = (struct mach_header *)kmod_info.address;

	/*
	 * If booting on a 32-bit only machine, use the "playlist32" section.
	 * Otherwise, fall through to the default "playlist" section.
	 */
	error = sysctlbyname("hw.cpu64bit_capable", &has64bitness, &sysctlsize,
						 NULL, 0);
	if (error == 0 && has64bitness == 0) {
		plsection = "playlist32";
	}
	BC_preloaded_playlist = getsectdatafromheader(mh, "BootCache",
												  plsection, &BC_preloaded_playlist_size);
	debug("preload section %s: %d @ %p",
	      plsection, BC_preloaded_playlist_size, BC_preloaded_playlist);

	if (BC_preloaded_playlist != NULL) {
		if (BC_preloaded_playlist_size < sizeof(struct BC_playlist_header)) {
			message("preloaded playlist too small");
			return;
		}
		if (BC_preloaded_playlist->ph_magic != PH_MAGIC) {
			message("preloaded playlist has invalid magic (%x, expected %x)",
					BC_preloaded_playlist->ph_magic, PH_MAGIC);
			return;
		}
		xsize = sizeof(struct BC_playlist_header) +
			(BC_preloaded_playlist->ph_nmounts  * sizeof(struct BC_playlist_mount)) +
			(BC_preloaded_playlist->ph_nentries * sizeof(struct BC_playlist_entry));
		if (xsize > BC_preloaded_playlist_size) {
			message("preloaded playlist too small (%d, expected at least %ld)",
					BC_preloaded_playlist_size, xsize);
			return;
		}
		BC_wait_for_readahead = BC_READAHEAD_WAIT_CDROM;
		mountroot_post_hook = BC_auto_start;
		unmountroot_pre_hook = BC_unmount_hook;

		debug("waiting for root mount...");
	} else {
		/* no preload, normal operation */
		BC_preloaded_playlist = NULL;
		mountroot_post_hook = BC_hook;
		debug("waiting for playlist...");
	}

	microtime(&BC_cache->c_loadtime);
}

int
BC_unload(void)
{
	int error;

	debug("preparing to unload...");
	unmountroot_pre_hook = NULL;

	/*
	 * If the cache or history recording is still running, stop it.  If
	 * it has already stopped (it may have stopped itself), that's OK.
	 */
	BC_terminate_history();
	if ((error = BC_terminate_cache()) != 0) {
		if (error != ENXIO)
			goto out;
	}
	LOCK_HISTORY_W();
	BC_discard_history();
	UNLOCK_HISTORY_W();

	lck_rw_destroy(&BC_cache->c_history_lock, BC_cache->c_lckgrp);
	lck_rw_destroy(&BC_cache->c_cache_lock, BC_cache->c_lckgrp);
	lck_mtx_destroy(&BC_cache->c_handlers_lock, BC_cache->c_lckgrp);
	lck_mtx_destroy(&BC_cache->c_reader_threads_lock, BC_cache->c_lckgrp);
	lck_grp_free(BC_cache->c_lckgrp);

	sysctl_unregister_oid(&sysctl__kern_BootCache);
	error = 0;

out:
	return(error);
}
