1/*
2 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*-
30 * Copyright (c) 1994 Christopher G. Demetriou
31 * Copyright (c) 1982, 1986, 1989, 1993
32 *	The Regents of the University of California.  All rights reserved.
33 * (c) UNIX System Laboratories, Inc.
34 * All or some portions of this file are derived from material licensed
35 * to the University of California by American Telephone and Telegraph
36 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
37 * the permission of UNIX System Laboratories, Inc.
38 *
39 * Redistribution and use in source and binary forms, with or without
40 * modification, are permitted provided that the following conditions
41 * are met:
42 * 1. Redistributions of source code must retain the above copyright
43 *    notice, this list of conditions and the following disclaimer.
44 * 2. Redistributions in binary form must reproduce the above copyright
45 *    notice, this list of conditions and the following disclaimer in the
46 *    documentation and/or other materials provided with the distribution.
47 * 3. All advertising materials mentioning features or use of this software
48 *    must display the following acknowledgement:
49 *	This product includes software developed by the University of
50 *	California, Berkeley and its contributors.
51 * 4. Neither the name of the University nor the names of its contributors
52 *    may be used to endorse or promote products derived from this software
53 *    without specific prior written permission.
54 *
55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
58 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
65 * SUCH DAMAGE.
66 *
67 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
68 */
69
70/*
71 * Some references:
72 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
73 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
74 *		UNIX Operating System (Addison Welley, 1989)
75 */
76
77#include <sys/param.h>
78#include <sys/systm.h>
79#include <sys/proc_internal.h>
80#include <sys/buf_internal.h>
81#include <sys/vnode_internal.h>
82#include <sys/mount_internal.h>
83#include <sys/trace.h>
84#include <sys/malloc.h>
85#include <sys/resourcevar.h>
86#include <miscfs/specfs/specdev.h>
87#include <sys/ubc.h>
88#include <sys/kauth.h>
89#if DIAGNOSTIC
90#include <kern/assert.h>
91#endif /* DIAGNOSTIC */
92#include <kern/task.h>
93#include <kern/zalloc.h>
94#include <kern/lock.h>
95
96#include <sys/fslog.h>		/* fslog_io_error() */
97
98#include <mach/mach_types.h>
99#include <mach/memory_object_types.h>
100#include <kern/sched_prim.h>	/* thread_block() */
101
102#include <vm/vm_kern.h>
103#include <vm/vm_pageout.h>
104
105#include <sys/kdebug.h>
106
107#include <libkern/OSAtomic.h>
108#include <libkern/OSDebug.h>
109#include <sys/ubc_internal.h>
110
111#include <sys/sdt.h>
112#include <sys/cprotect.h>
113
114
115#if BALANCE_QUEUES
116static __inline__ void bufqinc(int q);
117static __inline__ void bufqdec(int q);
118#endif
119
120int	bcleanbuf(buf_t bp, boolean_t discard);
121static int	brecover_data(buf_t bp);
122static boolean_t incore(vnode_t vp, daddr64_t blkno);
123/* timeout is in msecs */
124static buf_t	getnewbuf(int slpflag, int slptimeo, int *queue);
125static void	bremfree_locked(buf_t bp);
126static void	buf_reassign(buf_t bp, vnode_t newvp);
127static errno_t	buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo);
128static int	buf_iterprepare(vnode_t vp, struct buflists *, int flags);
129static void	buf_itercomplete(vnode_t vp, struct buflists *, int flags);
130static boolean_t buffer_cache_gc(int);
131static buf_t	buf_brelse_shadow(buf_t bp);
132static void	buf_free_meta_store(buf_t bp);
133
134static buf_t	buf_create_shadow_internal(buf_t bp, boolean_t force_copy,
135					   uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv);
136
137
138__private_extern__ int  bdwrite_internal(buf_t, int);
139
140/* zone allocated buffer headers */
141static void	bufzoneinit(void);
142static void	bcleanbuf_thread_init(void);
143static void	bcleanbuf_thread(void);
144
145static zone_t	buf_hdr_zone;
146static int	buf_hdr_count;
147
148
149/*
150 * Definitions for the buffer hash lists.
151 */
152#define	BUFHASH(dvp, lbn)	\
153	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
154LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
155u_long	bufhash;
156
157static buf_t	incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp);
158
159/* Definitions for the buffer stats. */
160struct bufstats bufstats;
161
162/* Number of delayed write buffers */
163long nbdwrite = 0;
164int blaundrycnt = 0;
165static int boot_nbuf_headers = 0;
166
167static TAILQ_HEAD(delayqueue, buf) delaybufqueue;
168
169static TAILQ_HEAD(ioqueue, buf) iobufqueue;
170static TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
171static int needbuffer;
172static int need_iobuffer;
173
174static lck_grp_t	*buf_mtx_grp;
175static lck_attr_t	*buf_mtx_attr;
176static lck_grp_attr_t   *buf_mtx_grp_attr;
177static lck_mtx_t	*iobuffer_mtxp;
178static lck_mtx_t	*buf_mtxp;
179
180static int buf_busycount;
181
182static __inline__ int
183buf_timestamp(void)
184{
185	struct	timeval		t;
186	microuptime(&t);
187	return (t.tv_sec);
188}
189
190/*
191 * Insq/Remq for the buffer free lists.
192 */
193#if BALANCE_QUEUES
194#define	binsheadfree(bp, dp, whichq)	do { \
195				    TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
196					bufqinc((whichq));	\
197				} while (0)
198
199#define	binstailfree(bp, dp, whichq)	do { \
200				    TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
201					bufqinc((whichq));	\
202				} while (0)
203#else
204#define	binsheadfree(bp, dp, whichq)	do { \
205				    TAILQ_INSERT_HEAD(dp, bp, b_freelist); \
206				} while (0)
207
208#define	binstailfree(bp, dp, whichq)	do { \
209				    TAILQ_INSERT_TAIL(dp, bp, b_freelist); \
210				} while (0)
211#endif
212
213
214#define BHASHENTCHECK(bp)	\
215	if ((bp)->b_hash.le_prev != (struct buf **)0xdeadbeef)	\
216		panic("%p: b_hash.le_prev is not deadbeef", (bp));
217
218#define BLISTNONE(bp)	\
219	(bp)->b_hash.le_next = (struct buf *)0;	\
220	(bp)->b_hash.le_prev = (struct buf **)0xdeadbeef;
221
222/*
223 * Insq/Remq for the vnode usage lists.
224 */
225#define	bufinsvn(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_vnbufs)
226#define	bufremvn(bp) {							\
227	LIST_REMOVE(bp, b_vnbufs);					\
228	(bp)->b_vnbufs.le_next = NOLIST;				\
229}
230
231/*
232 * Time in seconds before a buffer on a list is
233 * considered as a stale buffer
234 */
235#define LRU_IS_STALE 120 /* default value for the LRU */
236#define AGE_IS_STALE 60  /* default value for the AGE */
237#define META_IS_STALE 180 /* default value for the BQ_META */
238
239int lru_is_stale = LRU_IS_STALE;
240int age_is_stale = AGE_IS_STALE;
241int meta_is_stale = META_IS_STALE;
242
243#define MAXLAUNDRY	10
244
245/* LIST_INSERT_HEAD() with assertions */
246static __inline__ void
247blistenterhead(struct bufhashhdr * head, buf_t bp)
248{
249	if ((bp->b_hash.le_next = (head)->lh_first) != NULL)
250		(head)->lh_first->b_hash.le_prev = &(bp)->b_hash.le_next;
251	(head)->lh_first = bp;
252	bp->b_hash.le_prev = &(head)->lh_first;
253	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
254		panic("blistenterhead: le_prev is deadbeef");
255}
256
257static __inline__ void
258binshash(buf_t bp, struct bufhashhdr *dp)
259{
260#if DIAGNOSTIC
261	buf_t	nbp;
262#endif /* DIAGNOSTIC */
263
264	BHASHENTCHECK(bp);
265
266#if DIAGNOSTIC
267	nbp = dp->lh_first;
268	for(; nbp != NULL; nbp = nbp->b_hash.le_next) {
269		if(nbp == bp)
270			panic("buf already in hashlist");
271	}
272#endif /* DIAGNOSTIC */
273
274	blistenterhead(dp, bp);
275}
276
277static __inline__ void
278bremhash(buf_t	bp)
279{
280	if (bp->b_hash.le_prev == (struct buf **)0xdeadbeef)
281		panic("bremhash le_prev is deadbeef");
282	if (bp->b_hash.le_next == bp)
283		panic("bremhash: next points to self");
284
285	if (bp->b_hash.le_next != NULL)
286		bp->b_hash.le_next->b_hash.le_prev = bp->b_hash.le_prev;
287	*bp->b_hash.le_prev = (bp)->b_hash.le_next;
288}
289
290/*
291 * buf_mtxp held.
292 */
293static __inline__ void
294bmovelaundry(buf_t bp)
295{
296	bp->b_whichq = BQ_LAUNDRY;
297	bp->b_timestamp = buf_timestamp();
298	binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
299	blaundrycnt++;
300}
301
302static __inline__ void
303buf_release_credentials(buf_t bp)
304{
305	if (IS_VALID_CRED(bp->b_rcred)) {
306		kauth_cred_unref(&bp->b_rcred);
307	}
308	if (IS_VALID_CRED(bp->b_wcred)) {
309		kauth_cred_unref(&bp->b_wcred);
310	}
311}
312
313
314int
315buf_valid(buf_t bp) {
316
317        if ( (bp->b_flags & (B_DONE | B_DELWRI)) )
318	        return 1;
319	return 0;
320}
321
322int
323buf_fromcache(buf_t bp) {
324
325        if ( (bp->b_flags & B_CACHE) )
326	        return 1;
327	return 0;
328}
329
330void
331buf_markinvalid(buf_t bp) {
332
333        SET(bp->b_flags, B_INVAL);
334}
335
336void
337buf_markdelayed(buf_t bp) {
338
339	if (!ISSET(bp->b_flags, B_DELWRI)) {
340		SET(bp->b_flags, B_DELWRI);
341
342		OSAddAtomicLong(1, &nbdwrite);
343		buf_reassign(bp, bp->b_vp);
344	}
345        SET(bp->b_flags, B_DONE);
346}
347
348void
349buf_markclean(buf_t bp) {
350
351	if (ISSET(bp->b_flags, B_DELWRI)) {
352		CLR(bp->b_flags, B_DELWRI);
353
354		OSAddAtomicLong(-1, &nbdwrite);
355		buf_reassign(bp, bp->b_vp);
356	}
357}
358
359void
360buf_markeintr(buf_t bp) {
361
362        SET(bp->b_flags, B_EINTR);
363}
364
365
366void
367buf_markaged(buf_t bp) {
368
369        SET(bp->b_flags, B_AGE);
370}
371
372int
373buf_fua(buf_t bp) {
374
375        if ((bp->b_flags & B_FUA) == B_FUA)
376	        return 1;
377	return 0;
378}
379
380void
381buf_markfua(buf_t bp) {
382
383        SET(bp->b_flags, B_FUA);
384}
385
386#if CONFIG_PROTECT
387void
388buf_setcpaddr(buf_t bp, struct cprotect *entry) {
389	bp->b_attr.ba_cpentry = entry;
390}
391
392void
393buf_setcpoff (buf_t bp, uint64_t foffset) {
394	bp->b_attr.ba_cp_file_off = foffset;
395}
396
397void *
398bufattr_cpaddr(bufattr_t bap) {
399	return (bap->ba_cpentry);
400}
401
402uint64_t
403bufattr_cpoff(bufattr_t bap) {
404	return (bap->ba_cp_file_off);
405}
406
407void
408bufattr_setcpaddr(bufattr_t bap, void *cp_entry_addr) {
409        bap->ba_cpentry = cp_entry_addr;
410}
411
412void
413bufattr_setcpoff(bufattr_t bap, uint64_t foffset) {
414        bap->ba_cp_file_off = foffset;
415}
416
417#else
418void *
419bufattr_cpaddr(bufattr_t bap __unused) {
420        return NULL;
421}
422
423uint64_t
424bufattr_cpoff(bufattr_t bap __unused) {
425	return 0;
426}
427
428void
429bufattr_setcpaddr(bufattr_t bap __unused, void *cp_entry_addr __unused) {
430}
431
432void
433bufattr_setcpoff(__unused bufattr_t bap, __unused uint64_t foffset) {
434	return;
435}
436#endif /* CONFIG_PROTECT */
437
438bufattr_t
439bufattr_alloc() {
440	bufattr_t bap;
441	MALLOC(bap, bufattr_t, sizeof(struct bufattr), M_TEMP, M_WAITOK);
442	if (bap == NULL)
443		return NULL;
444
445	bzero(bap, sizeof(struct bufattr));
446	return bap;
447}
448
449void
450bufattr_free(bufattr_t bap) {
451	if (bap)
452		FREE(bap, M_TEMP);
453}
454
455int
456bufattr_rawencrypted(bufattr_t bap) {
457	if ( (bap->ba_flags & BA_RAW_ENCRYPTED_IO) )
458		return 1;
459	return 0;
460}
461
462int
463bufattr_throttled(bufattr_t bap) {
464	return (GET_BUFATTR_IO_TIER(bap));
465}
466
467int
468bufattr_nocache(bufattr_t bap) {
469	if ( (bap->ba_flags & BA_NOCACHE) )
470		return 1;
471	return 0;
472}
473
474int
475bufattr_meta(bufattr_t bap) {
476	if ( (bap->ba_flags & BA_META) )
477		return 1;
478	return 0;
479}
480
481int
482bufattr_delayidlesleep(bufattr_t bap)
483{
484	if ( (bap->ba_flags & BA_DELAYIDLESLEEP) )
485		return 1;
486	return 0;
487}
488
489bufattr_t
490buf_attr(buf_t bp) {
491	return &bp->b_attr;
492}
493
494void
495buf_markstatic(buf_t bp __unused) {
496	SET(bp->b_flags, B_STATICCONTENT);
497}
498
499int
500buf_static(buf_t bp) {
501    if ( (bp->b_flags & B_STATICCONTENT) )
502        return 1;
503    return 0;
504}
505
506void
507bufattr_markgreedymode(bufattr_t bap) {
508	SET(bap->ba_flags, BA_GREEDY_MODE);
509}
510
511int
512bufattr_greedymode(bufattr_t bap) {
513    if ( (bap->ba_flags & BA_GREEDY_MODE) )
514        return 1;
515    return 0;
516}
517
518void
519bufattr_markquickcomplete(bufattr_t bap) {
520	SET(bap->ba_flags, BA_QUICK_COMPLETE);
521}
522
523int
524bufattr_quickcomplete(bufattr_t bap) {
525    if ( (bap->ba_flags & BA_QUICK_COMPLETE) )
526        return 1;
527    return 0;
528}
529
530errno_t
531buf_error(buf_t bp) {
532
533        return (bp->b_error);
534}
535
536void
537buf_seterror(buf_t bp, errno_t error) {
538
539        if ((bp->b_error = error))
540	        SET(bp->b_flags, B_ERROR);
541	else
542	        CLR(bp->b_flags, B_ERROR);
543}
544
545void
546buf_setflags(buf_t bp, int32_t flags) {
547
548        SET(bp->b_flags, (flags & BUF_X_WRFLAGS));
549}
550
551void
552buf_clearflags(buf_t bp, int32_t flags) {
553
554        CLR(bp->b_flags, (flags & BUF_X_WRFLAGS));
555}
556
557int32_t
558buf_flags(buf_t bp) {
559
560        return ((bp->b_flags & BUF_X_RDFLAGS));
561}
562
563void
564buf_reset(buf_t bp, int32_t io_flags) {
565
566        CLR(bp->b_flags, (B_READ | B_WRITE | B_ERROR | B_DONE | B_INVAL | B_ASYNC | B_NOCACHE | B_FUA));
567	SET(bp->b_flags, (io_flags & (B_ASYNC | B_READ | B_WRITE | B_NOCACHE)));
568
569	bp->b_error = 0;
570}
571
572uint32_t
573buf_count(buf_t bp) {
574
575        return (bp->b_bcount);
576}
577
578void
579buf_setcount(buf_t bp, uint32_t bcount) {
580
581        bp->b_bcount = bcount;
582}
583
584uint32_t
585buf_size(buf_t bp) {
586
587        return (bp->b_bufsize);
588}
589
590void
591buf_setsize(buf_t bp, uint32_t bufsize) {
592
593        bp->b_bufsize = bufsize;
594}
595
596uint32_t
597buf_resid(buf_t bp) {
598
599        return (bp->b_resid);
600}
601
602void
603buf_setresid(buf_t bp, uint32_t resid) {
604
605        bp->b_resid = resid;
606}
607
608uint32_t
609buf_dirtyoff(buf_t bp) {
610
611        return (bp->b_dirtyoff);
612}
613
614uint32_t
615buf_dirtyend(buf_t bp) {
616
617        return (bp->b_dirtyend);
618}
619
620void
621buf_setdirtyoff(buf_t bp, uint32_t dirtyoff) {
622
623        bp->b_dirtyoff = dirtyoff;
624}
625
626void
627buf_setdirtyend(buf_t bp, uint32_t dirtyend) {
628
629        bp->b_dirtyend = dirtyend;
630}
631
632uintptr_t
633buf_dataptr(buf_t bp) {
634
635        return (bp->b_datap);
636}
637
638void
639buf_setdataptr(buf_t bp, uintptr_t data) {
640
641        bp->b_datap = data;
642}
643
644vnode_t
645buf_vnode(buf_t bp) {
646
647        return (bp->b_vp);
648}
649
650void
651buf_setvnode(buf_t bp, vnode_t vp) {
652
653        bp->b_vp = vp;
654}
655
656
657void *
658buf_callback(buf_t bp)
659{
660        if ( !(bp->b_flags & B_CALL) )
661	        return ((void *) NULL);
662
663	return ((void *)bp->b_iodone);
664}
665
666
667errno_t
668buf_setcallback(buf_t bp, void (*callback)(buf_t, void *), void *transaction)
669{
670	if (callback)
671	        bp->b_flags |= (B_CALL | B_ASYNC);
672	else
673	        bp->b_flags &= ~B_CALL;
674	bp->b_transaction = transaction;
675	bp->b_iodone = callback;
676
677	return (0);
678}
679
680errno_t
681buf_setupl(buf_t bp, upl_t upl, uint32_t offset)
682{
683
684        if ( !(bp->b_lflags & BL_IOBUF) )
685	        return (EINVAL);
686
687	if (upl)
688	        bp->b_flags |= B_CLUSTER;
689	else
690	        bp->b_flags &= ~B_CLUSTER;
691	bp->b_upl = upl;
692	bp->b_uploffset = offset;
693
694	return (0);
695}
696
697buf_t
698buf_clone(buf_t bp, int io_offset, int io_size, void (*iodone)(buf_t, void *), void *arg)
699{
700        buf_t	io_bp;
701
702	if (io_offset < 0 || io_size < 0)
703	        return (NULL);
704
705	if ((unsigned)(io_offset + io_size) > (unsigned)bp->b_bcount)
706	        return (NULL);
707
708	if (bp->b_flags & B_CLUSTER) {
709	        if (io_offset && ((bp->b_uploffset + io_offset) & PAGE_MASK))
710		        return (NULL);
711
712	        if (((bp->b_uploffset + io_offset + io_size) & PAGE_MASK) && ((io_offset + io_size) < bp->b_bcount))
713		        return (NULL);
714	}
715	io_bp = alloc_io_buf(bp->b_vp, 0);
716
717	io_bp->b_flags = bp->b_flags & (B_COMMIT_UPL | B_META | B_PAGEIO | B_CLUSTER | B_PHYS | B_RAW | B_ASYNC | B_READ | B_FUA);
718
719	if (iodone) {
720	        io_bp->b_transaction = arg;
721		io_bp->b_iodone = iodone;
722		io_bp->b_flags |= B_CALL;
723	}
724	if (bp->b_flags & B_CLUSTER) {
725	        io_bp->b_upl = bp->b_upl;
726		io_bp->b_uploffset = bp->b_uploffset + io_offset;
727	} else {
728	        io_bp->b_datap  = (uintptr_t)(((char *)bp->b_datap) + io_offset);
729	}
730	io_bp->b_bcount = io_size;
731
732	return (io_bp);
733}
734
735
736int
737buf_shadow(buf_t bp)
738{
739	if (bp->b_lflags & BL_SHADOW)
740		return 1;
741	return 0;
742}
743
744
745buf_t
746buf_create_shadow_priv(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
747{
748	return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 1));
749}
750
751buf_t
752buf_create_shadow(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg)
753{
754	return (buf_create_shadow_internal(bp, force_copy, external_storage, iodone, arg, 0));
755}
756
757
758static buf_t
759buf_create_shadow_internal(buf_t bp, boolean_t force_copy, uintptr_t external_storage, void (*iodone)(buf_t, void *), void *arg, int priv)
760{
761        buf_t	io_bp;
762
763	KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_START, bp, 0, 0, 0, 0);
764
765	if ( !(bp->b_flags & B_META) || (bp->b_lflags & BL_IOBUF)) {
766
767		KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, 0, 0, 0, 0);
768		return (NULL);
769	}
770#ifdef BUF_MAKE_PRIVATE
771	if (bp->b_shadow_ref && bp->b_data_ref == 0 && external_storage == 0)
772		panic("buf_create_shadow: %p is in the private state (%d, %d)", bp, bp->b_shadow_ref, bp->b_data_ref);
773#endif
774	io_bp = alloc_io_buf(bp->b_vp, priv);
775
776	io_bp->b_flags = bp->b_flags & (B_META | B_ZALLOC | B_ASYNC | B_READ | B_FUA);
777	io_bp->b_blkno = bp->b_blkno;
778	io_bp->b_lblkno = bp->b_lblkno;
779
780	if (iodone) {
781	        io_bp->b_transaction = arg;
782		io_bp->b_iodone = iodone;
783		io_bp->b_flags |= B_CALL;
784	}
785	if (force_copy == FALSE) {
786		io_bp->b_bcount = bp->b_bcount;
787		io_bp->b_bufsize = bp->b_bufsize;
788
789		if (external_storage) {
790			io_bp->b_datap = external_storage;
791#ifdef BUF_MAKE_PRIVATE
792			io_bp->b_data_store = NULL;
793#endif
794		} else {
795			io_bp->b_datap = bp->b_datap;
796#ifdef BUF_MAKE_PRIVATE
797			io_bp->b_data_store = bp;
798#endif
799		}
800		*(buf_t *)(&io_bp->b_orig) = bp;
801
802		lck_mtx_lock_spin(buf_mtxp);
803
804		io_bp->b_lflags |= BL_SHADOW;
805		io_bp->b_shadow = bp->b_shadow;
806		bp->b_shadow = io_bp;
807		bp->b_shadow_ref++;
808
809#ifdef BUF_MAKE_PRIVATE
810		if (external_storage)
811			io_bp->b_lflags |= BL_EXTERNAL;
812		else
813			bp->b_data_ref++;
814#endif
815		lck_mtx_unlock(buf_mtxp);
816	} else {
817		if (external_storage) {
818#ifdef BUF_MAKE_PRIVATE
819			io_bp->b_lflags |= BL_EXTERNAL;
820#endif
821			io_bp->b_bcount = bp->b_bcount;
822			io_bp->b_bufsize = bp->b_bufsize;
823			io_bp->b_datap = external_storage;
824		} else {
825			allocbuf(io_bp, bp->b_bcount);
826
827			io_bp->b_lflags |= BL_IOBUF_ALLOC;
828		}
829		bcopy((caddr_t)bp->b_datap, (caddr_t)io_bp->b_datap, bp->b_bcount);
830
831#ifdef BUF_MAKE_PRIVATE
832		io_bp->b_data_store = NULL;
833#endif
834	}
835	KERNEL_DEBUG(0xbbbbc000 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, io_bp, 0);
836
837	return (io_bp);
838}
839
840
841#ifdef BUF_MAKE_PRIVATE
842errno_t
843buf_make_private(buf_t bp)
844{
845	buf_t	ds_bp;
846	buf_t	t_bp;
847	struct buf my_buf;
848
849	KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_START, bp, bp->b_shadow_ref, 0, 0, 0);
850
851	if (bp->b_shadow_ref == 0 || bp->b_data_ref == 0 || ISSET(bp->b_lflags, BL_SHADOW)) {
852
853		KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
854		return (EINVAL);
855	}
856	my_buf.b_flags = B_META;
857	my_buf.b_datap = (uintptr_t)NULL;
858	allocbuf(&my_buf, bp->b_bcount);
859
860	bcopy((caddr_t)bp->b_datap, (caddr_t)my_buf.b_datap, bp->b_bcount);
861
862	lck_mtx_lock_spin(buf_mtxp);
863
864	for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
865		if ( !ISSET(bp->b_lflags, BL_EXTERNAL))
866			break;
867	}
868	ds_bp = t_bp;
869
870	if (ds_bp == NULL && bp->b_data_ref)
871		panic("buf_make_private: b_data_ref != 0 && ds_bp == NULL");
872
873	if (ds_bp && (bp->b_data_ref == 0 || bp->b_shadow_ref == 0))
874		panic("buf_make_private: ref_count == 0 && ds_bp != NULL");
875
876	if (ds_bp == NULL) {
877		lck_mtx_unlock(buf_mtxp);
878
879		buf_free_meta_store(&my_buf);
880
881		KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, EINVAL, 0);
882		return (EINVAL);
883	}
884	for (t_bp = bp->b_shadow; t_bp; t_bp = t_bp->b_shadow) {
885		if ( !ISSET(t_bp->b_lflags, BL_EXTERNAL))
886			t_bp->b_data_store = ds_bp;
887	}
888	ds_bp->b_data_ref = bp->b_data_ref;
889
890	bp->b_data_ref = 0;
891	bp->b_datap = my_buf.b_datap;
892
893	lck_mtx_unlock(buf_mtxp);
894
895	KERNEL_DEBUG(0xbbbbc004 | DBG_FUNC_END, bp, bp->b_shadow_ref, 0, 0, 0);
896	return (0);
897}
898#endif
899
900
901void
902buf_setfilter(buf_t bp, void (*filter)(buf_t, void *), void *transaction,
903			  void (**old_iodone)(buf_t, void *), void **old_transaction)
904{
905	if (old_iodone)
906		*old_iodone = bp->b_iodone;
907	if (old_transaction)
908		*old_transaction = bp->b_transaction;
909
910	bp->b_transaction = transaction;
911	bp->b_iodone = filter;
912	if (filter)
913	        bp->b_flags |= B_FILTER;
914	else
915	        bp->b_flags &= ~B_FILTER;
916}
917
918
919daddr64_t
920buf_blkno(buf_t bp) {
921
922        return (bp->b_blkno);
923}
924
925daddr64_t
926buf_lblkno(buf_t bp) {
927
928        return (bp->b_lblkno);
929}
930
931void
932buf_setblkno(buf_t bp, daddr64_t blkno) {
933
934        bp->b_blkno = blkno;
935}
936
937void
938buf_setlblkno(buf_t bp, daddr64_t lblkno) {
939
940        bp->b_lblkno = lblkno;
941}
942
943dev_t
944buf_device(buf_t bp) {
945
946        return (bp->b_dev);
947}
948
949errno_t
950buf_setdevice(buf_t bp, vnode_t vp) {
951
952        if ((vp->v_type != VBLK) && (vp->v_type != VCHR))
953	        return EINVAL;
954	bp->b_dev = vp->v_rdev;
955
956	return 0;
957}
958
959
960void *
961buf_drvdata(buf_t bp) {
962
963        return (bp->b_drvdata);
964}
965
966void
967buf_setdrvdata(buf_t bp, void *drvdata) {
968
969        bp->b_drvdata = drvdata;
970}
971
972void *
973buf_fsprivate(buf_t bp) {
974
975        return (bp->b_fsprivate);
976}
977
978void
979buf_setfsprivate(buf_t bp, void *fsprivate) {
980
981        bp->b_fsprivate = fsprivate;
982}
983
984kauth_cred_t
985buf_rcred(buf_t bp) {
986
987        return (bp->b_rcred);
988}
989
990kauth_cred_t
991buf_wcred(buf_t bp) {
992
993        return (bp->b_wcred);
994}
995
996void *
997buf_upl(buf_t bp) {
998
999        return (bp->b_upl);
1000}
1001
1002uint32_t
1003buf_uploffset(buf_t bp) {
1004
1005        return ((uint32_t)(bp->b_uploffset));
1006}
1007
1008proc_t
1009buf_proc(buf_t bp) {
1010
1011        return (bp->b_proc);
1012}
1013
1014
1015errno_t
1016buf_map(buf_t bp, caddr_t *io_addr)
1017{
1018        buf_t		real_bp;
1019        vm_offset_t	vaddr;
1020        kern_return_t	kret;
1021
1022        if ( !(bp->b_flags & B_CLUSTER)) {
1023	        *io_addr = (caddr_t)bp->b_datap;
1024		return (0);
1025	}
1026	real_bp = (buf_t)(bp->b_real_bp);
1027
1028	if (real_bp && real_bp->b_datap) {
1029	        /*
1030		 * b_real_bp is only valid if B_CLUSTER is SET
1031		 * if it's non-zero, than someone did a cluster_bp call
1032		 * if the backing physical pages were already mapped
1033		 * in before the call to cluster_bp (non-zero b_datap),
1034		 * than we just use that mapping
1035		 */
1036	        *io_addr = (caddr_t)real_bp->b_datap;
1037		return (0);
1038	}
1039	kret = ubc_upl_map(bp->b_upl, &vaddr);    /* Map it in */
1040
1041	if (kret != KERN_SUCCESS) {
1042	        *io_addr = NULL;
1043
1044	        return(ENOMEM);
1045	}
1046	vaddr += bp->b_uploffset;
1047
1048	*io_addr = (caddr_t)vaddr;
1049
1050	return (0);
1051}
1052
1053errno_t
1054buf_unmap(buf_t bp)
1055{
1056        buf_t		real_bp;
1057        kern_return_t	kret;
1058
1059        if ( !(bp->b_flags & B_CLUSTER))
1060	        return (0);
1061	/*
1062	 * see buf_map for the explanation
1063	 */
1064	real_bp = (buf_t)(bp->b_real_bp);
1065
1066	if (real_bp && real_bp->b_datap)
1067	        return (0);
1068
1069	if ((bp->b_lflags & BL_IOBUF) &&
1070	    ((bp->b_flags & (B_PAGEIO | B_READ)) != (B_PAGEIO | B_READ))) {
1071	        /*
1072		 * ignore pageins... the 'right' thing will
1073		 * happen due to the way we handle speculative
1074		 * clusters...
1075		 *
1076		 * when we commit these pages, we'll hit
1077		 * it with UPL_COMMIT_INACTIVE which
1078		 * will clear the reference bit that got
1079		 * turned on when we touched the mapping
1080		 */
1081	        bp->b_flags |= B_AGE;
1082	}
1083	kret = ubc_upl_unmap(bp->b_upl);
1084
1085	if (kret != KERN_SUCCESS)
1086	        return (EINVAL);
1087	return (0);
1088}
1089
1090
1091void
1092buf_clear(buf_t bp) {
1093        caddr_t baddr;
1094
1095        if (buf_map(bp, &baddr) == 0) {
1096	        bzero(baddr, bp->b_bcount);
1097		buf_unmap(bp);
1098	}
1099	bp->b_resid = 0;
1100}
1101
1102/*
1103 * Read or write a buffer that is not contiguous on disk.
1104 * buffer is marked done/error at the conclusion
1105 */
1106static int
1107buf_strategy_fragmented(vnode_t devvp, buf_t bp, off_t f_offset, size_t contig_bytes)
1108{
1109	vnode_t	vp = buf_vnode(bp);
1110	buf_t	io_bp;			 /* For reading or writing a single block */
1111	int	io_direction;
1112	int	io_resid;
1113	size_t	io_contig_bytes;
1114        daddr64_t io_blkno;
1115	int	error = 0;
1116	int	bmap_flags;
1117
1118	/*
1119	 * save our starting point... the bp was already mapped
1120	 * in buf_strategy before we got called
1121	 * no sense doing it again.
1122	 */
1123	io_blkno = bp->b_blkno;
1124	/*
1125	 * Make sure we redo this mapping for the next I/O
1126	 * i.e. this can never be a 'permanent' mapping
1127	 */
1128	bp->b_blkno = bp->b_lblkno;
1129
1130	/*
1131	 * Get an io buffer to do the deblocking
1132	 */
1133	io_bp = alloc_io_buf(devvp, 0);
1134
1135	io_bp->b_lblkno = bp->b_lblkno;
1136	io_bp->b_datap  = bp->b_datap;
1137	io_resid	= bp->b_bcount;
1138        io_direction	= bp->b_flags & B_READ;
1139	io_contig_bytes = contig_bytes;
1140
1141	if (bp->b_flags & B_READ)
1142	        bmap_flags = VNODE_READ;
1143	else
1144	        bmap_flags = VNODE_WRITE;
1145
1146	for (;;) {
1147		if (io_blkno == -1)
1148		        /*
1149			 * this is unexepected, but we'll allow for it
1150			 */
1151		        bzero((caddr_t)io_bp->b_datap, (int)io_contig_bytes);
1152		else {
1153		        io_bp->b_bcount	 = io_contig_bytes;
1154			io_bp->b_bufsize = io_contig_bytes;
1155			io_bp->b_resid   = io_contig_bytes;
1156			io_bp->b_blkno   = io_blkno;
1157
1158			buf_reset(io_bp, io_direction);
1159
1160			/*
1161			 * Call the device to do the I/O and wait for it.  Make sure the appropriate party is charged for write
1162			 */
1163
1164			if (!ISSET(bp->b_flags, B_READ))
1165			        OSAddAtomic(1, &devvp->v_numoutput);
1166
1167			if ((error = VNOP_STRATEGY(io_bp)))
1168			        break;
1169			if ((error = (int)buf_biowait(io_bp)))
1170			        break;
1171			if (io_bp->b_resid) {
1172			        io_resid -= (io_contig_bytes - io_bp->b_resid);
1173				break;
1174			}
1175		}
1176		if ((io_resid -= io_contig_bytes) == 0)
1177		        break;
1178		f_offset       += io_contig_bytes;
1179		io_bp->b_datap += io_contig_bytes;
1180
1181		/*
1182		 * Map the current position to a physical block number
1183		 */
1184		if ((error = VNOP_BLOCKMAP(vp, f_offset, io_resid, &io_blkno, &io_contig_bytes, NULL, bmap_flags, NULL)))
1185		        break;
1186	}
1187	buf_free(io_bp);
1188
1189	if (error)
1190	        buf_seterror(bp, error);
1191	bp->b_resid = io_resid;
1192	/*
1193	 * This I/O is now complete
1194	 */
1195	buf_biodone(bp);
1196
1197	return error;
1198}
1199
1200
1201/*
1202 * struct vnop_strategy_args {
1203 *      struct buf *a_bp;
1204 * } *ap;
1205 */
1206errno_t
1207buf_strategy(vnode_t devvp, void *ap)
1208{
1209        buf_t	bp = ((struct vnop_strategy_args *)ap)->a_bp;
1210	vnode_t	vp = bp->b_vp;
1211	int	bmap_flags;
1212        errno_t error;
1213#if CONFIG_DTRACE
1214	int dtrace_io_start_flag = 0;	 /* We only want to trip the io:::start
1215					  * probe once, with the true physical
1216					  * block in place (b_blkno)
1217					  */
1218
1219#endif
1220
1221	if (vp == NULL || vp->v_type == VCHR || vp->v_type == VBLK)
1222	        panic("buf_strategy: b_vp == NULL || vtype == VCHR | VBLK\n");
1223	/*
1224	 * associate the physical device with
1225	 * with this buf_t even if we don't
1226	 * end up issuing the I/O...
1227	 */
1228	bp->b_dev = devvp->v_rdev;
1229
1230	if (bp->b_flags & B_READ)
1231	        bmap_flags = VNODE_READ;
1232	else
1233	        bmap_flags = VNODE_WRITE;
1234
1235        if ( !(bp->b_flags & B_CLUSTER)) {
1236
1237	        if ( (bp->b_upl) ) {
1238		        /*
1239			 * we have a UPL associated with this bp
1240			 * go through cluster_bp which knows how
1241			 * to deal with filesystem block sizes
1242			 * that aren't equal to the page size
1243			 */
1244			DTRACE_IO1(start, buf_t, bp);
1245		        return (cluster_bp(bp));
1246		}
1247		if (bp->b_blkno == bp->b_lblkno) {
1248		    off_t	f_offset;
1249			size_t 	contig_bytes;
1250
1251			if ((error = VNOP_BLKTOOFF(vp, bp->b_lblkno, &f_offset))) {
1252				DTRACE_IO1(start, buf_t, bp);
1253			        buf_seterror(bp, error);
1254				buf_biodone(bp);
1255
1256			    return (error);
1257			}
1258
1259		if ((error = VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))) {
1260				DTRACE_IO1(start, buf_t, bp);
1261			        buf_seterror(bp, error);
1262				buf_biodone(bp);
1263
1264			        return (error);
1265			}
1266
1267			DTRACE_IO1(start, buf_t, bp);
1268#if CONFIG_DTRACE
1269			dtrace_io_start_flag = 1;
1270#endif /* CONFIG_DTRACE */
1271
1272			if ((bp->b_blkno == -1) || (contig_bytes == 0)) {
1273				/* Set block number to force biodone later */
1274				bp->b_blkno = -1;
1275			        buf_clear(bp);
1276			}
1277			else if ((long)contig_bytes < bp->b_bcount) {
1278			        return (buf_strategy_fragmented(devvp, bp, f_offset, contig_bytes));
1279			}
1280		}
1281
1282#if CONFIG_DTRACE
1283		if (dtrace_io_start_flag == 0) {
1284			DTRACE_IO1(start, buf_t, bp);
1285			dtrace_io_start_flag = 1;
1286		}
1287#endif /* CONFIG_DTRACE */
1288
1289		if (bp->b_blkno == -1) {
1290		        buf_biodone(bp);
1291			return (0);
1292		}
1293	}
1294
1295#if CONFIG_DTRACE
1296	if (dtrace_io_start_flag == 0)
1297		DTRACE_IO1(start, buf_t, bp);
1298#endif /* CONFIG_DTRACE */
1299
1300#if CONFIG_PROTECT
1301	/* Capture f_offset in the bufattr*/
1302	if (bp->b_attr.ba_cpentry != 0) {
1303		/* No need to go here for older EAs */
1304		if(bp->b_attr.ba_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
1305			off_t f_offset;
1306			if ((error = VNOP_BLKTOOFF(bp->b_vp, bp->b_lblkno, &f_offset)))
1307				return error;
1308
1309			/*
1310			 * Attach the file offset to this buffer.  The
1311			 * bufattr attributes will be passed down the stack
1312			 * until they reach IOFlashStorage.  IOFlashStorage
1313			 * will retain the offset in a local variable when it
1314			 * issues its I/Os to the NAND controller.
1315			 *
1316			 * Note that LwVM may end up splitting this I/O
1317			 * into sub-I/Os if it crosses a chunk boundary.  In this
1318			 * case, LwVM will update this field when it dispatches
1319			 * each I/O to IOFlashStorage.  But from our perspective
1320			 * we have only issued a single I/O.
1321			 */
1322			bufattr_setcpoff (&(bp->b_attr), (u_int64_t)f_offset);
1323		}
1324	}
1325#endif
1326
1327	/*
1328	 * we can issue the I/O because...
1329	 * either B_CLUSTER is set which
1330	 * means that the I/O is properly set
1331	 * up to be a multiple of the page size, or
1332	 * we were able to successfully set up the
1333	 * physical block mapping
1334	 */
1335	error = VOCALL(devvp->v_op, VOFFSET(vnop_strategy), ap);
1336	DTRACE_FSINFO(strategy, vnode_t, vp);
1337	return (error);
1338}
1339
1340
1341
1342buf_t
1343buf_alloc(vnode_t vp)
1344{
1345        return(alloc_io_buf(vp, 0));
1346}
1347
1348void
1349buf_free(buf_t bp) {
1350
1351        free_io_buf(bp);
1352}
1353
1354
1355/*
1356 * iterate buffers for the specified vp.
1357 *   if BUF_SCAN_DIRTY is set, do the dirty list
1358 *   if BUF_SCAN_CLEAN is set, do the clean list
1359 *   if neither flag is set, default to BUF_SCAN_DIRTY
1360 *   if BUF_NOTIFY_BUSY is set, call the callout function using a NULL bp for busy pages
1361 */
1362
1363struct buf_iterate_info_t {
1364	int flag;
1365	struct buflists *listhead;
1366};
1367
1368void
1369buf_iterate(vnode_t vp, int (*callout)(buf_t, void *), int flags, void *arg)
1370{
1371	buf_t 	bp;
1372	int	retval;
1373	struct	buflists local_iterblkhd;
1374	int	lock_flags = BAC_NOWAIT | BAC_REMOVE;
1375	int	notify_busy = flags & BUF_NOTIFY_BUSY;
1376	struct buf_iterate_info_t list[2];
1377	int	num_lists, i;
1378
1379	if (flags & BUF_SKIP_LOCKED)
1380	        lock_flags |= BAC_SKIP_LOCKED;
1381	if (flags & BUF_SKIP_NONLOCKED)
1382	        lock_flags |= BAC_SKIP_NONLOCKED;
1383
1384	if ( !(flags & (BUF_SCAN_DIRTY | BUF_SCAN_CLEAN)))
1385	        flags |= BUF_SCAN_DIRTY;
1386
1387	num_lists = 0;
1388
1389	if (flags & BUF_SCAN_DIRTY) {
1390	        list[num_lists].flag = VBI_DIRTY;
1391		list[num_lists].listhead = &vp->v_dirtyblkhd;
1392		num_lists++;
1393	}
1394	if (flags & BUF_SCAN_CLEAN) {
1395		list[num_lists].flag = VBI_CLEAN;
1396		list[num_lists].listhead = &vp->v_cleanblkhd;
1397		num_lists++;
1398	}
1399
1400	for (i = 0; i < num_lists; i++) {
1401		lck_mtx_lock(buf_mtxp);
1402
1403		if (buf_iterprepare(vp, &local_iterblkhd, list[i].flag))  {
1404			lck_mtx_unlock(buf_mtxp);
1405			continue;
1406		}
1407		while (!LIST_EMPTY(&local_iterblkhd)) {
1408			bp = LIST_FIRST(&local_iterblkhd);
1409			LIST_REMOVE(bp, b_vnbufs);
1410			LIST_INSERT_HEAD(list[i].listhead, bp, b_vnbufs);
1411
1412			if (buf_acquire_locked(bp, lock_flags, 0, 0)) {
1413				if (notify_busy) {
1414					bp = NULL;
1415				} else {
1416					continue;
1417				}
1418			}
1419
1420			lck_mtx_unlock(buf_mtxp);
1421
1422			retval = callout(bp, arg);
1423
1424			switch (retval) {
1425			case BUF_RETURNED:
1426				if (bp)
1427					buf_brelse(bp);
1428				break;
1429			case BUF_CLAIMED:
1430				break;
1431			case BUF_RETURNED_DONE:
1432				if (bp)
1433					buf_brelse(bp);
1434				lck_mtx_lock(buf_mtxp);
1435				goto out;
1436			case BUF_CLAIMED_DONE:
1437				lck_mtx_lock(buf_mtxp);
1438				goto out;
1439			}
1440			lck_mtx_lock(buf_mtxp);
1441		} /* while list has more nodes */
1442	  out:
1443		buf_itercomplete(vp, &local_iterblkhd, list[i].flag);
1444		lck_mtx_unlock(buf_mtxp);
1445	} /* for each list */
1446} /* buf_iterate */
1447
1448
1449/*
1450 * Flush out and invalidate all buffers associated with a vnode.
1451 */
1452int
1453buf_invalidateblks(vnode_t vp, int flags, int slpflag, int slptimeo)
1454{
1455	buf_t	bp;
1456	int	aflags;
1457	int	error = 0;
1458	int	must_rescan = 1;
1459	struct	buflists local_iterblkhd;
1460
1461
1462	if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1463		return (0);
1464
1465	lck_mtx_lock(buf_mtxp);
1466
1467	for (;;) {
1468		if (must_rescan == 0)
1469		        /*
1470			 * the lists may not be empty, but all that's left at this
1471			 * point are metadata or B_LOCKED buffers which are being
1472			 * skipped... we know this because we made it through both
1473			 * the clean and dirty lists without dropping buf_mtxp...
1474			 * each time we drop buf_mtxp we bump "must_rescan"
1475			 */
1476		        break;
1477		if (LIST_EMPTY(&vp->v_cleanblkhd) && LIST_EMPTY(&vp->v_dirtyblkhd))
1478		        break;
1479		must_rescan = 0;
1480		/*
1481		 * iterate the clean list
1482		 */
1483		if (buf_iterprepare(vp, &local_iterblkhd, VBI_CLEAN)) {
1484		        goto try_dirty_list;
1485		}
1486		while (!LIST_EMPTY(&local_iterblkhd)) {
1487
1488			bp = LIST_FIRST(&local_iterblkhd);
1489
1490			LIST_REMOVE(bp, b_vnbufs);
1491			LIST_INSERT_HEAD(&vp->v_cleanblkhd, bp, b_vnbufs);
1492
1493			/*
1494			 * some filesystems distinguish meta data blocks with a negative logical block #
1495			 */
1496			if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1497				continue;
1498
1499			aflags = BAC_REMOVE;
1500
1501			if ( !(flags & BUF_INVALIDATE_LOCKED) )
1502				aflags |= BAC_SKIP_LOCKED;
1503
1504			if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
1505			        if (error == EDEADLK)
1506				        /*
1507					 * this buffer was marked B_LOCKED...
1508					 * we didn't drop buf_mtxp, so we
1509					 * we don't need to rescan
1510					 */
1511				        continue;
1512			        if (error == EAGAIN) {
1513				        /*
1514					 * found a busy buffer... we blocked and
1515					 * dropped buf_mtxp, so we're going to
1516					 * need to rescan after this pass is completed
1517					 */
1518				        must_rescan++;
1519				        continue;
1520				}
1521				/*
1522				 * got some kind of 'real' error out of the msleep
1523				 * in buf_acquire_locked, terminate the scan and return the error
1524				 */
1525				buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1526
1527				lck_mtx_unlock(buf_mtxp);
1528				return (error);
1529			}
1530			lck_mtx_unlock(buf_mtxp);
1531
1532			if (bp->b_flags & B_LOCKED)
1533				KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 0, 0);
1534
1535			CLR(bp->b_flags, B_LOCKED);
1536			SET(bp->b_flags, B_INVAL);
1537			buf_brelse(bp);
1538
1539			lck_mtx_lock(buf_mtxp);
1540
1541			/*
1542			 * by dropping buf_mtxp, we allow new
1543			 * buffers to be added to the vnode list(s)
1544			 * we'll have to rescan at least once more
1545			 * if the queues aren't empty
1546			 */
1547			must_rescan++;
1548		}
1549		buf_itercomplete(vp, &local_iterblkhd, VBI_CLEAN);
1550
1551try_dirty_list:
1552		/*
1553		 * Now iterate on dirty blks
1554		 */
1555		if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY)) {
1556			continue;
1557		}
1558		while (!LIST_EMPTY(&local_iterblkhd)) {
1559			bp = LIST_FIRST(&local_iterblkhd);
1560
1561			LIST_REMOVE(bp, b_vnbufs);
1562			LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1563
1564			/*
1565			 * some filesystems distinguish meta data blocks with a negative logical block #
1566			 */
1567			if ((flags & BUF_SKIP_META) && (bp->b_lblkno < 0 || ISSET(bp->b_flags, B_META)))
1568				continue;
1569
1570			aflags = BAC_REMOVE;
1571
1572			if ( !(flags & BUF_INVALIDATE_LOCKED) )
1573				aflags |= BAC_SKIP_LOCKED;
1574
1575			if ( (error = (int)buf_acquire_locked(bp, aflags, slpflag, slptimeo)) ) {
1576			        if (error == EDEADLK)
1577				        /*
1578					 * this buffer was marked B_LOCKED...
1579					 * we didn't drop buf_mtxp, so we
1580					 * we don't need to rescan
1581					 */
1582				        continue;
1583			        if (error == EAGAIN) {
1584				        /*
1585					 * found a busy buffer... we blocked and
1586					 * dropped buf_mtxp, so we're going to
1587					 * need to rescan after this pass is completed
1588					 */
1589				        must_rescan++;
1590				        continue;
1591				}
1592				/*
1593				 * got some kind of 'real' error out of the msleep
1594				 * in buf_acquire_locked, terminate the scan and return the error
1595				 */
1596				buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1597
1598				lck_mtx_unlock(buf_mtxp);
1599				return (error);
1600			}
1601			lck_mtx_unlock(buf_mtxp);
1602
1603			if (bp->b_flags & B_LOCKED)
1604				KERNEL_DEBUG(0xbbbbc038, bp, 0, 0, 1, 0);
1605
1606			CLR(bp->b_flags, B_LOCKED);
1607			SET(bp->b_flags, B_INVAL);
1608
1609			if (ISSET(bp->b_flags, B_DELWRI) && (flags & BUF_WRITE_DATA))
1610				(void) VNOP_BWRITE(bp);
1611			else
1612				buf_brelse(bp);
1613
1614			lck_mtx_lock(buf_mtxp);
1615			/*
1616			 * by dropping buf_mtxp, we allow new
1617			 * buffers to be added to the vnode list(s)
1618			 * we'll have to rescan at least once more
1619			 * if the queues aren't empty
1620			 */
1621			must_rescan++;
1622		}
1623		buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1624	}
1625	lck_mtx_unlock(buf_mtxp);
1626
1627	return (0);
1628}
1629
1630void
1631buf_flushdirtyblks(vnode_t vp, int wait, int flags, const char *msg) {
1632
1633	(void) buf_flushdirtyblks_skipinfo(vp, wait, flags, msg);
1634	return;
1635}
1636
1637int
1638buf_flushdirtyblks_skipinfo(vnode_t vp, int wait, int flags, const char *msg) {
1639	buf_t	bp;
1640	int	writes_issued = 0;
1641	errno_t	error;
1642	int	busy = 0;
1643	struct	buflists local_iterblkhd;
1644	int	lock_flags = BAC_NOWAIT | BAC_REMOVE;
1645	int any_locked = 0;
1646
1647	if (flags & BUF_SKIP_LOCKED)
1648	        lock_flags |= BAC_SKIP_LOCKED;
1649	if (flags & BUF_SKIP_NONLOCKED)
1650	        lock_flags |= BAC_SKIP_NONLOCKED;
1651loop:
1652	lck_mtx_lock(buf_mtxp);
1653
1654	if (buf_iterprepare(vp, &local_iterblkhd, VBI_DIRTY) == 0)  {
1655	        while (!LIST_EMPTY(&local_iterblkhd)) {
1656			bp = LIST_FIRST(&local_iterblkhd);
1657			LIST_REMOVE(bp, b_vnbufs);
1658			LIST_INSERT_HEAD(&vp->v_dirtyblkhd, bp, b_vnbufs);
1659
1660			if ((error = buf_acquire_locked(bp, lock_flags, 0, 0)) == EBUSY) {
1661				busy++;
1662			}
1663			if (error) {
1664				/*
1665				 * If we passed in BUF_SKIP_LOCKED or BUF_SKIP_NONLOCKED,
1666				 * we may want to do somethign differently if a locked or unlocked
1667				 * buffer was encountered (depending on the arg specified).
1668				 * In this case, we know that one of those two was set, and the
1669				 * buf acquisition failed above.
1670				 *
1671				 * If it failed with EDEADLK, then save state which can be emitted
1672				 * later on to the caller.  Most callers should not care.
1673				 */
1674				if (error == EDEADLK) {
1675					any_locked++;
1676				}
1677				continue;
1678			}
1679			lck_mtx_unlock(buf_mtxp);
1680
1681			bp->b_flags &= ~B_LOCKED;
1682
1683			/*
1684			 * Wait for I/O associated with indirect blocks to complete,
1685			 * since there is no way to quickly wait for them below.
1686			 */
1687			if ((bp->b_vp == vp) || (wait == 0))
1688			        (void) buf_bawrite(bp);
1689			else
1690			        (void) VNOP_BWRITE(bp);
1691			writes_issued++;
1692
1693			lck_mtx_lock(buf_mtxp);
1694		}
1695		buf_itercomplete(vp, &local_iterblkhd, VBI_DIRTY);
1696	}
1697	lck_mtx_unlock(buf_mtxp);
1698
1699	if (wait) {
1700	        (void)vnode_waitforwrites(vp, 0, 0, 0, msg);
1701
1702		if (vp->v_dirtyblkhd.lh_first && busy) {
1703		        /*
1704			 * we had one or more BUSY buffers on
1705			 * the dirtyblock list... most likely
1706			 * these are due to delayed writes that
1707			 * were moved to the bclean queue but
1708			 * have not yet been 'written'.
1709			 * if we issued some writes on the
1710			 * previous pass, we try again immediately
1711			 * if we didn't, we'll sleep for some time
1712			 * to allow the state to change...
1713			 */
1714		        if (writes_issued == 0) {
1715			        (void)tsleep((caddr_t)&vp->v_numoutput,
1716					     PRIBIO + 1, "vnode_flushdirtyblks", hz/20);
1717			}
1718			writes_issued = 0;
1719			busy = 0;
1720
1721			goto loop;
1722		}
1723	}
1724
1725	return any_locked;
1726}
1727
1728
1729/*
1730 * called with buf_mtxp held...
1731 * this lock protects the queue manipulation
1732 */
1733static int
1734buf_iterprepare(vnode_t vp, struct buflists *iterheadp, int flags)
1735{
1736	struct buflists * listheadp;
1737
1738	if (flags & VBI_DIRTY)
1739		listheadp = &vp->v_dirtyblkhd;
1740	else
1741		listheadp = &vp->v_cleanblkhd;
1742
1743	while (vp->v_iterblkflags & VBI_ITER) 	{
1744	        vp->v_iterblkflags |= VBI_ITERWANT;
1745		msleep(&vp->v_iterblkflags, buf_mtxp, 0, "buf_iterprepare", NULL);
1746	}
1747	if (LIST_EMPTY(listheadp)) {
1748	        LIST_INIT(iterheadp);
1749		return(EINVAL);
1750	}
1751	vp->v_iterblkflags |= VBI_ITER;
1752
1753	iterheadp->lh_first = listheadp->lh_first;
1754	listheadp->lh_first->b_vnbufs.le_prev = &iterheadp->lh_first;
1755	LIST_INIT(listheadp);
1756
1757	return(0);
1758}
1759
1760/*
1761 * called with buf_mtxp held...
1762 * this lock protects the queue manipulation
1763 */
1764static void
1765buf_itercomplete(vnode_t vp, struct buflists *iterheadp, int flags)
1766{
1767	struct buflists * listheadp;
1768	buf_t bp;
1769
1770	if (flags & VBI_DIRTY)
1771		listheadp = &vp->v_dirtyblkhd;
1772	else
1773		listheadp = &vp->v_cleanblkhd;
1774
1775	while (!LIST_EMPTY(iterheadp)) {
1776		bp = LIST_FIRST(iterheadp);
1777		LIST_REMOVE(bp, b_vnbufs);
1778		LIST_INSERT_HEAD(listheadp, bp, b_vnbufs);
1779	}
1780	vp->v_iterblkflags &= ~VBI_ITER;
1781
1782	if  (vp->v_iterblkflags & VBI_ITERWANT) 	{
1783		vp->v_iterblkflags &= ~VBI_ITERWANT;
1784		wakeup(&vp->v_iterblkflags);
1785	}
1786}
1787
1788
1789static void
1790bremfree_locked(buf_t bp)
1791{
1792	struct bqueues *dp = NULL;
1793	int whichq;
1794
1795	whichq = bp->b_whichq;
1796
1797	if (whichq == -1) {
1798		if (bp->b_shadow_ref == 0)
1799			panic("bremfree_locked: %p not on freelist", bp);
1800		/*
1801		 * there are clones pointing to 'bp'...
1802		 * therefore, it was not put on a freelist
1803		 * when buf_brelse was last called on 'bp'
1804		 */
1805		return;
1806	}
1807	/*
1808	 * We only calculate the head of the freelist when removing
1809	 * the last element of the list as that is the only time that
1810	 * it is needed (e.g. to reset the tail pointer).
1811	 *
1812	 * NB: This makes an assumption about how tailq's are implemented.
1813	 */
1814	if (bp->b_freelist.tqe_next == NULL) {
1815	        dp = &bufqueues[whichq];
1816
1817		if (dp->tqh_last != &bp->b_freelist.tqe_next)
1818			panic("bremfree: lost tail");
1819	}
1820	TAILQ_REMOVE(dp, bp, b_freelist);
1821
1822#if BALANCE_QUEUES
1823	bufqdec(whichq);
1824#endif
1825	if (whichq == BQ_LAUNDRY)
1826	        blaundrycnt--;
1827
1828	bp->b_whichq = -1;
1829	bp->b_timestamp = 0;
1830	bp->b_shadow = 0;
1831}
1832
1833/*
1834 * Associate a buffer with a vnode.
1835 * buf_mtxp must be locked on entry
1836 */
1837static void
1838bgetvp_locked(vnode_t vp, buf_t bp)
1839{
1840
1841	if (bp->b_vp != vp)
1842		panic("bgetvp_locked: not free");
1843
1844	if (vp->v_type == VBLK || vp->v_type == VCHR)
1845		bp->b_dev = vp->v_rdev;
1846	else
1847		bp->b_dev = NODEV;
1848	/*
1849	 * Insert onto list for new vnode.
1850	 */
1851	bufinsvn(bp, &vp->v_cleanblkhd);
1852}
1853
1854/*
1855 * Disassociate a buffer from a vnode.
1856 * buf_mtxp must be locked on entry
1857 */
1858static void
1859brelvp_locked(buf_t bp)
1860{
1861	/*
1862	 * Delete from old vnode list, if on one.
1863	 */
1864	if (bp->b_vnbufs.le_next != NOLIST)
1865		bufremvn(bp);
1866
1867	bp->b_vp = (vnode_t)NULL;
1868}
1869
1870/*
1871 * Reassign a buffer from one vnode to another.
1872 * Used to assign file specific control information
1873 * (indirect blocks) to the vnode to which they belong.
1874 */
1875static void
1876buf_reassign(buf_t bp, vnode_t newvp)
1877{
1878	struct buflists *listheadp;
1879
1880	if (newvp == NULL) {
1881		printf("buf_reassign: NULL");
1882		return;
1883	}
1884	lck_mtx_lock_spin(buf_mtxp);
1885
1886	/*
1887	 * Delete from old vnode list, if on one.
1888	 */
1889	if (bp->b_vnbufs.le_next != NOLIST)
1890		bufremvn(bp);
1891	/*
1892	 * If dirty, put on list of dirty buffers;
1893	 * otherwise insert onto list of clean buffers.
1894	 */
1895	if (ISSET(bp->b_flags, B_DELWRI))
1896		listheadp = &newvp->v_dirtyblkhd;
1897	else
1898		listheadp = &newvp->v_cleanblkhd;
1899	bufinsvn(bp, listheadp);
1900
1901	lck_mtx_unlock(buf_mtxp);
1902}
1903
1904static __inline__ void
1905bufhdrinit(buf_t bp)
1906{
1907	bzero((char *)bp, sizeof *bp);
1908	bp->b_dev = NODEV;
1909	bp->b_rcred = NOCRED;
1910	bp->b_wcred = NOCRED;
1911	bp->b_vnbufs.le_next = NOLIST;
1912	bp->b_flags = B_INVAL;
1913
1914	return;
1915}
1916
1917/*
1918 * Initialize buffers and hash links for buffers.
1919 */
1920__private_extern__ void
1921bufinit(void)
1922{
1923	buf_t	bp;
1924	struct bqueues *dp;
1925	int	i;
1926
1927	nbuf_headers = 0;
1928	/* Initialize the buffer queues ('freelists') and the hash table */
1929	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
1930		TAILQ_INIT(dp);
1931	bufhashtbl = hashinit(nbuf_hashelements, M_CACHE, &bufhash);
1932
1933	buf_busycount = 0;
1934
1935	/* Initialize the buffer headers */
1936	for (i = 0; i < max_nbuf_headers; i++) {
1937		nbuf_headers++;
1938		bp = &buf_headers[i];
1939		bufhdrinit(bp);
1940
1941		BLISTNONE(bp);
1942		dp = &bufqueues[BQ_EMPTY];
1943		bp->b_whichq = BQ_EMPTY;
1944		bp->b_timestamp = buf_timestamp();
1945		binsheadfree(bp, dp, BQ_EMPTY);
1946		binshash(bp, &invalhash);
1947	}
1948	boot_nbuf_headers = nbuf_headers;
1949
1950	TAILQ_INIT(&iobufqueue);
1951	TAILQ_INIT(&delaybufqueue);
1952
1953	for (; i < nbuf_headers + niobuf_headers; i++) {
1954		bp = &buf_headers[i];
1955		bufhdrinit(bp);
1956		bp->b_whichq = -1;
1957		binsheadfree(bp, &iobufqueue, -1);
1958	}
1959
1960	/*
1961	 * allocate lock group attribute and group
1962	 */
1963	buf_mtx_grp_attr = lck_grp_attr_alloc_init();
1964	buf_mtx_grp = lck_grp_alloc_init("buffer cache", buf_mtx_grp_attr);
1965
1966	/*
1967	 * allocate the lock attribute
1968	 */
1969	buf_mtx_attr = lck_attr_alloc_init();
1970
1971	/*
1972	 * allocate and initialize mutex's for the buffer and iobuffer pools
1973	 */
1974	buf_mtxp	= lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1975	iobuffer_mtxp	= lck_mtx_alloc_init(buf_mtx_grp, buf_mtx_attr);
1976
1977	if (iobuffer_mtxp == NULL)
1978	        panic("couldn't create iobuffer mutex");
1979
1980	if (buf_mtxp == NULL)
1981	        panic("couldn't create buf mutex");
1982
1983	/*
1984	 * allocate and initialize cluster specific global locks...
1985	 */
1986	cluster_init();
1987
1988	printf("using %d buffer headers and %d cluster IO buffer headers\n",
1989		nbuf_headers, niobuf_headers);
1990
1991	/* Set up zones used by the buffer cache */
1992	bufzoneinit();
1993
1994	/* start the bcleanbuf() thread */
1995	bcleanbuf_thread_init();
1996
1997	/* Register a callout for relieving vm pressure */
1998	if (vm_set_buffer_cleanup_callout(buffer_cache_gc) != KERN_SUCCESS) {
1999		panic("Couldn't register buffer cache callout for vm pressure!\n");
2000	}
2001
2002#if BALANCE_QUEUES
2003	{
2004	static void bufq_balance_thread_init(void);
2005	/* create a thread to do dynamic buffer queue balancing */
2006	bufq_balance_thread_init();
2007	}
2008#endif /* notyet */
2009}
2010
2011
2012
2013/*
2014 * Zones for the meta data buffers
2015 */
2016
2017#define MINMETA 512
2018#define MAXMETA 8192
2019
2020struct meta_zone_entry {
2021	zone_t mz_zone;
2022	vm_size_t mz_size;
2023	vm_size_t mz_max;
2024	const char *mz_name;
2025};
2026
2027struct meta_zone_entry meta_zones[] = {
2028	{NULL, (MINMETA * 1), 128 * (MINMETA * 1), "buf.512" },
2029	{NULL, (MINMETA * 2),  64 * (MINMETA * 2), "buf.1024" },
2030	{NULL, (MINMETA * 4),  16 * (MINMETA * 4), "buf.2048" },
2031	{NULL, (MINMETA * 8), 512 * (MINMETA * 8), "buf.4096" },
2032	{NULL, (MINMETA * 16), 512 * (MINMETA * 16), "buf.8192" },
2033	{NULL, 0, 0, "" } /* End */
2034};
2035
2036/*
2037 * Initialize the meta data zones
2038 */
2039static void
2040bufzoneinit(void)
2041{
2042	int i;
2043
2044	for (i = 0; meta_zones[i].mz_size != 0; i++) {
2045		meta_zones[i].mz_zone =
2046				zinit(meta_zones[i].mz_size,
2047					meta_zones[i].mz_max,
2048					PAGE_SIZE,
2049					meta_zones[i].mz_name);
2050		zone_change(meta_zones[i].mz_zone, Z_CALLERACCT, FALSE);
2051	}
2052	buf_hdr_zone = zinit(sizeof(struct buf), 32, PAGE_SIZE, "buf headers");
2053	zone_change(buf_hdr_zone, Z_CALLERACCT, FALSE);
2054}
2055
2056static __inline__ zone_t
2057getbufzone(size_t size)
2058{
2059	int i;
2060
2061	if ((size % 512) || (size < MINMETA) || (size > MAXMETA))
2062		panic("getbufzone: incorect size = %lu", size);
2063
2064	for (i = 0; meta_zones[i].mz_size != 0; i++) {
2065		if (meta_zones[i].mz_size >= size)
2066			break;
2067	}
2068
2069	return (meta_zones[i].mz_zone);
2070}
2071
2072
2073
2074static struct buf *
2075bio_doread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, int async, int queuetype)
2076{
2077	buf_t	bp;
2078
2079	bp = buf_getblk(vp, blkno, size, 0, 0, queuetype);
2080
2081	/*
2082	 * If buffer does not have data valid, start a read.
2083	 * Note that if buffer is B_INVAL, buf_getblk() won't return it.
2084	 * Therefore, it's valid if it's I/O has completed or been delayed.
2085	 */
2086	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
2087		struct proc *p;
2088
2089		p = current_proc();
2090
2091		/* Start I/O for the buffer (keeping credentials). */
2092		SET(bp->b_flags, B_READ | async);
2093		if (IS_VALID_CRED(cred) && !IS_VALID_CRED(bp->b_rcred)) {
2094			kauth_cred_ref(cred);
2095			bp->b_rcred = cred;
2096		}
2097
2098		VNOP_STRATEGY(bp);
2099
2100		trace(TR_BREADMISS, pack(vp, size), blkno);
2101
2102		/* Pay for the read. */
2103		if (p && p->p_stats) {
2104			OSIncrementAtomicLong(&p->p_stats->p_ru.ru_inblock);		/* XXX */
2105			OSAddAtomic64(size, &p->p_stats->ri_diskiobytes.ri_bytesread);
2106		}
2107
2108		if (async) {
2109		        /*
2110			 * since we asked for an ASYNC I/O
2111			 * the biodone will do the brelse
2112			 * we don't want to pass back a bp
2113			 * that we don't 'own'
2114			 */
2115		        bp = NULL;
2116		}
2117	} else if (async) {
2118		buf_brelse(bp);
2119		bp = NULL;
2120	}
2121
2122	trace(TR_BREADHIT, pack(vp, size), blkno);
2123
2124	return (bp);
2125}
2126
2127/*
2128 * Perform the reads for buf_breadn() and buf_meta_breadn().
2129 * Trivial modification to the breada algorithm presented in Bach (p.55).
2130 */
2131static errno_t
2132do_breadn_for_type(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes,
2133		   int nrablks, kauth_cred_t cred, buf_t *bpp, int queuetype)
2134{
2135	buf_t	bp;
2136	int	i;
2137
2138	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, queuetype);
2139
2140	/*
2141	 * For each of the read-ahead blocks, start a read, if necessary.
2142	 */
2143	for (i = 0; i < nrablks; i++) {
2144		/* If it's in the cache, just go on to next one. */
2145		if (incore(vp, rablks[i]))
2146			continue;
2147
2148		/* Get a buffer for the read-ahead block */
2149		(void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC, queuetype);
2150	}
2151
2152	/* Otherwise, we had to start a read for it; wait until it's valid. */
2153	return (buf_biowait(bp));
2154}
2155
2156
2157/*
2158 * Read a disk block.
2159 * This algorithm described in Bach (p.54).
2160 */
2161errno_t
2162buf_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2163{
2164	buf_t	bp;
2165
2166	/* Get buffer for block. */
2167	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_READ);
2168
2169	/* Wait for the read to complete, and return result. */
2170	return (buf_biowait(bp));
2171}
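/*
 * Illustrative sketch only: how a filesystem client of this KPI might use
 * buf_bread().  "blksize", the use of NOCRED, and the error handling are
 * assumptions made for the example, not definitions from this file.
 *
 *	buf_t	bp;
 *	errno_t	error;
 *
 *	if ((error = buf_bread(vp, blkno, blksize, NOCRED, &bp))) {
 *		buf_brelse(bp);		// release even on error
 *		return (error);
 *	}
 *	// ... consume the data via buf_dataptr(bp) ...
 *	buf_brelse(bp);
 */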
2172
2173/*
2174 * Read a disk block. [bread() for meta-data]
 * This algorithm is described in Bach (p.54).
2176 */
2177errno_t
2178buf_meta_bread(vnode_t vp, daddr64_t blkno, int size, kauth_cred_t cred, buf_t *bpp)
2179{
2180	buf_t	bp;
2181
2182	/* Get buffer for block. */
2183	bp = *bpp = bio_doread(vp, blkno, size, cred, 0, BLK_META);
2184
2185	/* Wait for the read to complete, and return result. */
2186	return (buf_biowait(bp));
2187}
2188
2189/*
2190 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2191 */
2192errno_t
2193buf_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2194{
2195	return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_READ));
2196}
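/*
 * Illustrative sketch only: a buf_breadn() call that reads one block
 * synchronously and hints two read-ahead blocks.  The block numbers,
 * sizes, and use of NOCRED are arbitrary choices made for the example.
 *
 *	daddr64_t	rablks[2]  = { blkno + 1, blkno + 2 };
 *	int		rasizes[2] = { blksize, blksize };
 *	buf_t		bp;
 *	errno_t		error;
 *
 *	error = buf_breadn(vp, blkno, blksize, rablks, rasizes, 2, NOCRED, &bp);
 *	// bp holds the synchronously read block; the read-aheads (if not
 *	// already cached) were issued async and are released by buf_biodone
 */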
2197
2198/*
2199 * Read-ahead multiple disk blocks. The first is sync, the rest async.
2200 * [buf_breadn() for meta-data]
2201 */
2202errno_t
2203buf_meta_breadn(vnode_t vp, daddr64_t blkno, int size, daddr64_t *rablks, int *rasizes, int nrablks, kauth_cred_t cred, buf_t *bpp)
2204{
2205	return (do_breadn_for_type(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp, BLK_META));
2206}
2207
2208/*
2209 * Block write.  Described in Bach (p.56)
2210 */
2211errno_t
2212buf_bwrite(buf_t bp)
2213{
2214	int	sync, wasdelayed;
2215	errno_t	rv;
2216	proc_t	p = current_proc();
2217	vnode_t	vp = bp->b_vp;
2218
2219	if (bp->b_datap == 0) {
2220	        if (brecover_data(bp) == 0)
2221		        return (0);
2222	}
2223	/* Remember buffer type, to switch on it later. */
2224	sync = !ISSET(bp->b_flags, B_ASYNC);
2225	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
2226	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
2227
2228	if (wasdelayed)
2229		OSAddAtomicLong(-1, &nbdwrite);
2230
2231	if (!sync) {
2232		/*
2233		 * If not synchronous, pay for the I/O operation and make
2234		 * sure the buf is on the correct vnode queue.  We have
2235		 * to do this now, because if we don't, the vnode may not
2236		 * be properly notified that its I/O has completed.
2237		 */
2238		if (wasdelayed)
2239			buf_reassign(bp, vp);
2240		else
2241			if (p && p->p_stats) {
2242				OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);	/* XXX */
2243				OSAddAtomic64(buf_count(bp), &p->p_stats->ri_diskiobytes.ri_byteswritten);
2244			}
2245	}
2246	trace(TR_BUFWRITE, pack(vp, bp->b_bcount), bp->b_lblkno);
2247
2248	/* Initiate disk write.  Make sure the appropriate party is charged. */
2249
2250        OSAddAtomic(1, &vp->v_numoutput);
2251
2252	VNOP_STRATEGY(bp);
2253
2254	if (sync) {
2255		/*
2256		 * If I/O was synchronous, wait for it to complete.
2257		 */
2258		rv = buf_biowait(bp);
2259
2260		/*
2261		 * Pay for the I/O operation, if it's not been paid for, and
		 * make sure it's on the correct vnode queue. (async operations
		 * were paid for above.)
2264		 */
2265		if (wasdelayed)
2266			buf_reassign(bp, vp);
2267		else
2268			if (p && p->p_stats) {
2269				OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);	/* XXX */
2270				OSAddAtomic64(buf_count(bp), &p->p_stats->ri_diskiobytes.ri_byteswritten);
2271			}
2272
2273		/* Release the buffer. */
2274		// XXXdbg - only if the unused bit is set
2275		if (!ISSET(bp->b_flags, B_NORELSE)) {
2276		    buf_brelse(bp);
2277		} else {
2278		    CLR(bp->b_flags, B_NORELSE);
2279		}
2280
2281		return (rv);
2282	} else {
2283		return (0);
2284	}
2285}
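/*
 * Illustrative sketch only: a synchronous write through the buffer cache.
 * The buffer is obtained with buf_getblk(), filled in through buf_dataptr(),
 * and handed to buf_bwrite(), which waits for the I/O and releases the
 * buffer on the caller's behalf (unless B_NORELSE is set).  "blksize" is
 * an assumption made for the example.
 *
 *	buf_t	bp;
 *	errno_t	error;
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_WRITE);
 *	// ... fill in the data at buf_dataptr(bp) ...
 *	error = buf_bwrite(bp);		// synchronous: waits, then releases bp
 */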
2286
2287int
2288vn_bwrite(struct vnop_bwrite_args *ap)
2289{
2290	return (buf_bwrite(ap->a_bp));
2291}
2292
2293/*
2294 * Delayed write.
2295 *
2296 * The buffer is marked dirty, but is not queued for I/O.
2297 * This routine should be used when the buffer is expected
2298 * to be modified again soon, typically a small write that
2299 * partially fills a buffer.
2300 *
2301 * NB: magnetic tapes cannot be delayed; they must be
2302 * written in the order that the writes are requested.
2303 *
2304 * Described in Leffler, et al. (pp. 208-213).
2305 *
2306 * Note: With the ability to allocate additional buffer
 * headers, we can get into a situation where "too many"
 * buf_bdwrite()s create buffers faster than the disks can
 * service them. Doing a buf_bawrite() in cases where we have
 * "too many" outstanding buf_bdwrite()s avoids that.
2311 */
2312__private_extern__ int
2313bdwrite_internal(buf_t bp, int return_error)
2314{
2315	proc_t	p  = current_proc();
2316	vnode_t	vp = bp->b_vp;
2317
2318	/*
2319	 * If the block hasn't been seen before:
2320	 *	(1) Mark it as having been seen,
2321	 *	(2) Charge for the write.
2322	 *	(3) Make sure it's on its vnode's correct block list,
2323	 */
2324	if (!ISSET(bp->b_flags, B_DELWRI)) {
2325		SET(bp->b_flags, B_DELWRI);
2326		if (p && p->p_stats) {
2327			OSIncrementAtomicLong(&p->p_stats->p_ru.ru_oublock);	/* XXX */
2328			OSAddAtomic64(buf_count(bp), &p->p_stats->ri_diskiobytes.ri_byteswritten);
2329		}
2330		OSAddAtomicLong(1, &nbdwrite);
2331		buf_reassign(bp, vp);
2332	}
2333
2334	/*
	 * If we're not LOCKED, but the total number of delayed writes
	 * has climbed above 75% of the total buffers in the system,
	 * return an error if the caller has indicated that it can
	 * handle one in this case; otherwise schedule the I/O now.
	 * This is done to prevent us from allocating tons of extra
	 * buffers when dealing with virtual disks (e.g. DiskImages),
	 * because additional buffers are dynamically allocated to prevent
	 * deadlocks from occurring.
	 *
	 * However, we can't do a buf_bawrite() if the LOCKED bit is set because the
2345	 * buffer is part of a transaction and can't go to disk until
2346	 * the LOCKED bit is cleared.
2347	 */
2348	if (!ISSET(bp->b_flags, B_LOCKED) && nbdwrite > ((nbuf_headers/4)*3)) {
2349		if (return_error)
2350			return (EAGAIN);
2351		/*
2352		 * If the vnode has "too many" write operations in progress
2353		 * wait for them to finish the IO
2354		 */
2355		(void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, "buf_bdwrite");
2356
2357		return (buf_bawrite(bp));
2358	}
2359
2360	/* Otherwise, the "write" is done, so mark and release the buffer. */
2361	SET(bp->b_flags, B_DONE);
2362	buf_brelse(bp);
2363	return (0);
2364}
2365
2366errno_t
2367buf_bdwrite(buf_t bp)
2368{
2369	return (bdwrite_internal(bp, 0));
2370}
2371
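/*
 * Illustrative sketch only: the delayed-write pattern.  After modifying a
 * cached block that is likely to be written again soon, the caller marks it
 * dirty and returns without starting I/O; the buffer is flushed later by the
 * sync machinery (or sooner, if the 75% nbdwrite threshold above is hit).
 * "blksize" is an assumption made for the example.
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_WRITE);
 *	// ... partially update the data at buf_dataptr(bp) ...
 *	(void) buf_bdwrite(bp);		// marks B_DELWRI and releases bp
 */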
2372
2373/*
2374 * Asynchronous block write; just an asynchronous buf_bwrite().
2375 *
 * Note: With the ability to allocate additional buffer
 * headers, we can get into a situation where "too many"
 * buf_bawrite()s create buffers faster than the disks
 * can service them.
2380 * We limit the number of "in flight" writes a vnode can have to
2381 * avoid this.
2382 */
2383static int
2384bawrite_internal(buf_t bp, int throttle)
2385{
2386	vnode_t	vp = bp->b_vp;
2387
2388	if (vp) {
2389	        if (throttle)
2390		        /*
2391			 * If the vnode has "too many" write operations in progress
2392			 * wait for them to finish the IO
2393			 */
2394		        (void)vnode_waitforwrites(vp, VNODE_ASYNC_THROTTLE, 0, 0, (const char *)"buf_bawrite");
2395		else if (vp->v_numoutput >= VNODE_ASYNC_THROTTLE)
2396		        /*
2397			 * return to the caller and
2398			 * let him decide what to do
2399			 */
2400		        return (EWOULDBLOCK);
2401	}
2402	SET(bp->b_flags, B_ASYNC);
2403
2404	return (VNOP_BWRITE(bp));
2405}
2406
2407errno_t
2408buf_bawrite(buf_t bp)
2409{
2410	return (bawrite_internal(bp, 1));
2411}
2412
2413
2414
2415static void
2416buf_free_meta_store(buf_t bp)
2417{
2418	if (bp->b_bufsize) {
2419		if (ISSET(bp->b_flags, B_ZALLOC)) {
2420			zone_t z;
2421
2422			z = getbufzone(bp->b_bufsize);
2423			zfree(z, (void *)bp->b_datap);
2424		} else
2425			kmem_free(kernel_map, bp->b_datap, bp->b_bufsize);
2426
2427		bp->b_datap = (uintptr_t)NULL;
2428		bp->b_bufsize = 0;
2429	}
2430}
2431
2432
2433static buf_t
2434buf_brelse_shadow(buf_t bp)
2435{
2436	buf_t	bp_head;
2437	buf_t	bp_temp;
2438	buf_t	bp_return = NULL;
2439#ifdef BUF_MAKE_PRIVATE
2440	buf_t	bp_data;
2441	int	data_ref = 0;
2442#endif
2443	int need_wakeup = 0;
2444
2445	lck_mtx_lock_spin(buf_mtxp);
2446
2447	bp_head = (buf_t)bp->b_orig;
2448
2449	if (bp_head->b_whichq != -1)
2450		panic("buf_brelse_shadow: bp_head on freelist %d\n", bp_head->b_whichq);
2451
2452#ifdef BUF_MAKE_PRIVATE
	if ((bp_data = bp->b_data_store)) {
2454		bp_data->b_data_ref--;
2455		/*
2456		 * snapshot the ref count so that we can check it
2457		 * outside of the lock... we only want the guy going
2458		 * from 1 -> 0 to try and release the storage
2459		 */
2460		data_ref = bp_data->b_data_ref;
2461	}
2462#endif
2463	KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_START, bp, bp_head, bp_head->b_shadow_ref, 0, 0);
2464
2465	bp_head->b_shadow_ref--;
2466
2467	for (bp_temp = bp_head; bp_temp && bp != bp_temp->b_shadow; bp_temp = bp_temp->b_shadow);
2468
2469	if (bp_temp == NULL)
2470		panic("buf_brelse_shadow: bp not on list %p", bp_head);
2471
2472	bp_temp->b_shadow = bp_temp->b_shadow->b_shadow;
2473
2474#ifdef BUF_MAKE_PRIVATE
2475	/*
2476	 * we're about to free the current 'owner' of the data buffer and
2477	 * there is at least one other shadow buf_t still pointing at it
2478	 * so transfer it to the first shadow buf left in the chain
2479	 */
2480	if (bp == bp_data && data_ref) {
2481		if ((bp_data = bp_head->b_shadow) == NULL)
2482			panic("buf_brelse_shadow: data_ref mismatch bp(%p)", bp);
2483
2484		for (bp_temp = bp_data; bp_temp; bp_temp = bp_temp->b_shadow)
2485			bp_temp->b_data_store = bp_data;
2486		bp_data->b_data_ref = data_ref;
2487	}
2488#endif
2489	if (bp_head->b_shadow_ref == 0 && bp_head->b_shadow)
		panic("buf_brelse_shadow: b_shadow != NULL && b_shadow_ref == 0  bp(%p)", bp);
2491	if (bp_head->b_shadow_ref && bp_head->b_shadow == 0)
		panic("buf_brelse_shadow: b_shadow == NULL && b_shadow_ref != 0  bp(%p)", bp);
2493
2494	if (bp_head->b_shadow_ref == 0) {
2495		if (!ISSET(bp_head->b_lflags, BL_BUSY)) {
2496
2497			CLR(bp_head->b_flags, B_AGE);
2498			bp_head->b_timestamp = buf_timestamp();
2499
2500			if (ISSET(bp_head->b_flags, B_LOCKED)) {
2501				bp_head->b_whichq = BQ_LOCKED;
2502				binstailfree(bp_head, &bufqueues[BQ_LOCKED], BQ_LOCKED);
2503			} else {
2504				bp_head->b_whichq = BQ_META;
2505				binstailfree(bp_head, &bufqueues[BQ_META], BQ_META);
2506			}
2507		} else if (ISSET(bp_head->b_lflags, BL_WAITSHADOW)) {
2508			CLR(bp_head->b_lflags, BL_WAITSHADOW);
2509
2510			bp_return = bp_head;
2511		}
2512		if (ISSET(bp_head->b_lflags, BL_WANTED_REF)) {
2513			CLR(bp_head->b_lflags, BL_WANTED_REF);
2514			need_wakeup = 1;
2515		}
2516	}
2517	lck_mtx_unlock(buf_mtxp);
2518
2519	if (need_wakeup)
2520		wakeup(bp_head);
2521
2522#ifdef BUF_MAKE_PRIVATE
2523	if (bp == bp_data && data_ref == 0)
2524		buf_free_meta_store(bp);
2525
2526	bp->b_data_store = NULL;
2527#endif
2528	KERNEL_DEBUG(0xbbbbc008 | DBG_FUNC_END, bp, 0, 0, 0, 0);
2529
2530	return (bp_return);
2531}
2532
2533
2534/*
2535 * Release a buffer on to the free lists.
2536 * Described in Bach (p. 46).
2537 */
2538void
2539buf_brelse(buf_t bp)
2540{
2541	struct bqueues *bufq;
2542	long	whichq;
2543	upl_t	upl;
2544	int need_wakeup = 0;
2545	int need_bp_wakeup = 0;
2546
2547
2548	if (bp->b_whichq != -1 || !(bp->b_lflags & BL_BUSY))
2549	        panic("buf_brelse: bad buffer = %p\n", bp);
2550
2551#ifdef JOE_DEBUG
2552	(void) OSBacktrace(&bp->b_stackbrelse[0], 6);
2553
2554	bp->b_lastbrelse = current_thread();
2555	bp->b_tag = 0;
2556#endif
2557	if (bp->b_lflags & BL_IOBUF) {
2558		buf_t	shadow_master_bp = NULL;
2559
2560		if (ISSET(bp->b_lflags, BL_SHADOW))
2561			shadow_master_bp = buf_brelse_shadow(bp);
2562		else if (ISSET(bp->b_lflags, BL_IOBUF_ALLOC))
2563			 buf_free_meta_store(bp);
2564	        free_io_buf(bp);
2565
2566		if (shadow_master_bp) {
2567			bp = shadow_master_bp;
2568			goto finish_shadow_master;
2569		}
2570		return;
2571	}
2572
2573	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_START,
2574		     bp->b_lblkno * PAGE_SIZE, bp, bp->b_datap,
2575		     bp->b_flags, 0);
2576
2577	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
2578
2579	/*
2580	 * if we're invalidating a buffer that has the B_FILTER bit
2581	 * set then call the b_iodone function so it gets cleaned
2582	 * up properly.
2583	 *
2584	 * the HFS journal code depends on this
2585	 */
2586	if (ISSET(bp->b_flags, B_META) && ISSET(bp->b_flags, B_INVAL)) {
2587		if (ISSET(bp->b_flags, B_FILTER)) {	/* if necessary, call out */
2588			void	(*iodone_func)(struct buf *, void *) = bp->b_iodone;
2589			void 	*arg = bp->b_transaction;
2590
2591			CLR(bp->b_flags, B_FILTER);	/* but note callout done */
2592			bp->b_iodone = NULL;
2593			bp->b_transaction = NULL;
2594
2595			if (iodone_func == NULL) {
2596				panic("brelse: bp @ %p has NULL b_iodone!\n", bp);
2597			}
2598			(*iodone_func)(bp, arg);
2599		}
2600	}
2601	/*
2602	 * I/O is done. Cleanup the UPL state
2603	 */
2604	upl = bp->b_upl;
2605
2606	if ( !ISSET(bp->b_flags, B_META) && UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2607		kern_return_t kret;
2608		int           upl_flags;
2609
2610		if (upl == NULL) {
2611		        if ( !ISSET(bp->b_flags, B_INVAL)) {
2612				kret = ubc_create_upl(bp->b_vp,
2613						      ubc_blktooff(bp->b_vp, bp->b_lblkno),
2614						      bp->b_bufsize,
2615						      &upl,
2616						      NULL,
2617						      UPL_PRECIOUS);
2618
2619				if (kret != KERN_SUCCESS)
2620				        panic("brelse: Failed to create UPL");
2621#if  UPL_DEBUG
2622				upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 5);
2623#endif /* UPL_DEBUG */
2624			}
2625		} else {
2626			if (bp->b_datap) {
2627			        kret = ubc_upl_unmap(upl);
2628
2629				if (kret != KERN_SUCCESS)
2630				        panic("ubc_upl_unmap failed");
2631				bp->b_datap = (uintptr_t)NULL;
2632			}
2633		}
2634		if (upl) {
2635			if (bp->b_flags & (B_ERROR | B_INVAL)) {
2636			        if (bp->b_flags & (B_READ | B_INVAL))
2637				        upl_flags = UPL_ABORT_DUMP_PAGES;
2638				else
2639				        upl_flags = 0;
2640
2641				ubc_upl_abort(upl, upl_flags);
2642			} else {
2643			        if (ISSET(bp->b_flags, B_DELWRI | B_WASDIRTY))
2644				        upl_flags = UPL_COMMIT_SET_DIRTY ;
2645				else
2646				        upl_flags = UPL_COMMIT_CLEAR_DIRTY ;
2647
2648				ubc_upl_commit_range(upl, 0, bp->b_bufsize, upl_flags |
2649						     UPL_COMMIT_INACTIVATE | UPL_COMMIT_FREE_ON_EMPTY);
2650			}
2651			bp->b_upl = NULL;
2652		}
2653	} else {
2654		if ( (upl) )
2655			panic("brelse: UPL set for non VREG; vp=%p", bp->b_vp);
2656	}
2657
2658	/*
2659	 * If it's locked, don't report an error; try again later.
2660	 */
2661	if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
2662		CLR(bp->b_flags, B_ERROR);
2663	/*
2664	 * If it's not cacheable, or an error, mark it invalid.
2665	 */
2666	if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
2667		SET(bp->b_flags, B_INVAL);
2668
2669	if ((bp->b_bufsize <= 0) ||
2670			ISSET(bp->b_flags, B_INVAL) ||
2671			(ISSET(bp->b_lflags, BL_WANTDEALLOC) && !ISSET(bp->b_flags, B_DELWRI))) {
2672
2673		boolean_t	delayed_buf_free_meta_store = FALSE;
2674
2675		/*
2676		 * If it's invalid or empty, dissociate it from its vnode,
2677		 * release its storage if B_META, and
2678		 * clean it up a bit and put it on the EMPTY queue
2679		 */
2680		if (ISSET(bp->b_flags, B_DELWRI))
2681			OSAddAtomicLong(-1, &nbdwrite);
2682
2683		if (ISSET(bp->b_flags, B_META)) {
2684			if (bp->b_shadow_ref)
2685				delayed_buf_free_meta_store = TRUE;
2686			else
2687				buf_free_meta_store(bp);
2688		}
2689		/*
2690		 * nuke any credentials we were holding
2691		 */
2692		buf_release_credentials(bp);
2693
2694		lck_mtx_lock_spin(buf_mtxp);
2695
2696		if (bp->b_shadow_ref) {
2697			SET(bp->b_lflags, BL_WAITSHADOW);
2698
2699			lck_mtx_unlock(buf_mtxp);
2700
2701			return;
2702		}
2703		if (delayed_buf_free_meta_store == TRUE) {
2704
2705			lck_mtx_unlock(buf_mtxp);
2706finish_shadow_master:
2707			buf_free_meta_store(bp);
2708
2709			lck_mtx_lock_spin(buf_mtxp);
2710		}
2711		CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
2712
2713		if (bp->b_vp)
2714			brelvp_locked(bp);
2715
2716		bremhash(bp);
2717		BLISTNONE(bp);
2718		binshash(bp, &invalhash);
2719
2720		bp->b_whichq = BQ_EMPTY;
2721		binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
2722	} else {
2723
2724		/*
2725		 * It has valid data.  Put it on the end of the appropriate
2726		 * queue, so that it'll stick around for as long as possible.
2727		 */
2728		if (ISSET(bp->b_flags, B_LOCKED))
2729			whichq = BQ_LOCKED;		/* locked in core */
2730		else if (ISSET(bp->b_flags, B_META))
2731			whichq = BQ_META;		/* meta-data */
2732		else if (ISSET(bp->b_flags, B_AGE))
2733			whichq = BQ_AGE;		/* stale but valid data */
2734		else
2735			whichq = BQ_LRU;		/* valid data */
2736		bufq = &bufqueues[whichq];
2737
2738		bp->b_timestamp = buf_timestamp();
2739
2740		lck_mtx_lock_spin(buf_mtxp);
2741
2742		/*
2743		 * the buf_brelse_shadow routine doesn't take 'ownership'
2744		 * of the parent buf_t... it updates state that is protected by
2745		 * the buf_mtxp, and checks for BL_BUSY to determine whether to
2746		 * put the buf_t back on a free list.  b_shadow_ref is protected
2747		 * by the lock, and since we have not yet cleared B_BUSY, we need
		 * to check it while holding the lock to ensure that one of us
2749		 * puts this buf_t back on a free list when it is safe to do so
2750		 */
2751		if (bp->b_shadow_ref == 0) {
2752			CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE));
2753			bp->b_whichq = whichq;
2754			binstailfree(bp, bufq, whichq);
2755		} else {
2756			/*
2757			 * there are still cloned buf_t's pointing
2758			 * at this guy... need to keep it off the
2759			 * freelists until a buf_brelse is done on
2760			 * the last clone
2761			 */
2762			CLR(bp->b_flags, (B_ASYNC | B_NOCACHE));
2763		}
2764	}
2765	if (needbuffer) {
2766	        /*
2767		 * needbuffer is a global
2768		 * we're currently using buf_mtxp to protect it
2769		 * delay doing the actual wakeup until after
2770		 * we drop buf_mtxp
2771		 */
2772		needbuffer = 0;
2773		need_wakeup = 1;
2774	}
2775	if (ISSET(bp->b_lflags, BL_WANTED)) {
2776	        /*
2777		 * delay the actual wakeup until after we
2778		 * clear BL_BUSY and we've dropped buf_mtxp
2779		 */
2780		need_bp_wakeup = 1;
2781	}
2782	/*
2783	 * Unlock the buffer.
2784	 */
2785	CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
2786	buf_busycount--;
2787
2788	lck_mtx_unlock(buf_mtxp);
2789
2790	if (need_wakeup) {
2791	        /*
2792		 * Wake up any processes waiting for any buffer to become free.
2793		 */
2794	        wakeup(&needbuffer);
2795	}
2796	if (need_bp_wakeup) {
2797	        /*
		 * Wake up any processes waiting for _this_ buffer to become free.
2799		 */
2800	        wakeup(bp);
2801	}
2802	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 388)) | DBG_FUNC_END,
2803		     bp, bp->b_datap, bp->b_flags, 0, 0);
2804}
2805
2806/*
2807 * Determine if a block is in the cache.
 * Just look on what would be its hash chain.  If it's there, return
 * TRUE (incore_locked(), below, returns a pointer to the buffer
 * instead); buffers marked B_INVAL are treated as not present.
2812 */
2813static boolean_t
2814incore(vnode_t vp, daddr64_t blkno)
2815{
2816        boolean_t retval;
2817	struct	bufhashhdr *dp;
2818
2819	dp = BUFHASH(vp, blkno);
2820
2821	lck_mtx_lock_spin(buf_mtxp);
2822
2823	if (incore_locked(vp, blkno, dp))
2824	        retval = TRUE;
2825	else
2826	        retval = FALSE;
2827	lck_mtx_unlock(buf_mtxp);
2828
2829	return (retval);
2830}
2831
2832
2833static buf_t
2834incore_locked(vnode_t vp, daddr64_t blkno, struct bufhashhdr *dp)
2835{
2836	struct buf *bp;
2837
2838	/* Search hash chain */
2839	for (bp = dp->lh_first; bp != NULL; bp = bp->b_hash.le_next) {
2840		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
2841		    !ISSET(bp->b_flags, B_INVAL)) {
2842			return (bp);
2843		}
2844	}
2845	return (NULL);
2846}
2847
2848
2849void
2850buf_wait_for_shadow_io(vnode_t vp, daddr64_t blkno)
2851{
2852	buf_t bp;
2853	struct	bufhashhdr *dp;
2854
2855	dp = BUFHASH(vp, blkno);
2856
2857	lck_mtx_lock_spin(buf_mtxp);
2858
2859	for (;;) {
2860		if ((bp = incore_locked(vp, blkno, dp)) == NULL)
2861			break;
2862
2863		if (bp->b_shadow_ref == 0)
2864			break;
2865
2866		SET(bp->b_lflags, BL_WANTED_REF);
2867
2868		(void) msleep(bp, buf_mtxp, PSPIN | (PRIBIO+1), "buf_wait_for_shadow", NULL);
2869	}
2870	lck_mtx_unlock(buf_mtxp);
2871}
2872
2873/* XXX FIXME -- Update the comment to reflect the UBC changes (please) -- */
2874/*
2875 * Get a block of requested size that is associated with
2876 * a given vnode and block offset. If it is found in the
2877 * block cache, mark it as having been found, make it busy
2878 * and return it. Otherwise, return an empty block of the
 * correct size. It is up to the caller to ensure that the
 * cached blocks are of the correct size.
2881 */
2882buf_t
2883buf_getblk(vnode_t vp, daddr64_t blkno, int size, int slpflag, int slptimeo, int operation)
2884{
2885	buf_t bp;
2886	int   err;
2887	upl_t upl;
2888	upl_page_info_t *pl;
2889	kern_return_t kret;
2890	int ret_only_valid;
2891	struct timespec ts;
2892	int upl_flags;
2893	struct	bufhashhdr *dp;
2894
2895	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_START,
2896		     (uintptr_t)(blkno * PAGE_SIZE), size, operation, 0, 0);
2897
2898	ret_only_valid = operation & BLK_ONLYVALID;
2899	operation &= ~BLK_ONLYVALID;
2900	dp = BUFHASH(vp, blkno);
2901start:
2902	lck_mtx_lock_spin(buf_mtxp);
2903
2904	if ((bp = incore_locked(vp, blkno, dp))) {
2905		/*
2906		 * Found in the Buffer Cache
2907		 */
2908		if (ISSET(bp->b_lflags, BL_BUSY)) {
2909			/*
2910			 * but is busy
2911			 */
2912			switch (operation) {
2913			case BLK_READ:
2914			case BLK_WRITE:
2915			case BLK_META:
2916				SET(bp->b_lflags, BL_WANTED);
2917				bufstats.bufs_busyincore++;
2918
2919				/*
2920				 * don't retake the mutex after being awakened...
2921				 * the time out is in msecs
2922				 */
2923				ts.tv_sec = (slptimeo/1000);
2924				ts.tv_nsec = (slptimeo % 1000) * 10  * NSEC_PER_USEC * 1000;
2925
2926				KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 396)) | DBG_FUNC_NONE,
2927					     (uintptr_t)blkno, size, operation, 0, 0);
2928
2929				err = msleep(bp, buf_mtxp, slpflag | PDROP | (PRIBIO + 1), "buf_getblk", &ts);
2930
2931				/*
2932				 * Callers who call with PCATCH or timeout are
2933				 * willing to deal with the NULL pointer
2934				 */
2935				if (err && ((slpflag & PCATCH) || ((err == EWOULDBLOCK) && slptimeo)))
2936					return (NULL);
2937				goto start;
2938				/*NOTREACHED*/
2939				break;
2940
2941			default:
2942			        /*
2943				 * unknown operation requested
2944				 */
2945				panic("getblk: paging or unknown operation for incore busy buffer - %x\n", operation);
2946				/*NOTREACHED*/
2947				break;
2948			}
2949		} else {
2950			/*
2951			 * buffer in core and not busy
2952			 */
2953			SET(bp->b_lflags, BL_BUSY);
2954			SET(bp->b_flags, B_CACHE);
2955			buf_busycount++;
2956
2957			bremfree_locked(bp);
2958			bufstats.bufs_incore++;
2959
2960			lck_mtx_unlock(buf_mtxp);
2961#ifdef JOE_DEBUG
2962			bp->b_owner = current_thread();
2963			bp->b_tag   = 1;
2964#endif
2965			if ( (bp->b_upl) )
2966			        panic("buffer has UPL, but not marked BUSY: %p", bp);
2967
2968			if ( !ret_only_valid && bp->b_bufsize != size)
2969			        allocbuf(bp, size);
2970
2971			upl_flags = 0;
2972			switch (operation) {
2973			case BLK_WRITE:
2974				/*
2975				 * "write" operation:  let the UPL subsystem
2976				 * know that we intend to modify the buffer
2977				 * cache pages we're gathering.
2978				 */
2979				upl_flags |= UPL_WILL_MODIFY;
2980			case BLK_READ:
2981				upl_flags |= UPL_PRECIOUS;
2982			        if (UBCINFOEXISTS(bp->b_vp) && bp->b_bufsize) {
2983					kret = ubc_create_upl(vp,
2984							      ubc_blktooff(vp, bp->b_lblkno),
2985							      bp->b_bufsize,
2986							      &upl,
2987							      &pl,
2988							      upl_flags);
2989					if (kret != KERN_SUCCESS)
2990					        panic("Failed to create UPL");
2991
2992					bp->b_upl = upl;
2993
2994					if (upl_valid_page(pl, 0)) {
2995					        if (upl_dirty_page(pl, 0))
2996						        SET(bp->b_flags, B_WASDIRTY);
2997						else
2998						        CLR(bp->b_flags, B_WASDIRTY);
2999					} else
3000					        CLR(bp->b_flags, (B_DONE | B_CACHE | B_WASDIRTY | B_DELWRI));
3001
3002					kret = ubc_upl_map(upl, (vm_offset_t*)&(bp->b_datap));
3003
3004					if (kret != KERN_SUCCESS)
3005					        panic("getblk: ubc_upl_map() failed with (%d)", kret);
3006				}
3007				break;
3008
3009			case BLK_META:
3010				/*
				 * VM is not involved in I/O for the meta-data;
				 * the buffer already has valid data
3013				 */
3014				break;
3015
3016			default:
3017				panic("getblk: paging or unknown operation for incore buffer- %d\n", operation);
3018				/*NOTREACHED*/
3019				break;
3020			}
3021		}
3022	} else { /* not incore() */
3023		int queue = BQ_EMPTY; /* Start with no preference */
3024
3025		if (ret_only_valid) {
3026			lck_mtx_unlock(buf_mtxp);
3027			return (NULL);
3028		}
3029		if ((vnode_isreg(vp) == 0) || (UBCINFOEXISTS(vp) == 0) /*|| (vnode_issystem(vp) == 1)*/)
3030			operation = BLK_META;
3031
3032		if ((bp = getnewbuf(slpflag, slptimeo, &queue)) == NULL)
3033			goto start;
3034
3035		/*
3036		 * getnewbuf may block for a number of different reasons...
3037		 * if it does, it's then possible for someone else to
3038		 * create a buffer for the same block and insert it into
3039		 * the hash... if we see it incore at this point we dump
3040		 * the buffer we were working on and start over
3041		 */
3042		if (incore_locked(vp, blkno, dp)) {
3043			SET(bp->b_flags, B_INVAL);
3044			binshash(bp, &invalhash);
3045
3046			lck_mtx_unlock(buf_mtxp);
3047
3048			buf_brelse(bp);
3049			goto start;
3050		}
3051		/*
3052		 * NOTE: YOU CAN NOT BLOCK UNTIL binshash() HAS BEEN
3053		 *       CALLED!  BE CAREFUL.
3054		 */
3055
3056		/*
3057		 * mark the buffer as B_META if indicated
		 * so that when the buffer is released it will go to the META queue
3059		 */
3060		if (operation == BLK_META)
3061		        SET(bp->b_flags, B_META);
3062
3063		bp->b_blkno = bp->b_lblkno = blkno;
3064		bp->b_vp = vp;
3065
3066		/*
3067		 * Insert in the hash so that incore() can find it
3068		 */
3069		binshash(bp, BUFHASH(vp, blkno));
3070
3071		bgetvp_locked(vp, bp);
3072
3073		lck_mtx_unlock(buf_mtxp);
3074
3075		allocbuf(bp, size);
3076
3077		upl_flags = 0;
3078		switch (operation) {
3079		case BLK_META:
3080			/*
3081			 * buffer data is invalid...
3082			 *
3083			 * I don't want to have to retake buf_mtxp,
3084			 * so the miss and vmhits counters are done
3085			 * with Atomic updates... all other counters
3086			 * in bufstats are protected with either
3087			 * buf_mtxp or iobuffer_mtxp
3088			 */
3089		        OSAddAtomicLong(1, &bufstats.bufs_miss);
3090			break;
3091
3092		case BLK_WRITE:
3093			/*
3094			 * "write" operation:  let the UPL subsystem know
3095			 * that we intend to modify the buffer cache pages
3096			 * we're gathering.
3097			 */
3098			upl_flags |= UPL_WILL_MODIFY;
3099		case BLK_READ:
3100		  {     off_t	f_offset;
3101			size_t 	contig_bytes;
3102			int	bmap_flags;
3103
3104			if ( (bp->b_upl) )
3105				panic("bp already has UPL: %p",bp);
3106
3107			f_offset = ubc_blktooff(vp, blkno);
3108
3109			upl_flags |= UPL_PRECIOUS;
3110			kret = ubc_create_upl(vp,
3111					      f_offset,
3112					      bp->b_bufsize,
3113					      &upl,
3114					      &pl,
3115					      upl_flags);
3116
3117			if (kret != KERN_SUCCESS)
3118				panic("Failed to create UPL");
3119#if  UPL_DEBUG
3120			upl_ubc_alias_set(upl, (uintptr_t) bp, (uintptr_t) 4);
3121#endif /* UPL_DEBUG */
3122			bp->b_upl = upl;
3123
3124			if (upl_valid_page(pl, 0)) {
3125
3126			        if (operation == BLK_READ)
3127				        bmap_flags = VNODE_READ;
3128				else
3129				        bmap_flags = VNODE_WRITE;
3130
3131				SET(bp->b_flags, B_CACHE | B_DONE);
3132
3133			        OSAddAtomicLong(1, &bufstats.bufs_vmhits);
3134
3135				bp->b_validoff = 0;
3136				bp->b_dirtyoff = 0;
3137
3138				if (upl_dirty_page(pl, 0)) {
3139					/* page is dirty */
3140				        SET(bp->b_flags, B_WASDIRTY);
3141
3142					bp->b_validend = bp->b_bcount;
3143					bp->b_dirtyend = bp->b_bcount;
3144				} else {
3145					/* page is clean */
3146					bp->b_validend = bp->b_bcount;
3147					bp->b_dirtyend = 0;
3148				}
3149				/*
3150				 * try to recreate the physical block number associated with
3151				 * this buffer...
3152				 */
3153				if (VNOP_BLOCKMAP(vp, f_offset, bp->b_bcount, &bp->b_blkno, &contig_bytes, NULL, bmap_flags, NULL))
3154				        panic("getblk: VNOP_BLOCKMAP failed");
3155				/*
3156				 * if the extent represented by this buffer
3157				 * is not completely physically contiguous on
				 * disk, then we can't cache the physical mapping
3159				 * in the buffer header
3160				 */
3161				if ((long)contig_bytes < bp->b_bcount)
3162				        bp->b_blkno = bp->b_lblkno;
3163			} else {
3164			        OSAddAtomicLong(1, &bufstats.bufs_miss);
3165			}
3166			kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
3167
3168			if (kret != KERN_SUCCESS)
3169			        panic("getblk: ubc_upl_map() failed with (%d)", kret);
3170			break;
3171		  }
3172		default:
3173			panic("getblk: paging or unknown operation - %x", operation);
3174			/*NOTREACHED*/
3175			break;
3176		}
3177	}
3178	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 386)) | DBG_FUNC_END,
3179		     bp, bp->b_datap, bp->b_flags, 3, 0);
3180
3181#ifdef JOE_DEBUG
3182	(void) OSBacktrace(&bp->b_stackgetblk[0], 6);
3183#endif
3184	return (bp);
3185}
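/*
 * Illustrative sketch only: using BLK_ONLYVALID to peek at the cache without
 * instantiating a new buffer.  If the block is not resident, buf_getblk()
 * returns NULL instead of allocating; if it is, the buffer comes back busy
 * and must be released with buf_brelse().  "blksize" is an assumption made
 * for the example.
 *
 *	buf_t	bp;
 *
 *	bp = buf_getblk(vp, blkno, blksize, 0, 0, BLK_META | BLK_ONLYVALID);
 *	if (bp) {
 *		// ... inspect the cached contents via buf_dataptr(bp) ...
 *		buf_brelse(bp);
 *	}
 */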
3186
3187/*
3188 * Get an empty, disassociated buffer of given size.
3189 */
3190buf_t
3191buf_geteblk(int size)
3192{
3193	buf_t	bp = NULL;
3194	int queue = BQ_EMPTY;
3195
3196	do {
3197		lck_mtx_lock_spin(buf_mtxp);
3198
3199		bp = getnewbuf(0, 0, &queue);
3200	} while (bp == NULL);
3201
3202	SET(bp->b_flags, (B_META|B_INVAL));
3203
3204#if DIAGNOSTIC
3205	assert(queue == BQ_EMPTY);
3206#endif /* DIAGNOSTIC */
3207	/* XXX need to implement logic to deal with other queues */
3208
3209	binshash(bp, &invalhash);
3210	bufstats.bufs_eblk++;
3211
3212	lck_mtx_unlock(buf_mtxp);
3213
3214	allocbuf(bp, size);
3215
3216	return (bp);
3217}
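/*
 * Illustrative sketch only: buf_geteblk() hands back a buffer that is not
 * associated with any vnode or disk block, so it can serve as temporary
 * storage; it is simply released when no longer needed.  "blksize" is an
 * assumption made for the example.
 *
 *	buf_t	bp = buf_geteblk(blksize);
 *	// ... use buf_dataptr(bp) as scratch space ...
 *	buf_brelse(bp);
 */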
3218
3219uint32_t
3220buf_redundancy_flags(buf_t bp)
3221{
3222	return bp->b_redundancy_flags;
3223}
3224
3225void
3226buf_set_redundancy_flags(buf_t bp, uint32_t flags)
3227{
3228	SET(bp->b_redundancy_flags, flags);
3229}
3230
3231void
3232buf_clear_redundancy_flags(buf_t bp, uint32_t flags)
3233{
3234	CLR(bp->b_redundancy_flags, flags);
3235}
3236
3237/*
3238 * With UBC, there is no need to expand / shrink the file data
3239 * buffer. The VM uses the same pages, hence no waste.
3240 * All the file data buffers can have one size.
3241 * In fact expand / shrink would be an expensive operation.
3242 *
3243 * Only exception to this is meta-data buffers. Most of the
3244 * meta data operations are smaller than PAGE_SIZE. Having the
3245 * meta-data buffers grow and shrink as needed, optimizes use
3246 * of the kernel wired memory.
3247 */
3248
3249int
3250allocbuf(buf_t bp, int size)
3251{
3252	vm_size_t desired_size;
3253
3254	desired_size = roundup(size, CLBYTES);
3255
3256	if (desired_size < PAGE_SIZE)
3257		desired_size = PAGE_SIZE;
3258	if (desired_size > MAXBSIZE)
3259		panic("allocbuf: buffer larger than MAXBSIZE requested");
3260
3261	if (ISSET(bp->b_flags, B_META)) {
3262		zone_t zprev, z;
3263		int    nsize = roundup(size, MINMETA);
3264
3265		if (bp->b_datap) {
3266			vm_offset_t elem = (vm_offset_t)bp->b_datap;
3267
3268			if (ISSET(bp->b_flags, B_ZALLOC)) {
3269			        if (bp->b_bufsize < nsize) {
3270				        /* reallocate to a bigger size */
3271
3272				        zprev = getbufzone(bp->b_bufsize);
3273					if (nsize <= MAXMETA) {
3274					        desired_size = nsize;
3275						z = getbufzone(nsize);
3276						/* b_datap not really a ptr */
3277						*(void **)(&bp->b_datap) = zalloc(z);
3278					} else {
3279					        bp->b_datap = (uintptr_t)NULL;
3280					        kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
3281						CLR(bp->b_flags, B_ZALLOC);
3282					}
3283					bcopy((void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3284					zfree(zprev, (void *)elem);
3285				} else {
3286				        desired_size = bp->b_bufsize;
3287				}
3288
3289			} else {
3290				if ((vm_size_t)bp->b_bufsize < desired_size) {
3291					/* reallocate to a bigger size */
3292				        bp->b_datap = (uintptr_t)NULL;
3293					kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
3294					bcopy((const void *)elem, (caddr_t)bp->b_datap, bp->b_bufsize);
3295					kmem_free(kernel_map, elem, bp->b_bufsize);
3296				} else {
3297					desired_size = bp->b_bufsize;
3298				}
3299			}
3300		} else {
3301			/* new allocation */
3302			if (nsize <= MAXMETA) {
3303				desired_size = nsize;
3304				z = getbufzone(nsize);
3305				/* b_datap not really a ptr */
3306				*(void **)(&bp->b_datap) = zalloc(z);
3307				SET(bp->b_flags, B_ZALLOC);
3308			} else
3309				kmem_alloc_kobject(kernel_map, (vm_offset_t *)&bp->b_datap, desired_size);
3310		}
3311
3312		if (bp->b_datap == 0)
3313		        panic("allocbuf: NULL b_datap");
3314	}
3315	bp->b_bufsize = desired_size;
3316	bp->b_bcount = size;
3317
3318	return (0);
3319}
3320
3321/*
3322 *	Get a new buffer from one of the free lists.
3323 *
 *	The request for a queue is passed in. The queue from which the buffer was
 *	taken is returned. Out of range queue requests get BQ_EMPTY. A request for
 *	BQUEUE means no preference. Use heuristics in that case.
 *	The heuristic is as follows:
 *	Try BQ_AGE, BQ_LRU, BQ_EMPTY, BQ_META in that order.
 *	If none is available, block until one is made available.
 *	If buffers are available on both BQ_AGE and BQ_LRU, check the timestamps.
 *	Pick the most stale buffer.
 *	If the found buffer was marked delayed write, start the async write
3333 *	and restart the search.
3334 *	Initialize the fields and disassociate the buffer from the vnode.
3335 *	Remove the buffer from the hash. Return the buffer and the queue
3336 *	on which it was found.
3337 *
3338 *	buf_mtxp is held upon entry
3339 *	returns with buf_mtxp locked if new buf available
3340 *	returns with buf_mtxp UNlocked if new buf NOT available
3341 */
3342
3343static buf_t
3344getnewbuf(int slpflag, int slptimeo, int * queue)
3345{
3346	buf_t	bp;
3347	buf_t	lru_bp;
3348	buf_t	age_bp;
3349	buf_t	meta_bp;
3350	int	age_time, lru_time, bp_time, meta_time;
3351	int	req = *queue;	/* save it for restarts */
3352	struct timespec ts;
3353
3354start:
3355	/*
3356	 * invalid request gets empty queue
3357	 */
3358	if ((*queue >= BQUEUES) || (*queue < 0)
3359		|| (*queue == BQ_LAUNDRY) || (*queue == BQ_LOCKED))
3360		*queue = BQ_EMPTY;
3361
3362
3363	if (*queue == BQ_EMPTY && (bp = bufqueues[*queue].tqh_first))
3364	        goto found;
3365
3366	/*
3367	 * need to grow number of bufs, add another one rather than recycling
3368	 */
3369	if (nbuf_headers < max_nbuf_headers) {
3370		/*
3371		 * Increment  count now as lock
3372		 * is dropped for allocation.
3373		 * That avoids over commits
3374		 */
3375		nbuf_headers++;
3376		goto add_newbufs;
3377	}
3378	/* Try for the requested queue first */
3379	bp = bufqueues[*queue].tqh_first;
3380	if (bp)
3381	        goto found;
3382
3383	/* Unable to use requested queue */
3384	age_bp = bufqueues[BQ_AGE].tqh_first;
3385	lru_bp = bufqueues[BQ_LRU].tqh_first;
3386	meta_bp = bufqueues[BQ_META].tqh_first;
3387
3388	if (!age_bp && !lru_bp && !meta_bp) {
3389		/*
		 * Unavailable on the AGE, LRU, or META queues
3391		 * Try the empty list first
3392		 */
3393		bp = bufqueues[BQ_EMPTY].tqh_first;
3394		if (bp) {
3395			*queue = BQ_EMPTY;
3396			goto found;
3397		}
3398		/*
		 * We have seen that this is hard to trigger.
		 * This is an overcommit of nbufs but needed
		 * in some scenarios with disk images
3402		 */
3403
3404add_newbufs:
3405		lck_mtx_unlock(buf_mtxp);
3406
3407		/* Create a new temporary buffer header */
3408		bp = (struct buf *)zalloc(buf_hdr_zone);
3409
3410		if (bp) {
3411			bufhdrinit(bp);
3412			bp->b_whichq = BQ_EMPTY;
3413			bp->b_timestamp = buf_timestamp();
3414			BLISTNONE(bp);
3415			SET(bp->b_flags, B_HDRALLOC);
3416			*queue = BQ_EMPTY;
3417		}
3418		lck_mtx_lock_spin(buf_mtxp);
3419
3420		if (bp) {
3421			binshash(bp, &invalhash);
3422			binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3423			buf_hdr_count++;
3424			goto found;
3425		}
3426		/* subtract already accounted bufcount */
3427		nbuf_headers--;
3428
3429		bufstats.bufs_sleeps++;
3430
3431		/* wait for a free buffer of any kind */
3432		needbuffer = 1;
		/* the hz value is 100, which leads to 10ms per tick */
		ts.tv_sec = (slptimeo/1000);
		ts.tv_nsec = (slptimeo % 1000) * NSEC_PER_USEC * 1000 * 10;
3437
3438		msleep(&needbuffer, buf_mtxp, slpflag | PDROP | (PRIBIO+1), "getnewbuf", &ts);
3439		return (NULL);
3440	}
3441
3442	/* Buffer available either on AGE or LRU or META */
3443	bp = NULL;
3444	*queue = -1;
3445
3446	/* Buffer available either on AGE or LRU */
3447	if (!age_bp) {
3448		bp = lru_bp;
3449		*queue = BQ_LRU;
3450	} else if (!lru_bp) {
3451		bp = age_bp;
3452		*queue = BQ_AGE;
3453	} else { /* buffer available on both AGE and LRU */
3454		int		t = buf_timestamp();
3455
3456		age_time = t - age_bp->b_timestamp;
3457		lru_time = t - lru_bp->b_timestamp;
3458		if ((age_time < 0) || (lru_time < 0)) { /* time set backwards */
3459			bp = age_bp;
3460			*queue = BQ_AGE;
3461			/*
			 * we should probably re-timestamp everything in the
3463			 * queues at this point with the current time
3464			 */
3465		} else {
3466			if ((lru_time >= lru_is_stale) && (age_time < age_is_stale)) {
3467				bp = lru_bp;
3468				*queue = BQ_LRU;
3469			} else {
3470				bp = age_bp;
3471				*queue = BQ_AGE;
3472			}
3473		}
3474	}
3475
3476	if (!bp) { /* Neither on AGE nor on LRU */
3477		bp = meta_bp;
3478		*queue = BQ_META;
3479	}  else if (meta_bp) {
3480		int		t = buf_timestamp();
3481
3482		bp_time = t - bp->b_timestamp;
3483		meta_time = t - meta_bp->b_timestamp;
3484
3485		if (!(bp_time < 0) && !(meta_time < 0)) {
3486			/* time not set backwards */
3487			int bp_is_stale;
3488			bp_is_stale = (*queue == BQ_LRU) ?
3489					lru_is_stale : age_is_stale;
3490
3491			if ((meta_time >= meta_is_stale) &&
3492					(bp_time < bp_is_stale)) {
3493				bp = meta_bp;
3494				*queue = BQ_META;
3495			}
3496		}
3497	}
3498found:
3499	if (ISSET(bp->b_flags, B_LOCKED) || ISSET(bp->b_lflags, BL_BUSY))
3500	        panic("getnewbuf: bp @ %p is LOCKED or BUSY! (flags 0x%x)\n", bp, bp->b_flags);
3501
3502	/* Clean it */
3503	if (bcleanbuf(bp, FALSE)) {
3504		/*
3505		 * moved to the laundry thread, buffer not ready
3506		 */
3507		*queue = req;
3508		goto start;
3509	}
3510	return (bp);
3511}
3512
3513
3514/*
3515 * Clean a buffer.
3516 * Returns 0 if buffer is ready to use,
3517 * Returns 1 if issued a buf_bawrite() to indicate
3518 * that the buffer is not ready.
3519 *
3520 * buf_mtxp is held upon entry
3521 * returns with buf_mtxp locked
3522 */
3523int
3524bcleanbuf(buf_t bp, boolean_t discard)
3525{
3526	/* Remove from the queue */
3527	bremfree_locked(bp);
3528
3529#ifdef JOE_DEBUG
3530	bp->b_owner = current_thread();
3531	bp->b_tag   = 2;
3532#endif
3533	/*
3534	 * If buffer was a delayed write, start the IO by queuing
3535	 * it on the LAUNDRY queue, and return 1
3536	 */
3537	if (ISSET(bp->b_flags, B_DELWRI)) {
3538		if (discard) {
3539			SET(bp->b_lflags, BL_WANTDEALLOC);
3540		}
3541
3542		bmovelaundry(bp);
3543
3544		lck_mtx_unlock(buf_mtxp);
3545
3546		wakeup(&bufqueues[BQ_LAUNDRY]);
3547		/*
3548		 * and give it a chance to run
3549		 */
3550		(void)thread_block(THREAD_CONTINUE_NULL);
3551
3552		lck_mtx_lock_spin(buf_mtxp);
3553
3554		return (1);
3555	}
3556#ifdef JOE_DEBUG
3557	bp->b_owner = current_thread();
3558	bp->b_tag   = 8;
3559#endif
3560	/*
3561	 * Buffer is no longer on any free list... we own it
3562	 */
3563	SET(bp->b_lflags, BL_BUSY);
3564	buf_busycount++;
3565
3566	bremhash(bp);
3567
3568	/*
3569	 * disassociate us from our vnode, if we had one...
3570	 */
3571	if (bp->b_vp)
3572		brelvp_locked(bp);
3573
3574	lck_mtx_unlock(buf_mtxp);
3575
3576	BLISTNONE(bp);
3577
3578	if (ISSET(bp->b_flags, B_META))
3579		buf_free_meta_store(bp);
3580
3581	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
3582
3583	buf_release_credentials(bp);
3584
3585	/* If discarding, just move to the empty queue */
3586	if (discard) {
3587		lck_mtx_lock_spin(buf_mtxp);
3588		CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED | B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
3589		bp->b_whichq = BQ_EMPTY;
3590		binshash(bp, &invalhash);
3591		binsheadfree(bp, &bufqueues[BQ_EMPTY], BQ_EMPTY);
3592		CLR(bp->b_lflags, BL_BUSY);
3593		buf_busycount--;
3594	} else {
3595		/* Not discarding: clean up and prepare for reuse */
3596		bp->b_bufsize = 0;
3597		bp->b_datap = (uintptr_t)NULL;
3598		bp->b_upl = (void *)NULL;
3599		/*
3600		 * preserve the state of whether this buffer
3601		 * was allocated on the fly or not...
3602		 * the only other flag that should be set at
3603		 * this point is BL_BUSY...
3604		 */
3605#ifdef JOE_DEBUG
3606		bp->b_owner = current_thread();
3607		bp->b_tag   = 3;
3608#endif
3609		bp->b_lflags = BL_BUSY;
3610		bp->b_flags = (bp->b_flags & B_HDRALLOC);
3611		bp->b_dev = NODEV;
3612		bp->b_blkno = bp->b_lblkno = 0;
3613		bp->b_iodone = NULL;
3614		bp->b_error = 0;
3615		bp->b_resid = 0;
3616		bp->b_bcount = 0;
3617		bp->b_dirtyoff = bp->b_dirtyend = 0;
3618		bp->b_validoff = bp->b_validend = 0;
3619		bzero(&bp->b_attr, sizeof(struct bufattr));
3620
3621		lck_mtx_lock_spin(buf_mtxp);
3622	}
3623	return (0);
3624}
3625
3626
3627
3628errno_t
3629buf_invalblkno(vnode_t vp, daddr64_t lblkno, int flags)
3630{
3631        buf_t	bp;
3632	errno_t	error;
3633	struct bufhashhdr *dp;
3634
3635	dp = BUFHASH(vp, lblkno);
3636
3637relook:
3638	lck_mtx_lock_spin(buf_mtxp);
3639
3640	if ((bp = incore_locked(vp, lblkno, dp)) == (struct buf *)0) {
3641	        lck_mtx_unlock(buf_mtxp);
3642		return (0);
3643	}
3644	if (ISSET(bp->b_lflags, BL_BUSY)) {
3645	        if ( !ISSET(flags, BUF_WAIT)) {
3646		        lck_mtx_unlock(buf_mtxp);
3647			return (EBUSY);
3648		}
3649	        SET(bp->b_lflags, BL_WANTED);
3650
3651		error = msleep((caddr_t)bp, buf_mtxp, PDROP | (PRIBIO + 1), "buf_invalblkno", NULL);
3652
3653		if (error) {
3654			return (error);
3655		}
3656		goto relook;
3657	}
3658	bremfree_locked(bp);
3659	SET(bp->b_lflags, BL_BUSY);
3660	SET(bp->b_flags, B_INVAL);
3661	buf_busycount++;
3662#ifdef JOE_DEBUG
3663	bp->b_owner = current_thread();
3664	bp->b_tag   = 4;
3665#endif
3666	lck_mtx_unlock(buf_mtxp);
3667	buf_brelse(bp);
3668
3669	return (0);
3670}
3671
3672
3673void
3674buf_drop(buf_t bp)
3675{
3676        int need_wakeup = 0;
3677
3678	lck_mtx_lock_spin(buf_mtxp);
3679
3680	if (ISSET(bp->b_lflags, BL_WANTED)) {
3681	        /*
3682		 * delay the actual wakeup until after we
3683		 * clear BL_BUSY and we've dropped buf_mtxp
3684		 */
3685		need_wakeup = 1;
3686	}
3687#ifdef JOE_DEBUG
3688	bp->b_owner = current_thread();
3689	bp->b_tag   = 9;
3690#endif
3691	/*
3692	 * Unlock the buffer.
3693	 */
3694	CLR(bp->b_lflags, (BL_BUSY | BL_WANTED));
3695	buf_busycount--;
3696
3697	lck_mtx_unlock(buf_mtxp);
3698
3699	if (need_wakeup) {
3700	        /*
		 * Wake up any processes waiting for _this_ buffer to become free.
3702		 */
3703	        wakeup(bp);
3704	}
3705}
3706
3707
3708errno_t
3709buf_acquire(buf_t bp, int flags, int slpflag, int slptimeo) {
3710        errno_t error;
3711
3712        lck_mtx_lock_spin(buf_mtxp);
3713
3714	error = buf_acquire_locked(bp, flags, slpflag, slptimeo);
3715
3716       	lck_mtx_unlock(buf_mtxp);
3717
3718	return (error);
3719}
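/*
 * Illustrative sketch only: a non-blocking attempt to take ownership of a
 * buffer (for example while walking a vnode's buffer lists).  BAC_NOWAIT
 * returns EBUSY instead of sleeping if the buffer is already busy, and
 * BAC_REMOVE pulls it off its free list so that buf_brelse() can later
 * requeue it.  The surrounding control flow is an assumption for the example.
 *
 *	if (buf_acquire(bp, BAC_REMOVE | BAC_NOWAIT, 0, 0) == 0) {
 *		// bp is now BL_BUSY, off its free list, and owned by us
 *		// ... operate on the buffer ...
 *		buf_brelse(bp);		// puts it back on the appropriate queue
 *	}
 */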
3720
3721
3722static errno_t
3723buf_acquire_locked(buf_t bp, int flags, int slpflag, int slptimeo)
3724{
3725	errno_t error;
3726	struct timespec ts;
3727
3728	if (ISSET(bp->b_flags, B_LOCKED)) {
3729	        if ((flags & BAC_SKIP_LOCKED))
3730			return (EDEADLK);
3731	} else {
3732	        if ((flags & BAC_SKIP_NONLOCKED))
3733			return (EDEADLK);
3734	}
3735        if (ISSET(bp->b_lflags, BL_BUSY)) {
3736	        /*
3737		 * since the lck_mtx_lock may block, the buffer
3738		 * may become BUSY, so we need to
3739		 * recheck for a NOWAIT request
3740		 */
3741	        if (flags & BAC_NOWAIT)
3742			return (EBUSY);
3743	        SET(bp->b_lflags, BL_WANTED);
3744
3745		/* the hz value is 100; which leads to 10ms */
3746		ts.tv_sec = (slptimeo/100);
3747		ts.tv_nsec = (slptimeo % 100) * 10  * NSEC_PER_USEC * 1000;
3748		error = msleep((caddr_t)bp, buf_mtxp, slpflag | (PRIBIO + 1), "buf_acquire", &ts);
3749
3750		if (error)
3751			return (error);
3752		return (EAGAIN);
3753	}
3754	if (flags & BAC_REMOVE)
3755	        bremfree_locked(bp);
3756	SET(bp->b_lflags, BL_BUSY);
3757	buf_busycount++;
3758
3759#ifdef JOE_DEBUG
3760	bp->b_owner = current_thread();
3761	bp->b_tag   = 5;
3762#endif
3763	return (0);
3764}
3765
3766
3767/*
3768 * Wait for operations on the buffer to complete.
3769 * When they do, extract and return the I/O's error value.
3770 */
3771errno_t
3772buf_biowait(buf_t bp)
3773{
3774	while (!ISSET(bp->b_flags, B_DONE)) {
3775
3776		lck_mtx_lock_spin(buf_mtxp);
3777
3778		if (!ISSET(bp->b_flags, B_DONE)) {
3779			DTRACE_IO1(wait__start, buf_t, bp);
3780			(void) msleep(bp, buf_mtxp, PDROP | (PRIBIO+1), "buf_biowait", NULL);
3781			DTRACE_IO1(wait__done, buf_t, bp);
3782		} else
3783			lck_mtx_unlock(buf_mtxp);
3784	}
3785	/* check for interruption of I/O (e.g. via NFS), then errors. */
3786	if (ISSET(bp->b_flags, B_EINTR)) {
3787		CLR(bp->b_flags, B_EINTR);
3788		return (EINTR);
3789	} else if (ISSET(bp->b_flags, B_ERROR))
3790		return (bp->b_error ? bp->b_error : EIO);
3791	else
3792		return (0);
3793}
3794
3795
3796/*
3797 * Mark I/O complete on a buffer.
3798 *
3799 * If a callback has been requested, e.g. the pageout
3800 * daemon, do so. Otherwise, awaken waiting processes.
3801 *
3802 * [ Leffler, et al., says on p.247:
3803 *	"This routine wakes up the blocked process, frees the buffer
3804 *	for an asynchronous write, or, for a request by the pagedaemon
3805 *	process, invokes a procedure specified in the buffer structure" ]
3806 *
3807 * In real life, the pagedaemon (or other system processes) wants
 * to do async stuff too, and doesn't want the buffer buf_brelse()'d.
3809 * (for swap pager, that puts swap buffers on the free lists (!!!),
3810 * for the vn device, that puts malloc'd buffers on the free lists!)
3811 */
3812
3813void
3814buf_biodone(buf_t bp)
3815{
3816	mount_t mp;
3817	struct bufattr *bap;
3818
3819	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_START,
3820		     bp, bp->b_datap, bp->b_flags, 0, 0);
3821
3822	if (ISSET(bp->b_flags, B_DONE))
3823		panic("biodone already");
3824
3825	if (ISSET(bp->b_flags, B_ERROR)) {
3826		fslog_io_error(bp);
3827	}
3828
3829	bap = &bp->b_attr;
3830
3831	if (bp->b_vp && bp->b_vp->v_mount) {
3832		mp = bp->b_vp->v_mount;
3833	} else {
3834		mp = NULL;
3835	}
3836
3837	if (mp && (bp->b_flags & B_READ) == 0) {
3838		update_last_io_time(mp);
3839		INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_write_size);
3840	} else if (mp) {
3841		INCR_PENDING_IO(-(pending_io_t)buf_count(bp), mp->mnt_pending_read_size);
3842	}
3843
3844	if (kdebug_enable) {
3845		int code    = DKIO_DONE;
3846		int io_tier = GET_BUFATTR_IO_TIER(bap);
3847
3848		if (bp->b_flags & B_READ)
3849		        code |= DKIO_READ;
3850		if (bp->b_flags & B_ASYNC)
3851		        code |= DKIO_ASYNC;
3852
3853		if (bp->b_flags & B_META)
3854		        code |= DKIO_META;
3855		else if (bp->b_flags & B_PAGEIO)
3856		        code |= DKIO_PAGING;
3857
3858		if (io_tier != 0)
3859			code |= DKIO_THROTTLE;
3860
3861		code |= ((io_tier << DKIO_TIER_SHIFT) & DKIO_TIER_MASK);
3862
3863		if (bp->b_flags & B_PASSIVE)
3864			code |= DKIO_PASSIVE;
3865
3866		if (bap->ba_flags & BA_NOCACHE)
3867			code |= DKIO_NOCACHE;
3868
3869		KERNEL_DEBUG_CONSTANT_IST(KDEBUG_COMMON, FSDBG_CODE(DBG_DKRW, code) | DBG_FUNC_NONE,
3870		                          buf_kernel_addrperm_addr(bp), (uintptr_t)VM_KERNEL_ADDRPERM(bp->b_vp), bp->b_resid, bp->b_error, 0);
3871        }
3872
3873	/*
3874	 * I/O was done, so don't believe
3875	 * the DIRTY state from VM anymore...
3876	 * and we need to reset the THROTTLED/PASSIVE
3877	 * indicators
3878	 */
3879	CLR(bp->b_flags, (B_WASDIRTY | B_PASSIVE));
3880	CLR(bap->ba_flags, (BA_META | BA_NOCACHE | BA_DELAYIDLESLEEP));
3881
3882	SET_BUFATTR_IO_TIER(bap, 0);
3883
3884	DTRACE_IO1(done, buf_t, bp);
3885
3886	if (!ISSET(bp->b_flags, B_READ) && !ISSET(bp->b_flags, B_RAW))
3887	        /*
		 * wake up any writers blocked
3889		 * on throttle or waiting for I/O
3890		 * to drain
3891		 */
3892		vnode_writedone(bp->b_vp);
3893
3894	if (ISSET(bp->b_flags, (B_CALL | B_FILTER))) {	/* if necessary, call out */
3895		void	(*iodone_func)(struct buf *, void *) = bp->b_iodone;
3896		void 	*arg = bp->b_transaction;
3897		int     callout = ISSET(bp->b_flags, B_CALL);
3898
3899		if (iodone_func == NULL)
3900			panic("biodone: bp @ %p has NULL b_iodone!\n", bp);
3901
3902		CLR(bp->b_flags, (B_CALL | B_FILTER));	/* filters and callouts are one-shot */
3903		bp->b_iodone = NULL;
3904		bp->b_transaction = NULL;
3905
3906		if (callout)
3907		        SET(bp->b_flags, B_DONE);	/* note that it's done */
3908
3909		(*iodone_func)(bp, arg);
3910
3911		if (callout) {
3912			/*
3913			 * assumes that the callback function takes
3914			 * ownership of the bp and deals with releasing it if necessary
3915			 */
3916			goto biodone_done;
3917		}
3918		/*
3919		 * in this case the call back function is acting
3920		 * strictly as a filter... it does not take
3921		 * ownership of the bp and is expecting us
3922		 * to finish cleaning up... this is currently used
3923		 * by the HFS journaling code
3924		 */
3925	}
3926	if (ISSET(bp->b_flags, B_ASYNC)) {	/* if async, release it */
3927		SET(bp->b_flags, B_DONE);	/* note that it's done */
3928
3929		buf_brelse(bp);
3930	} else {				/* or just wakeup the buffer */
3931	        /*
3932		 * by taking the mutex, we serialize
3933		 * the buf owner calling buf_biowait so that we'll
3934		 * only see him in one of 2 states...
3935		 * state 1: B_DONE wasn't set and he's
3936		 * blocked in msleep
3937		 * state 2: he's blocked trying to take the
3938		 * mutex before looking at B_DONE
3939		 * BL_WANTED is cleared in case anyone else
3940		 * is blocked waiting for the buffer... note
		 * that we haven't cleared BL_BUSY yet, so if
		 * they do get to run, they're going to re-set
3943		 * BL_WANTED and go back to sleep
3944		 */
3945	        lck_mtx_lock_spin(buf_mtxp);
3946
3947		CLR(bp->b_lflags, BL_WANTED);
3948		SET(bp->b_flags, B_DONE);		/* note that it's done */
3949
3950	        lck_mtx_unlock(buf_mtxp);
3951
3952		wakeup(bp);
3953	}
3954biodone_done:
3955	KERNEL_DEBUG((FSDBG_CODE(DBG_FSRW, 387)) | DBG_FUNC_END,
3956                 (uintptr_t)bp, (uintptr_t)bp->b_datap, bp->b_flags, 0, 0);
3957}
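/*
 * Illustrative sketch only: the completion side as a lower layer (e.g. a
 * driver's strategy completion) might perform it, assuming the buf_seterror()
 * and buf_setresid() accessors from the buf KPI.  buf_biodone() then either
 * runs the registered callout, releases an async buffer, or wakes a thread
 * blocked in buf_biowait().
 *
 *	buf_seterror(bp, io_error);			// 0 on success
 *	buf_setresid(bp, bytes_not_transferred);
 *	buf_biodone(bp);
 */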
3958
3959/*
3960 * Obfuscate buf pointers.
3961 */
3962vm_offset_t
3963buf_kernel_addrperm_addr(void * addr)
3964{
3965	if ((vm_offset_t)addr == 0)
3966		return 0;
3967	else
3968		return ((vm_offset_t)addr + buf_kernel_addrperm);
3969}
3970
3971/*
3972 * Return a count of buffers on the "locked" queue.
3973 */
3974int
3975count_lock_queue(void)
3976{
3977	buf_t	bp;
3978	int	n = 0;
3979
3980	lck_mtx_lock_spin(buf_mtxp);
3981
3982	for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
3983	    bp = bp->b_freelist.tqe_next)
3984		n++;
3985	lck_mtx_unlock(buf_mtxp);
3986
3987	return (n);
3988}
3989
3990/*
3991 * Return a count of 'busy' buffers. Used at the time of shutdown.
3992 * note: This is also called from the mach side in debug context in kdp.c
3993 */
3994int
3995count_busy_buffers(void)
3996{
3997	return buf_busycount + bufstats.bufs_iobufinuse;
3998}
3999
4000#if DIAGNOSTIC
4001/*
4002 * Print out statistics on the current allocation of the buffer pool.
4003 * Can be enabled to print out on every ``sync'' by setting "syncprt"
4004 * in vfs_syscalls.c using sysctl.
4005 */
4006void
4007vfs_bufstats()
4008{
4009	int i, j, count;
4010	struct buf *bp;
4011	struct bqueues *dp;
4012	int counts[MAXBSIZE/CLBYTES+1];
4013	static char *bname[BQUEUES] =
4014		{ "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
4015
4016	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
4017		count = 0;
4018		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
4019			counts[j] = 0;
4020
4021		lck_mtx_lock(buf_mtxp);
4022
4023		for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
4024			counts[bp->b_bufsize/CLBYTES]++;
4025			count++;
4026		}
4027		lck_mtx_unlock(buf_mtxp);
4028
4029		printf("%s: total-%d", bname[i], count);
4030		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
4031			if (counts[j] != 0)
4032				printf(", %d-%d", j * CLBYTES, counts[j]);
4033		printf("\n");
4034	}
4035}
4036#endif /* DIAGNOSTIC */
4037
4038#define	NRESERVEDIOBUFS	128
4039
4040
4041buf_t
4042alloc_io_buf(vnode_t vp, int priv)
4043{
4044	buf_t	bp;
4045
4046	lck_mtx_lock_spin(iobuffer_mtxp);
4047
4048	while (((niobuf_headers - NRESERVEDIOBUFS < bufstats.bufs_iobufinuse) && !priv) ||
4049	       (bp = iobufqueue.tqh_first) == NULL) {
4050		bufstats.bufs_iobufsleeps++;
4051
4052		need_iobuffer = 1;
4053		(void) msleep(&need_iobuffer, iobuffer_mtxp, PSPIN | (PRIBIO+1), (const char *)"alloc_io_buf", NULL);
4054	}
4055	TAILQ_REMOVE(&iobufqueue, bp, b_freelist);
4056
4057	bufstats.bufs_iobufinuse++;
4058	if (bufstats.bufs_iobufinuse > bufstats.bufs_iobufmax)
4059		bufstats.bufs_iobufmax = bufstats.bufs_iobufinuse;
4060
4061	lck_mtx_unlock(iobuffer_mtxp);
4062
4063	/*
4064	 * initialize various fields
4065	 * we don't need to hold the mutex since the buffer
4066	 * is now private... the vp should have a reference
4067	 * on it and is not protected by this mutex in any event
4068	 */
4069	bp->b_timestamp = 0;
4070	bp->b_proc = NULL;
4071
4072	bp->b_datap = 0;
4073	bp->b_flags = 0;
4074	bp->b_lflags = BL_BUSY | BL_IOBUF;
4075	bp->b_redundancy_flags = 0;
4076	bp->b_blkno = bp->b_lblkno = 0;
4077#ifdef JOE_DEBUG
4078	bp->b_owner = current_thread();
4079	bp->b_tag   = 6;
4080#endif
4081	bp->b_iodone = NULL;
4082	bp->b_error = 0;
4083	bp->b_resid = 0;
4084	bp->b_bcount = 0;
4085	bp->b_bufsize = 0;
4086	bp->b_upl = NULL;
4087	bp->b_vp = vp;
4088	bzero(&bp->b_attr, sizeof(struct bufattr));
4089
4090	if (vp && (vp->v_type == VBLK || vp->v_type == VCHR))
4091		bp->b_dev = vp->v_rdev;
4092	else
4093		bp->b_dev = NODEV;
4094
4095	return (bp);
4096}
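/*
 * Illustrative sketch only: the typical lifetime of an I/O buffer header,
 * as a client such as the cluster layer might use it.  The header carries
 * no storage of its own; the caller supplies the data pointer and counts
 * before issuing the I/O, and free_io_buf() returns the header to the pool.
 *
 *	buf_t	bp = alloc_io_buf(vp, 0);
 *	// ... set b_datap / b_bcount / b_blkno, issue the I/O, wait for it ...
 *	free_io_buf(bp);
 */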
4097
4098
4099void
4100free_io_buf(buf_t bp)
4101{
4102        int need_wakeup = 0;
4103
4104	/*
4105	 * put buffer back on the head of the iobufqueue
4106	 */
4107	bp->b_vp = NULL;
4108	bp->b_flags = B_INVAL;
4109
4110	lck_mtx_lock_spin(iobuffer_mtxp);
4111
4112	binsheadfree(bp, &iobufqueue, -1);
4113
4114	if (need_iobuffer) {
4115	        /*
4116		 * Wake up any processes waiting because they need an io buffer
4117		 *
4118		 * do the wakeup after we drop the mutex... it's possible that the
4119		 * wakeup will be superfluous if need_iobuffer gets set again and
4120		 * another thread runs this path, but it's highly unlikely, doesn't
4121		 * hurt, and it means we don't hold up I/O progress if the wakeup blocks
4122		 * trying to grab a task related lock...
4123		 */
4124		need_iobuffer = 0;
4125		need_wakeup = 1;
4126	}
4127	if (bufstats.bufs_iobufinuse <= 0)
		panic("free_io_buf: bp(%p) - bufstats.bufs_iobufinuse <= 0", bp);
4129
4130	bufstats.bufs_iobufinuse--;
4131
4132	lck_mtx_unlock(iobuffer_mtxp);
4133
4134	if (need_wakeup)
4135	        wakeup(&need_iobuffer);
4136}
4137
4138
4139void
4140buf_list_lock(void)
4141{
4142        lck_mtx_lock_spin(buf_mtxp);
4143}
4144
4145void
4146buf_list_unlock(void)
4147{
4148        lck_mtx_unlock(buf_mtxp);
4149}
4150
4151/*
 * If getnewbuf() calls bcleanbuf() on the same thread,
 * there is a potential for stack overrun and deadlocks.
 * So we always hand off the work to a worker thread for completion
4155 */
4156
4157
4158static void
4159bcleanbuf_thread_init(void)
4160{
4161	thread_t	thread = THREAD_NULL;
4162
4163	/* create worker thread */
4164	kernel_thread_start((thread_continue_t)bcleanbuf_thread, NULL, &thread);
4165	thread_deallocate(thread);
4166}
4167
4168typedef int (*bcleanbufcontinuation)(int);
4169
4170static void
4171bcleanbuf_thread(void)
4172{
4173	struct buf *bp;
4174	int error = 0;
4175	int loopcnt = 0;
4176
4177	for (;;) {
4178	        lck_mtx_lock_spin(buf_mtxp);
4179
4180		while ( (bp = TAILQ_FIRST(&bufqueues[BQ_LAUNDRY])) == NULL) {
4181			(void)msleep0(&bufqueues[BQ_LAUNDRY], buf_mtxp, PRIBIO|PDROP, "blaundry", 0, (bcleanbufcontinuation)bcleanbuf_thread);
4182		}
4183
4184		/*
4185		 * Remove from the queue
4186		 */
4187		bremfree_locked(bp);
4188
4189		/*
4190		 * Buffer is no longer on any free list
4191		 */
4192		SET(bp->b_lflags, BL_BUSY);
4193		buf_busycount++;
4194
4195#ifdef JOE_DEBUG
4196		bp->b_owner = current_thread();
4197		bp->b_tag   = 10;
4198#endif
4199
4200		lck_mtx_unlock(buf_mtxp);
4201		/*
4202		 * do the IO
4203		 */
4204		error = bawrite_internal(bp, 0);
4205
4206		if (error) {
4207		        bp->b_whichq = BQ_LAUNDRY;
4208			bp->b_timestamp = buf_timestamp();
4209
4210		        lck_mtx_lock_spin(buf_mtxp);
4211
4212			binstailfree(bp, &bufqueues[BQ_LAUNDRY], BQ_LAUNDRY);
4213			blaundrycnt++;
4214
			/* we never leave a busy buffer on the laundry queue */
4216			CLR(bp->b_lflags, BL_BUSY);
4217			buf_busycount--;
4218#ifdef JOE_DEBUG
4219			bp->b_owner = current_thread();
4220			bp->b_tag   = 11;
4221#endif
4222
4223			lck_mtx_unlock(buf_mtxp);
4224
4225			if (loopcnt > MAXLAUNDRY) {
4226				/*
4227				 * bawrite_internal() can return errors if we're throttled. If we've
4228				 * done several I/Os and failed, give the system some time to unthrottle
4229				 * the vnode
4230				 */
4231				(void)tsleep((void *)&bufqueues[BQ_LAUNDRY], PRIBIO, "blaundry", 1);
4232				loopcnt = 0;
4233			} else {
4234				/* give other threads a chance to run */
4235				(void)thread_block(THREAD_CONTINUE_NULL);
4236				loopcnt++;
4237			}
4238		}
4239	}
4240}
4241
4242
4243static int
4244brecover_data(buf_t bp)
4245{
4246	int	upl_offset;
4247        upl_t	upl;
4248	upl_page_info_t *pl;
4249	kern_return_t kret;
4250	vnode_t	vp = bp->b_vp;
4251	int upl_flags;
4252
4253
4254	if ( !UBCINFOEXISTS(vp) || bp->b_bufsize == 0)
4255	        goto dump_buffer;
4256
4257	upl_flags = UPL_PRECIOUS;
4258	if (! (buf_flags(bp) & B_READ)) {
4259		/*
4260		 * "write" operation:  let the UPL subsystem know
4261		 * that we intend to modify the buffer cache pages we're
4262		 * gathering.
4263		 */
4264		upl_flags |= UPL_WILL_MODIFY;
4265	}
4266
4267	kret = ubc_create_upl(vp,
4268			      ubc_blktooff(vp, bp->b_lblkno),
4269			      bp->b_bufsize,
4270			      &upl,
4271			      &pl,
4272			      upl_flags);
4273	if (kret != KERN_SUCCESS)
4274	        panic("Failed to create UPL");
4275
4276	for (upl_offset = 0; upl_offset < bp->b_bufsize; upl_offset += PAGE_SIZE) {
4277
4278	        if (!upl_valid_page(pl, upl_offset / PAGE_SIZE) || !upl_dirty_page(pl, upl_offset / PAGE_SIZE)) {
4279		        ubc_upl_abort(upl, 0);
4280			goto dump_buffer;
4281		}
4282	}
4283	bp->b_upl = upl;
4284
4285	kret = ubc_upl_map(upl, (vm_offset_t *)&(bp->b_datap));
4286
4287	if (kret != KERN_SUCCESS)
4288	        panic("getblk: ubc_upl_map() failed with (%d)", kret);
4289	return (1);
4290
4291dump_buffer:
4292	bp->b_bufsize = 0;
4293	SET(bp->b_flags, B_INVAL);
4294	buf_brelse(bp);
4295
4296	return(0);
4297}
4298
4299boolean_t
4300buffer_cache_gc(int all)
4301{
4302	buf_t bp;
4303	boolean_t did_large_zfree = FALSE;
4304	boolean_t need_wakeup = FALSE;
4305	int now = buf_timestamp();
4306	uint32_t found = 0;
4307	struct bqueues privq;
4308	int thresh_hold = BUF_STALE_THRESHHOLD;
4309
4310	if (all)
4311		thresh_hold = 0;
4312	/*
4313	 * We only care about metadata (incore storage comes from zalloc()).
	 * Unless "all" is set (used to evict metadata buffers in preparation
	 * for deep sleep), we only evict up to BUF_MAX_GC_BATCH_SIZE buffers
	 * that have not been accessed in the last 30s.  This limit controls both
	 * the hold time of the global lock "buf_mtxp" and the length of time
	 * we spend compute-bound in the GC thread which calls this function.
4319	 */
4320	lck_mtx_lock(buf_mtxp);
4321
4322	do {
4323		found = 0;
4324		TAILQ_INIT(&privq);
4325		need_wakeup = FALSE;
4326
4327		while (((bp = TAILQ_FIRST(&bufqueues[BQ_META]))) &&
4328				(now > bp->b_timestamp) &&
4329				(now - bp->b_timestamp > thresh_hold) &&
4330				(found < BUF_MAX_GC_BATCH_SIZE)) {
4331
4332			/* Remove from free list */
4333			bremfree_locked(bp);
4334			found++;
4335
4336#ifdef JOE_DEBUG
4337			bp->b_owner = current_thread();
4338			bp->b_tag   = 12;
4339#endif
4340
4341			/* If dirty, move to laundry queue and remember to do wakeup */
4342			if (ISSET(bp->b_flags, B_DELWRI)) {
4343				SET(bp->b_lflags, BL_WANTDEALLOC);
4344
4345				bmovelaundry(bp);
4346				need_wakeup = TRUE;
4347
4348				continue;
4349			}
4350
4351			/*
4352			 * Mark busy and put on private list.  We could technically get
4353			 * away without setting BL_BUSY here.
4354			 */
4355			SET(bp->b_lflags, BL_BUSY);
4356			buf_busycount++;
4357
4358			/*
4359			 * Remove from hash and dissociate from vp.
4360			 */
4361			bremhash(bp);
4362			if (bp->b_vp) {
4363				brelvp_locked(bp);
4364			}
4365
4366			TAILQ_INSERT_TAIL(&privq, bp, b_freelist);
4367		}
4368
4369		if (found == 0) {
4370			break;
4371		}
4372
4373		/* Drop lock for batch processing */
4374		lck_mtx_unlock(buf_mtxp);
4375
4376		/* Wakeup and yield for laundry if need be */
4377		if (need_wakeup) {
4378			wakeup(&bufqueues[BQ_LAUNDRY]);
4379			(void)thread_block(THREAD_CONTINUE_NULL);
4380		}
4381
4382		/* Clean up every buffer on private list */
4383		TAILQ_FOREACH(bp, &privq, b_freelist) {
4384			/* Take note if we've definitely freed at least a page to a zone */
4385			if ((ISSET(bp->b_flags, B_ZALLOC)) && (buf_size(bp) >= PAGE_SIZE)) {
4386				did_large_zfree = TRUE;
4387			}
4388
4389			trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
4390
4391			/* Free Storage */
4392			buf_free_meta_store(bp);
4393
4394			/* Release credentials */
4395			buf_release_credentials(bp);
4396
4397			/* Prepare for moving to empty queue */
4398			CLR(bp->b_flags, (B_META | B_ZALLOC | B_DELWRI | B_LOCKED
4399						| B_AGE | B_ASYNC | B_NOCACHE | B_FUA));
4400			bp->b_whichq = BQ_EMPTY;
4401			BLISTNONE(bp);
4402		}
4403		lck_mtx_lock(buf_mtxp);
4404
4405		/* Back under lock, move them all to invalid hash and clear busy */
4406		TAILQ_FOREACH(bp, &privq, b_freelist) {
4407			binshash(bp, &invalhash);
4408			CLR(bp->b_lflags, BL_BUSY);
4409			buf_busycount--;
4410
4411#ifdef JOE_DEBUG
4412			if (bp->b_owner != current_thread()) {
4413				panic("Buffer stolen from buffer_cache_gc()");
4414			}
4415			bp->b_owner = current_thread();
4416			bp->b_tag   = 13;
4417#endif
4418		}
4419
4420		/* And do a big bulk move to the empty queue */
4421		TAILQ_CONCAT(&bufqueues[BQ_EMPTY], &privq, b_freelist);
4422
4423	} while (all && (found == BUF_MAX_GC_BATCH_SIZE));
4424
4425	lck_mtx_unlock(buf_mtxp);
4426
4427	return did_large_zfree;
4428}
4429
4430
4431/*
4432 * disabled for now
4433 */
4434
4435#if FLUSH_QUEUES
4436
4437#define NFLUSH 32
4438
4439static int
4440bp_cmp(void *a, void *b)
4441{
4442    buf_t *bp_a = *(buf_t **)a,
4443          *bp_b = *(buf_t **)b;
4444    daddr64_t res;
4445
    // don't have to worry about negative block
    // numbers here, but the 64-bit difference can
    // exceed the range of an int, so clamp the
    // result instead of truncating it.
    //
    res = (bp_a->b_blkno - bp_b->b_blkno);

    if (res < 0)
        return (-1);
    return ((res > 0) ? 1 : 0);
4452}
4453
4454
4455int
4456bflushq(int whichq, mount_t mp)
4457{
4458	buf_t	bp, next;
4459	int	i, buf_count;
4460	int	total_writes = 0;
4461	static buf_t flush_table[NFLUSH];
4462
4463	if (whichq < 0 || whichq >= BQUEUES) {
4464	    return (0);
4465	}
4466
4467  restart:
4468	lck_mtx_lock(buf_mtxp);
4469
4470	bp = TAILQ_FIRST(&bufqueues[whichq]);
4471
4472	for (buf_count = 0; bp; bp = next) {
4473	    next = bp->b_freelist.tqe_next;
4474
4475	    if (bp->b_vp == NULL || bp->b_vp->v_mount != mp) {
4476		continue;
4477	    }
4478
4479	    if (ISSET(bp->b_flags, B_DELWRI) && !ISSET(bp->b_lflags, BL_BUSY)) {
4480
4481		bremfree_locked(bp);
4482#ifdef JOE_DEBUG
4483		bp->b_owner = current_thread();
4484		bp->b_tag   = 7;
4485#endif
4486		SET(bp->b_lflags, BL_BUSY);
4487		buf_busycount++;
4488
4489		flush_table[buf_count] = bp;
4490		buf_count++;
4491		total_writes++;
4492
4493		if (buf_count >= NFLUSH) {
4494		    lck_mtx_unlock(buf_mtxp);
4495
4496		    qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4497
4498		    for (i = 0; i < buf_count; i++) {
4499			buf_bawrite(flush_table[i]);
4500		    }
4501		    goto restart;
4502		}
4503	    }
4504	}
4505	lck_mtx_unlock(buf_mtxp);
4506
4507	if (buf_count > 0) {
4508	    qsort(flush_table, buf_count, sizeof(struct buf *), bp_cmp);
4509
4510	    for (i = 0; i < buf_count; i++) {
4511		buf_bawrite(flush_table[i]);
4512	    }
4513	}
4514
4515	return (total_writes);
4516}
4517#endif
4518
4519
4520#if BALANCE_QUEUES
4521
4522/* XXX move this to a separate file */
4523
4524/*
4525 * NOTE: THIS CODE HAS NOT BEEN UPDATED
4526 * WITH RESPECT TO THE NEW LOCKING MODEL
4527 */
4528
4529
4530/*
4531 * Dynamic Scaling of the Buffer Queues
4532 */
4533
4534typedef long long blsize_t;
4535
4536blsize_t MAXNBUF; /* initialize to (sane_size / PAGE_SIZE) */
4537/* Global tunable limits */
4538blsize_t nbufh;			/* number of buffer headers */
4539blsize_t nbuflow;		/* minimum number of buffer headers required */
4540blsize_t nbufhigh;		/* maximum number of buffer headers allowed */
4541blsize_t nbuftarget;	/* preferred number of buffer headers */
4542
4543/*
4544 * assertions:
4545 *
4546 * 1.	0 < nbuflow <= nbufh <= nbufhigh
4547 * 2.	nbufhigh <= MAXNBUF
4548 * 3.	0 < nbuflow <= nbuftarget <= nbufhigh
4549 * 4.	nbufh can not be set by sysctl().
4550 */
4551
4552/* Per queue tunable limits */
4553
4554struct bufqlim {
4555	blsize_t	bl_nlow;	/* minimum number of buffer headers required */
4556	blsize_t	bl_num;		/* number of buffer headers on the queue */
4557	blsize_t	bl_nlhigh;	/* maximum number of buffer headers allowed */
4558	blsize_t	bl_target;	/* preferred number of buffer headers */
4559	long	bl_stale;	/* Seconds after which a buffer is considered stale */
4560} bufqlim[BQUEUES];
4561
4562/*
4563 * assertions:
4564 *
4565 * 1.	0 <= bl_nlow <= bl_num <= bl_nlhigh
4566 * 2.	bl_nlhigh <= MAXNBUF
4567 * 3.  bufqlim[BQ_META].bl_nlow != 0
4568 * 4.  bufqlim[BQ_META].bl_nlow > (number of possible concurrent
4569 *									file system IO operations)
4570 * 5.	bl_num can not be set by sysctl().
4571 * 6.	bl_nhigh <= nbufhigh
4572 */
4573
4574/*
4575 * Rationale:
4576 * ----------
 * Defining blsize_t as long would permit 2^31 buffer headers per queue,
 * which can describe (2^31 * PAGE_SIZE) bytes of memory per queue.
 *
 * These limits are exported to user space by means of sysctl().
 * It was decided to define blsize_t as a 64-bit quantity instead,
 * so that it will not need to change as long as the kernel address
 * space does not exceed 64 bits.
 *
 * The low and high limits are initialized at compile time and can be
 * overridden with boot arguments; sysctl() does not change them.
 * sysctl() can read all of the values but can set only the target.
 * num is the current level.
 *
 * Advantages of having a "bufqscan" thread doing the balancing are:
 *	Keep enough bufs on BQ_EMPTY.
 *		getnewbuf() by default will always select a buffer from BQ_EMPTY,
 *		and it performs best when a buffer is found there.  This also
 *		minimizes the possibility of getnewbuf() having to start I/O,
 *		which is a performance win, too.
 *
 *	Localize complex logic [balancing as well as time aging]
 *		to balancebufq().
 *
 *	Simplify getnewbuf() logic by eliminating the time-aging code.
4601 */
4602
4603/*
4604 * Algorithm:
4605 * -----------
 * The goal of the dynamic scaling of the buffer queues is to keep
4607 * the size of the LRU close to bl_target. Buffers on a queue would
4608 * be time aged.
4609 *
4610 * There would be a thread which will be responsible for "balancing"
4611 * the buffer cache queues.
4612 *
4613 * The scan order would be:	AGE, LRU, META, EMPTY.
4614 */
4615
4616long bufqscanwait = 0;
4617
4618static void bufqscan_thread();
4619static int balancebufq(int q);
4620static int btrimempty(int n);
4621static __inline__ int initbufqscan(void);
4622static __inline__ int nextbufq(int q);
4623static void buqlimprt(int all);
4624
4625
4626static __inline__ void
4627bufqinc(int q)
4628{
4629	if ((q < 0) || (q >= BQUEUES))
4630		return;
4631
4632	bufqlim[q].bl_num++;
4633	return;
4634}
4635
4636static __inline__ void
4637bufqdec(int q)
4638{
4639	if ((q < 0) || (q >= BQUEUES))
4640		return;
4641
4642	bufqlim[q].bl_num--;
4643	return;
4644}
4645
4646static void
4647bufq_balance_thread_init(void)
4648{
4649	thread_t	thread = THREAD_NULL;
4650
4651	if (bufqscanwait++ == 0) {
4652
		/* Initialize globals */
4654		MAXNBUF = (sane_size / PAGE_SIZE);
4655		nbufh = nbuf_headers;
4656		nbuflow = min(nbufh, 100);
4657		nbufhigh = min(MAXNBUF, max(nbufh, 2048));
4658		nbuftarget = (sane_size >> 5) / PAGE_SIZE;
4659		nbuftarget = max(nbuflow, nbuftarget);
4660		nbuftarget = min(nbufhigh, nbuftarget);
4661
4662		/*
4663		 * Initialize the bufqlim
4664		 */
4665
4666		/* LOCKED queue */
4667		bufqlim[BQ_LOCKED].bl_nlow = 0;
4668		bufqlim[BQ_LOCKED].bl_nlhigh = 32;
4669		bufqlim[BQ_LOCKED].bl_target = 0;
4670		bufqlim[BQ_LOCKED].bl_stale = 30;
4671
4672		/* LRU queue */
4673		bufqlim[BQ_LRU].bl_nlow = 0;
4674		bufqlim[BQ_LRU].bl_nlhigh = nbufhigh/4;
4675		bufqlim[BQ_LRU].bl_target = nbuftarget/4;
4676		bufqlim[BQ_LRU].bl_stale = LRU_IS_STALE;
4677
4678		/* AGE queue */
4679		bufqlim[BQ_AGE].bl_nlow = 0;
4680		bufqlim[BQ_AGE].bl_nlhigh = nbufhigh/4;
4681		bufqlim[BQ_AGE].bl_target = nbuftarget/4;
4682		bufqlim[BQ_AGE].bl_stale = AGE_IS_STALE;
4683
4684		/* EMPTY queue */
4685		bufqlim[BQ_EMPTY].bl_nlow = 0;
4686		bufqlim[BQ_EMPTY].bl_nlhigh = nbufhigh/4;
4687		bufqlim[BQ_EMPTY].bl_target = nbuftarget/4;
4688		bufqlim[BQ_EMPTY].bl_stale = 600000;
4689
4690		/* META queue */
4691		bufqlim[BQ_META].bl_nlow = 0;
4692		bufqlim[BQ_META].bl_nlhigh = nbufhigh/4;
4693		bufqlim[BQ_META].bl_target = nbuftarget/4;
4694		bufqlim[BQ_META].bl_stale = META_IS_STALE;
4695
4696		/* LAUNDRY queue */
		bufqlim[BQ_LAUNDRY].bl_nlow = 0;
		bufqlim[BQ_LAUNDRY].bl_nlhigh = 32;
		bufqlim[BQ_LAUNDRY].bl_target = 0;
		bufqlim[BQ_LAUNDRY].bl_stale = 30;
4701
4702		buqlimprt(1);
4703	}
4704
4705	/* create worker thread */
4706	kernel_thread_start((thread_continue_t)bufqscan_thread, NULL, &thread);
4707	thread_deallocate(thread);
4708}
4709
4710/* The workloop for the buffer balancing thread */
4711static void
4712bufqscan_thread()
4713{
4714	int moretodo = 0;
4715
4716	for(;;) {
4717		do {
4718			int q;	/* buffer queue to process */
4719
4720			q = initbufqscan();
4721			for (; q; ) {
4722				moretodo |= balancebufq(q);
4723				q = nextbufq(q);
4724			}
4725		} while (moretodo);
4726
4727#if DIAGNOSTIC
4728		vfs_bufstats();
4729		buqlimprt(0);
4730#endif
4731		(void)tsleep((void *)&bufqscanwait, PRIBIO, "bufqscanwait", 60 * hz);
4732		moretodo = 0;
4733	}
4734}
4735
4736/* Seed for the buffer queue balancing */
4737static __inline__ int
4738initbufqscan()
4739{
4740	/* Start with AGE queue */
4741	return (BQ_AGE);
4742}
4743
4744/* Pick next buffer queue to balance */
4745static __inline__ int
4746nextbufq(int q)
4747{
	int order[] = { BQ_AGE, BQ_LRU, BQ_META, BQ_EMPTY, 0 };
	unsigned int i;

	/* return the queue that follows 'q' in the scan order; 0 ends the scan */
	for (i = 0; i < (sizeof(order) / sizeof(order[0])) - 1; i++) {
		if (order[i] == q)
			return (order[i + 1]);
	}
	return (0);
4753}
4754
4755/* function to balance the buffer queues */
4756static int
4757balancebufq(int q)
4758{
4759	int moretodo = 0;
4760	int n, t;
4761
4762	/* reject invalid q */
4763	if ((q < 0) || (q >= BQUEUES))
4764		goto out;
4765
4766	/* LOCKED or LAUNDRY queue MUST not be balanced */
4767	if ((q == BQ_LOCKED) || (q == BQ_LAUNDRY))
4768		goto out;
4769
4770	n = (bufqlim[q].bl_num - bufqlim[q].bl_target);
4771
4772	/* If queue has less than target nothing more to do */
4773	if (n < 0)
4774		goto out;
4775
4776	if ( n > 8 ) {
4777		/* Balance only a small amount (12.5%) at a time */
4778		n >>= 3;
4779	}
4780
4781	/* EMPTY queue needs special handling */
4782	if (q == BQ_EMPTY) {
4783		moretodo |= btrimempty(n);
4784		goto out;
4785	}
4786
	t = buf_timestamp();
4788
4789	for (; n > 0; n--) {
4790		struct buf *bp = bufqueues[q].tqh_first;
4791		if (!bp)
4792			break;
4793
4794		/* check if it's stale */
4795		if ((t - bp->b_timestamp) > bufqlim[q].bl_stale) {
4796			if (bcleanbuf(bp, FALSE)) {
4797				/* buf_bawrite() issued, bp not ready */
4798				moretodo = 1;
4799			} else {
4800				/* release the cleaned buffer to BQ_EMPTY */
4801				SET(bp->b_flags, B_INVAL);
4802				buf_brelse(bp);
4803			}
4804		} else
4805			break;
4806	}
4807
4808out:
4809	return (moretodo);
4810}
4811
4812static int
4813btrimempty(int n)
4814{
4815	/*
	 * When struct bufs are allocated dynamically, this would
	 * reclaim up to 'n' struct bufs from the empty queue.
4818	 */
4819
4820	 return (0);
4821}
4822
4823static void
4824buqlimprt(int all)
4825{
4826	int i;
	static char *bname[BQUEUES] =
4828		{ "LOCKED", "LRU", "AGE", "EMPTY", "META", "LAUNDRY" };
4829
4830	if (all)
4831		for (i = 0; i < BQUEUES; i++) {
4832			printf("%s : ", bname[i]);
4833			printf("min = %ld, ", (long)bufqlim[i].bl_nlow);
4834			printf("cur = %ld, ", (long)bufqlim[i].bl_num);
4835			printf("max = %ld, ", (long)bufqlim[i].bl_nlhigh);
4836			printf("target = %ld, ", (long)bufqlim[i].bl_target);
4837			printf("stale after %ld seconds\n", bufqlim[i].bl_stale);
4838		}
4839	else
4840		for (i = 0; i < BQUEUES; i++) {
4841			printf("%s : ", bname[i]);
4842			printf("cur = %ld, ", (long)bufqlim[i].bl_num);
4843		}
4844}
4845
4846#endif
4847
4848
4849