1/*
2 * Copyright (c) 1999-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1991, 1993, 1994
30 *	The Regents of the University of California.  All rights reserved.
31 * (c) UNIX System Laboratories, Inc.
32 * All or some portions of this file are derived from material licensed
33 * to the University of California by American Telephone and Telegraph
34 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
35 * the permission of UNIX System Laboratories, Inc.
36 *
37 * Redistribution and use in source and binary forms, with or without
38 * modification, are permitted provided that the following conditions
39 * are met:
40 * 1. Redistributions of source code must retain the above copyright
41 *    notice, this list of conditions and the following disclaimer.
42 * 2. Redistributions in binary form must reproduce the above copyright
43 *    notice, this list of conditions and the following disclaimer in the
44 *    documentation and/or other materials provided with the distribution.
45 * 3. All advertising materials mentioning features or use of this software
46 *    must display the following acknowledgement:
47 *	This product includes software developed by the University of
48 *	California, Berkeley and its contributors.
49 * 4. Neither the name of the University nor the names of its contributors
50 *    may be used to endorse or promote products derived from this software
51 *    without specific prior written permission.
52 *
53 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
55 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
56 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
57 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
58 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
59 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
60 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
61 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
62 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
63 * SUCH DAMAGE.
64 *
65 *      hfs_vfsops.c
66 *  derived from	@(#)ufs_vfsops.c	8.8 (Berkeley) 5/20/95
67 *
68 *      (c) Copyright 1997-2002 Apple Computer, Inc. All rights reserved.
69 *
70 *      hfs_vfsops.c -- VFS layer for loadable HFS file system.
71 *
72 */
73#include <sys/param.h>
74#include <sys/systm.h>
75#include <sys/kauth.h>
76
77#include <sys/ubc.h>
78#include <sys/ubc_internal.h>
79#include <sys/vnode_internal.h>
80#include <sys/mount_internal.h>
81#include <sys/sysctl.h>
82#include <sys/malloc.h>
83#include <sys/stat.h>
84#include <sys/quota.h>
85#include <sys/disk.h>
86#include <sys/paths.h>
87#include <sys/utfconv.h>
88#include <sys/kdebug.h>
89#include <sys/fslog.h>
90#include <sys/ubc.h>
91#include <sys/buf_internal.h>
92
93#include <kern/locks.h>
94
95#include <vfs/vfs_journal.h>
96
97#include <miscfs/specfs/specdev.h>
98#include <hfs/hfs_mount.h>
99
100#include <libkern/crypto/md5.h>
101#include <uuid/uuid.h>
102
103#include "hfs.h"
104#include "hfs_catalog.h"
105#include "hfs_cnode.h"
106#include "hfs_dbg.h"
107#include "hfs_endian.h"
108#include "hfs_hotfiles.h"
109#include "hfs_quota.h"
110#include "hfs_btreeio.h"
111
112#include "hfscommon/headers/FileMgrInternal.h"
113#include "hfscommon/headers/BTreesInternal.h"
114
115#if CONFIG_PROTECT
116#include <sys/cprotect.h>
117#endif
118
119#if CONFIG_HFS_ALLOC_RBTREE
120#include "hfscommon/headers/HybridAllocator.h"
121#endif
122
123#define HFS_MOUNT_DEBUG 1
124
125#if	HFS_DIAGNOSTIC
126int hfs_dbg_all = 0;
127int hfs_dbg_err = 0;
128#endif
129
130/* Enable/disable debugging code for live volume resizing */
131int hfs_resize_debug = 0;
132
133lck_grp_attr_t *  hfs_group_attr;
134lck_attr_t *  hfs_lock_attr;
135lck_grp_t *  hfs_mutex_group;
136lck_grp_t *  hfs_rwlock_group;
137lck_grp_t *  hfs_spinlock_group;
138
139extern struct vnodeopv_desc hfs_vnodeop_opv_desc;
140extern struct vnodeopv_desc hfs_std_vnodeop_opv_desc;
141
142/* not static so we can re-use in hfs_readwrite.c for build_path calls */
143int hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, vfs_context_t context);
144
145static int hfs_changefs(struct mount *mp, struct hfs_mount_args *args);
146static int hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, vfs_context_t context);
147static int hfs_flushfiles(struct mount *, int, struct proc *);
148static int hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush);
149static int hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp);
150static int hfs_init(struct vfsconf *vfsp);
151static int hfs_vfs_root(struct mount *mp, struct vnode **vpp, vfs_context_t context);
152static int hfs_quotactl(struct mount *, int, uid_t, caddr_t, vfs_context_t context);
153static int hfs_start(struct mount *mp, int flags, vfs_context_t context);
154static int hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, vfs_context_t context);
155static int hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec);
156static int hfs_journal_replay(vnode_t devvp, vfs_context_t context);
157static int hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context);
158static int hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context);
159
160void hfs_initialize_allocator (struct hfsmount *hfsmp);
161int hfs_teardown_allocator (struct hfsmount *hfsmp);
162void hfs_unmap_blocks (struct hfsmount *hfsmp);
163
164int hfs_mount(struct mount *mp, vnode_t  devvp, user_addr_t data, vfs_context_t context);
165int hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args, int journal_replay_only, vfs_context_t context);
166int hfs_reload(struct mount *mp);
167int hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, vfs_context_t context);
168int hfs_sync(struct mount *mp, int waitfor, vfs_context_t context);
169int hfs_sysctl(int *name, u_int namelen, user_addr_t oldp, size_t *oldlenp,
170                      user_addr_t newp, size_t newlen, vfs_context_t context);
171int hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context);
172
173/*
174 * Called by vfs_mountroot when mounting HFS Plus as root.
175 */
176
177int
178hfs_mountroot(mount_t mp, vnode_t rvp, vfs_context_t context)
179{
180	struct hfsmount *hfsmp;
181	ExtendedVCB *vcb;
182	struct vfsstatfs *vfsp;
183	int error;
184
185	if ((error = hfs_mountfs(rvp, mp, NULL, 0, context))) {
186		if (HFS_MOUNT_DEBUG) {
187			printf("hfs_mountroot: hfs_mountfs returned %d, rvp (%p) name (%s) \n",
188					error, rvp, (rvp->v_name ? rvp->v_name : "unknown device"));
189		}
190		return (error);
191	}
192
193	/* Init hfsmp */
194	hfsmp = VFSTOHFS(mp);
195
196	hfsmp->hfs_uid = UNKNOWNUID;
197	hfsmp->hfs_gid = UNKNOWNGID;
198	hfsmp->hfs_dir_mask = (S_IRWXU | S_IRGRP|S_IXGRP | S_IROTH|S_IXOTH); /* 0755 */
199	hfsmp->hfs_file_mask = (S_IRWXU | S_IRGRP|S_IXGRP | S_IROTH|S_IXOTH); /* 0755 */
200
201	/* Establish the free block reserve. */
202	vcb = HFSTOVCB(hfsmp);
203	vcb->reserveBlocks = ((u_int64_t)vcb->totalBlocks * HFS_MINFREE) / 100;
204	vcb->reserveBlocks = MIN(vcb->reserveBlocks, HFS_MAXRESERVE / vcb->blockSize);
205
206	vfsp = vfs_statfs(mp);
207	(void)hfs_statfs(mp, vfsp, NULL);
208
209	return (0);
210}
211
212
213/*
214 * VFS Operations.
215 *
216 * mount system call
217 */
218
/*
 * hfs_mount - VFS mount entry point for HFS/HFS+.
 *
 * Handles three cases, selected by the command flags on the mount:
 *   1. MNT_UPDATE + MNT_RELOAD: re-read incore data after fsck (r/o only).
 *   2. MNT_UPDATE: read-write <-> read-only transitions and parameter
 *      changes (via hfs_changefs).
 *   3. Fresh mount: hfs_mountfs, plus content-protection EA validation
 *      when CONFIG_PROTECT is enabled.
 *
 * 'data' is a user-space pointer to a struct hfs_mount_args.
 * Returns 0 on success or an errno value.
 */
int
hfs_mount(struct mount *mp, vnode_t devvp, user_addr_t data, vfs_context_t context)
{
	struct proc *p = vfs_context_proc(context);
	struct hfsmount *hfsmp = NULL;
	struct hfs_mount_args args;
	int retval = E_NONE;
	u_int32_t cmdflags;

	/* Copy the mount arguments in from user space before anything else. */
	if ((retval = copyin(data, (caddr_t)&args, sizeof(args)))) {
		if (HFS_MOUNT_DEBUG) {
			printf("hfs_mount: copyin returned %d for fs\n", retval);
		}
		return (retval);
	}
	cmdflags = (u_int32_t)vfs_flags(mp) & MNT_CMDFLAGS;
	if (cmdflags & MNT_UPDATE) {
		hfsmp = VFSTOHFS(mp);

		/* Reload incore data after an fsck. */
		if (cmdflags & MNT_RELOAD) {
			if (vfs_isrdonly(mp)) {
				int error = hfs_reload(mp);
				if (error && HFS_MOUNT_DEBUG) {
					printf("hfs_mount: hfs_reload returned %d on %s \n", error, hfsmp->vcbVN);
				}
				return error;
			}
			else {
				/* Reload is only meaningful on a read-only mount. */
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: MNT_RELOAD not supported on rdwr filesystem %s\n", hfsmp->vcbVN);
				}
				return (EINVAL);
			}
		}

		/* Change to a read-only file system. */
		if (((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) &&
		    vfs_isrdonly(mp)) {
			int flags;

			/* Set flag to indicate that a downgrade to read-only
			 * is in progress and therefore block any further
			 * modifications to the file system.
			 */
			hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
			hfsmp->hfs_flags |= HFS_RDONLY_DOWNGRADE;
			hfsmp->hfs_downgrading_proc = current_thread();
			hfs_unlock_global (hfsmp);

			/* use VFS_SYNC to push out System (btree) files */
			retval = VFS_SYNC(mp, MNT_WAIT, context);
			if (retval && ((cmdflags & MNT_FORCE) == 0)) {
				/* Sync failed and not forced: abandon the downgrade. */
				hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
				hfsmp->hfs_downgrading_proc = NULL;
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: VFS_SYNC returned %d during b-tree sync of %s \n", retval, hfsmp->vcbVN);
				}
				goto out;
			}

			flags = WRITECLOSE;
			if (cmdflags & MNT_FORCE)
				flags |= FORCECLOSE;

			if ((retval = hfs_flushfiles(mp, flags, p))) {
				/* Could not close out open files: abandon the downgrade. */
				hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
				hfsmp->hfs_downgrading_proc = NULL;
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: hfs_flushfiles returned %d on %s \n", retval, hfsmp->vcbVN);
				}
				goto out;
			}

			/* mark the volume cleanly unmounted */
			hfsmp->vcbAtrb |= kHFSVolumeUnmountedMask;
			retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
			hfsmp->hfs_flags |= HFS_READ_ONLY;

			/*
			 * Close down the journal.
			 *
			 * NOTE: It is critically important to close down the journal
			 * and have it issue all pending I/O prior to calling VNOP_FSYNC below.
			 * In a journaled environment it is expected that the journal be
			 * the only actor permitted to issue I/O for metadata blocks in HFS.
			 * If we were to call VNOP_FSYNC prior to closing down the journal,
			 * we would inadvertently issue (and wait for) the I/O we just
			 * initiated above as part of the flushvolumeheader call.
			 *
			 * To avoid this, we follow the same order of operations as in
			 * unmount and issue the journal_close prior to calling VNOP_FSYNC.
			 */

			if (hfsmp->jnl) {
				hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);

			    journal_close(hfsmp->jnl);
			    hfsmp->jnl = NULL;

			    // Note: we explicitly don't want to shutdown
			    //       access to the jvp because we may need
			    //       it later if we go back to being read-write.

				hfs_unlock_global (hfsmp);
			}


			/*
			 * Write out any pending I/O still outstanding against the device node
			 * now that the journal has been closed.
			 */
			if (!retval) {
				if (vnode_mount(hfsmp->hfs_devvp) == mp) {
					/* Device vnode belongs to this mount: use our own fsync. */
					retval = hfs_fsync(hfsmp->hfs_devvp, MNT_WAIT, 0, p);
				} else {
					vnode_get(hfsmp->hfs_devvp);
					retval = VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context);
					vnode_put(hfsmp->hfs_devvp);
				}
			}

			if (retval) {
				/* Device sync failed: back out of read-only entirely. */
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: FSYNC on devvp returned %d for fs %s\n", retval, hfsmp->vcbVN);
				}
				hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
				hfsmp->hfs_downgrading_proc = NULL;
				hfsmp->hfs_flags &= ~HFS_READ_ONLY;
				goto out;
			}

#if CONFIG_HFS_ALLOC_RBTREE
			/* Allocator red-black trees are not needed on a read-only volume. */
			(void) hfs_teardown_allocator(hfsmp);
#endif
			hfsmp->hfs_downgrading_proc = NULL;
		}

		/* Change to a writable file system. */
		if (vfs_iswriteupgrade(mp)) {
#if CONFIG_HFS_ALLOC_RBTREE
				thread_t allocator_thread;
#endif

			/*
			 * On inconsistent disks, do not allow read-write mount
			 * unless it is the boot volume being mounted.
			 */
			if (!(vfs_flags(mp) & MNT_ROOTFS) &&
					(hfsmp->vcbAtrb & kHFSVolumeInconsistentMask)) {
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: attempting to mount inconsistent non-root volume %s\n",  (hfsmp->vcbVN));
				}
				retval = EINVAL;
				goto out;
			}

			// If the journal was shut-down previously because we were
			// asked to be read-only, let's start it back up again now

			if (   (HFSTOVCB(hfsmp)->vcbAtrb & kHFSVolumeJournaledMask)
			    && hfsmp->jnl == NULL
			    && hfsmp->jvp != NULL) {
			    int jflags;

			    if (hfsmp->hfs_flags & HFS_NEED_JNL_RESET) {
					jflags = JOURNAL_RESET;
				} else {
					jflags = 0;
				}

				hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);

				hfsmp->jnl = journal_open(hfsmp->jvp,
						(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset,
						hfsmp->jnl_size,
						hfsmp->hfs_devvp,
						hfsmp->hfs_logical_block_size,
						jflags,
						0,
						hfs_sync_metadata, hfsmp->hfs_mp);

				/*
				 * Set up the trim callback function so that we can add
				 * recently freed extents to the free extent cache once
				 * the transaction that freed them is written to the
				 * journal on disk.
				 */
				if (hfsmp->jnl)
					journal_trim_set_callback(hfsmp->jnl, hfs_trim_callback, hfsmp);

				hfs_unlock_global (hfsmp);

				if (hfsmp->jnl == NULL) {
					if (HFS_MOUNT_DEBUG) {
						printf("hfs_mount: journal_open == NULL; couldn't be opened on %s \n", (hfsmp->vcbVN));
					}
					retval = EINVAL;
					goto out;
				} else {
					hfsmp->hfs_flags &= ~HFS_NEED_JNL_RESET;
				}

			}

			/* See if we need to erase unused Catalog nodes due to <rdar://problem/6947811>. */
			retval = hfs_erase_unused_nodes(hfsmp);
			if (retval != E_NONE) {
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: hfs_erase_unused_nodes returned %d for fs %s\n", retval, hfsmp->vcbVN);
				}
				goto out;
			}

			/* If this mount point was downgraded from read-write
			 * to read-only, clear that information as we are now
			 * moving back to read-write.
			 */
			hfsmp->hfs_flags &= ~HFS_RDONLY_DOWNGRADE;
			hfsmp->hfs_downgrading_proc = NULL;

			/* mark the volume dirty (clear clean unmount bit) */
			hfsmp->vcbAtrb &= ~kHFSVolumeUnmountedMask;

			retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
			if (retval != E_NONE) {
				if (HFS_MOUNT_DEBUG) {
					printf("hfs_mount: hfs_flushvolumeheader returned %d for fs %s\n", retval, hfsmp->vcbVN);
				}
				goto out;
			}

			/* Only clear HFS_READ_ONLY after a successful write */
			hfsmp->hfs_flags &= ~HFS_READ_ONLY;


			/* HFS_READ_ONLY was just cleared, so this effectively tests !HFS_STANDARD. */
			if (!(hfsmp->hfs_flags & (HFS_READ_ONLY | HFS_STANDARD))) {
				/* Setup private/hidden directories for hardlinks. */
				hfs_privatedir_init(hfsmp, FILE_HARDLINKS);
				hfs_privatedir_init(hfsmp, DIR_HARDLINKS);

				hfs_remove_orphans(hfsmp);

				/*
				 * Allow hot file clustering if conditions allow.
				 */
				if ((hfsmp->hfs_flags & HFS_METADATA_ZONE) &&
					   ((hfsmp->hfs_mp->mnt_kern_flag & MNTK_SSD) == 0))	{
					(void) hfs_recording_init(hfsmp);
				}
				/* Force ACLs on HFS+ file systems. */
				if (vfs_extendedsecurity(HFSTOVFS(hfsmp)) == 0) {
					vfs_setextendedsecurity(HFSTOVFS(hfsmp));
				}
			}

#if CONFIG_HFS_ALLOC_RBTREE
			/*
			 * Like the normal mount case, we need to handle creation of the allocation red-black tree
			 * if we're upgrading from read-only to read-write.
			 *
			 * We spawn a thread to create the pair of red-black trees for this volume.
			 * However, in so doing, we must be careful to ensure that if this thread is still
			 * running after mount has finished, it doesn't interfere with an unmount. Specifically,
			 * we'll need to set a bit that indicates we're in progress building the trees here.
			 * Unmount will check for this bit, and then if it's set, mark a corresponding bit that
			 * notifies the tree generation code that an unmount is waiting.  Also, mark the extent
			 * tree flags that the allocator is enabled for use before we spawn the thread that will start
			 * scanning the RB tree.
			 *
			 * Only do this if we're operating on a read-write mount (we wouldn't care for read-only),
			 * which has not previously encountered a bad error on the red-black tree code.  Also, don't
			 * try to re-build a tree that already exists.
			 *
			 * When this is enabled, we must re-integrate the above function into our bitmap iteration
			 * so that we accurately send TRIMs down to the underlying disk device as needed.
			 */

			if (hfsmp->extent_tree_flags == 0) {
				hfsmp->extent_tree_flags |= (HFS_ALLOC_TREEBUILD_INFLIGHT | HFS_ALLOC_RB_ENABLED);
				/* Initialize EOF counter so that the thread can assume it started at initial values */
				hfsmp->offset_block_end = 0;

				InitTree(hfsmp);

				kernel_thread_start ((thread_continue_t) hfs_initialize_allocator , hfsmp, &allocator_thread);
				thread_deallocate(allocator_thread);
			}

#endif
		}

		/* Update file system parameters. */
		retval = hfs_changefs(mp, &args);
		if (retval &&  HFS_MOUNT_DEBUG) {
			printf("hfs_mount: hfs_changefs returned %d for %s\n", retval, hfsmp->vcbVN);
		}

	} else /* not an update request */ {

		/* Set the mount flag to indicate that we support volfs  */
		vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_DOVOLFS));

		retval = hfs_mountfs(devvp, mp, &args, 0, context);
		if (retval && HFS_MOUNT_DEBUG) {
			printf("hfs_mount: hfs_mountfs returned %d\n", retval);
		}
#if CONFIG_PROTECT
		/*
		 * If above mount call was successful, and this mount is content protection
		 * enabled, then verify the on-disk EA on the root to ensure that the filesystem
		 * is of a suitable vintage to allow the mount to proceed.
		 */
		if ((retval == 0) && (cp_fs_protected (mp))) {
			int err = 0;

			struct cp_root_xattr *xattr = NULL;
			MALLOC (xattr, struct cp_root_xattr*, sizeof(struct cp_root_xattr), M_TEMP, M_WAITOK);
			if (xattr == NULL) {
				err = ENOMEM;
				goto badalloc;
			}
			bzero (xattr, sizeof(struct cp_root_xattr));
			hfsmp = vfs_fsprivate(mp);

			/* go get the EA to get the version information */
			err = cp_getrootxattr (hfsmp, xattr);
			/*
			 * If there was no EA there, then write one out.
			 * Assuming EA is not present on the root means
			 * this is an erase install or a very old FS
			 */
			if (err == ENOATTR) {
				printf("No root EA set, creating new EA with new version: %d\n", CP_NEW_MAJOR_VERS);
				bzero(xattr, sizeof(struct cp_root_xattr));
				xattr->major_version = CP_NEW_MAJOR_VERS;
				xattr->minor_version = CP_MINOR_VERS;
				xattr->flags = 0;

				err = cp_setrootxattr (hfsmp, xattr);
			}

			/*
			 * For any other error, including having an out of date CP version in the
			 * EA, or for an error out of cp_setrootxattr, deny the mount
			 * and do not proceed further.
			 */
			if (err || (xattr->major_version != CP_NEW_MAJOR_VERS && xattr->major_version != CP_PREV_MAJOR_VERS))  {
				/* Deny the mount and tear down. */
				retval = EPERM;
				(void) hfs_unmount (mp, MNT_FORCE, context);
			}
			/* NOTE(review): this printf runs even when the mount was just denied
			 * above (retval == EPERM) — presumably harmless, but consider guarding
			 * it with (retval == 0); confirm intended behavior. */
			printf("Running with CP root xattr: %d.%d\n", xattr->major_version, xattr->minor_version);
badalloc:
			if(xattr) {
				FREE(xattr, M_TEMP);
			}
		}
#endif
	}
out:
	/* On any successful path, refresh the cached statfs information. */
	if (retval == 0) {
		(void)hfs_statfs(mp, vfs_statfs(mp), context);
	}
	return (retval);
}
585
586
/*
 * Argument bundle passed to hfs_changefs_callback() through
 * vnode_iterate() when remount parameters change (see hfs_changefs).
 */
struct hfs_changefs_cargs {
	struct hfsmount *hfsmp;		/* volume whose parameters changed */
        int		namefix;	/* non-zero: text encoding changed; re-derive names */
        int		permfix;	/* non-zero: default uid/gid/mask changed */
        int		permswitch;	/* non-zero: MNT_UNKNOWNPERMISSIONS was toggled */
};
593
/*
 * Per-vnode callback invoked by vnode_iterate() from hfs_changefs().
 *
 * Re-reads each cnode's on-disk catalog record and, depending on which
 * mount parameters changed (see struct hfs_changefs_cargs), refreshes
 * ownership/permissions and/or the cached name.  The iterator handles
 * vnode referencing; we may visit a vnode more than once and can race
 * with fsync (noted at the call site).
 *
 * Always returns VNODE_RETURNED so iteration continues.
 */
static int
hfs_changefs_callback(struct vnode *vp, void *cargs)
{
	ExtendedVCB *vcb;
	struct cnode *cp;
	struct cat_desc cndesc;
	struct cat_attr cnattr;
	struct hfs_changefs_cargs *args;
	int lockflags;
	int error;

	args = (struct hfs_changefs_cargs *)cargs;

	cp = VTOC(vp);
	vcb = HFSTOVCB(args->hfsmp);

	/* Look up the current on-disk record under the catalog lock. */
	lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
	error = cat_lookup(args->hfsmp, &cp->c_desc, 0, &cndesc, &cnattr, NULL, NULL);
	hfs_systemfile_unlock(args->hfsmp, lockflags);
	if (error) {
	        /*
		 * If we couldn't find this guy skip to the next one
		 */
	        if (args->namefix)
		        cache_purge(vp);

		return (VNODE_RETURNED);
	}
	/*
	 * Get the real uid/gid and perm mask from disk.
	 */
	if (args->permswitch || args->permfix) {
	        cp->c_uid = cnattr.ca_uid;
		cp->c_gid = cnattr.ca_gid;
		cp->c_mode = cnattr.ca_mode;
	}
	/*
	 * If we're switching name converters then...
	 *   Remove the existing entry from the namei cache.
	 *   Update name to one based on new encoder.
	 */
	if (args->namefix) {
	        cache_purge(vp);
		/* replace_desc() consumes cndesc; no release needed on this path. */
		replace_desc(cp, &cndesc);

		if (cndesc.cd_cnid == kHFSRootFolderID) {
			/* Root folder rename also updates the volume name. */
		        strlcpy((char *)vcb->vcbVN, (const char *)cp->c_desc.cd_nameptr, NAME_MAX+1);
			cp->c_desc.cd_encoding = args->hfsmp->hfs_encoding;
		}
	} else {
		/* Not renaming: release the descriptor cat_lookup gave us. */
	        cat_releasedesc(&cndesc);
	}
	return (VNODE_RETURNED);
}
648
649/* Change fs mount parameters */
650static int
651hfs_changefs(struct mount *mp, struct hfs_mount_args *args)
652{
653	int retval = 0;
654	int namefix, permfix, permswitch;
655	struct hfsmount *hfsmp;
656	ExtendedVCB *vcb;
657	hfs_to_unicode_func_t	get_unicode_func;
658	unicode_to_hfs_func_t	get_hfsname_func;
659	u_int32_t old_encoding = 0;
660	struct hfs_changefs_cargs cargs;
661	u_int32_t mount_flags;
662
663	hfsmp = VFSTOHFS(mp);
664	vcb = HFSTOVCB(hfsmp);
665	mount_flags = (unsigned int)vfs_flags(mp);
666
667	hfsmp->hfs_flags |= HFS_IN_CHANGEFS;
668
669	permswitch = (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) &&
670	               ((mount_flags & MNT_UNKNOWNPERMISSIONS) == 0)) ||
671	              (((hfsmp->hfs_flags & HFS_UNKNOWN_PERMS) == 0) &&
672	               (mount_flags & MNT_UNKNOWNPERMISSIONS)));
673
674	/* The root filesystem must operate with actual permissions: */
675	if (permswitch && (mount_flags & MNT_ROOTFS) && (mount_flags & MNT_UNKNOWNPERMISSIONS)) {
676		vfs_clearflags(mp, (u_int64_t)((unsigned int)MNT_UNKNOWNPERMISSIONS));	/* Just say "No". */
677		retval = EINVAL;
678		goto exit;
679	}
680	if (mount_flags & MNT_UNKNOWNPERMISSIONS)
681		hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS;
682	else
683		hfsmp->hfs_flags &= ~HFS_UNKNOWN_PERMS;
684
685	namefix = permfix = 0;
686
687	/*
688	 * Tracking of hot files requires up-to-date access times.  So if
689	 * access time updates are disabled, we must also disable hot files.
690	 */
691	if (mount_flags & MNT_NOATIME) {
692		(void) hfs_recording_suspend(hfsmp);
693	}
694
695	/* Change the timezone (Note: this affects all hfs volumes and hfs+ volume create dates) */
696	if (args->hfs_timezone.tz_minuteswest != VNOVAL) {
697		gTimeZone = args->hfs_timezone;
698	}
699
700	/* Change the default uid, gid and/or mask */
701	if ((args->hfs_uid != (uid_t)VNOVAL) && (hfsmp->hfs_uid != args->hfs_uid)) {
702		hfsmp->hfs_uid = args->hfs_uid;
703		if (vcb->vcbSigWord == kHFSPlusSigWord)
704			++permfix;
705	}
706	if ((args->hfs_gid != (gid_t)VNOVAL) && (hfsmp->hfs_gid != args->hfs_gid)) {
707		hfsmp->hfs_gid = args->hfs_gid;
708		if (vcb->vcbSigWord == kHFSPlusSigWord)
709			++permfix;
710	}
711	if (args->hfs_mask != (mode_t)VNOVAL) {
712		if (hfsmp->hfs_dir_mask != (args->hfs_mask & ALLPERMS)) {
713			hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS;
714			hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS;
715			if ((args->flags != VNOVAL) && (args->flags & HFSFSMNT_NOXONFILES))
716				hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE);
717			if (vcb->vcbSigWord == kHFSPlusSigWord)
718				++permfix;
719		}
720	}
721
722	/* Change the hfs encoding value (hfs only) */
723	if ((vcb->vcbSigWord == kHFSSigWord)	&&
724	    (args->hfs_encoding != (u_int32_t)VNOVAL)              &&
725	    (hfsmp->hfs_encoding != args->hfs_encoding)) {
726
727		retval = hfs_getconverter(args->hfs_encoding, &get_unicode_func, &get_hfsname_func);
728		if (retval)
729			goto exit;
730
731		/*
732		 * Connect the new hfs_get_unicode converter but leave
733		 * the old hfs_get_hfsname converter in place so that
734		 * we can lookup existing vnodes to get their correctly
735		 * encoded names.
736		 *
737		 * When we're all finished, we can then connect the new
738		 * hfs_get_hfsname converter and release our interest
739		 * in the old converters.
740		 */
741		hfsmp->hfs_get_unicode = get_unicode_func;
742		old_encoding = hfsmp->hfs_encoding;
743		hfsmp->hfs_encoding = args->hfs_encoding;
744		++namefix;
745	}
746
747	if (!(namefix || permfix || permswitch))
748		goto exit;
749
750	/* XXX 3762912 hack to support HFS filesystem 'owner' */
751	if (permfix)
752		vfs_setowner(mp,
753		    hfsmp->hfs_uid == UNKNOWNUID ? KAUTH_UID_NONE : hfsmp->hfs_uid,
754		    hfsmp->hfs_gid == UNKNOWNGID ? KAUTH_GID_NONE : hfsmp->hfs_gid);
755
756	/*
757	 * For each active vnode fix things that changed
758	 *
759	 * Note that we can visit a vnode more than once
760	 * and we can race with fsync.
761	 *
762	 * hfs_changefs_callback will be called for each vnode
763	 * hung off of this mount point
764	 *
765	 * The vnode will be properly referenced and unreferenced
766	 * around the callback
767	 */
768	cargs.hfsmp = hfsmp;
769	cargs.namefix = namefix;
770	cargs.permfix = permfix;
771	cargs.permswitch = permswitch;
772
773	vnode_iterate(mp, 0, hfs_changefs_callback, (void *)&cargs);
774
775	/*
776	 * If we're switching name converters we can now
777	 * connect the new hfs_get_hfsname converter and
778	 * release our interest in the old converters.
779	 */
780	if (namefix) {
781		hfsmp->hfs_get_hfsname = get_hfsname_func;
782		vcb->volumeNameEncodingHint = args->hfs_encoding;
783		(void) hfs_relconverter(old_encoding);
784	}
785exit:
786	hfsmp->hfs_flags &= ~HFS_IN_CHANGEFS;
787	return (retval);
788}
789
790
/*
 * Argument bundle passed to hfs_reload_callback() through
 * vnode_iterate() from hfs_reload().
 */
struct hfs_reload_cargs {
	struct hfsmount *hfsmp;		/* volume being reloaded */
        int		error;		/* first error hit; stops the iteration */
};
795
/*
 * Per-vnode callback invoked by vnode_iterate() from hfs_reload().
 *
 * Invalidates the vnode's buffers, drops directory hints, and — for
 * ordinary user files — re-reads the cnode's catalog record from disk
 * (by fileID, since the name may have changed during fsck).
 *
 * Returns VNODE_RETURNED to continue iterating, or VNODE_RETURNED_DONE
 * after recording a lookup failure in args->error.
 */
static int
hfs_reload_callback(struct vnode *vp, void *cargs)
{
	struct cnode *cp;
	struct hfs_reload_cargs *args;
	int lockflags;

	args = (struct hfs_reload_cargs *)cargs;
	/*
	 * flush all the buffers associated with this node
	 */
	(void) buf_invalidateblks(vp, 0, 0, 0);

	cp = VTOC(vp);
	/*
	 * Remove any directory hints
	 */
	if (vnode_isdir(vp))
	        hfs_reldirhints(cp, 0);

	/*
	 * Re-read cnode data for all active vnodes (non-metadata files).
	 */
	if (!vnode_issystem(vp) && !VNODE_IS_RSRC(vp) && (cp->c_fileid >= kHFSFirstUserCatalogNodeID)) {
	        struct cat_fork *datafork;
		struct cat_desc desc;

		datafork = cp->c_datafork ? &cp->c_datafork->ff_data : NULL;

		/* lookup by fileID since name could have changed */
		lockflags = hfs_systemfile_lock(args->hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
		args->error = cat_idlookup(args->hfsmp, cp->c_fileid, 0, 0, &desc, &cp->c_attr, datafork);
		hfs_systemfile_unlock(args->hfsmp, lockflags);
		if (args->error) {
		        return (VNODE_RETURNED_DONE);
		}

		/* update cnode's catalog descriptor */
		(void) replace_desc(cp, &desc);
	}
	return (VNODE_RETURNED);
}
838
839/*
840 * Reload all incore data for a filesystem (used after running fsck on
841 * the root filesystem and finding things to fix). The filesystem must
842 * be mounted read-only.
843 *
844 * Things to do to update the mount:
845 *	invalidate all cached meta-data.
846 *	invalidate all inactive vnodes.
847 *	invalidate all cached file data.
848 *	re-read volume header from disk.
849 *	re-load meta-file info (extents, file size).
850 *	re-load B-tree header data.
851 *	re-read cnode data for all active vnodes.
852 */
int
hfs_reload(struct mount *mountp)
{
	register struct vnode *devvp;
	struct buf *bp;
	int error, i;
	struct hfsmount *hfsmp;
	struct HFSPlusVolumeHeader *vhp;
	ExtendedVCB *vcb;
	struct filefork *forkp;
    	struct cat_desc cndesc;
	struct hfs_reload_cargs args;
	daddr64_t priIDSector;

    	hfsmp = VFSTOHFS(mountp);
	vcb = HFSTOVCB(hfsmp);

	/* Reload is HFS Plus/HFSX only; plain HFS volumes are rejected outright. */
	if (vcb->vcbSigWord == kHFSSigWord)
		return (EINVAL);	/* rooting from HFS is not supported! */

	/*
	 * Invalidate all cached meta-data.
	 */
	devvp = hfsmp->hfs_devvp;
	if (buf_invalidateblks(devvp, 0, 0, 0))
		panic("hfs_reload: dirty1");

	args.hfsmp = hfsmp;
	args.error = 0;
	/*
	 * hfs_reload_callback will be called for each vnode
	 * hung off of this mount point that can't be recycled...
	 * vnode_iterate will recycle those that it can (the VNODE_RELOAD option)
	 * the vnode will be in an 'unbusy' state (VNODE_WAIT) and
	 * properly referenced and unreferenced around the callback
	 */
	vnode_iterate(mountp, VNODE_RELOAD | VNODE_WAIT, hfs_reload_callback, (void *)&args);

	/* Any per-vnode reload failure aborts the whole reload. */
	if (args.error)
	        return (args.error);

	/*
	 * Re-read VolumeHeader from disk.
	 *
	 * Primary header location: the volume's start offset (in logical
	 * blocks) plus the fixed HFS_PRI_SECTOR position, then rounded
	 * down to a physical-block boundary for the meta read.
	 */
	priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
			HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));

	error = (int)buf_meta_bread(hfsmp->hfs_devvp,
			HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
			hfsmp->hfs_physical_block_size, NOCRED, &bp);
	if (error) {
        	if (bp != NULL)
        		buf_brelse(bp);
		return (error);
	}

	/* The volume header sits at a fixed offset within the physical block. */
	vhp = (HFSPlusVolumeHeader *) (buf_dataptr(bp) + HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));

	/* Do a quick sanity check */
	if ((SWAP_BE16(vhp->signature) != kHFSPlusSigWord &&
	     SWAP_BE16(vhp->signature) != kHFSXSigWord) ||
	    (SWAP_BE16(vhp->version) != kHFSPlusVersion &&
	     SWAP_BE16(vhp->version) != kHFSXVersion) ||
	    SWAP_BE32(vhp->blockSize) != vcb->blockSize) {
		buf_brelse(bp);
		return (EIO);
	}

	/*
	 * Copy the mutable on-disk header fields (stored big-endian) into
	 * the in-core VCB, converting dates to BSD time where needed.
	 */
	vcb->vcbLsMod		= to_bsd_time(SWAP_BE32(vhp->modifyDate));
	vcb->vcbAtrb		= SWAP_BE32 (vhp->attributes);
	vcb->vcbJinfoBlock  = SWAP_BE32(vhp->journalInfoBlock);
	vcb->vcbClpSiz		= SWAP_BE32 (vhp->rsrcClumpSize);
	vcb->vcbNxtCNID		= SWAP_BE32 (vhp->nextCatalogID);
	vcb->vcbVolBkUp		= to_bsd_time(SWAP_BE32(vhp->backupDate));
	vcb->vcbWrCnt		= SWAP_BE32 (vhp->writeCount);
	vcb->vcbFilCnt		= SWAP_BE32 (vhp->fileCount);
	vcb->vcbDirCnt		= SWAP_BE32 (vhp->folderCount);
	HFS_UPDATE_NEXT_ALLOCATION(vcb, SWAP_BE32 (vhp->nextAllocation));
	vcb->totalBlocks	= SWAP_BE32 (vhp->totalBlocks);
	vcb->freeBlocks		= SWAP_BE32 (vhp->freeBlocks);
	vcb->encodingsBitmap	= SWAP_BE64 (vhp->encodingsBitmap);
	bcopy(vhp->finderInfo, vcb->vcbFndrInfo, sizeof(vhp->finderInfo));
	vcb->localCreateDate	= SWAP_BE32 (vhp->createDate); /* hfs+ create date is in local time */

	/*
	 * Re-load meta-file vnode data (extent info, file size, etc).
	 */
	forkp = VTOF((struct vnode *)vcb->extentsRefNum);
	for (i = 0; i < kHFSPlusExtentDensity; i++) {
		forkp->ff_extents[i].startBlock =
			SWAP_BE32 (vhp->extentsFile.extents[i].startBlock);
		forkp->ff_extents[i].blockCount =
			SWAP_BE32 (vhp->extentsFile.extents[i].blockCount);
	}
	forkp->ff_size      = SWAP_BE64 (vhp->extentsFile.logicalSize);
	forkp->ff_blocks    = SWAP_BE32 (vhp->extentsFile.totalBlocks);
	forkp->ff_clumpsize = SWAP_BE32 (vhp->extentsFile.clumpSize);

	/* Catalog file fork. */
	forkp = VTOF((struct vnode *)vcb->catalogRefNum);
	for (i = 0; i < kHFSPlusExtentDensity; i++) {
		forkp->ff_extents[i].startBlock	=
			SWAP_BE32 (vhp->catalogFile.extents[i].startBlock);
		forkp->ff_extents[i].blockCount	=
			SWAP_BE32 (vhp->catalogFile.extents[i].blockCount);
	}
	forkp->ff_size      = SWAP_BE64 (vhp->catalogFile.logicalSize);
	forkp->ff_blocks    = SWAP_BE32 (vhp->catalogFile.totalBlocks);
	forkp->ff_clumpsize = SWAP_BE32 (vhp->catalogFile.clumpSize);

	/* Attributes B-tree is optional; only reload if the vnode exists. */
	if (hfsmp->hfs_attribute_vp) {
		forkp = VTOF(hfsmp->hfs_attribute_vp);
		for (i = 0; i < kHFSPlusExtentDensity; i++) {
			forkp->ff_extents[i].startBlock	=
				SWAP_BE32 (vhp->attributesFile.extents[i].startBlock);
			forkp->ff_extents[i].blockCount	=
				SWAP_BE32 (vhp->attributesFile.extents[i].blockCount);
		}
		forkp->ff_size      = SWAP_BE64 (vhp->attributesFile.logicalSize);
		forkp->ff_blocks    = SWAP_BE32 (vhp->attributesFile.totalBlocks);
		forkp->ff_clumpsize = SWAP_BE32 (vhp->attributesFile.clumpSize);
	}

	/* Allocation (bitmap) file fork. */
	forkp = VTOF((struct vnode *)vcb->allocationsRefNum);
	for (i = 0; i < kHFSPlusExtentDensity; i++) {
		forkp->ff_extents[i].startBlock	=
			SWAP_BE32 (vhp->allocationFile.extents[i].startBlock);
		forkp->ff_extents[i].blockCount	=
			SWAP_BE32 (vhp->allocationFile.extents[i].blockCount);
	}
	forkp->ff_size      = SWAP_BE64 (vhp->allocationFile.logicalSize);
	forkp->ff_blocks    = SWAP_BE32 (vhp->allocationFile.totalBlocks);
	forkp->ff_clumpsize = SWAP_BE32 (vhp->allocationFile.clumpSize);

	/* Done with the raw header buffer; everything needed is in-core now. */
	buf_brelse(bp);
	vhp = NULL;

	/*
	 * Re-load B-tree header data
	 */
	forkp = VTOF((struct vnode *)vcb->extentsRefNum);
	if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
		return (error);

	forkp = VTOF((struct vnode *)vcb->catalogRefNum);
	if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
		return (error);

	if (hfsmp->hfs_attribute_vp) {
		forkp = VTOF(hfsmp->hfs_attribute_vp);
		if ( (error = MacToVFSError( BTReloadData((FCB*)forkp) )) )
			return (error);
	}

	/* Reload the volume name */
	if ((error = cat_idlookup(hfsmp, kHFSRootFolderID, 0, 0, &cndesc, NULL, NULL)))
		return (error);
	vcb->volumeNameEncodingHint = cndesc.cd_encoding;
	/*
	 * NOTE(review): no explicit NUL termination after this copy; assumes
	 * vcbVN is pre-zeroed or cd_namelen is always < sizeof(vcb->vcbVN)
	 * -- confirm against the VCB definition.
	 */
	bcopy(cndesc.cd_nameptr, vcb->vcbVN, min(255, cndesc.cd_namelen));
	cat_releasedesc(&cndesc);

	/* Re-establish private/hidden directories. */
	hfs_privatedir_init(hfsmp, FILE_HARDLINKS);
	hfs_privatedir_init(hfsmp, DIR_HARDLINKS);

	/* In case any volume information changed to trigger a notification */
	hfs_generate_volume_notifications(hfsmp);

	return (0);
}
1023
1024
1025
/*
 * hfs_syncer - thread_call callback that periodically flushes the journal
 * (or runs a full hfs_sync() on volumes without one).
 *
 * Behavior, as implemented below:
 *  - If pending writes exceed hfs_max_pending_io: start a transaction to
 *    hold off new i/o, wait until ~2/3 of the backlog drains (bounded by
 *    500 iterations of a 10-tick tsleep), flush, then re-tune
 *    hfs_max_pending_io toward ~1.5 seconds worth of i/o based on the
 *    measured drain rate.
 *  - Else flush if >= 5 seconds since the last sync, or >= 100ms since
 *    both the last sync and the last sync request while no transactions
 *    are active.
 *  - Else (threads still active) re-arm the timer and return early
 *    WITHOUT decrementing the scheduling counters.
 */
static void
hfs_syncer(void *arg0, void *unused)
{
#pragma unused(unused)

    struct hfsmount *hfsmp = arg0;
    clock_sec_t secs;
    clock_usec_t usecs;
    uint32_t delay = HFS_META_DELAY;
    uint64_t now;
    /* Cleared the first time the pending-io limit trips; not read here. */
    static int no_max=1;

    /* Current calendar time in microseconds. */
    clock_get_calendar_microtime(&secs, &usecs);
    now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;

    //
    // If the amount of pending writes is more than our limit, wait
    // for 2/3 of it to drain and then flush the journal.
    //
    if (hfsmp->hfs_mp->mnt_pending_write_size > hfsmp->hfs_max_pending_io) {
	    int counter=0;
	    uint64_t pending_io, start, rate = 0;

	    no_max = 0;

	    hfs_start_transaction(hfsmp);   // so we hold off any new i/o's

	    pending_io = hfsmp->hfs_mp->mnt_pending_write_size;

	    clock_get_calendar_microtime(&secs, &usecs);
	    start = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;

	    /* Poll until only 1/3 of the snapshot remains, or we give up. */
	    while(hfsmp->hfs_mp->mnt_pending_write_size > (pending_io/3) && counter++ < 500) {
		    tsleep((caddr_t)hfsmp, PRIBIO, "hfs-wait-for-io-to-drain", 10);
	    }

	    if (counter >= 500) {
		    printf("hfs: timed out waiting for io to drain (%lld)\n", (int64_t)hfsmp->hfs_mp->mnt_pending_write_size);
	    }

	    if (hfsmp->jnl) {
		    journal_flush(hfsmp->jnl, FALSE);
	    } else {
		    hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel());
	    }

	    clock_get_calendar_microtime(&secs, &usecs);
	    now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;
	    hfsmp->hfs_last_sync_time = now;
	    if (now != start) {
		    rate = ((pending_io * 1000000ULL) / (now - start));     // yields bytes per second
	    }

	    hfs_end_transaction(hfsmp);

	    //
	    // If a reasonable amount of time elapsed then check the
	    // i/o rate.  If it's taking less than 1 second or more
	    // than 2 seconds, adjust hfs_max_pending_io so that we
	    // will allow about 1.5 seconds of i/o to queue up.
	    //
	    if (((now - start) >= 300000) && (rate != 0)) {
		    /* scale is drain time in centiseconds: 100 == 1s, 200 == 2s. */
		    uint64_t scale = (pending_io * 100) / rate;

		    if (scale < 100 || scale > 200) {
			    // set it so that it should take about 1.5 seconds to drain
			    hfsmp->hfs_max_pending_io = (rate * 150ULL) / 100ULL;
		    }
	    }

    } else if (   ((now - hfsmp->hfs_last_sync_time) >= 5000000ULL)
	       || (((now - hfsmp->hfs_last_sync_time) >= 100000LL)
		   && ((now - hfsmp->hfs_last_sync_request_time) >= 100000LL)
		   && (hfsmp->hfs_active_threads == 0)
		   && (hfsmp->hfs_global_lock_nesting == 0))) {

	    //
	    // Flush the journal if more than 5 seconds elapsed since
	    // the last sync OR we have not sync'ed recently and the
	    // last sync request time was more than 100 milliseconds
	    // ago and no one is in the middle of a transaction right
	    // now.  Else we defer the sync and reschedule it.
	    //
	    if (hfsmp->jnl) {
			hfs_lock_global (hfsmp, HFS_SHARED_LOCK);

		    journal_flush(hfsmp->jnl, FALSE);

			hfs_unlock_global (hfsmp);
	    } else {
		    hfs_sync(hfsmp->hfs_mp, MNT_WAIT, vfs_context_kernel());
	    }

	    clock_get_calendar_microtime(&secs, &usecs);
	    now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;
	    hfsmp->hfs_last_sync_time = now;

    } else if (hfsmp->hfs_active_threads == 0) {
	    uint64_t deadline;

	    /* Transactions still pending: defer and try again after 'delay'. */
	    clock_interval_to_deadline(delay, HFS_MILLISEC_SCALE, &deadline);
	    thread_call_enter_delayed(hfsmp->hfs_syncer, deadline);

	    // note: we intentionally return early here and do not
	    // decrement the sync_scheduled and sync_incomplete
	    // variables because we rescheduled the timer.

	    return;
    }

    //
    // NOTE: we decrement these *after* we're done the journal_flush() since
    // it can take a significant amount of time and so we don't want more
    // callbacks scheduled until we're done this one.
    //
    OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_scheduled);
    OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete);
    wakeup((caddr_t)&hfsmp->hfs_sync_incomplete);
}
1145
1146
1147extern int IOBSDIsMediaEjectable( const char *cdev_name );
1148
1149/*
1150 * Initialization code for Red-Black Tree Allocator
1151 *
1152 * This function will build the two red-black trees necessary for allocating space
1153 * from the metadata zone as well as normal allocations.  Currently, we use
1154 * an advisory read to get most of the data into the buffer cache.
1155 * This function is intended to be run in a separate thread so as not to slow down mount.
1156 *
1157 */
1158
void
hfs_initialize_allocator (struct hfsmount *hfsmp) {

#if CONFIG_HFS_ALLOC_RBTREE
	u_int32_t err;

	/*
	 * Take the allocation file lock.  Journal transactions will block until
	 * we're done here.
	 */
	int flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);

	/*
	 * GenerateTree assumes that the bitmap lock is held when you call the function.
	 * It will drop and re-acquire the lock periodically as needed to let other allocations
	 * through.  It returns with the bitmap lock held. Since we only maintain one tree,
	 * we don't need to specify a start block (always starts at 0).
	 */
	err = GenerateTree(hfsmp, hfsmp->totalBlocks, &flags, 1);
	if (err) {
		goto bailout;
	}
	/* Mark offset tree as built */
	hfsmp->extent_tree_flags |= HFS_ALLOC_RB_ACTIVE;

bailout:
	/*
	 * GenerateTree may drop the bitmap lock during operation in order to give other
	 * threads a chance to allocate blocks, but it will always return with the lock held, so
	 * we don't need to re-grab the lock in order to update the TREEBUILD_INFLIGHT bit.
	 */
	hfsmp->extent_tree_flags &= ~HFS_ALLOC_TREEBUILD_INFLIGHT;
	if (err != 0) {
		/* Wakeup any waiters on the allocation bitmap lock */
		/* (hfs_teardown_allocator sleeps on &extent_tree_flags while the build is in flight) */
		wakeup((caddr_t)&hfsmp->extent_tree_flags);
	}

	hfs_systemfile_unlock(hfsmp, flags);
#else
#pragma unused (hfsmp)
#endif
}
1201
1202void hfs_unmap_blocks (struct hfsmount *hfsmp) {
1203	/*
1204	 * Take the allocation file lock.  Journal transactions will block until
1205	 * we're done here.
1206	 */
1207	int flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
1208
1209	/*
1210	 * UnmapBlocks assumes that the bitmap lock is held when you call the function.
1211	 * We don't care if there were any error issuing unmaps yet.
1212	 */
1213	(void) UnmapBlocks(hfsmp);
1214
1215	hfs_systemfile_unlock(hfsmp, flags);
1216}
1217
1218
1219/*
1220 * Teardown code for the Red-Black Tree allocator.
1221 * This function consolidates the code which serializes with respect
1222 * to a thread that may be potentially still building the tree when we need to begin
1223 * tearing it down.   Since the red-black tree may not be live when we enter this function
1224 * we return:
1225 *		1 -> Tree was live.
1226 *		0 -> Tree was not active at time of call.
1227 */
1228
int
hfs_teardown_allocator (struct hfsmount *hfsmp) {
	/* Return value: 1 if the red-black tree was live, 0 otherwise. */
	int rb_used = 0;

#if CONFIG_HFS_ALLOC_RBTREE

	int flags = 0;

	/*
	 * Check to see if the tree-generation is still on-going.
	 * If it is, then block until it's done.
	 */

	flags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);


	while (hfsmp->extent_tree_flags & HFS_ALLOC_TREEBUILD_INFLIGHT) {
		/* Tell the builder a teardown is waiting, then sleep on the
		 * extent_tree_flags event it signals (see hfs_initialize_allocator). */
		hfsmp->extent_tree_flags |= HFS_ALLOC_TEARDOWN_INFLIGHT;

		lck_rw_sleep(&(VTOC(hfsmp->hfs_allocation_vp))->c_rwlock, LCK_SLEEP_EXCLUSIVE,
					 &hfsmp->extent_tree_flags, THREAD_UNINT);
	}

	if (hfs_isrbtree_active (hfsmp)) {
		rb_used = 1;

		/* Tear down the RB Trees while we have the bitmap locked */
		DestroyTrees(hfsmp);

	}

	hfs_systemfile_unlock(hfsmp, flags);
#else
	#pragma unused (hfsmp)
#endif
	return rb_used;

}
1267
/*
 * Exported read-only as sysctl vfs.generic.root_unmounted_cleanly.
 * Set at mount time (in hfs_mountfs) from the root volume header's
 * kHFSVolumeUnmountedMask attribute bit.
 */
static int hfs_root_unmounted_cleanly = 0;

SYSCTL_DECL(_vfs_generic);
SYSCTL_INT(_vfs_generic, OID_AUTO, root_unmounted_cleanly, CTLFLAG_RD, &hfs_root_unmounted_cleanly, 0, "Root filesystem was unmounted cleanly");
1272
1273/*
1274 * Common code for mount and mountroot
1275 */
1276int
1277hfs_mountfs(struct vnode *devvp, struct mount *mp, struct hfs_mount_args *args,
1278            int journal_replay_only, vfs_context_t context)
1279{
1280	struct proc *p = vfs_context_proc(context);
1281	int retval = E_NONE;
1282	struct hfsmount	*hfsmp = NULL;
1283	struct buf *bp;
1284	dev_t dev;
1285	HFSMasterDirectoryBlock *mdbp = NULL;
1286	int ronly;
1287#if QUOTA
1288	int i;
1289#endif
1290	int mntwrapper;
1291	kauth_cred_t cred;
1292	u_int64_t disksize;
1293	daddr64_t log_blkcnt;
1294	u_int32_t log_blksize;
1295	u_int32_t phys_blksize;
1296	u_int32_t minblksize;
1297	u_int32_t iswritable;
1298	daddr64_t mdb_offset;
1299	int isvirtual = 0;
1300	int isroot = 0;
1301	u_int32_t device_features = 0;
1302	int isssd;
1303#if CONFIG_HFS_ALLOC_RBTREE
1304	thread_t allocator_thread;
1305#endif
1306
1307	if (args == NULL) {
1308		/* only hfs_mountroot passes us NULL as the 'args' argument */
1309		isroot = 1;
1310	}
1311
1312	ronly = vfs_isrdonly(mp);
1313	dev = vnode_specrdev(devvp);
1314	cred = p ? vfs_context_ucred(context) : NOCRED;
1315	mntwrapper = 0;
1316
1317	bp = NULL;
1318	hfsmp = NULL;
1319	mdbp = NULL;
1320	minblksize = kHFSBlockSize;
1321
1322	/* Advisory locking should be handled at the VFS layer */
1323	vfs_setlocklocal(mp);
1324
1325	/* Get the logical block size (treated as physical block size everywhere) */
1326	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&log_blksize, 0, context)) {
1327		if (HFS_MOUNT_DEBUG) {
1328			printf("hfs_mountfs: DKIOCGETBLOCKSIZE failed\n");
1329		}
1330		retval = ENXIO;
1331		goto error_exit;
1332	}
1333	if (log_blksize == 0 || log_blksize > 1024*1024*1024) {
1334		printf("hfs: logical block size 0x%x looks bad.  Not mounting.\n", log_blksize);
1335		retval = ENXIO;
1336		goto error_exit;
1337	}
1338
1339	/* Get the physical block size. */
1340	retval = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_blksize, 0, context);
1341	if (retval) {
1342		if ((retval != ENOTSUP) && (retval != ENOTTY)) {
1343			if (HFS_MOUNT_DEBUG) {
1344				printf("hfs_mountfs: DKIOCGETPHYSICALBLOCKSIZE failed\n");
1345			}
1346			retval = ENXIO;
1347			goto error_exit;
1348		}
1349		/* If device does not support this ioctl, assume that physical
1350		 * block size is same as logical block size
1351		 */
1352		phys_blksize = log_blksize;
1353	}
1354	if (phys_blksize == 0 || phys_blksize > 1024*1024*1024) {
1355		printf("hfs: physical block size 0x%x looks bad.  Not mounting.\n", phys_blksize);
1356		retval = ENXIO;
1357		goto error_exit;
1358	}
1359
1360	/* Switch to 512 byte sectors (temporarily) */
1361	if (log_blksize > 512) {
1362		u_int32_t size512 = 512;
1363
1364		if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&size512, FWRITE, context)) {
1365			if (HFS_MOUNT_DEBUG) {
1366				printf("hfs_mountfs: DKIOCSETBLOCKSIZE failed \n");
1367			}
1368			retval = ENXIO;
1369			goto error_exit;
1370		}
1371	}
1372	/* Get the number of 512 byte physical blocks. */
1373	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1374		/* resetting block size may fail if getting block count did */
1375		(void)VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context);
1376		if (HFS_MOUNT_DEBUG) {
1377			printf("hfs_mountfs: DKIOCGETBLOCKCOUNT failed\n");
1378		}
1379		retval = ENXIO;
1380		goto error_exit;
1381	}
1382	/* Compute an accurate disk size (i.e. within 512 bytes) */
1383	disksize = (u_int64_t)log_blkcnt * (u_int64_t)512;
1384
1385	/*
1386	 * On Tiger it is not necessary to switch the device
1387	 * block size to be 4k if there are more than 31-bits
1388	 * worth of blocks but to insure compatibility with
1389	 * pre-Tiger systems we have to do it.
1390	 *
1391	 * If the device size is not a multiple of 4K (8 * 512), then
1392	 * switching the logical block size isn't going to help because
1393	 * we will be unable to write the alternate volume header.
1394	 * In this case, just leave the logical block size unchanged.
1395	 */
1396	if (log_blkcnt > 0x000000007fffffff && (log_blkcnt & 7) == 0) {
1397		minblksize = log_blksize = 4096;
1398		if (phys_blksize < log_blksize)
1399			phys_blksize = log_blksize;
1400	}
1401
1402	/*
1403	 * The cluster layer is not currently prepared to deal with a logical
1404	 * block size larger than the system's page size.  (It can handle
1405	 * blocks per page, but not multiple pages per block.)  So limit the
1406	 * logical block size to the page size.
1407	 */
1408	if (log_blksize > PAGE_SIZE)
1409		log_blksize = PAGE_SIZE;
1410
1411	/* Now switch to our preferred physical block size. */
1412	if (log_blksize > 512) {
1413		if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1414			if (HFS_MOUNT_DEBUG) {
1415				printf("hfs_mountfs: DKIOCSETBLOCKSIZE (2) failed\n");
1416			}
1417			retval = ENXIO;
1418			goto error_exit;
1419		}
1420		/* Get the count of physical blocks. */
1421		if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1422			if (HFS_MOUNT_DEBUG) {
1423				printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (2) failed\n");
1424			}
1425			retval = ENXIO;
1426			goto error_exit;
1427		}
1428	}
1429	/*
1430	 * At this point:
1431	 *   minblksize is the minimum physical block size
1432	 *   log_blksize has our preferred physical block size
1433	 *   log_blkcnt has the total number of physical blocks
1434	 */
1435
1436	mdb_offset = (daddr64_t)HFS_PRI_SECTOR(log_blksize);
1437	if ((retval = (int)buf_meta_bread(devvp,
1438				HFS_PHYSBLK_ROUNDDOWN(mdb_offset, (phys_blksize/log_blksize)),
1439				phys_blksize, cred, &bp))) {
1440		if (HFS_MOUNT_DEBUG) {
1441			printf("hfs_mountfs: buf_meta_bread failed with %d\n", retval);
1442		}
1443		goto error_exit;
1444	}
1445	MALLOC(mdbp, HFSMasterDirectoryBlock *, kMDBSize, M_TEMP, M_WAITOK);
1446	if (mdbp == NULL) {
1447		retval = ENOMEM;
1448		if (HFS_MOUNT_DEBUG) {
1449			printf("hfs_mountfs: MALLOC failed\n");
1450		}
1451		goto error_exit;
1452	}
1453	bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, kMDBSize);
1454	buf_brelse(bp);
1455	bp = NULL;
1456
1457	MALLOC(hfsmp, struct hfsmount *, sizeof(struct hfsmount), M_HFSMNT, M_WAITOK);
1458	if (hfsmp == NULL) {
1459		if (HFS_MOUNT_DEBUG) {
1460			printf("hfs_mountfs: MALLOC (2) failed\n");
1461		}
1462		retval = ENOMEM;
1463		goto error_exit;
1464	}
1465	bzero(hfsmp, sizeof(struct hfsmount));
1466
1467	hfs_chashinit_finish(hfsmp);
1468
1469	/*
1470	 * See if the disk supports unmap (trim).
1471	 *
1472	 * NOTE: vfs_init_io_attributes has not been called yet, so we can't use the io_flags field
1473	 * returned by vfs_ioattr.  We need to call VNOP_IOCTL ourselves.
1474	 */
1475	if (VNOP_IOCTL(devvp, DKIOCGETFEATURES, (caddr_t)&device_features, 0, context) == 0) {
1476		if (device_features & DK_FEATURE_UNMAP) {
1477			hfsmp->hfs_flags |= HFS_UNMAP;
1478		}
1479	}
1480
1481	/*
1482	 * See if the disk is a solid state device, too.  We need this to decide what to do about
1483	 * hotfiles.
1484	 */
1485	if (VNOP_IOCTL(devvp, DKIOCISSOLIDSTATE, (caddr_t)&isssd, 0, context) == 0) {
1486		if (isssd) {
1487			hfsmp->hfs_flags |= HFS_SSD;
1488		}
1489	}
1490
1491
1492	/*
1493	 *  Init the volume information structure
1494	 */
1495
1496	lck_mtx_init(&hfsmp->hfs_mutex, hfs_mutex_group, hfs_lock_attr);
1497	lck_mtx_init(&hfsmp->hfc_mutex, hfs_mutex_group, hfs_lock_attr);
1498	lck_rw_init(&hfsmp->hfs_global_lock, hfs_rwlock_group, hfs_lock_attr);
1499	lck_rw_init(&hfsmp->hfs_insync, hfs_rwlock_group, hfs_lock_attr);
1500	lck_spin_init(&hfsmp->vcbFreeExtLock, hfs_spinlock_group, hfs_lock_attr);
1501
1502	vfs_setfsprivate(mp, hfsmp);
1503	hfsmp->hfs_mp = mp;			/* Make VFSTOHFS work */
1504	hfsmp->hfs_raw_dev = vnode_specrdev(devvp);
1505	hfsmp->hfs_devvp = devvp;
1506	vnode_ref(devvp);  /* Hold a ref on the device, dropped when hfsmp is freed. */
1507	hfsmp->hfs_logical_block_size = log_blksize;
1508	hfsmp->hfs_logical_block_count = log_blkcnt;
1509	hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt;
1510	hfsmp->hfs_physical_block_size = phys_blksize;
1511	hfsmp->hfs_log_per_phys = (phys_blksize / log_blksize);
1512	hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA;
1513	if (ronly)
1514		hfsmp->hfs_flags |= HFS_READ_ONLY;
1515	if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS)
1516		hfsmp->hfs_flags |= HFS_UNKNOWN_PERMS;
1517
1518#if QUOTA
1519	for (i = 0; i < MAXQUOTAS; i++)
1520		dqfileinit(&hfsmp->hfs_qfiles[i]);
1521#endif
1522
1523	if (args) {
1524		hfsmp->hfs_uid = (args->hfs_uid == (uid_t)VNOVAL) ? UNKNOWNUID : args->hfs_uid;
1525		if (hfsmp->hfs_uid == 0xfffffffd) hfsmp->hfs_uid = UNKNOWNUID;
1526		hfsmp->hfs_gid = (args->hfs_gid == (gid_t)VNOVAL) ? UNKNOWNGID : args->hfs_gid;
1527		if (hfsmp->hfs_gid == 0xfffffffd) hfsmp->hfs_gid = UNKNOWNGID;
1528		vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid);				/* tell the VFS */
1529		if (args->hfs_mask != (mode_t)VNOVAL) {
1530			hfsmp->hfs_dir_mask = args->hfs_mask & ALLPERMS;
1531			if (args->flags & HFSFSMNT_NOXONFILES) {
1532				hfsmp->hfs_file_mask = (args->hfs_mask & DEFFILEMODE);
1533			} else {
1534				hfsmp->hfs_file_mask = args->hfs_mask & ALLPERMS;
1535			}
1536		} else {
1537			hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS;		/* 0777: rwx---rwx */
1538			hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE;	/* 0666: no --x by default? */
1539		}
1540		if ((args->flags != (int)VNOVAL) && (args->flags & HFSFSMNT_WRAPPER))
1541			mntwrapper = 1;
1542	} else {
1543		/* Even w/o explicit mount arguments, MNT_UNKNOWNPERMISSIONS requires setting up uid, gid, and mask: */
1544		if (((unsigned int)vfs_flags(mp)) & MNT_UNKNOWNPERMISSIONS) {
1545			hfsmp->hfs_uid = UNKNOWNUID;
1546			hfsmp->hfs_gid = UNKNOWNGID;
1547			vfs_setowner(mp, hfsmp->hfs_uid, hfsmp->hfs_gid);			/* tell the VFS */
1548			hfsmp->hfs_dir_mask = UNKNOWNPERMISSIONS & ALLPERMS;		/* 0777: rwx---rwx */
1549			hfsmp->hfs_file_mask = UNKNOWNPERMISSIONS & DEFFILEMODE;	/* 0666: no --x by default? */
1550		}
1551	}
1552
1553	/* Find out if disk media is writable. */
1554	if (VNOP_IOCTL(devvp, DKIOCISWRITABLE, (caddr_t)&iswritable, 0, context) == 0) {
1555		if (iswritable)
1556			hfsmp->hfs_flags |= HFS_WRITEABLE_MEDIA;
1557		else
1558			hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA;
1559	}
1560
1561	// record the current time at which we're mounting this volume
1562	struct timeval tv;
1563	microtime(&tv);
1564	hfsmp->hfs_mount_time = tv.tv_sec;
1565
1566	/* Mount a standard HFS disk */
1567	if ((SWAP_BE16(mdbp->drSigWord) == kHFSSigWord) &&
1568	    (mntwrapper || (SWAP_BE16(mdbp->drEmbedSigWord) != kHFSPlusSigWord))) {
1569#if CONFIG_HFS_STD
1570		/* On 10.6 and beyond, non read-only mounts for HFS standard vols get rejected */
1571		if (vfs_isrdwr(mp)) {
1572			retval = EROFS;
1573			goto error_exit;
1574		}
1575
1576		printf("hfs_mountfs: Mounting HFS Standard volumes was deprecated in Mac OS 10.7 \n");
1577
1578		/* Treat it as if it's read-only and not writeable */
1579		hfsmp->hfs_flags |= HFS_READ_ONLY;
1580		hfsmp->hfs_flags &= ~HFS_WRITEABLE_MEDIA;
1581
1582	   	/* If only journal replay is requested, exit immediately */
1583		if (journal_replay_only) {
1584			retval = 0;
1585			goto error_exit;
1586		}
1587
1588	        if ((vfs_flags(mp) & MNT_ROOTFS)) {
1589			retval = EINVAL;  /* Cannot root from HFS standard disks */
1590			goto error_exit;
1591		}
1592		/* HFS disks can only use 512 byte physical blocks */
1593		if (log_blksize > kHFSBlockSize) {
1594			log_blksize = kHFSBlockSize;
1595			if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1596				retval = ENXIO;
1597				goto error_exit;
1598			}
1599			if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1600				retval = ENXIO;
1601				goto error_exit;
1602			}
1603			hfsmp->hfs_logical_block_size = log_blksize;
1604			hfsmp->hfs_logical_block_count = log_blkcnt;
1605			hfsmp->hfs_logical_bytes = (uint64_t) log_blksize * (uint64_t) log_blkcnt;
1606			hfsmp->hfs_physical_block_size = log_blksize;
1607			hfsmp->hfs_log_per_phys = 1;
1608		}
1609		if (args) {
1610			hfsmp->hfs_encoding = args->hfs_encoding;
1611			HFSTOVCB(hfsmp)->volumeNameEncodingHint = args->hfs_encoding;
1612
1613			/* establish the timezone */
1614			gTimeZone = args->hfs_timezone;
1615		}
1616
1617		retval = hfs_getconverter(hfsmp->hfs_encoding, &hfsmp->hfs_get_unicode,
1618					&hfsmp->hfs_get_hfsname);
1619		if (retval)
1620			goto error_exit;
1621
1622		retval = hfs_MountHFSVolume(hfsmp, mdbp, p);
1623		if (retval)
1624			(void) hfs_relconverter(hfsmp->hfs_encoding);
1625#else
1626		/* On platforms where HFS Standard is not supported, deny the mount altogether */
1627		retval = EINVAL;
1628		goto error_exit;
1629#endif
1630
1631	} else /* Mount an HFS Plus disk */ {
1632		HFSPlusVolumeHeader *vhp;
1633		off_t embeddedOffset;
1634		int   jnl_disable = 0;
1635
1636		/* Get the embedded Volume Header */
1637		if (SWAP_BE16(mdbp->drEmbedSigWord) == kHFSPlusSigWord) {
1638			embeddedOffset = SWAP_BE16(mdbp->drAlBlSt) * kHFSBlockSize;
1639			embeddedOffset += (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.startBlock) *
1640			                  (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz);
1641
1642			/*
1643			 * If the embedded volume doesn't start on a block
1644			 * boundary, then switch the device to a 512-byte
1645			 * block size so everything will line up on a block
1646			 * boundary.
1647			 */
1648			if ((embeddedOffset % log_blksize) != 0) {
1649				printf("hfs_mountfs: embedded volume offset not"
1650				    " a multiple of physical block size (%d);"
1651				    " switching to 512\n", log_blksize);
1652				log_blksize = 512;
1653				if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE,
1654				    (caddr_t)&log_blksize, FWRITE, context)) {
1655
1656					if (HFS_MOUNT_DEBUG) {
1657						printf("hfs_mountfs: DKIOCSETBLOCKSIZE (3) failed\n");
1658					}
1659					retval = ENXIO;
1660					goto error_exit;
1661				}
1662				if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT,
1663				    (caddr_t)&log_blkcnt, 0, context)) {
1664					if (HFS_MOUNT_DEBUG) {
1665						printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (3) failed\n");
1666					}
1667					retval = ENXIO;
1668					goto error_exit;
1669				}
1670				/* Note: relative block count adjustment */
1671				hfsmp->hfs_logical_block_count *=
1672				    hfsmp->hfs_logical_block_size / log_blksize;
1673
1674				/* Update logical /physical block size */
1675				hfsmp->hfs_logical_block_size = log_blksize;
1676				hfsmp->hfs_physical_block_size = log_blksize;
1677
1678				phys_blksize = log_blksize;
1679				hfsmp->hfs_log_per_phys = 1;
1680			}
1681
1682			disksize = (u_int64_t)SWAP_BE16(mdbp->drEmbedExtent.blockCount) *
1683			           (u_int64_t)SWAP_BE32(mdbp->drAlBlkSiz);
1684
1685			hfsmp->hfs_logical_block_count = disksize / log_blksize;
1686
1687			hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
1688
1689			mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1690			retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1691					phys_blksize, cred, &bp);
1692			if (retval) {
1693				if (HFS_MOUNT_DEBUG) {
1694					printf("hfs_mountfs: buf_meta_bread (2) failed with %d\n", retval);
1695				}
1696				goto error_exit;
1697			}
1698			bcopy((char *)buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize), mdbp, 512);
1699			buf_brelse(bp);
1700			bp = NULL;
1701			vhp = (HFSPlusVolumeHeader*) mdbp;
1702
1703		} else /* pure HFS+ */ {
1704			embeddedOffset = 0;
1705			vhp = (HFSPlusVolumeHeader*) mdbp;
1706		}
1707
1708		if (isroot) {
1709			hfs_root_unmounted_cleanly = ((SWAP_BE32(vhp->attributes) & kHFSVolumeUnmountedMask) != 0);
1710		}
1711
1712		/*
1713		 * On inconsistent disks, do not allow read-write mount
1714		 * unless it is the boot volume being mounted.  We also
1715		 * always want to replay the journal if the journal_replay_only
1716		 * flag is set because that will (most likely) get the
1717		 * disk into a consistent state before fsck_hfs starts
1718		 * looking at it.
1719		 */
1720		if (  !(vfs_flags(mp) & MNT_ROOTFS)
1721		   && (SWAP_BE32(vhp->attributes) & kHFSVolumeInconsistentMask)
1722		   && !journal_replay_only
1723		   && !(hfsmp->hfs_flags & HFS_READ_ONLY)) {
1724
1725			if (HFS_MOUNT_DEBUG) {
1726				printf("hfs_mountfs: failed to mount non-root inconsistent disk\n");
1727			}
1728			retval = EINVAL;
1729			goto error_exit;
1730		}
1731
1732
1733		// XXXdbg
1734		//
1735		hfsmp->jnl = NULL;
1736		hfsmp->jvp = NULL;
1737		if (args != NULL && (args->flags & HFSFSMNT_EXTENDED_ARGS) &&
1738		    args->journal_disable) {
1739		    jnl_disable = 1;
1740		}
1741
1742		//
1743		// We only initialize the journal here if the last person
1744		// to mount this volume was journaling aware.  Otherwise
1745		// we delay journal initialization until later at the end
1746		// of hfs_MountHFSPlusVolume() because the last person who
1747		// mounted it could have messed things up behind our back
1748		// (so we need to go find the .journal file, make sure it's
1749		// the right size, re-sync up if it was moved, etc).
1750		//
1751		if (   (SWAP_BE32(vhp->lastMountedVersion) == kHFSJMountVersion)
1752			&& (SWAP_BE32(vhp->attributes) & kHFSVolumeJournaledMask)
1753			&& !jnl_disable) {
1754
1755			// if we're able to init the journal, mark the mount
1756			// point as journaled.
1757			//
1758			if ((retval = hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred)) == 0) {
1759				vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
1760			} else {
1761				if (retval == EROFS) {
1762					// EROFS is a special error code that means the volume has an external
1763					// journal which we couldn't find.  in that case we do not want to
1764					// rewrite the volume header - we'll just refuse to mount the volume.
1765					if (HFS_MOUNT_DEBUG) {
1766						printf("hfs_mountfs: hfs_early_journal_init indicated external jnl \n");
1767					}
1768					retval = EINVAL;
1769					goto error_exit;
1770				}
1771
1772				// if the journal failed to open, then set the lastMountedVersion
1773				// to be "FSK!" which fsck_hfs will see and force the fsck instead
1774				// of just bailing out because the volume is journaled.
1775				if (!ronly) {
1776					if (HFS_MOUNT_DEBUG) {
1777						printf("hfs_mountfs: hfs_early_journal_init failed, setting to FSK \n");
1778					}
1779
1780					HFSPlusVolumeHeader *jvhp;
1781
1782				    hfsmp->hfs_flags |= HFS_NEED_JNL_RESET;
1783
1784				    if (mdb_offset == 0) {
1785					mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1786				    }
1787
1788				    bp = NULL;
1789				    retval = (int)buf_meta_bread(devvp,
1790						    HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1791						    phys_blksize, cred, &bp);
1792				    if (retval == 0) {
1793					jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize));
1794
1795					if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) {
1796						printf ("hfs(1): Journal replay fail.  Writing lastMountVersion as FSK!\n");
1797					    jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion);
1798					    buf_bwrite(bp);
1799					} else {
1800					    buf_brelse(bp);
1801					}
1802					bp = NULL;
1803				    } else if (bp) {
1804					buf_brelse(bp);
1805					// clear this so the error exit path won't try to use it
1806					bp = NULL;
1807				    }
1808				}
1809
1810				// if this isn't the root device just bail out.
1811				// If it is the root device we just continue on
1812				// in the hopes that fsck_hfs will be able to
1813				// fix any damage that exists on the volume.
1814				if ( !(vfs_flags(mp) & MNT_ROOTFS)) {
1815					if (HFS_MOUNT_DEBUG) {
1816						printf("hfs_mountfs: hfs_early_journal_init failed, erroring out \n");
1817					}
1818				    retval = EINVAL;
1819				    goto error_exit;
1820				}
1821			}
1822		}
1823		// XXXdbg
1824
1825		/* Either the journal is replayed successfully, or there
1826		 * was nothing to replay, or no journal exists.  In any case,
1827		 * return success.
1828		 */
1829		if (journal_replay_only) {
1830			retval = 0;
1831			goto error_exit;
1832		}
1833
1834		(void) hfs_getconverter(0, &hfsmp->hfs_get_unicode, &hfsmp->hfs_get_hfsname);
1835
1836		retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred);
1837		/*
1838		 * If the backend didn't like our physical blocksize
1839		 * then retry with physical blocksize of 512.
1840		 */
1841		if ((retval == ENXIO) && (log_blksize > 512) && (log_blksize != minblksize)) {
1842			printf("hfs_mountfs: could not use physical block size "
1843					"(%d) switching to 512\n", log_blksize);
1844			log_blksize = 512;
1845			if (VNOP_IOCTL(devvp, DKIOCSETBLOCKSIZE, (caddr_t)&log_blksize, FWRITE, context)) {
1846				if (HFS_MOUNT_DEBUG) {
1847					printf("hfs_mountfs: DKIOCSETBLOCKSIZE (4) failed \n");
1848				}
1849				retval = ENXIO;
1850				goto error_exit;
1851			}
1852			if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&log_blkcnt, 0, context)) {
1853				if (HFS_MOUNT_DEBUG) {
1854					printf("hfs_mountfs: DKIOCGETBLOCKCOUNT (4) failed \n");
1855				}
1856				retval = ENXIO;
1857				goto error_exit;
1858			}
1859			devvp->v_specsize = log_blksize;
1860			/* Note: relative block count adjustment (in case this is an embedded volume). */
1861			hfsmp->hfs_logical_block_count *= hfsmp->hfs_logical_block_size / log_blksize;
1862			hfsmp->hfs_logical_block_size = log_blksize;
1863			hfsmp->hfs_log_per_phys = hfsmp->hfs_physical_block_size / log_blksize;
1864
1865			hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
1866
1867			if (hfsmp->jnl && hfsmp->jvp == devvp) {
1868			    // close and re-open this with the new block size
1869			    journal_close(hfsmp->jnl);
1870			    hfsmp->jnl = NULL;
1871			    if (hfs_early_journal_init(hfsmp, vhp, args, embeddedOffset, mdb_offset, mdbp, cred) == 0) {
1872					vfs_setflags(mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
1873				} else {
1874					// if the journal failed to open, then set the lastMountedVersion
1875					// to be "FSK!" which fsck_hfs will see and force the fsck instead
1876					// of just bailing out because the volume is journaled.
1877					if (!ronly) {
1878						if (HFS_MOUNT_DEBUG) {
1879							printf("hfs_mountfs: hfs_early_journal_init (2) resetting.. \n");
1880						}
1881				    	HFSPlusVolumeHeader *jvhp;
1882
1883				    	hfsmp->hfs_flags |= HFS_NEED_JNL_RESET;
1884
1885				    	if (mdb_offset == 0) {
1886							mdb_offset = (daddr64_t)((embeddedOffset / log_blksize) + HFS_PRI_SECTOR(log_blksize));
1887				    	}
1888
1889				   	 	bp = NULL;
1890				    	retval = (int)buf_meta_bread(devvp, HFS_PHYSBLK_ROUNDDOWN(mdb_offset, hfsmp->hfs_log_per_phys),
1891							phys_blksize, cred, &bp);
1892				    	if (retval == 0) {
1893							jvhp = (HFSPlusVolumeHeader *)(buf_dataptr(bp) + HFS_PRI_OFFSET(phys_blksize));
1894
1895							if (SWAP_BE16(jvhp->signature) == kHFSPlusSigWord || SWAP_BE16(jvhp->signature) == kHFSXSigWord) {
1896								printf ("hfs(2): Journal replay fail.  Writing lastMountVersion as FSK!\n");
1897					    		jvhp->lastMountedVersion = SWAP_BE32(kFSKMountVersion);
1898					    		buf_bwrite(bp);
1899							} else {
1900					    		buf_brelse(bp);
1901							}
1902							bp = NULL;
1903				    	} else if (bp) {
1904							buf_brelse(bp);
1905							// clear this so the error exit path won't try to use it
1906							bp = NULL;
1907				    	}
1908					}
1909
1910					// if this isn't the root device just bail out.
1911					// If it is the root device we just continue on
1912					// in the hopes that fsck_hfs will be able to
1913					// fix any damage that exists on the volume.
1914					if ( !(vfs_flags(mp) & MNT_ROOTFS)) {
1915						if (HFS_MOUNT_DEBUG) {
1916							printf("hfs_mountfs: hfs_early_journal_init (2) failed \n");
1917						}
1918				    	retval = EINVAL;
1919				    	goto error_exit;
1920					}
1921				}
1922			}
1923
1924			/* Try again with a smaller block size... */
1925			retval = hfs_MountHFSPlusVolume(hfsmp, vhp, embeddedOffset, disksize, p, args, cred);
1926			if (retval && HFS_MOUNT_DEBUG) {
1927				printf("hfs_MountHFSPlusVolume (late) returned %d\n",retval);
1928			}
1929		}
1930		if (retval)
1931			(void) hfs_relconverter(0);
1932	}
1933
1934	// save off a snapshot of the mtime from the previous mount
1935	// (for matador).
1936	hfsmp->hfs_last_mounted_mtime = hfsmp->hfs_mtime;
1937
1938	if ( retval ) {
1939		if (HFS_MOUNT_DEBUG) {
1940			printf("hfs_mountfs: encountered failure %d \n", retval);
1941		}
1942		goto error_exit;
1943	}
1944
1945	mp->mnt_vfsstat.f_fsid.val[0] = (long)dev;
1946	mp->mnt_vfsstat.f_fsid.val[1] = vfs_typenum(mp);
1947	vfs_setmaxsymlen(mp, 0);
1948
1949	mp->mnt_vtable->vfc_vfsflags |= VFC_VFSNATIVEXATTR;
1950#if NAMEDSTREAMS
1951	mp->mnt_kern_flag |= MNTK_NAMED_STREAMS;
1952#endif
1953	if (!(hfsmp->hfs_flags & HFS_STANDARD)) {
1954		/* Tell VFS that we support directory hard links. */
1955		mp->mnt_vtable->vfc_vfsflags |= VFC_VFSDIRLINKS;
1956	} else {
1957		/* HFS standard doesn't support extended readdir! */
1958		mount_set_noreaddirext (mp);
1959	}
1960
1961	if (args) {
1962		/*
1963		 * Set the free space warning levels for a non-root volume:
1964		 *
1965		 * Set the "danger" limit to 1% of the volume size or 100MB, whichever
1966		 * is less.  Set the "warning" limit to 2% of the volume size or 150MB,
1967		 * whichever is less.  And last, set the "desired" freespace level to
1968		 * to 3% of the volume size or 200MB, whichever is less.
1969		 */
1970		hfsmp->hfs_freespace_notify_dangerlimit =
1971			MIN(HFS_VERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1972				(HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_VERYLOWDISKTRIGGERFRACTION);
1973		hfsmp->hfs_freespace_notify_warninglimit =
1974			MIN(HFS_LOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1975				(HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKTRIGGERFRACTION);
1976		hfsmp->hfs_freespace_notify_desiredlevel =
1977			MIN(HFS_LOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize,
1978				(HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_LOWDISKSHUTOFFFRACTION);
1979	} else {
1980		/*
1981		 * Set the free space warning levels for the root volume:
1982		 *
1983		 * Set the "danger" limit to 5% of the volume size or 512MB, whichever
1984		 * is less.  Set the "warning" limit to 10% of the volume size or 1GB,
1985		 * whichever is less.  And last, set the "desired" freespace level to
1986		 * to 11% of the volume size or 1.25GB, whichever is less.
1987		 */
1988		hfsmp->hfs_freespace_notify_dangerlimit =
1989			MIN(HFS_ROOTVERYLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1990				(HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTVERYLOWDISKTRIGGERFRACTION);
1991		hfsmp->hfs_freespace_notify_warninglimit =
1992			MIN(HFS_ROOTLOWDISKTRIGGERLEVEL / HFSTOVCB(hfsmp)->blockSize,
1993				(HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKTRIGGERFRACTION);
1994		hfsmp->hfs_freespace_notify_desiredlevel =
1995			MIN(HFS_ROOTLOWDISKSHUTOFFLEVEL / HFSTOVCB(hfsmp)->blockSize,
1996				(HFSTOVCB(hfsmp)->totalBlocks / 100) * HFS_ROOTLOWDISKSHUTOFFFRACTION);
1997	};
1998
1999	/* Check if the file system exists on virtual device, like disk image */
2000	if (VNOP_IOCTL(devvp, DKIOCISVIRTUAL, (caddr_t)&isvirtual, 0, context) == 0) {
2001		if (isvirtual) {
2002			hfsmp->hfs_flags |= HFS_VIRTUAL_DEVICE;
2003		}
2004	}
2005
2006	/* do not allow ejectability checks on the root device */
2007	if (isroot == 0) {
2008		if ((hfsmp->hfs_flags & HFS_VIRTUAL_DEVICE) == 0 &&
2009				IOBSDIsMediaEjectable(mp->mnt_vfsstat.f_mntfromname)) {
2010			hfsmp->hfs_max_pending_io = 4096*1024;   // a reasonable value to start with.
2011			hfsmp->hfs_syncer = thread_call_allocate(hfs_syncer, hfsmp);
2012			if (hfsmp->hfs_syncer == NULL) {
2013				printf("hfs: failed to allocate syncer thread callback for %s (%s)\n",
2014						mp->mnt_vfsstat.f_mntfromname, mp->mnt_vfsstat.f_mntonname);
2015			}
2016		}
2017	}
2018
2019#if CONFIG_HFS_MOUNT_UNMAP
2020	/* Enable UNMAPs for embedded SSDs only for now */
2021	/*
2022	 * TODO: Should we enable this for CoreStorage volumes, too?
2023	 */
2024	if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) {
2025		if (hfsmp->hfs_flags & HFS_UNMAP) {
2026			hfs_unmap_blocks(hfsmp);
2027		}
2028	}
2029#endif
2030
2031
2032#if CONFIG_HFS_ALLOC_RBTREE
2033	/*
2034	 * We spawn a thread to create the pair of red-black trees for this volume.
2035	 * However, in so doing, we must be careful to ensure that if this thread is still
2036	 * running after mount has finished, it doesn't interfere with an unmount. Specifically,
2037	 * we'll need to set a bit that indicates we're in progress building the trees here.
2038	 * Unmount will check for this bit, and then if it's set, mark a corresponding bit that
2039	 * notifies the tree generation code that an unmount is waiting.  Also mark the bit that
2040	 * indicates the tree is live and operating.
2041	 *
2042	 * Only do this if we're operating on a read-write mount (we wouldn't care for read-only).
2043	 */
2044
2045	if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) {
2046		hfsmp->extent_tree_flags |= (HFS_ALLOC_TREEBUILD_INFLIGHT | HFS_ALLOC_RB_ENABLED);
2047
2048		/* Initialize EOF counter so that the thread can assume it started at initial values */
2049		hfsmp->offset_block_end = 0;
2050		InitTree(hfsmp);
2051
2052		kernel_thread_start ((thread_continue_t) hfs_initialize_allocator , hfsmp, &allocator_thread);
2053		thread_deallocate(allocator_thread);
2054	}
2055
2056#endif
2057
2058	/*
2059	 * Start looking for free space to drop below this level and generate a
2060	 * warning immediately if needed:
2061	 */
2062	hfsmp->hfs_notification_conditions = 0;
2063	hfs_generate_volume_notifications(hfsmp);
2064
2065	if (ronly == 0) {
2066		(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
2067	}
2068	FREE(mdbp, M_TEMP);
2069	return (0);
2070
2071error_exit:
2072	if (bp)
2073		buf_brelse(bp);
2074	if (mdbp)
2075		FREE(mdbp, M_TEMP);
2076
2077	if (hfsmp && hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
2078		vnode_clearmountedon(hfsmp->jvp);
2079		(void)VNOP_CLOSE(hfsmp->jvp, ronly ? FREAD : FREAD|FWRITE, vfs_context_kernel());
2080		hfsmp->jvp = NULL;
2081	}
2082	if (hfsmp) {
2083		if (hfsmp->hfs_devvp) {
2084			vnode_rele(hfsmp->hfs_devvp);
2085		}
2086		hfs_delete_chash(hfsmp);
2087
2088		FREE(hfsmp, M_HFSMNT);
2089		vfs_setfsprivate(mp, NULL);
2090	}
2091        return (retval);
2092}
2093
2094
2095/*
2096 * Make a filesystem operational.
2097 * Nothing to do at the moment.
2098 */
2099/* ARGSUSED */
2100static int
2101hfs_start(__unused struct mount *mp, __unused int flags, __unused vfs_context_t context)
2102{
2103	return (0);
2104}
2105
2106
2107/*
2108 * unmount system call
2109 */
2110int
2111hfs_unmount(struct mount *mp, int mntflags, vfs_context_t context)
2112{
2113	struct proc *p = vfs_context_proc(context);
2114	struct hfsmount *hfsmp = VFSTOHFS(mp);
2115	int retval = E_NONE;
2116	int flags;
2117	int force;
2118	int started_tr = 0;
2119	int rb_used = 0;
2120
2121	flags = 0;
2122	force = 0;
2123	if (mntflags & MNT_FORCE) {
2124		flags |= FORCECLOSE;
2125		force = 1;
2126	}
2127
2128	if ((retval = hfs_flushfiles(mp, flags, p)) && !force)
2129 		return (retval);
2130
2131	if (hfsmp->hfs_flags & HFS_METADATA_ZONE)
2132		(void) hfs_recording_suspend(hfsmp);
2133
2134	/*
2135	 * Cancel any pending timers for this volume.  Then wait for any timers
2136	 * which have fired, but whose callbacks have not yet completed.
2137	 */
2138	if (hfsmp->hfs_syncer)
2139	{
2140		struct timespec ts = {0, 100000000};	/* 0.1 seconds */
2141
2142		/*
2143		 * Cancel any timers that have been scheduled, but have not
2144		 * fired yet.  NOTE: The kernel considers a timer complete as
2145		 * soon as it starts your callback, so the kernel does not
2146		 * keep track of the number of callbacks in progress.
2147		 */
2148		if (thread_call_cancel(hfsmp->hfs_syncer))
2149			OSDecrementAtomic((volatile SInt32 *)&hfsmp->hfs_sync_incomplete);
2150		thread_call_free(hfsmp->hfs_syncer);
2151		hfsmp->hfs_syncer = NULL;
2152
2153		/*
2154		 * This waits for all of the callbacks that were entered before
2155		 * we did thread_call_cancel above, but have not completed yet.
2156		 */
2157		while(hfsmp->hfs_sync_incomplete > 0)
2158		{
2159			msleep((caddr_t)&hfsmp->hfs_sync_incomplete, NULL, PWAIT, "hfs_unmount", &ts);
2160		}
2161
2162		if (hfsmp->hfs_sync_incomplete < 0)
2163			panic("hfs_unmount: pm_sync_incomplete underflow!\n");
2164	}
2165
2166#if CONFIG_HFS_ALLOC_RBTREE
2167	rb_used = hfs_teardown_allocator(hfsmp);
2168#endif
2169
2170	/*
2171	 * Flush out the b-trees, volume bitmap and Volume Header
2172	 */
2173	if ((hfsmp->hfs_flags & HFS_READ_ONLY) == 0) {
2174		retval = hfs_start_transaction(hfsmp);
2175		if (retval == 0) {
2176		    started_tr = 1;
2177		} else if (!force) {
2178		    goto err_exit;
2179		}
2180
2181		if (hfsmp->hfs_startup_vp) {
2182			(void) hfs_lock(VTOC(hfsmp->hfs_startup_vp), HFS_EXCLUSIVE_LOCK);
2183			retval = hfs_fsync(hfsmp->hfs_startup_vp, MNT_WAIT, 0, p);
2184			hfs_unlock(VTOC(hfsmp->hfs_startup_vp));
2185			if (retval && !force)
2186				goto err_exit;
2187		}
2188
2189		if (hfsmp->hfs_attribute_vp) {
2190			(void) hfs_lock(VTOC(hfsmp->hfs_attribute_vp), HFS_EXCLUSIVE_LOCK);
2191			retval = hfs_fsync(hfsmp->hfs_attribute_vp, MNT_WAIT, 0, p);
2192			hfs_unlock(VTOC(hfsmp->hfs_attribute_vp));
2193			if (retval && !force)
2194				goto err_exit;
2195		}
2196
2197		(void) hfs_lock(VTOC(hfsmp->hfs_catalog_vp), HFS_EXCLUSIVE_LOCK);
2198		retval = hfs_fsync(hfsmp->hfs_catalog_vp, MNT_WAIT, 0, p);
2199		hfs_unlock(VTOC(hfsmp->hfs_catalog_vp));
2200		if (retval && !force)
2201			goto err_exit;
2202
2203		(void) hfs_lock(VTOC(hfsmp->hfs_extents_vp), HFS_EXCLUSIVE_LOCK);
2204		retval = hfs_fsync(hfsmp->hfs_extents_vp, MNT_WAIT, 0, p);
2205		hfs_unlock(VTOC(hfsmp->hfs_extents_vp));
2206		if (retval && !force)
2207			goto err_exit;
2208
2209		if (hfsmp->hfs_allocation_vp) {
2210			(void) hfs_lock(VTOC(hfsmp->hfs_allocation_vp), HFS_EXCLUSIVE_LOCK);
2211			retval = hfs_fsync(hfsmp->hfs_allocation_vp, MNT_WAIT, 0, p);
2212			hfs_unlock(VTOC(hfsmp->hfs_allocation_vp));
2213			if (retval && !force)
2214				goto err_exit;
2215		}
2216
2217		if (hfsmp->hfc_filevp && vnode_issystem(hfsmp->hfc_filevp)) {
2218			retval = hfs_fsync(hfsmp->hfc_filevp, MNT_WAIT, 0, p);
2219			if (retval && !force)
2220				goto err_exit;
2221		}
2222
2223		/* If runtime corruption was detected, indicate that the volume
2224		 * was not unmounted cleanly.
2225		 */
2226		if (hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) {
2227			HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
2228		} else {
2229			HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeUnmountedMask;
2230		}
2231
2232
2233		if (rb_used) {
2234			/* If the rb-tree was live, just set min_start to 0 */
2235			hfsmp->nextAllocation = 0;
2236		}
2237		else {
2238			if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
2239				int i;
2240				u_int32_t min_start = hfsmp->totalBlocks;
2241
2242				// set the nextAllocation pointer to the smallest free block number
2243				// we've seen so on the next mount we won't rescan unnecessarily
2244				lck_spin_lock(&hfsmp->vcbFreeExtLock);
2245				for(i=0; i < (int)hfsmp->vcbFreeExtCnt; i++) {
2246					if (hfsmp->vcbFreeExt[i].startBlock < min_start) {
2247						min_start = hfsmp->vcbFreeExt[i].startBlock;
2248					}
2249				}
2250				lck_spin_unlock(&hfsmp->vcbFreeExtLock);
2251				if (min_start < hfsmp->nextAllocation) {
2252					hfsmp->nextAllocation = min_start;
2253				}
2254			}
2255		}
2256
2257
2258		retval = hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
2259		if (retval) {
2260			HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeUnmountedMask;
2261			if (!force)
2262				goto err_exit;	/* could not flush everything */
2263		}
2264
2265		if (started_tr) {
2266		    hfs_end_transaction(hfsmp);
2267		    started_tr = 0;
2268		}
2269	}
2270
2271	if (hfsmp->jnl) {
2272		hfs_journal_flush(hfsmp, FALSE);
2273	}
2274
2275	/*
2276	 *	Invalidate our caches and release metadata vnodes
2277	 */
2278	(void) hfsUnmount(hfsmp, p);
2279
2280	if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord)
2281		(void) hfs_relconverter(hfsmp->hfs_encoding);
2282
2283	// XXXdbg
2284	if (hfsmp->jnl) {
2285	    journal_close(hfsmp->jnl);
2286	    hfsmp->jnl = NULL;
2287	}
2288
2289	VNOP_FSYNC(hfsmp->hfs_devvp, MNT_WAIT, context);
2290
2291	if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
2292	    vnode_clearmountedon(hfsmp->jvp);
2293	    retval = VNOP_CLOSE(hfsmp->jvp,
2294	                       hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE,
2295			       vfs_context_kernel());
2296	    vnode_put(hfsmp->jvp);
2297	    hfsmp->jvp = NULL;
2298	}
2299	// XXXdbg
2300
2301	/*
2302	 * Last chance to dump unreferenced system files.
2303	 */
2304	(void) vflush(mp, NULLVP, FORCECLOSE);
2305
2306#if HFS_SPARSE_DEV
2307	/* Drop our reference on the backing fs (if any). */
2308	if ((hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) && hfsmp->hfs_backingfs_rootvp) {
2309		struct vnode * tmpvp;
2310
2311		hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
2312		tmpvp = hfsmp->hfs_backingfs_rootvp;
2313		hfsmp->hfs_backingfs_rootvp = NULLVP;
2314		vnode_rele(tmpvp);
2315	}
2316#endif /* HFS_SPARSE_DEV */
2317	lck_mtx_destroy(&hfsmp->hfc_mutex, hfs_mutex_group);
2318	lck_spin_destroy(&hfsmp->vcbFreeExtLock, hfs_spinlock_group);
2319	vnode_rele(hfsmp->hfs_devvp);
2320
2321	hfs_delete_chash(hfsmp);
2322	FREE(hfsmp, M_HFSMNT);
2323
2324	return (0);
2325
2326  err_exit:
2327	if (started_tr) {
2328		hfs_end_transaction(hfsmp);
2329	}
2330	return retval;
2331}
2332
2333
2334/*
2335 * Return the root of a filesystem.
2336 */
2337static int
2338hfs_vfs_root(struct mount *mp, struct vnode **vpp, __unused vfs_context_t context)
2339{
2340	return hfs_vget(VFSTOHFS(mp), (cnid_t)kHFSRootFolderID, vpp, 1, 0);
2341}
2342
2343
2344/*
2345 * Do operations associated with quotas
2346 */
2347#if !QUOTA
2348static int
2349hfs_quotactl(__unused struct mount *mp, __unused int cmds, __unused uid_t uid, __unused caddr_t datap, __unused vfs_context_t context)
2350{
2351	return (ENOTSUP);
2352}
2353#else
static int
hfs_quotactl(struct mount *mp, int cmds, uid_t uid, caddr_t datap, vfs_context_t context)
{
	struct proc *p = vfs_context_proc(context);
	int cmd, type, error;

	/* A uid of ~0U means "the calling user"; substitute the caller's uid. */
	if (uid == ~0U)
		uid = kauth_cred_getuid(vfs_context_ucred(context));
	cmd = cmds >> SUBCMDSHIFT;

	/*
	 * Permission check: Q_SYNC and Q_QUOTASTAT are unprivileged, and a
	 * user may always query their own quota (Q_GETQUOTA with own uid);
	 * everything else requires superuser.
	 */
	switch (cmd) {
	case Q_SYNC:
	case Q_QUOTASTAT:
		break;
	case Q_GETQUOTA:
		if (uid == kauth_cred_getuid(vfs_context_ucred(context)))
			break;
		/* fall through */
	default:
		if ( (error = vfs_context_suser(context)) )
			return (error);
	}

	/* Low bits of cmds select the quota type (user/group); validate it. */
	type = cmds & SUBCMDMASK;
	if ((u_int)type >= MAXQUOTAS)
		return (EINVAL);
	/* NOTE(review): if the mount is already busy we return success
	 * without doing any work. */
	if (vfs_busy(mp, LK_NOWAIT))
		return (0);

	/* Dispatch to the quota implementation for the requested command. */
	switch (cmd) {

	case Q_QUOTAON:
		error = hfs_quotaon(p, mp, type, datap);
		break;

	case Q_QUOTAOFF:
		error = hfs_quotaoff(p, mp, type);
		break;

	case Q_SETQUOTA:
		error = hfs_setquota(mp, uid, type, datap);
		break;

	case Q_SETUSE:
		error = hfs_setuse(mp, uid, type, datap);
		break;

	case Q_GETQUOTA:
		error = hfs_getquota(mp, uid, type, datap);
		break;

	case Q_SYNC:
		error = hfs_qsync(mp);
		break;

	case Q_QUOTASTAT:
		error = hfs_quotastat(mp, type, datap);
		break;

	default:
		error = EINVAL;
		break;
	}
	vfs_unbusy(mp);

	return (error);
}
2421#endif /* QUOTA */
2422
2423/* Subtype is composite of bits */
2424#define HFS_SUBTYPE_JOURNALED      0x01
2425#define HFS_SUBTYPE_CASESENSITIVE  0x02
2426/* bits 2 - 6 reserved */
2427#define HFS_SUBTYPE_STANDARDHFS    0x80
2428
2429/*
2430 * Get file system statistics.
2431 */
2432int
2433hfs_statfs(struct mount *mp, register struct vfsstatfs *sbp, __unused vfs_context_t context)
2434{
2435	ExtendedVCB *vcb = VFSTOVCB(mp);
2436	struct hfsmount *hfsmp = VFSTOHFS(mp);
2437	u_int32_t freeCNIDs;
2438	u_int16_t subtype = 0;
2439
2440	freeCNIDs = (u_int32_t)0xFFFFFFFF - (u_int32_t)vcb->vcbNxtCNID;
2441
2442	sbp->f_bsize = (u_int32_t)vcb->blockSize;
2443	sbp->f_iosize = (size_t)cluster_max_io_size(mp, 0);
2444	sbp->f_blocks = (u_int64_t)((u_int32_t)vcb->totalBlocks);
2445	sbp->f_bfree = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 0));
2446	sbp->f_bavail = (u_int64_t)((u_int32_t )hfs_freeblks(hfsmp, 1));
2447	sbp->f_files = (u_int64_t)((u_int32_t )(vcb->totalBlocks - 2));  /* max files is constrained by total blocks */
2448	sbp->f_ffree = (u_int64_t)((u_int32_t )(MIN(freeCNIDs, sbp->f_bavail)));
2449
2450	/*
2451	 * Subtypes (flavors) for HFS
2452	 *   0:   Mac OS Extended
2453	 *   1:   Mac OS Extended (Journaled)
2454	 *   2:   Mac OS Extended (Case Sensitive)
2455	 *   3:   Mac OS Extended (Case Sensitive, Journaled)
2456	 *   4 - 127:   Reserved
2457	 * 128:   Mac OS Standard
2458	 *
2459	 */
2460	if (hfsmp->hfs_flags & HFS_STANDARD) {
2461		subtype = HFS_SUBTYPE_STANDARDHFS;
2462	} else /* HFS Plus */ {
2463		if (hfsmp->jnl)
2464			subtype |= HFS_SUBTYPE_JOURNALED;
2465		if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE)
2466			subtype |= HFS_SUBTYPE_CASESENSITIVE;
2467	}
2468	sbp->f_fssubtype = subtype;
2469
2470	return (0);
2471}
2472
2473
2474//
2475// XXXdbg -- this is a callback to be used by the journal to
2476//           get meta data blocks flushed out to disk.
2477//
2478// XXXdbg -- be smarter and don't flush *every* block on each
2479//           call.  try to only flush some so we don't wind up
2480//           being too synchronous.
2481//
/*
 * Flush the primary volume header -- and the alternate header, if the
 * volume has one -- out of the buffer cache to disk when their buffers
 * are dirty.  Installed as the journal's metadata-flush callback.
 */
__private_extern__
void
hfs_sync_metadata(void *arg)
{
	struct mount *mp = (struct mount *)arg;
	struct hfsmount *hfsmp;
	ExtendedVCB *vcb;
	buf_t	bp;
	int  retval;
	daddr64_t priIDSector;	/* logical block number of the primary volume header */
	hfsmp = VFSTOHFS(mp);
	vcb = HFSTOVCB(hfsmp);

	// now make sure the super block is flushed
	priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
				  HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));

	retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
			HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
			hfsmp->hfs_physical_block_size, NOCRED, &bp);
	if ((retval != 0 ) && (retval != ENXIO)) {
		// NOTE(review): the 64-bit sector number is truncated to int
		// for this message.
		printf("hfs_sync_metadata: can't read volume header at %d! (retval 0x%x)\n",
		       (int)priIDSector, retval);
	}

	// Write the buffer only when it is dirty (B_DELWRI) and not locked;
	// otherwise just release it back to the buffer cache.
	if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) {
	    buf_bwrite(bp);
	} else if (bp) {
	    buf_brelse(bp);
	}

	// the alternate super block...
	// XXXdbg - we probably don't need to do this each and every time.
	//          hfs_btreeio.c:FlushAlternate() should flag when it was
	//          written...
	if (hfsmp->hfs_alt_id_sector) {
		retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
				HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
				hfsmp->hfs_physical_block_size, NOCRED, &bp);
		// Same dirty-and-unlocked test as for the primary header.
		if (retval == 0 && ((buf_flags(bp) & (B_DELWRI | B_LOCKED)) == B_DELWRI)) {
		    buf_bwrite(bp);
		} else if (bp) {
		    buf_brelse(bp);
		}
	}
}
2528
2529
/*
 * Context handed to hfs_sync_callback() through vnode_iterate() by
 * hfs_sync(): the sync parameters plus an accumulated error code.
 */
struct hfs_sync_cargs {
        kauth_cred_t cred;	/* caller's credentials (not read by hfs_sync_callback in this file) */
        struct proc  *p;	/* process performing the sync */
        int    waitfor;		/* wait mode forwarded to hfs_fsync() */
        int    error;		/* last error reported by any callback invocation */
};
2536
2537
2538static int
2539hfs_sync_callback(struct vnode *vp, void *cargs)
2540{
2541	struct cnode *cp;
2542	struct hfs_sync_cargs *args;
2543	int error;
2544
2545	args = (struct hfs_sync_cargs *)cargs;
2546
2547	if (hfs_lock(VTOC(vp), HFS_EXCLUSIVE_LOCK) != 0) {
2548		return (VNODE_RETURNED);
2549	}
2550	cp = VTOC(vp);
2551
2552	if ((cp->c_flag & C_MODIFIED) ||
2553	    (cp->c_touch_acctime | cp->c_touch_chgtime | cp->c_touch_modtime) ||
2554	    vnode_hasdirtyblks(vp)) {
2555	        error = hfs_fsync(vp, args->waitfor, 0, args->p);
2556
2557		if (error)
2558		        args->error = error;
2559	}
2560	hfs_unlock(cp);
2561	return (VNODE_RETURNED);
2562}
2563
2564
2565
2566/*
2567 * Go through the disk queues to initiate sandbagged IO;
2568 * go through the inodes to write those that have been modified;
2569 * initiate the writing of the super block if it has been modified.
2570 *
2571 * Note: we are always called with the filesystem marked `MPBUSY'.
2572 */
2573int
2574hfs_sync(struct mount *mp, int waitfor, vfs_context_t context)
2575{
2576	struct proc *p = vfs_context_proc(context);
2577	struct cnode *cp;
2578	struct hfsmount *hfsmp;
2579	ExtendedVCB *vcb;
2580	struct vnode *meta_vp[4];
2581	int i;
2582	int error, allerror = 0;
2583	struct hfs_sync_cargs args;
2584
2585	hfsmp = VFSTOHFS(mp);
2586
2587	/*
2588	 * hfs_changefs might be manipulating vnodes so back off
2589	 */
2590	if (hfsmp->hfs_flags & HFS_IN_CHANGEFS)
2591		return (0);
2592
2593	if (hfsmp->hfs_flags & HFS_READ_ONLY)
2594		return (EROFS);
2595
2596	/* skip over frozen volumes */
2597	if (!lck_rw_try_lock_shared(&hfsmp->hfs_insync))
2598		return 0;
2599
2600	args.cred = kauth_cred_get();
2601	args.waitfor = waitfor;
2602	args.p = p;
2603	args.error = 0;
2604	/*
2605	 * hfs_sync_callback will be called for each vnode
2606	 * hung off of this mount point... the vnode will be
2607	 * properly referenced and unreferenced around the callback
2608	 */
2609	vnode_iterate(mp, 0, hfs_sync_callback, (void *)&args);
2610
2611	if (args.error)
2612	        allerror = args.error;
2613
2614	vcb = HFSTOVCB(hfsmp);
2615
2616	meta_vp[0] = vcb->extentsRefNum;
2617	meta_vp[1] = vcb->catalogRefNum;
2618	meta_vp[2] = vcb->allocationsRefNum;  /* This is NULL for standard HFS */
2619	meta_vp[3] = hfsmp->hfs_attribute_vp; /* Optional file */
2620
2621	/* Now sync our three metadata files */
2622	for (i = 0; i < 4; ++i) {
2623		struct vnode *btvp;
2624
2625		btvp = meta_vp[i];;
2626		if ((btvp==0) || (vnode_mount(btvp) != mp))
2627			continue;
2628
2629		/* XXX use hfs_systemfile_lock instead ? */
2630		(void) hfs_lock(VTOC(btvp), HFS_EXCLUSIVE_LOCK);
2631		cp = VTOC(btvp);
2632
2633		if (((cp->c_flag &  C_MODIFIED) == 0) &&
2634		    (cp->c_touch_acctime == 0) &&
2635		    (cp->c_touch_chgtime == 0) &&
2636		    (cp->c_touch_modtime == 0) &&
2637		    vnode_hasdirtyblks(btvp) == 0) {
2638			hfs_unlock(VTOC(btvp));
2639			continue;
2640		}
2641		error = vnode_get(btvp);
2642		if (error) {
2643			hfs_unlock(VTOC(btvp));
2644			continue;
2645		}
2646		if ((error = hfs_fsync(btvp, waitfor, 0, p)))
2647			allerror = error;
2648
2649		hfs_unlock(cp);
2650		vnode_put(btvp);
2651	};
2652
2653	/*
2654	 * Force stale file system control information to be flushed.
2655	 */
2656	if (vcb->vcbSigWord == kHFSSigWord) {
2657		if ((error = VNOP_FSYNC(hfsmp->hfs_devvp, waitfor, context))) {
2658			allerror = error;
2659		}
2660	}
2661#if QUOTA
2662	hfs_qsync(mp);
2663#endif /* QUOTA */
2664
2665	hfs_hotfilesync(hfsmp, vfs_context_kernel());
2666
2667	/*
2668	 * Write back modified superblock.
2669	 */
2670	if (IsVCBDirty(vcb)) {
2671		error = hfs_flushvolumeheader(hfsmp, waitfor, 0);
2672		if (error)
2673			allerror = error;
2674	}
2675
2676	if (hfsmp->jnl) {
2677	    hfs_journal_flush(hfsmp, FALSE);
2678	}
2679
2680	{
2681		clock_sec_t secs;
2682		clock_usec_t usecs;
2683		uint64_t now;
2684
2685		clock_get_calendar_microtime(&secs, &usecs);
2686		now = ((uint64_t)secs * 1000000ULL) + (uint64_t)usecs;
2687		hfsmp->hfs_last_sync_time = now;
2688	}
2689
2690	lck_rw_unlock_shared(&hfsmp->hfs_insync);
2691	return (allerror);
2692}
2693
2694
2695/*
2696 * File handle to vnode
2697 *
2698 * Have to be really careful about stale file handles:
2699 * - check that the cnode id is valid
2700 * - call hfs_vget() to get the locked cnode
2701 * - check for an unallocated cnode (i_mode == 0)
2702 * - check that the given client host has export rights and return
2703 *   those rights via. exflagsp and credanonp
2704 */
2705static int
2706hfs_fhtovp(struct mount *mp, int fhlen, unsigned char *fhp, struct vnode **vpp, __unused vfs_context_t context)
2707{
2708	struct hfsfid *hfsfhp;
2709	struct vnode *nvp;
2710	int result;
2711
2712	*vpp = NULL;
2713	hfsfhp = (struct hfsfid *)fhp;
2714
2715	if (fhlen < (int)sizeof(struct hfsfid))
2716		return (EINVAL);
2717
2718	result = hfs_vget(VFSTOHFS(mp), ntohl(hfsfhp->hfsfid_cnid), &nvp, 0, 0);
2719	if (result) {
2720		if (result == ENOENT)
2721			result = ESTALE;
2722		return result;
2723	}
2724
2725	/*
2726	 * We used to use the create time as the gen id of the file handle,
2727	 * but it is not static enough because it can change at any point
2728	 * via system calls.  We still don't have another volume ID or other
2729	 * unique identifier to use for a generation ID across reboots that
2730	 * persists until the file is removed.  Using only the CNID exposes
2731	 * us to the potential wrap-around case, but as of 2/2008, it would take
2732	 * over 2 months to wrap around if the machine did nothing but allocate
2733	 * CNIDs.  Using some kind of wrap counter would only be effective if
2734	 * each file had the wrap counter associated with it.  For now,
2735	 * we use only the CNID to identify the file as it's good enough.
2736	 */
2737
2738	*vpp = nvp;
2739
2740	hfs_unlock(VTOC(nvp));
2741	return (0);
2742}
2743
2744
2745/*
2746 * Vnode pointer to File handle
2747 */
2748/* ARGSUSED */
2749static int
2750hfs_vptofh(struct vnode *vp, int *fhlenp, unsigned char *fhp, __unused vfs_context_t context)
2751{
2752	struct cnode *cp;
2753	struct hfsfid *hfsfhp;
2754
2755	if (ISHFS(VTOVCB(vp)))
2756		return (ENOTSUP);	/* hfs standard is not exportable */
2757
2758	if (*fhlenp < (int)sizeof(struct hfsfid))
2759		return (EOVERFLOW);
2760
2761	cp = VTOC(vp);
2762	hfsfhp = (struct hfsfid *)fhp;
2763	/* only the CNID is used to identify the file now */
2764	hfsfhp->hfsfid_cnid = htonl(cp->c_fileid);
2765	hfsfhp->hfsfid_gen = htonl(cp->c_fileid);
2766	*fhlenp = sizeof(struct hfsfid);
2767
2768	return (0);
2769}
2770
2771
2772/*
2773 * Initial HFS filesystems, done only once.
2774 */
2775static int
2776hfs_init(__unused struct vfsconf *vfsp)
2777{
2778	static int done = 0;
2779
2780	if (done)
2781		return (0);
2782	done = 1;
2783	hfs_chashinit();
2784	hfs_converterinit();
2785
2786	BTReserveSetup();
2787
2788
2789	hfs_lock_attr    = lck_attr_alloc_init();
2790	hfs_group_attr   = lck_grp_attr_alloc_init();
2791	hfs_mutex_group  = lck_grp_alloc_init("hfs-mutex", hfs_group_attr);
2792	hfs_rwlock_group = lck_grp_alloc_init("hfs-rwlock", hfs_group_attr);
2793	hfs_spinlock_group = lck_grp_alloc_init("hfs-spinlock", hfs_group_attr);
2794
2795#if HFS_COMPRESSION
2796    decmpfs_init();
2797#endif
2798
2799	return (0);
2800}
2801
2802static int
2803hfs_getmountpoint(struct vnode *vp, struct hfsmount **hfsmpp)
2804{
2805	struct hfsmount * hfsmp;
2806	char fstypename[MFSNAMELEN];
2807
2808	if (vp == NULL)
2809		return (EINVAL);
2810
2811	if (!vnode_isvroot(vp))
2812		return (EINVAL);
2813
2814	vnode_vfsname(vp, fstypename);
2815	if (strncmp(fstypename, "hfs", sizeof(fstypename)) != 0)
2816		return (EINVAL);
2817
2818	hfsmp = VTOHFS(vp);
2819
2820	if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord)
2821		return (EINVAL);
2822
2823	*hfsmpp = hfsmp;
2824
2825	return (0);
2826}
2827
2828// XXXdbg
2829#include <sys/filedesc.h>
2830
2831/*
2832 * HFS filesystem related variables.
2833 */
2834int
2835hfs_sysctl(int *name, __unused u_int namelen, user_addr_t oldp, size_t *oldlenp,
2836			user_addr_t newp, size_t newlen, vfs_context_t context)
2837{
2838	struct proc *p = vfs_context_proc(context);
2839	int error;
2840	struct hfsmount *hfsmp;
2841
2842	/* all sysctl names at this level are terminal */
2843
2844	if (name[0] == HFS_ENCODINGBIAS) {
2845		int bias;
2846
2847		bias = hfs_getencodingbias();
2848		error = sysctl_int(oldp, oldlenp, newp, newlen, &bias);
2849		if (error == 0 && newp)
2850			hfs_setencodingbias(bias);
2851		return (error);
2852
2853	} else if (name[0] == HFS_EXTEND_FS) {
2854        u_int64_t  newsize;
2855		vnode_t vp = vfs_context_cwd(context);
2856
2857		if (newp == USER_ADDR_NULL || vp == NULLVP)
2858			return (EINVAL);
2859		if ((error = hfs_getmountpoint(vp, &hfsmp)))
2860			return (error);
2861		error = sysctl_quad(oldp, oldlenp, newp, newlen, (quad_t *)&newsize);
2862		if (error)
2863			return (error);
2864
2865		error = hfs_extendfs(hfsmp, newsize, context);
2866		return (error);
2867
2868	} else if (name[0] == HFS_ENCODINGHINT) {
2869		size_t bufsize;
2870		size_t bytes;
2871		u_int32_t hint;
2872		u_int16_t *unicode_name = NULL;
2873		char *filename = NULL;
2874
2875		if ((newlen <= 0) || (newlen > MAXPATHLEN))
2876			return (EINVAL);
2877
2878		bufsize = MAX(newlen * 3, MAXPATHLEN);
2879		MALLOC(filename, char *, newlen, M_TEMP, M_WAITOK);
2880		if (filename == NULL) {
2881			error = ENOMEM;
2882			goto encodinghint_exit;
2883		}
2884		MALLOC(unicode_name, u_int16_t *, bufsize, M_TEMP, M_WAITOK);
2885		if (filename == NULL) {
2886			error = ENOMEM;
2887			goto encodinghint_exit;
2888		}
2889
2890		error = copyin(newp, (caddr_t)filename, newlen);
2891		if (error == 0) {
2892			error = utf8_decodestr((u_int8_t *)filename, newlen - 1, unicode_name,
2893			                       &bytes, bufsize, 0, UTF_DECOMPOSED);
2894			if (error == 0) {
2895				hint = hfs_pickencoding(unicode_name, bytes / 2);
2896				error = sysctl_int(oldp, oldlenp, USER_ADDR_NULL, 0, (int32_t *)&hint);
2897			}
2898		}
2899
2900encodinghint_exit:
2901		if (unicode_name)
2902			FREE(unicode_name, M_TEMP);
2903		if (filename)
2904			FREE(filename, M_TEMP);
2905		return (error);
2906
2907	} else if (name[0] == HFS_ENABLE_JOURNALING) {
2908		// make the file system journaled...
2909		vnode_t vp = vfs_context_cwd(context);
2910		vnode_t jvp;
2911		ExtendedVCB *vcb;
2912		struct cat_attr jnl_attr, jinfo_attr;
2913		struct cat_fork jnl_fork, jinfo_fork;
2914		void *jnl = NULL;
2915		int lockflags;
2916
2917		/* Only root can enable journaling */
2918		if (!is_suser()) {
2919			return (EPERM);
2920		}
2921		if (vp == NULLVP)
2922		        return EINVAL;
2923
2924		hfsmp = VTOHFS(vp);
2925		if (hfsmp->hfs_flags & HFS_READ_ONLY) {
2926			return EROFS;
2927		}
2928		if (HFSTOVCB(hfsmp)->vcbSigWord == kHFSSigWord) {
2929			printf("hfs: can't make a plain hfs volume journaled.\n");
2930			return EINVAL;
2931		}
2932
2933		if (hfsmp->jnl) {
2934		    printf("hfs: volume @ mp %p is already journaled!\n", vnode_mount(vp));
2935		    return EAGAIN;
2936		}
2937
2938		vcb = HFSTOVCB(hfsmp);
2939		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_EXTENTS, HFS_EXCLUSIVE_LOCK);
2940		if (BTHasContiguousNodes(VTOF(vcb->catalogRefNum)) == 0 ||
2941			BTHasContiguousNodes(VTOF(vcb->extentsRefNum)) == 0) {
2942
2943			printf("hfs: volume has a btree w/non-contiguous nodes.  can not enable journaling.\n");
2944			hfs_systemfile_unlock(hfsmp, lockflags);
2945			return EINVAL;
2946		}
2947		hfs_systemfile_unlock(hfsmp, lockflags);
2948
2949		// make sure these both exist!
2950		if (   GetFileInfo(vcb, kHFSRootFolderID, ".journal_info_block", &jinfo_attr, &jinfo_fork) == 0
2951			|| GetFileInfo(vcb, kHFSRootFolderID, ".journal", &jnl_attr, &jnl_fork) == 0) {
2952
2953			return EINVAL;
2954		}
2955
2956		hfs_sync(hfsmp->hfs_mp, MNT_WAIT, context);
2957
2958		printf("hfs: Initializing the journal (joffset 0x%llx sz 0x%llx)...\n",
2959			   (off_t)name[2], (off_t)name[3]);
2960
2961		//
2962		// XXXdbg - note that currently (Sept, 08) hfs_util does not support
2963		//          enabling the journal on a separate device so it is safe
2964		//          to just copy hfs_devvp here.  If hfs_util gets the ability
2965		//          to dynamically enable the journal on a separate device then
2966		//          we will have to do the same thing as hfs_early_journal_init()
2967		//          to locate and open the journal device.
2968		//
2969		jvp = hfsmp->hfs_devvp;
2970		jnl = journal_create(jvp,
2971							 (off_t)name[2] * (off_t)HFSTOVCB(hfsmp)->blockSize
2972							 + HFSTOVCB(hfsmp)->hfsPlusIOPosOffset,
2973							 (off_t)((unsigned)name[3]),
2974							 hfsmp->hfs_devvp,
2975							 hfsmp->hfs_logical_block_size,
2976							 0,
2977							 0,
2978							 hfs_sync_metadata, hfsmp->hfs_mp);
2979
2980		/*
2981		 * Set up the trim callback function so that we can add
2982		 * recently freed extents to the free extent cache once
2983		 * the transaction that freed them is written to the
2984		 * journal on disk.
2985		 */
2986		if (jnl)
2987			journal_trim_set_callback(jnl, hfs_trim_callback, hfsmp);
2988
2989		if (jnl == NULL) {
2990			printf("hfs: FAILED to create the journal!\n");
2991			if (jvp && jvp != hfsmp->hfs_devvp) {
2992				vnode_clearmountedon(jvp);
2993				VNOP_CLOSE(jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, vfs_context_kernel());
2994			}
2995			jvp = NULL;
2996
2997			return EINVAL;
2998		}
2999
3000		hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
3001
3002		/*
3003		 * Flush all dirty metadata buffers.
3004		 */
3005		buf_flushdirtyblks(hfsmp->hfs_devvp, TRUE, 0, "hfs_sysctl");
3006		buf_flushdirtyblks(hfsmp->hfs_extents_vp, TRUE, 0, "hfs_sysctl");
3007		buf_flushdirtyblks(hfsmp->hfs_catalog_vp, TRUE, 0, "hfs_sysctl");
3008		buf_flushdirtyblks(hfsmp->hfs_allocation_vp, TRUE, 0, "hfs_sysctl");
3009		if (hfsmp->hfs_attribute_vp)
3010			buf_flushdirtyblks(hfsmp->hfs_attribute_vp, TRUE, 0, "hfs_sysctl");
3011
3012		HFSTOVCB(hfsmp)->vcbJinfoBlock = name[1];
3013		HFSTOVCB(hfsmp)->vcbAtrb |= kHFSVolumeJournaledMask;
3014		hfsmp->jvp = jvp;
3015		hfsmp->jnl = jnl;
3016
3017		// save this off for the hack-y check in hfs_remove()
3018		hfsmp->jnl_start        = (u_int32_t)name[2];
3019		hfsmp->jnl_size         = (off_t)((unsigned)name[3]);
3020		hfsmp->hfs_jnlinfoblkid = jinfo_attr.ca_fileid;
3021		hfsmp->hfs_jnlfileid    = jnl_attr.ca_fileid;
3022
3023		vfs_setflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
3024
3025		hfs_unlock_global (hfsmp);
3026		hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
3027
3028		{
3029			fsid_t fsid;
3030
3031			fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev;
3032			fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp));
3033			vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL);
3034		}
3035		return 0;
3036	} else if (name[0] == HFS_DISABLE_JOURNALING) {
3037		// clear the journaling bit
3038		vnode_t vp = vfs_context_cwd(context);
3039
3040		/* Only root can disable journaling */
3041		if (!is_suser()) {
3042			return (EPERM);
3043		}
3044		if (vp == NULLVP)
3045		        return EINVAL;
3046
3047		hfsmp = VTOHFS(vp);
3048
3049		/*
3050		 * Disabling journaling is disallowed on volumes with directory hard links
3051		 * because we have not tested the relevant code path.
3052		 */
3053		if (hfsmp->hfs_private_attr[DIR_HARDLINKS].ca_entries != 0){
3054			printf("hfs: cannot disable journaling on volumes with directory hardlinks\n");
3055			return EPERM;
3056		}
3057
3058		printf("hfs: disabling journaling for mount @ %p\n", vnode_mount(vp));
3059
3060		hfs_lock_global (hfsmp, HFS_EXCLUSIVE_LOCK);
3061
3062		// Lights out for you buddy!
3063		journal_close(hfsmp->jnl);
3064		hfsmp->jnl = NULL;
3065
3066		if (hfsmp->jvp && hfsmp->jvp != hfsmp->hfs_devvp) {
3067			vnode_clearmountedon(hfsmp->jvp);
3068			VNOP_CLOSE(hfsmp->jvp, hfsmp->hfs_flags & HFS_READ_ONLY ? FREAD : FREAD|FWRITE, vfs_context_kernel());
3069			vnode_put(hfsmp->jvp);
3070		}
3071		hfsmp->jvp = NULL;
3072		vfs_clearflags(hfsmp->hfs_mp, (u_int64_t)((unsigned int)MNT_JOURNALED));
3073		hfsmp->jnl_start        = 0;
3074		hfsmp->hfs_jnlinfoblkid = 0;
3075		hfsmp->hfs_jnlfileid    = 0;
3076
3077		HFSTOVCB(hfsmp)->vcbAtrb &= ~kHFSVolumeJournaledMask;
3078
3079		hfs_unlock_global (hfsmp);
3080
3081		hfs_flushvolumeheader(hfsmp, MNT_WAIT, 1);
3082
3083		{
3084			fsid_t fsid;
3085
3086			fsid.val[0] = (int32_t)hfsmp->hfs_raw_dev;
3087			fsid.val[1] = (int32_t)vfs_typenum(HFSTOVFS(hfsmp));
3088			vfs_event_signal(&fsid, VQ_UPDATE, (intptr_t)NULL);
3089		}
3090		return 0;
3091	} else if (name[0] == HFS_GET_JOURNAL_INFO) {
3092		vnode_t vp = vfs_context_cwd(context);
3093		off_t jnl_start, jnl_size;
3094
3095		if (vp == NULLVP)
3096		        return EINVAL;
3097
3098		/* 64-bit processes won't work with this sysctl -- can't fit a pointer into an int! */
3099		if (proc_is64bit(current_proc()))
3100			return EINVAL;
3101
3102		hfsmp = VTOHFS(vp);
3103	    if (hfsmp->jnl == NULL) {
3104			jnl_start = 0;
3105			jnl_size  = 0;
3106	    } else {
3107			jnl_start = (off_t)(hfsmp->jnl_start * HFSTOVCB(hfsmp)->blockSize) + (off_t)HFSTOVCB(hfsmp)->hfsPlusIOPosOffset;
3108			jnl_size  = (off_t)hfsmp->jnl_size;
3109	    }
3110
3111	    if ((error = copyout((caddr_t)&jnl_start, CAST_USER_ADDR_T(name[1]), sizeof(off_t))) != 0) {
3112			return error;
3113		}
3114	    if ((error = copyout((caddr_t)&jnl_size, CAST_USER_ADDR_T(name[2]), sizeof(off_t))) != 0) {
3115			return error;
3116		}
3117
3118		return 0;
3119	} else if (name[0] == HFS_SET_PKG_EXTENSIONS) {
3120
3121	    return set_package_extensions_table((user_addr_t)((unsigned)name[1]), name[2], name[3]);
3122
3123	} else if (name[0] == VFS_CTL_QUERY) {
3124    	struct sysctl_req *req;
3125    	union union_vfsidctl vc;
3126    	struct mount *mp;
3127 	    struct vfsquery vq;
3128
3129		req = CAST_DOWN(struct sysctl_req *, oldp);	/* we're new style vfs sysctl. */
3130
3131        error = SYSCTL_IN(req, &vc, proc_is64bit(p)? sizeof(vc.vc64):sizeof(vc.vc32));
3132		if (error) return (error);
3133
3134		mp = vfs_getvfs(&vc.vc32.vc_fsid); /* works for 32 and 64 */
3135        if (mp == NULL) return (ENOENT);
3136
3137		hfsmp = VFSTOHFS(mp);
3138		bzero(&vq, sizeof(vq));
3139		vq.vq_flags = hfsmp->hfs_notification_conditions;
3140		return SYSCTL_OUT(req, &vq, sizeof(vq));;
3141	} else if (name[0] == HFS_REPLAY_JOURNAL) {
3142		vnode_t devvp = NULL;
3143		int device_fd;
3144		if (namelen != 2) {
3145			return (EINVAL);
3146		}
3147		device_fd = name[1];
3148		error = file_vnode(device_fd, &devvp);
3149		if (error) {
3150			return error;
3151		}
3152		error = vnode_getwithref(devvp);
3153		if (error) {
3154			file_drop(device_fd);
3155			return error;
3156		}
3157		error = hfs_journal_replay(devvp, context);
3158		file_drop(device_fd);
3159		vnode_put(devvp);
3160		return error;
3161	} else if (name[0] == HFS_ENABLE_RESIZE_DEBUG) {
3162		hfs_resize_debug = 1;
3163		printf ("hfs_sysctl: Enabled volume resize debugging.\n");
3164		return 0;
3165	}
3166
3167	return (ENOTSUP);
3168}
3169
3170/*
3171 * hfs_vfs_vget is not static since it is used in hfs_readwrite.c to support
3172 * the build_path ioctl.  We use it to leverage the code below that updates
3173 * the origin list cache if necessary
3174 */
3175
int
hfs_vfs_vget(struct mount *mp, ino64_t ino, struct vnode **vpp, __unused vfs_context_t context)
{
	int error;
	int lockflags;
	struct hfsmount *hfsmp;

	hfsmp = VFSTOHFS(mp);

	/* Get the vnode with the cnode unlocked (skiplock=1). */
	error = hfs_vget(hfsmp, (cnid_t)ino, vpp, 1, 0);
	if (error)
		return (error);

	/*
	 * ADLs may need to have their origin state updated
	 * since build_path needs a valid parent.  The same is true
	 * for hardlinked files as well.  There isn't a race window here
	 * in re-acquiring the cnode lock since we aren't pulling any data
	 * out of the cnode; instead, we're going to the catalog.
	 */
	/* Note: the short-circuit below only takes the cnode lock for hardlinks. */
	if ((VTOC(*vpp)->c_flag & C_HARDLINK) &&
	    (hfs_lock(VTOC(*vpp), HFS_EXCLUSIVE_LOCK) == 0)) {
		cnode_t *cp = VTOC(*vpp);
		struct cat_desc cdesc;

		if (!hfs_haslinkorigin(cp)) {
			/* Look up this link's current name/parent in the catalog. */
			lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
			error = cat_findname(hfsmp, (cnid_t)ino, &cdesc);
			hfs_systemfile_unlock(hfsmp, lockflags);
			if (error == 0) {
				/* Only cache the origin if it isn't one of the private metadata dirs. */
				if ((cdesc.cd_parentcnid != hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) &&
					(cdesc.cd_parentcnid != hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid)) {
					hfs_savelinkorigin(cp, cdesc.cd_parentcnid);
				}
				cat_releasedesc(&cdesc);
			}
		}
		hfs_unlock(cp);
	}
	/* Any catalog lookup failure above is deliberately non-fatal. */
	return (0);
}
3217
3218
3219/*
3220 * Look up an HFS object by ID.
3221 *
3222 * The object is returned with an iocount reference and the cnode locked.
3223 *
3224 * If the object is a file then it will represent the data fork.
3225 */
int
hfs_vget(struct hfsmount *hfsmp, cnid_t cnid, struct vnode **vpp, int skiplock, int allow_deleted)
{
	struct vnode *vp = NULLVP;
	struct cat_desc cndesc;
	struct cat_attr cnattr;
	struct cat_fork cnfork;
	u_int32_t linkref = 0;  /* non-zero => cnid refers to a raw hardlink inode */
	int error;

	/* Check for cnids that should't be exported. */
	if ((cnid < kHFSFirstUserCatalogNodeID) &&
	    (cnid != kHFSRootFolderID && cnid != kHFSRootParentID)) {
		return (ENOENT);
	}
	/* Don't export our private directories. */
	if (cnid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid ||
	    cnid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) {
		return (ENOENT);
	}
	/*
	 * Check the hash first
	 */
	vp = hfs_chash_getvnode(hfsmp, cnid, 0, skiplock, allow_deleted);
	if (vp) {
		*vpp = vp;
		return(0);
	}

	bzero(&cndesc, sizeof(cndesc));
	bzero(&cnattr, sizeof(cnattr));
	bzero(&cnfork, sizeof(cnfork));

	/*
	 * Not in hash, lookup in catalog
	 */
	if (cnid == kHFSRootParentID) {
		/* The root's parent is synthetic: build its descriptor by hand. */
		static char hfs_rootname[] = "/";

		cndesc.cd_nameptr = (const u_int8_t *)&hfs_rootname[0];
		cndesc.cd_namelen = 1;
		cndesc.cd_parentcnid = kHFSRootParentID;
		cndesc.cd_cnid = kHFSRootFolderID;
		cndesc.cd_flags = CD_ISDIR;

		cnattr.ca_fileid = kHFSRootFolderID;
		cnattr.ca_linkcount = 1;
		cnattr.ca_entries = 1;
		cnattr.ca_dircount = 1;
		cnattr.ca_mode = (S_IFDIR | S_IRWXU | S_IRWXG | S_IRWXO);
	} else {
		int lockflags;
		cnid_t pid;
		const char *nameptr;

		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
		error = cat_idlookup(hfsmp, cnid, 0, 0, &cndesc, &cnattr, &cnfork);
		hfs_systemfile_unlock(hfsmp, lockflags);

		if (error) {
			*vpp = NULL;
			return (error);
		}

		/*
		 * Check for a raw hardlink inode and save its linkref.
		 */
		pid = cndesc.cd_parentcnid;
		nameptr = (const char *)cndesc.cd_nameptr;

		/* Raw inodes live in the private dirs with well-known name prefixes. */
		if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) &&
		    (bcmp(nameptr, HFS_INODE_PREFIX, HFS_INODE_PREFIX_LEN) == 0)) {
			linkref = strtoul(&nameptr[HFS_INODE_PREFIX_LEN], NULL, 10);

		} else if ((pid == hfsmp->hfs_private_desc[DIR_HARDLINKS].cd_cnid) &&
		           (bcmp(nameptr, HFS_DIRINODE_PREFIX, HFS_DIRINODE_PREFIX_LEN) == 0)) {
			linkref = strtoul(&nameptr[HFS_DIRINODE_PREFIX_LEN], NULL, 10);

		} else if ((pid == hfsmp->hfs_private_desc[FILE_HARDLINKS].cd_cnid) &&
		           (bcmp(nameptr, HFS_DELETE_PREFIX, HFS_DELETE_PREFIX_LEN) == 0)) {
			*vpp = NULL;
			cat_releasedesc(&cndesc);
			return (ENOENT);  /* open unlinked file */
		}
	}

	/*
	 * Finish initializing cnode descriptor for hardlinks.
	 *
	 * We need a valid name and parent for reverse lookups.
	 */
	if (linkref) {
		cnid_t nextlinkid;
		cnid_t prevlinkid;
		struct cat_desc linkdesc;
		int lockflags;

		cnattr.ca_linkref = linkref;

		/*
		 * Pick up the first link in the chain and get a descriptor for it.
		 * This allows blind volfs paths to work for hardlinks.
		 */
		if ((hfs_lookup_siblinglinks(hfsmp, linkref, &prevlinkid,  &nextlinkid) == 0) &&
		    (nextlinkid != 0)) {
			lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
			error = cat_findname(hfsmp, nextlinkid, &linkdesc);
			hfs_systemfile_unlock(hfsmp, lockflags);
			if (error == 0) {
				/* Swap the raw-inode descriptor for the link's real one. */
				cat_releasedesc(&cndesc);
				bcopy(&linkdesc, &cndesc, sizeof(linkdesc));
			}
		}
	}

	if (linkref) {
		int newvnode_flags = 0;

		/* No component name for raw hardlink inodes. */
		error = hfs_getnewvnode(hfsmp, NULL, NULL, &cndesc, 0, &cnattr,
								&cnfork, &vp, &newvnode_flags);
		if (error == 0) {
			VTOC(vp)->c_flag |= C_HARDLINK;
			vnode_setmultipath(vp);
		}
	} else {
		struct componentname cn;
		int newvnode_flags = 0;

		/* Supply hfs_getnewvnode with a component name. */
		MALLOC_ZONE(cn.cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
		cn.cn_nameiop = LOOKUP;
		cn.cn_flags = ISLASTCN | HASBUF;
		cn.cn_context = NULL;
		cn.cn_pnlen = MAXPATHLEN;
		cn.cn_nameptr = cn.cn_pnbuf;
		cn.cn_namelen = cndesc.cd_namelen;
		cn.cn_hash = 0;
		cn.cn_consume = 0;
		/* +1 copies the trailing NUL along with the name. */
		bcopy(cndesc.cd_nameptr, cn.cn_nameptr, cndesc.cd_namelen + 1);

		error = hfs_getnewvnode(hfsmp, NULLVP, &cn, &cndesc, 0, &cnattr,
								&cnfork, &vp, &newvnode_flags);

		if (error == 0 && (VTOC(vp)->c_flag & C_HARDLINK)) {
			hfs_savelinkorigin(VTOC(vp), cndesc.cd_parentcnid);
		}
		FREE_ZONE(cn.cn_pnbuf, cn.cn_pnlen, M_NAMEI);
	}
	cat_releasedesc(&cndesc);

	*vpp = vp;
	/* Caller asked for an unlocked cnode; drop the lock taken by getnewvnode. */
	if (vp && skiplock) {
		hfs_unlock(VTOC(vp));
	}
	return (error);
}
3382
3383
3384/*
3385 * Flush out all the files in a filesystem.
3386 */
static int
#if QUOTA
hfs_flushfiles(struct mount *mp, int flags, struct proc *p)
#else
hfs_flushfiles(struct mount *mp, int flags, __unused struct proc *p)
#endif /* QUOTA */
{
	struct hfsmount *hfsmp;
	struct vnode *skipvp = NULLVP;
	int error;
#if QUOTA
	int quotafilecnt;
	int i;
#endif

	hfsmp = VFSTOHFS(mp);

#if QUOTA
	/*
	 * The open quota files have an indirect reference on
	 * the root directory vnode.  We must account for this
	 * extra reference when doing the intial vflush.
	 */
	quotafilecnt = 0;
	if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) {

		/* Find out how many quota files we have open. */
		for (i = 0; i < MAXQUOTAS; i++) {
			if (hfsmp->hfs_qfiles[i].qf_vp != NULLVP)
				++quotafilecnt;
		}

		/* Obtain the root vnode so we can skip over it. */
		skipvp = hfs_chash_getvnode(hfsmp, kHFSRootFolderID, 0, 0, 0);
	}
#endif /* QUOTA */

	/* First pass: skip swap files so their backing store flushes last. */
	error = vflush(mp, skipvp, SKIPSYSTEM | SKIPSWAP | flags);
	if (error != 0)
		return(error);

	/* Second pass: everything except system files (and skipvp). */
	error = vflush(mp, skipvp, SKIPSYSTEM | flags);

#if QUOTA
	if (((unsigned int)vfs_flags(mp)) & MNT_QUOTA) {
		if (skipvp) {
			/*
			 * See if there are additional references on the
			 * root vp besides the ones obtained from the open
			 * quota files and the hfs_chash_getvnode call above.
			 */
			if ((error == 0) &&
			    (vnode_isinuse(skipvp,  quotafilecnt))) {
				error = EBUSY;  /* root directory is still open */
			}
			hfs_unlock(VTOC(skipvp));
			vnode_put(skipvp);
		}
		if (error && (flags & FORCECLOSE) == 0)
			return (error);

		/* Close the quota files, then flush the root vnode too. */
		for (i = 0; i < MAXQUOTAS; i++) {
			if (hfsmp->hfs_qfiles[i].qf_vp == NULLVP)
				continue;
			hfs_quotaoff(p, mp, i);
		}
		error = vflush(mp, NULLVP, SKIPSYSTEM | flags);
	}
#endif /* QUOTA */

	return (error);
}
3459
3460/*
3461 * Update volume encoding bitmap (HFS Plus only)
3462 */
3463__private_extern__
3464void
3465hfs_setencodingbits(struct hfsmount *hfsmp, u_int32_t encoding)
3466{
3467#define  kIndexMacUkrainian	48  /* MacUkrainian encoding is 152 */
3468#define  kIndexMacFarsi		49  /* MacFarsi encoding is 140 */
3469
3470	u_int32_t	index;
3471
3472	switch (encoding) {
3473	case kTextEncodingMacUkrainian:
3474		index = kIndexMacUkrainian;
3475		break;
3476	case kTextEncodingMacFarsi:
3477		index = kIndexMacFarsi;
3478		break;
3479	default:
3480		index = encoding;
3481		break;
3482	}
3483
3484	if (index < 64 && (hfsmp->encodingsBitmap & (u_int64_t)(1ULL << index)) == 0) {
3485		HFS_MOUNT_LOCK(hfsmp, TRUE)
3486		hfsmp->encodingsBitmap |= (u_int64_t)(1ULL << index);
3487		MarkVCBDirty(hfsmp);
3488		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
3489	}
3490}
3491
3492/*
3493 * Update volume stats
3494 *
3495 * On journal volumes this will cause a volume header flush
3496 */
3497int
3498hfs_volupdate(struct hfsmount *hfsmp, enum volop op, int inroot)
3499{
3500	struct timeval tv;
3501
3502	microtime(&tv);
3503
3504	lck_mtx_lock(&hfsmp->hfs_mutex);
3505
3506	MarkVCBDirty(hfsmp);
3507	hfsmp->hfs_mtime = tv.tv_sec;
3508
3509	switch (op) {
3510	case VOL_UPDATE:
3511		break;
3512	case VOL_MKDIR:
3513		if (hfsmp->hfs_dircount != 0xFFFFFFFF)
3514			++hfsmp->hfs_dircount;
3515		if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF)
3516			++hfsmp->vcbNmRtDirs;
3517		break;
3518	case VOL_RMDIR:
3519		if (hfsmp->hfs_dircount != 0)
3520			--hfsmp->hfs_dircount;
3521		if (inroot && hfsmp->vcbNmRtDirs != 0xFFFF)
3522			--hfsmp->vcbNmRtDirs;
3523		break;
3524	case VOL_MKFILE:
3525		if (hfsmp->hfs_filecount != 0xFFFFFFFF)
3526			++hfsmp->hfs_filecount;
3527		if (inroot && hfsmp->vcbNmFls != 0xFFFF)
3528			++hfsmp->vcbNmFls;
3529		break;
3530	case VOL_RMFILE:
3531		if (hfsmp->hfs_filecount != 0)
3532			--hfsmp->hfs_filecount;
3533		if (inroot && hfsmp->vcbNmFls != 0xFFFF)
3534			--hfsmp->vcbNmFls;
3535		break;
3536	}
3537
3538	lck_mtx_unlock(&hfsmp->hfs_mutex);
3539
3540	if (hfsmp->jnl) {
3541		hfs_flushvolumeheader(hfsmp, 0, 0);
3542	}
3543
3544	return (0);
3545}
3546
3547
static int
hfs_flushMDB(struct hfsmount *hfsmp, int waitfor, int altflush)
{
	ExtendedVCB *vcb = HFSTOVCB(hfsmp);
	struct filefork *fp;
	HFSMasterDirectoryBlock	*mdb;
	struct buf *bp = NULL;
	int retval;
	int sector_size;
	ByteCount namelen;

	/* Read the sector holding the MDB so unrelated bytes are preserved. */
	sector_size = hfsmp->hfs_logical_block_size;
	retval = (int)buf_bread(hfsmp->hfs_devvp, (daddr64_t)HFS_PRI_SECTOR(sector_size), sector_size, NOCRED, &bp);
	if (retval) {
		if (bp)
			buf_brelse(bp);
		return retval;
	}

	lck_mtx_lock(&hfsmp->hfs_mutex);

	mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp) + HFS_PRI_OFFSET(sector_size));

	/* On-disk MDB fields are big-endian; dates are local time on plain HFS. */
	mdb->drCrDate	= SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->hfs_itime)));
	mdb->drLsMod	= SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbLsMod)));
	mdb->drAtrb	= SWAP_BE16 (vcb->vcbAtrb);
	mdb->drNmFls	= SWAP_BE16 (vcb->vcbNmFls);
	mdb->drAllocPtr	= SWAP_BE16 (vcb->nextAllocation);
	mdb->drClpSiz	= SWAP_BE32 (vcb->vcbClpSiz);
	mdb->drNxtCNID	= SWAP_BE32 (vcb->vcbNxtCNID);
	mdb->drFreeBks	= SWAP_BE16 (vcb->freeBlocks);

	/* Volume name is stored in the volume's HFS text encoding. */
	namelen = strlen((char *)vcb->vcbVN);
	retval = utf8_to_hfs(vcb, namelen, vcb->vcbVN, mdb->drVN);
	/* Retry with MacRoman in case that's how it was exported. */
	if (retval)
		retval = utf8_to_mac_roman(namelen, vcb->vcbVN, mdb->drVN);

	mdb->drVolBkUp	= SWAP_BE32 (UTCToLocal(to_hfs_time(vcb->vcbVolBkUp)));
	mdb->drWrCnt	= SWAP_BE32 (vcb->vcbWrCnt);
	mdb->drNmRtDirs	= SWAP_BE16 (vcb->vcbNmRtDirs);
	mdb->drFilCnt	= SWAP_BE32 (vcb->vcbFilCnt);
	mdb->drDirCnt	= SWAP_BE32 (vcb->vcbDirCnt);

	bcopy(vcb->vcbFndrInfo, mdb->drFndrInfo, sizeof(mdb->drFndrInfo));

	/* Extents overflow file: first three extents plus size/clump. */
	fp = VTOF(vcb->extentsRefNum);
	mdb->drXTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock);
	mdb->drXTExtRec[0].blockCount = SWAP_BE16 (fp->ff_extents[0].blockCount);
	mdb->drXTExtRec[1].startBlock = SWAP_BE16 (fp->ff_extents[1].startBlock);
	mdb->drXTExtRec[1].blockCount = SWAP_BE16 (fp->ff_extents[1].blockCount);
	mdb->drXTExtRec[2].startBlock = SWAP_BE16 (fp->ff_extents[2].startBlock);
	mdb->drXTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount);
	mdb->drXTFlSize	= SWAP_BE32 (fp->ff_blocks * vcb->blockSize);
	mdb->drXTClpSiz	= SWAP_BE32 (fp->ff_clumpsize);
	FTOC(fp)->c_flag &= ~C_MODIFIED;

	/* Catalog file: first three extents plus size/clump. */
	fp = VTOF(vcb->catalogRefNum);
	mdb->drCTExtRec[0].startBlock = SWAP_BE16 (fp->ff_extents[0].startBlock);
	mdb->drCTExtRec[0].blockCount = SWAP_BE16 (fp->ff_extents[0].blockCount);
	mdb->drCTExtRec[1].startBlock = SWAP_BE16 (fp->ff_extents[1].startBlock);
	mdb->drCTExtRec[1].blockCount = SWAP_BE16 (fp->ff_extents[1].blockCount);
	mdb->drCTExtRec[2].startBlock = SWAP_BE16 (fp->ff_extents[2].startBlock);
	mdb->drCTExtRec[2].blockCount = SWAP_BE16 (fp->ff_extents[2].blockCount);
	mdb->drCTFlSize	= SWAP_BE32 (fp->ff_blocks * vcb->blockSize);
	mdb->drCTClpSiz	= SWAP_BE32 (fp->ff_clumpsize);
	FTOC(fp)->c_flag &= ~C_MODIFIED;

	MarkVCBClean( vcb );

	lck_mtx_unlock(&hfsmp->hfs_mutex);

	/* If requested, flush out the alternate MDB */
	if (altflush) {
		struct buf *alt_bp = NULL;

		if (buf_meta_bread(hfsmp->hfs_devvp, hfsmp->hfs_alt_id_sector, sector_size, NOCRED, &alt_bp) == 0) {
			bcopy(mdb, (char *)buf_dataptr(alt_bp) + HFS_ALT_OFFSET(sector_size), kMDBSize);

			(void) VNOP_BWRITE(alt_bp);
		} else if (alt_bp)
			buf_brelse(alt_bp);
	}

	/* MNT_WAIT => synchronous write; otherwise fire-and-forget. */
	if (waitfor != MNT_WAIT)
		buf_bawrite(bp);
	else
		retval = VNOP_BWRITE(bp);

	return (retval);
}
3639
3640/*
3641 *  Flush any dirty in-memory mount data to the on-disk
3642 *  volume header.
3643 *
3644 *  Note: the on-disk volume signature is intentionally
3645 *  not flushed since the on-disk "H+" and "HX" signatures
3646 *  are always stored in-memory as "H+".
3647 */
3648int
3649hfs_flushvolumeheader(struct hfsmount *hfsmp, int waitfor, int altflush)
3650{
3651	ExtendedVCB *vcb = HFSTOVCB(hfsmp);
3652	struct filefork *fp;
3653	HFSPlusVolumeHeader *volumeHeader, *altVH;
3654	int retval;
3655	struct buf *bp, *alt_bp;
3656	int i;
3657	daddr64_t priIDSector;
3658	int critical;
3659	u_int16_t  signature;
3660	u_int16_t  hfsversion;
3661
3662	if (hfsmp->hfs_flags & HFS_READ_ONLY) {
3663		return(0);
3664	}
3665	if (hfsmp->hfs_flags & HFS_STANDARD) {
3666		return hfs_flushMDB(hfsmp, waitfor, altflush);
3667	}
3668	critical = altflush;
3669	priIDSector = (daddr64_t)((vcb->hfsPlusIOPosOffset / hfsmp->hfs_logical_block_size) +
3670				  HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size));
3671
3672	if (hfs_start_transaction(hfsmp) != 0) {
3673	    return EINVAL;
3674	}
3675
3676	bp = NULL;
3677	alt_bp = NULL;
3678
3679	retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
3680			HFS_PHYSBLK_ROUNDDOWN(priIDSector, hfsmp->hfs_log_per_phys),
3681			hfsmp->hfs_physical_block_size, NOCRED, &bp);
3682	if (retval) {
3683		printf("hfs: err %d reading VH blk (%s)\n", retval, vcb->vcbVN);
3684		goto err_exit;
3685	}
3686
3687	volumeHeader = (HFSPlusVolumeHeader *)((char *)buf_dataptr(bp) +
3688			HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
3689
3690	/*
3691	 * Sanity check what we just read.  If it's bad, try the alternate
3692	 * instead.
3693	 */
3694	signature = SWAP_BE16 (volumeHeader->signature);
3695	hfsversion   = SWAP_BE16 (volumeHeader->version);
3696	if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) ||
3697	    (hfsversion < kHFSPlusVersion) || (hfsversion > 100) ||
3698	    (SWAP_BE32 (volumeHeader->blockSize) != vcb->blockSize)) {
3699		printf("hfs: corrupt VH on %s, sig 0x%04x, ver %d, blksize %d%s\n",
3700		      vcb->vcbVN, signature, hfsversion,
3701		      SWAP_BE32 (volumeHeader->blockSize),
3702		      hfsmp->hfs_alt_id_sector ? "; trying alternate" : "");
3703		hfs_mark_volume_inconsistent(hfsmp);
3704
3705		if (hfsmp->hfs_alt_id_sector) {
3706			retval = buf_meta_bread(hfsmp->hfs_devvp,
3707			    HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
3708			    hfsmp->hfs_physical_block_size, NOCRED, &alt_bp);
3709			if (retval) {
3710				printf("hfs: err %d reading alternate VH (%s)\n", retval, vcb->vcbVN);
3711				goto err_exit;
3712			}
3713
3714			altVH = (HFSPlusVolumeHeader *)((char *)buf_dataptr(alt_bp) +
3715				HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size));
3716			signature = SWAP_BE16(altVH->signature);
3717			hfsversion = SWAP_BE16(altVH->version);
3718
3719			if ((signature != kHFSPlusSigWord && signature != kHFSXSigWord) ||
3720			    (hfsversion < kHFSPlusVersion) || (kHFSPlusVersion > 100) ||
3721			    (SWAP_BE32(altVH->blockSize) != vcb->blockSize)) {
3722				printf("hfs: corrupt alternate VH on %s, sig 0x%04x, ver %d, blksize %d\n",
3723				    vcb->vcbVN, signature, hfsversion,
3724				    SWAP_BE32(altVH->blockSize));
3725				retval = EIO;
3726				goto err_exit;
3727			}
3728
3729			/* The alternate is plausible, so use it. */
3730			bcopy(altVH, volumeHeader, kMDBSize);
3731			buf_brelse(alt_bp);
3732			alt_bp = NULL;
3733		} else {
3734			/* No alternate VH, nothing more we can do. */
3735			retval = EIO;
3736			goto err_exit;
3737		}
3738	}
3739
3740	if (hfsmp->jnl) {
3741		journal_modify_block_start(hfsmp->jnl, bp);
3742	}
3743
3744	/*
3745	 * For embedded HFS+ volumes, update create date if it changed
3746	 * (ie from a setattrlist call)
3747	 */
3748	if ((vcb->hfsPlusIOPosOffset != 0) &&
3749	    (SWAP_BE32 (volumeHeader->createDate) != vcb->localCreateDate)) {
3750		struct buf *bp2;
3751		HFSMasterDirectoryBlock	*mdb;
3752
3753		retval = (int)buf_meta_bread(hfsmp->hfs_devvp,
3754				HFS_PHYSBLK_ROUNDDOWN(HFS_PRI_SECTOR(hfsmp->hfs_logical_block_size), hfsmp->hfs_log_per_phys),
3755				hfsmp->hfs_physical_block_size, NOCRED, &bp2);
3756		if (retval) {
3757			if (bp2)
3758				buf_brelse(bp2);
3759			retval = 0;
3760		} else {
3761			mdb = (HFSMasterDirectoryBlock *)(buf_dataptr(bp2) +
3762				HFS_PRI_OFFSET(hfsmp->hfs_physical_block_size));
3763
3764			if ( SWAP_BE32 (mdb->drCrDate) != vcb->localCreateDate )
3765			  {
3766				if (hfsmp->jnl) {
3767				    journal_modify_block_start(hfsmp->jnl, bp2);
3768				}
3769
3770				mdb->drCrDate = SWAP_BE32 (vcb->localCreateDate);	/* pick up the new create date */
3771
3772				if (hfsmp->jnl) {
3773					journal_modify_block_end(hfsmp->jnl, bp2, NULL, NULL);
3774				} else {
3775					(void) VNOP_BWRITE(bp2);		/* write out the changes */
3776				}
3777			  }
3778			else
3779			  {
3780				buf_brelse(bp2);						/* just release it */
3781			  }
3782		  }
3783	}
3784
3785	lck_mtx_lock(&hfsmp->hfs_mutex);
3786
3787	/* Note: only update the lower 16 bits worth of attributes */
3788	volumeHeader->attributes       = SWAP_BE32 (vcb->vcbAtrb);
3789	volumeHeader->journalInfoBlock = SWAP_BE32 (vcb->vcbJinfoBlock);
3790	if (hfsmp->jnl) {
3791		volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSJMountVersion);
3792	} else {
3793		volumeHeader->lastMountedVersion = SWAP_BE32 (kHFSPlusMountVersion);
3794	}
3795	volumeHeader->createDate	= SWAP_BE32 (vcb->localCreateDate);  /* volume create date is in local time */
3796	volumeHeader->modifyDate	= SWAP_BE32 (to_hfs_time(vcb->vcbLsMod));
3797	volumeHeader->backupDate	= SWAP_BE32 (to_hfs_time(vcb->vcbVolBkUp));
3798	volumeHeader->fileCount		= SWAP_BE32 (vcb->vcbFilCnt);
3799	volumeHeader->folderCount	= SWAP_BE32 (vcb->vcbDirCnt);
3800	volumeHeader->totalBlocks	= SWAP_BE32 (vcb->totalBlocks);
3801	volumeHeader->freeBlocks	= SWAP_BE32 (vcb->freeBlocks);
3802	volumeHeader->nextAllocation	= SWAP_BE32 (vcb->nextAllocation);
3803	volumeHeader->rsrcClumpSize	= SWAP_BE32 (vcb->vcbClpSiz);
3804	volumeHeader->dataClumpSize	= SWAP_BE32 (vcb->vcbClpSiz);
3805	volumeHeader->nextCatalogID	= SWAP_BE32 (vcb->vcbNxtCNID);
3806	volumeHeader->writeCount	= SWAP_BE32 (vcb->vcbWrCnt);
3807	volumeHeader->encodingsBitmap	= SWAP_BE64 (vcb->encodingsBitmap);
3808
3809	if (bcmp(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo)) != 0) {
3810		bcopy(vcb->vcbFndrInfo, volumeHeader->finderInfo, sizeof(volumeHeader->finderInfo));
3811		critical = 1;
3812	}
3813
3814	/*
3815	 * System files are only dirty when altflush is set.
3816	 */
3817	if (altflush == 0) {
3818		goto done;
3819	}
3820
3821	/* Sync Extents over-flow file meta data */
3822	fp = VTOF(vcb->extentsRefNum);
3823	if (FTOC(fp)->c_flag & C_MODIFIED) {
3824		for (i = 0; i < kHFSPlusExtentDensity; i++) {
3825			volumeHeader->extentsFile.extents[i].startBlock	=
3826				SWAP_BE32 (fp->ff_extents[i].startBlock);
3827			volumeHeader->extentsFile.extents[i].blockCount	=
3828				SWAP_BE32 (fp->ff_extents[i].blockCount);
3829		}
3830		volumeHeader->extentsFile.logicalSize = SWAP_BE64 (fp->ff_size);
3831		volumeHeader->extentsFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3832		volumeHeader->extentsFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3833		FTOC(fp)->c_flag &= ~C_MODIFIED;
3834	}
3835
3836	/* Sync Catalog file meta data */
3837	fp = VTOF(vcb->catalogRefNum);
3838	if (FTOC(fp)->c_flag & C_MODIFIED) {
3839		for (i = 0; i < kHFSPlusExtentDensity; i++) {
3840			volumeHeader->catalogFile.extents[i].startBlock	=
3841				SWAP_BE32 (fp->ff_extents[i].startBlock);
3842			volumeHeader->catalogFile.extents[i].blockCount	=
3843				SWAP_BE32 (fp->ff_extents[i].blockCount);
3844		}
3845		volumeHeader->catalogFile.logicalSize = SWAP_BE64 (fp->ff_size);
3846		volumeHeader->catalogFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3847		volumeHeader->catalogFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3848		FTOC(fp)->c_flag &= ~C_MODIFIED;
3849	}
3850
3851	/* Sync Allocation file meta data */
3852	fp = VTOF(vcb->allocationsRefNum);
3853	if (FTOC(fp)->c_flag & C_MODIFIED) {
3854		for (i = 0; i < kHFSPlusExtentDensity; i++) {
3855			volumeHeader->allocationFile.extents[i].startBlock =
3856				SWAP_BE32 (fp->ff_extents[i].startBlock);
3857			volumeHeader->allocationFile.extents[i].blockCount =
3858				SWAP_BE32 (fp->ff_extents[i].blockCount);
3859		}
3860		volumeHeader->allocationFile.logicalSize = SWAP_BE64 (fp->ff_size);
3861		volumeHeader->allocationFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3862		volumeHeader->allocationFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3863		FTOC(fp)->c_flag &= ~C_MODIFIED;
3864	}
3865
3866	/* Sync Attribute file meta data */
3867	if (hfsmp->hfs_attribute_vp) {
3868		fp = VTOF(hfsmp->hfs_attribute_vp);
3869		for (i = 0; i < kHFSPlusExtentDensity; i++) {
3870			volumeHeader->attributesFile.extents[i].startBlock =
3871				SWAP_BE32 (fp->ff_extents[i].startBlock);
3872			volumeHeader->attributesFile.extents[i].blockCount =
3873				SWAP_BE32 (fp->ff_extents[i].blockCount);
3874		}
3875		FTOC(fp)->c_flag &= ~C_MODIFIED;
3876		volumeHeader->attributesFile.logicalSize = SWAP_BE64 (fp->ff_size);
3877		volumeHeader->attributesFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3878		volumeHeader->attributesFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3879	}
3880
3881	/* Sync Startup file meta data */
3882	if (hfsmp->hfs_startup_vp) {
3883		fp = VTOF(hfsmp->hfs_startup_vp);
3884		if (FTOC(fp)->c_flag & C_MODIFIED) {
3885			for (i = 0; i < kHFSPlusExtentDensity; i++) {
3886				volumeHeader->startupFile.extents[i].startBlock =
3887					SWAP_BE32 (fp->ff_extents[i].startBlock);
3888				volumeHeader->startupFile.extents[i].blockCount =
3889					SWAP_BE32 (fp->ff_extents[i].blockCount);
3890			}
3891			volumeHeader->startupFile.logicalSize = SWAP_BE64 (fp->ff_size);
3892			volumeHeader->startupFile.totalBlocks = SWAP_BE32 (fp->ff_blocks);
3893			volumeHeader->startupFile.clumpSize   = SWAP_BE32 (fp->ff_clumpsize);
3894			FTOC(fp)->c_flag &= ~C_MODIFIED;
3895		}
3896	}
3897
3898done:
3899	MarkVCBClean(hfsmp);
3900	lck_mtx_unlock(&hfsmp->hfs_mutex);
3901
3902	/* If requested, flush out the alternate volume header */
3903	if (altflush && hfsmp->hfs_alt_id_sector) {
3904		if (buf_meta_bread(hfsmp->hfs_devvp,
3905				HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
3906				hfsmp->hfs_physical_block_size, NOCRED, &alt_bp) == 0) {
3907			if (hfsmp->jnl) {
3908				journal_modify_block_start(hfsmp->jnl, alt_bp);
3909			}
3910
3911			bcopy(volumeHeader, (char *)buf_dataptr(alt_bp) +
3912					HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size),
3913					kMDBSize);
3914
3915			if (hfsmp->jnl) {
3916				journal_modify_block_end(hfsmp->jnl, alt_bp, NULL, NULL);
3917			} else {
3918				(void) VNOP_BWRITE(alt_bp);
3919			}
3920		} else if (alt_bp)
3921			buf_brelse(alt_bp);
3922	}
3923
3924	if (hfsmp->jnl) {
3925		journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL);
3926	} else {
3927		if (waitfor != MNT_WAIT)
3928			buf_bawrite(bp);
3929		else {
3930		    retval = VNOP_BWRITE(bp);
3931		    /* When critical data changes, flush the device cache */
3932		    if (critical && (retval == 0)) {
3933			(void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE,
3934					 NULL, FWRITE, NULL);
3935		    }
3936		}
3937	}
3938	hfs_end_transaction(hfsmp);
3939
3940	return (retval);
3941
3942err_exit:
3943	if (alt_bp)
3944		buf_brelse(alt_bp);
3945	if (bp)
3946		buf_brelse(bp);
3947	hfs_end_transaction(hfsmp);
3948	return retval;
3949}
3950
3951
3952/*
3953 * Extend a file system.
3954 */
/*
 * hfs_extendfs - grow a mounted HFS+ volume to 'newsize' bytes.
 *
 * Only journaled, non-embedded HFS Plus volumes may be extended.  When
 * the caller is not superuser, ownership of the volume root plus write
 * access to both the root and the underlying device are required.
 *
 * The overall sequence is: validate sizes against the device geometry,
 * flush the journal, start a transaction, grow the allocation bitmap
 * file if the new block count needs more bitmap space, zero-fill the
 * new bitmap blocks, move the alternate volume header (AVH) to the new
 * end of disk, flush the volume header, invalidate the old AVH, and
 * finally extend the journal if needed.  On a failed volume-header
 * flush the in-memory state is rolled back.
 *
 * Returns 0 on success or an errno-style error (EPERM, ENXIO, ENOSPC,
 * EINVAL, EOVERFLOW, EALREADY, ...) mapped through MacToVFSError().
 */
int
hfs_extendfs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context)
{
	struct proc *p = vfs_context_proc(context);
	kauth_cred_t cred = vfs_context_ucred(context);
	struct  vnode *vp;
	struct  vnode *devvp;
	struct  buf *bp;
	struct  filefork *fp = NULL;
	ExtendedVCB  *vcb;
	struct  cat_fork forkdata;	/* saved copy of the bitmap fork, used to restore it on error */
	u_int64_t  oldsize;
	u_int64_t  newblkcnt;
	u_int64_t  prev_phys_block_count;
	u_int32_t  addblks;
	u_int64_t  sector_count;
	u_int32_t  sector_size;
	u_int32_t  phys_sector_size;
	u_int32_t  overage_blocks;
	daddr64_t  prev_alt_sector;
	daddr_t	   bitmapblks;
	int  lockflags = 0;
	int  error;
	int64_t oldBitmapSize;
	Boolean  usedExtendFileC = false;	/* true if the bitmap grew via ExtendFileC (vs. AddFileExtent) */
	int transaction_begun = 0;

	devvp = hfsmp->hfs_devvp;
	vcb = HFSTOVCB(hfsmp);

	/*
	 * - HFS Plus file systems only.
	 * - Journaling must be enabled.
	 * - No embedded volumes.
	 */
	if ((vcb->vcbSigWord == kHFSSigWord) ||
	     (hfsmp->jnl == NULL) ||
	     (vcb->hfsPlusIOPosOffset != 0)) {
		return (EPERM);
	}
	/*
	 * If extending file system by non-root, then verify
	 * ownership and check permissions.
	 */
	if (suser(cred, NULL)) {
		error = hfs_vget(hfsmp, kHFSRootFolderID, &vp, 0, 0);

		if (error)
			return (error);
		error = hfs_owner_rights(hfsmp, VTOC(vp)->c_uid, cred, p, 0);
		if (error == 0) {
			error = hfs_write_access(vp, cred, p, false);
		}
		hfs_unlock(VTOC(vp));
		vnode_put(vp);
		if (error)
			return (error);

		error = vnode_authorize(devvp, NULL, KAUTH_VNODE_READ_DATA | KAUTH_VNODE_WRITE_DATA, context);
		if (error)
			return (error);
	}
	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKSIZE, (caddr_t)&sector_size, 0, context)) {
		return (ENXIO);
	}
	/* The device's logical block size must match what the volume was mounted with. */
	if (sector_size != hfsmp->hfs_logical_block_size) {
		return (ENXIO);
	}
	if (VNOP_IOCTL(devvp, DKIOCGETBLOCKCOUNT, (caddr_t)&sector_count, 0, context)) {
		return (ENXIO);
	}
	/* The device itself must be large enough to hold the requested size. */
	if ((sector_size * sector_count) < newsize) {
		printf("hfs_extendfs: not enough space on device\n");
		return (ENOSPC);
	}
	error = VNOP_IOCTL(devvp, DKIOCGETPHYSICALBLOCKSIZE, (caddr_t)&phys_sector_size, 0, context);
	if (error) {
		if ((error != ENOTSUP) && (error != ENOTTY)) {
			return (ENXIO);
		}
		/* If ioctl is not supported, force physical and logical sector size to be same */
		phys_sector_size = sector_size;
	}
	oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;

	/*
	 * Validate new size.
	 */
	if ((newsize <= oldsize) || (newsize % sector_size) || (newsize % phys_sector_size)) {
		printf("hfs_extendfs: invalid size\n");
		return (EINVAL);
	}
	newblkcnt = newsize / vcb->blockSize;
	/* HFS+ allocation block counts are 32-bit on disk. */
	if (newblkcnt > (u_int64_t)0xFFFFFFFF)
		return (EOVERFLOW);

	addblks = newblkcnt - vcb->totalBlocks;

	if (hfs_resize_debug) {
		printf ("hfs_extendfs: old: size=%qu, blkcnt=%u\n", oldsize, hfsmp->totalBlocks);
		printf ("hfs_extendfs: new: size=%qu, blkcnt=%u, addblks=%u\n", newsize, (u_int32_t)newblkcnt, addblks);
	}
	printf("hfs_extendfs: will extend \"%s\" by %d blocks\n", vcb->vcbVN, addblks);

	/* Only one resize operation may run on a volume at a time. */
	HFS_MOUNT_LOCK(hfsmp, TRUE);
	if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) {
		HFS_MOUNT_UNLOCK(hfsmp, TRUE);
		error = EALREADY;
		goto out;
	}
	hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
	HFS_MOUNT_UNLOCK(hfsmp, TRUE);

	/* Start with a clean journal. */
	hfs_journal_flush(hfsmp, TRUE);

	/*
	 * Enclose changes inside a transaction.
	 */
	if (hfs_start_transaction(hfsmp) != 0) {
		error = EINVAL;
		goto out;
	}
	transaction_begun = 1;


	/* Update the hfsmp fields for the physical information about the device */
	prev_phys_block_count = hfsmp->hfs_logical_block_count;
	prev_alt_sector = hfsmp->hfs_alt_id_sector;

	hfsmp->hfs_logical_block_count = sector_count;
	/*
	 * Note that the new AltVH location must be based on the device's EOF rather than the new
	 * filesystem's EOF, so we use logical_block_count here rather than newsize.
	 */
	hfsmp->hfs_alt_id_sector = (hfsmp->hfsPlusIOPosOffset / sector_size) +
	                          HFS_ALT_SECTOR(sector_size, hfsmp->hfs_logical_block_count);
	hfsmp->hfs_logical_bytes = (uint64_t) sector_count * (uint64_t) sector_size;


	/*
	 * Note: we take the attributes lock in case we have an attribute data vnode
	 * which needs to change size.
	 */
	lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
	vp = vcb->allocationsRefNum;
	fp = VTOF(vp);
	/* Snapshot the bitmap fork so it can be restored verbatim at 'out' on failure. */
	bcopy(&fp->ff_data, &forkdata, sizeof(forkdata));

	/*
	 * Calculate additional space required (if any) by allocation bitmap.
	 */
	oldBitmapSize = fp->ff_size;
	/* One bit per allocation block, rounded up to the bitmap I/O size, in allocation blocks. */
	bitmapblks = roundup((newblkcnt+7) / 8, vcb->vcbVBMIOSize) / vcb->blockSize;
	if (bitmapblks > (daddr_t)fp->ff_blocks)
		bitmapblks -= fp->ff_blocks;
	else
		bitmapblks = 0;

	/*
	 * The allocation bitmap can contain unused bits that are beyond end of
	 * current volume's allocation blocks.  Usually they are supposed to be
	 * zero'ed out but there can be cases where they might be marked as used.
	 * After extending the file system, those bits can represent valid
	 * allocation blocks, so we mark all the bits from the end of current
	 * volume to end of allocation bitmap as "free".
	 *
	 * Figure out the number of overage blocks before proceeding though,
	 * so we don't add more bytes to our I/O than necessary.
	 * First figure out the total number of blocks representable by the
	 * end of the bitmap file vs. the total number of blocks in the new FS.
	 * Then subtract away the number of blocks in the current FS.  This is how much
	 * we can mark as free right now without having to grow the bitmap file.
	 */
	/* NOTE(review): ff_blocks * blockSize * 8 is evaluated in 32-bit arithmetic;
	 * presumably the MIN() against newblkcnt keeps the result in range for
	 * supported volume sizes -- confirm for very large bitmaps. */
	overage_blocks = fp->ff_blocks * vcb->blockSize * 8;
	overage_blocks = MIN (overage_blocks, newblkcnt);
   	overage_blocks -= vcb->totalBlocks;

	BlockMarkFreeUnused(vcb, vcb->totalBlocks, overage_blocks);

	if (bitmapblks > 0) {
		daddr64_t blkno;
		daddr_t blkcnt;
		off_t bytesAdded;

		/*
		 * Get the bitmap's current size (in allocation blocks) so we know
		 * where to start zero filling once the new space is added.  We've
		 * got to do this before the bitmap is grown.
		 */
		blkno  = (daddr64_t)fp->ff_blocks;

		/*
		 * Try to grow the allocation file in the normal way, using allocation
		 * blocks already existing in the file system.  This way, we might be
		 * able to grow the bitmap contiguously, or at least in the metadata
		 * zone.
		 */
		error = ExtendFileC(vcb, fp, bitmapblks * vcb->blockSize, 0,
				kEFAllMask | kEFNoClumpMask | kEFReserveMask
				| kEFMetadataMask | kEFContigMask, &bytesAdded);

		if (error == 0) {
			usedExtendFileC = true;
		} else {
			/*
			 * If the above allocation failed, fall back to allocating the new
			 * extent of the bitmap from the space we're going to add.  Since those
			 * blocks don't yet belong to the file system, we have to update the
			 * extent list directly, and manually adjust the file size.
			 */
			bytesAdded = 0;
			error = AddFileExtent(vcb, fp, vcb->totalBlocks, bitmapblks);
			if (error) {
				printf("hfs_extendfs: error %d adding extents\n", error);
				goto out;
			}
			fp->ff_blocks += bitmapblks;
			VTOC(vp)->c_blocks = fp->ff_blocks;
			VTOC(vp)->c_flag |= C_MODIFIED;
		}

		/*
		 * Update the allocation file's size to include the newly allocated
		 * blocks.  Note that ExtendFileC doesn't do this, which is why this
		 * statement is outside the above "if" statement.
		 */
		fp->ff_size += (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize;

		/*
		 * Zero out the new bitmap blocks.
		 */
		{

			bp = NULL;
			blkcnt = bitmapblks;
			while (blkcnt > 0) {
				/* Read each new bitmap block, clear it, and write it synchronously. */
				error = (int)buf_meta_bread(vp, blkno, vcb->blockSize, NOCRED, &bp);
				if (error) {
					if (bp) {
						buf_brelse(bp);
					}
					break;
				}
				bzero((char *)buf_dataptr(bp), vcb->blockSize);
				buf_markaged(bp);
				error = (int)buf_bwrite(bp);
				if (error)
					break;
				--blkcnt;
				++blkno;
			}
		}
		if (error) {
			printf("hfs_extendfs: error %d  clearing blocks\n", error);
			goto out;
		}
		/*
		 * Mark the new bitmap space as allocated.
		 *
		 * Note that ExtendFileC will have marked any blocks it allocated, so
		 * this is only needed if we used AddFileExtent.  Also note that this
		 * has to come *after* the zero filling of new blocks in the case where
		 * we used AddFileExtent (since the part of the bitmap we're touching
		 * is in those newly allocated blocks).
		 */
		if (!usedExtendFileC) {
			error = BlockMarkAllocated(vcb, vcb->totalBlocks, bitmapblks);
			if (error) {
				printf("hfs_extendfs: error %d setting bitmap\n", error);
				goto out;
			}
			vcb->freeBlocks -= bitmapblks;
		}
	}
	/*
	 * Mark the new alternate VH as allocated.
	 * The 1KB alternate volume header needs two allocation blocks on a
	 * 512-byte-block volume, one otherwise.
	 */
	if (vcb->blockSize == 512)
		error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 2, 2);
	else
		error = BlockMarkAllocated(vcb, vcb->totalBlocks + addblks - 1, 1);
	if (error) {
		printf("hfs_extendfs: error %d setting bitmap (VH)\n", error);
		goto out;
	}
	/*
	 * Mark the old alternate VH as free.
	 */
	if (vcb->blockSize == 512)
		(void) BlockMarkFree(vcb, vcb->totalBlocks - 2, 2);
	else
		(void) BlockMarkFree(vcb, vcb->totalBlocks - 1, 1);
	/*
	 * Adjust file system variables for new space.
	 */
	vcb->totalBlocks += addblks;
	vcb->freeBlocks += addblks;
	MarkVCBDirty(vcb);
	error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
	if (error) {
		printf("hfs_extendfs: couldn't flush volume headers (%d)", error);
		/*
		 * Restore to old state.
		 */
		if (usedExtendFileC) {
			(void) TruncateFileC(vcb, fp, oldBitmapSize, 0, FORK_IS_RSRC(fp),
								 FTOC(fp)->c_fileid, false);
		} else {
			fp->ff_blocks -= bitmapblks;
			fp->ff_size -= (u_int64_t)bitmapblks * (u_int64_t)vcb->blockSize;
			/*
			 * No need to mark the excess blocks free since those bitmap blocks
			 * are no longer part of the bitmap.  But we do need to undo the
			 * effect of the "vcb->freeBlocks -= bitmapblks" above.
			 */
			vcb->freeBlocks += bitmapblks;
		}
		vcb->totalBlocks -= addblks;
		vcb->freeBlocks -= addblks;
		hfsmp->hfs_logical_block_count = prev_phys_block_count;
		hfsmp->hfs_alt_id_sector = prev_alt_sector;
		MarkVCBDirty(vcb);
		/* Re-mark the (restored) alternate VH blocks as allocated; a failure
		 * here leaves the bitmap out of sync, so flag the volume for fsck. */
		if (vcb->blockSize == 512) {
			if (BlockMarkAllocated(vcb, vcb->totalBlocks - 2, 2)) {
				hfs_mark_volume_inconsistent(hfsmp);
			}
		} else {
			if (BlockMarkAllocated(vcb, vcb->totalBlocks - 1, 1)) {
				hfs_mark_volume_inconsistent(hfsmp);
			}
		}
		goto out;
	}
	/*
	 * Invalidate the old alternate volume header.
	 */
	bp = NULL;
	if (prev_alt_sector) {
		if (buf_meta_bread(hfsmp->hfs_devvp,
				HFS_PHYSBLK_ROUNDDOWN(prev_alt_sector, hfsmp->hfs_log_per_phys),
				hfsmp->hfs_physical_block_size, NOCRED, &bp) == 0) {
			journal_modify_block_start(hfsmp->jnl, bp);

			/* Zero the old AVH bytes within the journaled block. */
			bzero((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size), kMDBSize);

			journal_modify_block_end(hfsmp->jnl, bp, NULL, NULL);
		} else if (bp) {
			buf_brelse(bp);
		}
	}

	/*
	 * Update the metadata zone size based on current volume size
	 */
	hfs_metadatazone_init(hfsmp, false);

	/*
	 * Adjust the size of hfsmp->hfs_attrdata_vp
	 */
	if (hfsmp->hfs_attrdata_vp) {
		struct cnode *attr_cp;
		struct filefork *attr_fp;

		if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) {
			attr_cp = VTOC(hfsmp->hfs_attrdata_vp);
			attr_fp = VTOF(hfsmp->hfs_attrdata_vp);

			attr_cp->c_blocks = newblkcnt;
			attr_fp->ff_blocks = newblkcnt;
			attr_fp->ff_extents[0].blockCount = newblkcnt;
			attr_fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize;
			ubc_setsize(hfsmp->hfs_attrdata_vp, attr_fp->ff_size);
			vnode_put(hfsmp->hfs_attrdata_vp);
		}
	}

	/*
	 * Update the R/B Tree if necessary.  Since we don't have to drop the systemfile
	 * locks in the middle of these operations like we do in the truncate case
	 * where we have to relocate files, we can only update the red-black tree
	 * if there were actual changes made to the bitmap.  Also, we can't really scan the
	 * new portion of the bitmap before it has been allocated. The BlockMarkAllocated
	 * routines are smart enough to avoid the r/b tree if the portion they are manipulating is
	 * not currently controlled by the tree.
	 *
	 * We only update hfsmp->allocLimit if totalBlocks actually increased.
	 */
	if (error == 0) {
		UpdateAllocLimit(hfsmp, hfsmp->totalBlocks);
	}

	/* Release all locks and sync up journal content before
	 * checking and extending, if required, the journal
	 */
	if (lockflags) {
		hfs_systemfile_unlock(hfsmp, lockflags);
		lockflags = 0;
	}
	if (transaction_begun) {
		hfs_end_transaction(hfsmp);
		hfs_journal_flush(hfsmp, TRUE);
		transaction_begun = 0;
	}

	/* Increase the journal size, if required. */
	error = hfs_extend_journal(hfsmp, sector_size, sector_count, context);
	if (error) {
		printf ("hfs_extendfs: Could not extend journal size\n");
		goto out_noalloc;
	}

	/* Log successful extending */
	printf("hfs_extendfs: extended \"%s\" to %d blocks (was %d blocks)\n",
	       hfsmp->vcbVN, hfsmp->totalBlocks, (u_int32_t)(oldsize/hfsmp->blockSize));

out:
	/* Error path (reached with error set): put the bitmap fork back the way it was. */
	if (error && fp) {
		/* Restore allocation fork. */
		bcopy(&forkdata, &fp->ff_data, sizeof(forkdata));
		VTOC(vp)->c_blocks = fp->ff_blocks;

	}

out_noalloc:
	/* Common exit: clear the in-progress flag, drop locks, close any
	 * open transaction, and flush the device cache. */
	HFS_MOUNT_LOCK(hfsmp, TRUE);
	hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS;
	HFS_MOUNT_UNLOCK(hfsmp, TRUE);
	if (lockflags) {
		hfs_systemfile_unlock(hfsmp, lockflags);
	}
	if (transaction_begun) {
		hfs_end_transaction(hfsmp);
		hfs_journal_flush(hfsmp, FALSE);
		/* Just to be sure, sync all data to the disk */
		(void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
	}

	return MacToVFSError(error);
}
4395
4396#define HFS_MIN_SIZE  (32LL * 1024LL * 1024LL)
4397
4398/*
4399 * Truncate a file system (while still mounted).
4400 */
4401int
4402hfs_truncatefs(struct hfsmount *hfsmp, u_int64_t newsize, vfs_context_t context)
4403{
4404	struct  buf *bp = NULL;
4405	u_int64_t oldsize;
4406	u_int32_t newblkcnt;
4407	u_int32_t reclaimblks = 0;
4408	int lockflags = 0;
4409	int transaction_begun = 0;
4410	Boolean updateFreeBlocks = false;
4411	Boolean disable_sparse = false;
4412	int error = 0;
4413
4414	lck_mtx_lock(&hfsmp->hfs_mutex);
4415	if (hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) {
4416		lck_mtx_unlock(&hfsmp->hfs_mutex);
4417		return (EALREADY);
4418	}
4419	hfsmp->hfs_flags |= HFS_RESIZE_IN_PROGRESS;
4420	hfsmp->hfs_resize_blocksmoved = 0;
4421	hfsmp->hfs_resize_totalblocks = 0;
4422	hfsmp->hfs_resize_progress = 0;
4423	lck_mtx_unlock(&hfsmp->hfs_mutex);
4424
4425	/*
4426	 * - Journaled HFS Plus volumes only.
4427	 * - No embedded volumes.
4428	 */
4429	if ((hfsmp->jnl == NULL) ||
4430	    (hfsmp->hfsPlusIOPosOffset != 0)) {
4431		error = EPERM;
4432		goto out;
4433	}
4434	oldsize = (u_int64_t)hfsmp->totalBlocks * (u_int64_t)hfsmp->blockSize;
4435	newblkcnt = newsize / hfsmp->blockSize;
4436	reclaimblks = hfsmp->totalBlocks - newblkcnt;
4437
4438	if (hfs_resize_debug) {
4439		printf ("hfs_truncatefs: old: size=%qu, blkcnt=%u, freeblks=%u\n", oldsize, hfsmp->totalBlocks, hfs_freeblks(hfsmp, 1));
4440		printf ("hfs_truncatefs: new: size=%qu, blkcnt=%u, reclaimblks=%u\n", newsize, newblkcnt, reclaimblks);
4441	}
4442
4443	/* Make sure new size is valid. */
4444	if ((newsize < HFS_MIN_SIZE) ||
4445	    (newsize >= oldsize) ||
4446	    (newsize % hfsmp->hfs_logical_block_size) ||
4447	    (newsize % hfsmp->hfs_physical_block_size)) {
4448		printf ("hfs_truncatefs: invalid size (newsize=%qu, oldsize=%qu)\n", newsize, oldsize);
4449		error = EINVAL;
4450		goto out;
4451	}
4452
4453	/*
4454	 * Make sure that the file system has enough free blocks reclaim.
4455	 *
4456	 * Before resize, the disk is divided into four zones -
4457	 * 	A. Allocated_Stationary - These are allocated blocks that exist
4458	 * 	   before the new end of disk.  These blocks will not be
4459	 * 	   relocated or modified during resize.
4460	 * 	B. Free_Stationary - These are free blocks that exist before the
4461	 * 	   new end of disk.  These blocks can be used for any new
4462	 * 	   allocations during resize, including allocation for relocating
4463	 * 	   data from the area of disk being reclaimed.
4464	 * 	C. Allocated_To-Reclaim - These are allocated blocks that exist
4465	 *         beyond the new end of disk.  These blocks need to be reclaimed
4466	 *         during resize by allocating equal number of blocks in Free
4467	 *         Stationary zone and copying the data.
4468	 *      D. Free_To-Reclaim - These are free blocks that exist beyond the
4469	 *         new end of disk.  Nothing special needs to be done to reclaim
4470	 *         them.
4471	 *
4472	 * Total number of blocks on the disk before resize:
4473	 * ------------------------------------------------
4474	 * 	Total Blocks = Allocated_Stationary + Free_Stationary +
4475	 * 	               Allocated_To-Reclaim + Free_To-Reclaim
4476	 *
4477	 * Total number of blocks that need to be reclaimed:
4478	 * ------------------------------------------------
4479	 *	Blocks to Reclaim = Allocated_To-Reclaim + Free_To-Reclaim
4480	 *
4481	 * Note that the check below also makes sure that we have enough space
4482	 * to relocate data from Allocated_To-Reclaim to Free_Stationary.
4483	 * Therefore we do not need to check total number of blocks to relocate
4484	 * later in the code.
4485	 *
4486	 * The condition below gets converted to:
4487	 *
4488	 * Allocated To-Reclaim + Free To-Reclaim >= Free Stationary + Free To-Reclaim
4489	 *
4490	 * which is equivalent to:
4491	 *
4492	 *              Allocated To-Reclaim >= Free Stationary
4493	 */
4494	if (reclaimblks >= hfs_freeblks(hfsmp, 1)) {
4495		printf("hfs_truncatefs: insufficient space (need %u blocks; have %u free blocks)\n", reclaimblks, hfs_freeblks(hfsmp, 1));
4496		error = ENOSPC;
4497		goto out;
4498	}
4499
4500	/* Start with a clean journal. */
4501	hfs_journal_flush(hfsmp, TRUE);
4502
4503	if (hfs_start_transaction(hfsmp) != 0) {
4504		error = EINVAL;
4505		goto out;
4506	}
4507	transaction_begun = 1;
4508
4509	/* Take the bitmap lock to update the alloc limit field */
4510	lockflags = hfs_systemfile_lock(hfsmp, SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4511
4512	/*
4513	 * Prevent new allocations from using the part we're trying to truncate.
4514	 *
4515	 * NOTE: allocLimit is set to the allocation block number where the new
4516	 * alternate volume header will be.  That way there will be no files to
4517	 * interfere with allocating the new alternate volume header, and no files
4518	 * in the allocation blocks beyond (i.e. the blocks we're trying to
4519	 * truncate away.
4520	 *
4521	 * Also shrink the red-black tree if needed.
4522	 */
4523	if (hfsmp->blockSize == 512) {
4524		error = UpdateAllocLimit (hfsmp, newblkcnt - 2);
4525	}
4526	else {
4527		error = UpdateAllocLimit (hfsmp, newblkcnt - 1);
4528	}
4529
4530	/* Sparse devices use first fit allocation which is not ideal
4531	 * for volume resize which requires best fit allocation.  If a
4532	 * sparse device is being truncated, disable the sparse device
4533	 * property temporarily for the duration of resize.  Also reset
4534	 * the free extent cache so that it is rebuilt as sorted by
4535	 * totalBlocks instead of startBlock.
4536	 *
4537	 * Note that this will affect all allocations on the volume and
4538	 * ideal fix would be just to modify resize-related allocations,
4539	 * but it will result in complexity like handling of two free
4540	 * extent caches sorted differently, etc.  So we stick to this
4541	 * solution for now.
4542	 */
4543	HFS_MOUNT_LOCK(hfsmp, TRUE);
4544	if (hfsmp->hfs_flags & HFS_HAS_SPARSE_DEVICE) {
4545		hfsmp->hfs_flags &= ~HFS_HAS_SPARSE_DEVICE;
4546		ResetVCBFreeExtCache(hfsmp);
4547		disable_sparse = true;
4548	}
4549
4550	/*
4551	 * Update the volume free block count to reflect the total number
4552	 * of free blocks that will exist after a successful resize.
4553	 * Relocation of extents will result in no net change in the total
4554	 * free space on the disk.  Therefore the code that allocates
4555	 * space for new extent and deallocates the old extent explicitly
4556	 * prevents updating the volume free block count.  It will also
4557	 * prevent false disk full error when the number of blocks in
4558	 * an extent being relocated is more than the free blocks that
4559	 * will exist after the volume is resized.
4560	 */
4561	hfsmp->freeBlocks -= reclaimblks;
4562	updateFreeBlocks = true;
4563	HFS_MOUNT_UNLOCK(hfsmp, TRUE);
4564
4565	if (lockflags) {
4566		hfs_systemfile_unlock(hfsmp, lockflags);
4567		lockflags = 0;
4568	}
4569
4570	/*
4571	 * Update the metadata zone size to match the new volume size,
4572	 * and if it too less, metadata zone might be disabled.
4573	 */
4574	hfs_metadatazone_init(hfsmp, false);
4575
4576	/*
4577	 * If some files have blocks at or beyond the location of the
4578	 * new alternate volume header, recalculate free blocks and
4579	 * reclaim blocks.  Otherwise just update free blocks count.
4580	 *
4581	 * The current allocLimit is set to the location of new alternate
4582	 * volume header, and reclaimblks are the total number of blocks
4583	 * that need to be reclaimed.  So the check below is really
4584	 * ignoring the blocks allocated for old alternate volume header.
4585	 */
4586	if (hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks)) {
4587		/*
4588		 * hfs_reclaimspace will use separate transactions when
4589		 * relocating files (so we don't overwhelm the journal).
4590		 */
4591		hfs_end_transaction(hfsmp);
4592		transaction_begun = 0;
4593
4594		/* Attempt to reclaim some space. */
4595		error = hfs_reclaimspace(hfsmp, hfsmp->allocLimit, reclaimblks, context);
4596		if (error != 0) {
4597			printf("hfs_truncatefs: couldn't reclaim space on %s (error=%d)\n", hfsmp->vcbVN, error);
4598			error = ENOSPC;
4599			goto out;
4600		}
4601		if (hfs_start_transaction(hfsmp) != 0) {
4602			error = EINVAL;
4603			goto out;
4604		}
4605		transaction_begun = 1;
4606
4607		/* Check if we're clear now. */
4608		error = hfs_isallocated(hfsmp, hfsmp->allocLimit, reclaimblks);
4609		if (error != 0) {
4610			printf("hfs_truncatefs: didn't reclaim enough space on %s (error=%d)\n", hfsmp->vcbVN, error);
4611			error = EAGAIN;  /* tell client to try again */
4612			goto out;
4613		}
4614	}
4615
4616	/*
4617	 * Note: we take the attributes lock in case we have an attribute data vnode
4618	 * which needs to change size.
4619	 */
4620	lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE | SFL_EXTENTS | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
4621
4622	/*
4623	 * Allocate last 1KB for alternate volume header.
4624	 */
4625	error = BlockMarkAllocated(hfsmp, hfsmp->allocLimit, (hfsmp->blockSize == 512) ? 2 : 1);
4626	if (error) {
4627		printf("hfs_truncatefs: Error %d allocating new alternate volume header\n", error);
4628		goto out;
4629	}
4630
4631	/*
4632	 * Mark the old alternate volume header as free.
4633	 * We don't bother shrinking allocation bitmap file.
4634	 */
4635	if (hfsmp->blockSize == 512)
4636		(void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 2, 2);
4637	else
4638		(void) BlockMarkFree(hfsmp, hfsmp->totalBlocks - 1, 1);
4639
4640	/*
4641	 * Invalidate the existing alternate volume header.
4642	 *
4643	 * Don't include this in a transaction (don't call journal_modify_block)
4644	 * since this block will be outside of the truncated file system!
4645	 */
4646	if (hfsmp->hfs_alt_id_sector) {
4647		error = buf_meta_bread(hfsmp->hfs_devvp,
4648				HFS_PHYSBLK_ROUNDDOWN(hfsmp->hfs_alt_id_sector, hfsmp->hfs_log_per_phys),
4649				hfsmp->hfs_physical_block_size, NOCRED, &bp);
4650		if (error == 0) {
4651			bzero((void*)((char *)buf_dataptr(bp) + HFS_ALT_OFFSET(hfsmp->hfs_physical_block_size)), kMDBSize);
4652			(void) VNOP_BWRITE(bp);
4653		} else {
4654			if (bp) {
4655				buf_brelse(bp);
4656			}
4657		}
4658		bp = NULL;
4659	}
4660
4661	/* Log successful shrinking. */
4662	printf("hfs_truncatefs: shrank \"%s\" to %d blocks (was %d blocks)\n",
4663	       hfsmp->vcbVN, newblkcnt, hfsmp->totalBlocks);
4664
4665	/*
4666	 * Adjust file system variables and flush them to disk.
4667	 */
4668	hfsmp->totalBlocks = newblkcnt;
4669	hfsmp->hfs_logical_block_count = newsize / hfsmp->hfs_logical_block_size;
4670	hfsmp->hfs_logical_bytes = (uint64_t) hfsmp->hfs_logical_block_count * (uint64_t) hfsmp->hfs_logical_block_size;
4671
4672	/*
4673	 * Note that although the logical block size is updated here, it is only done for
4674	 * the benefit of the partition management software.  The logical block count change
4675	 * has not yet actually been propagated to the disk device yet.
4676	 */
4677
4678	hfsmp->hfs_alt_id_sector = HFS_ALT_SECTOR(hfsmp->hfs_logical_block_size, hfsmp->hfs_logical_block_count);
4679	MarkVCBDirty(hfsmp);
4680	error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
4681	if (error)
4682		panic("hfs_truncatefs: unexpected error flushing volume header (%d)\n", error);
4683
4684	/*
4685	 * Adjust the size of hfsmp->hfs_attrdata_vp
4686	 */
4687	if (hfsmp->hfs_attrdata_vp) {
4688		struct cnode *cp;
4689		struct filefork *fp;
4690
4691		if (vnode_get(hfsmp->hfs_attrdata_vp) == 0) {
4692			cp = VTOC(hfsmp->hfs_attrdata_vp);
4693			fp = VTOF(hfsmp->hfs_attrdata_vp);
4694
4695			cp->c_blocks = newblkcnt;
4696			fp->ff_blocks = newblkcnt;
4697			fp->ff_extents[0].blockCount = newblkcnt;
4698			fp->ff_size = (off_t) newblkcnt * hfsmp->blockSize;
4699			ubc_setsize(hfsmp->hfs_attrdata_vp, fp->ff_size);
4700			vnode_put(hfsmp->hfs_attrdata_vp);
4701		}
4702	}
4703
4704out:
4705	/*
4706	 * Update the allocLimit to acknowledge the last one or two blocks now.
4707	 * Add it to the tree as well if necessary.
4708	 */
4709	UpdateAllocLimit (hfsmp, hfsmp->totalBlocks);
4710
4711	HFS_MOUNT_LOCK(hfsmp, TRUE);
4712	if (disable_sparse == true) {
4713		/* Now that resize is completed, set the volume to be sparse
4714		 * device again so that all further allocations will be first
4715		 * fit instead of best fit.  Reset free extent cache so that
4716		 * it is rebuilt.
4717		 */
4718		hfsmp->hfs_flags |= HFS_HAS_SPARSE_DEVICE;
4719		ResetVCBFreeExtCache(hfsmp);
4720	}
4721
4722	if (error && (updateFreeBlocks == true)) {
4723		hfsmp->freeBlocks += reclaimblks;
4724	}
4725
4726	if (hfsmp->nextAllocation >= hfsmp->allocLimit) {
4727		hfsmp->nextAllocation = hfsmp->hfs_metazone_end + 1;
4728	}
4729	hfsmp->hfs_flags &= ~HFS_RESIZE_IN_PROGRESS;
4730	HFS_MOUNT_UNLOCK(hfsmp, TRUE);
4731
4732	/* On error, reset the metadata zone for original volume size */
4733	if (error && (updateFreeBlocks == true)) {
4734		hfs_metadatazone_init(hfsmp, false);
4735	}
4736
4737	if (lockflags) {
4738		hfs_systemfile_unlock(hfsmp, lockflags);
4739	}
4740	if (transaction_begun) {
4741		hfs_end_transaction(hfsmp);
4742		hfs_journal_flush(hfsmp, FALSE);
4743		/* Just to be sure, sync all data to the disk */
4744		(void) VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
4745	}
4746
4747	return MacToVFSError(error);
4748}
4749
4750
4751/*
4752 * Invalidate the physical block numbers associated with buffer cache blocks
4753 * in the given extent of the given vnode.
4754 */
/*
 * Argument block for hfs_invalidate_block_numbers_callback: describes a
 * range of device sectors (in hfs_logical_block_size units) whose cached
 * buffers should have their physical block numbers invalidated.
 */
struct hfs_inval_blk_no {
	daddr64_t sectorStart;	/* first device sector in the range */
	daddr64_t sectorCount;	/* number of device sectors in the range */
};
4759static int
4760hfs_invalidate_block_numbers_callback(buf_t bp, void *args_in)
4761{
4762	daddr64_t blkno;
4763	struct hfs_inval_blk_no *args;
4764
4765	blkno = buf_blkno(bp);
4766	args = args_in;
4767
4768	if (blkno >= args->sectorStart && blkno < args->sectorStart+args->sectorCount)
4769		buf_setblkno(bp, buf_lblkno(bp));
4770
4771	return BUF_RETURNED;
4772}
4773static void
4774hfs_invalidate_sectors(struct vnode *vp, daddr64_t sectorStart, daddr64_t sectorCount)
4775{
4776	struct hfs_inval_blk_no args;
4777	args.sectorStart = sectorStart;
4778	args.sectorCount = sectorCount;
4779
4780	buf_iterate(vp, hfs_invalidate_block_numbers_callback, BUF_SCAN_DIRTY|BUF_SCAN_CLEAN, &args);
4781}
4782
4783
4784/*
4785 * Copy the contents of an extent to a new location.  Also invalidates the
4786 * physical block number of any buffer cache block in the copied extent
4787 * (so that if the block is written, it will go through VNOP_BLOCKMAP to
4788 * determine the new physical block number).
4789 *
4790 * At this point, for regular files, we hold the truncate lock exclusive
4791 * and the cnode lock exclusive.
4792 */
static int
hfs_copy_extent(
	struct hfsmount *hfsmp,
	struct vnode *vp,		/* The file whose extent is being copied. */
	u_int32_t oldStart,		/* The start of the source extent. */
	u_int32_t newStart,		/* The start of the destination extent. */
	u_int32_t blockCount,	/* The number of allocation blocks to copy. */
	vfs_context_t context)
{
	int err = 0;
	size_t bufferSize;
	void *buffer = NULL;
	struct vfsioattr ioattr;
	buf_t bp = NULL;
	off_t resid;			/* Bytes remaining to be copied */
	size_t ioSize;
	u_int32_t ioSizeSectors;	/* Device sectors in this I/O */
	daddr64_t srcSector, destSector;
	/* NOTE(review): assumes blockSize is a multiple of hfs_logical_block_size
	 * (true for valid HFS+ volumes) — otherwise this truncates. */
	u_int32_t sectorsPerBlock = hfsmp->blockSize / hfsmp->hfs_logical_block_size;
#if CONFIG_PROTECT
	int cpenabled = 0;		/* Non-zero if content protection applies to this file */
#endif

	/*
	 * Sanity check that we have locked the vnode of the file we're copying.
	 *
	 * But since hfs_systemfile_lock() doesn't actually take the lock on
	 * the allocation file if a journal is active, ignore the check if the
	 * file being copied is the allocation file.
	 */
	struct cnode *cp = VTOC(vp);
	if (cp != hfsmp->hfs_allocation_cp && cp->c_lockowner != current_thread())
		panic("hfs_copy_extent: vp=%p (cp=%p) not owned?\n", vp, cp);

#if CONFIG_PROTECT
	/*
	 * Prepare the CP blob and get it ready for use, if necessary.
	 *
	 * Note that we specifically *exclude* system vnodes (catalog, bitmap, extents, EAs),
	 * because they are implicitly protected via the media key on iOS.  As such, they
	 * must not be relocated except with the media key.  So it is OK to not pass down
	 * a special cpentry to the IOMedia/LwVM code for handling.
	 */
	if (!vnode_issystem (vp) && vnode_isreg(vp) && cp_fs_protected (hfsmp->hfs_mp)) {
		int cp_err = 0;
		/*
		 * Ideally, the file whose extents we are about to manipulate is using the
		 * newer offset-based IVs so that we can manipulate it regardless of the
		 * current lock state.  However, we must maintain support for older-style
		 * EAs.
		 *
		 * For the older EA case, the IV was tied to the device LBA for file content.
		 * This means that encrypted data cannot be moved from one location to another
		 * in the filesystem without garbling the IV data.  As a result, we need to
		 * access the file's plaintext because we cannot do our AES-symmetry trick
		 * here.  This requires that we attempt a key-unwrap here (via cp_handle_relocate)
		 * to make forward progress.  If the keys are unavailable then we will
		 * simply stop the resize in its tracks here since we cannot move
		 * this extent at this time.
		 */
		if ((cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) == 0) {
			cp_err = cp_handle_relocate(cp, hfsmp);
		}

		if (cp_err) {
			printf ("hfs_copy_extent: cp_handle_relocate failed (%d) \n", cp_err);
			return cp_err;
		}

		cpenabled = 1;
	}
#endif


	/*
	 * Determine the I/O size to use
	 *
	 * NOTE: Many external drives will result in an ioSize of 128KB.
	 * TODO: Should we use a larger buffer, doing several consecutive
	 * reads, then several consecutive writes?
	 */
	vfs_ioattr(hfsmp->hfs_mp, &ioattr);
	bufferSize = MIN(ioattr.io_maxreadcnt, ioattr.io_maxwritecnt);
	if (kmem_alloc(kernel_map, (vm_offset_t*) &buffer, bufferSize))
		return ENOMEM;

	/* Get a buffer for doing the I/O */
	bp = buf_alloc(hfsmp->hfs_devvp);
	buf_setdataptr(bp, (uintptr_t)buffer);

	/* Convert allocation-block quantities to bytes and device sectors. */
	resid = (off_t) blockCount * (off_t) hfsmp->blockSize;
	srcSector = (daddr64_t) oldStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size;
	destSector = (daddr64_t) newStart * hfsmp->blockSize / hfsmp->hfs_logical_block_size;
	/* Copy the extent one bufferSize-sized chunk at a time: read from the
	 * old location, then write the same bytes to the new location. */
	while (resid > 0) {
		ioSize = MIN(bufferSize, (size_t) resid);
		ioSizeSectors = ioSize / hfsmp->hfs_logical_block_size;

		/* Prepare the buffer for reading */
		buf_reset(bp, B_READ);
		buf_setsize(bp, ioSize);
		buf_setcount(bp, ioSize);
		buf_setblkno(bp, srcSector);
		buf_setlblkno(bp, srcSector);

		/*
		 * Note that because this is an I/O to the device vp
		 * it is correct to have lblkno and blkno both point to the
		 * start sector being read from.  If it were being issued against the
		 * underlying file then that would be different.
		 */

		/* Attach the new CP blob  to the buffer if needed */
#if CONFIG_PROTECT
		if (cpenabled) {
			if (cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
				/* attach the RELOCATION_INFLIGHT flag for the underlying call to VNOP_STRATEGY */
				cp->c_cpentry->cp_flags |= CP_RELOCATION_INFLIGHT;
				buf_setcpaddr(bp, hfsmp->hfs_resize_cpentry);
			}
			else {
				/*
				 * Use the cnode's cp key.  This file is tied to the
				 * LBAs of the physical blocks that it occupies.
				 */
				buf_setcpaddr (bp, cp->c_cpentry);
			}

			/* Initialize the content protection file offset to start at 0 */
			buf_setcpoff (bp, 0);
		}
#endif

		/* Do the read */
		err = VNOP_STRATEGY(bp);
		if (!err)
			err = buf_biowait(bp);
		if (err) {
#if CONFIG_PROTECT
			/* Turn the flag off in error cases. */
			if (cpenabled) {
				cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT;
			}
#endif
			printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (read)\n", err);
			break;
		}

		/* Prepare the buffer for writing */
		buf_reset(bp, B_WRITE);
		buf_setsize(bp, ioSize);
		buf_setcount(bp, ioSize);
		buf_setblkno(bp, destSector);
		buf_setlblkno(bp, destSector);
		/* System-file writes get FUA when the journal relies on it, so the
		 * data is durable without a later cache flush. */
		if (vnode_issystem(vp) && journal_uses_fua(hfsmp->jnl))
			buf_markfua(bp);

#if CONFIG_PROTECT
		/* Attach the CP to the buffer if needed */
		if (cpenabled) {
			if (cp->c_cpentry->cp_flags & CP_OFF_IV_ENABLED) {
				buf_setcpaddr(bp, hfsmp->hfs_resize_cpentry);
			}
			else {
				/*
				 * Use the cnode's CP key.  This file is still tied
				 * to the LBAs of the physical blocks that it occupies.
				 */
				buf_setcpaddr (bp, cp->c_cpentry);
			}
			/*
			 * The last STRATEGY call may have updated the cp file offset behind our
			 * back, so we cannot trust it.  Re-initialize the content protection
			 * file offset back to 0 before initiating the write portion of this I/O.
			 */
			buf_setcpoff (bp, 0);
		}
#endif

		/* Do the write */
		vnode_startwrite(hfsmp->hfs_devvp);
		err = VNOP_STRATEGY(bp);
		if (!err) {
			err = buf_biowait(bp);
		}
#if CONFIG_PROTECT
		/* Turn the flag off regardless once the strategy call finishes. */
		if (cpenabled) {
			cp->c_cpentry->cp_flags &= ~CP_RELOCATION_INFLIGHT;
		}
#endif
		if (err) {
			printf("hfs_copy_extent: Error %d from VNOP_STRATEGY (write)\n", err);
			break;
		}

		/* Advance to the next chunk. */
		resid -= ioSize;
		srcSector += ioSizeSectors;
		destSector += ioSizeSectors;
	}
	if (bp)
		buf_free(bp);
	if (buffer)
		kmem_free(kernel_map, (vm_offset_t)buffer, bufferSize);

	/* Make sure all writes have been flushed to disk. */
	if (vnode_issystem(vp) && !journal_uses_fua(hfsmp->jnl)) {
		err = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
		if (err) {
			printf("hfs_copy_extent: DKIOCSYNCHRONIZECACHE failed (%d)\n", err);
			err = 0;	/* Don't fail the copy. */
		}
	}

	/* On success, force cached buffers for the old location to re-map
	 * through VNOP_BLOCKMAP on their next write. */
	if (!err)
		hfs_invalidate_sectors(vp, (daddr64_t)oldStart*sectorsPerBlock, (daddr64_t)blockCount*sectorsPerBlock);

	return err;
}
5011
5012
/* Structure to store state of reclaiming extents from a
 * given file.  hfs_reclaim_file()/hfs_reclaim_xattr()
 * initializes the values in this structure which are then
 * used by code that reclaims and splits the extents.
 */
struct hfs_reclaim_extent_info {
	struct vnode *vp;                    /* Vnode of the file whose extents are being reclaimed */
	u_int32_t fileID;                    /* Catalog node ID of the file */
	u_int8_t forkType;                   /* Data or resource fork of the file */
	u_int8_t is_dirlink;                 /* Extent belongs to directory hard link */
	u_int8_t is_sysfile;                 /* Extent belongs to system file */
	u_int8_t is_xattr;                   /* Extent belongs to extent-based xattr */
	u_int8_t extent_index;               /* Index of the current extent within the current record */
	int lockflags;                       /* Locks that reclaim and split code should grab before modifying the extent record */
	u_int32_t blocks_relocated;          /* Total blocks relocated for this file till now */
	u_int32_t recStartBlock;             /* File allocation block number (FABN) for current extent record */
	u_int32_t cur_blockCount;            /* Number of allocation blocks that have been checked for reclaim */
	struct filefork *catalog_fp;         /* If non-NULL, extent is from catalog record */
	union record {
		HFSPlusExtentRecord overflow;/* Extent record from overflow extents btree */
		HFSPlusAttrRecord xattr;     /* Attribute record for large EAs */
	} record;
	HFSPlusExtentDescriptor *extents;    /* Pointer to current extent record being processed.
					      * For catalog extent record, points to the correct
					      * extent information in filefork.  For overflow extent
					      * record, or xattr record, points to extent record
					      * in the structure above
					      */
	struct cat_desc *dirlink_desc;       /* Catalog descriptor for directory hard links */
	struct cat_attr *dirlink_attr;       /* Catalog attributes for directory hard links */
	struct filefork *dirlink_fork;	      /* For directory hard links, fp points actually to this */
	struct BTreeIterator *iterator;       /* Shared read/write iterator, hfs_reclaim_file/xattr()
                                               * use it for reading and hfs_reclaim_extent()/hfs_split_extent()
					       * use it for writing updated extent record
					       */
	struct FSBufferDescriptor btdata;     /* Shared btdata for reading/writing extent record, same as iterator above */
	u_int16_t recordlen;                  /* Length (bytes) of the record referenced by btdata */
	int overflow_count;                   /* For debugging, counter for overflow extent record */
	FCB *fcb;                             /* Pointer to the current btree being traversed */
};
5053
5054/*
5055 * Split the current extent into two extents, with first extent
5056 * to contain given number of allocation blocks.  Splitting of
5057 * extent creates one new extent entry which can result in
5058 * shifting of many entries through all the extent records of a
5059 * file, and/or creating a new extent record in the overflow
5060 * extent btree.
5061 *
5062 * Example:
5063 * The diagram below represents two consecutive extent records,
5064 * for simplicity, lets call them record X and X+1 respectively.
5065 * Interesting extent entries have been denoted by letters.
5066 * If the letter is unchanged before and after split, it means
5067 * that the extent entry was not modified during the split.
5068 * A '.' means that the entry remains unchanged after the split
5069 * and is not relevant for our example.  A '0' means that the
5070 * extent entry is empty.
5071 *
5072 * If there isn't sufficient contiguous free space to relocate
5073 * an extent (extent "C" below), we will have to break the one
5074 * extent into multiple smaller extents, and relocate each of
5075 * the smaller extents individually.  The way we do this is by
5076 * finding the largest contiguous free space that is currently
5077 * available (N allocation blocks), and then convert extent "C"
5078 * into two extents, C1 and C2, that occupy exactly the same
5079 * allocation blocks as extent C.  Extent C1 is the first
5080 * N allocation blocks of extent C, and extent C2 is the remainder
5081 * of extent C.  Then we can relocate extent C1 since we know
5082 * we have enough contiguous free space to relocate it in its
5083 * entirety.  We then repeat the process starting with extent C2.
5084 *
5085 * In record X, only the entries following entry C are shifted, and
5086 * the original entry C is replaced with two entries C1 and C2 which
5087 * are actually two extent entries for contiguous allocation blocks.
5088 *
5089 * Note that the entry E from record X is shifted into record X+1 as
5090 * the new first entry.  Since the first entry of record X+1 is updated,
5091 * the FABN will also get updated with the blockCount of entry E.
5092 * This also results in shifting of all extent entries in record X+1.
5093 * Note that the number of empty entries after the split has been
5094 * changed from 3 to 2.
5095 *
5096 * Before:
5097 *               record X                           record X+1
5098 *  ---------------------===---------     ---------------------------------
5099 *  | A | . | . | . | B | C | D | E |     | F | . | . | . | G | 0 | 0 | 0 |
5100 *  ---------------------===---------     ---------------------------------
5101 *
5102 * After:
5103 *  ---------------------=======-----     ---------------------------------
5104 *  | A | . | . | . | B | C1| C2| D |     | E | F | . | . | . | G | 0 | 0 |
5105 *  ---------------------=======-----     ---------------------------------
5106 *
5107 *  C1.startBlock = C.startBlock
5108 *  C1.blockCount = N
5109 *
5110 *  C2.startBlock = C.startBlock + N
5111 *  C2.blockCount = C.blockCount - N
5112 *
5113 *                                        FABN = old FABN - E.blockCount
5114 *
5115 * Inputs:
5116 *	extent_info -   This is the structure that contains state about
5117 *	                the current file, extent, and extent record that
5118 *	                is being relocated.  This structure is shared
5119 *	                among code that traverses through all the extents
5120 *	                of the file, code that relocates extents, and
5121 *	                code that splits the extent.
 *	newBlockCount - The desired blockCount of the first half of the
 *	                extent after a successful split operation.
5124 * Output:
5125 * 	Zero on success, non-zero on failure.
5126 */
5127static int
5128hfs_split_extent(struct hfs_reclaim_extent_info *extent_info, uint32_t newBlockCount)
5129{
5130	int error = 0;
5131	int index = extent_info->extent_index;
5132	int i;
5133	HFSPlusExtentDescriptor shift_extent; /* Extent entry that should be shifted into next extent record */
5134	HFSPlusExtentDescriptor last_extent;
5135	HFSPlusExtentDescriptor *extents; /* Pointer to current extent record being manipulated */
5136	HFSPlusExtentRecord *extents_rec = NULL;
5137	HFSPlusExtentKey *extents_key = NULL;
5138	HFSPlusAttrRecord *xattr_rec = NULL;
5139	HFSPlusAttrKey *xattr_key = NULL;
5140	struct BTreeIterator iterator;
5141	struct FSBufferDescriptor btdata;
5142	uint16_t reclen;
5143	uint32_t read_recStartBlock;	/* Starting allocation block number to read old extent record */
5144	uint32_t write_recStartBlock;	/* Starting allocation block number to insert newly updated extent record */
5145	Boolean create_record = false;
5146	Boolean is_xattr;
5147	struct cnode *cp;
5148
5149	is_xattr = extent_info->is_xattr;
5150	extents = extent_info->extents;
5151	cp = VTOC(extent_info->vp);
5152
5153	if (newBlockCount == 0) {
5154		if (hfs_resize_debug) {
5155			printf ("hfs_split_extent: No splitting required for newBlockCount=0\n");
5156		}
5157		return error;
5158	}
5159
5160	if (hfs_resize_debug) {
5161		printf ("hfs_split_extent: Split record:%u recStartBlock=%u %u:(%u,%u) for %u blocks\n", extent_info->overflow_count, extent_info->recStartBlock, index, extents[index].startBlock, extents[index].blockCount, newBlockCount);
5162	}
5163
5164	/* Extents overflow btree can not have more than 8 extents.
5165	 * No split allowed if the 8th extent is already used.
5166	 */
5167	if ((extent_info->fileID == kHFSExtentsFileID) && (extents[kHFSPlusExtentDensity - 1].blockCount != 0)) {
5168		printf ("hfs_split_extent: Maximum 8 extents allowed for extents overflow btree, cannot split further.\n");
5169		error = ENOSPC;
5170		goto out;
5171	}
5172
5173	/* Determine the starting allocation block number for the following
5174	 * overflow extent record, if any, before the current record
5175	 * gets modified.
5176	 */
5177	read_recStartBlock = extent_info->recStartBlock;
5178	for (i = 0; i < kHFSPlusExtentDensity; i++) {
5179		if (extents[i].blockCount == 0) {
5180			break;
5181		}
5182		read_recStartBlock += extents[i].blockCount;
5183	}
5184
5185	/* Shift and split */
5186	if (index == kHFSPlusExtentDensity-1) {
5187		/* The new extent created after split will go into following overflow extent record */
5188		shift_extent.startBlock = extents[index].startBlock + newBlockCount;
5189		shift_extent.blockCount = extents[index].blockCount - newBlockCount;
5190
5191		/* Last extent in the record will be split, so nothing to shift */
5192	} else {
5193		/* Splitting of extents can result in at most of one
5194		 * extent entry to be shifted into following overflow extent
5195		 * record.  So, store the last extent entry for later.
5196		 */
5197		shift_extent = extents[kHFSPlusExtentDensity-1];
5198		if ((hfs_resize_debug) && (shift_extent.blockCount != 0)) {
5199			printf ("hfs_split_extent: Save 7:(%u,%u) to shift into overflow record\n", shift_extent.startBlock, shift_extent.blockCount);
5200		}
5201
5202		/* Start shifting extent information from the end of the extent
5203		 * record to the index where we want to insert the new extent.
5204		 * Note that kHFSPlusExtentDensity-1 is already saved above, and
5205		 * does not need to be shifted.  The extent entry that is being
5206		 * split does not get shifted.
5207		 */
5208		for (i = kHFSPlusExtentDensity-2; i > index; i--) {
5209			if (hfs_resize_debug) {
5210				if (extents[i].blockCount) {
5211					printf ("hfs_split_extent: Shift %u:(%u,%u) to %u:(%u,%u)\n", i, extents[i].startBlock, extents[i].blockCount, i+1, extents[i].startBlock, extents[i].blockCount);
5212				}
5213			}
5214			extents[i+1] = extents[i];
5215		}
5216	}
5217
5218	if (index == kHFSPlusExtentDensity-1) {
5219		/* The second half of the extent being split will be the overflow
5220		 * entry that will go into following overflow extent record.  The
5221		 * value has been stored in 'shift_extent' above, so there is
5222		 * nothing to be done here.
5223		 */
5224	} else {
5225		/* Update the values in the second half of the extent being split
5226		 * before updating the first half of the split.  Note that the
5227		 * extent to split or first half of the split is at index 'index'
5228		 * and a new extent or second half of the split will be inserted at
5229		 * 'index+1' or into following overflow extent record.
5230		 */
5231		extents[index+1].startBlock = extents[index].startBlock + newBlockCount;
5232		extents[index+1].blockCount = extents[index].blockCount - newBlockCount;
5233	}
5234	/* Update the extent being split, only the block count will change */
5235	extents[index].blockCount = newBlockCount;
5236
5237	if (hfs_resize_debug) {
5238		printf ("hfs_split_extent: Split %u:(%u,%u) and ", index, extents[index].startBlock, extents[index].blockCount);
5239		if (index != kHFSPlusExtentDensity-1) {
5240			printf ("%u:(%u,%u)\n", index+1, extents[index+1].startBlock, extents[index+1].blockCount);
5241		} else {
5242			printf ("overflow:(%u,%u)\n", shift_extent.startBlock, shift_extent.blockCount);
5243		}
5244	}
5245
5246	/* Write out information about the newly split extent to the disk */
5247	if (extent_info->catalog_fp) {
5248		/* (extent_info->catalog_fp != NULL) means the newly split
5249		 * extent exists in the catalog record.  This means that
5250		 * the cnode was updated.  Therefore, to write out the changes,
5251		 * mark the cnode as modified.   We cannot call hfs_update()
5252		 * in this function because the caller hfs_reclaim_extent()
5253		 * is holding the catalog lock currently.
5254		 */
5255		cp->c_flag |= C_MODIFIED;
5256	} else {
5257		/* The newly split extent is for large EAs or is in overflow
5258		 * extent record, so update it directly in the btree using the
5259		 * iterator information from the shared extent_info structure
5260	 	 */
5261		error = BTReplaceRecord(extent_info->fcb, extent_info->iterator,
5262				&(extent_info->btdata), extent_info->recordlen);
5263		if (error) {
5264			printf ("hfs_split_extent: fileID=%u BTReplaceRecord returned error=%d\n", extent_info->fileID, error);
5265			goto out;
5266		}
5267	}
5268
5269	/* No extent entry to be shifted into another extent overflow record */
5270	if (shift_extent.blockCount == 0) {
5271		if (hfs_resize_debug) {
5272			printf ("hfs_split_extent: No extent entry to be shifted into overflow records\n");
5273		}
5274		error = 0;
5275		goto out;
5276	}
5277
5278	/* The overflow extent entry has to be shifted into an extent
5279	 * overflow record.  This means that we might have to shift
5280	 * extent entries from all subsequent overflow records by one.
5281	 * We start iteration from the first record to the last record,
5282	 * and shift the extent entry from one record to another.
5283	 * We might have to create a new extent record for the last
5284	 * extent entry for the file.
5285	 */
5286
5287	/* Initialize iterator to search the next record */
5288	bzero(&iterator, sizeof(iterator));
5289	if (is_xattr) {
5290		/* Copy the key from the iterator that was used to update the modified attribute record. */
5291		xattr_key = (HFSPlusAttrKey *)&(iterator.key);
5292		bcopy((HFSPlusAttrKey *)&(extent_info->iterator->key), xattr_key, sizeof(HFSPlusAttrKey));
5293		/* Note: xattr_key->startBlock will be initialized later in the iteration loop */
5294
5295		MALLOC(xattr_rec, HFSPlusAttrRecord *,
5296				sizeof(HFSPlusAttrRecord), M_TEMP, M_WAITOK);
5297		if (xattr_rec == NULL) {
5298			error = ENOMEM;
5299			goto out;
5300		}
5301		btdata.bufferAddress = xattr_rec;
5302		btdata.itemSize = sizeof(HFSPlusAttrRecord);
5303		btdata.itemCount = 1;
5304		extents = xattr_rec->overflowExtents.extents;
5305	} else {
5306		/* Initialize the extent key for the current file */
5307		extents_key = (HFSPlusExtentKey *) &(iterator.key);
5308		extents_key->keyLength = kHFSPlusExtentKeyMaximumLength;
5309		extents_key->forkType = extent_info->forkType;
5310		extents_key->fileID = extent_info->fileID;
5311		/* Note: extents_key->startBlock will be initialized later in the iteration loop */
5312
5313		MALLOC(extents_rec, HFSPlusExtentRecord *,
5314				sizeof(HFSPlusExtentRecord), M_TEMP, M_WAITOK);
5315		if (extents_rec == NULL) {
5316			error = ENOMEM;
5317			goto out;
5318		}
5319		btdata.bufferAddress = extents_rec;
5320		btdata.itemSize = sizeof(HFSPlusExtentRecord);
5321		btdata.itemCount = 1;
5322		extents = extents_rec[0];
5323	}
5324
5325	/* The overflow extent entry has to be shifted into an extent
5326	 * overflow record.  This means that we might have to shift
5327	 * extent entries from all subsequent overflow records by one.
5328	 * We start iteration from the first record to the last record,
5329	 * examine one extent record in each iteration and shift one
5330	 * extent entry from one record to another.  We might have to
5331	 * create a new extent record for the last extent entry for the
5332	 * file.
5333	 *
5334	 * If shift_extent.blockCount is non-zero, it means that there is
5335	 * an extent entry that needs to be shifted into the next
5336	 * overflow extent record.  We keep on going till there are no such
5337	 * entries left to be shifted.  This will also change the starting
5338	 * allocation block number of the extent record which is part of
5339	 * the key for the extent record in each iteration.  Note that
5340	 * because the extent record key is changing while we are searching,
5341	 * the record can not be updated directly, instead it has to be
5342	 * deleted and inserted again.
5343	 */
5344	while (shift_extent.blockCount) {
5345		if (hfs_resize_debug) {
5346			printf ("hfs_split_extent: Will shift (%u,%u) into overflow record with startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, read_recStartBlock);
5347		}
5348
5349		/* Search if there is any existing overflow extent record
5350		 * that matches the current file and the logical start block
5351		 * number.
5352		 *
5353		 * For this, the logical start block number in the key is
5354		 * the value calculated based on the logical start block
5355		 * number of the current extent record and the total number
5356		 * of blocks existing in the current extent record.
5357		 */
5358		if (is_xattr) {
5359			xattr_key->startBlock = read_recStartBlock;
5360		} else {
5361			extents_key->startBlock = read_recStartBlock;
5362		}
5363		error = BTSearchRecord(extent_info->fcb, &iterator, &btdata, &reclen, &iterator);
5364		if (error) {
5365			if (error != btNotFound) {
5366				printf ("hfs_split_extent: fileID=%u startBlock=%u BTSearchRecord error=%d\n", extent_info->fileID, read_recStartBlock, error);
5367				goto out;
5368			}
5369			/* No matching record was found, so create a new extent record.
5370			 * Note:  Since no record was found, we can't rely on the
5371			 * btree key in the iterator any longer.  This will be initialized
5372			 * later before we insert the record.
5373			 */
5374			create_record = true;
5375		}
5376
5377		/* The extra extent entry from the previous record is being inserted
5378		 * as the first entry in the current extent record.  This will change
5379		 * the file allocation block number (FABN) of the current extent
5380		 * record, which is the startBlock value from the extent record key.
5381		 * Since one extra entry is being inserted in the record, the new
5382		 * FABN for the record will less than old FABN by the number of blocks
5383		 * in the new extent entry being inserted at the start.  We have to
5384		 * do this before we update read_recStartBlock to point at the
5385		 * startBlock of the following record.
5386		 */
5387		write_recStartBlock = read_recStartBlock - shift_extent.blockCount;
5388		if (hfs_resize_debug) {
5389			if (create_record) {
5390				printf ("hfs_split_extent: No records found for startBlock=%u, will create new with startBlock=%u\n", read_recStartBlock, write_recStartBlock);
5391			}
5392		}
5393
5394		/* Now update the read_recStartBlock to account for total number
5395		 * of blocks in this extent record.  It will now point to the
5396		 * starting allocation block number for the next extent record.
5397		 */
5398		for (i = 0; i < kHFSPlusExtentDensity; i++) {
5399			if (extents[i].blockCount == 0) {
5400				break;
5401			}
5402			read_recStartBlock += extents[i].blockCount;
5403		}
5404
5405		if (create_record == true) {
5406			/* Initialize new record content with only one extent entry */
5407			bzero(extents, sizeof(HFSPlusExtentRecord));
5408			/* The new record will contain only one extent entry */
5409			extents[0] = shift_extent;
5410			/* There are no more overflow extents to be shifted */
5411			shift_extent.startBlock = shift_extent.blockCount = 0;
5412
5413			if (is_xattr) {
5414				/* BTSearchRecord above returned btNotFound,
5415				 * but since the attribute btree is never empty
5416				 * if we are trying to insert new overflow
5417				 * record for the xattrs, the extents_key will
5418				 * contain correct data.  So we don't need to
5419				 * re-initialize it again like below.
5420				 */
5421
5422				/* Initialize the new xattr record */
5423				xattr_rec->recordType = kHFSPlusAttrExtents;
5424				xattr_rec->overflowExtents.reserved = 0;
5425				reclen = sizeof(HFSPlusAttrExtents);
5426			} else {
5427				/* BTSearchRecord above returned btNotFound,
5428				 * which means that extents_key content might
5429				 * not correspond to the record that we are
5430				 * trying to create, especially when the extents
5431				 * overflow btree is empty.  So we reinitialize
5432				 * the extents_key again always.
5433				 */
5434				extents_key->keyLength = kHFSPlusExtentKeyMaximumLength;
5435				extents_key->forkType = extent_info->forkType;
5436				extents_key->fileID = extent_info->fileID;
5437
5438				/* Initialize the new extent record */
5439				reclen = sizeof(HFSPlusExtentRecord);
5440			}
5441		} else {
5442			/* The overflow extent entry from previous record will be
5443			 * the first entry in this extent record.  If the last
5444			 * extent entry in this record is valid, it will be shifted
5445			 * into the following extent record as its first entry.  So
5446			 * save the last entry before shifting entries in current
5447			 * record.
5448			 */
5449			last_extent = extents[kHFSPlusExtentDensity-1];
5450
5451			/* Shift all entries by one index towards the end */
5452			for (i = kHFSPlusExtentDensity-2; i >= 0; i--) {
5453				extents[i+1] = extents[i];
5454			}
5455
5456			/* Overflow extent entry saved from previous record
5457			 * is now the first entry in the current record.
5458			 */
5459			extents[0] = shift_extent;
5460
5461			if (hfs_resize_debug) {
5462				printf ("hfs_split_extent: Shift overflow=(%u,%u) to record with updated startBlock=%u\n", shift_extent.startBlock, shift_extent.blockCount, write_recStartBlock);
5463			}
5464
5465			/* The last entry from current record will be the
5466			 * overflow entry which will be the first entry for
5467			 * the following extent record.
5468			 */
5469			shift_extent = last_extent;
5470
5471			/* Since the key->startBlock is being changed for this record,
5472			 * it should be deleted and inserted with the new key.
5473			 */
5474			error = BTDeleteRecord(extent_info->fcb, &iterator);
5475			if (error) {
5476				printf ("hfs_split_extent: fileID=%u startBlock=%u BTDeleteRecord error=%d\n", extent_info->fileID, read_recStartBlock, error);
5477				goto out;
5478			}
5479			if (hfs_resize_debug) {
5480				printf ("hfs_split_extent: Deleted extent record with startBlock=%u\n", (is_xattr ? xattr_key->startBlock : extents_key->startBlock));
5481			}
5482		}
5483
5484		/* Insert the newly created or modified extent record */
5485		bzero(&iterator.hint, sizeof(iterator.hint));
5486		if (is_xattr) {
5487			xattr_key->startBlock = write_recStartBlock;
5488		} else {
5489			extents_key->startBlock = write_recStartBlock;
5490		}
5491		error = BTInsertRecord(extent_info->fcb, &iterator, &btdata, reclen);
5492		if (error) {
5493			printf ("hfs_split_extent: fileID=%u, startBlock=%u BTInsertRecord error=%d\n", extent_info->fileID, write_recStartBlock, error);
5494			goto out;
5495		}
5496		if (hfs_resize_debug) {
5497			printf ("hfs_split_extent: Inserted extent record with startBlock=%u\n", write_recStartBlock);
5498		}
5499	}
5500
5501out:
5502	/*
5503	 * Extents overflow btree or attributes btree headers might have
5504	 * been modified during the split/shift operation, so flush the
5505	 * changes to the disk while we are inside journal transaction.
5506	 * We should only be able to generate I/O that modifies the B-Tree
5507	 * header nodes while we're in the middle of a journal transaction.
5508	 * Otherwise it might result in panic during unmount.
5509	 */
5510	BTFlushPath(extent_info->fcb);
5511
5512	if (extents_rec) {
5513		FREE (extents_rec, M_TEMP);
5514	}
5515	if (xattr_rec) {
5516		FREE (xattr_rec, M_TEMP);
5517	}
5518	return error;
5519}
5520
5521
5522/*
5523 * Relocate an extent if it lies beyond the expected end of volume.
5524 *
5525 * This function is called for every extent of the file being relocated.
5526 * It allocates space for relocation, copies the data, deallocates
 * the old extent, and updates the corresponding on-disk extent.  If the
 * function does not find contiguous space to relocate an extent, it
 * splits the extent into smaller pieces to relocate it out of the area of
5530 * disk being reclaimed.  As an optimization, if an extent lies partially
5531 * in the area of the disk being reclaimed, it is split so that we only
5532 * have to relocate the area that was overlapping with the area of disk
5533 * being reclaimed.
5534 *
5535 * Note that every extent is relocated in its own transaction so that
5536 * they do not overwhelm the journal.  This function handles the extent
5537 * record that exists in the catalog record, extent record from overflow
5538 * extents btree, and extents for large EAs.
5539 *
5540 * Inputs:
5541 *	extent_info - This is the structure that contains state about
5542 *	              the current file, extent, and extent record that
5543 *	              is being relocated.  This structure is shared
5544 *	              among code that traverses through all the extents
5545 *	              of the file, code that relocates extents, and
5546 *	              code that splits the extent.
5547 */
static int
hfs_reclaim_extent(struct hfsmount *hfsmp, const u_long allocLimit, struct hfs_reclaim_extent_info *extent_info, vfs_context_t context)
{
	int error = 0;
	int index;
	struct cnode *cp;
	u_int32_t oldStartBlock;
	u_int32_t oldBlockCount;
	u_int32_t newStartBlock;
	u_int32_t newBlockCount;
	u_int32_t roundedBlockCount;
	uint16_t node_size;
	uint32_t remainder_blocks;
	u_int32_t alloc_flags;
	int blocks_allocated = false;

	/* The caller selects the extent to examine via extent_index */
	index = extent_info->extent_index;
	cp = VTOC(extent_info->vp);

	oldStartBlock = extent_info->extents[index].startBlock;
	oldBlockCount = extent_info->extents[index].blockCount;

	/* NOTE: this per-extent trace is intentionally compiled out ("0 &&") */
	if (0 && hfs_resize_debug) {
		printf ("hfs_reclaim_extent: Examine record:%u recStartBlock=%u, %u:(%u,%u)\n", extent_info->overflow_count, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount);
	}

	/* If the current extent lies completely within allocLimit,
	 * it does not require any relocation.
	 */
	if ((oldStartBlock + oldBlockCount) <= allocLimit) {
		extent_info->cur_blockCount += oldBlockCount;
		return error;
	}

	/* Every extent should be relocated in its own transaction
	 * to make sure that we don't overflow the journal buffer.
	 */
	error = hfs_start_transaction(hfsmp);
	if (error) {
		return error;
	}
	extent_info->lockflags = hfs_systemfile_lock(hfsmp, extent_info->lockflags, HFS_EXCLUSIVE_LOCK);

	/* Check if the extent lies partially in the area to reclaim,
	 * i.e. it starts before allocLimit and ends beyond allocLimit.
	 * We have already skipped extents that lie completely within
	 * allocLimit in the check above, so we only check for the
	 * startBlock.  If it lies partially, split it so that we
	 * only relocate part of the extent.
	 */
	if (oldStartBlock < allocLimit) {
		newBlockCount = allocLimit - oldStartBlock;

		if (hfs_resize_debug) {
			int idx = extent_info->extent_index;
			printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount);
		}

		/* If the extent belongs to a btree, check and trim
		 * it to be multiple of the node size.
		 */
		if (extent_info->is_sysfile) {
			node_size = get_btree_nodesize(extent_info->vp);
			/* If the btree node size is less than the block size,
			 * splitting this extent will not split a node across
			 * different extents.  So we only check and trim if
			 * node size is more than the allocation block size.
			 */
			if (node_size > hfsmp->blockSize) {
				remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize);
				if (remainder_blocks) {
					newBlockCount -= remainder_blocks;
					if (hfs_resize_debug) {
						printf ("hfs_reclaim_extent: Round-down newBlockCount to be multiple of nodeSize, node_allocblks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount);
					}
				}
			}
			/* The newBlockCount is zero because of rounding-down so that
			 * btree nodes are not split across extents.  Therefore this
			 * straddling extent across resize-boundary does not require
			 * splitting.  Skip over to relocating of complete extent.
			 */
			if (newBlockCount == 0) {
				if (hfs_resize_debug) {
					printf ("hfs_reclaim_extent: After round-down newBlockCount=0, skip split, relocate full extent\n");
				}
				goto relocate_full_extent;
			}
		}

		/* Split the extents into two parts --- the first extent lies
		 * completely within allocLimit and therefore does not require
		 * relocation.  The second extent will require relocation which
		 * will be handled when the caller calls this function again
		 * for the next extent.
		 */
		error = hfs_split_extent(extent_info, newBlockCount);
		if (error == 0) {
			/* Split success, no relocation required */
			goto out;
		}
		/* Split failed, so try to relocate entire extent */
		if (hfs_resize_debug) {
			int idx = extent_info->extent_index;
			printf ("hfs_reclaim_extent: Split straddling extent %u:(%u,%u) for %u blocks failed, relocate full extent\n", idx, extent_info->extents[idx].startBlock, extent_info->extents[idx].blockCount, newBlockCount);
		}
	}

relocate_full_extent:
	/* At this point, the current extent requires relocation.
	 * We will try to allocate space equal to the size of the extent
	 * being relocated first to try to relocate it without splitting.
	 * If the allocation fails, we will try to allocate contiguous
	 * blocks out of metadata zone.  If that allocation also fails,
	 * then we will take whatever contiguous block run is returned
	 * by the allocation, split the extent into two parts, and then
	 * relocate the first split extent.
	 */
	alloc_flags = HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS;
	if (extent_info->is_sysfile) {
		alloc_flags |= HFS_ALLOC_METAZONE;
	}

	error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount, alloc_flags,
			&newStartBlock, &newBlockCount);
	if ((extent_info->is_sysfile == false) &&
	    ((error == dskFulErr) || (error == ENOSPC))) {
		/* For non-system files, try reallocating space in metadata zone */
		alloc_flags |= HFS_ALLOC_METAZONE;
		error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
				alloc_flags, &newStartBlock, &newBlockCount);
	}
	if ((error == dskFulErr) || (error == ENOSPC)) {
		/* We did not find desired contiguous space for this extent.
		 * So try to allocate the maximum contiguous space available.
		 */
		alloc_flags &= ~HFS_ALLOC_FORCECONTIG;

		error = BlockAllocate(hfsmp, 1, oldBlockCount, oldBlockCount,
				alloc_flags, &newStartBlock, &newBlockCount);
		if (error) {
			printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
			goto out;
		}
		blocks_allocated = true;

		/* The number of blocks allocated is less than the requested
		 * number of blocks.  For btree extents, check and trim the
		 * extent to be multiple of the node size.
		 */
		if (extent_info->is_sysfile) {
			node_size = get_btree_nodesize(extent_info->vp);
			if (node_size > hfsmp->blockSize) {
				remainder_blocks = newBlockCount % (node_size / hfsmp->blockSize);
				if (remainder_blocks) {
					roundedBlockCount = newBlockCount - remainder_blocks;
					/* Free tail-end blocks of the newly allocated extent */
					BlockDeallocate(hfsmp, newStartBlock + roundedBlockCount,
							       newBlockCount - roundedBlockCount,
							       HFS_ALLOC_SKIPFREEBLKS);
					newBlockCount = roundedBlockCount;
					if (hfs_resize_debug) {
						printf ("hfs_reclaim_extent: Fixing extent block count, node_blks=%u, old=%u, new=%u\n", node_size/hfsmp->blockSize, newBlockCount + remainder_blocks, newBlockCount);
					}
					if (newBlockCount == 0) {
						printf ("hfs_reclaim_extent: Not enough contiguous blocks available to relocate fileID=%d\n", extent_info->fileID);
						error = ENOSPC;
						goto out;
					}
				}
			}
		}

		/* The number of blocks allocated is less than the number of
		 * blocks requested, so split this extent --- the first extent
		 * will be relocated as part of this function call and the caller
		 * will handle relocating the second extent by calling this
		 * function again for the second extent.
		 */
		error = hfs_split_extent(extent_info, newBlockCount);
		if (error) {
			printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) split error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
			goto out;
		}
		oldBlockCount = newBlockCount;
	}
	if (error) {
		printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) contig BlockAllocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
		goto out;
	}
	blocks_allocated = true;

	/* Copy data from old location to new location */
	error = hfs_copy_extent(hfsmp, extent_info->vp, oldStartBlock,
			newStartBlock, newBlockCount, context);
	if (error) {
		printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u)=>(%u,%u) hfs_copy_extent error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount, error);
		goto out;
	}

	/* Update the extent record with the new start block information */
	extent_info->extents[index].startBlock = newStartBlock;

	/* Sync the content back to the disk */
	if (extent_info->catalog_fp) {
		/* Update the extents in catalog record */
		if (extent_info->is_dirlink) {
			error = cat_update_dirlink(hfsmp, extent_info->forkType,
					extent_info->dirlink_desc, extent_info->dirlink_attr,
					&(extent_info->dirlink_fork->ff_data));
		} else {
			cp->c_flag |= C_MODIFIED;
			/* If this is a system file, sync volume headers on disk */
			if (extent_info->is_sysfile) {
				error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
			}
		}
	} else {
		/* Replace record for extents overflow or extents-based xattrs */
		error = BTReplaceRecord(extent_info->fcb, extent_info->iterator,
				&(extent_info->btdata), extent_info->recordlen);
	}
	if (error) {
		printf ("hfs_reclaim_extent: fileID=%u, update record error=%u\n", extent_info->fileID, error);
		goto out;
	}

	/* Deallocate the old extent */
	error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, HFS_ALLOC_SKIPFREEBLKS);
	if (error) {
		printf ("hfs_reclaim_extent: fileID=%u start=%u, %u:(%u,%u) BlockDeallocate error=%d\n", extent_info->fileID, extent_info->recStartBlock, index, oldStartBlock, oldBlockCount, error);
		goto out;
	}
	extent_info->blocks_relocated += newBlockCount;

	if (hfs_resize_debug) {
		printf ("hfs_reclaim_extent: Relocated record:%u %u:(%u,%u) to (%u,%u)\n", extent_info->overflow_count, index, oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
	}

out:
	if (error != 0) {
		/* On error, give back any blocks that were allocated for relocation */
		if (blocks_allocated == true) {
			BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS);
		}
	} else {
		/* On success, increment the total allocation blocks processed */
		extent_info->cur_blockCount += newBlockCount;
	}

	hfs_systemfile_unlock(hfsmp, extent_info->lockflags);

	/* For a non-system file, if an extent entry from catalog record
	 * was modified, sync the in-memory changes to the catalog record
	 * on disk before ending the transaction.
	 */
	 if ((extent_info->catalog_fp) &&
	     (extent_info->is_sysfile == false)) {
		(void) hfs_update(extent_info->vp, MNT_WAIT);
	}

	hfs_end_transaction(hfsmp);

	return error;
}
5812
5813/* Report intermediate progress during volume resize */
5814static void
5815hfs_truncatefs_progress(struct hfsmount *hfsmp)
5816{
5817	u_int32_t cur_progress = 0;
5818
5819	hfs_resize_progress(hfsmp, &cur_progress);
5820	if (cur_progress > (hfsmp->hfs_resize_progress + 9)) {
5821		printf("hfs_truncatefs: %d%% done...\n", cur_progress);
5822		hfsmp->hfs_resize_progress = cur_progress;
5823	}
5824	return;
5825}
5826
5827/*
5828 * Reclaim space at the end of a volume for given file and forktype.
5829 *
5830 * This routine attempts to move any extent which contains allocation blocks
5831 * at or after "allocLimit."  A separate transaction is used for every extent
5832 * that needs to be moved.  If there is not contiguous space available for
5833 * moving an extent, it can be split into smaller extents.  The contents of
5834 * any moved extents are read and written via the volume's device vnode --
5835 * NOT via "vp."  During the move, moved blocks which are part of a transaction
5836 * have their physical block numbers invalidated so they will eventually be
5837 * written to their new locations.
5838 *
5839 * This function is also called for directory hard links.  Directory hard links
5840 * are regular files with no data fork and resource fork that contains alias
5841 * information for backward compatibility with pre-Leopard systems.  However
5842 * non-Mac OS X implementation can add/modify data fork or resource fork
5843 * information to directory hard links, so we check, and if required, relocate
5844 * both data fork and resource fork.
5845 *
5846 * Inputs:
5847 *    hfsmp       The volume being resized.
5848 *    vp          The vnode for the system file.
5849 *    fileID	  ID of the catalog record that needs to be relocated
 *    forktype	  The type of fork that needs to be relocated,
5851 *    			kHFSResourceForkType for resource fork,
5852 *    			kHFSDataForkType for data fork
5853 *    allocLimit  Allocation limit for the new volume size,
5854 *    		  do not use this block or beyond.  All extents
5855 *    		  that use this block or any blocks beyond this limit
5856 *    		  will be relocated.
5857 *
5858 * Side Effects:
5859 * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation
5860 * blocks that were relocated.
5861 */
static int
hfs_reclaim_file(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID,
		u_int8_t forktype, u_long allocLimit, vfs_context_t context)
{
	int error = 0;
	struct hfs_reclaim_extent_info *extent_info;
	int i;
	int lockflags = 0;
	struct cnode *cp;
	struct filefork *fp;
	int took_truncate_lock = false;
	int release_desc = false;
	HFSPlusExtentKey *key;

	/* If there is no vnode for this file, then there's nothing to do. */
	if (vp == NULL) {
		return 0;
	}

	cp = VTOC(vp);

	if (hfs_resize_debug) {
		const char *filename = (const char *) cp->c_desc.cd_nameptr;
		int namelen = cp->c_desc.cd_namelen;

		if (filename == NULL) {
			filename = "";
			namelen = 0;
		}
		printf("hfs_reclaim_file: reclaiming '%.*s'\n", namelen, filename);
	}

	/* State shared with hfs_reclaim_extent()/hfs_split_extent() for
	 * the duration of this file's relocation.
	 */
	MALLOC(extent_info, struct hfs_reclaim_extent_info *,
	       sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK);
	if (extent_info == NULL) {
		return ENOMEM;
	}
	bzero(extent_info, sizeof(struct hfs_reclaim_extent_info));
	extent_info->vp = vp;
	extent_info->fileID = fileID;
	extent_info->forkType = forktype;
	extent_info->is_sysfile = vnode_issystem(vp);
	if (vnode_isdir(vp) && (cp->c_flag & C_HARDLINK)) {
		extent_info->is_dirlink = true;
	}
	/* We always need allocation bitmap and extent btree lock */
	lockflags = SFL_BITMAP | SFL_EXTENTS;
	if ((fileID == kHFSCatalogFileID) || (extent_info->is_dirlink == true)) {
		lockflags |= SFL_CATALOG;
	} else if (fileID == kHFSAttributesFileID) {
		lockflags |= SFL_ATTRIBUTE;
	} else if (fileID == kHFSStartupFileID) {
		lockflags |= SFL_STARTUP;
	}
	extent_info->lockflags = lockflags;
	extent_info->fcb = VTOF(hfsmp->hfs_extents_vp);

	/* Flush data associated with current file on disk.
	 *
	 * If the current vnode is directory hard link, no flushing of
	 * journal or vnode is required.  The current kernel does not
	 * modify data/resource fork of directory hard links, so nothing
	 * will be in the cache.  If a directory hard link is newly created,
	 * the resource fork data is written directly using devvp and
	 * the code that actually relocates data (hfs_copy_extent()) also
	 * uses devvp for its I/O --- so they will see a consistent copy.
	 */
	if (extent_info->is_sysfile) {
		/* If the current vnode is system vnode, flush journal
		 * to make sure that all data is written to the disk.
		 */
		error = hfs_journal_flush(hfsmp, TRUE);
		if (error) {
			printf ("hfs_reclaim_file: journal_flush returned %d\n", error);
			goto out;
		}
	} else if (extent_info->is_dirlink == false) {
		/* Flush all blocks associated with this regular file vnode.
		 * Normally there should not be buffer cache blocks for regular
		 * files, but for objects like symlinks, we can have buffer cache
		 * blocks associated with the vnode.  Therefore we call
		 * buf_flushdirtyblks() also.
		 */
		buf_flushdirtyblks(vp, 0, BUF_SKIP_LOCKED, "hfs_reclaim_file");

		hfs_unlock(cp);
		hfs_lock_truncate(cp, HFS_EXCLUSIVE_LOCK);
		took_truncate_lock = true;
		(void) cluster_push(vp, 0);
		error = hfs_lock(cp, HFS_FORCE_LOCK);
		if (error) {
			goto out;
		}

		/* If the file no longer exists, nothing left to do */
		if (cp->c_flag & C_NOEXISTS) {
			error = 0;
			goto out;
		}

		/* Wait for any in-progress writes to this vnode to complete, so that we'll
		 * be copying consistent bits.  (Otherwise, it's possible that an async
		 * write will complete to the old extent after we read from it.  That
		 * could lead to corruption.)
		 */
		error = vnode_waitforwrites(vp, 0, 0, 0, "hfs_reclaim_file");
		if (error) {
			goto out;
		}
	}

	if (hfs_resize_debug) {
		printf("hfs_reclaim_file: === Start reclaiming %sfork for %sid=%u ===\n", (forktype ? "rsrc" : "data"), (extent_info->is_dirlink ? "dirlink" : "file"), fileID);
	}

	if (extent_info->is_dirlink) {
		MALLOC(extent_info->dirlink_desc, struct cat_desc *,
				sizeof(struct cat_desc), M_TEMP, M_WAITOK);
		MALLOC(extent_info->dirlink_attr, struct cat_attr *,
				sizeof(struct cat_attr), M_TEMP, M_WAITOK);
		MALLOC(extent_info->dirlink_fork, struct filefork *,
				sizeof(struct filefork), M_TEMP, M_WAITOK);
		if ((extent_info->dirlink_desc == NULL) ||
		    (extent_info->dirlink_attr == NULL) ||
		    (extent_info->dirlink_fork == NULL)) {
			error = ENOMEM;
			goto out;
		}

		/* Lookup catalog record for directory hard link and
		 * create a fake filefork for the value looked up from
		 * the disk.
		 */
		fp = extent_info->dirlink_fork;
		bzero(extent_info->dirlink_fork, sizeof(struct filefork));
		extent_info->dirlink_fork->ff_cp = cp;
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
		error = cat_lookup_dirlink(hfsmp, fileID, forktype,
				extent_info->dirlink_desc, extent_info->dirlink_attr,
				&(extent_info->dirlink_fork->ff_data));
		hfs_systemfile_unlock(hfsmp, lockflags);
		if (error) {
			printf ("hfs_reclaim_file: cat_lookup_dirlink for fileID=%u returned error=%u\n", fileID, error);
			goto out;
		}
		release_desc = true;
	} else {
		fp = VTOF(vp);
	}

	extent_info->catalog_fp = fp;
	extent_info->recStartBlock = 0;
	extent_info->extents = extent_info->catalog_fp->ff_extents;
	/* Relocate extents from the catalog record */
	for (i = 0; i < kHFSPlusExtentDensity; ++i) {
		if (fp->ff_extents[i].blockCount == 0) {
			break;
		}
		extent_info->extent_index = i;
		error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
		if (error) {
			printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, fp->ff_extents[i].startBlock, fp->ff_extents[i].blockCount, error);
			goto out;
		}
	}

	/* If the number of allocation blocks processed for reclaiming
	 * is less than the total number of blocks for the file, continue
	 * working on the overflow extents records.
	 */
	if (fp->ff_blocks <= extent_info->cur_blockCount) {
		/* NOTE: this trace is intentionally compiled out ("0 &&") */
		if (0 && hfs_resize_debug) {
			printf ("hfs_reclaim_file: Nothing more to relocate, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount);
		}
		goto out;
	}

	if (hfs_resize_debug) {
		printf ("hfs_reclaim_file: Will check overflow records, offset=%d, ff_blocks=%u, cur_blockCount=%u\n", i, fp->ff_blocks, extent_info->cur_blockCount);
	}

	MALLOC(extent_info->iterator, struct BTreeIterator *, sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
	if (extent_info->iterator == NULL) {
		error = ENOMEM;
		goto out;
	}
	bzero(extent_info->iterator, sizeof(struct BTreeIterator));
	key = (HFSPlusExtentKey *) &(extent_info->iterator->key);
	key->keyLength = kHFSPlusExtentKeyMaximumLength;
	key->forkType = forktype;
	key->fileID = fileID;
	key->startBlock = extent_info->cur_blockCount;

	extent_info->btdata.bufferAddress = extent_info->record.overflow;
	extent_info->btdata.itemSize = sizeof(HFSPlusExtentRecord);
	extent_info->btdata.itemCount = 1;

	/* From here on we work on overflow records, not the catalog record */
	extent_info->catalog_fp = NULL;

	/* Search the first overflow extent with expected startBlock as 'cur_blockCount' */
	lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
	error = BTSearchRecord(extent_info->fcb, extent_info->iterator,
			&(extent_info->btdata), &(extent_info->recordlen),
			extent_info->iterator);
	hfs_systemfile_unlock(hfsmp, lockflags);
	while (error == 0) {
		extent_info->overflow_count++;
		extent_info->recStartBlock = key->startBlock;
		extent_info->extents = extent_info->record.overflow;
		for (i = 0; i < kHFSPlusExtentDensity; i++) {
			if (extent_info->record.overflow[i].blockCount == 0) {
				goto out;
			}
			extent_info->extent_index = i;
			error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
			if (error) {
				printf ("hfs_reclaim_file: fileID=%u #%d %u:(%u,%u) hfs_reclaim_extent error=%d\n", fileID, extent_info->overflow_count, i, extent_info->record.overflow[i].startBlock, extent_info->record.overflow[i].blockCount, error);
				goto out;
			}
		}

		/* Look for more overflow records */
		lockflags = hfs_systemfile_lock(hfsmp, lockflags, HFS_EXCLUSIVE_LOCK);
		error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord,
				extent_info->iterator, &(extent_info->btdata),
				&(extent_info->recordlen));
		hfs_systemfile_unlock(hfsmp, lockflags);
		if (error) {
			break;
		}
		/* Stop when we encounter a different file or fork. */
		if ((key->fileID != fileID) || (key->forkType != forktype)) {
			break;
		}
	}
	/* Running off the end of the btree is the expected way to finish */
	if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
		error = 0;
	}

out:
	/* If any blocks were relocated, account them and report progress */
	if (extent_info->blocks_relocated) {
		hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated;
		hfs_truncatefs_progress(hfsmp);
		if (fileID < kHFSFirstUserCatalogNodeID) {
			printf ("hfs_reclaim_file: Relocated %u blocks from fileID=%u on \"%s\"\n",
					extent_info->blocks_relocated, fileID, hfsmp->vcbVN);
		}
	}
	if (extent_info->iterator) {
		FREE(extent_info->iterator, M_TEMP);
	}
	if (release_desc == true) {
		cat_releasedesc(extent_info->dirlink_desc);
	}
	if (extent_info->dirlink_desc) {
		FREE(extent_info->dirlink_desc, M_TEMP);
	}
	if (extent_info->dirlink_attr) {
		FREE(extent_info->dirlink_attr, M_TEMP);
	}
	if (extent_info->dirlink_fork) {
		FREE(extent_info->dirlink_fork, M_TEMP);
	}
	if ((extent_info->blocks_relocated != 0) && (extent_info->is_sysfile == false)) {
		(void) hfs_update(vp, MNT_WAIT);
	}
	if (took_truncate_lock) {
		hfs_unlock_truncate(cp, 0);
	}
	if (extent_info) {
		FREE(extent_info, M_TEMP);
	}
	if (hfs_resize_debug) {
		printf("hfs_reclaim_file: === Finished relocating %sfork for fileid=%u (error=%d) ===\n", (forktype ? "rsrc" : "data"), fileID, error);
	}

	return error;
}
6141
6142
6143/*
6144 * This journal_relocate callback updates the journal info block to point
6145 * at the new journal location.  This write must NOT be done using the
6146 * transaction.  We must write the block immediately.  We must also force
6147 * it to get to the media so that the new journal location will be seen by
6148 * the replay code before we can safely let journaled blocks be written
6149 * to their normal locations.
6150 *
6151 * The tests for journal_uses_fua below are mildly hacky.  Since the journal
6152 * and the file system are both on the same device, I'm leveraging what
6153 * the journal has decided about FUA.
6154 */
/* Arguments passed through to hfs_journal_relocate_callback() */
struct hfs_journal_relocate_args {
	struct hfsmount *hfsmp;		/* volume whose journal is being relocated */
	vfs_context_t context;		/* caller's context; supplies credentials for I/O */
	u_int32_t newStartBlock;	/* new journal start, in allocation blocks */
	u_int32_t newBlockCount;	/* new journal size, in allocation blocks */
};
6161
6162static errno_t
6163hfs_journal_relocate_callback(void *_args)
6164{
6165	int error;
6166	struct hfs_journal_relocate_args *args = _args;
6167	struct hfsmount *hfsmp = args->hfsmp;
6168	buf_t bp;
6169	JournalInfoBlock *jibp;
6170
6171	error = buf_meta_bread(hfsmp->hfs_devvp,
6172		hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6173		hfsmp->blockSize, vfs_context_ucred(args->context), &bp);
6174	if (error) {
6175		printf("hfs_journal_relocate_callback: failed to read JIB (%d)\n", error);
6176		if (bp) {
6177        		buf_brelse(bp);
6178		}
6179		return error;
6180	}
6181	jibp = (JournalInfoBlock*) buf_dataptr(bp);
6182	jibp->offset = SWAP_BE64((u_int64_t)args->newStartBlock * hfsmp->blockSize);
6183	jibp->size = SWAP_BE64((u_int64_t)args->newBlockCount * hfsmp->blockSize);
6184	if (journal_uses_fua(hfsmp->jnl))
6185		buf_markfua(bp);
6186	error = buf_bwrite(bp);
6187	if (error) {
6188		printf("hfs_journal_relocate_callback: failed to write JIB (%d)\n", error);
6189		return error;
6190	}
6191	if (!journal_uses_fua(hfsmp->jnl)) {
6192		error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, args->context);
6193		if (error) {
6194			printf("hfs_journal_relocate_callback: DKIOCSYNCHRONIZECACHE failed (%d)\n", error);
6195			error = 0;		/* Don't fail the operation. */
6196		}
6197	}
6198
6199	return error;
6200}
6201
6202
6203/* Type of resize operation in progress */
6204#define HFS_RESIZE_TRUNCATE	1
6205#define HFS_RESIZE_EXTEND	2
6206
6207/*
6208 * Core function to relocate the journal file.  This function takes the
6209 * journal size of the newly relocated journal --- the caller can
6210 * provide a new journal size if they want to change the size of
6211 * the journal.  The function takes care of updating the journal info
6212 * block and all other data structures correctly.
6213 *
6214 * Note: This function starts a transaction and grabs the btree locks.
6215 */
6216static int
6217hfs_relocate_journal_file(struct hfsmount *hfsmp, u_int32_t jnl_size, int resize_type, vfs_context_t context)
6218{
6219	int error;
6220	int journal_err;
6221	int lockflags;
6222	u_int32_t oldStartBlock;
6223	u_int32_t newStartBlock;
6224	u_int32_t oldBlockCount;
6225	u_int32_t newBlockCount;
6226	u_int32_t jnlBlockCount;
6227	u_int32_t alloc_skipfreeblks;
6228	struct cat_desc journal_desc;
6229	struct cat_attr journal_attr;
6230	struct cat_fork journal_fork;
6231	struct hfs_journal_relocate_args callback_args;
6232
6233	/* Calculate the number of allocation blocks required for the journal */
6234	jnlBlockCount = howmany(jnl_size, hfsmp->blockSize);
6235
6236	/*
6237	 * During truncatefs(), the volume free block count is updated
6238	 * before relocating data and reflects the total number of free
6239	 * blocks that will exist on volume after the resize is successful.
6240	 * This means that the allocation blocks required for relocation
6241	 * have already been reserved and accounted for in the free block
6242	 * count.  Therefore, block allocation and deallocation routines
6243	 * can skip the free block check by passing HFS_ALLOC_SKIPFREEBLKS
6244	 * flag.
6245	 *
6246	 * This special handling is not required when the file system
6247	 * is being extended as we want all the allocated and deallocated
6248	 * blocks to be accounted for correctly.
6249	 */
6250	if (resize_type == HFS_RESIZE_TRUNCATE) {
6251		alloc_skipfreeblks = HFS_ALLOC_SKIPFREEBLKS;
6252	} else {
6253		alloc_skipfreeblks = 0;
6254	}
6255
6256	error = hfs_start_transaction(hfsmp);
6257	if (error) {
6258		printf("hfs_relocate_journal_file: hfs_start_transaction returned %d\n", error);
6259		return error;
6260	}
6261	lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
6262
6263	error = BlockAllocate(hfsmp, 1, jnlBlockCount, jnlBlockCount,
6264			HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | alloc_skipfreeblks,
6265			 &newStartBlock, &newBlockCount);
6266	if (error) {
6267		printf("hfs_relocate_journal_file: BlockAllocate returned %d\n", error);
6268		goto fail;
6269	}
6270	if (newBlockCount != jnlBlockCount) {
6271		printf("hfs_relocate_journal_file: newBlockCount != jnlBlockCount (%u, %u)\n", newBlockCount, jnlBlockCount);
6272		goto free_fail;
6273	}
6274
6275	error = cat_idlookup(hfsmp, hfsmp->hfs_jnlfileid, 1, 0, &journal_desc, &journal_attr, &journal_fork);
6276	if (error) {
6277		printf("hfs_relocate_journal_file: cat_idlookup returned %d\n", error);
6278		goto free_fail;
6279	}
6280
6281	oldStartBlock = journal_fork.cf_extents[0].startBlock;
6282	oldBlockCount = journal_fork.cf_extents[0].blockCount;
6283	error = BlockDeallocate(hfsmp, oldStartBlock, oldBlockCount, alloc_skipfreeblks);
6284	if (error) {
6285		printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error);
6286		goto free_fail;
6287	}
6288
6289	/* Update the catalog record for .journal */
6290	journal_fork.cf_size = newBlockCount * hfsmp->blockSize;
6291	journal_fork.cf_extents[0].startBlock = newStartBlock;
6292	journal_fork.cf_extents[0].blockCount = newBlockCount;
6293	journal_fork.cf_blocks = newBlockCount;
6294	error = cat_update(hfsmp, &journal_desc, &journal_attr, &journal_fork, NULL);
6295	cat_releasedesc(&journal_desc);  /* all done with cat descriptor */
6296	if (error) {
6297		printf("hfs_relocate_journal_file: cat_update returned %d\n", error);
6298		goto free_fail;
6299	}
6300
6301	/*
6302	 * If the journal is part of the file system, then tell the journal
6303	 * code about the new location.  If the journal is on an external
6304	 * device, then just keep using it as-is.
6305	 */
6306	if (hfsmp->jvp == hfsmp->hfs_devvp) {
6307		callback_args.hfsmp = hfsmp;
6308		callback_args.context = context;
6309		callback_args.newStartBlock = newStartBlock;
6310		callback_args.newBlockCount = newBlockCount;
6311
6312		error = journal_relocate(hfsmp->jnl, (off_t)newStartBlock*hfsmp->blockSize,
6313			(off_t)newBlockCount*hfsmp->blockSize, 0,
6314			hfs_journal_relocate_callback, &callback_args);
6315		if (error) {
6316			/* NOTE: journal_relocate will mark the journal invalid. */
6317			printf("hfs_relocate_journal_file: journal_relocate returned %d\n", error);
6318			goto fail;
6319		}
6320		if (hfs_resize_debug) {
6321			printf ("hfs_relocate_journal_file: Successfully relocated journal from (%u,%u) to (%u,%u)\n", oldStartBlock, oldBlockCount, newStartBlock, newBlockCount);
6322		}
6323		hfsmp->jnl_start = newStartBlock;
6324		hfsmp->jnl_size = (off_t)newBlockCount * hfsmp->blockSize;
6325	}
6326
6327	hfs_systemfile_unlock(hfsmp, lockflags);
6328	error = hfs_end_transaction(hfsmp);
6329	if (error) {
6330		printf("hfs_relocate_journal_file: hfs_end_transaction returned %d\n", error);
6331	}
6332
6333	return error;
6334
6335free_fail:
6336	journal_err = BlockDeallocate(hfsmp, newStartBlock, newBlockCount, HFS_ALLOC_SKIPFREEBLKS);
6337	if (journal_err) {
6338		printf("hfs_relocate_journal_file: BlockDeallocate returned %d\n", error);
6339		hfs_mark_volume_inconsistent(hfsmp);
6340	}
6341fail:
6342	hfs_systemfile_unlock(hfsmp, lockflags);
6343	(void) hfs_end_transaction(hfsmp);
6344	if (hfs_resize_debug) {
6345		printf ("hfs_relocate_journal_file: Error relocating journal file (error=%d)\n", error);
6346	}
6347	return error;
6348}
6349
6350
6351/*
6352 * Relocate the journal file when the file system is being truncated.
6353 * We do not down-size the journal when the file system size is
6354 * reduced, so we always provide the current journal size to the
6355 * relocate code.
6356 */
6357static int
6358hfs_reclaim_journal_file(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6359{
6360	int error = 0;
6361	u_int32_t startBlock;
6362	u_int32_t blockCount = hfsmp->jnl_size / hfsmp->blockSize;
6363
6364	/*
6365	 * Figure out the location of the .journal file.  When the journal
6366	 * is on an external device, we need to look up the .journal file.
6367	 */
6368	if (hfsmp->jvp == hfsmp->hfs_devvp) {
6369		startBlock = hfsmp->jnl_start;
6370		blockCount = hfsmp->jnl_size / hfsmp->blockSize;
6371	} else {
6372		u_int32_t fileid;
6373		u_int32_t old_jnlfileid;
6374		struct cat_attr attr;
6375		struct cat_fork fork;
6376
6377		/*
6378		 * The cat_lookup inside GetFileInfo will fail because hfs_jnlfileid
6379		 * is set, and it is trying to hide the .journal file.  So temporarily
6380		 * unset the field while calling GetFileInfo.
6381		 */
6382		old_jnlfileid = hfsmp->hfs_jnlfileid;
6383		hfsmp->hfs_jnlfileid = 0;
6384		fileid = GetFileInfo(hfsmp, kHFSRootFolderID, ".journal", &attr, &fork);
6385		hfsmp->hfs_jnlfileid = old_jnlfileid;
6386		if (fileid != old_jnlfileid) {
6387			printf("hfs_reclaim_journal_file: cannot find .journal file!\n");
6388			return EIO;
6389		}
6390
6391		startBlock = fork.cf_extents[0].startBlock;
6392		blockCount = fork.cf_extents[0].blockCount;
6393	}
6394
6395	if (startBlock + blockCount <= allocLimit) {
6396		/* The journal file does not require relocation */
6397		return 0;
6398	}
6399
6400	error = hfs_relocate_journal_file(hfsmp, blockCount * hfsmp->blockSize, HFS_RESIZE_TRUNCATE, context);
6401	if (error == 0) {
6402		hfsmp->hfs_resize_blocksmoved += blockCount;
6403		hfs_truncatefs_progress(hfsmp);
6404		printf ("hfs_reclaim_journal_file: Relocated %u blocks from journal on \"%s\"\n",
6405				blockCount, hfsmp->vcbVN);
6406	}
6407
6408	return error;
6409}
6410
6411
6412/*
6413 * Move the journal info block to a new location.  We have to make sure the
6414 * new copy of the journal info block gets to the media first, then change
6415 * the field in the volume header and the catalog record.
6416 */
6417static int
6418hfs_reclaim_journal_info_block(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6419{
6420	int error;
6421	int journal_err;
6422	int lockflags;
6423	u_int32_t oldBlock;
6424	u_int32_t newBlock;
6425	u_int32_t blockCount;
6426	struct cat_desc jib_desc;
6427	struct cat_attr jib_attr;
6428	struct cat_fork jib_fork;
6429	buf_t old_bp, new_bp;
6430
6431	if (hfsmp->vcbJinfoBlock <= allocLimit) {
6432		/* The journal info block does not require relocation */
6433		return 0;
6434	}
6435
6436	error = hfs_start_transaction(hfsmp);
6437	if (error) {
6438		printf("hfs_reclaim_journal_info_block: hfs_start_transaction returned %d\n", error);
6439		return error;
6440	}
6441	lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG | SFL_BITMAP, HFS_EXCLUSIVE_LOCK);
6442
6443	error = BlockAllocate(hfsmp, 1, 1, 1,
6444			HFS_ALLOC_METAZONE | HFS_ALLOC_FORCECONTIG | HFS_ALLOC_SKIPFREEBLKS,
6445			&newBlock, &blockCount);
6446	if (error) {
6447		printf("hfs_reclaim_journal_info_block: BlockAllocate returned %d\n", error);
6448		goto fail;
6449	}
6450	if (blockCount != 1) {
6451		printf("hfs_reclaim_journal_info_block: blockCount != 1 (%u)\n", blockCount);
6452		goto free_fail;
6453	}
6454	error = BlockDeallocate(hfsmp, hfsmp->vcbJinfoBlock, 1, HFS_ALLOC_SKIPFREEBLKS);
6455	if (error) {
6456		printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error);
6457		goto free_fail;
6458	}
6459
6460	/* Copy the old journal info block content to the new location */
6461	error = buf_meta_bread(hfsmp->hfs_devvp,
6462		hfsmp->vcbJinfoBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6463		hfsmp->blockSize, vfs_context_ucred(context), &old_bp);
6464	if (error) {
6465		printf("hfs_reclaim_journal_info_block: failed to read JIB (%d)\n", error);
6466		if (old_bp) {
6467        		buf_brelse(old_bp);
6468		}
6469		goto free_fail;
6470	}
6471	new_bp = buf_getblk(hfsmp->hfs_devvp,
6472		newBlock * (hfsmp->blockSize/hfsmp->hfs_logical_block_size),
6473		hfsmp->blockSize, 0, 0, BLK_META);
6474	bcopy((char*)buf_dataptr(old_bp), (char*)buf_dataptr(new_bp), hfsmp->blockSize);
6475	buf_brelse(old_bp);
6476	if (journal_uses_fua(hfsmp->jnl))
6477		buf_markfua(new_bp);
6478	error = buf_bwrite(new_bp);
6479	if (error) {
6480		printf("hfs_reclaim_journal_info_block: failed to write new JIB (%d)\n", error);
6481		goto free_fail;
6482	}
6483	if (!journal_uses_fua(hfsmp->jnl)) {
6484		error = VNOP_IOCTL(hfsmp->hfs_devvp, DKIOCSYNCHRONIZECACHE, NULL, FWRITE, context);
6485		if (error) {
6486			printf("hfs_reclaim_journal_info_block: DKIOCSYNCHRONIZECACHE failed (%d)\n", error);
6487			/* Don't fail the operation. */
6488		}
6489	}
6490
6491	/* Update the catalog record for .journal_info_block */
6492	error = cat_idlookup(hfsmp, hfsmp->hfs_jnlinfoblkid, 1, 0, &jib_desc, &jib_attr, &jib_fork);
6493	if (error) {
6494		printf("hfs_reclaim_journal_info_block: cat_idlookup returned %d\n", error);
6495		goto fail;
6496	}
6497	oldBlock = jib_fork.cf_extents[0].startBlock;
6498	jib_fork.cf_size = hfsmp->blockSize;
6499	jib_fork.cf_extents[0].startBlock = newBlock;
6500	jib_fork.cf_extents[0].blockCount = 1;
6501	jib_fork.cf_blocks = 1;
6502	error = cat_update(hfsmp, &jib_desc, &jib_attr, &jib_fork, NULL);
6503	cat_releasedesc(&jib_desc);  /* all done with cat descriptor */
6504	if (error) {
6505		printf("hfs_reclaim_journal_info_block: cat_update returned %d\n", error);
6506		goto fail;
6507	}
6508
6509	/* Update the pointer to the journal info block in the volume header. */
6510	hfsmp->vcbJinfoBlock = newBlock;
6511	error = hfs_flushvolumeheader(hfsmp, MNT_WAIT, HFS_ALTFLUSH);
6512	if (error) {
6513		printf("hfs_reclaim_journal_info_block: hfs_flushvolumeheader returned %d\n", error);
6514		goto fail;
6515	}
6516	hfs_systemfile_unlock(hfsmp, lockflags);
6517	error = hfs_end_transaction(hfsmp);
6518	if (error) {
6519		printf("hfs_reclaim_journal_info_block: hfs_end_transaction returned %d\n", error);
6520	}
6521	error = hfs_journal_flush(hfsmp, FALSE);
6522	if (error) {
6523		printf("hfs_reclaim_journal_info_block: journal_flush returned %d\n", error);
6524	}
6525
6526	/* Account for the block relocated and print progress */
6527	hfsmp->hfs_resize_blocksmoved += 1;
6528	hfs_truncatefs_progress(hfsmp);
6529	if (!error) {
6530		printf ("hfs_reclaim_journal_info: Relocated 1 block from journal info on \"%s\"\n",
6531				hfsmp->vcbVN);
6532		if (hfs_resize_debug) {
6533			printf ("hfs_reclaim_journal_info_block: Successfully relocated journal info block from (%u,%u) to (%u,%u)\n", oldBlock, blockCount, newBlock, blockCount);
6534		}
6535	}
6536	return error;
6537
6538free_fail:
6539	journal_err = BlockDeallocate(hfsmp, newBlock, blockCount, HFS_ALLOC_SKIPFREEBLKS);
6540	if (journal_err) {
6541		printf("hfs_reclaim_journal_info_block: BlockDeallocate returned %d\n", error);
6542		hfs_mark_volume_inconsistent(hfsmp);
6543	}
6544
6545fail:
6546	hfs_systemfile_unlock(hfsmp, lockflags);
6547	(void) hfs_end_transaction(hfsmp);
6548	if (hfs_resize_debug) {
6549		printf ("hfs_reclaim_journal_info_block: Error relocating journal info block (error=%d)\n", error);
6550	}
6551	return error;
6552}
6553
6554
6555static u_int64_t
6556calculate_journal_size(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count)
6557{
6558	u_int64_t journal_size;
6559	u_int32_t journal_scale;
6560
6561#define DEFAULT_JOURNAL_SIZE (8*1024*1024)
6562#define MAX_JOURNAL_SIZE     (512*1024*1024)
6563
6564	/* Calculate the journal size for this volume.   We want
6565	 * at least 8 MB of journal for each 100 GB of disk space.
6566	 * We cap the size at 512 MB, unless the allocation block
6567	 * size is larger, in which case, we use one allocation
6568	 * block.
6569	 */
6570	journal_scale = (sector_size * sector_count) / ((u_int64_t)100 * 1024 * 1024 * 1024);
6571	journal_size = DEFAULT_JOURNAL_SIZE * (journal_scale + 1);
6572	if (journal_size > MAX_JOURNAL_SIZE) {
6573		journal_size = MAX_JOURNAL_SIZE;
6574	}
6575	if (journal_size < hfsmp->blockSize) {
6576		journal_size = hfsmp->blockSize;
6577	}
6578	return journal_size;
6579}
6580
6581
6582/*
6583 * Calculate the expected journal size based on current partition size.
6584 * If the size of the current journal is less than the calculated size,
6585 * force journal relocation with the new journal size.
6586 */
6587static int
6588hfs_extend_journal(struct hfsmount *hfsmp, u_int32_t sector_size, u_int64_t sector_count, vfs_context_t context)
6589{
6590	int error = 0;
6591	u_int64_t calc_journal_size;
6592
6593	if (hfsmp->jvp != hfsmp->hfs_devvp) {
6594		if (hfs_resize_debug) {
6595			printf("hfs_extend_journal: not resizing the journal because it is on an external device.\n");
6596		}
6597		return 0;
6598	}
6599
6600	calc_journal_size = calculate_journal_size(hfsmp, sector_size, sector_count);
6601	if (calc_journal_size <= hfsmp->jnl_size) {
6602		/* The journal size requires no modification */
6603		goto out;
6604	}
6605
6606	if (hfs_resize_debug) {
6607		printf ("hfs_extend_journal: journal old=%u, new=%qd\n", hfsmp->jnl_size, calc_journal_size);
6608	}
6609
6610	/* Extend the journal to the new calculated size */
6611	error = hfs_relocate_journal_file(hfsmp, calc_journal_size, HFS_RESIZE_EXTEND, context);
6612	if (error == 0) {
6613		printf ("hfs_extend_journal: Extended journal size to %u bytes on \"%s\"\n",
6614				hfsmp->jnl_size, hfsmp->vcbVN);
6615	}
6616out:
6617	return error;
6618}
6619
6620
6621/*
6622 * This function traverses through all extended attribute records for a given
6623 * fileID, and calls function that reclaims data blocks that exist in the
6624 * area of the disk being reclaimed which in turn is responsible for allocating
6625 * new space, copying extent data, deallocating new space, and if required,
6626 * splitting the extent.
6627 *
6628 * Note: The caller has already acquired the cnode lock on the file.  Therefore
6629 * we are assured that no other thread would be creating/deleting/modifying
6630 * extended attributes for this file.
6631 *
6632 * Side Effects:
6633 * hfsmp->hfs_resize_blocksmoved is incremented by the number of allocation
6634 * blocks that were relocated.
6635 *
6636 * Returns:
6637 * 	0 on success, non-zero on failure.
6638 */
static int
hfs_reclaim_xattr(struct hfsmount *hfsmp, struct vnode *vp, u_int32_t fileID, u_int32_t allocLimit, vfs_context_t context)
{
	int error = 0;
	struct hfs_reclaim_extent_info *extent_info;
	int i;
	HFSPlusAttrKey *key;
	int *lockflags;

	if (hfs_resize_debug) {
		printf("hfs_reclaim_xattr: === Start reclaiming xattr for id=%u ===\n", fileID);
	}

	/* Per-file relocation state shared with hfs_reclaim_extent(). */
	MALLOC(extent_info, struct hfs_reclaim_extent_info *,
	       sizeof(struct hfs_reclaim_extent_info), M_TEMP, M_WAITOK);
	if (extent_info == NULL) {
		return ENOMEM;
	}
	bzero(extent_info, sizeof(struct hfs_reclaim_extent_info));
	extent_info->vp = vp;
	extent_info->fileID = fileID;
	extent_info->is_xattr = true;
	extent_info->is_sysfile = vnode_issystem(vp);
	/* All extent-based xattrs live in the attribute b-tree file. */
	extent_info->fcb = VTOF(hfsmp->hfs_attribute_vp);
	/* The lock set lives in extent_info so hfs_reclaim_extent() can
	 * drop and retake the same system-file locks. */
	lockflags = &(extent_info->lockflags);
	*lockflags = SFL_ATTRIBUTE | SFL_BITMAP;

	/* Initialize iterator from the extent_info structure */
	MALLOC(extent_info->iterator, struct BTreeIterator *,
	       sizeof(struct BTreeIterator), M_TEMP, M_WAITOK);
	if (extent_info->iterator == NULL) {
		error = ENOMEM;
		goto out;
	}
	bzero(extent_info->iterator, sizeof(struct BTreeIterator));

	/* Build attribute key */
	key = (HFSPlusAttrKey *)&(extent_info->iterator->key);
	error = hfs_buildattrkey(fileID, NULL, key);
	if (error) {
		goto out;
	}

	/* Initialize btdata from extent_info structure.  Note that the
	 * buffer pointer actually points to the xattr record from the
	 * extent_info structure itself.
	 */
	extent_info->btdata.bufferAddress = &(extent_info->record.xattr);
	extent_info->btdata.itemSize = sizeof(HFSPlusAttrRecord);
	extent_info->btdata.itemCount = 1;

	/*
	 * Sync all extent-based attribute data to the disk.
	 *
	 * All extent-based attribute data I/O is performed via cluster
	 * I/O using a virtual file that spans across entire file system
	 * space.
	 */
	hfs_lock_truncate(VTOC(hfsmp->hfs_attrdata_vp), HFS_EXCLUSIVE_LOCK);
	(void)cluster_push(hfsmp->hfs_attrdata_vp, 0);
	error = vnode_waitforwrites(hfsmp->hfs_attrdata_vp, 0, 0, 0, "hfs_reclaim_xattr");
	hfs_unlock_truncate(VTOC(hfsmp->hfs_attrdata_vp), 0);
	if (error) {
		goto out;
	}

	/* Search for extended attribute for current file.  This
	 * will place the iterator before the first matching record.
	 */
	*lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK);
	error = BTSearchRecord(extent_info->fcb, extent_info->iterator,
			&(extent_info->btdata), &(extent_info->recordlen),
			extent_info->iterator);
	hfs_systemfile_unlock(hfsmp, *lockflags);
	if (error) {
		if (error != btNotFound) {
			goto out;
		}
		/* btNotFound is expected here, so just mask it */
		error = 0;
	}

	/* Walk every attribute record for this fileID, relocating any
	 * extents that fall beyond allocLimit.  The b-tree lock is dropped
	 * between iterations to avoid starving other b-tree users. */
	while (1) {
		/* Iterate to the next record */
		*lockflags = hfs_systemfile_lock(hfsmp, *lockflags, HFS_EXCLUSIVE_LOCK);
		error = BTIterateRecord(extent_info->fcb, kBTreeNextRecord,
				extent_info->iterator, &(extent_info->btdata),
				&(extent_info->recordlen));
		hfs_systemfile_unlock(hfsmp, *lockflags);

		/* Stop the iteration if we encounter end of btree or xattr with different fileID */
		if (error || key->fileID != fileID) {
			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
				error = 0;
			}
			break;
		}

		/* We only care about extent-based EAs */
		if ((extent_info->record.xattr.recordType != kHFSPlusAttrForkData) &&
		    (extent_info->record.xattr.recordType != kHFSPlusAttrExtents)) {
			continue;
		}

		/* Point extent_info->extents at this record's extent array;
		 * overflow_count tracks how many overflow records followed
		 * the initial fork-data record. */
		if (extent_info->record.xattr.recordType == kHFSPlusAttrForkData) {
			extent_info->overflow_count = 0;
			extent_info->extents = extent_info->record.xattr.forkData.theFork.extents;
		} else if (extent_info->record.xattr.recordType == kHFSPlusAttrExtents) {
			extent_info->overflow_count++;
			extent_info->extents = extent_info->record.xattr.overflowExtents.extents;
		}

		extent_info->recStartBlock = key->startBlock;
		/* A zero blockCount terminates the extent array early. */
		for (i = 0; i < kHFSPlusExtentDensity; i++) {
			if (extent_info->extents[i].blockCount == 0) {
				break;
			}
			extent_info->extent_index = i;
			error = hfs_reclaim_extent(hfsmp, allocLimit, extent_info, context);
			if (error) {
				printf ("hfs_reclaim_xattr: fileID=%u hfs_reclaim_extent error=%d\n", fileID, error);
				goto out;
			}
		}
	}

out:
	/* If any blocks were relocated, account them and report progress */
	if (extent_info->blocks_relocated) {
		hfsmp->hfs_resize_blocksmoved += extent_info->blocks_relocated;
		hfs_truncatefs_progress(hfsmp);
	}
	if (extent_info->iterator) {
		FREE(extent_info->iterator, M_TEMP);
	}
	if (extent_info) {
		FREE(extent_info, M_TEMP);
	}
	if (hfs_resize_debug) {
		printf("hfs_reclaim_xattr: === Finished relocating xattr for fileid=%u (error=%d) ===\n", fileID, error);
	}
	return error;
}
6782
6783/*
6784 * Reclaim any extent-based extended attributes allocation blocks from
6785 * the area of the disk that is being truncated.
6786 *
6787 * The function traverses the attribute btree to find out the fileIDs
6788 * of the extended attributes that need to be relocated.  For every
6789 * file whose large EA requires relocation, it looks up the cnode and
6790 * calls hfs_reclaim_xattr() to do all the work for allocating
6791 * new space, copying data, deallocating old space, and if required,
6792 * splitting the extents.
6793 *
6794 * Inputs:
6795 * 	allocLimit    - starting block of the area being reclaimed
6796 *
6797 * Returns:
6798 *   	returns 0 on success, non-zero on failure.
6799 */
static int
hfs_reclaim_xattrspace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
{
	int error = 0;
	FCB *fcb;
	struct BTreeIterator *iterator = NULL;
	struct FSBufferDescriptor btdata;
	HFSPlusAttrKey *key;
	HFSPlusAttrRecord rec;
	int lockflags = 0;
	cnid_t prev_fileid = 0;		/* last fileID whose xattrs were relocated */
	struct vnode *vp;
	int need_relocate;
	int btree_operation;
	u_int32_t files_moved = 0;
	u_int32_t prev_blocksmoved;
	int i;

	fcb = VTOF(hfsmp->hfs_attribute_vp);
	/* Store the value to print total blocks moved by this function in end */
	prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;

	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
		return ENOMEM;
	}
	bzero(iterator, sizeof(*iterator));
	key = (HFSPlusAttrKey *)&iterator->key;
	btdata.bufferAddress = &rec;
	btdata.itemSize = sizeof(rec);
	btdata.itemCount = 1;

	need_relocate = false;
	btree_operation = kBTreeFirstRecord;
	/* Traverse the attribute btree to find extent-based EAs to reclaim */
	while (1) {
		/* Shared lock suffices for the scan; the per-file relocation
		 * below retakes exclusive locks itself. */
		lockflags = hfs_systemfile_lock(hfsmp, SFL_ATTRIBUTE, HFS_SHARED_LOCK);
		error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
		hfs_systemfile_unlock(hfsmp, lockflags);
		if (error) {
			/* End-of-tree conditions are the normal exit, not errors. */
			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
				error = 0;
			}
			break;
		}
		btree_operation = kBTreeNextRecord;

		/* If the extents of current fileID were already relocated, skip it */
		if (prev_fileid == key->fileID) {
			continue;
		}

		/* Check if any of the extents in the current record need to be relocated */
		need_relocate = false;
		switch(rec.recordType) {
			case kHFSPlusAttrForkData:
				for (i = 0; i < kHFSPlusExtentDensity; i++) {
					if (rec.forkData.theFork.extents[i].blockCount == 0) {
						break;
					}
					if ((rec.forkData.theFork.extents[i].startBlock +
					     rec.forkData.theFork.extents[i].blockCount) > allocLimit) {
						need_relocate = true;
						break;
					}
				}
				break;

			case kHFSPlusAttrExtents:
				for (i = 0; i < kHFSPlusExtentDensity; i++) {
					if (rec.overflowExtents.extents[i].blockCount == 0) {
						break;
					}
					if ((rec.overflowExtents.extents[i].startBlock +
					     rec.overflowExtents.extents[i].blockCount) > allocLimit) {
						need_relocate = true;
						break;
					}
				}
				break;
		};

		/* Continue iterating to next attribute record */
		if (need_relocate == false) {
			continue;
		}

		/* Look up the vnode for corresponding file.  The cnode
		 * will be locked which will ensure that no one modifies
		 * the xattrs when we are relocating them.
		 *
		 * We want to allow open-unlinked files to be moved,
		 * so provide allow_deleted == 1 for hfs_vget().
		 */
		if (hfs_vget(hfsmp, key->fileID, &vp, 0, 1) != 0) {
			continue;
		}

		error = hfs_reclaim_xattr(hfsmp, vp, key->fileID, allocLimit, context);
		hfs_unlock(VTOC(vp));
		vnode_put(vp);
		if (error) {
			printf ("hfs_reclaim_xattrspace: Error relocating xattrs for fileid=%u (error=%d)\n", key->fileID, error);
			break;
		}
		/* hfs_reclaim_xattr handled every record for this fileID,
		 * so subsequent records with the same ID can be skipped. */
		prev_fileid = key->fileID;
		files_moved++;
	}

	if (files_moved) {
		printf("hfs_reclaim_xattrspace: Relocated %u xattr blocks from %u files on \"%s\"\n",
				(hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
				files_moved, hfsmp->vcbVN);
	}

	kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
	return error;
}
6917
6918/*
6919 * Reclaim blocks from regular files.
6920 *
6921 * This function iterates over all the record in catalog btree looking
6922 * for files with extents that overlap into the space we're trying to
6923 * free up.  If a file extent requires relocation, it looks up the vnode
6924 * and calls function to relocate the data.
6925 *
6926 * Returns:
6927 * 	Zero on success, non-zero on failure.
6928 */
6929static int
6930hfs_reclaim_filespace(struct hfsmount *hfsmp, u_int32_t allocLimit, vfs_context_t context)
6931{
6932	int error;
6933	FCB *fcb;
6934	struct BTreeIterator *iterator = NULL;
6935	struct FSBufferDescriptor btdata;
6936	int btree_operation;
6937	int lockflags;
6938	struct HFSPlusCatalogFile filerec;
6939	struct vnode *vp;
6940	struct vnode *rvp;
6941	struct filefork *datafork;
6942	u_int32_t files_moved = 0;
6943	u_int32_t prev_blocksmoved;
6944
6945	fcb = VTOF(hfsmp->hfs_catalog_vp);
6946	/* Store the value to print total blocks moved by this function at the end */
6947	prev_blocksmoved = hfsmp->hfs_resize_blocksmoved;
6948
6949	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
6950		error = ENOMEM;
6951		goto reclaim_filespace_done;
6952	}
6953
6954#if CONFIG_PROTECT
6955	int keys_generated = 0;
6956	/*
6957	 * For content-protected filesystems, we may need to relocate files that
6958	 * are encrypted.  If they use the new-style offset-based IVs, then
6959	 * we can move them regardless of the lock state.  We create a temporary
6960	 * key here that we use to read/write the data, then we discard it at the
6961	 * end of the function.
6962	 */
6963	if (cp_fs_protected (hfsmp->hfs_mp)) {
6964		error = cp_entry_gentempkeys(&hfsmp->hfs_resize_cpentry, hfsmp);
6965		if (error) {
6966			printf("hfs_reclaimspace: Error generating temporary keys for resize (%d)\n", error);
6967			goto reclaim_filespace_done;
6968		}
6969	}
6970#endif
6971
6972	bzero(iterator, sizeof(*iterator));
6973
6974	btdata.bufferAddress = &filerec;
6975	btdata.itemSize = sizeof(filerec);
6976	btdata.itemCount = 1;
6977
6978	btree_operation = kBTreeFirstRecord;
6979	while (1) {
6980		lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_SHARED_LOCK);
6981		error = BTIterateRecord(fcb, btree_operation, iterator, &btdata, NULL);
6982		hfs_systemfile_unlock(hfsmp, lockflags);
6983		if (error) {
6984			if (error == fsBTRecordNotFoundErr || error == fsBTEndOfIterationErr) {
6985				error = 0;
6986			}
6987			break;
6988		}
6989		btree_operation = kBTreeNextRecord;
6990
6991		if (filerec.recordType != kHFSPlusFileRecord) {
6992			continue;
6993		}
6994
6995		/* Check if any of the extents require relocation */
6996		if (hfs_file_extent_overlaps(hfsmp, allocLimit, &filerec) == false) {
6997			continue;
6998		}
6999
7000		/* We want to allow open-unlinked files to be moved, so allow_deleted == 1 */
7001		if (hfs_vget(hfsmp, filerec.fileID, &vp, 0, 1) != 0) {
7002			if (hfs_resize_debug) {
7003				printf("hfs_reclaim_filespace: hfs_vget(%u) failed.\n", filerec.fileID);
7004			}
7005			continue;
7006		}
7007
7008		/* If data fork exists or item is a directory hard link, relocate blocks */
7009		datafork = VTOF(vp);
7010		if ((datafork && datafork->ff_blocks > 0) || vnode_isdir(vp)) {
7011			error = hfs_reclaim_file(hfsmp, vp, filerec.fileID,
7012					kHFSDataForkType, allocLimit, context);
7013			if (error)  {
7014				printf ("hfs_reclaimspace: Error reclaiming datafork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
7015				hfs_unlock(VTOC(vp));
7016				vnode_put(vp);
7017				break;
7018			}
7019		}
7020
7021		/* If resource fork exists or item is a directory hard link, relocate blocks */
7022		if (((VTOC(vp)->c_blocks - (datafork ? datafork->ff_blocks : 0)) > 0) || vnode_isdir(vp)) {
7023			if (vnode_isdir(vp)) {
7024				/* Resource fork vnode lookup is invalid for directory hard link.
7025				 * So we fake data fork vnode as resource fork vnode.
7026				 */
7027				rvp = vp;
7028			} else {
7029				error = hfs_vgetrsrc(hfsmp, vp, &rvp, TRUE, FALSE);
7030				if (error) {
7031					printf ("hfs_reclaimspace: Error looking up rvp for fileid=%u (error=%d)\n", filerec.fileID, error);
7032					hfs_unlock(VTOC(vp));
7033					vnode_put(vp);
7034					break;
7035				}
7036				VTOC(rvp)->c_flag |= C_NEED_RVNODE_PUT;
7037			}
7038
7039			error = hfs_reclaim_file(hfsmp, rvp, filerec.fileID,
7040					kHFSResourceForkType, allocLimit, context);
7041			if (error) {
7042				printf ("hfs_reclaimspace: Error reclaiming rsrcfork blocks of fileid=%u (error=%d)\n", filerec.fileID, error);
7043				hfs_unlock(VTOC(vp));
7044				vnode_put(vp);
7045				break;
7046			}
7047		}
7048
7049		/* The file forks were relocated successfully, now drop the
7050		 * cnode lock and vnode reference, and continue iterating to
7051		 * next catalog record.
7052		 */
7053		hfs_unlock(VTOC(vp));
7054		vnode_put(vp);
7055		files_moved++;
7056	}
7057
7058	if (files_moved) {
7059		printf("hfs_reclaim_filespace: Relocated %u blocks from %u files on \"%s\"\n",
7060				(hfsmp->hfs_resize_blocksmoved - prev_blocksmoved),
7061				files_moved, hfsmp->vcbVN);
7062	}
7063
7064reclaim_filespace_done:
7065	if (iterator) {
7066		kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
7067	}
7068
7069#if CONFIG_PROTECT
7070	if (keys_generated) {
7071		cp_entry_destroy(&hfsmp->hfs_resize_cpentry);
7072	}
7073#endif
7074	return error;
7075}
7076
7077/*
7078 * Reclaim space at the end of a file system.
7079 *
7080 * Inputs -
7081 * 	allocLimit 	- start block of the space being reclaimed
7082 * 	reclaimblks 	- number of allocation blocks to reclaim
7083 */
7084static int
7085hfs_reclaimspace(struct hfsmount *hfsmp, u_int32_t allocLimit, u_int32_t reclaimblks, vfs_context_t context)
7086{
7087	int error = 0;
7088
7089	/*
7090	 * Preflight the bitmap to find out total number of blocks that need
7091	 * relocation.
7092	 *
7093	 * Note: Since allocLimit is set to the location of new alternate volume
7094	 * header, the check below does not account for blocks allocated for old
7095	 * alternate volume header.
7096	 */
7097	error = hfs_count_allocated(hfsmp, allocLimit, reclaimblks, &(hfsmp->hfs_resize_totalblocks));
7098	if (error) {
7099		printf ("hfs_reclaimspace: Unable to determine total blocks to reclaim error=%d\n", error);
7100		return error;
7101	}
7102	if (hfs_resize_debug) {
7103		printf ("hfs_reclaimspace: Total number of blocks to reclaim = %u\n", hfsmp->hfs_resize_totalblocks);
7104	}
7105
7106	/* Just to be safe, sync the content of the journal to the disk before we proceed */
7107	hfs_journal_flush(hfsmp, TRUE);
7108
7109	/* First, relocate journal file blocks if they're in the way.
7110	 * Doing this first will make sure that journal relocate code
7111	 * gets access to contiguous blocks on disk first.  The journal
7112	 * file has to be contiguous on the disk, otherwise resize will
7113	 * fail.
7114	 */
7115	error = hfs_reclaim_journal_file(hfsmp, allocLimit, context);
7116	if (error) {
7117		printf("hfs_reclaimspace: hfs_reclaim_journal_file failed (%d)\n", error);
7118		return error;
7119	}
7120
7121	/* Relocate journal info block blocks if they're in the way. */
7122	error = hfs_reclaim_journal_info_block(hfsmp, allocLimit, context);
7123	if (error) {
7124		printf("hfs_reclaimspace: hfs_reclaim_journal_info_block failed (%d)\n", error);
7125		return error;
7126	}
7127
7128	/* Relocate extents of the Extents B-tree if they're in the way.
7129	 * Relocating extents btree before other btrees is important as
7130	 * this will provide access to largest contiguous block range on
7131	 * the disk for relocating extents btree.  Note that extents btree
7132	 * can only have maximum of 8 extents.
7133	 */
7134	error = hfs_reclaim_file(hfsmp, hfsmp->hfs_extents_vp, kHFSExtentsFileID,
7135			kHFSDataForkType, allocLimit, context);
7136	if (error) {
7137		printf("hfs_reclaimspace: reclaim extents b-tree returned %d\n", error);
7138		return error;
7139	}
7140
7141	/* Relocate extents of the Allocation file if they're in the way. */
7142	error = hfs_reclaim_file(hfsmp, hfsmp->hfs_allocation_vp, kHFSAllocationFileID,
7143			kHFSDataForkType, allocLimit, context);
7144	if (error) {
7145		printf("hfs_reclaimspace: reclaim allocation file returned %d\n", error);
7146		return error;
7147	}
7148
7149	/* Relocate extents of the Catalog B-tree if they're in the way. */
7150	error = hfs_reclaim_file(hfsmp, hfsmp->hfs_catalog_vp, kHFSCatalogFileID,
7151			kHFSDataForkType, allocLimit, context);
7152	if (error) {
7153		printf("hfs_reclaimspace: reclaim catalog b-tree returned %d\n", error);
7154		return error;
7155	}
7156
7157	/* Relocate extents of the Attributes B-tree if they're in the way. */
7158	error = hfs_reclaim_file(hfsmp, hfsmp->hfs_attribute_vp, kHFSAttributesFileID,
7159			kHFSDataForkType, allocLimit, context);
7160	if (error) {
7161		printf("hfs_reclaimspace: reclaim attribute b-tree returned %d\n", error);
7162		return error;
7163	}
7164
7165	/* Relocate extents of the Startup File if there is one and they're in the way. */
7166	error = hfs_reclaim_file(hfsmp, hfsmp->hfs_startup_vp, kHFSStartupFileID,
7167			kHFSDataForkType, allocLimit, context);
7168	if (error) {
7169		printf("hfs_reclaimspace: reclaim startup file returned %d\n", error);
7170		return error;
7171	}
7172
7173	/*
7174	 * We need to make sure the alternate volume header gets flushed if we moved
7175	 * any extents in the volume header.  But we need to do that before
7176	 * shrinking the size of the volume, or else the journal code will panic
7177	 * with an invalid (too large) block number.
7178	 *
7179	 * Note that blks_moved will be set if ANY extent was moved, even
7180	 * if it was just an overflow extent.  In this case, the journal_flush isn't
7181	 * strictly required, but shouldn't hurt.
7182	 */
7183	if (hfsmp->hfs_resize_blocksmoved) {
7184		hfs_journal_flush(hfsmp, TRUE);
7185	}
7186
7187	/* Reclaim extents from catalog file records */
7188	error = hfs_reclaim_filespace(hfsmp, allocLimit, context);
7189	if (error) {
7190		printf ("hfs_reclaimspace: hfs_reclaim_filespace returned error=%d\n", error);
7191		return error;
7192	}
7193
7194	/* Reclaim extents from extent-based extended attributes, if any */
7195	error = hfs_reclaim_xattrspace(hfsmp, allocLimit, context);
7196	if (error) {
7197		printf ("hfs_reclaimspace: hfs_reclaim_xattrspace returned error=%d\n", error);
7198		return error;
7199	}
7200
7201	return error;
7202}
7203
7204
7205/*
7206 * Check if there are any extents (including overflow extents) that overlap
7207 * into the disk space that is being reclaimed.
7208 *
7209 * Output -
7210 * 	true  - One of the extents need to be relocated
7211 * 	false - No overflow extents need to be relocated, or there was an error
7212 */
7213static int
7214hfs_file_extent_overlaps(struct hfsmount *hfsmp, u_int32_t allocLimit, struct HFSPlusCatalogFile *filerec)
7215{
7216	struct BTreeIterator * iterator = NULL;
7217	struct FSBufferDescriptor btdata;
7218	HFSPlusExtentRecord extrec;
7219	HFSPlusExtentKey *extkeyptr;
7220	FCB *fcb;
7221	int overlapped = false;
7222	int i, j;
7223	int error;
7224	int lockflags = 0;
7225	u_int32_t endblock;
7226
7227	/* Check if data fork overlaps the target space */
7228	for (i = 0; i < kHFSPlusExtentDensity; ++i) {
7229		if (filerec->dataFork.extents[i].blockCount == 0) {
7230			break;
7231		}
7232		endblock = filerec->dataFork.extents[i].startBlock +
7233			filerec->dataFork.extents[i].blockCount;
7234		if (endblock > allocLimit) {
7235			overlapped = true;
7236			goto out;
7237		}
7238	}
7239
7240	/* Check if resource fork overlaps the target space */
7241	for (j = 0; j < kHFSPlusExtentDensity; ++j) {
7242		if (filerec->resourceFork.extents[j].blockCount == 0) {
7243			break;
7244		}
7245		endblock = filerec->resourceFork.extents[j].startBlock +
7246			filerec->resourceFork.extents[j].blockCount;
7247		if (endblock > allocLimit) {
7248			overlapped = true;
7249			goto out;
7250		}
7251	}
7252
7253	/* Return back if there are no overflow extents for this file */
7254	if ((i < kHFSPlusExtentDensity) && (j < kHFSPlusExtentDensity)) {
7255		goto out;
7256	}
7257
7258	if (kmem_alloc(kernel_map, (vm_offset_t *)&iterator, sizeof(*iterator))) {
7259		return 0;
7260	}
7261	bzero(iterator, sizeof(*iterator));
7262	extkeyptr = (HFSPlusExtentKey *)&iterator->key;
7263	extkeyptr->keyLength = kHFSPlusExtentKeyMaximumLength;
7264	extkeyptr->forkType = 0;
7265	extkeyptr->fileID = filerec->fileID;
7266	extkeyptr->startBlock = 0;
7267
7268	btdata.bufferAddress = &extrec;
7269	btdata.itemSize = sizeof(extrec);
7270	btdata.itemCount = 1;
7271
7272	fcb = VTOF(hfsmp->hfs_extents_vp);
7273
7274	lockflags = hfs_systemfile_lock(hfsmp, SFL_EXTENTS, HFS_SHARED_LOCK);
7275
7276	/* This will position the iterator just before the first overflow
7277	 * extent record for given fileID.  It will always return btNotFound,
7278	 * so we special case the error code.
7279	 */
7280	error = BTSearchRecord(fcb, iterator, &btdata, NULL, iterator);
7281	if (error && (error != btNotFound)) {
7282		goto out;
7283	}
7284
7285	/* BTIterateRecord() might return error if the btree is empty, and
7286	 * therefore we return that the extent does not overflow to the caller
7287	 */
7288	error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
7289	while (error == 0) {
7290		/* Stop when we encounter a different file. */
7291		if (extkeyptr->fileID != filerec->fileID) {
7292			break;
7293		}
7294		/* Check if any of the forks exist in the target space. */
7295		for (i = 0; i < kHFSPlusExtentDensity; ++i) {
7296			if (extrec[i].blockCount == 0) {
7297				break;
7298			}
7299			endblock = extrec[i].startBlock + extrec[i].blockCount;
7300			if (endblock > allocLimit) {
7301				overlapped = true;
7302				goto out;
7303			}
7304		}
7305		/* Look for more records. */
7306		error = BTIterateRecord(fcb, kBTreeNextRecord, iterator, &btdata, NULL);
7307	}
7308
7309out:
7310	if (lockflags) {
7311		hfs_systemfile_unlock(hfsmp, lockflags);
7312	}
7313	if (iterator) {
7314		kmem_free(kernel_map, (vm_offset_t)iterator, sizeof(*iterator));
7315	}
7316	return overlapped;
7317}
7318
7319
7320/*
7321 * Calculate the progress of a file system resize operation.
7322 */
7323__private_extern__
7324int
7325hfs_resize_progress(struct hfsmount *hfsmp, u_int32_t *progress)
7326{
7327	if ((hfsmp->hfs_flags & HFS_RESIZE_IN_PROGRESS) == 0) {
7328		return (ENXIO);
7329	}
7330
7331	if (hfsmp->hfs_resize_totalblocks > 0) {
7332		*progress = (u_int32_t)((hfsmp->hfs_resize_blocksmoved * 100ULL) / hfsmp->hfs_resize_totalblocks);
7333	} else {
7334		*progress = 0;
7335	}
7336
7337	return (0);
7338}
7339
7340
7341/*
7342 * Creates a UUID from a unique "name" in the HFS UUID Name space.
7343 * See version 3 UUID.
7344 */
7345static void
7346hfs_getvoluuid(struct hfsmount *hfsmp, uuid_t result)
7347{
7348	MD5_CTX  md5c;
7349	uint8_t  rawUUID[8];
7350
7351	((uint32_t *)rawUUID)[0] = hfsmp->vcbFndrInfo[6];
7352	((uint32_t *)rawUUID)[1] = hfsmp->vcbFndrInfo[7];
7353
7354	MD5Init( &md5c );
7355	MD5Update( &md5c, HFS_UUID_NAMESPACE_ID, sizeof( uuid_t ) );
7356	MD5Update( &md5c, rawUUID, sizeof (rawUUID) );
7357	MD5Final( result, &md5c );
7358
7359	result[6] = 0x30 | ( result[6] & 0x0F );
7360	result[8] = 0x80 | ( result[8] & 0x3F );
7361}
7362
7363/*
7364 * Get file system attributes.
7365 */
7366static int
7367hfs_vfs_getattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context)
7368{
7369#define HFS_ATTR_CMN_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST))
7370#define HFS_ATTR_FILE_VALIDMASK (ATTR_FILE_VALIDMASK & ~(ATTR_FILE_FILETYPE | ATTR_FILE_FORKCOUNT | ATTR_FILE_FORKLIST))
7371#define HFS_ATTR_CMN_VOL_VALIDMASK (ATTR_CMN_VALIDMASK & ~(ATTR_CMN_NAMEDATTRCOUNT | ATTR_CMN_NAMEDATTRLIST | ATTR_CMN_ACCTIME))
7372
7373	ExtendedVCB *vcb = VFSTOVCB(mp);
7374	struct hfsmount *hfsmp = VFSTOHFS(mp);
7375	u_int32_t freeCNIDs;
7376
7377	freeCNIDs = (u_int32_t)0xFFFFFFFF - (u_int32_t)hfsmp->vcbNxtCNID;
7378
7379	VFSATTR_RETURN(fsap, f_objcount, (u_int64_t)hfsmp->vcbFilCnt + (u_int64_t)hfsmp->vcbDirCnt);
7380	VFSATTR_RETURN(fsap, f_filecount, (u_int64_t)hfsmp->vcbFilCnt);
7381	VFSATTR_RETURN(fsap, f_dircount, (u_int64_t)hfsmp->vcbDirCnt);
7382	VFSATTR_RETURN(fsap, f_maxobjcount, (u_int64_t)0xFFFFFFFF);
7383	VFSATTR_RETURN(fsap, f_iosize, (size_t)cluster_max_io_size(mp, 0));
7384	VFSATTR_RETURN(fsap, f_blocks, (u_int64_t)hfsmp->totalBlocks);
7385	VFSATTR_RETURN(fsap, f_bfree, (u_int64_t)hfs_freeblks(hfsmp, 0));
7386	VFSATTR_RETURN(fsap, f_bavail, (u_int64_t)hfs_freeblks(hfsmp, 1));
7387	VFSATTR_RETURN(fsap, f_bsize, (u_int32_t)vcb->blockSize);
7388	/* XXX needs clarification */
7389	VFSATTR_RETURN(fsap, f_bused, hfsmp->totalBlocks - hfs_freeblks(hfsmp, 1));
7390	/* Maximum files is constrained by total blocks. */
7391	VFSATTR_RETURN(fsap, f_files, (u_int64_t)(hfsmp->totalBlocks - 2));
7392	VFSATTR_RETURN(fsap, f_ffree, MIN((u_int64_t)freeCNIDs, (u_int64_t)hfs_freeblks(hfsmp, 1)));
7393
7394	fsap->f_fsid.val[0] = hfsmp->hfs_raw_dev;
7395	fsap->f_fsid.val[1] = vfs_typenum(mp);
7396	VFSATTR_SET_SUPPORTED(fsap, f_fsid);
7397
7398	VFSATTR_RETURN(fsap, f_signature, vcb->vcbSigWord);
7399	VFSATTR_RETURN(fsap, f_carbon_fsid, 0);
7400
7401	if (VFSATTR_IS_ACTIVE(fsap, f_capabilities)) {
7402		vol_capabilities_attr_t *cap;
7403
7404		cap = &fsap->f_capabilities;
7405
7406		if (hfsmp->hfs_flags & HFS_STANDARD) {
7407			cap->capabilities[VOL_CAPABILITIES_FORMAT] =
7408				VOL_CAP_FMT_PERSISTENTOBJECTIDS |
7409				VOL_CAP_FMT_CASE_PRESERVING |
7410				VOL_CAP_FMT_FAST_STATFS |
7411				VOL_CAP_FMT_HIDDEN_FILES |
7412				VOL_CAP_FMT_PATH_FROM_ID;
7413		} else {
7414			cap->capabilities[VOL_CAPABILITIES_FORMAT] =
7415				VOL_CAP_FMT_PERSISTENTOBJECTIDS |
7416				VOL_CAP_FMT_SYMBOLICLINKS |
7417				VOL_CAP_FMT_HARDLINKS |
7418				VOL_CAP_FMT_JOURNAL |
7419				VOL_CAP_FMT_ZERO_RUNS |
7420				(hfsmp->jnl ? VOL_CAP_FMT_JOURNAL_ACTIVE : 0) |
7421				(hfsmp->hfs_flags & HFS_CASE_SENSITIVE ? VOL_CAP_FMT_CASE_SENSITIVE : 0) |
7422				VOL_CAP_FMT_CASE_PRESERVING |
7423				VOL_CAP_FMT_FAST_STATFS |
7424				VOL_CAP_FMT_2TB_FILESIZE |
7425				VOL_CAP_FMT_HIDDEN_FILES |
7426#if HFS_COMPRESSION
7427				VOL_CAP_FMT_PATH_FROM_ID |
7428				VOL_CAP_FMT_DECMPFS_COMPRESSION;
7429#else
7430				VOL_CAP_FMT_PATH_FROM_ID;
7431#endif
7432		}
7433		cap->capabilities[VOL_CAPABILITIES_INTERFACES] =
7434			VOL_CAP_INT_SEARCHFS |
7435			VOL_CAP_INT_ATTRLIST |
7436			VOL_CAP_INT_NFSEXPORT |
7437			VOL_CAP_INT_READDIRATTR |
7438			VOL_CAP_INT_EXCHANGEDATA |
7439			VOL_CAP_INT_ALLOCATE |
7440			VOL_CAP_INT_VOL_RENAME |
7441			VOL_CAP_INT_ADVLOCK |
7442			VOL_CAP_INT_FLOCK |
7443#if NAMEDSTREAMS
7444			VOL_CAP_INT_EXTENDED_ATTR |
7445			VOL_CAP_INT_NAMEDSTREAMS;
7446#else
7447			VOL_CAP_INT_EXTENDED_ATTR;
7448#endif
7449		cap->capabilities[VOL_CAPABILITIES_RESERVED1] = 0;
7450		cap->capabilities[VOL_CAPABILITIES_RESERVED2] = 0;
7451
7452		cap->valid[VOL_CAPABILITIES_FORMAT] =
7453			VOL_CAP_FMT_PERSISTENTOBJECTIDS |
7454			VOL_CAP_FMT_SYMBOLICLINKS |
7455			VOL_CAP_FMT_HARDLINKS |
7456			VOL_CAP_FMT_JOURNAL |
7457			VOL_CAP_FMT_JOURNAL_ACTIVE |
7458			VOL_CAP_FMT_NO_ROOT_TIMES |
7459			VOL_CAP_FMT_SPARSE_FILES |
7460			VOL_CAP_FMT_ZERO_RUNS |
7461			VOL_CAP_FMT_CASE_SENSITIVE |
7462			VOL_CAP_FMT_CASE_PRESERVING |
7463			VOL_CAP_FMT_FAST_STATFS |
7464			VOL_CAP_FMT_2TB_FILESIZE |
7465			VOL_CAP_FMT_OPENDENYMODES |
7466			VOL_CAP_FMT_HIDDEN_FILES |
7467#if HFS_COMPRESSION
7468			VOL_CAP_FMT_PATH_FROM_ID |
7469			VOL_CAP_FMT_DECMPFS_COMPRESSION;
7470#else
7471			VOL_CAP_FMT_PATH_FROM_ID;
7472#endif
7473		cap->valid[VOL_CAPABILITIES_INTERFACES] =
7474			VOL_CAP_INT_SEARCHFS |
7475			VOL_CAP_INT_ATTRLIST |
7476			VOL_CAP_INT_NFSEXPORT |
7477			VOL_CAP_INT_READDIRATTR |
7478			VOL_CAP_INT_EXCHANGEDATA |
7479			VOL_CAP_INT_COPYFILE |
7480			VOL_CAP_INT_ALLOCATE |
7481			VOL_CAP_INT_VOL_RENAME |
7482			VOL_CAP_INT_ADVLOCK |
7483			VOL_CAP_INT_FLOCK |
7484			VOL_CAP_INT_MANLOCK |
7485#if NAMEDSTREAMS
7486			VOL_CAP_INT_EXTENDED_ATTR |
7487			VOL_CAP_INT_NAMEDSTREAMS;
7488#else
7489			VOL_CAP_INT_EXTENDED_ATTR;
7490#endif
7491		cap->valid[VOL_CAPABILITIES_RESERVED1] = 0;
7492		cap->valid[VOL_CAPABILITIES_RESERVED2] = 0;
7493		VFSATTR_SET_SUPPORTED(fsap, f_capabilities);
7494	}
7495	if (VFSATTR_IS_ACTIVE(fsap, f_attributes)) {
7496		vol_attributes_attr_t *attrp = &fsap->f_attributes;
7497
7498        	attrp->validattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK;
7499        	attrp->validattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO;
7500        	attrp->validattr.dirattr = ATTR_DIR_VALIDMASK;
7501        	attrp->validattr.fileattr = HFS_ATTR_FILE_VALIDMASK;
7502        	attrp->validattr.forkattr = 0;
7503
7504        	attrp->nativeattr.commonattr = HFS_ATTR_CMN_VOL_VALIDMASK;
7505        	attrp->nativeattr.volattr = ATTR_VOL_VALIDMASK & ~ATTR_VOL_INFO;
7506        	attrp->nativeattr.dirattr = ATTR_DIR_VALIDMASK;
7507        	attrp->nativeattr.fileattr = HFS_ATTR_FILE_VALIDMASK;
7508        	attrp->nativeattr.forkattr = 0;
7509		VFSATTR_SET_SUPPORTED(fsap, f_attributes);
7510	}
7511	fsap->f_create_time.tv_sec = hfsmp->hfs_itime;
7512	fsap->f_create_time.tv_nsec = 0;
7513	VFSATTR_SET_SUPPORTED(fsap, f_create_time);
7514	fsap->f_modify_time.tv_sec = hfsmp->vcbLsMod;
7515	fsap->f_modify_time.tv_nsec = 0;
7516	VFSATTR_SET_SUPPORTED(fsap, f_modify_time);
7517
7518	fsap->f_backup_time.tv_sec = hfsmp->vcbVolBkUp;
7519	fsap->f_backup_time.tv_nsec = 0;
7520	VFSATTR_SET_SUPPORTED(fsap, f_backup_time);
7521	if (VFSATTR_IS_ACTIVE(fsap, f_fssubtype)) {
7522		u_int16_t subtype = 0;
7523
7524		/*
7525		 * Subtypes (flavors) for HFS
7526		 *   0:   Mac OS Extended
7527		 *   1:   Mac OS Extended (Journaled)
7528		 *   2:   Mac OS Extended (Case Sensitive)
7529		 *   3:   Mac OS Extended (Case Sensitive, Journaled)
7530		 *   4 - 127:   Reserved
7531		 * 128:   Mac OS Standard
7532		 *
7533		 */
7534		if (hfsmp->hfs_flags & HFS_STANDARD) {
7535			subtype = HFS_SUBTYPE_STANDARDHFS;
7536		} else /* HFS Plus */ {
7537			if (hfsmp->jnl)
7538				subtype |= HFS_SUBTYPE_JOURNALED;
7539			if (hfsmp->hfs_flags & HFS_CASE_SENSITIVE)
7540				subtype |= HFS_SUBTYPE_CASESENSITIVE;
7541		}
7542		fsap->f_fssubtype = subtype;
7543		VFSATTR_SET_SUPPORTED(fsap, f_fssubtype);
7544	}
7545
7546	if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
7547		strlcpy(fsap->f_vol_name, (char *) hfsmp->vcbVN, MAXPATHLEN);
7548		VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
7549	}
7550	if (VFSATTR_IS_ACTIVE(fsap, f_uuid)) {
7551		hfs_getvoluuid(hfsmp, fsap->f_uuid);
7552		VFSATTR_SET_SUPPORTED(fsap, f_uuid);
7553	}
7554	return (0);
7555}
7556
7557/*
7558 * Perform a volume rename.  Requires the FS' root vp.
7559 */
7560static int
7561hfs_rename_volume(struct vnode *vp, const char *name, proc_t p)
7562{
7563	ExtendedVCB *vcb = VTOVCB(vp);
7564	struct cnode *cp = VTOC(vp);
7565	struct hfsmount *hfsmp = VTOHFS(vp);
7566	struct cat_desc to_desc;
7567	struct cat_desc todir_desc;
7568	struct cat_desc new_desc;
7569	cat_cookie_t cookie;
7570	int lockflags;
7571	int error = 0;
7572	char converted_volname[256];
7573	size_t volname_length = 0;
7574	size_t conv_volname_length = 0;
7575
7576
7577	/*
7578	 * Ignore attempts to rename a volume to a zero-length name.
7579	 */
7580	if (name[0] == 0)
7581		return(0);
7582
7583	bzero(&to_desc, sizeof(to_desc));
7584	bzero(&todir_desc, sizeof(todir_desc));
7585	bzero(&new_desc, sizeof(new_desc));
7586	bzero(&cookie, sizeof(cookie));
7587
7588	todir_desc.cd_parentcnid = kHFSRootParentID;
7589	todir_desc.cd_cnid = kHFSRootFolderID;
7590	todir_desc.cd_flags = CD_ISDIR;
7591
7592	to_desc.cd_nameptr = (const u_int8_t *)name;
7593	to_desc.cd_namelen = strlen(name);
7594	to_desc.cd_parentcnid = kHFSRootParentID;
7595	to_desc.cd_cnid = cp->c_cnid;
7596	to_desc.cd_flags = CD_ISDIR;
7597
7598	if ((error = hfs_lock(cp, HFS_EXCLUSIVE_LOCK)) == 0) {
7599		if ((error = hfs_start_transaction(hfsmp)) == 0) {
7600			if ((error = cat_preflight(hfsmp, CAT_RENAME, &cookie, p)) == 0) {
7601				lockflags = hfs_systemfile_lock(hfsmp, SFL_CATALOG, HFS_EXCLUSIVE_LOCK);
7602
7603				error = cat_rename(hfsmp, &cp->c_desc, &todir_desc, &to_desc, &new_desc);
7604
7605				/*
7606				 * If successful, update the name in the VCB, ensure it's terminated.
7607				 */
7608				if (!error) {
7609					strlcpy((char *)vcb->vcbVN, name, sizeof(vcb->vcbVN));
7610					volname_length = strlen ((const char*)vcb->vcbVN);
7611#define DKIOCCSSETLVNAME _IOW('d', 198, char[256])
7612					/* Send the volume name down to CoreStorage if necessary */
7613					error = utf8_normalizestr(vcb->vcbVN, volname_length, (u_int8_t*)converted_volname, &conv_volname_length, 256, UTF_PRECOMPOSED);
7614					if (error == 0) {
7615						(void) VNOP_IOCTL (hfsmp->hfs_devvp, DKIOCCSSETLVNAME, converted_volname, 0, vfs_context_current());
7616					}
7617					error = 0;
7618				}
7619
7620				hfs_systemfile_unlock(hfsmp, lockflags);
7621				cat_postflight(hfsmp, &cookie, p);
7622
7623				if (error)
7624					MarkVCBDirty(vcb);
7625				(void) hfs_flushvolumeheader(hfsmp, MNT_WAIT, 0);
7626			}
7627			hfs_end_transaction(hfsmp);
7628		}
7629		if (!error) {
7630			/* Release old allocated name buffer */
7631			if (cp->c_desc.cd_flags & CD_HASBUF) {
7632				const char *tmp_name = (const char *)cp->c_desc.cd_nameptr;
7633
7634				cp->c_desc.cd_nameptr = 0;
7635				cp->c_desc.cd_namelen = 0;
7636				cp->c_desc.cd_flags &= ~CD_HASBUF;
7637				vfs_removename(tmp_name);
7638			}
7639			/* Update cnode's catalog descriptor */
7640			replace_desc(cp, &new_desc);
7641			vcb->volumeNameEncodingHint = new_desc.cd_encoding;
7642			cp->c_touch_chgtime = TRUE;
7643		}
7644
7645		hfs_unlock(cp);
7646	}
7647
7648	return(error);
7649}
7650
7651/*
7652 * Get file system attributes.
7653 */
7654static int
7655hfs_vfs_setattr(struct mount *mp, struct vfs_attr *fsap, __unused vfs_context_t context)
7656{
7657	kauth_cred_t cred = vfs_context_ucred(context);
7658	int error = 0;
7659
7660	/*
7661	 * Must be superuser or owner of filesystem to change volume attributes
7662	 */
7663	if (!kauth_cred_issuser(cred) && (kauth_cred_getuid(cred) != vfs_statfs(mp)->f_owner))
7664		return(EACCES);
7665
7666	if (VFSATTR_IS_ACTIVE(fsap, f_vol_name)) {
7667		vnode_t root_vp;
7668
7669		error = hfs_vfs_root(mp, &root_vp, context);
7670		if (error)
7671			goto out;
7672
7673		error = hfs_rename_volume(root_vp, fsap->f_vol_name, vfs_context_proc(context));
7674		(void) vnode_put(root_vp);
7675		if (error)
7676			goto out;
7677
7678		VFSATTR_SET_SUPPORTED(fsap, f_vol_name);
7679	}
7680
7681out:
7682	return error;
7683}
7684
7685/* If a runtime corruption is detected, set the volume inconsistent
7686 * bit in the volume attributes.  The volume inconsistent bit is a persistent
7687 * bit which represents that the volume is corrupt and needs repair.
7688 * The volume inconsistent bit can be set from the kernel when it detects
7689 * runtime corruption or from file system repair utilities like fsck_hfs when
7690 * a repair operation fails.  The bit should be cleared only from file system
7691 * verify/repair utility like fsck_hfs when a verify/repair succeeds.
7692 */
7693void hfs_mark_volume_inconsistent(struct hfsmount *hfsmp)
7694{
7695	HFS_MOUNT_LOCK(hfsmp, TRUE);
7696	if ((hfsmp->vcbAtrb & kHFSVolumeInconsistentMask) == 0) {
7697		hfsmp->vcbAtrb |= kHFSVolumeInconsistentMask;
7698		MarkVCBDirty(hfsmp);
7699	}
7700	if ((hfsmp->hfs_flags & HFS_READ_ONLY)==0) {
7701		/* Log information to ASL log */
7702		fslog_fs_corrupt(hfsmp->hfs_mp);
7703		printf("hfs: Runtime corruption detected on %s, fsck will be forced on next mount.\n", hfsmp->vcbVN);
7704	}
7705	HFS_MOUNT_UNLOCK(hfsmp, TRUE);
7706}
7707
7708/* Replay the journal on the device node provided.  Returns zero if
7709 * journal replay succeeded or no journal was supposed to be replayed.
7710 */
7711static int hfs_journal_replay(vnode_t devvp, vfs_context_t context)
7712{
7713	int retval = 0;
7714	int error = 0;
7715	struct mount *mp = NULL;
7716	struct hfs_mount_args *args = NULL;
7717
7718	/* Replay allowed only on raw devices */
7719	if (!vnode_ischr(devvp) && !vnode_isblk(devvp)) {
7720		retval = EINVAL;
7721		goto out;
7722	}
7723
7724	/* Create dummy mount structures */
7725	MALLOC(mp, struct mount *, sizeof(struct mount), M_TEMP, M_WAITOK);
7726	if (mp == NULL) {
7727		retval = ENOMEM;
7728		goto out;
7729	}
7730	bzero(mp, sizeof(struct mount));
7731	mount_lock_init(mp);
7732
7733	MALLOC(args, struct hfs_mount_args *, sizeof(struct hfs_mount_args), M_TEMP, M_WAITOK);
7734	if (args == NULL) {
7735		retval = ENOMEM;
7736		goto out;
7737	}
7738	bzero(args, sizeof(struct hfs_mount_args));
7739
7740	retval = hfs_mountfs(devvp, mp, args, 1, context);
7741	buf_flushdirtyblks(devvp, TRUE, 0, "hfs_journal_replay");
7742
7743	/* FSYNC the devnode to be sure all data has been flushed */
7744	error = VNOP_FSYNC(devvp, MNT_WAIT, context);
7745	if (error) {
7746		retval = error;
7747	}
7748
7749out:
7750	if (mp) {
7751		mount_lock_destroy(mp);
7752		FREE(mp, M_TEMP);
7753	}
7754	if (args) {
7755		FREE(args, M_TEMP);
7756	}
7757	return retval;
7758}
7759
7760/*
7761 * hfs vfs operations.
7762 */
7763struct vfsops hfs_vfsops = {
7764	hfs_mount,
7765	hfs_start,
7766	hfs_unmount,
7767	hfs_vfs_root,
7768	hfs_quotactl,
7769	hfs_vfs_getattr, 	/* was hfs_statfs */
7770	hfs_sync,
7771	hfs_vfs_vget,
7772	hfs_fhtovp,
7773	hfs_vptofh,
7774	hfs_init,
7775	hfs_sysctl,
7776	hfs_vfs_setattr,
7777	{NULL}
7778};
7779