/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2022 The FreeBSD Foundation
 *
 * This software was developed by Mark Johnston under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/queue.h>

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <stdalign.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <util.h>

#include "makefs.h"
#include "zfs.h"

#define	VDEV_LABEL_SPACE	\
	((off_t)(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE))
_Static_assert(VDEV_LABEL_SPACE <= MINDEVSIZE, "");

#define	MINMSSIZE		((off_t)1 << 24) /* 16MB */
#define	DFLTMSSIZE		((off_t)1 << 29) /* 512MB */
#define	MAXMSSIZE		((off_t)1 << 34) /* 16GB */

#define	INDIR_LEVELS		6
/* Indirect blocks are always 128KB. */
#define	BLKPTR_PER_INDIR	(MAXBLOCKSIZE / sizeof(blkptr_t))

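/*
 * State for writing a dnode's data blocks in order of increasing offset: one
 * pending indirect block buffer per level, the location and amount of space
 * preallocated for indirect blocks, and the data block size.
 */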
struct dnode_cursor {
	char		inddir[INDIR_LEVELS][MAXBLOCKSIZE];
	off_t		indloc;
	off_t		indspace;
	dnode_phys_t	*dnode;
	off_t		dataoff;
	off_t		datablksz;
};

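/*
 * Allocate the ZFS-specific option state and register the "-o" options
 * understood by this backend.  An illustrative invocation (the option names
 * below match the table; the sizes are arbitrary):
 *
 *	makefs -t zfs -s 4g -o poolname=tank zfs.img /path/to/tree
 */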
void
zfs_prep_opts(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	size_t align;

	align = alignof(uint64_t);
	zfs = aligned_alloc(align, roundup2(sizeof(*zfs), align));
	if (zfs == NULL)
		err(1, "aligned_alloc");
	memset(zfs, 0, sizeof(*zfs));

	const option_t zfs_options[] = {
		{ '\0', "bootfs", &zfs->bootfs, OPT_STRPTR,
		  0, 0, "Bootable dataset" },
		{ '\0', "mssize", &zfs->mssize, OPT_INT64,
		  MINMSSIZE, MAXMSSIZE, "Metaslab size" },
		{ '\0', "poolname", &zfs->poolname, OPT_STRPTR,
		  0, 0, "ZFS pool name" },
		{ '\0', "rootpath", &zfs->rootpath, OPT_STRPTR,
		  0, 0, "Prefix for all dataset mount points" },
		{ '\0', "ashift", &zfs->ashift, OPT_INT32,
		  MINBLOCKSHIFT, MAXBLOCKSHIFT, "ZFS pool ashift" },
		{ '\0', "nowarn", &zfs->nowarn, OPT_BOOL,
		  0, 0, "Suppress warning about experimental ZFS support" },
		{ .name = NULL }
	};

	STAILQ_INIT(&zfs->datasetdescs);

	fsopts->fs_specific = zfs;
	fsopts->fs_options = copy_opts(zfs_options);
}

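/*
 * Handle a single "-o" option.  "fs=..." arguments describe datasets; they are
 * merely stashed away here and parsed later, by dsl_init().  Everything else
 * is dispatched to the generic option table set up in zfs_prep_opts().
 */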
int
zfs_parse_opts(const char *option, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	struct dataset_desc *dsdesc;
	char buf[BUFSIZ], *opt, *val;
	int rv;

	zfs = fsopts->fs_specific;

	opt = val = estrdup(option);
	opt = strsep(&val, "=");
	if (strcmp(opt, "fs") == 0) {
		if (val == NULL)
			errx(1, "invalid filesystem parameters `%s'", option);

		/*
		 * Dataset descriptions will be parsed later, in dsl_init().
		 * Just stash them away for now.
		 */
		dsdesc = ecalloc(1, sizeof(*dsdesc));
		dsdesc->params = estrdup(val);
		free(opt);
		STAILQ_INSERT_TAIL(&zfs->datasetdescs, dsdesc, next);
		return (1);
	}
	free(opt);

	rv = set_option(fsopts->fs_options, option, buf, sizeof(buf));
	return (rv == -1 ? 0 : 1);
}

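/*
 * Derive the vdev size, the allocatable space and the metaslab geometry from
 * the image size bounds and the pool's ashift.
 */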
static void
zfs_size_vdev(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	off_t asize, mssize, vdevsize, vdevsize1;

	zfs = fsopts->fs_specific;

	assert(fsopts->maxsize != 0);
	assert(zfs->ashift != 0);

	/*
	 * Figure out how big the vdev should be.
	 */
	vdevsize = rounddown2(fsopts->maxsize, 1 << zfs->ashift);
	if (vdevsize < MINDEVSIZE)
		errx(1, "maximum image size is too small");
	if (vdevsize < fsopts->minsize || vdevsize > fsopts->maxsize) {
		errx(1, "image size bounds must be multiples of %d",
		    1 << zfs->ashift);
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	/*
	 * Size metaslabs according to the following heuristic:
	 * - provide at least 8 metaslabs,
	 * - without using a metaslab size larger than 512MB.
	 * This approximates what OpenZFS does without being complicated.  In
	 * practice we expect pools to be expanded upon first use, and OpenZFS
	 * does not resize metaslabs in that case, so there is no right answer
	 * here.  In general we want to provide large metaslabs even if the
	 * image size is small, and 512MB is a reasonable size for pools up to
	 * several hundred gigabytes.
	 *
	 * The user may override this heuristic using the "-o mssize" option.
	 */
	mssize = zfs->mssize;
	if (mssize == 0) {
		mssize = MAX(MIN(asize / 8, DFLTMSSIZE), MINMSSIZE);
		if (!powerof2(mssize))
			mssize = 1l << (flsll(mssize) - 1);
	}
	if (!powerof2(mssize))
		errx(1, "metaslab size must be a power of 2");

	/*
	 * If we have some slop left over, try to cover it by resizing the vdev,
	 * subject to the maxsize and minsize parameters.
	 */
	if (asize % mssize != 0) {
		vdevsize1 = rounddown2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 < fsopts->minsize)
			vdevsize1 = roundup2(asize, mssize) + VDEV_LABEL_SPACE;
		if (vdevsize1 <= fsopts->maxsize)
			vdevsize = vdevsize1;
	}
	asize = vdevsize - VDEV_LABEL_SPACE;

	zfs->asize = asize;
	zfs->vdevsize = vdevsize;
	zfs->mssize = mssize;
	zfs->msshift = flsll(mssize) - 1;
	zfs->mscount = asize / mssize;
}

/*
 * Validate options and set some default values.
 */
static void
zfs_check_opts(fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;

	zfs = fsopts->fs_specific;

	if (fsopts->offset != 0)
		errx(1, "unhandled offset option");
	if (fsopts->maxsize == 0)
		errx(1, "an image size must be specified");

	if (zfs->poolname == NULL)
		errx(1, "a pool name must be specified");
	if (!isalpha(zfs->poolname[0]))
		errx(1, "the pool name must begin with a letter");
	for (size_t i = 0, len = strlen(zfs->poolname); i < len; i++) {
		if (!isalnum(zfs->poolname[i]) && zfs->poolname[i] != '_')
			errx(1, "invalid character '%c' in pool name",
			    zfs->poolname[i]);
	}
	if (strcmp(zfs->poolname, "mirror") == 0 ||
	    strcmp(zfs->poolname, "raidz") == 0 ||
	    strcmp(zfs->poolname, "draid") == 0) {
		errx(1, "pool name '%s' is reserved and cannot be used",
		    zfs->poolname);
	}

	if (zfs->rootpath == NULL)
		easprintf(&zfs->rootpath, "/%s", zfs->poolname);
	if (zfs->rootpath[0] != '/')
		errx(1, "mountpoint `%s' must be absolute", zfs->rootpath);

	if (zfs->ashift == 0)
		zfs->ashift = 12;

	zfs_size_vdev(fsopts);
}

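/*
 * Release the option state allocated in zfs_prep_opts() and during option
 * parsing.
 */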
void
zfs_cleanup_opts(fsinfo_t *fsopts)
{
	struct dataset_desc *d, *tmp;
	zfs_opt_t *zfs;

	zfs = fsopts->fs_specific;
	free(zfs->rootpath);
	free(zfs->bootfs);
	free(__DECONST(void *, zfs->poolname));
	STAILQ_FOREACH_SAFE(d, &zfs->datasetdescs, next, tmp) {
		free(d->params);
		free(d);
	}
	free(zfs);
	free(fsopts->fs_options);
}

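/*
 * Helpers to compute the size of an exported nvlist and to copy it, header
 * followed by packed data, into a buffer.
 */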
static size_t
nvlist_size(const nvlist_t *nvl)
{
	return (sizeof(nvl->nv_header) + nvl->nv_size);
}

static void
nvlist_copy(const nvlist_t *nvl, char *buf, size_t sz)
{
	assert(sz >= nvlist_size(nvl));

	memcpy(buf, &nvl->nv_header, sizeof(nvl->nv_header));
	memcpy(buf + sizeof(nvl->nv_header), nvl->nv_data, nvl->nv_size);
}

/*
 * Avoid returning a GUID of 0, since something might interpret that as
 * meaning that the GUID is uninitialized.
 */
uint64_t
randomguid(void)
{
	uint64_t ret;

	do {
		ret = ((uint64_t)random() << 32) | random();
	} while (ret == 0);

	return (ret);
}

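/*
 * Create an nvlist of pool-wide configuration properties.  It is used both for
 * the MOS config object and for the config nvlist embedded in vdev labels.
 */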
static nvlist_t *
pool_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *featuresnv, *poolnv;

	poolnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_TXG, TXG);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VERSION, SPA_VERSION);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_STATE, POOL_STATE_EXPORTED);
	nvlist_add_string(poolnv, ZPOOL_CONFIG_POOL_NAME, zfs->poolname);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_POOL_GUID, zfs->poolguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_TOP_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(poolnv, ZPOOL_CONFIG_VDEV_CHILDREN, 1);

	featuresnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_nvlist(poolnv, ZPOOL_CONFIG_FEATURES_FOR_READ, featuresnv);
	nvlist_destroy(featuresnv);

	return (poolnv);
}

static nvlist_t *
pool_disk_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv;

	assert(zfs->objarrid != 0);

	diskvdevnv = nvlist_create(NV_UNIQUE_NAME);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASHIFT, zfs->ashift);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ASIZE, zfs->asize);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_GUID, zfs->vdevguid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_string(diskvdevnv, ZPOOL_CONFIG_PATH, "/dev/null");
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_WHOLE_DISK, 1);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_ARRAY,
	    zfs->objarrid);
	nvlist_add_uint64(diskvdevnv, ZPOOL_CONFIG_METASLAB_SHIFT,
	    zfs->msshift);

	return (diskvdevnv);
}

static nvlist_t *
pool_root_vdev_config_nvcreate(zfs_opt_t *zfs)
{
	nvlist_t *diskvdevnv, *rootvdevnv;

	diskvdevnv = pool_disk_vdev_config_nvcreate(zfs);
	rootvdevnv = nvlist_create(NV_UNIQUE_NAME);

	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_ID, 0);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_GUID, zfs->poolguid);
	nvlist_add_string(rootvdevnv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
	nvlist_add_uint64(rootvdevnv, ZPOOL_CONFIG_CREATE_TXG, TXG);
	nvlist_add_nvlist_array(rootvdevnv, ZPOOL_CONFIG_CHILDREN, &diskvdevnv,
	    1);
	nvlist_destroy(diskvdevnv);

	return (rootvdevnv);
}

/*
 * Create the pool's "config" object, which contains an nvlist describing pool
 * parameters and the vdev topology.  It is similar but not identical to the
 * nvlist stored in vdev labels.  The main difference is that vdev labels do not
 * describe the full vdev tree and in particular do not contain the "root"
 * meta-vdev.
 */
static void
pool_init_objdir_config(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	nvlist_t *poolconfig, *vdevconfig;
	void *configbuf;
	uint64_t dnid;
	off_t configloc, configblksz;
	int error;

	dnode = objset_dnode_bonus_alloc(zfs->mos, DMU_OT_PACKED_NVLIST,
	    DMU_OT_PACKED_NVLIST_SIZE, sizeof(uint64_t), &dnid);

	poolconfig = pool_config_nvcreate(zfs);

	vdevconfig = pool_root_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");

	configblksz = nvlist_size(poolconfig);
	configloc = objset_space_alloc(zfs, zfs->mos, &configblksz);
	configbuf = ecalloc(1, configblksz);
	nvlist_copy(poolconfig, configbuf, configblksz);

	vdev_pwrite_dnode_data(zfs, dnode, configbuf, configblksz, configloc);

	dnode->dn_datablkszsec = configblksz >> MINBLOCKSHIFT;
	dnode->dn_flags = DNODE_FLAG_USED_BYTES;
	*(uint64_t *)DN_BONUS(dnode) = nvlist_size(poolconfig);

	zap_add_uint64(objdir, DMU_POOL_CONFIG, dnid);

	nvlist_destroy(poolconfig);
	free(configbuf);
}

/*
 * Add block pointer list objects, used for deferred frees.  We don't do
 * anything with them, but they need to be present or OpenZFS will refuse to
 * import the pool.
 */
static void
pool_init_objdir_bplists(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	uint64_t dnid;

	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
	    BPOBJ_SIZE_V2, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FREE_BPOBJ, dnid);

	(void)objset_dnode_bonus_alloc(zfs->mos, DMU_OT_BPOBJ, DMU_OT_BPOBJ_HDR,
	    BPOBJ_SIZE_V2, &dnid);
	zap_add_uint64(objdir, DMU_POOL_SYNC_BPLIST, dnid);
}

/*
 * Add required feature metadata objects.  We don't know anything about ZFS
 * features, so the objects are just empty ZAPs.
 */
static void
pool_init_objdir_feature_maps(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t dnid;

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_READ, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURES_FOR_WRITE, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));

	dnode = objset_dnode_alloc(zfs->mos, DMU_OTN_ZAP_METADATA, &dnid);
	zap_add_uint64(objdir, DMU_POOL_FEATURE_DESCRIPTIONS, dnid);
	zap_write(zfs, zap_alloc(zfs->mos, dnode));
}

static void
pool_init_objdir_dsl(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	zap_add_uint64(objdir, DMU_POOL_ROOT_DATASET,
	    dsl_dir_id(zfs->rootdsldir));
}

static void
pool_init_objdir_poolprops(zfs_opt_t *zfs, zfs_zap_t *objdir)
{
	dnode_phys_t *dnode;
	uint64_t id;

	dnode = objset_dnode_alloc(zfs->mos, DMU_OT_POOL_PROPS, &id);
	zap_add_uint64(objdir, DMU_POOL_PROPS, id);

	zfs->poolprops = zap_alloc(zfs->mos, dnode);
}

/*
 * Initialize the MOS object directory, the root of virtually all of the pool's
 * data and metadata.
 */
static void
pool_init_objdir(zfs_opt_t *zfs)
{
	zfs_zap_t *zap;
	dnode_phys_t *objdir;

	objdir = objset_dnode_lookup(zfs->mos, DMU_POOL_DIRECTORY_OBJECT);

	zap = zap_alloc(zfs->mos, objdir);
	pool_init_objdir_config(zfs, zap);
	pool_init_objdir_bplists(zfs, zap);
	pool_init_objdir_feature_maps(zfs, zap);
	pool_init_objdir_dsl(zfs, zap);
	pool_init_objdir_poolprops(zfs, zap);
	zap_write(zfs, zap);
}

/*
 * Initialize the meta-object set (MOS) and immediately write out several
 * special objects whose contents are already finalized, including the object
 * directory.
 *
 * Once the MOS is finalized, it'll look roughly like this:
 *
 *	object directory (ZAP)
 *	|-> vdev config object (nvlist)
 *	|-> features for read
 *	|-> features for write
 *	|-> feature descriptions
 *	|-> sync bplist
 *	|-> free bplist
 *	|-> pool properties
 *	L-> root DSL directory
 *	    |-> DSL child directory (ZAP)
 *	    |   |-> $MOS (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   L-> props (ZAP)
 *	    |   |-> $FREE (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   L-> props (ZAP)
 *	    |   |-> $ORIGIN (DSL dir)
 *	    |   |   |-> child map
 *	    |   |   |-> dataset
 *	    |   |   |   L-> deadlist
 *	    |   |   |-> snapshot
 *	    |   |   |   |-> deadlist
 *	    |   |   |   L-> snapshot names
 *	    |   |   |-> props (ZAP)
 *	    |   |   L-> clones (ZAP)
 *	    |   |-> dataset 1 (DSL dir)
 *	    |   |   |-> DSL dataset
 *	    |   |   |   |-> snapshot names
 *	    |   |   |   L-> deadlist
 *	    |   |   |-> child map
 *	    |   |   |   L-> ...
 *	    |   |   L-> props
 *	    |   |-> dataset 2
 *	    |   |   L-> ...
 *	    |   |-> ...
 *	    |   L-> dataset n
 *	    |-> DSL root dataset
 *	    |   |-> snapshot names
 *	    |   L-> deadlist
 *	    L-> props (ZAP)
 *	space map object array
 *	|-> space map 1
 *	|-> space map 2
 *	|-> ...
 *	L-> space map n (zfs->mscount)
 *
 * The space map object array is referenced by the vdev's "metaslab_array"
 * (ZPOOL_CONFIG_METASLAB_ARRAY) property in the pool configuration.
 */
static void
pool_init(zfs_opt_t *zfs)
{
	uint64_t dnid;

	zfs->poolguid = randomguid();
	zfs->vdevguid = randomguid();

	zfs->mos = objset_alloc(zfs, DMU_OST_META);

	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_DIRECTORY, &dnid);
	assert(dnid == DMU_POOL_DIRECTORY_OBJECT);

	(void)objset_dnode_alloc(zfs->mos, DMU_OT_OBJECT_ARRAY, &zfs->objarrid);

	dsl_init(zfs);

	pool_init_objdir(zfs);
}

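/*
 * Write the vdev labels.  Each label embeds a copy of the vdev configuration
 * nvlist and an array of identical uberblocks pointing at the MOS.
 */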
static void
pool_labels_write(zfs_opt_t *zfs)
{
	uberblock_t *ub;
	vdev_label_t *label;
	nvlist_t *poolconfig, *vdevconfig;
	int error;

	label = ecalloc(1, sizeof(*label));

	/*
	 * Assemble the vdev configuration and store it in the label.
	 */
	poolconfig = pool_config_nvcreate(zfs);
	vdevconfig = pool_disk_vdev_config_nvcreate(zfs);
	nvlist_add_nvlist(poolconfig, ZPOOL_CONFIG_VDEV_TREE, vdevconfig);
	nvlist_destroy(vdevconfig);

	error = nvlist_export(poolconfig);
	if (error != 0)
		errc(1, error, "nvlist_export");
	nvlist_copy(poolconfig, label->vl_vdev_phys.vp_nvlist,
	    sizeof(label->vl_vdev_phys.vp_nvlist));
	nvlist_destroy(poolconfig);

	/*
	 * Fill out the uberblock.  Just make each one the same.  The embedded
	 * checksum is calculated in vdev_label_write().
	 */
	for (size_t uoff = 0; uoff < sizeof(label->vl_uberblock);
	    uoff += (1 << zfs->ashift)) {
		ub = (uberblock_t *)(&label->vl_uberblock[0] + uoff);
		ub->ub_magic = UBERBLOCK_MAGIC;
		ub->ub_version = SPA_VERSION;
		ub->ub_txg = TXG;
		ub->ub_guid_sum = zfs->poolguid + zfs->vdevguid;
		ub->ub_timestamp = 0;

		ub->ub_software_version = SPA_VERSION;
		ub->ub_mmp_magic = MMP_MAGIC;
		ub->ub_mmp_delay = 0;
		ub->ub_mmp_config = 0;
		ub->ub_checkpoint_txg = 0;
		objset_root_blkptr_copy(zfs->mos, &ub->ub_rootbp);
	}

	/*
	 * Write out four copies of the label: two at the beginning of the vdev
	 * and two at the end.
	 */
	for (int i = 0; i < VDEV_LABELS; i++)
		vdev_label_write(zfs, i, label);

	free(label);
}

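/*
 * Write out everything that had to wait until the pool contents were
 * finalized: the pool properties ZAP, the DSL layer, the MOS itself and,
 * last, the vdev labels.
 */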
static void
pool_fini(zfs_opt_t *zfs)
{
	zap_write(zfs, zfs->poolprops);
	dsl_write(zfs);
	objset_write(zfs, zfs->mos);
	pool_labels_write(zfs);
}

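/*
 * Set up a cursor for writing a dnode's data blocks in order of increasing
 * offset.  The cursor hands out a block pointer for each data block and takes
 * care of assembling and writing the indirect block tree.  The expected
 * calling pattern is roughly (illustrative sketch only):
 *
 *	c = dnode_cursor_init(zfs, os, dnode, size, blksz);
 *	for (off = 0; off < size; off += blksz) {
 *		bp = dnode_cursor_next(zfs, c, off);
 *		... write the data block and fill in *bp ...
 *	}
 *	dnode_cursor_finish(zfs, c);
 */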
struct dnode_cursor *
dnode_cursor_init(zfs_opt_t *zfs, zfs_objset_t *os, dnode_phys_t *dnode,
    off_t size, off_t blksz)
{
	struct dnode_cursor *c;
	uint64_t nbppindir, indlevel, ndatablks, nindblks;

	assert(dnode->dn_nblkptr == 1);
	assert(blksz <= MAXBLOCKSIZE);

	if (blksz == 0) {
		/* Must be between 1<<ashift and 128KB. */
		blksz = MIN(MAXBLOCKSIZE, MAX(1 << zfs->ashift,
		    powerof2(size) ? size : (1l << flsll(size))));
	}
	assert(powerof2(blksz));

	/*
	 * Do we need indirect blocks?  Figure out how many levels are needed
	 * (indlevel == 1 means no indirect blocks) and how much space is needed
	 * (it has to be allocated up-front to break the dependency cycle
	 * described in objset_write()).
	 */
	ndatablks = size == 0 ? 0 : howmany(size, blksz);
	nindblks = 0;
	for (indlevel = 1, nbppindir = 1; ndatablks > nbppindir; indlevel++) {
		nbppindir *= BLKPTR_PER_INDIR;
		nindblks += howmany(ndatablks, indlevel * nbppindir);
	}
	assert(indlevel < INDIR_LEVELS);

	dnode->dn_nlevels = (uint8_t)indlevel;
	dnode->dn_maxblkid = ndatablks > 0 ? ndatablks - 1 : 0;
	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;

	c = ecalloc(1, sizeof(*c));
	if (nindblks > 0) {
		c->indspace = nindblks * MAXBLOCKSIZE;
		c->indloc = objset_space_alloc(zfs, os, &c->indspace);
	}
	c->dnode = dnode;
	c->dataoff = 0;
	c->datablksz = blksz;

	return (c);
}

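/*
 * Write out the lowest "levels" pending indirect blocks, setting the
 * corresponding block pointer in the parent indirect block (or in the dnode
 * itself for the topmost level).
 */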
static void
_dnode_cursor_flush(zfs_opt_t *zfs, struct dnode_cursor *c, unsigned int levels)
{
	blkptr_t *bp, *pbp;
	void *buf;
	uint64_t fill;
	off_t blkid, blksz, loc;

	assert(levels > 0);
	assert(levels <= c->dnode->dn_nlevels - 1U);

	blksz = MAXBLOCKSIZE;
	blkid = (c->dataoff / c->datablksz) / BLKPTR_PER_INDIR;
	for (unsigned int level = 1; level <= levels; level++) {
		buf = c->inddir[level - 1];

		if (level == c->dnode->dn_nlevels - 1U) {
			pbp = &c->dnode->dn_blkptr[0];
		} else {
			uint64_t iblkid;

			iblkid = blkid & (BLKPTR_PER_INDIR - 1);
			pbp = (blkptr_t *)
			    &c->inddir[level][iblkid * sizeof(blkptr_t)];
		}

		/*
		 * Space for indirect blocks is allocated up-front; see the
		 * comment in objset_write().
		 */
		loc = c->indloc;
		c->indloc += blksz;
		assert(c->indspace >= blksz);
		c->indspace -= blksz;

		bp = buf;
		fill = 0;
		for (size_t i = 0; i < BLKPTR_PER_INDIR; i++)
			fill += BP_GET_FILL(&bp[i]);

		vdev_pwrite_dnode_indir(zfs, c->dnode, level, fill, buf, blksz,
		    loc, pbp);
		memset(buf, 0, MAXBLOCKSIZE);

		blkid /= BLKPTR_PER_INDIR;
	}
}

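/*
 * Return the block pointer slot for the data block at offset "off", first
 * flushing any indirect blocks completed by advancing to that offset.
 */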
blkptr_t *
dnode_cursor_next(zfs_opt_t *zfs, struct dnode_cursor *c, off_t off)
{
	off_t blkid, l1id;
	unsigned int levels;

	if (c->dnode->dn_nlevels == 1) {
		assert(off < MAXBLOCKSIZE);
		return (&c->dnode->dn_blkptr[0]);
	}

	assert(off % c->datablksz == 0);

	/* Do we need to flush any full indirect blocks? */
	if (off > 0) {
		blkid = off / c->datablksz;
		for (levels = 0; levels < c->dnode->dn_nlevels - 1U; levels++) {
			if (blkid % BLKPTR_PER_INDIR != 0)
				break;
			blkid /= BLKPTR_PER_INDIR;
		}
		if (levels > 0)
			_dnode_cursor_flush(zfs, c, levels);
	}

	c->dataoff = off;
	l1id = (off / c->datablksz) & (BLKPTR_PER_INDIR - 1);
	return ((blkptr_t *)&c->inddir[0][l1id * sizeof(blkptr_t)]);
}

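/*
 * Flush any indirect blocks still pending and release the cursor.
 */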
void
dnode_cursor_finish(zfs_opt_t *zfs, struct dnode_cursor *c)
{
	unsigned int levels;

	assert(c->dnode->dn_nlevels > 0);
	levels = c->dnode->dn_nlevels - 1;
	if (levels > 0)
		_dnode_cursor_flush(zfs, c, levels);
	assert(c->indspace == 0);
	free(c);
}

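/*
 * The makefs entry point for ZFS: build a pool containing the staged file
 * tree at "dir"/"root" and write it to the image file "image".
 */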
void
zfs_makefs(const char *image, const char *dir, fsnode *root, fsinfo_t *fsopts)
{
	zfs_opt_t *zfs;
	int dirfd;

	zfs = fsopts->fs_specific;

	/*
	 * Use a fixed seed to provide reproducible pseudo-random numbers for
	 * on-disk structures when needed (e.g., GUIDs, ZAP hash salts).
	 */
	srandom(1729);

	zfs_check_opts(fsopts);

	if (!zfs->nowarn) {
		fprintf(stderr,
		    "ZFS support is currently considered experimental. "
		    "Do not use it for anything critical.\n");
	}

	dirfd = open(dir, O_DIRECTORY | O_RDONLY);
	if (dirfd < 0)
		err(1, "open(%s)", dir);

	vdev_init(zfs, image);
	pool_init(zfs);
	fs_build(zfs, dirfd, root);
	pool_fini(zfs);
	vdev_fini(zfs);
}