1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2022 The FreeBSD Foundation
5 *
6 * This software was developed by Mark Johnston under sponsorship from
7 * the FreeBSD Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions are
11 * met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in
16 *    the documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <sys/stat.h>
32
33#include <assert.h>
34#include <dirent.h>
35#include <fcntl.h>
36#include <stdlib.h>
37#include <string.h>
38#include <unistd.h>
39
40#include <util.h>
41
42#include "makefs.h"
43#include "zfs.h"
44
/*
 * Description of a single ZPL system attribute, as written to the SA registry
 * and consulted when attribute values are stored in a dnode's bonus buffer.
 */
typedef struct {
	const char	*name;	/* registry key; the stringified zpl_attr_t name */
	unsigned int	id;	/* attribute number; must equal the table index */
	uint16_t	size;	/* size in bytes; 0 for variable-length attributes */
	sa_bswap_type_t	bs;	/* encoding type recorded in the registry entry */
} zfs_sattr_t;
51
/*
 * Per-dataset state used while a ZPL filesystem is being populated.
 */
typedef struct zfs_fs {
	zfs_objset_t	*os;	/* object set that receives this dataset's objects */

	/*
	 * Offset table for system attributes, indexed by a zpl_attr_t.  An
	 * entry of 0xffff marks an attribute that is not part of the layout.
	 */
	uint16_t	*saoffs;
	size_t		sacnt;	/* number of entries in saoffs[] and satab[] */
	const zfs_sattr_t *satab;	/* attribute descriptions (zpl_attrs[]) */
} zfs_fs_t;
60
/*
 * The order of the attributes doesn't matter, this is simply the one hard-coded
 * by OpenZFS, based on a zdb dump of the SA_REGISTRY table.
 *
 * The enumerator values double as indices into zpl_attrs[] and into the
 * per-filesystem offset table, so the two must list attributes in the same
 * order.
 */
typedef enum zpl_attr {
	ZPL_ATIME,
	ZPL_MTIME,
	ZPL_CTIME,
	ZPL_CRTIME,
	ZPL_GEN,
	ZPL_MODE,
	ZPL_SIZE,
	ZPL_PARENT,
	ZPL_LINKS,
	ZPL_XATTR,
	ZPL_RDEV,
	ZPL_FLAGS,
	ZPL_UID,
	ZPL_GID,
	ZPL_PAD,
	ZPL_ZNODE_ACL,
	ZPL_DACL_COUNT,
	ZPL_SYMLINK,
	ZPL_SCANSTAMP,
	ZPL_DACL_ACES,
	ZPL_DXATTR,
	ZPL_PROJID,
} zpl_attr_t;
89
90/*
91 * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t.
92 */
93static const zfs_sattr_t zpl_attrs[] = {
94#define	_ZPL_ATTR(n, s, b)	{ .name = #n, .id = n, .size = s, .bs = b }
95	_ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
96	_ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
97	_ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
98	_ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
99	_ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY),
100	_ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY),
101	_ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY),
102	_ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY),
103	_ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY),
104	_ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY),
105	_ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY),
106	_ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY),
107	_ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY),
108	_ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY),
109	_ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY),
110	_ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY),
111	_ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY),
112	_ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY),
113	_ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY),
114	_ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL),
115	_ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY),
116	_ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY),
117#undef ZPL_ATTR
118};
119
/*
 * This layout matches that of a filesystem created using OpenZFS on FreeBSD.
 * It need not match in general, but FreeBSD's loader doesn't bother parsing the
 * layout and just hard-codes attribute offsets.
 *
 * ZPL_SYMLINK must remain the final entry: the non-symlink layout is written
 * out as this array minus its last element, and fs_set_zpl_attrs() asserts
 * that the last element is ZPL_SYMLINK.
 */
static const sa_attr_type_t zpl_attr_layout[] = {
	ZPL_MODE,
	ZPL_SIZE,
	ZPL_GEN,
	ZPL_UID,
	ZPL_GID,
	ZPL_PARENT,
	ZPL_FLAGS,
	ZPL_ATIME,
	ZPL_MTIME,
	ZPL_CTIME,
	ZPL_CRTIME,
	ZPL_LINKS,
	ZPL_DACL_COUNT,
	ZPL_DACL_ACES,
	ZPL_SYMLINK,
};
142
/*
 * Keys for the ZPL attribute tables in the SA layout ZAP.  The first two
 * indices are reserved for legacy attribute encoding.
 *
 * The default layout is used for regular files and directories; the symlink
 * layout is the same with ZPL_SYMLINK appended.
 */
#define	SA_LAYOUT_INDEX_DEFAULT	2
#define	SA_LAYOUT_INDEX_SYMLINK	3
149
/*
 * One entry in the stack of directories currently being populated.  An entry
 * is pushed when a directory is created and popped, with its ZAP written out,
 * once the last node of its subtree has been visited.
 */
struct fs_populate_dir {
	SLIST_ENTRY(fs_populate_dir) next;	/* stack linkage */
	int			dirfd;	/* directory fd, or -1 if not open */
	uint64_t		objid;	/* dnode ID of the directory */
	zfs_zap_t		*zap;	/* ZAP collecting directory entries */
};
156
/*
 * State threaded through the fsnode_foreach() pass that populates a single
 * dataset.
 */
struct fs_populate_arg {
	zfs_opt_t	*zfs;			/* makefs ZFS options/state */
	zfs_fs_t	*fs;			/* owning filesystem */
	uint64_t	rootdirid;		/* root directory dnode ID */
	int		rootdirfd;		/* root directory fd */
	SLIST_HEAD(, fs_populate_dir) dirs;	/* stack of directories */
};
164
/*
 * Forward declaration: fs_populate_dir() calls fs_build_one() when it reaches
 * a directory that is the root of another dataset.
 */
static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int);
166
/*
 * Close a file descriptor, terminating the program with a diagnostic if
 * close(2) reports an error.
 */
static void
eclose(int fd)
{
	int error;

	error = close(fd);
	if (error == 0)
		return;
	err(1, "close");
}
173
174static bool
175fsnode_isroot(const fsnode *cur)
176{
177	return (strcmp(cur->name, ".") == 0);
178}
179
180/*
181 * Visit each node in a directory hierarchy, in pre-order depth-first order.
182 */
183static void
184fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg)
185{
186	assert(root->type == S_IFDIR);
187
188	for (fsnode *cur = root; cur != NULL; cur = cur->next) {
189		assert(cur->type == S_IFREG || cur->type == S_IFDIR ||
190		    cur->type == S_IFLNK);
191
192		if (cb(cur, arg) == 0)
193			continue;
194		if (cur->type == S_IFDIR && cur->child != NULL)
195			fsnode_foreach(cur->child, cb, arg);
196	}
197}
198
199static void
200fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid)
201{
202	struct fs_populate_dir *dir;
203	uint64_t type;
204
205	switch (cur->type) {
206	case S_IFREG:
207		type = DT_REG;
208		break;
209	case S_IFDIR:
210		type = DT_DIR;
211		break;
212	case S_IFLNK:
213		type = DT_LNK;
214		break;
215	default:
216		assert(0);
217	}
218
219	dir = SLIST_FIRST(&arg->dirs);
220	zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid));
221}
222
223static void
224fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind,
225    size_t *szp)
226{
227	assert(ind < fs->sacnt);
228	assert(fs->saoffs[ind] != 0xffff);
229
230	memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size);
231	*szp += fs->satab[ind].size;
232}
233
234static void
235fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val,
236    size_t valsz, size_t varoff, uint16_t ind, size_t *szp)
237{
238	assert(ind < fs->sacnt);
239	assert(fs->saoffs[ind] != 0xffff);
240	assert(fs->satab[ind].size == 0);
241
242	memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz);
243	*szp += valsz;
244}
245
246/*
247 * Derive the relative fd/path combo needed to access a file.  Ideally we'd
248 * always be able to use relative lookups (i.e., use the *at() system calls),
249 * since they require less path translation and are more amenable to sandboxing,
250 * but the handling of multiple staging directories makes that difficult.  To
251 * make matters worse, we have no choice but to use relative lookups when
252 * dealing with an mtree manifest, so both mechanisms are implemented.
253 */
254static void
255fs_populate_path(const fsnode *cur, struct fs_populate_arg *arg,
256    char *path, size_t sz, int *dirfdp)
257{
258	if (cur->contents != NULL) {
259		size_t n;
260
261		*dirfdp = AT_FDCWD;
262		n = strlcpy(path, cur->contents, sz);
263		assert(n < sz);
264	} else if (cur->root == NULL) {
265		size_t n;
266
267		*dirfdp = SLIST_FIRST(&arg->dirs)->dirfd;
268		n = strlcpy(path, cur->name, sz);
269		assert(n < sz);
270	} else {
271		int n;
272
273		*dirfdp = AT_FDCWD;
274		n = snprintf(path, sz, "%s/%s/%s",
275		    cur->root, cur->path, cur->name);
276		assert(n >= 0);
277		assert((size_t)n < sz);
278	}
279}
280
281static int
282fs_open(const fsnode *cur, struct fs_populate_arg *arg, int flags)
283{
284	char path[PATH_MAX];
285	int fd;
286
287	fs_populate_path(cur, arg, path, sizeof(path), &fd);
288
289	fd = openat(fd, path, flags);
290	if (fd < 0)
291		err(1, "openat(%s)", path);
292	return (fd);
293}
294
295static int
296fs_open_can_fail(const fsnode *cur, struct fs_populate_arg *arg, int flags)
297{
298	int fd;
299	char path[PATH_MAX];
300
301	fs_populate_path(cur, arg, path, sizeof(path), &fd);
302
303	return (openat(fd, path, flags));
304}
305
306static void
307fs_readlink(const fsnode *cur, struct fs_populate_arg *arg,
308    char *buf, size_t bufsz)
309{
310	char path[PATH_MAX];
311	int fd;
312
313	if (cur->symlink != NULL) {
314		size_t n;
315
316		n = strlcpy(buf, cur->symlink, bufsz);
317		assert(n < bufsz);
318	} else {
319		ssize_t n;
320
321		fs_populate_path(cur, arg, path, sizeof(path), &fd);
322
323		n = readlinkat(fd, path, buf, bufsz - 1);
324		if (n == -1)
325			err(1, "readlinkat(%s)", cur->name);
326		buf[n] = '\0';
327	}
328}
329
330static void
331fs_populate_time(zfs_fs_t *fs, char *attrbuf, struct timespec *ts,
332    uint16_t ind, size_t *szp)
333{
334	uint64_t timebuf[2];
335
336	assert(ind < fs->sacnt);
337	assert(fs->saoffs[ind] != 0xffff);
338	assert(fs->satab[ind].size == sizeof(timebuf));
339
340	timebuf[0] = ts->tv_sec;
341	timebuf[1] = ts->tv_nsec;
342	fs_populate_attr(fs, attrbuf, timebuf, ind, szp);
343}
344
/*
 * Fill in the system attribute (SA) bonus buffer of "dnode" for the regular
 * file, directory or symbolic link described by "cur".
 *
 * The SA header is written at the start of the bonus buffer and is followed
 * by the attribute values, each at the offset recorded in fs->saoffs[].
 * Every object gets a three-entry ACL (owner, group, everyone) derived from
 * its mode bits.  A symbolic link additionally stores its target immediately
 * after the ACL.  On return dn_bonuslen holds the number of bonus bytes used.
 */
static void
fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur,
    dnode_phys_t *dnode)
{
	char target[PATH_MAX];
	zfs_fs_t *fs;
	zfs_ace_hdr_t aces[3];
	struct stat *sb;
	sa_hdr_phys_t *sahdr;
	uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid;
	char *attrbuf;
	size_t bonussz, hdrsz;
	int layout;

	assert(dnode->dn_bonustype == DMU_OT_SA);
	assert(dnode->dn_nblkptr == 1);

	fs = arg->fs;
	sb = &cur->inode->st;

	/*
	 * Work out the type-specific attributes: which layout to use, the
	 * link count, the object size and the parent directory.
	 */
	switch (cur->type) {
	case S_IFREG:
		layout = SA_LAYOUT_INDEX_DEFAULT;
		links = cur->inode->nlink;
		objsize = sb->st_size;
		parent = SLIST_FIRST(&arg->dirs)->objid;
		break;
	case S_IFDIR:
		layout = SA_LAYOUT_INDEX_DEFAULT;
		links = 1; /* .. */
		objsize = 1; /* .. */

		/*
		 * The size of a ZPL directory is the number of entries
		 * (including "." and ".."), and the link count is the number of
		 * entries which are directories (including "." and "..").
		 */
		for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child;
		    c != NULL; c = c->next) {
			if (c->type == S_IFDIR)
				links++;
			objsize++;
		}

		/* The root directory is its own parent. */
		parent = SLIST_EMPTY(&arg->dirs) ?
		    arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid;
		break;
	case S_IFLNK:
		fs_readlink(cur, arg, target, sizeof(target));

		layout = SA_LAYOUT_INDEX_SYMLINK;
		links = 1;
		/* The size of a symlink is the length of its target. */
		objsize = strlen(target);
		parent = SLIST_FIRST(&arg->dirs)->objid;
		break;
	default:
		assert(0);
	}

	daclcount = nitems(aces);
	flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_ARCHIVE |
	    ZFS_AV_MODIFIED;
	gen = 1;
	gid = sb->st_gid;
	mode = sb->st_mode;
	uid = sb->st_uid;

	/*
	 * Build one "allow" ACE each for the owner, the group and everyone,
	 * adding read/write/execute access according to the mode bits.
	 */
	memset(aces, 0, sizeof(aces));
	aces[0].z_flags = ACE_OWNER;
	aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
	aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER |
	    ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL |
	    ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
	if ((mode & S_IRUSR) != 0)
		aces[0].z_access_mask |= ACE_READ_DATA;
	if ((mode & S_IWUSR) != 0)
		aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
	if ((mode & S_IXUSR) != 0)
		aces[0].z_access_mask |= ACE_EXECUTE;

	aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
	aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
	aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
	if ((mode & S_IRGRP) != 0)
		aces[1].z_access_mask |= ACE_READ_DATA;
	if ((mode & S_IWGRP) != 0)
		aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
	if ((mode & S_IXGRP) != 0)
		aces[1].z_access_mask |= ACE_EXECUTE;

	aces[2].z_flags = ACE_EVERYONE;
	aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
	aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
	if ((mode & S_IROTH) != 0)
		aces[2].z_access_mask |= ACE_READ_DATA;
	if ((mode & S_IWOTH) != 0)
		aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
	if ((mode & S_IXOTH) != 0)
		aces[2].z_access_mask |= ACE_EXECUTE;

	/*
	 * The SA header is larger for the symlink layout, which needs a
	 * second variable-length attribute size (see sa_lengths[] below).
	 */
	switch (layout) {
	case SA_LAYOUT_INDEX_DEFAULT:
		/* At most one variable-length attribute. */
		hdrsz = sizeof(uint64_t);
		break;
	case SA_LAYOUT_INDEX_SYMLINK:
		/* At most five variable-length attributes. */
		hdrsz = sizeof(uint64_t) * 2;
		break;
	default:
		assert(0);
	}

	sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode);
	sahdr->sa_magic = SA_MAGIC;
	SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz);

	/* Attribute values start right after the header. */
	bonussz = SA_HDR_SIZE(sahdr);
	attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr);

	/*
	 * Each call stores the value at its layout-assigned offset and adds
	 * its size to bonussz, so the order of these calls is immaterial.
	 */
	fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz);
	fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz);
	fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz);
	fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz);
	fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz);
	fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz);
	fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz);
	fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz);
	fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz);

	/*
	 * We deliberately set atime = mtime here to ensure that images are
	 * reproducible.
	 */
	fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz);
	fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz);
	fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz);
#ifdef __linux__
	/* Linux has no st_birthtim; approximate with st_ctim */
	fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CRTIME, &bonussz);
#else
	fs_populate_time(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz);
#endif

	/* The ACL is the first variable-length attribute. */
	fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0,
	    ZPL_DACL_ACES, &bonussz);
	sahdr->sa_lengths[0] = sizeof(aces);

	if (cur->type == S_IFLNK) {
		assert(layout == SA_LAYOUT_INDEX_SYMLINK);
		/* Need to use a spill block pointer if the target is long. */
		assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN);
		/* The target follows the ACL, hence the sa_lengths[0] offset. */
		fs_populate_varszattr(fs, attrbuf, target, objsize,
		    sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz);
		sahdr->sa_lengths[1] = (uint16_t)objsize;
	}

	dnode->dn_bonuslen = bonussz;
}
507
508static void
509fs_populate_file(fsnode *cur, struct fs_populate_arg *arg)
510{
511	struct dnode_cursor *c;
512	dnode_phys_t *dnode;
513	zfs_opt_t *zfs;
514	char *buf;
515	uint64_t dnid;
516	ssize_t n;
517	size_t bufsz;
518	off_t nbytes, reqbytes, size;
519	int fd;
520
521	assert(cur->type == S_IFREG);
522	assert((cur->inode->flags & FI_ROOT) == 0);
523
524	zfs = arg->zfs;
525
526	assert(cur->inode->ino != 0);
527	if ((cur->inode->flags & FI_ALLOCATED) != 0) {
528		/*
529		 * This is a hard link of an existing file.
530		 *
531		 * XXX-MJ need to check whether it crosses datasets, add a test
532		 * case for that
533		 */
534		fs_populate_dirent(arg, cur, cur->inode->ino);
535		return;
536	}
537
538	dnode = objset_dnode_bonus_alloc(arg->fs->os,
539	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
540	cur->inode->ino = dnid;
541	cur->inode->flags |= FI_ALLOCATED;
542
543	fd = fs_open(cur, arg, O_RDONLY);
544
545	buf = zfs->filebuf;
546	bufsz = sizeof(zfs->filebuf);
547	size = cur->inode->st.st_size;
548	c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0);
549	for (off_t foff = 0; foff < size; foff += nbytes) {
550		off_t loc, sofar;
551
552		/*
553		 * Fill up our buffer, handling partial reads.
554		 */
555		sofar = 0;
556		nbytes = MIN(size - foff, (off_t)bufsz);
557		do {
558			n = read(fd, buf + sofar, nbytes);
559			if (n < 0)
560				err(1, "reading from '%s'", cur->name);
561			if (n == 0)
562				errx(1, "unexpected EOF reading '%s'",
563				    cur->name);
564			sofar += n;
565		} while (sofar < nbytes);
566
567		if (nbytes < (off_t)bufsz)
568			memset(buf + nbytes, 0, bufsz - nbytes);
569
570		reqbytes = foff == 0 ? nbytes : MAXBLOCKSIZE;
571		loc = objset_space_alloc(zfs, arg->fs->os, &reqbytes);
572		vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, reqbytes, loc,
573		    dnode_cursor_next(zfs, c, foff));
574	}
575	eclose(fd);
576	dnode_cursor_finish(zfs, c);
577
578	fs_populate_sattrs(arg, cur, dnode);
579	fs_populate_dirent(arg, cur, dnid);
580}
581
582static void
583fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg)
584{
585	dnode_phys_t *dnode;
586	zfs_objset_t *os;
587	uint64_t dnid;
588	int dirfd;
589
590	assert(cur->type == S_IFDIR);
591	assert((cur->inode->flags & FI_ALLOCATED) == 0);
592
593	os = arg->fs->os;
594
595	dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS,
596	    DMU_OT_SA, 0, &dnid);
597
598	/*
599	 * Add an entry to the parent directory and open this directory.
600	 */
601	if (!SLIST_EMPTY(&arg->dirs)) {
602		fs_populate_dirent(arg, cur, dnid);
603		/*
604		 * We only need the directory fd if we're finding files in
605		 * it.  If it's just there for other directories or
606		 * files using contents= we don't need to succeed here.
607		 */
608		dirfd = fs_open_can_fail(cur, arg, O_DIRECTORY | O_RDONLY);
609	} else {
610		arg->rootdirid = dnid;
611		dirfd = arg->rootdirfd;
612		arg->rootdirfd = -1;
613	}
614
615	/*
616	 * Set ZPL attributes.
617	 */
618	fs_populate_sattrs(arg, cur, dnode);
619
620	/*
621	 * If this is a root directory, then its children belong to a different
622	 * dataset and this directory remains empty in the current objset.
623	 */
624	if ((cur->inode->flags & FI_ROOT) == 0) {
625		struct fs_populate_dir *dir;
626
627		dir = ecalloc(1, sizeof(*dir));
628		dir->dirfd = dirfd;
629		dir->objid = dnid;
630		dir->zap = zap_alloc(os, dnode);
631		SLIST_INSERT_HEAD(&arg->dirs, dir, next);
632	} else {
633		zap_write(arg->zfs, zap_alloc(os, dnode));
634		fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd);
635	}
636}
637
638static void
639fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg)
640{
641	dnode_phys_t *dnode;
642	uint64_t dnid;
643
644	assert(cur->type == S_IFLNK);
645	assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0);
646
647	dnode = objset_dnode_bonus_alloc(arg->fs->os,
648	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
649
650	fs_populate_dirent(arg, cur, dnid);
651
652	fs_populate_sattrs(arg, cur, dnode);
653}
654
655static int
656fs_foreach_populate(fsnode *cur, void *_arg)
657{
658	struct fs_populate_arg *arg;
659	struct fs_populate_dir *dir;
660	int ret;
661
662	arg = _arg;
663	switch (cur->type) {
664	case S_IFREG:
665		fs_populate_file(cur, arg);
666		break;
667	case S_IFDIR:
668		if (fsnode_isroot(cur))
669			break;
670		fs_populate_dir(cur, arg);
671		break;
672	case S_IFLNK:
673		fs_populate_symlink(cur, arg);
674		break;
675	default:
676		assert(0);
677	}
678
679	ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1;
680
681	if (cur->next == NULL &&
682	    (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) {
683		/*
684		 * We reached a terminal node in a subtree.  Walk back up and
685		 * write out directories.  We're done once we hit the root of a
686		 * dataset or find a level where we're not on the edge of the
687		 * tree.
688		 */
689		do {
690			dir = SLIST_FIRST(&arg->dirs);
691			SLIST_REMOVE_HEAD(&arg->dirs, next);
692			zap_write(arg->zfs, dir->zap);
693			if (dir->dirfd != -1)
694				eclose(dir->dirfd);
695			free(dir);
696			cur = cur->parent;
697		} while (cur != NULL && cur->next == NULL &&
698		    (cur->inode->flags & FI_ROOT) == 0);
699	}
700
701	return (ret);
702}
703
704static void
705fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index,
706    const sa_attr_type_t layout[], size_t sacnt)
707{
708	char ti[16];
709
710	assert(sizeof(layout[0]) == 2);
711
712	snprintf(ti, sizeof(ti), "%u", index);
713	zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt,
714	    (const uint8_t *)layout);
715}
716
717/*
718 * Initialize system attribute tables.
719 *
720 * There are two elements to this.  First, we write the zpl_attrs[] and
721 * zpl_attr_layout[] tables to disk.  Then we create a lookup table which
722 * allows us to set file attributes quickly.
723 */
724static uint64_t
725fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs)
726{
727	zfs_zap_t *sazap, *salzap, *sarzap;
728	zfs_objset_t *os;
729	dnode_phys_t *saobj, *salobj, *sarobj;
730	uint64_t saobjid, salobjid, sarobjid;
731	uint16_t offset;
732
733	os = fs->os;
734
735	/*
736	 * The on-disk tables are stored in two ZAP objects, the registry object
737	 * and the layout object.  Individual attributes are described by
738	 * entries in the registry object; for example, the value for the
739	 * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute.
740	 * The attributes of a file are ordered according to one of the layouts
741	 * defined in the layout object.  The master node object is simply used
742	 * to locate the registry and layout objects.
743	 */
744	saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid);
745	salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid);
746	sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid);
747
748	sarzap = zap_alloc(os, sarobj);
749	for (size_t i = 0; i < nitems(zpl_attrs); i++) {
750		const zfs_sattr_t *sa;
751		uint64_t attr;
752
753		attr = 0;
754		sa = &zpl_attrs[i];
755		SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs);
756		zap_add_uint64(sarzap, sa->name, attr);
757	}
758	zap_write(zfs, sarzap);
759
760	/*
761	 * Layouts are arrays of indices into the registry.  We define two
762	 * layouts for use by the ZPL, one for non-symlinks and one for
763	 * symlinks.  They are identical except that the symlink layout includes
764	 * ZPL_SYMLINK as its final attribute.
765	 */
766	salzap = zap_alloc(os, salobj);
767	assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK);
768	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT,
769	    zpl_attr_layout, nitems(zpl_attr_layout) - 1);
770	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK,
771	    zpl_attr_layout, nitems(zpl_attr_layout));
772	zap_write(zfs, salzap);
773
774	sazap = zap_alloc(os, saobj);
775	zap_add_uint64(sazap, SA_LAYOUTS, salobjid);
776	zap_add_uint64(sazap, SA_REGISTRY, sarobjid);
777	zap_write(zfs, sazap);
778
779	/* Sanity check. */
780	for (size_t i = 0; i < nitems(zpl_attrs); i++)
781		assert(i == zpl_attrs[i].id);
782
783	/*
784	 * Build the offset table used when setting file attributes.  File
785	 * attributes are stored in the object's bonus buffer; this table
786	 * provides the buffer offset of attributes referenced by the layout
787	 * table.
788	 */
789	fs->sacnt = nitems(zpl_attrs);
790	fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs));
791	for (size_t i = 0; i < fs->sacnt; i++)
792		fs->saoffs[i] = 0xffff;
793	offset = 0;
794	for (size_t i = 0; i < nitems(zpl_attr_layout); i++) {
795		uint16_t size;
796
797		assert(zpl_attr_layout[i] < fs->sacnt);
798
799		fs->saoffs[zpl_attr_layout[i]] = offset;
800		size = zpl_attrs[zpl_attr_layout[i]].size;
801		offset += size;
802	}
803	fs->satab = zpl_attrs;
804
805	return (saobjid);
806}
807
/*
 * dsl_dir_foreach() callback which associates a DSL directory with the staged
 * directory that corresponds to its mountpoint.
 *
 * "arg" is the root fsnode of the staged tree.  DSL directories that have no
 * dataset, whose canmount lookup returns 0 with a value of 0, or that have no
 * mountpoint are skipped.  When a matching directory is found, dsldir is
 * stored in its inode's param field and, unless the directory is the root
 * fsnode itself, the inode is flagged with FI_ROOT.  The pool's "bootfs"
 * property is also set here when the full name of dsldir matches zfs->bootfs.
 */
static void
fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg)
{
	char *mountpoint, *origmountpoint, *name, *next;
	fsnode *cur, *root;
	uint64_t canmount;

	if (!dsl_dir_has_dataset(dsldir))
		return;

	if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0)
		return;
	mountpoint = dsl_dir_get_mountpoint(zfs, dsldir);
	if (mountpoint == NULL)
		return;

	/*
	 * If we were asked to specify a bootfs, set it here.
	 */
	if (zfs->bootfs != NULL && strcmp(zfs->bootfs,
	    dsl_dir_fullname(dsldir)) == 0) {
		zap_add_uint64(zfs->poolprops, "bootfs",
		    dsl_dir_dataset_id(dsldir));
	}

	/* strsep() below modifies the string; keep the start for free(). */
	origmountpoint = mountpoint;

	/*
	 * Figure out which fsnode corresponds to our mountpoint.
	 */
	root = arg;
	cur = root;
	if (strcmp(mountpoint, zfs->rootpath) != 0) {
		mountpoint += strlen(zfs->rootpath);

		/*
		 * Look up the directory in the staged tree.  For example, if
		 * the dataset's mount point is /foo/bar/baz, we'll search the
		 * root directory for "foo", search "foo" for "bar", and so on.
		 * Each intermediate name must refer to a directory; the final
		 * component need not exist.
		 */
		cur = root;
		for (next = name = mountpoint; next != NULL;) {
			/* Skip any run of path separators. */
			for (; *next == '/'; next++)
				;
			name = strsep(&next, "/");

			/* Search the current level for this component. */
			for (; cur != NULL && strcmp(cur->name, name) != 0;
			    cur = cur->next)
				;
			if (cur == NULL) {
				/* Only the final component may be missing. */
				if (next == NULL)
					break;
				errx(1, "missing mountpoint directory for `%s'",
				    dsl_dir_fullname(dsldir));
			}
			if (cur->type != S_IFDIR) {
				errx(1,
				    "mountpoint for `%s' is not a directory",
				    dsl_dir_fullname(dsldir));
			}
			if (next != NULL)
				cur = cur->child;
		}
	}

	if (cur != NULL) {
		assert(cur->type == S_IFDIR);

		/*
		 * Multiple datasets shouldn't share a mountpoint.  It's
		 * technically allowed, but it's not clear what makefs should do
		 * in that case.
		 */
		assert((cur->inode->flags & FI_ROOT) == 0);
		if (cur != root)
			cur->inode->flags |= FI_ROOT;
		assert(cur->inode->param == NULL);
		cur->inode->param = dsldir;
	}

	free(origmountpoint);
}
892
893static int
894fs_foreach_mark(fsnode *cur, void *arg)
895{
896	uint64_t *countp;
897
898	countp = arg;
899	if (cur->type == S_IFDIR && fsnode_isroot(cur))
900		return (1);
901
902	if (cur->inode->ino == 0) {
903		cur->inode->ino = ++(*countp);
904		cur->inode->nlink = 1;
905	} else {
906		cur->inode->nlink++;
907	}
908
909	return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1);
910}
911
/*
 * Create a filesystem dataset.  More specifically:
 * - create an object set for the dataset,
 * - add required metadata (SA tables, property definitions, etc.) to that
 *   object set,
 * - optionally populate the object set with file objects, using "root" as the
 *   root directory.
 *
 * "root" may be NULL, in which case an empty root directory is created and
 * "dirfd" must be -1.
 *
 * "dirfd" is a directory descriptor for the directory referenced by "root".  It
 * is closed before returning.
 */
static void
fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd)
{
	struct fs_populate_arg arg;
	zfs_fs_t fs;
	zfs_zap_t *masterzap;
	zfs_objset_t *os;
	dnode_phys_t *deleteq, *masterobj;
	uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid;
	bool fakedroot;

	/*
	 * This dataset's mountpoint doesn't exist in the staging tree, or the
	 * dataset doesn't have a mountpoint at all.  In either case we still
	 * need a root directory.  Fake up a root fsnode to handle this case.
	 */
	fakedroot = root == NULL;
	if (fakedroot) {
		struct stat *stp;

		assert(dirfd == -1);

		root = ecalloc(1, sizeof(*root));
		root->inode = ecalloc(1, sizeof(*root->inode));
		root->name = estrdup(".");
		root->type = S_IFDIR;

		/* The fake root is owned by uid/gid 0 with mode 0755. */
		stp = &root->inode->st;
		stp->st_uid = 0;
		stp->st_gid = 0;
		stp->st_mode = S_IFDIR | 0755;
	}
	assert(root->type == S_IFDIR);
	assert(fsnode_isroot(root));

	/*
	 * Initialize the object set for this dataset.
	 */
	os = objset_alloc(zfs, DMU_OST_ZFS);
	/* The master node must be allocated first to get its fixed ID. */
	masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid);
	assert(moid == MASTER_NODE_OBJ);

	memset(&fs, 0, sizeof(fs));
	fs.os = os;

	/*
	 * Create the ZAP SA layout now since filesystem object dnodes will
	 * refer to those attributes.
	 */
	saobjid = fs_set_zpl_attrs(zfs, &fs);

	/*
	 * Make a pass over the staged directory to detect hard links and assign
	 * virtual dnode numbers.
	 */
	dnodecount = 1; /* root directory */
	fsnode_foreach(root, fs_foreach_mark, &dnodecount);

	/*
	 * Make a second pass to populate the dataset with files from the
	 * staged directory.  Most of our runtime is spent here.
	 *
	 * The root directory is created explicitly, which pushes it onto the
	 * directory stack; the traversal then skips the root's "." node and
	 * pops the stack again once the last node has been visited.
	 */
	arg.rootdirfd = dirfd;
	arg.zfs = zfs;
	arg.fs = &fs;
	SLIST_INIT(&arg.dirs);
	fs_populate_dir(root, &arg);
	assert(!SLIST_EMPTY(&arg.dirs));
	fsnode_foreach(root, fs_foreach_populate, &arg);
	assert(SLIST_EMPTY(&arg.dirs));
	rootdirid = arg.rootdirid;

	/*
	 * Create an empty delete queue.  We don't do anything with it, but
	 * OpenZFS will refuse to mount filesystems that don't have one.
	 */
	deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid);
	zap_write(zfs, zap_alloc(os, deleteq));

	/*
	 * Populate and write the master node object.  This is a ZAP object
	 * containing various dataset properties and the object IDs of the root
	 * directory and delete queue.
	 */
	masterzap = zap_alloc(os, masterobj);
	zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid);
	zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid);
	zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid);
	zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */);
	zap_add_uint64(masterzap, "normalization", 0 /* off */);
	zap_add_uint64(masterzap, "utf8only", 0 /* off */);
	zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */);
	zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */);
	zap_write(zfs, masterzap);

	/*
	 * All finished with this object set, we may as well write it now.
	 * The DSL layer will sum up the bytes consumed by each dataset using
	 * information stored in the object set, so it can't be freed just yet.
	 */
	dsl_dir_dataset_write(zfs, os, dsldir);

	/* Release the fake root, if we created one, and the offset table. */
	if (fakedroot) {
		free(root->inode);
		free(root->name);
		free(root);
	}
	free(fs.saoffs);
}
1032
1033/*
1034 * Create an object set for each DSL directory which has a dataset and doesn't
1035 * already have an object set.
1036 */
1037static void
1038fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused)
1039{
1040	if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir))
1041		fs_build_one(zfs, dsldir, NULL, -1);
1042}
1043
1044/*
1045 * Create our datasets and populate them with files.
1046 */
1047void
1048fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root)
1049{
1050	/*
1051	 * Run through our datasets and find the root fsnode for each one.  Each
1052	 * root fsnode is flagged so that we can figure out which dataset it
1053	 * belongs to.
1054	 */
1055	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root);
1056
1057	/*
1058	 * Did we find our boot filesystem?
1059	 */
1060	if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs"))
1061		errx(1, "no mounted dataset matches bootfs property `%s'",
1062		    zfs->bootfs);
1063
1064	/*
1065	 * Traverse the file hierarchy starting from the root fsnode.  One
1066	 * dataset, not necessarily the root dataset, must "own" the root
1067	 * directory by having its mountpoint be equal to the root path.
1068	 *
1069	 * As roots of other datasets are encountered during the traversal,
1070	 * fs_build_one() recursively creates the corresponding object sets and
1071	 * populates them.  Once this function has returned, all datasets will
1072	 * have been fully populated.
1073	 */
1074	fs_build_one(zfs, root->inode->param, root, dirfd);
1075
1076	/*
1077	 * Now create object sets for datasets whose mountpoints weren't found
1078	 * in the staging directory, either because there is no mountpoint, or
1079	 * because the mountpoint doesn't correspond to an existing directory.
1080	 */
1081	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL);
1082}
1083