1/*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2022 The FreeBSD Foundation
5 *
6 * This software was developed by Mark Johnston under sponsorship from
7 * the FreeBSD Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions are
11 * met:
12 * 1. Redistributions of source code must retain the above copyright
13 *    notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in
16 *    the documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31#include <assert.h>
32#include <fcntl.h>
33#include <stdlib.h>
34#include <string.h>
35#include <unistd.h>
36
37#include <util.h>
38
39#include "zfs.h"
40
41#pragma GCC diagnostic push
42#pragma GCC diagnostic ignored "-Wunused-function"
43#include "zfs/fletcher.c"
44#include "zfs/sha256.c"
45#pragma GCC diagnostic pop
46
/*
 * Initialize a block pointer describing a freshly written block.
 *
 * "off" is the block's offset relative to the end of the second leading vdev
 * label (same convention as vdev_pwrite()), "size" is the block size in bytes
 * (compression is never used here, so LSIZE == PSIZE == ASIZE), "dntype" is
 * the DMU object type, "level" the indirection level, "fill" the fill count,
 * and "cksumt"/"cksum" the checksum algorithm and the precomputed checksum of
 * the block's contents.
 */
static void
blkptr_set(blkptr_t *bp, off_t off, off_t size, uint8_t dntype, uint8_t level,
    uint64_t fill, enum zio_checksum cksumt, zio_cksum_t *cksum)
{
	dva_t *dva;

	assert(powerof2(size));

	BP_ZERO(bp);
	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_CHECKSUM(bp, cksumt);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
	BP_SET_BIRTH(bp, TXG, TXG);
	BP_SET_LEVEL(bp, level);
	BP_SET_FILL(bp, fill);
	BP_SET_TYPE(bp, dntype);

	/* A single DVA on vdev 0; there is no redundancy in this pool. */
	dva = BP_IDENTITY(bp);
	DVA_SET_VDEV(dva, 0);
	DVA_SET_OFFSET(dva, off);
	DVA_SET_ASIZE(dva, size);
	memcpy(&bp->blk_cksum, cksum, sizeof(*cksum));
}
72
73/*
74 * Write a block of data to the vdev.  The offset is always relative to the end
75 * of the second leading vdev label.
76 *
77 * Consumers should generally use the helpers below, which provide block
78 * pointers and update dnode accounting, rather than calling this function
79 * directly.
80 */
81static void
82vdev_pwrite(const zfs_opt_t *zfs, const void *buf, size_t len, off_t off)
83{
84	ssize_t n;
85
86	assert(off >= 0 && off < zfs->asize);
87	assert(powerof2(len));
88	assert((off_t)len > 0 && off + (off_t)len > off &&
89	    off + (off_t)len < zfs->asize);
90	if (zfs->spacemap != NULL) {
91		/*
92		 * Verify that the blocks being written were in fact allocated.
93		 *
94		 * The space map isn't available once the on-disk space map is
95		 * finalized, so this check doesn't quite catch everything.
96		 */
97		assert(bit_ntest(zfs->spacemap, off >> zfs->ashift,
98		    (off + len - 1) >> zfs->ashift, 1));
99	}
100
101	off += VDEV_LABEL_START_SIZE;
102	for (size_t sofar = 0; sofar < len; sofar += n) {
103		n = pwrite(zfs->fd, (const char *)buf + sofar, len - sofar,
104		    off + sofar);
105		if (n < 0)
106			err(1, "pwrite");
107		assert(n > 0);
108	}
109}
110
111void
112vdev_pwrite_data(zfs_opt_t *zfs, uint8_t datatype, uint8_t cksumtype,
113    uint8_t level, uint64_t fill, const void *data, off_t sz, off_t loc,
114    blkptr_t *bp)
115{
116	zio_cksum_t cksum;
117
118	assert(cksumtype == ZIO_CHECKSUM_FLETCHER_4);
119
120	fletcher_4_native(data, sz, NULL, &cksum);
121	blkptr_set(bp, loc, sz, datatype, level, fill, cksumtype, &cksum);
122	vdev_pwrite(zfs, data, sz, loc);
123}
124
125void
126vdev_pwrite_dnode_indir(zfs_opt_t *zfs, dnode_phys_t *dnode, uint8_t level,
127    uint64_t fill, const void *data, off_t sz, off_t loc, blkptr_t *bp)
128{
129	vdev_pwrite_data(zfs, dnode->dn_type, dnode->dn_checksum, level, fill,
130	    data, sz, loc, bp);
131
132	assert((dnode->dn_flags & DNODE_FLAG_USED_BYTES) != 0);
133	dnode->dn_used += sz;
134}
135
136void
137vdev_pwrite_dnode_data(zfs_opt_t *zfs, dnode_phys_t *dnode, const void *data,
138    off_t sz, off_t loc)
139{
140	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, data, sz, loc,
141	    &dnode->dn_blkptr[0]);
142}
143
144static void
145vdev_label_set_checksum(void *buf, off_t off, off_t size)
146{
147	zio_cksum_t cksum;
148	zio_eck_t *eck;
149
150	assert(size > 0 && (size_t)size >= sizeof(zio_eck_t));
151
152	eck = (zio_eck_t *)((char *)buf + size) - 1;
153	eck->zec_magic = ZEC_MAGIC;
154	ZIO_SET_CHECKSUM(&eck->zec_cksum, off, 0, 0, 0);
155	zio_checksum_SHA256(buf, size, NULL, &cksum);
156	eck->zec_cksum = cksum;
157}
158
159/*
160 * Set embedded checksums and write the label at the specified index.
161 */
162void
163vdev_label_write(zfs_opt_t *zfs, int ind, const vdev_label_t *labelp)
164{
165	vdev_label_t *label;
166	ssize_t n;
167	off_t blksz, loff;
168
169	assert(ind >= 0 && ind < VDEV_LABELS);
170
171	/*
172	 * Make a copy since we have to modify the label to set checksums.
173	 */
174	label = ecalloc(1, sizeof(*label));
175	memcpy(label, labelp, sizeof(*label));
176
177	if (ind < 2)
178		loff = ind * sizeof(*label);
179	else
180		loff = zfs->vdevsize - (VDEV_LABELS - ind) * sizeof(*label);
181
182	/*
183	 * Set the verifier checksum for the boot block.  We don't use it, but
184	 * the FreeBSD loader reads it and will complain if the checksum isn't
185	 * valid.
186	 */
187	vdev_label_set_checksum(&label->vl_be,
188	    loff + __offsetof(vdev_label_t, vl_be), sizeof(label->vl_be));
189
190	/*
191	 * Set the verifier checksum for the label.
192	 */
193	vdev_label_set_checksum(&label->vl_vdev_phys,
194	    loff + __offsetof(vdev_label_t, vl_vdev_phys),
195	    sizeof(label->vl_vdev_phys));
196
197	/*
198	 * Set the verifier checksum for the uberblocks.  There is one uberblock
199	 * per sector; for example, with an ashift of 12 we end up with
200	 * 128KB/4KB=32 copies of the uberblock in the ring.
201	 */
202	blksz = 1 << zfs->ashift;
203	assert(sizeof(label->vl_uberblock) % blksz == 0);
204	for (size_t roff = 0; roff < sizeof(label->vl_uberblock);
205	    roff += blksz) {
206		vdev_label_set_checksum(&label->vl_uberblock[0] + roff,
207		    loff + __offsetof(vdev_label_t, vl_uberblock) + roff,
208		    blksz);
209	}
210
211	n = pwrite(zfs->fd, label, sizeof(*label), loff);
212	if (n < 0)
213		err(1, "writing vdev label");
214	assert(n == sizeof(*label));
215
216	free(label);
217}
218
219/*
220 * Find a chunk of contiguous free space of length *lenp, according to the
221 * following rules:
222 * 1. If the length is less than or equal to 128KB, the returned run's length
223 *    will be the smallest power of 2 equal to or larger than the length.
224 * 2. If the length is larger than 128KB, the returned run's length will be
225 *    the smallest multiple of 128KB that is larger than the length.
226 * 3. The returned run's length will be size-aligned up to 128KB.
227 *
228 * XXX-MJ the third rule isn't actually required, so this can just be a dumb
229 * bump allocator.  Maybe there's some benefit to keeping large blocks aligned,
230 * so let's keep it for now and hope we don't get too much fragmentation.
231 * Alternately we could try to allocate all blocks of a certain size from the
232 * same metaslab.
233 */
/*
 * Find a chunk of contiguous free space of length *lenp, according to the
 * following rules:
 * 1. If the length is less than or equal to 128KB, the returned run's length
 *    will be the smallest power of 2 equal to or larger than the length.
 * 2. If the length is larger than 128KB, the returned run's length will be
 *    the smallest multiple of 128KB that is larger than the length.
 * 3. The returned run's length will be size-aligned up to 128KB.
 *
 * Returns the vdev-relative byte offset of the run and updates *lenp to the
 * rounded length actually allocated.
 *
 * XXX-MJ the third rule isn't actually required, so this can just be a dumb
 * bump allocator.  Maybe there's some benefit to keeping large blocks aligned,
 * so let's keep it for now and hope we don't get too much fragmentation.
 * Alternately we could try to allocate all blocks of a certain size from the
 * same metaslab.
 */
off_t
vdev_space_alloc(zfs_opt_t *zfs, off_t *lenp)
{
	off_t len;
	int align, loc, minblksz, nbits;

	minblksz = 1 << zfs->ashift;
	len = roundup2(*lenp, minblksz);

	assert(len != 0);
	assert(len / minblksz <= INT_MAX);

	if (len < MAXBLOCKSIZE) {
		/* Round up to the next power of 2 unless already one. */
		if ((len & (len - 1)) != 0)
			len = (off_t)1 << flsll(len);
		/* Alignment (in space map bits) matches the block size. */
		align = len / minblksz;
	} else {
		len = roundup2(len, MAXBLOCKSIZE);
		align = MAXBLOCKSIZE / minblksz;
	}

	/*
	 * Search the space map (one bit per minimum-sized block) for a clear
	 * run of nbits bits.  bit_ffc_area_at() knows nothing of alignment,
	 * so when it returns a misaligned run, bump the search position to
	 * the next aligned bit and retry.  Both branches above produce a
	 * power-of-2 "align", so the mask test below is valid.
	 */
	for (loc = 0, nbits = len / minblksz;; loc = roundup2(loc, align)) {
		bit_ffc_area_at(zfs->spacemap, loc, zfs->spacemapbits, nbits,
		    &loc);
		if (loc == -1) {
			errx(1, "failed to find %ju bytes of space",
			    (uintmax_t)len);
		}
		if ((loc & (align - 1)) == 0)
			break;
	}
	assert(loc + nbits > loc);
	/* Mark the run as allocated. */
	bit_nset(zfs->spacemap, loc, loc + nbits - 1);
	*lenp = len;

	return ((off_t)loc << zfs->ashift);
}
271
272static void
273vdev_spacemap_init(zfs_opt_t *zfs)
274{
275	uint64_t nbits;
276
277	assert(powerof2(zfs->mssize));
278
279	nbits = rounddown2(zfs->asize, zfs->mssize) >> zfs->ashift;
280	if (nbits > INT_MAX) {
281		/*
282		 * With the smallest block size of 512B, the limit on the image
283		 * size is 2TB.  That should be enough for anyone.
284		 */
285		errx(1, "image size is too large");
286	}
287	zfs->spacemapbits = (int)nbits;
288	zfs->spacemap = bit_alloc(zfs->spacemapbits);
289	if (zfs->spacemap == NULL)
290		err(1, "bitstring allocation failed");
291}
292
/*
 * Convert the in-memory space map into on-disk SM2-format space map objects,
 * one per metaslab, and write the object array indexing them.  This consumes
 * zfs->spacemap (freed on return); no further vdev space may be allocated
 * once the per-metaslab allocations below are made.
 */
void
vdev_spacemap_write(zfs_opt_t *zfs)
{
	dnode_phys_t *objarr;
	bitstr_t *spacemap;
	uint64_t *objarrblk;
	off_t smblksz, objarrblksz, objarrloc;

	/* Per-metaslab bookkeeping: space map dnode, its ID, and block location. */
	struct {
		dnode_phys_t	*dnode;
		uint64_t	dnid;
		off_t		loc;
	} *sma;

	/* The object array holds one 64-bit dnode ID per metaslab. */
	objarrblksz = sizeof(uint64_t) * zfs->mscount;
	assert(objarrblksz <= MAXBLOCKSIZE);
	objarrloc = objset_space_alloc(zfs, zfs->mos, &objarrblksz);
	objarrblk = ecalloc(1, objarrblksz);

	objarr = objset_dnode_lookup(zfs->mos, zfs->objarrid);
	objarr->dn_datablkszsec = objarrblksz >> MINBLOCKSHIFT;

	/*
	 * Use the smallest block size for space maps.  The space allocation
	 * algorithm should aim to minimize the number of holes.
	 */
	smblksz = 1 << zfs->ashift;

	/*
	 * First allocate dnodes and space for all of our space maps.  No more
	 * space can be allocated from the vdev after this point.
	 */
	sma = ecalloc(zfs->mscount, sizeof(*sma));
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		sma[i].dnode = objset_dnode_bonus_alloc(zfs->mos,
		    DMU_OT_SPACE_MAP, DMU_OT_SPACE_MAP_HEADER,
		    sizeof(space_map_phys_t), &sma[i].dnid);
		sma[i].loc = objset_space_alloc(zfs, zfs->mos, &smblksz);
	}
	/* Detach the map; vdev_pwrite() stops cross-checking allocations. */
	spacemap = zfs->spacemap;
	zfs->spacemap = NULL;

	/*
	 * Now that the set of allocated space is finalized, populate each space
	 * map and write it to the vdev.
	 */
	for (uint64_t i = 0; i < zfs->mscount; i++) {
		space_map_phys_t *sm;
		uint64_t alloc, length, *smblk;
		int shift, startb, endb, srunb, erunb;

		/*
		 * We only allocate a single block for this space map, but
		 * OpenZFS assumes that a space map object with sufficient bonus
		 * space supports histograms.
		 */
		sma[i].dnode->dn_nblkptr = 3;
		sma[i].dnode->dn_datablkszsec = smblksz >> MINBLOCKSHIFT;

		smblk = ecalloc(1, smblksz);

		/* Walk metaslab i's bit range [startb, endb). */
		alloc = length = 0;
		shift = zfs->msshift - zfs->ashift;
		for (srunb = startb = i * (1 << shift),
		    endb = (i + 1) * (1 << shift);
		    srunb < endb; srunb = erunb) {
			uint64_t runlen, runoff;

			/* Find a run of allocated space. */
			bit_ffs_at(spacemap, srunb, zfs->spacemapbits, &srunb);
			if (srunb == -1 || srunb >= endb)
				break;

			/* Find the end of the run, clamped to this metaslab. */
			bit_ffc_at(spacemap, srunb, zfs->spacemapbits, &erunb);
			if (erunb == -1 || erunb > endb)
				erunb = endb;

			/*
			 * The space represented by [srunb, erunb) has been
			 * allocated.  Add a record to the space map to indicate
			 * this.  Run offsets are relative to the beginning of
			 * the metaslab.
			 */
			runlen = erunb - srunb;
			runoff = srunb - startb;

			/* Each SM2 record occupies two 64-bit words. */
			assert(length * sizeof(uint64_t) < (uint64_t)smblksz);
			smblk[length] = SM_PREFIX_ENCODE(SM2_PREFIX) |
			    SM2_RUN_ENCODE(runlen) | SM2_VDEV_ENCODE(0);
			smblk[length + 1] = SM2_TYPE_ENCODE(SM_ALLOC) |
			    SM2_OFFSET_ENCODE(runoff);

			alloc += runlen << zfs->ashift;
			length += 2;
		}

		/* Fill in the header carried in the dnode's bonus buffer. */
		sm = DN_BONUS(sma[i].dnode);
		sm->smp_length = length * sizeof(uint64_t);
		sm->smp_alloc = alloc;

		vdev_pwrite_dnode_data(zfs, sma[i].dnode, smblk, smblksz,
		    sma[i].loc);
		free(smblk);

		/* Record this space map in the space map object array. */
		objarrblk[i] = sma[i].dnid;
	}

	/*
	 * All of the space maps are written, now write the object array.
	 */
	vdev_pwrite_dnode_data(zfs, objarr, objarrblk, objarrblksz, objarrloc);
	free(objarrblk);

	assert(zfs->spacemap == NULL);
	free(spacemap);
	free(sma);
}
411
412void
413vdev_init(zfs_opt_t *zfs, const char *image)
414{
415	assert(zfs->ashift >= MINBLOCKSHIFT);
416
417	zfs->fd = open(image, O_RDWR | O_CREAT | O_TRUNC, 0644);
418	if (zfs->fd == -1)
419		err(1, "Can't open `%s' for writing", image);
420	if (ftruncate(zfs->fd, zfs->vdevsize) != 0)
421		err(1, "Failed to extend image file `%s'", image);
422
423	vdev_spacemap_init(zfs);
424}
425
426void
427vdev_fini(zfs_opt_t *zfs)
428{
429	assert(zfs->spacemap == NULL);
430
431	if (zfs->fd != -1) {
432		if (close(zfs->fd) != 0)
433			err(1, "close");
434		zfs->fd = -1;
435	}
436}
437