1/*	$NetBSD: vndcompress.c,v 1.29 2017/07/29 21:04:07 riastradh Exp $	*/
2
3/*-
4 * Copyright (c) 2013 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * This code is derived from software contributed to The NetBSD Foundation
8 * by Taylor R. Campbell.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 *    notice, this list of conditions and the following disclaimer in the
17 *    documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 * POSSIBILITY OF SUCH DAMAGE.
30 */
31
32#include <sys/cdefs.h>
33__RCSID("$NetBSD: vndcompress.c,v 1.29 2017/07/29 21:04:07 riastradh Exp $");
34
35#include <sys/endian.h>
36#include <sys/stat.h>
37
38#include <assert.h>
39#include <err.h>
40#include <errno.h>
41#include <fcntl.h>
42#include <inttypes.h>
43#include <limits.h>
44#include <signal.h>
45#include <stdbool.h>
46#include <stdint.h>
47#include <stdio.h>
48#include <stdlib.h>
49#include <string.h>
50#include <unistd.h>
51#include <zlib.h>
52
53#include "common.h"
54#include "offtab.h"
55#include "utils.h"
56
57/*
58 * XXX Switch to control bug-for-bug byte-for-byte compatibility with
59 * NetBSD's vndcompress.
60 */
61#define	VNDCOMPRESS_COMPAT	0
62
63__CTASSERT(sizeof(struct cloop2_header) == CLOOP2_OFFSET_TABLE_OFFSET);
64
/*
 * State for one compression run.  A single global instance
 * (global_state) is shared with the signal handlers, which read it
 * only after `initialized' is set; the main loop updates the mutable
 * fields inside block_signals/restore_sigmask critical sections.
 */
struct compress_state {
	uint64_t	size;		/* uncompressed size */
	uint64_t	offset;		/* output byte offset */
	uint32_t	blocksize;	/* bytes per block */
	uint32_t	blkno;		/* input block number */
	uint32_t	n_full_blocks;	/* floor(size/blocksize) */
	uint32_t	n_blocks;	/* ceiling(size/blocksize) */
	uint32_t	n_offsets;	/* n_blocks + 1 */
	uint32_t	end_block;	/* last block to transfer */
	uint32_t	checkpoint_blocks;	/* blocks before checkpoint */
	int		image_fd;	/* input image file descriptor */
	int		cloop2_fd;	/* output cloop2 file descriptor */
	struct offtab	offtab;		/* block offset table (see offtab.h) */
	uint32_t	n_checkpointed_blocks;	/* blkno at last checkpoint */
	volatile sig_atomic_t
			initialized;	/* everything above initialized?  */
};
82
/*
 * Global compression state for SIGINFO handler.  Also read by the
 * SIGUSR2 checkpoint handler; see compress_state above for the
 * synchronization rules.
 */
static struct compress_state	global_state;
85
/*
 * Pairing of a signal number with its printable name, used for the
 * sigaction(2) error message in init_signal_handler.
 */
struct sigdesc {
	int sd_signo;
	const char *sd_name;
};

/* Signals that trigger a progress report to stderr.  */
static const struct sigdesc info_signals[] = {
	{ SIGINFO, "SIGINFO" },
	{ SIGUSR1, "SIGUSR1" },
};

/* Signals that trigger an on-demand checkpoint of the offset table.  */
static const struct sigdesc checkpoint_signals[] = {
	{ SIGUSR2, "SIGUSR2" },
};
99
/* Forward declarations for the static helpers defined below.  */
static void	init_signals(void);
static void	init_signal_handler(int, const struct sigdesc *, size_t,
		    void (*)(int));
static void	info_signal_handler(int);
static void	checkpoint_signal_handler(int);
static void	compress_progress(struct compress_state *);
static void	compress_init(int, char **, const struct options *,
		    struct compress_state *);
static bool	compress_restart(struct compress_state *);
static uint32_t	compress_block(int, int, uint32_t, uint32_t, uint32_t, void *,
		    void *);
static void	compress_maybe_checkpoint(struct compress_state *);
static void	compress_checkpoint(struct compress_state *);
static void	compress_exit(struct compress_state *);
114
115/*
116 * Compression entry point.
117 */
int
vndcompress(int argc, char **argv, const struct options *O)
{
	struct compress_state *const S = &global_state;

	/*
	 * argv[0] names the input image, argv[1] the output cloop2
	 * file; an optional argv[2] gives the block size.  The actual
	 * argument parsing happens in compress_init.
	 */

	/* Paranoia.  The other fields either have no sentinel or use zero.  */
	S->image_fd = -1;
	S->cloop2_fd = -1;

	/* Set up signal handlers so we can handle SIGINFO ASAP.  */
	init_signals();

	/*
	 * Parse the arguments to initialize our state.
	 */
	compress_init(argc, argv, O, S);
	assert(MIN_BLOCKSIZE <= S->blocksize);
	assert(S->blocksize <= MAX_BLOCKSIZE);

	/*
	 * Allocate compression buffers.
	 *
	 * Compression may actually expand.  From an overabundance of
	 * caution, assume it can expand by at most double.
	 *
	 * XXX Check and consider tightening this assumption.
	 */
	__CTASSERT(MAX_BLOCKSIZE <= SIZE_MAX);
	void *const uncompbuf = malloc(S->blocksize);
	if (uncompbuf == NULL)
		err(1, "malloc uncompressed buffer");

	/* XXX compression ratio bound */
	__CTASSERT(MUL_OK(size_t, 2, MAX_BLOCKSIZE));
	void *const compbuf = malloc(2 * (size_t)S->blocksize);
	if (compbuf == NULL)
		err(1, "malloc compressed buffer");

	/*
	 * Compress the blocks.  S->blkno specifies the input block
	 * we're about to transfer.  S->offset is the current output
	 * offset.
	 */
	while (S->blkno < S->n_blocks) {
		/* Report any progress.  */
		compress_progress(S);

		/* Stop if we've done the requested partial transfer.  */
		if ((0 < S->end_block) && (S->end_block <= S->blkno))
			goto out;

		/* Checkpoint if appropriate.  */
		compress_maybe_checkpoint(S);
		offtab_prepare_put(&S->offtab, (S->blkno + 1));

		/* Choose read size: partial if last block, full if not.  */
		const uint32_t readsize = (S->blkno == S->n_full_blocks?
		    (S->size % S->blocksize) : S->blocksize);
		assert(readsize > 0);
		assert(readsize <= S->blocksize);

		/* Fail noisily if we might be about to overflow.  */
		/* XXX compression ratio bound */
		__CTASSERT(MUL_OK(uint64_t, 2, MAX_BLOCKSIZE));
		__CTASSERT(MUL_OK(off_t, 2, MAX_BLOCKSIZE));
		assert(S->offset <= MIN(UINT64_MAX, OFF_MAX));
		if (!ADD_OK(uint64_t, S->offset, 2*(uintmax_t)readsize) ||
		    !ADD_OK(off_t, S->offset, 2*(uintmax_t)readsize))
			errx(1, "blkno %"PRIu32" may overflow: %ju + 2*%ju",
			    S->blkno, (uintmax_t)S->offset,
			    (uintmax_t)readsize);

		/* Process the block.  */
		const uint32_t complen =
		    compress_block(S->image_fd, S->cloop2_fd, S->blkno,
			S->blocksize, readsize, uncompbuf, compbuf);

		/*
		 * Signal-atomically update the state to reflect
		 * (a) what block number we are now at,
		 * (b) how far we are now in the output file, and
		 * (c) where the last block ended.
		 *
		 * The half-indented braces mark a critical section
		 * with the info/checkpoint signals blocked, so the
		 * handlers never observe a partially updated S.
		 */
		assert(ADD_OK(uint32_t, S->blkno, 1));
		assert(ADD_OK(uint64_t, S->offset, complen));
		assert(ADD_OK(off_t, (off_t)S->offset, (off_t)complen));
		assert((S->blkno + 1) < S->n_offsets);
	    {
		sigset_t old_sigmask;
		block_signals(&old_sigmask);
		S->blkno += 1;					/* (a) */
		S->offset += complen;				/* (b) */
		offtab_put(&S->offtab, S->blkno, S->offset);	/* (c) */
		restore_sigmask(&old_sigmask);
	    }
	}

	/* Make sure we're all done. */
	assert(S->blkno == S->n_blocks);
	assert((S->blkno + 1) == S->n_offsets);

	/* Pad to the disk block size.  */
	const uint32_t n_extra = (S->offset % DEV_BSIZE);
	if (n_extra != 0) {
		const uint32_t n_padding = (DEV_BSIZE - n_extra);
		/* Reuse compbuf -- guaranteed to be large enough.  */
		(void)memset(compbuf, 0, n_padding);
		const ssize_t n_written = write(S->cloop2_fd, compbuf,
		    n_padding);
		if (n_written == -1)
			err(1, "write final padding failed");
		assert(n_written >= 0);
		if ((size_t)n_written != n_padding)
			errx(1, "partial write of final padding bytes"
			    ": %zu != %"PRIu32,
			    (size_t)n_written, n_padding);

		/* Account for the extra bytes in the output file.  */
		assert(ADD_OK(uint64_t, S->offset, n_padding));
		assert(ADD_OK(off_t, (off_t)S->offset, (off_t)n_padding));
	    {
		sigset_t old_sigmask;
		block_signals(&old_sigmask);
		S->offset += n_padding;
		restore_sigmask(&old_sigmask);
	    }
	}

out:
	/* One last checkpoint to commit the offset table.  */
	assert(S->offset <= OFF_MAX);
	assert((off_t)S->offset == lseek(S->cloop2_fd, 0, SEEK_CUR));
	compress_checkpoint(S);

	/*
	 * Free the compression buffers and finalize the compression.
	 */
	free(compbuf);
	free(uncompbuf);
	compress_exit(S);

	return 0;
}
261
262/*
263 * Signal cruft.
264 */
265
266static void
267init_signals(void)
268{
269
270	init_signal_handler(SA_RESTART, info_signals,
271	    __arraycount(info_signals), &info_signal_handler);
272	init_signal_handler(SA_RESTART, checkpoint_signals,
273	    __arraycount(checkpoint_signals), &checkpoint_signal_handler);
274}
275
276static void
277init_signal_handler(int flags, const struct sigdesc *signals, size_t n,
278    void (*handler)(int))
279{
280	static const struct sigaction zero_sa;
281	struct sigaction sa = zero_sa;
282	size_t i;
283
284	(void)sigemptyset(&sa.sa_mask);
285	for (i = 0; i < n; i++)
286		(void)sigaddset(&sa.sa_mask, signals[i].sd_signo);
287	sa.sa_flags = flags;
288	sa.sa_handler = handler;
289	for (i = 0; i < n; i++)
290		if (sigaction(signals[i].sd_signo, &sa, NULL) == -1)
291			err(1, "sigaction(%s)", signals[i].sd_name);
292}
293
294static void
295info_signal_handler(int signo __unused)
296{
297	/* Save errno.  */
298	const int error = errno;
299	struct compress_state *const S = &global_state;
300	char buf[128];
301
302	/* Bail if the state is not yet initialized.  */
303	if (!S->initialized) {
304		warnx_ss("initializing");
305		goto out;
306	}
307
308	/* Carefully calculate our I/O position.  */
309	assert(S->blocksize > 0);
310	__CTASSERT(MUL_OK(uint64_t, MAX_N_BLOCKS, MAX_BLOCKSIZE));
311	const uint64_t nread = ((uint64_t)S->blkno * (uint64_t)S->blocksize);
312
313	assert(S->n_blocks > 0);
314	__CTASSERT(MUL_OK(uint64_t, MAX_N_BLOCKS, sizeof(uint64_t)));
315	__CTASSERT(ADD_OK(uint64_t, CLOOP2_OFFSET_TABLE_OFFSET,
316		MAX_N_BLOCKS*sizeof(uint64_t)));
317	const uint64_t nwritten = (S->offset <= (CLOOP2_OFFSET_TABLE_OFFSET +
318		((uint64_t)S->n_blocks * sizeof(uint64_t)))?
319	    0 : S->offset);
320
321	/* snprintf_ss can't do floating-point, so do fixed-point instead.  */
322	const uint64_t ratio_percent =
323	    (nread > 0?
324		((nwritten >= (UINT64_MAX / 100)) ?
325		    ((nwritten / nread) * 100) : ((nwritten * 100) / nread))
326		: 0);
327
328	/* Format the status.  */
329	assert(S->n_checkpointed_blocks <= MAX_N_BLOCKS);
330	assert(S->blocksize <= MAX_BLOCKSIZE);
331	__CTASSERT(MUL_OK(uint64_t, MAX_N_BLOCKS, MAX_BLOCKSIZE));
332	const int n = snprintf_ss(buf, sizeof(buf),
333	    "vndcompress: read %"PRIu64" bytes, wrote %"PRIu64" bytes, "
334	    "compression ratio %"PRIu64"%% (checkpointed %"PRIu64" bytes)\n",
335	    nread, nwritten, ratio_percent,
336	    ((uint64_t)S->n_checkpointed_blocks * (uint64_t)S->blocksize));
337	if (n < 0) {
338		const char msg[] = "vndcompress: can't format info\n";
339		(void)write(STDERR_FILENO, msg, __arraycount(msg));
340	} else {
341		__CTASSERT(INT_MAX <= SIZE_MAX);
342		(void)write(STDERR_FILENO, buf, (size_t)n);
343	}
344
345out:
346	/* Restore errno.  */
347	errno = error;
348}
349
350static void
351checkpoint_signal_handler(int signo __unused)
352{
353	/* Save errno.  */
354	const int error = errno;
355	struct compress_state *const S = &global_state;
356
357	/* Bail if the state is not yet initialized.  */
358	if (!S->initialized) {
359		warnx_ss("nothing to checkpoint yet");
360		goto out;
361	}
362
363	assert(S->image_fd >= 0);
364	assert(S->cloop2_fd >= 0);
365
366	/* Take a checkpoint.  */
367	assert(S->blkno <= MAX_N_BLOCKS);
368	assert(S->blocksize <= MAX_BLOCKSIZE);
369	__CTASSERT(MUL_OK(uint64_t, MAX_N_BLOCKS, MAX_BLOCKSIZE));
370	warnx_ss("checkpointing %"PRIu64" bytes",
371	    ((uint64_t)S->blkno * (uint64_t)S->blocksize));
372	compress_checkpoint(S);
373
374out:
375	/* Restore errno.  */
376	errno = error;
377}
378
379/*
380 * Report progress.
381 *
382 * XXX Should do a progress bar here.
383 */
static void
compress_progress(struct compress_state *S __unused)
{
	/* Deliberately empty: see the XXX above about a progress bar.  */
}
388
389/*
390 * Parse arguments, open the files, and initialize the state.
391 */
static void
compress_init(int argc, char **argv, const struct options *O,
    struct compress_state *S)
{
	/*
	 * argv[0] is the input image pathname, argv[1] the output
	 * cloop2 pathname, optional argv[2] the block size.  Exits
	 * (via err/errx/usage) on any failure before the restart
	 * logic; afterwards failures either exit or fall back to a
	 * fresh start, depending on the -R flag.
	 */

	if (!((argc == 2) || (argc == 3)))
		usage();

	const char *const image_pathname = argv[0];
	const char *const cloop2_pathname = argv[1];

	/* Grab the block size either from `-b' or from the last argument.  */
	__CTASSERT(0 < DEV_BSIZE);
	__CTASSERT((MIN_BLOCKSIZE % DEV_BSIZE) == 0);
	__CTASSERT(MIN_BLOCKSIZE <= DEF_BLOCKSIZE);
	__CTASSERT((DEF_BLOCKSIZE % DEV_BSIZE) == 0);
	__CTASSERT(DEF_BLOCKSIZE <= MAX_BLOCKSIZE);
	__CTASSERT((MAX_BLOCKSIZE % DEV_BSIZE) == 0);
	if (ISSET(O->flags, FLAG_b)) {
		if (argc == 3) {
			warnx("use -b or the extra argument, not both");
			usage();
		}
		S->blocksize = O->blocksize;
	} else {
		S->blocksize = (argc == 2? DEF_BLOCKSIZE :
		    strsuftoll("block size", argv[2], MIN_BLOCKSIZE,
			MAX_BLOCKSIZE));
	}

	/* Sanity-check the blocksize.  (strsuftoll guarantees bounds.)  */
	__CTASSERT(DEV_BSIZE <= UINT32_MAX);
	if ((S->blocksize % DEV_BSIZE) != 0)
		errx(1, "bad blocksize: %"PRIu32
		    " (not a multiple of %"PRIu32")",
		    S->blocksize, (uint32_t)DEV_BSIZE);
	assert(MIN_BLOCKSIZE <= S->blocksize);
	assert((S->blocksize % DEV_BSIZE) == 0);
	assert(S->blocksize <= MAX_BLOCKSIZE);

	/* Grab the end block number if we have one.  */
	S->end_block = (ISSET(O->flags, FLAG_p)? O->end_block : 0);

	/* Grab the checkpoint block count, if we have one.  */
	S->checkpoint_blocks =
	    (ISSET(O->flags, FLAG_k)? O->checkpoint_blocks : 0);

	/* Open the input image file and the output cloop2 file.  */
	S->image_fd = open(image_pathname, O_RDONLY);
	if (S->image_fd == -1)
		err(1, "open(%s)", image_pathname);

	/*
	 * Without -r, truncate and (re)create the output.  With -r
	 * (restart), keep existing contents.  With -r -R, also drop
	 * O_CREAT: the output must already exist, since -R forbids
	 * falling back to a fresh start below.
	 */
	int oflags;
	if (!ISSET(O->flags, FLAG_r))
		oflags = (O_WRONLY | O_TRUNC | O_CREAT);
	else if (!ISSET(O->flags, FLAG_R))
		oflags = (O_RDWR | O_CREAT);
	else
		oflags = O_RDWR;
	S->cloop2_fd = open(cloop2_pathname, oflags, 0777);
	if (S->cloop2_fd == -1)
		err(1, "open(%s)", cloop2_pathname);

	/* Find the size of the input image.  */
	if (ISSET(O->flags, FLAG_l)) {
		S->size = O->length;
	} else {
		static const struct stat zero_st;
		struct stat st = zero_st;
		if (fstat(S->image_fd, &st) == -1)
			err(1, "stat(%s)", image_pathname);
		if (st.st_size <= 0)
			errx(1, "unknown image size");
		assert(st.st_size >= 0);
		__CTASSERT(OFF_MAX <= UINT64_MAX);
		assert(__type_fit(uint64_t, st.st_size));
		S->size = st.st_size;
	}
	assert(S->size <= OFF_MAX);

	/* Find number of full blocks and whether there's a partial block.  */
	__CTASSERT(0 < MIN_BLOCKSIZE);
	assert(0 < S->blocksize);
	if (TOOMANY(off_t, (off_t)S->size, (off_t)S->blocksize,
		(off_t)MAX_N_BLOCKS))
		errx(1, "image too large for block size %"PRIu32": %"PRIu64,
		    S->blocksize, S->size);
	__CTASSERT(MAX_N_BLOCKS <= UINT32_MAX);
	S->n_full_blocks = S->size/S->blocksize;
	S->n_blocks = HOWMANY(S->size, S->blocksize);
	assert(S->n_full_blocks <= S->n_blocks);
	assert(S->n_blocks <= MAX_N_BLOCKS);

	/* Choose a window size.  */
	const uint32_t window_size = (ISSET(O->flags, FLAG_w)? O->window_size :
	    DEF_WINDOW_SIZE);

	/* Create an offset table for the blocks; one extra for the end.  */
	__CTASSERT(ADD_OK(uint32_t, MAX_N_BLOCKS, 1));
	S->n_offsets = (S->n_blocks + 1);
	__CTASSERT(MAX_N_OFFSETS == (MAX_N_BLOCKS + 1));
	__CTASSERT(MUL_OK(size_t, MAX_N_OFFSETS, sizeof(uint64_t)));
	__CTASSERT(CLOOP2_OFFSET_TABLE_OFFSET <= OFFTAB_MAX_FDPOS);
	offtab_init(&S->offtab, S->n_offsets, window_size, S->cloop2_fd,
	    CLOOP2_OFFSET_TABLE_OFFSET);

	/* Attempt to restart a partial transfer if requested.  */
	if (ISSET(O->flags, FLAG_r)) {
		if (compress_restart(S)) {
			/*
			 * Restart succeeded.  Truncate the output
			 * here, in case any garbage got appended.  We
			 * are committed to making progress at this
			 * point.  If the ftruncate fails, we don't
			 * lose anything valuable -- this is the last
			 * point at which we can restart anyway.
			 */
			if (ftruncate(S->cloop2_fd, S->offset) == -1)
				err(1, "ftruncate failed");

			/* All set!  No more initialization to do.  */
			return;
		} else {
			/* Restart failed.  Barf now if requested.  */
			if (ISSET(O->flags, FLAG_R))
				errx(1, "restart failed, aborting");

			/* Otherwise, truncate and start at the top.  */
			if (ftruncate(S->cloop2_fd, 0) == -1)
				err(1, "truncate failed");
			if (lseek(S->cloop2_fd, 0, SEEK_SET) == -1)
				err(1, "lseek to cloop2 beginning failed");

			/* If we seeked in the input, rewind.  */
			if (S->blkno != 0) {
				if (lseek(S->image_fd, 0, SEEK_SET) == -1)
					err(1,
					    "lseek to image beginning failed");
			}
		}
	}

	/* Write a bogus (zero) header for now, until we checkpoint.  */
	static const struct cloop2_header zero_header;
	const ssize_t h_written = write(S->cloop2_fd, &zero_header,
	    sizeof(zero_header));
	if (h_written == -1)
		err(1, "write header");
	assert(h_written >= 0);
	if ((size_t)h_written != sizeof(zero_header))
		errx(1, "partial write of header: %zu != %zu",
		    (size_t)h_written, sizeof(zero_header));

	/* Reset the offset table to be empty and write it.  */
	offtab_reset_write(&S->offtab);

	/* Start at the beginning of the image.  */
	S->blkno = 0;
	S->offset = (sizeof(struct cloop2_header) +
	    ((uint64_t)S->n_offsets * sizeof(uint64_t)));
	S->n_checkpointed_blocks = 0;

	/* Good to go and ready for interruption by a signal.  */
	S->initialized = 1;
}
557
558/*
559 * Try to recover state from an existing output file.
560 *
561 * On success, fill the offset table with what's in the file, set
562 * S->blkno and S->offset to reflect our position, and seek to the
563 * respective positions in the input and output files.
564 *
565 * On failure, return false.  May clobber the offset table, S->blkno,
566 * S->offset, and the file pointers.
567 */
static bool
compress_restart(struct compress_state *S)
{

	/* Read in the header.  */
	static const struct cloop2_header zero_header;
	struct cloop2_header header = zero_header;

	const ssize_t h_read = read_block(S->cloop2_fd, &header,
	    sizeof(header));
	if (h_read == -1) {
		warn("failed to read header");
		return false;
	}
	assert(h_read >= 0);
	if ((size_t)h_read != sizeof(header)) {
		warnx("partial read of header");
		return false;
	}

	/* Check that the header looks like a header.  */
	__CTASSERT(sizeof(cloop2_magic) <= sizeof(header.cl2h_magic));
	if (memcmp(header.cl2h_magic, cloop2_magic, sizeof(cloop2_magic))
	    != 0) {
		warnx("bad cloop2 shell script magic");
		return false;
	}

	/* Check the header parameters.  */
	if (be32toh(header.cl2h_blocksize) != S->blocksize) {
		warnx("mismatched block size: %"PRIu32
		    " (expected %"PRIu32")",
		    be32toh(header.cl2h_blocksize), S->blocksize);
		return false;
	}
	if (be32toh(header.cl2h_n_blocks) != S->n_blocks) {
		warnx("mismatched number of blocks: %"PRIu32
		    " (expected %"PRIu32")",
		    be32toh(header.cl2h_n_blocks), S->n_blocks);
		return false;
	}

	/* Read in the partial offset table.  */
	if (!offtab_reset_read(&S->offtab, &warn, &warnx))
		return false;
	if (!offtab_prepare_get(&S->offtab, 0))
		return false;
	const uint64_t first_offset = offtab_get(&S->offtab, 0);
	__CTASSERT(MUL_OK(uint64_t, MAX_N_OFFSETS, sizeof(uint64_t)));
	__CTASSERT(ADD_OK(uint64_t, sizeof(struct cloop2_header),
		MAX_N_OFFSETS*sizeof(uint64_t)));
	/* Block 0 must begin right after the header and offset table.  */
	const uint64_t expected = sizeof(struct cloop2_header) +
	    ((uint64_t)S->n_offsets * sizeof(uint64_t));
	if (first_offset != expected) {
		warnx("first offset is not 0x%"PRIx64": 0x%"PRIx64,
		    expected, first_offset);
		return false;
	}

	/*
	 * Find where we left off.  An all-ones (~0) entry marks the
	 * first slot that was never written; each written entry must
	 * strictly increase and stay within the 2x compression bound.
	 */
	__CTASSERT(MAX_N_OFFSETS <= UINT32_MAX);
	uint32_t blkno = 0;
	uint64_t last_offset = first_offset;
	for (blkno = 0; blkno < S->n_blocks; blkno++) {
		if (!offtab_prepare_get(&S->offtab, blkno))
			return false;
		const uint64_t offset = offtab_get(&S->offtab, blkno);
		if (offset == ~(uint64_t)0)
			break;

		if (0 < blkno) {
			const uint64_t start = last_offset;
			const uint64_t end = offset;
			if (end <= start) {
				warnx("bad offset table: 0x%"PRIx64
				    ", 0x%"PRIx64, start, end);
				return false;
			}
			/* XXX compression ratio bound */
			__CTASSERT(MUL_OK(size_t, 2, MAX_BLOCKSIZE));
			if ((2 * (size_t)S->blocksize) <= (end - start)) {
				warnx("block %"PRIu32" too large:"
				    " %"PRIu64" bytes"
				    " from 0x%"PRIx64" to 0x%"PRIx64,
				    blkno, (end - start), start, end);
				return false;
			}
		}

		last_offset = offset;
	}

	if (blkno == 0) {
		warnx("no blocks were written; nothing to restart");
		return false;
	}

	/* Make sure the rest of the offset table is all ones.  */
	if (blkno < S->n_blocks) {
		uint32_t nblkno;

		for (nblkno = blkno; nblkno < S->n_blocks; nblkno++) {
			if (!offtab_prepare_get(&S->offtab, nblkno))
				return false;
			const uint64_t offset = offtab_get(&S->offtab, nblkno);
			if (offset != ~(uint64_t)0) {
				warnx("bad partial offset table entry"
				    " at %"PRIu32": 0x%"PRIx64,
				    nblkno, offset);
				return false;
			}
		}
	}

	/*
	 * XXX Consider decompressing some number of blocks to make
	 * sure they match.
	 */

	/* Back up by one.  */
	assert(1 <= blkno);
	blkno -= 1;

	/* Seek to the output position.  */
	assert(last_offset <= OFF_MAX);
	if (lseek(S->cloop2_fd, last_offset, SEEK_SET) == -1) {
		warn("lseek output cloop2 to %"PRIx64" failed", last_offset);
		return false;
	}

	/* Switch from reading to writing the offset table.  */
	if (!offtab_transmogrify_read_to_write(&S->offtab, blkno))
		return false;

	/*
	 * Seek to the input position last, after all other possible
	 * failures, because if the input is a pipe, we can't change
	 * our mind, rewind, and start at the beginning instead of
	 * restarting.
	 */
	assert(S->size <= OFF_MAX);
	assert(blkno <= (S->size / S->blocksize));
	const off_t restart_position = ((off_t)blkno * (off_t)S->blocksize);
	assert(0 <= restart_position);
	assert(restart_position <= (off_t)S->size);
	if (lseek(S->image_fd, restart_position, SEEK_SET) == -1) {
		if (errno != ESPIPE) {
			warn("lseek input image failed");
			return false;
		}

		/* Try read instead of lseek for a pipe/socket/fifo.  */
		void *const buffer = malloc(0x10000);
		if (buffer == NULL)
			err(1, "malloc temporary buffer");
		off_t left = restart_position;
		while (left > 0) {
			const size_t size = MIN(0x10000, left);
			const ssize_t n_read = read_block(S->image_fd, buffer,
			    size);
			if (n_read == -1) {
				free(buffer);
				warn("read of input image failed");
				return false;
			}
			assert(n_read >= 0);
			if ((size_t)n_read != size) {
				free(buffer);
				warnx("partial read of input image");
				return false;
			}
			assert((off_t)size <= left);
			left -= size;
		}
		free(buffer);
	}

	/* Start where we left off.  */
	S->blkno = blkno;
	S->offset = last_offset;
	S->n_checkpointed_blocks = blkno;

	/* Good to go and ready for interruption by a signal.  */
	S->initialized = 1;

	/* Success!  */
	return true;
}
756
757/*
758 * Read a single block, compress it, and write the compressed block.
759 * Return the size of the compressed block.
760 */
static uint32_t
compress_block(int in_fd, int out_fd, uint32_t blkno, uint32_t blocksize,
    uint32_t readsize, void *uncompbuf, void *compbuf)
{

	assert(readsize <= blocksize);
	assert(blocksize <= MAX_BLOCKSIZE);

	/* Read the uncompressed block.  */
	const ssize_t n_read = read_block(in_fd, uncompbuf, readsize);
	if (n_read == -1)
		err(1, "read block %"PRIu32, blkno);
	assert(n_read >= 0);
	if ((size_t)n_read != readsize)
		errx(1, "partial read of block %"PRIu32": %zu != %"PRIu32,
		    blkno, (size_t)n_read, readsize);

	/* Compress the block.  */
	/* XXX compression ratio bound */
	__CTASSERT(MUL_OK(unsigned long, 2, MAX_BLOCKSIZE));
	/*
	 * In compat mode, compress the whole block buffer even for a
	 * partial final block, to match NetBSD's vndcompress output.
	 */
	const unsigned long uncomplen =
	    (VNDCOMPRESS_COMPAT? blocksize : readsize); /* XXX */
	/* complen is in/out for compress2: capacity in, actual out.  */
	unsigned long complen = (uncomplen * 2);
	const int zerror = compress2(compbuf, &complen, uncompbuf, uncomplen,
	    Z_BEST_COMPRESSION);
	if (zerror != Z_OK)
		errx(1, "compression failed at block %"PRIu32" (%d): %s",
		    blkno, zerror, zError(zerror));
	assert(complen <= (uncomplen * 2));

	/* Write the compressed block.  */
	const ssize_t n_written = write(out_fd, compbuf, complen);
	if (n_written == -1)
		err(1, "write block %"PRIu32, blkno);
	assert(n_written >= 0);
	if ((size_t)n_written != complen)
		errx(1, "partial write of block %"PRIu32": %zu != %lu",
		    blkno, (size_t)n_written, complen);

	return (size_t)n_written;
}
802
803/*
804 * Checkpoint if appropriate.
805 */
806static void
807compress_maybe_checkpoint(struct compress_state *S)
808{
809
810	if ((0 < S->checkpoint_blocks) && (0 < S->blkno) &&
811	    ((S->blkno % S->checkpoint_blocks) == 0)) {
812		assert(S->offset <= OFF_MAX);
813		assert((off_t)S->offset == lseek(S->cloop2_fd, 0, SEEK_CUR));
814		compress_checkpoint(S);
815	}
816}
817
818/*
819 * Write the prefix of the offset table that we have filled so far.
820 *
821 * We fsync the data blocks we have written, and then write the offset
822 * table, and then fsync the offset table and file metadata.  This
823 * should help to avoid offset tables that point at garbage data.
824 *
825 * This may be called from a signal handler, so it must not use stdio,
826 * malloc, &c. -- it may only (a) handle signal-safe state in S, and
827 * (b) do file descriptor I/O / fsync.
828 *
829 * XXX This requires further thought and heavy testing to be sure.
830 *
831 * XXX Should have an option to suppress fsync.
832 *
833 * XXX Should have an option to fail on fsync failures.
834 *
835 * XXX Would be nice if we could just do a barrier rather than an
836 * fsync.
837 *
838 * XXX How might we automatically test the fsyncs?
839 */
static void
compress_checkpoint(struct compress_state *S)
{

	assert(S->blkno < S->n_offsets);
	const uint32_t n_offsets = (S->blkno + 1);
	assert(n_offsets <= S->n_offsets);

	assert(S->offset <= OFF_MAX);
	assert((off_t)S->offset <= lseek(S->cloop2_fd, 0, SEEK_CUR));

	/* Make sure the data hits the disk before we say it's ready.  */
	if (fsync_range(S->cloop2_fd, (FFILESYNC | FDISKSYNC), 0, S->offset)
	    == -1)
		warn_ss("fsync of output failed");

	/* Say the data blocks are ready.  */
	offtab_checkpoint(&S->offtab, n_offsets,
	    (S->n_checkpointed_blocks == 0? OFFTAB_CHECKPOINT_SYNC : 0));

	/*
	 * If this is the first checkpoint, initialize the header.
	 * Signal handler can race with main code here, but it is
	 * harmless -- just an extra fsync and write of the header,
	 * which are both idempotent.
	 *
	 * Once we have synchronously checkpointed the offset table,
	 * subsequent writes will preserve a valid state.
	 */
	if (S->n_checkpointed_blocks == 0) {
		static const struct cloop2_header zero_header;
		struct cloop2_header header = zero_header;

		/* Format the header.  Fields are stored big-endian.  */
		__CTASSERT(sizeof(cloop2_magic) <= sizeof(header.cl2h_magic));
		(void)memcpy(header.cl2h_magic, cloop2_magic,
		    sizeof(cloop2_magic));
		header.cl2h_blocksize = htobe32(S->blocksize);
		header.cl2h_n_blocks = htobe32(S->n_blocks);

		/*
		 * Write the header.  pwrite at offset 0 leaves the
		 * current append position of cloop2_fd undisturbed.
		 */
		const ssize_t h_written = pwrite(S->cloop2_fd, &header,
		    sizeof(header), 0);
		if (h_written == -1)
			err_ss(1, "write header");
		assert(h_written >= 0);
		if ((size_t)h_written != sizeof(header))
			errx_ss(1, "partial write of header: %zu != %zu",
			    (size_t)h_written, sizeof(header));
	}

	/* Record how many blocks we've checkpointed.  */
    {
	sigset_t old_sigmask;
	block_signals(&old_sigmask);
	S->n_checkpointed_blocks = S->blkno;
	restore_sigmask(&old_sigmask);
    }
}
899
900/*
901 * Release everything we allocated in compress_init.
902 */
903static void
904compress_exit(struct compress_state *S)
905{
906
907	/* Done with the offset table.  Destroy it.  */
908	offtab_destroy(&S->offtab);
909
910	/* Done with the files.  Close them.  */
911	if (close(S->cloop2_fd) == -1)
912		warn("close(cloop2 fd)");
913	if (close(S->image_fd) == -1)
914		warn("close(image fd)");
915}
916