1/* gzjoin -- command to join gzip files into one gzip file
2
3  Copyright (C) 2004 Mark Adler, all rights reserved
4  version 1.0, 11 Dec 2004
5
6  This software is provided 'as-is', without any express or implied
7  warranty.  In no event will the author be held liable for any damages
8  arising from the use of this software.
9
10  Permission is granted to anyone to use this software for any purpose,
11  including commercial applications, and to alter it and redistribute it
12  freely, subject to the following restrictions:
13
14  1. The origin of this software must not be misrepresented; you must not
15     claim that you wrote the original software. If you use this software
16     in a product, an acknowledgment in the product documentation would be
17     appreciated but is not required.
18  2. Altered source versions must be plainly marked as such, and must not be
19     misrepresented as being the original software.
20  3. This notice may not be removed or altered from any source distribution.
21
22  Mark Adler    madler@alumni.caltech.edu
23 */
24
25/*
26 * Change history:
27 *
28 * 1.0  11 Dec 2004     - First version
29 * 1.1  12 Jun 2005     - Changed ssize_t to long for portability
30 */
31
32/*
33   gzjoin takes one or more gzip files on the command line and writes out a
34   single gzip file that will uncompress to the concatenation of the
35   uncompressed data from the individual gzip files.  gzjoin does this without
36   having to recompress any of the data and without having to calculate a new
37   crc32 for the concatenated uncompressed data.  gzjoin does however have to
38   decompress all of the input data in order to find the bits in the compressed
39   data that need to be modified to concatenate the streams.
40
41   gzjoin does not do an integrity check on the input gzip files other than
42   checking the gzip header and decompressing the compressed data.  They are
43   otherwise assumed to be complete and correct.
44
45   Each joint between gzip files removes at least 18 bytes of previous trailer
46   and subsequent header, and inserts an average of about three bytes to the
47   compressed data in order to connect the streams.  The output gzip file
48   has a minimal ten-byte gzip header with no file name or modification time.
49
50   This program was written to illustrate the use of the Z_BLOCK option of
51   inflate() and the crc32_combine() function.  gzjoin will not compile with
52   versions of zlib earlier than 1.2.3.
53 */
54
55#include <stdio.h>      /* fputs(), fprintf(), fwrite(), putc() */
56#include <stdlib.h>     /* exit(), malloc(), free() */
57#include <fcntl.h>      /* open() */
58#include <unistd.h>     /* close(), read(), lseek() */
59#include "zlib.h"
60    /* crc32(), crc32_combine(), inflateInit2(), inflate(), inflateEnd() */
61
#define local static

/* Report a fatal error on stderr and terminate the process with status 1.
   The message is why1 immediately followed by why2.  The int return type
   exists only so that a call can be used inside an expression (as the bget()
   macro does); control never actually returns to the caller. */
local int bail(char *why1, char *why2)
{
    fputs("gzjoin error: ", stderr);
    fputs(why1, stderr);
    fputs(why2, stderr);
    fputs(", output incomplete\n", stderr);
    exit(1);
    return 0;               /* not reached -- satisfies the return type */
}
71
72/* -- simple buffered file input with access to the buffer -- */
73
/* Size in bytes of every input buffer (and of the scratch output buffer used
   while inflating) -- must be a power of two and fit in an unsigned */
#define CHUNK 32768

/* bin: state of one buffered input file.  The unread data is the left bytes
   starting at next, which always points into buf. */
typedef struct {
    char *name;             /* file name, kept only for error messages */
    int fd;                 /* descriptor returned by open() */
    unsigned left;          /* number of unread bytes available at next */
    unsigned char *next;    /* first unread byte within buf */
    unsigned char *buf;     /* malloc'ed buffer of CHUNK bytes */
} bin;
84
85/* close a buffered file and free allocated memory */
86local void bclose(bin *in)
87{
88    if (in != NULL) {
89        if (in->fd != -1)
90            close(in->fd);
91        if (in->buf != NULL)
92            free(in->buf);
93        free(in);
94    }
95}
96
97/* open a buffered file for input, return a pointer to type bin, or NULL on
98   failure */
99local bin *bopen(char *name)
100{
101    bin *in;
102
103    in = malloc(sizeof(bin));
104    if (in == NULL)
105        return NULL;
106    in->buf = malloc(CHUNK);
107    in->fd = open(name, O_RDONLY, 0);
108    if (in->buf == NULL || in->fd == -1) {
109        bclose(in);
110        return NULL;
111    }
112    in->left = 0;
113    in->next = in->buf;
114    in->name = name;
115    return in;
116}
117
118/* load buffer from file, return -1 on read error, 0 or 1 on success, with
119   1 indicating that end-of-file was reached */
120local int bload(bin *in)
121{
122    long len;
123
124    if (in == NULL)
125        return -1;
126    if (in->left != 0)
127        return 0;
128    in->next = in->buf;
129    do {
130        len = (long)read(in->fd, in->buf + in->left, CHUNK - in->left);
131        if (len < 0)
132            return -1;
133        in->left += (unsigned)len;
134    } while (len != 0 && in->left < CHUNK);
135    return len == 0 ? 1 : 0;
136}
137
/* Get the next byte (0..255, as an int) from the buffered file in, loading
   more input first if the buffer is empty.  Bails if no byte is available.
   The argument is parenthesized at every use so that any pointer expression
   expands correctly, but it is evaluated more than once, so it must not have
   side effects. */
#define bget(in) ((in)->left ? 0 : bload(in), \
                  (in)->left ? ((in)->left--, *((in)->next)++) : \
                    bail("unexpected end of file on ", (in)->name))
142
143/* get a four-byte little-endian unsigned integer from file */
144local unsigned long bget4(bin *in)
145{
146    unsigned long val;
147
148    val = bget(in);
149    val += (unsigned long)(bget(in)) << 8;
150    val += (unsigned long)(bget(in)) << 16;
151    val += (unsigned long)(bget(in)) << 24;
152    return val;
153}
154
155/* skip bytes in file */
156local void bskip(bin *in, unsigned skip)
157{
158    /* check pointer */
159    if (in == NULL)
160        return;
161
162    /* easy case -- skip bytes in buffer */
163    if (skip <= in->left) {
164        in->left -= skip;
165        in->next += skip;
166        return;
167    }
168
169    /* skip what's in buffer, discard buffer contents */
170    skip -= in->left;
171    in->left = 0;
172
173    /* seek past multiples of CHUNK bytes */
174    if (skip > CHUNK) {
175        unsigned left;
176
177        left = skip & (CHUNK - 1);
178        if (left == 0) {
179            /* exact number of chunks: seek all the way minus one byte to check
180               for end-of-file with a read */
181            lseek(in->fd, skip - 1, SEEK_CUR);
182            if (read(in->fd, in->buf, 1) != 1)
183                bail("unexpected end of file on ", in->name);
184            return;
185        }
186
187        /* skip the integral chunks, update skip with remainder */
188        lseek(in->fd, skip - left, SEEK_CUR);
189        skip = left;
190    }
191
192    /* read more input and skip remainder */
193    bload(in);
194    if (skip > in->left)
195        bail("unexpected end of file on ", in->name);
196    in->left -= skip;
197    in->next += skip;
198}
199
200/* -- end of buffered input functions -- */
201
202/* skip the gzip header from file in */
203local void gzhead(bin *in)
204{
205    int flags;
206
207    /* verify gzip magic header and compression method */
208    if (bget(in) != 0x1f || bget(in) != 0x8b || bget(in) != 8)
209        bail(in->name, " is not a valid gzip file");
210
211    /* get and verify flags */
212    flags = bget(in);
213    if ((flags & 0xe0) != 0)
214        bail("unknown reserved bits set in ", in->name);
215
216    /* skip modification time, extra flags, and os */
217    bskip(in, 6);
218
219    /* skip extra field if present */
220    if (flags & 4) {
221        unsigned len;
222
223        len = bget(in);
224        len += (unsigned)(bget(in)) << 8;
225        bskip(in, len);
226    }
227
228    /* skip file name if present */
229    if (flags & 8)
230        while (bget(in) != 0)
231            ;
232
233    /* skip comment if present */
234    if (flags & 16)
235        while (bget(in) != 0)
236            ;
237
238    /* skip header crc if present */
239    if (flags & 2)
240        bskip(in, 2);
241}
242
243/* write a four-byte little-endian unsigned integer to out */
244local void put4(unsigned long val, FILE *out)
245{
246    putc(val & 0xff, out);
247    putc((val >> 8) & 0xff, out);
248    putc((val >> 16) & 0xff, out);
249    putc((val >> 24) & 0xff, out);
250}
251
252/* Load up zlib stream from buffered input, bail if end of file */
253local void zpull(z_streamp strm, bin *in)
254{
255    if (in->left == 0)
256        bload(in);
257    if (in->left == 0)
258        bail("unexpected end of file on ", in->name);
259    strm->avail_in = in->left;
260    strm->next_in = in->next;
261}
262
263/* Write header for gzip file to out and initialize trailer. */
264local void gzinit(unsigned long *crc, unsigned long *tot, FILE *out)
265{
266    fwrite("\x1f\x8b\x08\0\0\0\0\0\0\xff", 1, 10, out);
267    *crc = crc32(0L, Z_NULL, 0);
268    *tot = 0;
269}
270
271/* Copy the compressed data from name, zeroing the last block bit of the last
272   block if clr is true, and adding empty blocks as needed to get to a byte
273   boundary.  If clr is false, then the last block becomes the last block of
274   the output, and the gzip trailer is written.  crc and tot maintains the
275   crc and length (modulo 2^32) of the output for the trailer.  The resulting
276   gzip file is written to out.  gzinit() must be called before the first call
277   of gzcopy() to write the gzip header and to initialize crc and tot. */
278local void gzcopy(char *name, int clr, unsigned long *crc, unsigned long *tot,
279                  FILE *out)
280{
281    int ret;                /* return value from zlib functions */
282    int pos;                /* where the "last block" bit is in byte */
283    int last;               /* true if processing the last block */
284    bin *in;                /* buffered input file */
285    unsigned char *start;   /* start of compressed data in buffer */
286    unsigned char *junk;    /* buffer for uncompressed data -- discarded */
287    z_off_t len;            /* length of uncompressed data (support > 4 GB) */
288    z_stream strm;          /* zlib inflate stream */
289
290    /* open gzip file and skip header */
291    in = bopen(name);
292    if (in == NULL)
293        bail("could not open ", name);
294    gzhead(in);
295
296    /* allocate buffer for uncompressed data and initialize raw inflate
297       stream */
298    junk = malloc(CHUNK);
299    strm.zalloc = Z_NULL;
300    strm.zfree = Z_NULL;
301    strm.opaque = Z_NULL;
302    strm.avail_in = 0;
303    strm.next_in = Z_NULL;
304    ret = inflateInit2(&strm, -15);
305    if (junk == NULL || ret != Z_OK)
306        bail("out of memory", "");
307
308    /* inflate and copy compressed data, clear last-block bit if requested */
309    len = 0;
310    zpull(&strm, in);
311    start = strm.next_in;
312    last = start[0] & 1;
313    if (last && clr)
314        start[0] &= ~1;
315    strm.avail_out = 0;
316    for (;;) {
317        /* if input used and output done, write used input and get more */
318        if (strm.avail_in == 0 && strm.avail_out != 0) {
319            fwrite(start, 1, strm.next_in - start, out);
320            start = in->buf;
321            in->left = 0;
322            zpull(&strm, in);
323        }
324
325        /* decompress -- return early when end-of-block reached */
326        strm.avail_out = CHUNK;
327        strm.next_out = junk;
328        ret = inflate(&strm, Z_BLOCK);
329        switch (ret) {
330        case Z_MEM_ERROR:
331            bail("out of memory", "");
332        case Z_DATA_ERROR:
333            bail("invalid compressed data in ", in->name);
334        }
335
336        /* update length of uncompressed data */
337        len += CHUNK - strm.avail_out;
338
339        /* check for block boundary (only get this when block copied out) */
340        if (strm.data_type & 128) {
341            /* if that was the last block, then done */
342            if (last)
343                break;
344
345            /* number of unused bits in last byte */
346            pos = strm.data_type & 7;
347
348            /* find the next last-block bit */
349            if (pos != 0) {
350                /* next last-block bit is in last used byte */
351                pos = 0x100 >> pos;
352                last = strm.next_in[-1] & pos;
353                if (last && clr)
354                    strm.next_in[-1] &= ~pos;
355            }
356            else {
357                /* next last-block bit is in next unused byte */
358                if (strm.avail_in == 0) {
359                    /* don't have that byte yet -- get it */
360                    fwrite(start, 1, strm.next_in - start, out);
361                    start = in->buf;
362                    in->left = 0;
363                    zpull(&strm, in);
364                }
365                last = strm.next_in[0] & 1;
366                if (last && clr)
367                    strm.next_in[0] &= ~1;
368            }
369        }
370    }
371
372    /* update buffer with unused input */
373    in->left = strm.avail_in;
374    in->next = strm.next_in;
375
376    /* copy used input, write empty blocks to get to byte boundary */
377    pos = strm.data_type & 7;
378    fwrite(start, 1, in->next - start - 1, out);
379    last = in->next[-1];
380    if (pos == 0 || !clr)
381        /* already at byte boundary, or last file: write last byte */
382        putc(last, out);
383    else {
384        /* append empty blocks to last byte */
385        last &= ((0x100 >> pos) - 1);       /* assure unused bits are zero */
386        if (pos & 1) {
387            /* odd -- append an empty stored block */
388            putc(last, out);
389            if (pos == 1)
390                putc(0, out);               /* two more bits in block header */
391            fwrite("\0\0\xff\xff", 1, 4, out);
392        }
393        else {
394            /* even -- append 1, 2, or 3 empty fixed blocks */
395            switch (pos) {
396            case 6:
397                putc(last | 8, out);
398                last = 0;
399            case 4:
400                putc(last | 0x20, out);
401                last = 0;
402            case 2:
403                putc(last | 0x80, out);
404                putc(0, out);
405            }
406        }
407    }
408
409    /* update crc and tot */
410    *crc = crc32_combine(*crc, bget4(in), len);
411    *tot += (unsigned long)len;
412
413    /* clean up */
414    inflateEnd(&strm);
415    free(junk);
416    bclose(in);
417
418    /* write trailer if this is the last gzip file */
419    if (!clr) {
420        put4(*crc, out);
421        put4(*tot, out);
422    }
423}
424
/* Join the gzip files named on the command line and write the resulting
   single gzip file to stdout.  With no arguments, a usage message is written
   to stderr and the exit status is 0.  Any error, including a failure to
   write the output, is reported through bail(), which exits with status 1. */
int main(int argc, char **argv)
{
    unsigned long crc, tot;     /* running crc and total uncompressed length */

    /* skip command name */
    argc--;
    argv++;

    /* show usage if no arguments */
    if (argc == 0) {
        fputs("gzjoin usage: gzjoin f1.gz [f2.gz [f3.gz ...]] > fjoin.gz\n",
              stderr);
        return 0;
    }

    /* join gzip files on command line and write to stdout -- argc has
       already been decremented when it is passed as clr, so clr is the number
       of files still to come and is zero only for the last file */
    gzinit(&crc, &tot, stdout);
    while (argc--)
        gzcopy(*argv++, argc, &crc, &tot, stdout);

    /* output is buffered by stdio, so a write failure (full disk, closed
       pipe) may only show up here -- do not report success for a truncated
       file */
    if (fflush(stdout) != 0 || ferror(stdout))
        bail("write error on ", "stdout");

    /* done */
    return 0;
}
449