1/* gzappend -- command to append to a gzip file
2
3  Copyright (C) 2003 Mark Adler, all rights reserved
4  version 1.1, 4 Nov 2003
5
6  This software is provided 'as-is', without any express or implied
7  warranty.  In no event will the author be held liable for any damages
8  arising from the use of this software.
9
10  Permission is granted to anyone to use this software for any purpose,
11  including commercial applications, and to alter it and redistribute it
12  freely, subject to the following restrictions:
13
14  1. The origin of this software must not be misrepresented; you must not
15     claim that you wrote the original software. If you use this software
16     in a product, an acknowledgment in the product documentation would be
17     appreciated but is not required.
18  2. Altered source versions must be plainly marked as such, and must not be
19     misrepresented as being the original software.
20  3. This notice may not be removed or altered from any source distribution.
21
22  Mark Adler    madler@alumni.caltech.edu
23 */
24
25/*
26 * Change history:
27 *
28 * 1.0  19 Oct 2003     - First version
29 * 1.1   4 Nov 2003     - Expand and clarify some comments and notes
30 *                      - Add version and copyright to help
31 *                      - Send help to stdout instead of stderr
32 *                      - Add some preemptive typecasts
33 *                      - Add L to constants in lseek() calls
34 *                      - Remove some debugging information in error messages
35 *                      - Use new data_type definition for zlib 1.2.1
 *                      - Simplify and unify file operations
37 *                      - Finish off gzip file in gztack()
38 *                      - Use deflatePrime() instead of adding empty blocks
39 *                      - Keep gzip file clean on appended file read errors
40 *                      - Use in-place rotate instead of auxiliary buffer
41 *                        (Why you ask?  Because it was fun to write!)
42 */
43
44/*
45   gzappend takes a gzip file and appends to it, compressing files from the
46   command line or data from stdin.  The gzip file is written to directly, to
47   avoid copying that file, in case it's large.  Note that this results in the
48   unfriendly behavior that if gzappend fails, the gzip file is corrupted.
49
50   This program was written to illustrate the use of the new Z_BLOCK option of
51   zlib 1.2.x's inflate() function.  This option returns from inflate() at each
52   block boundary to facilitate locating and modifying the last block bit at
53   the start of the final deflate block.  Also whether using Z_BLOCK or not,
54   another required feature of zlib 1.2.x is that inflate() now provides the
   number of unused bits in the last input byte used.  gzappend will not work
56   with versions of zlib earlier than 1.2.1.
57
58   gzappend first decompresses the gzip file internally, discarding all but
59   the last 32K of uncompressed data, and noting the location of the last block
60   bit and the number of unused bits in the last byte of the compressed data.
61   The gzip trailer containing the CRC-32 and length of the uncompressed data
62   is verified.  This trailer will be later overwritten.
63
64   Then the last block bit is cleared by seeking back in the file and rewriting
65   the byte that contains it.  Seeking forward, the last byte of the compressed
66   data is saved along with the number of unused bits to initialize deflate.
67
68   A deflate process is initialized, using the last 32K of the uncompressed
69   data from the gzip file to initialize the dictionary.  If the total
70   uncompressed data was less than 32K, then all of it is used to initialize
71   the dictionary.  The deflate output bit buffer is also initialized with the
72   last bits from the original deflate stream.  From here on, the data to
73   append is simply compressed using deflate, and written to the gzip file.
74   When that is complete, the new CRC-32 and uncompressed length are written
75   as the trailer of the gzip file.
76 */
77
78#include <stdio.h>
79#include <stdlib.h>
80#include <string.h>
81#include <fcntl.h>
82#include <unistd.h>
83#include "zlib.h"
84
/* marks functions that are private to this file */
#define local static

/* base-2 logarithm of the I/O buffer size, and that size in bytes */
#define LGCHUNK 14
#define CHUNK (1U << LGCHUNK)

/* size in bytes of the sliding window that is kept while decompressing and
   then handed to deflate as its dictionary (32K) */
#define DSIZE 32768U
89
/* print an error message on stderr and terminate with extreme prejudice --
   msg1 and msg2 are written back to back after the "gzappend error: "
   prefix, so any separating space must be part of one of them; neither
   string is modified; this function does not return (exit status is 1) */
local void bye(const char *msg1, const char *msg2)
{
    fprintf(stderr, "gzappend error: %s%s\n", msg1, msg2);
    exit(1);
}
96
/* return the greatest common divisor of a and b using Euclid's algorithm,
   modified to be fast when one argument is much greater than the other, and
   coded to avoid unnecessary swapping -- if exactly one argument is zero the
   other one is returned, and gcd(0, 0) is 0 */
local unsigned gcd(unsigned a, unsigned b)
{
    unsigned c;

    while (a && b) {
        if (a > b) {
            /* take away the largest b * 2^k that does not exceed a */
            c = b;
            while (a - c >= c)
                c <<= 1;
            a -= c;
        }
        else {
            /* same thing with the roles of a and b exchanged */
            c = a;
            while (b - c >= c)
                c <<= 1;
            b -= c;
        }
    }
    return a + b;               /* one of the two is zero here */
}

/* rotate list[0..len-1] left by rot positions, in place -- afterwards
   list[i] holds what was in list[(i + rot) % len]; rot may be any value (it
   is reduced modulo len), and a len of 0 or 1 leaves the list untouched */
local void rotate(unsigned char *list, unsigned len, unsigned rot)
{
    unsigned char tmp;
    unsigned cycles;
    unsigned start, to, from;   /* indices into list */

    /* normalize rot and handle degenerate cases */
    if (len < 2) return;
    if (rot >= len) rot %= len;
    if (rot == 0) return;

    /* do simple left shift by one -- source and destination overlap, so
       memmove() is required here (memcpy() would be undefined behavior) */
    if (rot == 1) {
        tmp = list[0];
        memmove(list, list + 1, len - 1);
        list[len - 1] = tmp;
        return;
    }

    /* do simple right shift by one */
    if (rot == len - 1) {
        tmp = list[len - 1];
        memmove(list + 1, list, len - 1);
        list[0] = tmp;
        return;
    }

    /* otherwise do rotate as a set of cycles in place -- indices are used
       instead of pointers so that nothing is ever formed past the end of
       the list, and the wrap is computed without risk of overflow */
    cycles = gcd(len, rot);             /* number of cycles */
    do {
        start = from = cycles;          /* start index is arbitrary */
        tmp = list[from];               /* save entry to be overwritten */
        for (;;) {
            to = from;                  /* next step in cycle */
            if (from >= len - rot)      /* go right rot positions, wrapping */
                from -= len - rot;
            else
                from += rot;
            if (from == start) break;   /* all but one shifted */
            list[to] = list[from];      /* shift left */
        }
        list[to] = tmp;                 /* complete the circle */
    } while (--cycles);
}
166
/* state of a buffered reader on the gzip file being scanned */
typedef struct {
    /* descriptor of the open file */
    int fd;
    /* base-2 logarithm of the buffer size: buf holds 1 << size bytes */
    int size;
    /* count of unread bytes available starting at next */
    unsigned left;
    /* start of the read buffer */
    unsigned char *buf;
    /* position in buf of the next unread byte */
    unsigned char *next;
    /* name of the file, used in error messages */
    char *name;
} file;
176
177/* reload buffer */
178local int readin(file *in)
179{
180    int len;
181
182    len = read(in->fd, in->buf, 1 << in->size);
183    if (len == -1) bye("error reading ", in->name);
184    in->left = (unsigned)len;
185    in->next = in->buf;
186    return len;
187}
188
189/* read from file in, exit if end-of-file */
190local int readmore(file *in)
191{
192    if (readin(in) == 0) bye("unexpected end of ", in->name);
193    return 0;
194}
195
/* yield the next byte of in (an unsigned char), refilling the buffer first
   if it is empty -- the argument is expanded more than once, so it must be
   an expression without side effects */
#define read1(in) ((in)->left == 0 ? readmore(in) : 0, \
                   (in)->left--, *((in)->next)++)
198
199/* skip over n bytes of in */
200local void skip(file *in, unsigned n)
201{
202    unsigned bypass;
203
204    if (n > in->left) {
205        n -= in->left;
206        bypass = n & ~((1U << in->size) - 1);
207        if (bypass) {
208            if (lseek(in->fd, (off_t)bypass, SEEK_CUR) == -1)
209                bye("seeking ", in->name);
210            n -= bypass;
211        }
212        readmore(in);
213        if (n > in->left)
214            bye("unexpected end of ", in->name);
215    }
216    in->left -= n;
217    in->next += n;
218}
219
220/* read a four-byte unsigned integer, little-endian, from in */
221unsigned long read4(file *in)
222{
223    unsigned long val;
224
225    val = read1(in);
226    val += (unsigned)read1(in) << 8;
227    val += (unsigned long)read1(in) << 16;
228    val += (unsigned long)read1(in) << 24;
229    return val;
230}
231
232/* skip over gzip header */
233local void gzheader(file *in)
234{
235    int flags;
236    unsigned n;
237
238    if (read1(in) != 31 || read1(in) != 139) bye(in->name, " not a gzip file");
239    if (read1(in) != 8) bye("unknown compression method in", in->name);
240    flags = read1(in);
241    if (flags & 0xe0) bye("unknown header flags set in", in->name);
242    skip(in, 6);
243    if (flags & 4) {
244        n = read1(in);
245        n += (unsigned)(read1(in)) << 8;
246        skip(in, n);
247    }
248    if (flags & 8) while (read1(in) != 0) ;
249    if (flags & 16) while (read1(in) != 0) ;
250    if (flags & 2) skip(in, 2);
251}
252
253/* decompress gzip file "name", return strm with a deflate stream ready to
254   continue compression of the data in the gzip file, and return a file
255   descriptor pointing to where to write the compressed data -- the deflate
256   stream is initialized to compress using level "level" */
257local int gzscan(char *name, z_stream *strm, int level)
258{
259    int ret, lastbit, left, full;
260    unsigned have;
261    unsigned long crc, tot;
262    unsigned char *window;
263    off_t lastoff, end;
264    file gz;
265
266    /* open gzip file */
267    gz.name = name;
268    gz.fd = open(name, O_RDWR, 0);
269    if (gz.fd == -1) bye("cannot open ", name);
270    gz.buf = malloc(CHUNK);
271    if (gz.buf == NULL) bye("out of memory", "");
272    gz.size = LGCHUNK;
273    gz.left = 0;
274
275    /* skip gzip header */
276    gzheader(&gz);
277
278    /* prepare to decompress */
279    window = malloc(DSIZE);
280    if (window == NULL) bye("out of memory", "");
281    strm->zalloc = Z_NULL;
282    strm->zfree = Z_NULL;
283    strm->opaque = Z_NULL;
284    ret = inflateInit2(strm, -15);
285    if (ret != Z_OK) bye("out of memory", " or library mismatch");
286
287    /* decompress the deflate stream, saving append information */
288    lastbit = 0;
289    lastoff = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
290    left = 0;
291    strm->avail_in = gz.left;
292    strm->next_in = gz.next;
293    crc = crc32(0L, Z_NULL, 0);
294    have = full = 0;
295    do {
296        /* if needed, get more input */
297        if (strm->avail_in == 0) {
298            readmore(&gz);
299            strm->avail_in = gz.left;
300            strm->next_in = gz.next;
301        }
302
303        /* set up output to next available section of sliding window */
304        strm->avail_out = DSIZE - have;
305        strm->next_out = window + have;
306
307        /* inflate and check for errors */
308        ret = inflate(strm, Z_BLOCK);
309        if (ret == Z_STREAM_ERROR) bye("internal stream error!", "");
310        if (ret == Z_MEM_ERROR) bye("out of memory", "");
311        if (ret == Z_DATA_ERROR)
312            bye("invalid compressed data--format violated in", name);
313
314        /* update crc and sliding window pointer */
315        crc = crc32(crc, window + have, DSIZE - have - strm->avail_out);
316        if (strm->avail_out)
317            have = DSIZE - strm->avail_out;
318        else {
319            have = 0;
320            full = 1;
321        }
322
323        /* process end of block */
324        if (strm->data_type & 128) {
325            if (strm->data_type & 64)
326                left = strm->data_type & 0x1f;
327            else {
328                lastbit = strm->data_type & 0x1f;
329                lastoff = lseek(gz.fd, 0L, SEEK_CUR) - strm->avail_in;
330            }
331        }
332    } while (ret != Z_STREAM_END);
333    inflateEnd(strm);
334    gz.left = strm->avail_in;
335    gz.next = strm->next_in;
336
337    /* save the location of the end of the compressed data */
338    end = lseek(gz.fd, 0L, SEEK_CUR) - gz.left;
339
340    /* check gzip trailer and save total for deflate */
341    if (crc != read4(&gz))
342        bye("invalid compressed data--crc mismatch in ", name);
343    tot = strm->total_out;
344    if ((tot & 0xffffffffUL) != read4(&gz))
345        bye("invalid compressed data--length mismatch in", name);
346
347    /* if not at end of file, warn */
348    if (gz.left || readin(&gz))
349        fprintf(stderr,
350            "gzappend warning: junk at end of gzip file overwritten\n");
351
352    /* clear last block bit */
353    lseek(gz.fd, lastoff - (lastbit != 0), SEEK_SET);
354    if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
355    *gz.buf = (unsigned char)(*gz.buf ^ (1 << ((8 - lastbit) & 7)));
356    lseek(gz.fd, -1L, SEEK_CUR);
357    if (write(gz.fd, gz.buf, 1) != 1) bye("writing after seek to ", name);
358
359    /* if window wrapped, build dictionary from window by rotating */
360    if (full) {
361        rotate(window, DSIZE, have);
362        have = DSIZE;
363    }
364
365    /* set up deflate stream with window, crc, total_in, and leftover bits */
366    ret = deflateInit2(strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY);
367    if (ret != Z_OK) bye("out of memory", "");
368    deflateSetDictionary(strm, window, have);
369    strm->adler = crc;
370    strm->total_in = tot;
371    if (left) {
372        lseek(gz.fd, --end, SEEK_SET);
373        if (read(gz.fd, gz.buf, 1) != 1) bye("reading after seek on ", name);
374        deflatePrime(strm, 8 - left, *gz.buf);
375    }
376    lseek(gz.fd, end, SEEK_SET);
377
378    /* clean up and return */
379    free(window);
380    free(gz.buf);
381    return gz.fd;
382}
383
384/* append file "name" to gzip file gd using deflate stream strm -- if last
385   is true, then finish off the deflate stream at the end */
386local void gztack(char *name, int gd, z_stream *strm, int last)
387{
388    int fd, len, ret;
389    unsigned left;
390    unsigned char *in, *out;
391
392    /* open file to compress and append */
393    fd = 0;
394    if (name != NULL) {
395        fd = open(name, O_RDONLY, 0);
396        if (fd == -1)
397            fprintf(stderr, "gzappend warning: %s not found, skipping ...\n",
398                    name);
399    }
400
401    /* allocate buffers */
402    in = fd == -1 ? NULL : malloc(CHUNK);
403    out = malloc(CHUNK);
404    if (out == NULL) bye("out of memory", "");
405
406    /* compress input file and append to gzip file */
407    do {
408        /* get more input */
409        len = fd == -1 ? 0 : read(fd, in, CHUNK);
410        if (len == -1) {
411            fprintf(stderr,
412                    "gzappend warning: error reading %s, skipping rest ...\n",
413                    name);
414            len = 0;
415        }
416        strm->avail_in = (unsigned)len;
417        strm->next_in = in;
418        if (len) strm->adler = crc32(strm->adler, in, (unsigned)len);
419
420        /* compress and write all available output */
421        do {
422            strm->avail_out = CHUNK;
423            strm->next_out = out;
424            ret = deflate(strm, last && len == 0 ? Z_FINISH : Z_NO_FLUSH);
425            left = CHUNK - strm->avail_out;
426            while (left) {
427                len = write(gd, out + CHUNK - strm->avail_out - left, left);
428                if (len == -1) bye("writing gzip file", "");
429                left -= (unsigned)len;
430            }
431        } while (strm->avail_out == 0 && ret != Z_STREAM_END);
432    } while (len != 0);
433
434    /* write trailer after last entry */
435    if (last) {
436        deflateEnd(strm);
437        out[0] = (unsigned char)(strm->adler);
438        out[1] = (unsigned char)(strm->adler >> 8);
439        out[2] = (unsigned char)(strm->adler >> 16);
440        out[3] = (unsigned char)(strm->adler >> 24);
441        out[4] = (unsigned char)(strm->total_in);
442        out[5] = (unsigned char)(strm->total_in >> 8);
443        out[6] = (unsigned char)(strm->total_in >> 16);
444        out[7] = (unsigned char)(strm->total_in >> 24);
445        len = 8;
446        do {
447            ret = write(gd, out + 8 - len, len);
448            if (ret == -1) bye("writing gzip file", "");
449            len -= ret;
450        } while (len);
451        close(gd);
452    }
453
454    /* clean up and return */
455    free(out);
456    if (in != NULL) free(in);
457    if (fd > 0) close(fd);
458}
459
460/* process the compression level option if present, scan the gzip file, and
461   append the specified files, or append the data from stdin if no other file
462   names are provided on the command line -- the gzip file must be writable
463   and seekable */
464int main(int argc, char **argv)
465{
466    int gd, level;
467    z_stream strm;
468
469    /* ignore command name */
470    argv++;
471
472    /* provide usage if no arguments */
473    if (*argv == NULL) {
474        printf("gzappend 1.1 (4 Nov 2003) Copyright (C) 2003 Mark Adler\n");
475        printf(
476            "usage: gzappend [-level] file.gz [ addthis [ andthis ... ]]\n");
477        return 0;
478    }
479
480    /* set compression level */
481    level = Z_DEFAULT_COMPRESSION;
482    if (argv[0][0] == '-') {
483        if (argv[0][1] < '0' || argv[0][1] > '9' || argv[0][2] != 0)
484            bye("invalid compression level", "");
485        level = argv[0][1] - '0';
486        if (*++argv == NULL) bye("no gzip file name after options", "");
487    }
488
489    /* prepare to append to gzip file */
490    gd = gzscan(*argv++, &strm, level);
491
492    /* append files on command line, or from stdin if none */
493    if (*argv == NULL)
494        gztack(NULL, gd, &strm, 1);
495    else
496        do {
497            gztack(*argv, gd, &strm, argv[1] == NULL);
498        } while (*++argv != NULL);
499    return 0;
500}
501