archive_write_set_format_warc.c revision 358090
1/*-
2 * Copyright (c) 2014 Sebastian Freundt
3 * Author: Sebastian Freundt  <devel@fresse.org>
4 *
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
17 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
18 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
19 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
20 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
21 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
22 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
23 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28#include "archive_platform.h"
29__FBSDID("$FreeBSD: stable/10/contrib/libarchive/libarchive/archive_write_set_format_warc.c 358090 2020-02-19 01:51:44Z mm $");
30
31#ifdef HAVE_ERRNO_H
32#include <errno.h>
33#endif
34#include <stdio.h>
35#ifdef HAVE_STDLIB_H
36#include <stdlib.h>
37#endif
38#ifdef HAVE_STRING_H
39#include <string.h>
40#endif
41#ifdef HAVE_TIME_H
42#include <time.h>
43#endif
44
45#include "archive.h"
46#include "archive_entry.h"
47#include "archive_entry_locale.h"
48#include "archive_private.h"
49#include "archive_random_private.h"
50#include "archive_write_private.h"
51#include "archive_write_set_format_private.h"
52
/*
 * Writer state for one archive; stored in archive_write.format_data.
 */
struct warc_s {
	/* set once the leading warcinfo record was written or is unwanted */
	unsigned int omit_warcinfo:1;

	/* wall-clock time captured when the format was set up */
	time_t now;
	/* file type of the entry in flight, 0 while no entry is open */
	mode_t typ;
	/* seeded from `now' during set-up */
	unsigned int rng;
	/* populated size: payload byte budget of the current entry */
	uint64_t populz;
};
62
63static const char warcinfo[] =
64    "software: libarchive/" ARCHIVE_VERSION_ONLY_STRING "\r\n"
65    "format: WARC file version 1.0\r\n";
66
/*
 * WARC record types.  Only the types up to and including WT_RSRC are
 * accepted by the header generator in this file; the remaining ones are
 * listed for completeness.
 */
typedef enum {
	WT_NONE = 0,
	WT_INFO,	/* warcinfo */
	WT_META,	/* metadata */
	WT_RSRC,	/* resource */
	WT_REQ,		/* request, unsupported */
	WT_RSP,		/* response, unsupported */
	WT_RVIS,	/* revisit, unsupported */
	WT_CONV,	/* conversion, unsupported */
	WT_CONT,	/* continuation, unsupported at the moment */
	LAST_WT		/* invalid type, doubles as the number of types */
} warc_type_t;
88
89typedef struct {
90	warc_type_t type;
91	const char *tgturi;
92	const char *recid;
93	time_t rtime;
94	time_t mtime;
95	const char *cnttyp;
96	uint64_t cntlen;
97} warc_essential_hdr_t;
98
/* Raw UUID material, kept as four unsigned words. */
typedef struct {
	unsigned int u[4];
} warc_uuid_t;
102
103static int _warc_options(struct archive_write*, const char *key, const char *v);
104static int _warc_header(struct archive_write *a, struct archive_entry *entry);
105static ssize_t _warc_data(struct archive_write *a, const void *buf, size_t sz);
106static int _warc_finish_entry(struct archive_write *a);
107static int _warc_close(struct archive_write *a);
108static int _warc_free(struct archive_write *a);
109
110/* private routines */
111static ssize_t _popul_ehdr(struct archive_string *t, size_t z, warc_essential_hdr_t);
112static int _gen_uuid(warc_uuid_t *tgt);
113
114
115/*
116 * Set output format to ISO 28500 (aka WARC) format.
117 */
118int
119archive_write_set_format_warc(struct archive *_a)
120{
121	struct archive_write *a = (struct archive_write *)_a;
122	struct warc_s *w;
123
124	archive_check_magic(_a, ARCHIVE_WRITE_MAGIC,
125	    ARCHIVE_STATE_NEW, "archive_write_set_format_warc");
126
127	/* If another format was already registered, unregister it. */
128	if (a->format_free != NULL) {
129		(a->format_free)(a);
130	}
131
132	w = malloc(sizeof(*w));
133	if (w == NULL) {
134		archive_set_error(&a->archive, ENOMEM,
135		    "Can't allocate warc data");
136		return (ARCHIVE_FATAL);
137	}
138	/* by default we're emitting a file wide header */
139	w->omit_warcinfo = 0U;
140	/* obtain current time for date fields */
141	w->now = time(NULL);
142	/* reset file type info */
143	w->typ = 0;
144	/* also initialise our rng */
145	w->rng = (unsigned int)w->now;
146
147	a->format_data = w;
148	a->format_name = "WARC/1.0";
149	a->format_options = _warc_options;
150	a->format_write_header = _warc_header;
151	a->format_write_data = _warc_data;
152	a->format_close = _warc_close;
153	a->format_free = _warc_free;
154	a->format_finish_entry = _warc_finish_entry;
155	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
156	a->archive.archive_format_name = "WARC/1.0";
157	return (ARCHIVE_OK);
158}
159
160
161/* archive methods */
162static int
163_warc_options(struct archive_write *a, const char *key, const char *val)
164{
165	struct warc_s *w = a->format_data;
166
167	if (strcmp(key, "omit-warcinfo") == 0) {
168		if (val == NULL || strcmp(val, "true") == 0) {
169			/* great */
170			w->omit_warcinfo = 1U;
171			return (ARCHIVE_OK);
172		}
173	}
174
175	/* Note: The "warn" return is just to inform the options
176	 * supervisor that we didn't handle it.  It will generate
177	 * a suitable error if no one used this option. */
178	return (ARCHIVE_WARN);
179}
180
181static int
182_warc_header(struct archive_write *a, struct archive_entry *entry)
183{
184	struct warc_s *w = a->format_data;
185	struct archive_string hdr;
186#define MAX_HDR_SIZE 512
187
188	/* check whether warcinfo record needs outputting */
189	if (!w->omit_warcinfo) {
190		ssize_t r;
191		warc_essential_hdr_t wi = {
192			WT_INFO,
193			/*uri*/NULL,
194			/*urn*/NULL,
195			/*rtm*/0,
196			/*mtm*/0,
197			/*cty*/"application/warc-fields",
198			/*len*/sizeof(warcinfo) - 1U,
199		};
200		wi.rtime = w->now;
201		wi.mtime = w->now;
202
203		archive_string_init(&hdr);
204		r = _popul_ehdr(&hdr, MAX_HDR_SIZE, wi);
205		if (r >= 0) {
206			/* jackpot! */
207			/* now also use HDR buffer for the actual warcinfo */
208			archive_strncat(&hdr, warcinfo, sizeof(warcinfo) -1);
209
210			/* append end-of-record indicator */
211			archive_strncat(&hdr, "\r\n\r\n", 4);
212
213			/* write to output stream */
214			__archive_write_output(a, hdr.s, archive_strlen(&hdr));
215		}
216		/* indicate we're done with file header writing */
217		w->omit_warcinfo = 1U;
218		archive_string_free(&hdr);
219	}
220
221	if (archive_entry_pathname(entry) == NULL) {
222		archive_set_error(&a->archive, EINVAL,
223		    "Invalid filename");
224		return (ARCHIVE_WARN);
225	}
226
227	w->typ = archive_entry_filetype(entry);
228	w->populz = 0U;
229	if (w->typ == AE_IFREG) {
230		warc_essential_hdr_t rh = {
231			WT_RSRC,
232			/*uri*/NULL,
233			/*urn*/NULL,
234			/*rtm*/0,
235			/*mtm*/0,
236			/*cty*/NULL,
237			/*len*/0,
238		};
239		ssize_t r;
240		rh.tgturi = archive_entry_pathname(entry);
241		rh.rtime = w->now;
242		rh.mtime = archive_entry_mtime(entry);
243		rh.cntlen = (size_t)archive_entry_size(entry);
244
245		archive_string_init(&hdr);
246		r = _popul_ehdr(&hdr, MAX_HDR_SIZE, rh);
247		if (r < 0) {
248			/* don't bother */
249			archive_set_error(
250				&a->archive,
251				ARCHIVE_ERRNO_FILE_FORMAT,
252				"cannot archive file");
253			return (ARCHIVE_WARN);
254		}
255		/* otherwise append to output stream */
256		__archive_write_output(a, hdr.s, r);
257		/* and let subsequent calls to _data() know about the size */
258		w->populz = rh.cntlen;
259		archive_string_free(&hdr);
260		return (ARCHIVE_OK);
261	}
262	/* just resort to erroring as per Tim's advice */
263	__archive_write_entry_filetype_unsupported(
264	    &a->archive, entry, "WARC");
265	return (ARCHIVE_FAILED);
266}
267
268static ssize_t
269_warc_data(struct archive_write *a, const void *buf, size_t len)
270{
271	struct warc_s *w = a->format_data;
272
273	if (w->typ == AE_IFREG) {
274		int rc;
275
276		/* never write more bytes than announced */
277		if (len > w->populz) {
278			len = (size_t)w->populz;
279		}
280
281		/* now then, out we put the whole shebang */
282		rc = __archive_write_output(a, buf, len);
283		if (rc != ARCHIVE_OK) {
284			return rc;
285		}
286	}
287	return len;
288}
289
290static int
291_warc_finish_entry(struct archive_write *a)
292{
293	static const char _eor[] = "\r\n\r\n";
294	struct warc_s *w = a->format_data;
295
296	if (w->typ == AE_IFREG) {
297		int rc = __archive_write_output(a, _eor, sizeof(_eor) - 1U);
298
299		if (rc != ARCHIVE_OK) {
300			return rc;
301		}
302	}
303	/* reset type info */
304	w->typ = 0;
305	return (ARCHIVE_OK);
306}
307
308static int
309_warc_close(struct archive_write *a)
310{
311	(void)a; /* UNUSED */
312	return (ARCHIVE_OK);
313}
314
315static int
316_warc_free(struct archive_write *a)
317{
318	struct warc_s *w = a->format_data;
319
320	free(w);
321	a->format_data = NULL;
322	return (ARCHIVE_OK);
323}
324
325
326/* private routines */
/*
 * Like strftime(3) but for time_t objects: T is broken down as UTC,
 * formatted according to FMT and the result appended to AS.
 *
 * Nothing is appended if T cannot be converted to calendar time, whichever
 * of the conversion routines is in use.
 */
static void
xstrftime(struct archive_string *as, const char *fmt, time_t t)
{
	struct tm *rt;
#if defined(HAVE_GMTIME_R) || defined(HAVE__GMTIME64_S)
	struct tm timeHere;
#endif
#if defined(HAVE__GMTIME64_S)
	errno_t terr;
	__time64_t tmptime;
#endif
	char strtime[100];
	size_t len;

#ifdef HAVE_GMTIME_R
	rt = gmtime_r(&t, &timeHere);
#elif defined(HAVE__GMTIME64_S)
	tmptime = t;
	terr = _gmtime64_s(&timeHere, &tmptime);
	if (terr)
		rt = NULL;
	else
		rt = &timeHere;
#else
	rt = gmtime(&t);
#endif
	/* one check for all variants; strftime() must not see NULL */
	if (rt == NULL)
		return;

	/* leave the hard yacker to our role model strftime() */
	len = strftime(strtime, sizeof(strtime)-1, fmt, rt);
	archive_strncat(as, strtime, len);
}
360
361static ssize_t
362_popul_ehdr(struct archive_string *tgt, size_t tsz, warc_essential_hdr_t hdr)
363{
364	static const char _ver[] = "WARC/1.0\r\n";
365	static const char * const _typ[LAST_WT] = {
366		NULL, "warcinfo", "metadata", "resource", NULL
367	};
368	char std_uuid[48U];
369
370	if (hdr.type == WT_NONE || hdr.type > WT_RSRC) {
371		/* brilliant, how exactly did we get here? */
372		return -1;
373	}
374
375	archive_strcpy(tgt, _ver);
376
377	archive_string_sprintf(tgt, "WARC-Type: %s\r\n", _typ[hdr.type]);
378
379	if (hdr.tgturi != NULL) {
380		/* check if there's a xyz:// */
381		static const char _uri[] = "";
382		static const char _fil[] = "file://";
383		const char *u;
384		char *chk = strchr(hdr.tgturi, ':');
385
386		if (chk != NULL && chk[1U] == '/' && chk[2U] == '/') {
387			/* yep, it's definitely a URI */
388			u = _uri;
389		} else {
390			/* hm, best to prepend file:// then */
391			u = _fil;
392		}
393		archive_string_sprintf(tgt,
394			"WARC-Target-URI: %s%s\r\n", u, hdr.tgturi);
395	}
396
397	/* record time is usually when the http is sent off,
398	 * just treat the archive writing as such for a moment */
399	xstrftime(tgt, "WARC-Date: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.rtime);
400
401	/* while we're at it, record the mtime */
402	xstrftime(tgt, "Last-Modified: %Y-%m-%dT%H:%M:%SZ\r\n", hdr.mtime);
403
404	if (hdr.recid == NULL) {
405		/* generate one, grrrr */
406		warc_uuid_t u;
407
408		_gen_uuid(&u);
409		/* Unfortunately, archive_string_sprintf does not
410		 * handle the minimum number following '%'.
411		 * So we have to use snprintf function here instead
412		 * of archive_string_snprintf function. */
413#if defined(_WIN32) && !defined(__CYGWIN__) && !( defined(_MSC_VER) && _MSC_VER >= 1900)
414#define snprintf _snprintf
415#endif
416		snprintf(
417			std_uuid, sizeof(std_uuid),
418			"<urn:uuid:%08x-%04x-%04x-%04x-%04x%08x>",
419			u.u[0U],
420			u.u[1U] >> 16U, u.u[1U] & 0xffffU,
421			u.u[2U] >> 16U, u.u[2U] & 0xffffU,
422			u.u[3U]);
423		hdr.recid = std_uuid;
424	}
425
426	/* record-id is mandatory, fingers crossed we won't fail */
427	archive_string_sprintf(tgt, "WARC-Record-ID: %s\r\n", hdr.recid);
428
429	if (hdr.cnttyp != NULL) {
430		archive_string_sprintf(tgt, "Content-Type: %s\r\n", hdr.cnttyp);
431	}
432
433	/* next one is mandatory */
434	archive_string_sprintf(tgt, "Content-Length: %ju\r\n", (uintmax_t)hdr.cntlen);
435	/**/
436	archive_strncat(tgt, "\r\n", 2);
437
438	return (archive_strlen(tgt) >= tsz)? -1: (ssize_t)archive_strlen(tgt);
439}
440
441static int
442_gen_uuid(warc_uuid_t *tgt)
443{
444	archive_random(tgt->u, sizeof(tgt->u));
445	/* obey uuid version 4 rules */
446	tgt->u[1U] &= 0xffff0fffU;
447	tgt->u[1U] |= 0x4000U;
448	tgt->u[2U] &= 0x3fffffffU;
449	tgt->u[2U] |= 0x80000000U;
450	return 0;
451}
452
453/* archive_write_set_format_warc.c ends here */
454