1299425Smm/*-
2299425Smm * Copyright (c) 2014 Sebastian Freundt
3299425Smm * All rights reserved.
4299425Smm *
5299425Smm * Redistribution and use in source and binary forms, with or without
6299425Smm * modification, are permitted provided that the following conditions
7299425Smm * are met:
8299425Smm * 1. Redistributions of source code must retain the above copyright
9299425Smm *    notice, this list of conditions and the following disclaimer.
10299425Smm * 2. Redistributions in binary form must reproduce the above copyright
11299425Smm *    notice, this list of conditions and the following disclaimer in the
12299425Smm *    documentation and/or other materials provided with the distribution.
13299425Smm *
14299425Smm * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15299425Smm * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16299425Smm * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17299425Smm * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18299425Smm * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19299425Smm * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20299425Smm * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21299425Smm * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22299425Smm * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23299425Smm * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24299425Smm */
25299425Smm
26299425Smm#include "archive_platform.h"
27299425Smm__FBSDID("$FreeBSD: stable/10/contrib/libarchive/libarchive/archive_read_support_format_warc.c 368708 2020-12-16 22:25:40Z mm $");
28299425Smm
29299425Smm/**
30299425Smm * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
31299425Smm * ISO 28500:2009.
32299425Smm * For the purposes of this file we used the final draft from:
33299425Smm * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
34299425Smm *
35299425Smm * Todo:
36299425Smm * [ ] real-world warcs can contain resources at endpoints ending in /
37299425Smm *     e.g. http://bibnum.bnf.fr/warc/
38299425Smm *     if you're lucky their response contains a Content-Location: header
39299425Smm *     pointing to a unix-compliant filename, in the example above it's
40299425Smm *     Content-Location: http://bibnum.bnf.fr/warc/index.html
41299425Smm *     however, that's not mandated and github for example doesn't follow
42299425Smm *     this convention.
43299425Smm *     We need a set of archive options to control what to do with
44299425Smm *     entries like these, at the moment care is taken to skip them.
45299425Smm *
46299425Smm **/
47299425Smm
48299425Smm#ifdef HAVE_SYS_STAT_H
49299425Smm#include <sys/stat.h>
50299425Smm#endif
51299425Smm#ifdef HAVE_ERRNO_H
52299425Smm#include <errno.h>
53299425Smm#endif
54299425Smm#ifdef HAVE_STDLIB_H
55299425Smm#include <stdlib.h>
56299425Smm#endif
57299425Smm#ifdef HAVE_STRING_H
58299425Smm#include <string.h>
59299425Smm#endif
60299425Smm#ifdef HAVE_LIMITS_H
61299425Smm#include <limits.h>
62299425Smm#endif
63299425Smm#ifdef HAVE_CTYPE_H
64299425Smm#include <ctype.h>
65299425Smm#endif
66299425Smm#ifdef HAVE_TIME_H
67299425Smm#include <time.h>
68299425Smm#endif
69299425Smm
70299425Smm#include "archive.h"
71299425Smm#include "archive_entry.h"
72299425Smm#include "archive_private.h"
73299425Smm#include "archive_read_private.h"
74299425Smm
/*
 * Record types a WARC record can announce in its WARC-Type header.
 * _warc_rdtyp() only ever reports WT_NONE, WT_RSRC or WT_RSP; records of
 * any other type are skipped by _warc_rdhdr().
 */
typedef enum {
	/* no WARC-Type header found, or a type we do not act upon */
	WT_NONE,
	/* warcinfo */
	WT_INFO,
	/* metadata */
	WT_META,
	/* resource, turned into a regular-file entry */
	WT_RSRC,
	/* request, unsupported */
	WT_REQ,
	/* response, treated exactly like a resource by _warc_rdhdr() */
	WT_RSP,
	/* revisit, unsupported */
	WT_RVIS,
	/* conversion, unsupported */
	WT_CONV,
	/* continuation, unsupported at the moment */
	WT_CONT,
	/* invalid type */
	LAST_WT
} warc_type_t;
96299425Smm
/*
 * Length-counted, read-only string.  STR need not be NUL-terminated:
 * _warc_rduri() returns one that points straight into the header buffer.
 * An empty result is represented as {0U, NULL}.
 */
typedef struct {
	/* number of bytes STR refers to */
	size_t len;
	/* first byte of the string, or NULL */
	const char *str;
} warc_string_t;
101299425Smm
/*
 * Growable, writable buffer; used as the string pool that keeps the
 * current entry's file name alive after the header has been consumed.
 */
typedef struct {
	/* size of the allocation behind STR in bytes, 0 if none yet */
	size_t len;
	/* heap block obtained through realloc(), or NULL */
	char *str;
} warc_strbuf_t;
106299425Smm
/* Per-reader state of the WARC format handler (a->format->data). */
struct warc_s {
	/* content length ahead, taken from the record's Content-Length */
	size_t cntlen;
	/* and how much we've processed so far, i.e. how many content
	 * bytes _warc_read() has handed out for the current record */
	size_t cntoff;
	/* and how much we need to consume between calls; set to the size
	 * of the block most recently returned by _warc_read() */
	size_t unconsumed;

	/* string pool, holds the NUL-terminated file name of the entry */
	warc_strbuf_t pool;
	/* previous version, encoded the way _warc_rdver() returns it;
	 * used to avoid re-formatting SVER for every record */
	unsigned int pver;
	/* stringified format name, "WARC/<major>.<minor>" */
	struct archive_string sver;
};
122299425Smm
/* callbacks registered with __archive_read_register_format() */
static int _warc_bid(struct archive_read *a, int);
static int _warc_cleanup(struct archive_read *a);
static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
static int _warc_skip(struct archive_read *a);
static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);

/* private routines */
/* header field readers: each takes the header buffer and its size.
 * NOTE: the `[10]' bound below is not enforced by the language; the
 * definition rejects any buffer shorter than 12 bytes. */
static unsigned int _warc_rdver(const char buf[10], size_t bsz);
static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
static warc_string_t _warc_rduri(const char *buf, size_t bsz);
static ssize_t _warc_rdlen(const char *buf, size_t bsz);
static time_t _warc_rdrtm(const char *buf, size_t bsz);
static time_t _warc_rdmtm(const char *buf, size_t bsz);
/* delimiter searches: end of header (past CRLF CRLF) and end of line
 * (at CRLF); both return NULL when the delimiter is not in the buffer */
static const char *_warc_find_eoh(const char *buf, size_t bsz);
static const char *_warc_find_eol(const char *buf, size_t bsz);
138299425Smm
139299425Smmint
140299425Smmarchive_read_support_format_warc(struct archive *_a)
141299425Smm{
142299425Smm	struct archive_read *a = (struct archive_read *)_a;
143299425Smm	struct warc_s *w;
144299425Smm	int r;
145299425Smm
146299425Smm	archive_check_magic(_a, ARCHIVE_READ_MAGIC,
147299425Smm	    ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
148299425Smm
149311042Smm	if ((w = calloc(1, sizeof(*w))) == NULL) {
150299425Smm		archive_set_error(&a->archive, ENOMEM,
151299425Smm		    "Can't allocate warc data");
152299425Smm		return (ARCHIVE_FATAL);
153299425Smm	}
154299425Smm
155299425Smm	r = __archive_read_register_format(
156299425Smm		a, w, "warc",
157299425Smm		_warc_bid, NULL, _warc_rdhdr, _warc_read,
158299425Smm		_warc_skip, NULL, _warc_cleanup, NULL, NULL);
159299425Smm
160299425Smm	if (r != ARCHIVE_OK) {
161299425Smm		free(w);
162299425Smm		return (r);
163299425Smm	}
164299425Smm	return (ARCHIVE_OK);
165299425Smm}
166299425Smm
167299425Smmstatic int
168299425Smm_warc_cleanup(struct archive_read *a)
169299425Smm{
170299425Smm	struct warc_s *w = a->format->data;
171299425Smm
172299425Smm	if (w->pool.len > 0U) {
173299425Smm		free(w->pool.str);
174299425Smm	}
175299425Smm	archive_string_free(&w->sver);
176299425Smm	free(w);
177299425Smm	a->format->data = NULL;
178299425Smm	return (ARCHIVE_OK);
179299425Smm}
180299425Smm
181299425Smmstatic int
182299425Smm_warc_bid(struct archive_read *a, int best_bid)
183299425Smm{
184299425Smm	const char *hdr;
185299425Smm	ssize_t nrd;
186299425Smm	unsigned int ver;
187299425Smm
188299425Smm	(void)best_bid; /* UNUSED */
189299425Smm
190299425Smm	/* check first line of file, it should be a record already */
191299425Smm	if ((hdr = __archive_read_ahead(a, 12U, &nrd)) == NULL) {
192299425Smm		/* no idea what to do */
193299425Smm		return -1;
194299425Smm	} else if (nrd < 12) {
195299425Smm		/* nah, not for us, our magic cookie is at least 12 bytes */
196299425Smm		return -1;
197299425Smm	}
198299425Smm
199299425Smm	/* otherwise snarf the record's version number */
200299425Smm	ver = _warc_rdver(hdr, nrd);
201313929Smm	if (ver < 1200U || ver > 10000U) {
202313929Smm		/* we only support WARC 0.12 to 1.0 */
203299425Smm		return -1;
204299425Smm	}
205299425Smm
206299425Smm	/* otherwise be confident */
207299425Smm	return (64);
208299425Smm}
209299425Smm
210299425Smmstatic int
211299425Smm_warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
212299425Smm{
213299425Smm#define HDR_PROBE_LEN		(12U)
214299425Smm	struct warc_s *w = a->format->data;
215299425Smm	unsigned int ver;
216299425Smm	const char *buf;
217299425Smm	ssize_t nrd;
218299425Smm	const char *eoh;
219299425Smm	/* for the file name, saves some strndup()'ing */
220299425Smm	warc_string_t fnam;
221299425Smm	/* warc record type, not that we really use it a lot */
222299425Smm	warc_type_t ftyp;
223299425Smm	/* content-length+error monad */
224299425Smm	ssize_t cntlen;
225299425Smm	/* record time is the WARC-Date time we reinterpret it as ctime */
226299425Smm	time_t rtime;
227299425Smm	/* mtime is the Last-Modified time which will be the entry's mtime */
228299425Smm	time_t mtime;
229299425Smm
230299425Smmstart_over:
231299425Smm	/* just use read_ahead() they keep track of unconsumed
232299425Smm	 * bits and bobs for us; no need to put an extra shift in
233299425Smm	 * and reproduce that functionality here */
234299425Smm	buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
235299425Smm
236299425Smm	if (nrd < 0) {
237299425Smm		/* no good */
238299425Smm		archive_set_error(
239299425Smm			&a->archive, ARCHIVE_ERRNO_MISC,
240299425Smm			"Bad record header");
241299425Smm		return (ARCHIVE_FATAL);
242299425Smm	} else if (buf == NULL) {
243299425Smm		/* there should be room for at least WARC/bla\r\n
244299425Smm		 * must be EOF therefore */
245299425Smm		return (ARCHIVE_EOF);
246299425Smm	}
247299425Smm 	/* looks good so far, try and find the end of the header now */
248299425Smm	eoh = _warc_find_eoh(buf, nrd);
249299425Smm	if (eoh == NULL) {
250299425Smm		/* still no good, the header end might be beyond the
251299425Smm		 * probe we've requested, but then again who'd cram
252299425Smm		 * so much stuff into the header *and* be 28500-compliant */
253299425Smm		archive_set_error(
254299425Smm			&a->archive, ARCHIVE_ERRNO_MISC,
255299425Smm			"Bad record header");
256299425Smm		return (ARCHIVE_FATAL);
257313929Smm	}
258313929Smm	ver = _warc_rdver(buf, eoh - buf);
259313929Smm	/* we currently support WARC 0.12 to 1.0 */
260313929Smm	if (ver == 0U) {
261299425Smm		archive_set_error(
262299425Smm			&a->archive, ARCHIVE_ERRNO_MISC,
263313929Smm			"Invalid record version");
264299425Smm		return (ARCHIVE_FATAL);
265313929Smm	} else if (ver < 1200U || ver > 10000U) {
266313929Smm		archive_set_error(
267313929Smm			&a->archive, ARCHIVE_ERRNO_MISC,
268313929Smm			"Unsupported record version: %u.%u",
269313929Smm			ver / 10000, (ver % 10000) / 100);
270313929Smm		return (ARCHIVE_FATAL);
271313929Smm	}
272313929Smm	cntlen = _warc_rdlen(buf, eoh - buf);
273313929Smm	if (cntlen < 0) {
274299425Smm		/* nightmare!  the specs say content-length is mandatory
275299425Smm		 * so I don't feel overly bad stopping the reader here */
276299425Smm		archive_set_error(
277299425Smm			&a->archive, EINVAL,
278299425Smm			"Bad content length");
279299425Smm		return (ARCHIVE_FATAL);
280313929Smm	}
281313929Smm	rtime = _warc_rdrtm(buf, eoh - buf);
282313929Smm	if (rtime == (time_t)-1) {
283299425Smm		/* record time is mandatory as per WARC/1.0,
284299425Smm		 * so just barf here, fast and loud */
285299425Smm		archive_set_error(
286299425Smm			&a->archive, EINVAL,
287299425Smm			"Bad record time");
288299425Smm		return (ARCHIVE_FATAL);
289299425Smm	}
290299425Smm
291299425Smm	/* let the world know we're a WARC archive */
292299425Smm	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
293299425Smm	if (ver != w->pver) {
294299425Smm		/* stringify this entry's version */
295299425Smm		archive_string_sprintf(&w->sver,
296313929Smm			"WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
297299425Smm		/* remember the version */
298299425Smm		w->pver = ver;
299299425Smm	}
300299425Smm	/* start off with the type */
301299425Smm	ftyp = _warc_rdtyp(buf, eoh - buf);
302299425Smm	/* and let future calls know about the content */
303299425Smm	w->cntlen = cntlen;
304299425Smm	w->cntoff = 0U;
305299425Smm	mtime = 0;/* Avoid compiling error on some platform. */
306299425Smm
307299425Smm	switch (ftyp) {
308299425Smm	case WT_RSRC:
309299425Smm	case WT_RSP:
310299425Smm		/* only try and read the filename in the cases that are
311299425Smm		 * guaranteed to have one */
312299425Smm		fnam = _warc_rduri(buf, eoh - buf);
313299425Smm		/* check the last character in the URI to avoid creating
314299425Smm		 * directory endpoints as files, see Todo above */
315299425Smm		if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
316299425Smm			/* break here for now */
317299425Smm			fnam.len = 0U;
318299425Smm			fnam.str = NULL;
319299425Smm			break;
320299425Smm		}
321299425Smm		/* bang to our string pool, so we save a
322299425Smm		 * malloc()+free() roundtrip */
323299425Smm		if (fnam.len + 1U > w->pool.len) {
324299425Smm			w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
325299425Smm			w->pool.str = realloc(w->pool.str, w->pool.len);
326299425Smm		}
327299425Smm		memcpy(w->pool.str, fnam.str, fnam.len);
328299425Smm		w->pool.str[fnam.len] = '\0';
329305192Smm		/* let no one else know about the pool, it's a secret, shhh */
330299425Smm		fnam.str = w->pool.str;
331299425Smm
332299425Smm		/* snarf mtime or deduce from rtime
333299425Smm		 * this is a custom header added by our writer, it's quite
334299425Smm		 * hard to believe anyone else would go through with it
335299425Smm		 * (apart from being part of some http responses of course) */
336299425Smm		if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
337299425Smm			mtime = rtime;
338299425Smm		}
339299425Smm		break;
340368708Smm	case WT_NONE:
341368708Smm	case WT_INFO:
342368708Smm	case WT_META:
343368708Smm	case WT_REQ:
344368708Smm	case WT_RVIS:
345368708Smm	case WT_CONV:
346368708Smm	case WT_CONT:
347368708Smm	case LAST_WT:
348299425Smm	default:
349299425Smm		fnam.len = 0U;
350299425Smm		fnam.str = NULL;
351299425Smm		break;
352299425Smm	}
353299425Smm
354299425Smm	/* now eat some of those delicious buffer bits */
355299425Smm	__archive_read_consume(a, eoh - buf);
356299425Smm
357299425Smm	switch (ftyp) {
358299425Smm	case WT_RSRC:
359299425Smm	case WT_RSP:
360299425Smm		if (fnam.len > 0U) {
361299425Smm			/* populate entry object */
362299425Smm			archive_entry_set_filetype(entry, AE_IFREG);
363299425Smm			archive_entry_copy_pathname(entry, fnam.str);
364299425Smm			archive_entry_set_size(entry, cntlen);
365299425Smm			archive_entry_set_perm(entry, 0644);
366299425Smm			/* rtime is the new ctime, mtime stays mtime */
367299425Smm			archive_entry_set_ctime(entry, rtime, 0L);
368299425Smm			archive_entry_set_mtime(entry, mtime, 0L);
369299425Smm			break;
370299425Smm		}
371299425Smm		/* FALLTHROUGH */
372368708Smm	case WT_NONE:
373368708Smm	case WT_INFO:
374368708Smm	case WT_META:
375368708Smm	case WT_REQ:
376368708Smm	case WT_RVIS:
377368708Smm	case WT_CONV:
378368708Smm	case WT_CONT:
379368708Smm	case LAST_WT:
380299425Smm	default:
381299425Smm		/* consume the content and start over */
382299425Smm		_warc_skip(a);
383299425Smm		goto start_over;
384299425Smm	}
385299425Smm	return (ARCHIVE_OK);
386299425Smm}
387299425Smm
388299425Smmstatic int
389299425Smm_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
390299425Smm{
391299425Smm	struct warc_s *w = a->format->data;
392299425Smm	const char *rab;
393299425Smm	ssize_t nrd;
394299425Smm
395299425Smm	if (w->cntoff >= w->cntlen) {
396299425Smm	eof:
397299425Smm		/* it's our lucky day, no work, we can leave early */
398299425Smm		*buf = NULL;
399299425Smm		*bsz = 0U;
400299425Smm		*off = w->cntoff + 4U/*for \r\n\r\n separator*/;
401299425Smm		w->unconsumed = 0U;
402299425Smm		return (ARCHIVE_EOF);
403299425Smm	}
404299425Smm
405299425Smm	rab = __archive_read_ahead(a, 1U, &nrd);
406299425Smm	if (nrd < 0) {
407299425Smm		*bsz = 0U;
408299425Smm		/* big catastrophe */
409299425Smm		return (int)nrd;
410299425Smm	} else if (nrd == 0) {
411299425Smm		goto eof;
412299425Smm	} else if ((size_t)nrd > w->cntlen - w->cntoff) {
413299425Smm		/* clamp to content-length */
414299425Smm		nrd = w->cntlen - w->cntoff;
415299425Smm	}
416299425Smm	*off = w->cntoff;
417299425Smm	*bsz = nrd;
418299425Smm	*buf = rab;
419299425Smm
420299425Smm	w->cntoff += nrd;
421299425Smm	w->unconsumed = (size_t)nrd;
422299425Smm	return (ARCHIVE_OK);
423299425Smm}
424299425Smm
425299425Smmstatic int
426299425Smm_warc_skip(struct archive_read *a)
427299425Smm{
428299425Smm	struct warc_s *w = a->format->data;
429299425Smm
430299425Smm	__archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
431299425Smm	w->cntlen = 0U;
432299425Smm	w->cntoff = 0U;
433299425Smm	return (ARCHIVE_OK);
434299425Smm}
435299425Smm
436299425Smm
437299425Smm/* private routines */
/*
 * Strip the const qualifier from C without a qualifier-dropping cast.
 *
 * The pointer is passed through a union of `const void *' and `void *'
 * instead of doing arithmetic relative to a made-up address: subtracting
 * and adding pointers that do not belong to the same object is undefined.
 * Returns the same address as C; NULL stays NULL.
 */
static void*
deconst(const void *c)
{
	union {
		const void *ro;
		void *rw;
	} pun;

	pun.ro = c;
	return pun.rw;
}
443299425Smm
/*
 * Find the first occurrence of NEEDLE (NEEDLESIZE bytes) inside HAY
 * (HAYSIZE bytes); neither has to be NUL-terminated.
 *
 * Returns a pointer to the match, HAY itself for an empty needle, or NULL
 * if there is no match.  The search keeps a rolling XOR over a window of
 * NEEDLESIZE bytes and only compares bytes when the XORs agree.
 */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const hay_end = hay + haysize;
	const char *first;
	/* one past the last byte of the current window */
	const char *head;
	/* first byte of the current window */
	const char *tail;
	unsigned int win_sum;
	unsigned int pat_sum;
	unsigned int same;
	size_t i;

	/* a 0-sized needle is defined to be found anywhere in haystack */
	if (needlesize == 0UL) {
		return deconst(hay);
	}
	/* the earliest possible match starts at the first occurrence of
	 * the needle's first byte */
	first = memchr(hay, *needle, haysize);
	if (first == NULL) {
		return NULL;
	}

	/* XOR up the needle and the first window in lockstep, and note on
	 * the way whether they happen to be identical already */
	win_sum = *first;
	pat_sum = *first;
	same = 1U;
	head = first + 1U;
	for (i = 1U; i < needlesize && head < hay_end; i++, head++) {
		win_sum ^= *head;
		pat_sum ^= needle[i];
		same &= *head == needle[i];
	}

	if (i < needlesize) {
		/* what is left of the haystack is shorter than the needle */
		return NULL;
	}
	if (same) {
		return deconst(first);
	}

	/* slide the window one byte at a time */
	tail = first;
	while (head < hay_end) {
		win_sum ^= *tail++;
		win_sum ^= *head++;

		/* with equal XORs and NEEDLESIZE - 1 equal bytes the last
		 * byte is bound to be equal as well */
		if (win_sum == pat_sum &&
		    memcmp(tail, needle, needlesize - 1U) == 0) {
			return deconst(tail);
		}
	}
	return NULL;
}
501299425Smm
/*
 * Read an unsigned decimal number from STR that is expected to lie within
 * [LLIM, ULIM].
 *
 * Reading stops at the first non-digit, once another digit would push the
 * value beyond ULIM, or after as many digits as max(ULIM, 10) has.  *EP is
 * set to the first unread character.  Returns the value, -1 if no digit
 * was read at all, or -2 if the value is outside [LLIM, ULIM].
 */
static int
strtoi_lim(const char *str, const char **ep, int llim, int ulim)
{
	const char *cur = str;
	int value = 0;
	/* loses one decimal digit per digit read; no more digits are
	 * accepted once it hits 0 */
	int budget = (ulim > 10) ? ulim : 10;

	while (value * 10 <= ulim && budget != 0 &&
	       *cur >= '0' && *cur <= '9') {
		value = value * 10 + (*cur - '0');
		cur++;
		budget /= 10;
	}

	*ep = cur;
	if (cur == str) {
		return -1;
	}
	if (value < llim || value > ulim) {
		return -2;
	}
	return value;
}
524299425Smm
/*
 * Convert the broken-down UTC time T into seconds since the epoch.
 *
 * Uses timegm() or _mkgmtime64() where configure found them.  Otherwise
 * mktime() is called for its side effect of filling in tm_yday, and the
 * result is computed from the fields of T directly.  Returns (time_t)-1
 * if mktime() fails in that fallback.
 */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE_TIMEGM
	/* Use platform timegm() if available. */
	return (timegm(t));
#elif HAVE__MKGMTIME64
	return (_mkgmtime64(t));
#else
	/* Else use direct calculation using POSIX assumptions. */
	/* First, fix up tm_yday based on the year/month/day. */
	if (mktime(t) == (time_t)-1)
		return ((time_t)-1);
	/* Then we can compute timegm() from first principles.
	 * Every term is widened to time_t before it is multiplied: in plain
	 * int arithmetic (tm_year - 70) * 31536000 overflows a 32-bit int
	 * for any year from 2039 on. */
	return ((time_t)t->tm_sec
	    + (time_t)t->tm_min * 60
	    + (time_t)t->tm_hour * 3600
	    + (time_t)t->tm_yday * 86400
	    + (time_t)(t->tm_year - 70) * 31536000
	    + (time_t)((t->tm_year - 69) / 4) * 86400
	    - (time_t)((t->tm_year - 1) / 100) * 86400
	    + (time_t)((t->tm_year + 299) / 400) * 86400);
#endif
}
549299425Smm
/*
 * Like strptime() but strictly for ISO 8601 Zulu strings of the shape
 * YYYY-MM-DDThh:mm:ssZ, optionally preceded by blanks or tabs.
 *
 * Returns the time stamp, or (time_t)-1 if S does not parse.  If ENDPTR
 * is not NULL it receives the position parsing stopped at; after a
 * mismatching separator that is the character following the separator.
 */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	struct tm parts;
	time_t stamp = (time_t)-1;

	/* make sure the broken-down time is clean */
	memset(&parts, 0, sizeof(parts));

	/* as a courtesy to our callers, and since this is a non-standard
	 * routine, we skip leading whitespace */
	for (; *s == ' ' || *s == '\t'; s++)
		;

	/* each field is followed by its separator; the separator test
	 * advances S whether or not it matches */
	parts.tm_year = strtoi_lim(s, &s, 1583, 4095);
	if (parts.tm_year < 0 || *s++ != '-') {
		goto done;
	}
	parts.tm_mon = strtoi_lim(s, &s, 1, 12);
	if (parts.tm_mon < 0 || *s++ != '-') {
		goto done;
	}
	parts.tm_mday = strtoi_lim(s, &s, 1, 31);
	if (parts.tm_mday < 0 || *s++ != 'T') {
		goto done;
	}
	parts.tm_hour = strtoi_lim(s, &s, 0, 23);
	if (parts.tm_hour < 0 || *s++ != ':') {
		goto done;
	}
	parts.tm_min = strtoi_lim(s, &s, 0, 59);
	if (parts.tm_min < 0 || *s++ != ':') {
		goto done;
	}
	parts.tm_sec = strtoi_lim(s, &s, 0, 60);
	if (parts.tm_sec < 0 || *s++ != 'Z') {
		goto done;
	}

	/* struct tm counts years from 1900 and months from 0 */
	parts.tm_year -= 1900;
	parts.tm_mon -= 1;

	/* now convert the broken-down time to a unix stamp using UTC */
	stamp = time_from_tm(&parts);

done:
	if (endptr != NULL) {
		*endptr = deconst(s);
	}
	return stamp;
}
603299425Smm
/*
 * Read the version off a "WARC/<major>.<minor>" line at the start of BUF.
 *
 * The result is major * 10000 + minor * 100 for a two-digit minor and
 * major * 10000 + minor * 100 for a one-digit minor where that single
 * digit counts as units (so 1.0 yields 10000 and 0.12 yields 1200).
 * Returns 0 if BUF is shorter than 12 bytes, does not start with the
 * magic, has no <digit>.<digit> version, or the version is not followed
 * by the terminator its format calls for.
 */
static unsigned int
_warc_rdver(const char *buf, size_t bsz)
{
	static const char magic[] = "WARC/";
	const size_t magic_len = sizeof(magic) - 1U;
	const char *num;
	const char *term;
	unsigned int version;

	if (bsz < 12 || memcmp(buf, magic, magic_len) != 0) {
		/* buffer too small or invalid magic */
		return 0U;
	}
	num = buf + magic_len;

	if (!isdigit((unsigned char)num[0U]) || num[1U] != '.' ||
	    !isdigit((unsigned char)num[2U])) {
		return 0U;
	}

	/* major version first, then one or two digits of minor version */
	version = (num[0U] - '0') * 10000U;
	if (isdigit((unsigned char)num[3U])) {
		version += (num[2U] - '0') * 1000U;
		version += (num[3U] - '0') * 100U;
		term = num + 4U;
	} else {
		version += (num[2U] - '0') * 100U;
		term = num + 3U;
	}

	/*
	 * WARC below version 0.12 has a space-separated header
	 * WARC 0.12 and above terminates the version with a CRLF
	 */
	if (version >= 1200U) {
		return (memcmp(term, "\r\n", 2U) == 0) ? version : 0U;
	}
	return (*term == ' ' || *term == '\t') ? version : 0U;
}
648299425Smm
649299425Smmstatic unsigned int
650299425Smm_warc_rdtyp(const char *buf, size_t bsz)
651299425Smm{
652299425Smm	static const char _key[] = "\r\nWARC-Type:";
653313929Smm	const char *val, *eol;
654299425Smm
655299425Smm	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
656299425Smm		/* no bother */
657299425Smm		return WT_NONE;
658299425Smm	}
659313929Smm	val += sizeof(_key) - 1U;
660313929Smm	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
661313929Smm		/* no end of line */
662313929Smm		return WT_NONE;
663313929Smm	}
664313929Smm
665299425Smm	/* overread whitespace */
666315433Smm	while (val < eol && (*val == ' ' || *val == '\t'))
667302295Smm		++val;
668299425Smm
669313929Smm	if (val + 8U == eol) {
670313929Smm		if (memcmp(val, "resource", 8U) == 0)
671313929Smm			return WT_RSRC;
672313929Smm		else if (memcmp(val, "response", 8U) == 0)
673313929Smm			return WT_RSP;
674299425Smm	}
675299425Smm	return WT_NONE;
676299425Smm}
677299425Smm
678299425Smmstatic warc_string_t
679299425Smm_warc_rduri(const char *buf, size_t bsz)
680299425Smm{
681299425Smm	static const char _key[] = "\r\nWARC-Target-URI:";
682313929Smm	const char *val, *uri, *eol, *p;
683299425Smm	warc_string_t res = {0U, NULL};
684299425Smm
685299425Smm	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
686299425Smm		/* no bother */
687299425Smm		return res;
688299425Smm	}
689299425Smm	/* overread whitespace */
690302295Smm	val += sizeof(_key) - 1U;
691313929Smm	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
692313929Smm		/* no end of line */
693313929Smm		return res;
694313929Smm	}
695313929Smm
696315433Smm	while (val < eol && (*val == ' ' || *val == '\t'))
697302295Smm		++val;
698299425Smm
699299425Smm	/* overread URL designators */
700313929Smm	if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
701299425Smm		/* not touching that! */
702299425Smm		return res;
703299425Smm	}
704299425Smm
705313929Smm	/* spaces inside uri are not allowed, CRLF should follow */
706313929Smm	for (p = val; p < eol; p++) {
707315433Smm		if (isspace((unsigned char)*p))
708313929Smm			return res;
709313929Smm	}
710313929Smm
711313929Smm	/* there must be at least space for ftp */
712313929Smm	if (uri < (val + 3U))
713313929Smm		return res;
714313929Smm
715313929Smm	/* move uri to point to after :// */
716299425Smm	uri += 3U;
717299425Smm
718299425Smm	/* now then, inspect the URI */
719299425Smm	if (memcmp(val, "file", 4U) == 0) {
720299425Smm		/* perfect, nothing left to do here */
721299425Smm
722299425Smm	} else if (memcmp(val, "http", 4U) == 0 ||
723299425Smm		   memcmp(val, "ftp", 3U) == 0) {
724299425Smm		/* overread domain, and the first / */
725299425Smm		while (uri < eol && *uri++ != '/');
726299425Smm	} else {
727299425Smm		/* not sure what to do? best to bugger off */
728299425Smm		return res;
729299425Smm	}
730299425Smm	res.str = uri;
731299425Smm	res.len = eol - uri;
732299425Smm	return res;
733299425Smm}
734299425Smm
/*
 * Read the value of the Content-Length header in BUF.
 *
 * Returns the length, or -1 if the header is missing, its line is not
 * terminated, the value does not start with a digit, strtol() reports an
 * error, or anything but the end of line follows the number.
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nContent-Length:";
	const char *digits;
	const char *line_end;
	char *stop = NULL;
	long int parsed;

	digits = xmemmem(buf, bsz, _key, sizeof(_key) - 1U);
	if (digits == NULL) {
		/* no bother */
		return -1;
	}
	digits += sizeof(_key) - 1U;

	line_end = _warc_find_eol(digits, buf + bsz - digits);
	if (line_end == NULL) {
		/* no end of line */
		return -1;
	}

	/* skip leading whitespace */
	for (; digits < line_end; digits++) {
		if (*digits != ' ' && *digits != '\t')
			break;
	}
	/* there must be at least one digit */
	if (!isdigit((unsigned char)*digits)) {
		return -1;
	}

	errno = 0;
	parsed = strtol(digits, &stop, 10);
	/* conversion must succeed and the line must end right after it */
	if (errno != 0 || stop != line_end) {
		return -1;
	}
	return (size_t)parsed;
}
768299425Smm
/*
 * Read the record time from the WARC-Date header in BUF.
 *
 * Returns the time stamp, or (time_t)-1 if the header is missing, its
 * line is not terminated, or the value is not a Zulu time stamp that
 * extends exactly to the end of the line.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Date:";
	const size_t key_len = sizeof(_key) - 1U;
	const char *field = xmemmem(buf, bsz, _key, key_len);
	const char *line_end;
	char *stop = NULL;
	time_t stamp;

	if (field == NULL) {
		/* no bother */
		return (time_t)-1;
	}
	field += key_len;

	line_end = _warc_find_eol(field, buf + bsz - field);
	if (line_end == NULL) {
		/* no end of line */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	stamp = xstrpisotime(field, &stop);
	/* line must end where the time stamp ends */
	return (stop == line_end) ? stamp : (time_t)-1;
}
795299425Smm
/*
 * Read the modification time from the Last-Modified header in BUF.
 *
 * Returns the time stamp, or (time_t)-1 if the header is missing, its
 * line is not terminated, or the value is not a Zulu time stamp that
 * extends exactly to the end of the line.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nLast-Modified:";
	const size_t key_len = sizeof(_key) - 1U;
	const char *field = xmemmem(buf, bsz, _key, key_len);
	const char *line_end;
	char *stop = NULL;
	time_t stamp;

	if (field == NULL) {
		/* no bother */
		return (time_t)-1;
	}
	field += key_len;

	line_end = _warc_find_eol(field, buf + bsz - field);
	if (line_end == NULL) {
		/* no end of line */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	stamp = xstrpisotime(field, &stop);
	/* line must end where the time stamp ends */
	return (stop == line_end) ? stamp : (time_t)-1;
}
822299425Smm
/*
 * Find the end of the record header in BUF, i.e. the first CRLF CRLF.
 * Returns a pointer to the byte right after it, or NULL if there is none.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n\r\n";
	const size_t marker_len = sizeof(_marker) - 1U;
	const char *at = xmemmem(buf, bsz, _marker, marker_len);

	return (at == NULL) ? NULL : at + marker_len;
}
834299425Smm
/*
 * Find the end of the current line in BUF.
 * Returns a pointer to the first CRLF, or NULL if there is none.
 */
static const char*
_warc_find_eol(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n";

	return xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
}
843299425Smm/* archive_read_support_format_warc.c ends here */
844