archive_read_support_format_warc.c revision 358090
1126209Sache/*-
2146040Stjr * Copyright (c) 2014 Sebastian Freundt
3250724Sjkim * All rights reserved.
4146040Stjr *
5146040Stjr * Redistribution and use in source and binary forms, with or without
6126209Sache * modification, are permitted provided that the following conditions
7126209Sache * are met:
8146040Stjr * 1. Redistributions of source code must retain the above copyright
9146040Stjr *    notice, this list of conditions and the following disclaimer.
10146040Stjr * 2. Redistributions in binary form must reproduce the above copyright
11126209Sache *    notice, this list of conditions and the following disclaimer in the
12126209Sache *    documentation and/or other materials provided with the distribution.
13126209Sache *
14126209Sache * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15146040Stjr * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16126209Sache * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17146040Stjr * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18250724Sjkim * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19250724Sjkim * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20126209Sache * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21126209Sache * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22126209Sache * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23126209Sache * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24146040Stjr */
25146040Stjr
26126209Sache#include "archive_platform.h"
27126209Sache__FBSDID("$FreeBSD: stable/10/contrib/libarchive/libarchive/archive_read_support_format_warc.c 358090 2020-02-19 01:51:44Z mm $");
28126209Sache
29126209Sache/**
30126209Sache * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
31126209Sache * ISO 28500:2009.
32126209Sache * For the purposes of this file we used the final draft from:
33126209Sache * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
34126209Sache *
35126209Sache * Todo:
36126209Sache * [ ] real-world warcs can contain resources at endpoints ending in /
37126209Sache *     e.g. http://bibnum.bnf.fr/warc/
38126209Sache *     if you're lucky their response contains a Content-Location: header
39126209Sache *     pointing to a unix-compliant filename, in the example above it's
40126209Sache *     Content-Location: http://bibnum.bnf.fr/warc/index.html
41126209Sache *     however, that's not mandated and github for example doesn't follow
42126209Sache *     this convention.
43126209Sache *     We need a set of archive options to control what to do with
44126209Sache *     entries like these, at the moment care is taken to skip them.
45250724Sjkim *
46126209Sache **/
47126209Sache
48250724Sjkim#ifdef HAVE_SYS_STAT_H
49126209Sache#include <sys/stat.h>
50126209Sache#endif
51126209Sache#ifdef HAVE_ERRNO_H
52126209Sache#include <errno.h>
53250724Sjkim#endif
54126209Sache#ifdef HAVE_STDLIB_H
55126209Sache#include <stdlib.h>
56126209Sache#endif
57126209Sache#ifdef HAVE_STRING_H
58126209Sache#include <string.h>
59250724Sjkim#endif
60126209Sache#ifdef HAVE_LIMITS_H
61126209Sache#include <limits.h>
62126209Sache#endif
63126209Sache#ifdef HAVE_CTYPE_H
64250724Sjkim#include <ctype.h>
65250724Sjkim#endif
66250724Sjkim#ifdef HAVE_TIME_H
67250724Sjkim#include <time.h>
68126209Sache#endif
69126209Sache
70126209Sache#include "archive.h"
71126209Sache#include "archive_entry.h"
72126209Sache#include "archive_private.h"
73250724Sjkim#include "archive_read_private.h"
74126209Sache
/*
 * WARC record types.  The header parser in this file only ever reports
 * WT_NONE, WT_RSRC or WT_RSP; only the latter two are turned into
 * archive entries, records of any other type are skipped.
 */
typedef enum {
	WT_NONE,	/* no WARC-Type header, or a type we do not recognise */
	WT_INFO,	/* warcinfo */
	WT_META,	/* metadata */
	WT_RSRC,	/* resource */
	WT_REQ,		/* request, unsupported */
	WT_RSP,		/* response */
	WT_RVIS,	/* revisit, unsupported */
	WT_CONV,	/* conversion, unsupported */
	WT_CONT,	/* continuation, unsupported at the moment */
	LAST_WT		/* invalid type */
} warc_type_t;
96126209Sache
/*
 * Read-only string slice: LEN bytes starting at STR.
 * A slice may point into a larger buffer, so STR is not necessarily
 * NUL-terminated; always honour LEN.
 */
typedef struct {
	size_t len;		/* number of bytes in the slice */
	const char *str;	/* first byte of the slice, or NULL if empty */
} warc_string_t;
101126209Sache
/* Writable, growable byte buffer: STR points at LEN allocated bytes. */
typedef struct {
	size_t len;	/* allocated size of STR in bytes */
	char *str;	/* heap storage, NULL while nothing is allocated */
} warc_strbuf_t;
106250724Sjkim
107126209Sachestruct warc_s {
108126209Sache	/* content length ahead */
109126209Sache	size_t cntlen;
110250724Sjkim	/* and how much we've processed so far */
111126209Sache	size_t cntoff;
112126209Sache	/* and how much we need to consume between calls */
113126209Sache	size_t unconsumed;
114126209Sache
115250724Sjkim	/* string pool */
116126209Sache	warc_strbuf_t pool;
117126209Sache	/* previous version */
118126209Sache	unsigned int pver;
119250724Sjkim	/* stringified format name */
120126209Sache	struct archive_string sver;
121126209Sache};
122126209Sache
123250724Sjkimstatic int _warc_bid(struct archive_read *a, int);
124126209Sachestatic int _warc_cleanup(struct archive_read *a);
125126209Sachestatic int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
126126209Sachestatic int _warc_skip(struct archive_read *a);
127250724Sjkimstatic int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
128126209Sache
129126209Sache/* private routines */
130126209Sachestatic unsigned int _warc_rdver(const char buf[10], size_t bsz);
131126209Sachestatic unsigned int _warc_rdtyp(const char *buf, size_t bsz);
132126209Sachestatic warc_string_t _warc_rduri(const char *buf, size_t bsz);
133250724Sjkimstatic ssize_t _warc_rdlen(const char *buf, size_t bsz);
134126209Sachestatic time_t _warc_rdrtm(const char *buf, size_t bsz);
135126209Sachestatic time_t _warc_rdmtm(const char *buf, size_t bsz);
136126209Sachestatic const char *_warc_find_eoh(const char *buf, size_t bsz);
137250724Sjkimstatic const char *_warc_find_eol(const char *buf, size_t bsz);
138126209Sache
139126209Sacheint
140126209Sachearchive_read_support_format_warc(struct archive *_a)
141250724Sjkim{
142126209Sache	struct archive_read *a = (struct archive_read *)_a;
143126209Sache	struct warc_s *w;
144126209Sache	int r;
145250724Sjkim
146126209Sache	archive_check_magic(_a, ARCHIVE_READ_MAGIC,
147126209Sache	    ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
148126209Sache
149126209Sache	if ((w = calloc(1, sizeof(*w))) == NULL) {
150126209Sache		archive_set_error(&a->archive, ENOMEM,
151126209Sache		    "Can't allocate warc data");
152126209Sache		return (ARCHIVE_FATAL);
153250724Sjkim	}
154126209Sache
155131543Stjr	r = __archive_read_register_format(
156131543Stjr		a, w, "warc",
157131543Stjr		_warc_bid, NULL, _warc_rdhdr, _warc_read,
158250724Sjkim		_warc_skip, NULL, _warc_cleanup, NULL, NULL);
159131543Stjr
160146040Stjr	if (r != ARCHIVE_OK) {
161146040Stjr		free(w);
162250724Sjkim		return (r);
163146040Stjr	}
164146040Stjr	return (ARCHIVE_OK);
165146040Stjr}
166146040Stjr
167250724Sjkimstatic int
168146040Stjr_warc_cleanup(struct archive_read *a)
169146040Stjr{
170146040Stjr	struct warc_s *w = a->format->data;
171250724Sjkim
172146040Stjr	if (w->pool.len > 0U) {
173146040Stjr		free(w->pool.str);
174146040Stjr	}
175250724Sjkim	archive_string_free(&w->sver);
176250724Sjkim	free(w);
177146040Stjr	a->format->data = NULL;
178126209Sache	return (ARCHIVE_OK);
179126209Sache}
180126209Sache
/*
 * Bid for the stream: peek at the first 12 bytes and check that they
 * form a version line of a WARC release we support (0.12 to 1.0).
 * Returns 64 if they do and -1 otherwise.
 */
static int
_warc_bid(struct archive_read *a, int best_bid)
{
	const char *probe;
	ssize_t avail;
	unsigned int version;

	(void)best_bid; /* UNUSED */

	/* the first line of the file should already be a record, and
	 * our magic cookie is at least 12 bytes long */
	probe = __archive_read_ahead(a, 12U, &avail);
	if (probe == NULL || avail < 12) {
		return -1;
	}

	/* versions are encoded as major * 10000 + minor * 100 */
	version = _warc_rdver(probe, avail);
	if (version >= 1200U && version <= 10000U) {
		/* be confident */
		return (64);
	}
	/* we only support WARC 0.12 to 1.0 */
	return -1;
}
209126209Sache
210126209Sachestatic int
211126209Sache_warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
212126209Sache{
213126209Sache#define HDR_PROBE_LEN		(12U)
214126209Sache	struct warc_s *w = a->format->data;
215126209Sache	unsigned int ver;
216126209Sache	const char *buf;
217126209Sache	ssize_t nrd;
218126209Sache	const char *eoh;
219126209Sache	/* for the file name, saves some strndup()'ing */
220126209Sache	warc_string_t fnam;
221126209Sache	/* warc record type, not that we really use it a lot */
222131543Stjr	warc_type_t ftyp;
223131543Stjr	/* content-length+error monad */
224126209Sache	ssize_t cntlen;
225126209Sache	/* record time is the WARC-Date time we reinterpret it as ctime */
226126209Sache	time_t rtime;
227126209Sache	/* mtime is the Last-Modified time which will be the entry's mtime */
228126209Sache	time_t mtime;
229126209Sache
230126209Sachestart_over:
231126209Sache	/* just use read_ahead() they keep track of unconsumed
232126209Sache	 * bits and bobs for us; no need to put an extra shift in
233126209Sache	 * and reproduce that functionality here */
234126209Sache	buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
235126209Sache
236146040Stjr	if (nrd < 0) {
237126209Sache		/* no good */
238126209Sache		archive_set_error(
239126209Sache			&a->archive, ARCHIVE_ERRNO_MISC,
240126209Sache			"Bad record header");
241126209Sache		return (ARCHIVE_FATAL);
242126209Sache	} else if (buf == NULL) {
243126209Sache		/* there should be room for at least WARC/bla\r\n
244126209Sache		 * must be EOF therefore */
245131543Stjr		return (ARCHIVE_EOF);
246131543Stjr	}
247131543Stjr 	/* looks good so far, try and find the end of the header now */
248131543Stjr	eoh = _warc_find_eoh(buf, nrd);
249126209Sache	if (eoh == NULL) {
250131543Stjr		/* still no good, the header end might be beyond the
251131543Stjr		 * probe we've requested, but then again who'd cram
252126209Sache		 * so much stuff into the header *and* be 28500-compliant */
253126209Sache		archive_set_error(
254126209Sache			&a->archive, ARCHIVE_ERRNO_MISC,
255126209Sache			"Bad record header");
256126209Sache		return (ARCHIVE_FATAL);
257126209Sache	}
258126209Sache	ver = _warc_rdver(buf, eoh - buf);
259126209Sache	/* we currently support WARC 0.12 to 1.0 */
260126209Sache	if (ver == 0U) {
261126209Sache		archive_set_error(
262250724Sjkim			&a->archive, ARCHIVE_ERRNO_MISC,
263250724Sjkim			"Invalid record version");
264250724Sjkim		return (ARCHIVE_FATAL);
265250724Sjkim	} else if (ver < 1200U || ver > 10000U) {
266250724Sjkim		archive_set_error(
267126209Sache			&a->archive, ARCHIVE_ERRNO_MISC,
268126209Sache			"Unsupported record version: %u.%u",
269126209Sache			ver / 10000, (ver % 10000) / 100);
270126209Sache		return (ARCHIVE_FATAL);
271126209Sache	}
272126209Sache	cntlen = _warc_rdlen(buf, eoh - buf);
273126209Sache	if (cntlen < 0) {
274126209Sache		/* nightmare!  the specs say content-length is mandatory
275126209Sache		 * so I don't feel overly bad stopping the reader here */
276126209Sache		archive_set_error(
277126209Sache			&a->archive, EINVAL,
278126209Sache			"Bad content length");
279126209Sache		return (ARCHIVE_FATAL);
280126209Sache	}
281126209Sache	rtime = _warc_rdrtm(buf, eoh - buf);
282126209Sache	if (rtime == (time_t)-1) {
283126209Sache		/* record time is mandatory as per WARC/1.0,
284126209Sache		 * so just barf here, fast and loud */
285126209Sache		archive_set_error(
286126209Sache			&a->archive, EINVAL,
287126209Sache			"Bad record time");
288126209Sache		return (ARCHIVE_FATAL);
289126209Sache	}
290126209Sache
291126209Sache	/* let the world know we're a WARC archive */
292126209Sache	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
293126209Sache	if (ver != w->pver) {
294126209Sache		/* stringify this entry's version */
295126209Sache		archive_string_sprintf(&w->sver,
296126209Sache			"WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
297126209Sache		/* remember the version */
298126209Sache		w->pver = ver;
299126209Sache	}
300126209Sache	/* start off with the type */
301126209Sache	ftyp = _warc_rdtyp(buf, eoh - buf);
302146040Stjr	/* and let future calls know about the content */
303146040Stjr	w->cntlen = cntlen;
304146040Stjr	w->cntoff = 0U;
305126209Sache	mtime = 0;/* Avoid compiling error on some platform. */
306146040Stjr
307126209Sache	switch (ftyp) {
308126209Sache	case WT_RSRC:
309126209Sache	case WT_RSP:
310126209Sache		/* only try and read the filename in the cases that are
311250724Sjkim		 * guaranteed to have one */
312126209Sache		fnam = _warc_rduri(buf, eoh - buf);
313126209Sache		/* check the last character in the URI to avoid creating
314126209Sache		 * directory endpoints as files, see Todo above */
315126209Sache		if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
316126209Sache			/* break here for now */
317126209Sache			fnam.len = 0U;
318126209Sache			fnam.str = NULL;
319126209Sache			break;
320126209Sache		}
321146040Stjr		/* bang to our string pool, so we save a
322126209Sache		 * malloc()+free() roundtrip */
323126209Sache		if (fnam.len + 1U > w->pool.len) {
324126209Sache			w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
325126209Sache			w->pool.str = realloc(w->pool.str, w->pool.len);
326126209Sache		}
327126209Sache		memcpy(w->pool.str, fnam.str, fnam.len);
328126209Sache		w->pool.str[fnam.len] = '\0';
329126209Sache		/* let no one else know about the pool, it's a secret, shhh */
330126209Sache		fnam.str = w->pool.str;
331126209Sache
332126209Sache		/* snarf mtime or deduce from rtime
333126209Sache		 * this is a custom header added by our writer, it's quite
334126209Sache		 * hard to believe anyone else would go through with it
335126209Sache		 * (apart from being part of some http responses of course) */
336126209Sache		if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
337126209Sache			mtime = rtime;
338126209Sache		}
339126209Sache		break;
340126209Sache	default:
341250724Sjkim		fnam.len = 0U;
342250724Sjkim		fnam.str = NULL;
343250724Sjkim		break;
344126209Sache	}
345126209Sache
346250724Sjkim	/* now eat some of those delicious buffer bits */
347250724Sjkim	__archive_read_consume(a, eoh - buf);
348250724Sjkim
349250724Sjkim	switch (ftyp) {
350126209Sache	case WT_RSRC:
351126209Sache	case WT_RSP:
352250724Sjkim		if (fnam.len > 0U) {
353250724Sjkim			/* populate entry object */
354250724Sjkim			archive_entry_set_filetype(entry, AE_IFREG);
355250724Sjkim			archive_entry_copy_pathname(entry, fnam.str);
356250724Sjkim			archive_entry_set_size(entry, cntlen);
357250724Sjkim			archive_entry_set_perm(entry, 0644);
358126209Sache			/* rtime is the new ctime, mtime stays mtime */
359126209Sache			archive_entry_set_ctime(entry, rtime, 0L);
360250724Sjkim			archive_entry_set_mtime(entry, mtime, 0L);
361250724Sjkim			break;
362250724Sjkim		}
363250724Sjkim		/* FALLTHROUGH */
364126209Sache	default:
365250724Sjkim		/* consume the content and start over */
366250724Sjkim		_warc_skip(a);
367126209Sache		goto start_over;
368250724Sjkim	}
369250724Sjkim	return (ARCHIVE_OK);
370126209Sache}
371250724Sjkim
372250724Sjkimstatic int
373126209Sache_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
374250724Sjkim{
375250724Sjkim	struct warc_s *w = a->format->data;
376250724Sjkim	const char *rab;
377250724Sjkim	ssize_t nrd;
378126209Sache
379250724Sjkim	if (w->cntoff >= w->cntlen) {
380250724Sjkim	eof:
381250724Sjkim		/* it's our lucky day, no work, we can leave early */
382250724Sjkim		*buf = NULL;
383250724Sjkim		*bsz = 0U;
384126209Sache		*off = w->cntoff + 4U/*for \r\n\r\n separator*/;
385250724Sjkim		w->unconsumed = 0U;
386126209Sache		return (ARCHIVE_EOF);
387126209Sache	}
388250724Sjkim
389250724Sjkim	rab = __archive_read_ahead(a, 1U, &nrd);
390250724Sjkim	if (nrd < 0) {
391250724Sjkim		*bsz = 0U;
392250724Sjkim		/* big catastrophe */
393126209Sache		return (int)nrd;
394250724Sjkim	} else if (nrd == 0) {
395250724Sjkim		goto eof;
396250724Sjkim	} else if ((size_t)nrd > w->cntlen - w->cntoff) {
397250724Sjkim		/* clamp to content-length */
398250724Sjkim		nrd = w->cntlen - w->cntoff;
399250724Sjkim	}
400250724Sjkim	*off = w->cntoff;
401250724Sjkim	*bsz = nrd;
402250724Sjkim	*buf = rab;
403250724Sjkim
404126209Sache	w->cntoff += nrd;
405250724Sjkim	w->unconsumed = (size_t)nrd;
406250724Sjkim	return (ARCHIVE_OK);
407250724Sjkim}
408126209Sache
409250724Sjkimstatic int
410250724Sjkim_warc_skip(struct archive_read *a)
411250724Sjkim{
412126209Sache	struct warc_s *w = a->format->data;
413250724Sjkim
414250724Sjkim	__archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
415250724Sjkim	w->cntlen = 0U;
416126209Sache	w->cntoff = 0U;
417250724Sjkim	return (ARCHIVE_OK);
418250724Sjkim}
419126209Sache
420250724Sjkim
421250724Sjkim/* private routines */
/*
 * Return C with its const qualifier removed, without an explicit
 * const-dropping cast.
 *
 * The conversion goes through a union of the two pointer types, so the
 * pointer value is passed through unchanged and no arithmetic on
 * unrelated addresses is involved.  The caller remains responsible for
 * not writing through the result if the object itself is const.
 */
static void*
deconst(const void *c)
{
	union {
		const void *in;
		void *out;
	} u;

	u.in = c;
	return u.out;
}
427126209Sache
/*
 * Find the first occurrence of NEEDLE (NEEDLESIZE bytes) within HAY
 * (HAYSIZE bytes); neither needs to be NUL-terminated.
 *
 * Returns a pointer to the match inside HAY, HAY itself for an empty
 * needle, or NULL if there is no match.
 *
 * The search keeps a running XOR over a needle-sized window of the
 * haystack and only compares bytes when that XOR equals the needle's.
 */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const hay_end = hay + haysize;
	const char *const needle_end = needle + needlesize;
	const char *h;
	const char *n;
	const char *win;
	unsigned int hay_x;
	unsigned int needle_x;
	unsigned int same = 1U;

	/* a 0-sized needle is defined to be found anywhere in haystack */
	if (needlesize == 0UL) {
		return deconst(hay);
	}
	/* jump to the first place that begins with the needle's first
	 * byte, if there is none we are done already */
	hay = memchr(hay, *needle, haysize);
	if (hay == NULL) {
		return NULL;
	}

	/* The first bytes agree, so both XOR sums start out from that
	 * byte.  Fold in the remaining needle bytes and the haystack
	 * bytes beneath them, noting on the way whether they all match. */
	hay_x = *hay;
	needle_x = *hay;
	h = hay + 1U;
	n = needle + 1U;
	while (h < hay_end && n < needle_end) {
		hay_x ^= *h;
		needle_x ^= *n;
		same &= (*h == *n);
		h++;
		n++;
	}

	if (n < needle_end) {
		/* ran out of haystack before the needle was through */
		return NULL;
	}
	if (same) {
		/* the very first candidate is a match */
		return deconst(hay);
	}

	/* Slide the window one byte at a time: drop the byte leaving on
	 * the left, add the byte entering on the right. */
	win = hay;
	while (h < hay_end) {
		hay_x ^= *win++;
		hay_x ^= *h;

		/* With equal XOR sums, agreement on the first
		 * NEEDLESIZE - 1 bytes implies agreement on the last one
		 * too; WIN is always below H, so no range check needed. */
		if (hay_x == needle_x &&
		    memcmp(win, needle, needlesize - 1U) == 0) {
			return deconst(win);
		}
		h++;
	}
	return NULL;
}
485126209Sache
/*
 * Parse a short unsigned decimal number at STR that is expected to lie
 * within [LLIM, ULIM].
 *
 * Digits are taken only while another one could still fit: at most as
 * many as ULIM has (two for ULIM <= 10), and no more once the value
 * times ten would exceed ULIM.  *EP is set to the first byte not taken.
 *
 * Returns the value, -1 if STR does not start with a digit, or -2 if
 * the value is outside [LLIM, ULIM].
 */
static int
strtoi_lim(const char *str, const char **ep, int llim, int ulim)
{
	const char *cur = str;
	int val = 0;
	/* loses one decimal digit per digit read, 0 means none left */
	int budget = (ulim > 10) ? ulim : 10;

	while (val * 10 <= ulim && budget != 0 &&
	    *cur >= '0' && *cur <= '9') {
		val = val * 10 + (*cur - '0');
		cur++;
		budget /= 10;
	}

	*ep = cur;
	if (cur == str) {
		/* not a single digit */
		return -1;
	}
	if (val < llim || val > ulim) {
		/* out of range */
		return -2;
	}
	return val;
}
508126209Sache
/*
 * Convert the broken-down UTC time T into seconds since the epoch.
 *
 * Uses timegm() or _mkgmtime64() where the build configuration says
 * they exist.  Otherwise mktime() is called to normalise T and fill in
 * tm_yday, and the result is computed from the fields directly.
 * Returns (time_t)-1 if mktime() rejects T.
 */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE_TIMEGM
	/* Use platform timegm() if available. */
	return (timegm(t));
#elif HAVE__MKGMTIME64
	return (_mkgmtime64(t));
#else
	/* Else use direct calculation using POSIX assumptions. */
	/* First, fix up tm_yday based on the year/month/day. */
	if (mktime(t) == (time_t)-1)
		return ((time_t)-1);
	/* Then we can compute timegm() from first principles.
	 * Every term is widened to time_t before multiplying: done in
	 * int, the sum overflows for any stamp from 2038-01-19 on. */
	return ((time_t)t->tm_sec
	    + (time_t)t->tm_min * 60
	    + (time_t)t->tm_hour * 3600
	    + (time_t)t->tm_yday * 86400
	    + (time_t)(t->tm_year - 70) * 31536000
	    + (time_t)((t->tm_year - 69) / 4) * 86400
	    - (time_t)((t->tm_year - 1) / 100) * 86400
	    + (time_t)((t->tm_year + 299) / 400) * 86400);
#endif
}
533250724Sjkim
/*
 * Like strptime() but strictly for ISO 8601 Zulu strings of the form
 * YYYY-MM-DDThh:mm:ssZ; leading blanks and tabs are skipped.
 *
 * Returns the UTC time stamp, or (time_t)-1 if the string does not
 * parse.  If ENDPTR is not NULL, *ENDPTR is set to where parsing
 * stopped, which is just behind the 'Z' on success.
 */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	/* the six numeric fields in order of appearance, each with its
	 * accepted range and the character that has to follow it */
	static const struct {
		int lo;
		int hi;
		char sep;
	} fields[] = {
		{ 1583, 4095, '-' },	/* year */
		{ 1, 12, '-' },		/* month */
		{ 1, 31, 'T' },		/* day of month */
		{ 0, 23, ':' },		/* hour */
		{ 0, 59, ':' },		/* minute */
		{ 0, 60, 'Z' }		/* second */
	};
	int val[sizeof(fields) / sizeof(fields[0])];
	struct tm tm;
	time_t stamp = (time_t)-1;
	size_t i;

	/* as a courtesy to our callers, and since this is a non-standard
	 * routine, we skip leading whitespace */
	for (; *s == ' ' || *s == '\t'; s++)
		;

	for (i = 0U; i < sizeof(fields) / sizeof(fields[0]); i++) {
		val[i] = strtoi_lim(s, &s, fields[i].lo, fields[i].hi);
		/* the separator is only looked at (and stepped over)
		 * when the number itself was fine */
		if (val[i] < 0 || *s++ != fields[i].sep) {
			goto out;
		}
	}

	/* start from a clean tm and massage the values to fulfill
	 * some of POSIX' constraints */
	memset(&tm, 0, sizeof(tm));
	tm.tm_year = val[0] - 1900;
	tm.tm_mon = val[1] - 1;
	tm.tm_mday = val[2];
	tm.tm_hour = val[3];
	tm.tm_min = val[4];
	tm.tm_sec = val[5];

	/* now convert our custom tm struct to a unix stamp using UTC */
	stamp = time_from_tm(&tm);

out:
	if (endptr != NULL) {
		*endptr = deconst(s);
	}
	return stamp;
}
587
/*
 * Read the version from a "WARC/x.y" or "WARC/x.yy" line at BUF.
 *
 * Returns major * 10000 + minor * 100 (1.0 is 10000, 0.12 is 1200), or
 * 0 if BSZ is below 12, the magic is missing, the number is malformed
 * or it is not terminated as its version demands.
 */
static unsigned int
_warc_rdver(const char *buf, size_t bsz)
{
	static const char magic[] = "WARC/";
	const size_t magic_len = sizeof(magic) - 1U;
	const char *num;
	const char *term;
	unsigned int minor;
	unsigned int ver;

	if (bsz < 12 || memcmp(buf, magic, magic_len) != 0) {
		/* buffer too small or invalid magic */
		return 0U;
	}
	num = buf + magic_len;

	/* one digit major, a dot, at least one digit minor */
	if (!isdigit((unsigned char)num[0U]) || num[1U] != '.' ||
	    !isdigit((unsigned char)num[2U])) {
		return 0U;
	}

	/* we support a maximum of 2 digits in the minor version */
	if (isdigit((unsigned char)num[3U])) {
		minor = (num[2U] - '0') * 10U + (num[3U] - '0');
		term = num + 4U;
	} else {
		minor = (num[2U] - '0');
		term = num + 3U;
	}
	ver = (num[0U] - '0') * 10000U + minor * 100U;

	/*
	 * WARC below version 0.12 has a space-separated header
	 * WARC 0.12 and above terminates the version with a CRLF
	 */
	if (ver >= 1200U) {
		return (memcmp(term, "\r\n", 2U) == 0) ? ver : 0U;
	}
	return (*term == ' ' || *term == '\t') ? ver : 0U;
}
632
633static unsigned int
634_warc_rdtyp(const char *buf, size_t bsz)
635{
636	static const char _key[] = "\r\nWARC-Type:";
637	const char *val, *eol;
638
639	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
640		/* no bother */
641		return WT_NONE;
642	}
643	val += sizeof(_key) - 1U;
644	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
645		/* no end of line */
646		return WT_NONE;
647	}
648
649	/* overread whitespace */
650	while (val < eol && (*val == ' ' || *val == '\t'))
651		++val;
652
653	if (val + 8U == eol) {
654		if (memcmp(val, "resource", 8U) == 0)
655			return WT_RSRC;
656		else if (memcmp(val, "response", 8U) == 0)
657			return WT_RSP;
658	}
659	return WT_NONE;
660}
661
662static warc_string_t
663_warc_rduri(const char *buf, size_t bsz)
664{
665	static const char _key[] = "\r\nWARC-Target-URI:";
666	const char *val, *uri, *eol, *p;
667	warc_string_t res = {0U, NULL};
668
669	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
670		/* no bother */
671		return res;
672	}
673	/* overread whitespace */
674	val += sizeof(_key) - 1U;
675	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
676		/* no end of line */
677		return res;
678	}
679
680	while (val < eol && (*val == ' ' || *val == '\t'))
681		++val;
682
683	/* overread URL designators */
684	if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
685		/* not touching that! */
686		return res;
687	}
688
689	/* spaces inside uri are not allowed, CRLF should follow */
690	for (p = val; p < eol; p++) {
691		if (isspace((unsigned char)*p))
692			return res;
693	}
694
695	/* there must be at least space for ftp */
696	if (uri < (val + 3U))
697		return res;
698
699	/* move uri to point to after :// */
700	uri += 3U;
701
702	/* now then, inspect the URI */
703	if (memcmp(val, "file", 4U) == 0) {
704		/* perfect, nothing left to do here */
705
706	} else if (memcmp(val, "http", 4U) == 0 ||
707		   memcmp(val, "ftp", 3U) == 0) {
708		/* overread domain, and the first / */
709		while (uri < eol && *uri++ != '/');
710	} else {
711		/* not sure what to do? best to bugger off */
712		return res;
713	}
714	res.str = uri;
715	res.len = eol - uri;
716	return res;
717}
718
/*
 * Read the value of the Content-Length header within BUF.
 *
 * The value must start with a digit (after optional blanks and tabs)
 * and run right up to the end of the line.  Returns the length, or -1
 * if the header is missing, unterminated or malformed, or if strtol()
 * sets errno.
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nContent-Length:";
	const char *const end = buf + bsz;
	const char *p;
	const char *eol;
	char *stop = NULL;
	long int n;

	p = xmemmem(buf, bsz, _key, sizeof(_key) - 1U);
	if (p == NULL) {
		/* no such header */
		return -1;
	}
	p += sizeof(_key) - 1U;

	eol = _warc_find_eol(p, end - p);
	if (eol == NULL) {
		/* no end of line */
		return -1;
	}

	/* skip leading whitespace */
	for (; p < eol; p++) {
		if (*p != ' ' && *p != '\t')
			break;
	}
	/* there must be at least one digit */
	if (!isdigit((unsigned char)*p)) {
		return -1;
	}

	errno = 0;
	n = strtol(p, &stop, 10);
	if (errno != 0 || stop != eol) {
		/* line must end here */
		return -1;
	}
	return (size_t)n;
}
752
/*
 * Read the record time from the WARC-Date header within BUF.
 *
 * Returns the time stamp, or (time_t)-1 if the header is missing or
 * unterminated, or if the date does not parse as an ISO 8601 Zulu
 * string that ends exactly at the end of the line.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Date:";
	const char *const end = buf + bsz;
	const char *p;
	const char *eol;
	char *stop = NULL;
	time_t stamp;

	p = xmemmem(buf, bsz, _key, sizeof(_key) - 1U);
	if (p == NULL) {
		/* no such header */
		return (time_t)-1;
	}
	p += sizeof(_key) - 1U;

	eol = _warc_find_eol(p, end - p);
	if (eol == NULL) {
		/* no end of line */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	stamp = xstrpisotime(p, &stop);
	/* line must end right behind the time stamp */
	return (stop == eol) ? stamp : (time_t)-1;
}
779
/*
 * Read the modification time from the Last-Modified header within BUF.
 *
 * Returns the time stamp, or (time_t)-1 if the header is missing or
 * unterminated, or if the date does not parse as an ISO 8601 Zulu
 * string that ends exactly at the end of the line.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nLast-Modified:";
	const char *const end = buf + bsz;
	const char *p;
	const char *eol;
	char *stop = NULL;
	time_t stamp;

	p = xmemmem(buf, bsz, _key, sizeof(_key) - 1U);
	if (p == NULL) {
		/* no such header */
		return (time_t)-1;
	}
	p += sizeof(_key) - 1U;

	eol = _warc_find_eol(p, end - p);
	if (eol == NULL) {
		/* no end of line */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	stamp = xstrpisotime(p, &stop);
	/* line must end right behind the time stamp */
	return (stop == eol) ? stamp : (time_t)-1;
}
806
/*
 * Find the end of the record header within BUF, i.e. the first empty
 * line.  Returns a pointer just behind the terminating \r\n\r\n, or
 * NULL if BUF contains none.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n\r\n";
	const size_t marker_len = sizeof(_marker) - 1U;
	const char *at = xmemmem(buf, bsz, _marker, marker_len);

	/* the header includes its terminator, so step over it */
	return (at == NULL) ? NULL : at + marker_len;
}
818
/*
 * Find the end of the line that starts at BUF.  Returns a pointer to
 * the first \r\n within BUF (to the \r, not behind the pair), or NULL
 * if BUF contains none.
 */
static const char*
_warc_find_eol(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n";

	return xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
}
827/* archive_read_support_format_warc.c ends here */
828