archive_read_support_format_warc.c revision 315433
11590Srgrimes/*-
21590Srgrimes * Copyright (c) 2014 Sebastian Freundt
31590Srgrimes * All rights reserved.
41590Srgrimes *
51590Srgrimes * Redistribution and use in source and binary forms, with or without
61590Srgrimes * modification, are permitted provided that the following conditions
71590Srgrimes * are met:
81590Srgrimes * 1. Redistributions of source code must retain the above copyright
91590Srgrimes *    notice, this list of conditions and the following disclaimer.
101590Srgrimes * 2. Redistributions in binary form must reproduce the above copyright
111590Srgrimes *    notice, this list of conditions and the following disclaimer in the
121590Srgrimes *    documentation and/or other materials provided with the distribution.
131590Srgrimes *
141590Srgrimes * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
151590Srgrimes * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
161590Srgrimes * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
171590Srgrimes * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
181590Srgrimes * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
191590Srgrimes * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
201590Srgrimes * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
211590Srgrimes * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
221590Srgrimes * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
231590Srgrimes * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
241590Srgrimes */
251590Srgrimes
261590Srgrimes#include "archive_platform.h"
271590Srgrimes__FBSDID("$FreeBSD: stable/10/contrib/libarchive/libarchive/archive_read_support_format_warc.c 315433 2017-03-16 23:08:18Z mm $");
281590Srgrimes
291590Srgrimes/**
301590Srgrimes * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
311590Srgrimes * ISO 28500:2009.
321590Srgrimes * For the purposes of this file we used the final draft from:
331590Srgrimes * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
34194792Sdelphij *
351590Srgrimes * Todo:
361590Srgrimes * [ ] real-world warcs can contain resources at endpoints ending in /
371590Srgrimes *     e.g. http://bibnum.bnf.fr/warc/
381590Srgrimes *     if you're lucky their response contains a Content-Location: header
391590Srgrimes *     pointing to a unix-compliant filename, in the example above it's
4032649Sbde *     Content-Location: http://bibnum.bnf.fr/warc/index.html
4132649Sbde *     however, that's not mandated and github for example doesn't follow
4232649Sbde *     this convention.
433819Swollman *     We need a set of archive options to control what to do with
4450477Speter *     entries like these, at the moment care is taken to skip them.
451590Srgrimes *
461590Srgrimes **/
471590Srgrimes
48194190Sed#ifdef HAVE_SYS_STAT_H
491590Srgrimes#include <sys/stat.h>
5011936Sphk#endif
513819Swollman#ifdef HAVE_ERRNO_H
529336Sdfr#include <errno.h>
5383653Speter#endif
5483653Speter#ifdef HAVE_STDLIB_H
55192762Srmacklem#include <stdlib.h>
56192762Srmacklem#endif
57192762Srmacklem#ifdef HAVE_STRING_H
58192762Srmacklem#include <string.h>
591590Srgrimes#endif
601590Srgrimes#ifdef HAVE_LIMITS_H
61200462Sdelphij#include <limits.h>
621590Srgrimes#endif
631590Srgrimes#ifdef HAVE_CTYPE_H
6477207Stmm#include <ctype.h>
651590Srgrimes#endif
661590Srgrimes#ifdef HAVE_TIME_H
671590Srgrimes#include <time.h>
681590Srgrimes#endif
691590Srgrimes
70200462Sdelphij#include "archive.h"
713819Swollman#include "archive_entry.h"
721590Srgrimes#include "archive_private.h"
731590Srgrimes#include "archive_read_private.h"
741590Srgrimes
/*
 * Record kinds a WARC header may announce in its WARC-Type field.
 * The type parser in this file only ever yields WT_NONE, WT_RSRC and
 * WT_RSP; the remaining enumerators name record kinds that are not
 * extracted.
 */
typedef enum {
	WT_NONE = 0,	/* no (recognised) type */
	WT_INFO,	/* warcinfo */
	WT_META,	/* metadata */
	WT_RSRC,	/* resource */
	WT_REQ,		/* request, unsupported */
	WT_RSP,		/* response */
	WT_RVIS,	/* revisit, unsupported */
	WT_CONV,	/* conversion, unsupported */
	WT_CONT,	/* continuation, unsupported at the moment */
	LAST_WT		/* invalid type */
} warc_type_t;
96192762Srmacklem
/*
 * Length-counted, read-only string slice.  The bytes at STR are not
 * required to be NUL-terminated; LEN is authoritative.
 */
typedef struct {
	size_t len;		/* number of bytes in the slice */
	const char *str;	/* first byte, NULL for the empty slice */
} warc_string_t;
101131990Sstefanf
/*
 * Writable counterpart of warc_string_t: a heap buffer together with its
 * allocated size.
 */
typedef struct {
	size_t len;	/* bytes allocated at STR */
	char *str;	/* buffer, NULL while nothing is allocated */
} warc_strbuf_t;
10652493Sdillon
1071590Srgrimesstruct warc_s {
1081590Srgrimes	/* content length ahead */
10977207Stmm	size_t cntlen;
110243783Srmacklem	/* and how much we've processed so far */
111243783Srmacklem	size_t cntoff;
112243783Srmacklem	/* and how much we need to consume between calls */
113243783Srmacklem	size_t unconsumed;
1141590Srgrimes
1151590Srgrimes	/* string pool */
1161590Srgrimes	warc_strbuf_t pool;
117243783Srmacklem	/* previous version */
1181590Srgrimes	unsigned int pver;
1191590Srgrimes	/* stringified format name */
1201590Srgrimes	struct archive_string sver;
1211590Srgrimes};
122243783Srmacklem
123243783Srmacklemstatic int _warc_bid(struct archive_read *a, int);
124243783Srmacklemstatic int _warc_cleanup(struct archive_read *a);
125243783Srmacklemstatic int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
126243783Srmacklemstatic int _warc_skip(struct archive_read *a);
127243783Srmacklemstatic int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
128243783Srmacklem
129243783Srmacklem/* private routines */
130243783Srmacklemstatic unsigned int _warc_rdver(const char buf[10], size_t bsz);
131243783Srmacklemstatic unsigned int _warc_rdtyp(const char *buf, size_t bsz);
132243783Srmacklemstatic warc_string_t _warc_rduri(const char *buf, size_t bsz);
133243783Srmacklemstatic ssize_t _warc_rdlen(const char *buf, size_t bsz);
134243783Srmacklemstatic time_t _warc_rdrtm(const char *buf, size_t bsz);
135243783Srmacklemstatic time_t _warc_rdmtm(const char *buf, size_t bsz);
136251585Srmacklemstatic const char *_warc_find_eoh(const char *buf, size_t bsz);
137251585Srmacklemstatic const char *_warc_find_eol(const char *buf, size_t bsz);
138251585Srmacklem
139243783Srmacklemint
140243783Srmacklemarchive_read_support_format_warc(struct archive *_a)
141243783Srmacklem{
142243783Srmacklem	struct archive_read *a = (struct archive_read *)_a;
1431590Srgrimes	struct warc_s *w;
1441590Srgrimes	int r;
1451590Srgrimes
14652493Sdillon	archive_check_magic(_a, ARCHIVE_READ_MAGIC,
14752493Sdillon	    ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
14852493Sdillon
1491590Srgrimes	if ((w = calloc(1, sizeof(*w))) == NULL) {
1501590Srgrimes		archive_set_error(&a->archive, ENOMEM,
1511590Srgrimes		    "Can't allocate warc data");
15252493Sdillon		return (ARCHIVE_FATAL);
15352493Sdillon	}
15452493Sdillon
15552493Sdillon	r = __archive_read_register_format(
15652493Sdillon		a, w, "warc",
15752493Sdillon		_warc_bid, NULL, _warc_rdhdr, _warc_read,
15852493Sdillon		_warc_skip, NULL, _warc_cleanup, NULL, NULL);
15952493Sdillon
16052493Sdillon	if (r != ARCHIVE_OK) {
16152493Sdillon		free(w);
162172759Sjhb		return (r);
163172759Sjhb	}
164172759Sjhb	return (ARCHIVE_OK);
165221455Srmacklem}
166221455Srmacklem
167221455Srmacklemstatic int
168221455Srmacklem_warc_cleanup(struct archive_read *a)
169221455Srmacklem{
170193258Srmacklem	struct warc_s *w = a->format->data;
171221455Srmacklem
172221455Srmacklem	if (w->pool.len > 0U) {
173221455Srmacklem		free(w->pool.str);
174192762Srmacklem	}
1751590Srgrimes	archive_string_free(&w->sver);
1761590Srgrimes	free(w);
1771590Srgrimes	a->format->data = NULL;
1781590Srgrimes	return (ARCHIVE_OK);
1791590Srgrimes}
1801590Srgrimes
/*
 * Bid on the input: peek at the first 12 bytes and check that they start
 * a record of a supported WARC version (0.12 up to 1.0).
 * Returns 64 when they do and -1 otherwise.
 */
static int
_warc_bid(struct archive_read *a, int best_bid)
{
	const char *probe;
	ssize_t avail;
	unsigned int version;

	(void)best_bid; /* UNUSED */

	/* the first line of the file should already be a record;
	 * our magic cookie is at least 12 bytes long */
	probe = __archive_read_ahead(a, 12U, &avail);
	if (probe == NULL || avail < 12)
		return -1;

	/* snarf the record's version number */
	version = _warc_rdver(probe, avail);
	if (version >= 1200U && version <= 10000U) {
		/* WARC 0.12 to 1.0, be confident */
		return (64);
	}
	return -1;
}
209192762Srmacklem
210192762Srmacklemstatic int
211192762Srmacklem_warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
212192762Srmacklem{
213192762Srmacklem#define HDR_PROBE_LEN		(12U)
214221455Srmacklem	struct warc_s *w = a->format->data;
215192762Srmacklem	unsigned int ver;
216192762Srmacklem	const char *buf;
217192762Srmacklem	ssize_t nrd;
218192762Srmacklem	const char *eoh;
2191590Srgrimes	/* for the file name, saves some strndup()'ing */
2201590Srgrimes	warc_string_t fnam;
2211590Srgrimes	/* warc record type, not that we really use it a lot */
2221590Srgrimes	warc_type_t ftyp;
2233819Swollman	/* content-length+error monad */
2243819Swollman	ssize_t cntlen;
2253819Swollman	/* record time is the WARC-Date time we reinterpret it as ctime */
226194792Sdelphij	time_t rtime;
227172759Sjhb	/* mtime is the Last-Modified time which will be the entry's mtime */
2283819Swollman	time_t mtime;
229172759Sjhb
230172759Sjhbstart_over:
231172759Sjhb	/* just use read_ahead() they keep track of unconsumed
232172759Sjhb	 * bits and bobs for us; no need to put an extra shift in
23383653Speter	 * and reproduce that functionality here */
23483653Speter	buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
23583653Speter
236172759Sjhb	if (nrd < 0) {
237172759Sjhb		/* no good */
23883653Speter		archive_set_error(
2393819Swollman			&a->archive, ARCHIVE_ERRNO_MISC,
240172759Sjhb			"Bad record header");
241172759Sjhb		return (ARCHIVE_FATAL);
242172759Sjhb	} else if (buf == NULL) {
24383653Speter		/* there should be room for at least WARC/bla\r\n
24483653Speter		 * must be EOF therefore */
2453819Swollman		return (ARCHIVE_EOF);
246172759Sjhb	}
247172759Sjhb 	/* looks good so far, try and find the end of the header now */
24883653Speter	eoh = _warc_find_eoh(buf, nrd);
249221973Srmacklem	if (eoh == NULL) {
250172759Sjhb		/* still no good, the header end might be beyond the
251172759Sjhb		 * probe we've requested, but then again who'd cram
252221973Srmacklem		 * so much stuff into the header *and* be 28500-compliant */
25383653Speter		archive_set_error(
2543819Swollman			&a->archive, ARCHIVE_ERRNO_MISC,
25583653Speter			"Bad record header");
256172759Sjhb		return (ARCHIVE_FATAL);
257172759Sjhb	}
258172759Sjhb	ver = _warc_rdver(buf, eoh - buf);
259172759Sjhb	/* we currently support WARC 0.12 to 1.0 */
260172759Sjhb	if (ver == 0U) {
26183653Speter		archive_set_error(
26283653Speter			&a->archive, ARCHIVE_ERRNO_MISC,
2633819Swollman			"Invalid record version");
2643819Swollman		return (ARCHIVE_FATAL);
2653819Swollman	} else if (ver < 1200U || ver > 10000U) {
2663819Swollman		archive_set_error(
2671590Srgrimes			&a->archive, ARCHIVE_ERRNO_MISC,
2681590Srgrimes			"Unsupported record version: %u.%u",
2691590Srgrimes			ver / 10000, (ver % 10000) / 100);
27052493Sdillon		return (ARCHIVE_FATAL);
2711590Srgrimes	}
27283653Speter	cntlen = _warc_rdlen(buf, eoh - buf);
27383653Speter	if (cntlen < 0) {
274221455Srmacklem		/* nightmare!  the specs say content-length is mandatory
2751590Srgrimes		 * so I don't feel overly bad stopping the reader here */
276221455Srmacklem		archive_set_error(
277221455Srmacklem			&a->archive, EINVAL,
278221455Srmacklem			"Bad content length");
279221455Srmacklem		return (ARCHIVE_FATAL);
280221455Srmacklem	}
281221455Srmacklem	rtime = _warc_rdrtm(buf, eoh - buf);
282221455Srmacklem	if (rtime == (time_t)-1) {
283221455Srmacklem		/* record time is mandatory as per WARC/1.0,
284221455Srmacklem		 * so just barf here, fast and loud */
285221455Srmacklem		archive_set_error(
286221455Srmacklem			&a->archive, EINVAL,
287221455Srmacklem			"Bad record time");
288221455Srmacklem		return (ARCHIVE_FATAL);
289221455Srmacklem	}
290221455Srmacklem
291221455Srmacklem	/* let the world know we're a WARC archive */
292221455Srmacklem	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
293221455Srmacklem	if (ver != w->pver) {
294221455Srmacklem		/* stringify this entry's version */
295221455Srmacklem		archive_string_sprintf(&w->sver,
296221455Srmacklem			"WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
297221455Srmacklem		/* remember the version */
298221455Srmacklem		w->pver = ver;
299221455Srmacklem	}
300221455Srmacklem	/* start off with the type */
301221455Srmacklem	ftyp = _warc_rdtyp(buf, eoh - buf);
302221455Srmacklem	/* and let future calls know about the content */
303221455Srmacklem	w->cntlen = cntlen;
304221455Srmacklem	w->cntoff = 0U;
305221455Srmacklem	mtime = 0;/* Avoid compiling error on some platform. */
30683653Speter
30752493Sdillon	switch (ftyp) {
30852493Sdillon	case WT_RSRC:
30952493Sdillon	case WT_RSP:
31052493Sdillon		/* only try and read the filename in the cases that are
31152493Sdillon		 * guaranteed to have one */
31252493Sdillon		fnam = _warc_rduri(buf, eoh - buf);
313221455Srmacklem		/* check the last character in the URI to avoid creating
314221455Srmacklem		 * directory endpoints as files, see Todo above */
315221455Srmacklem		if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
316221455Srmacklem			/* break here for now */
317221455Srmacklem			fnam.len = 0U;
318221455Srmacklem			fnam.str = NULL;
319221455Srmacklem			break;
320221455Srmacklem		}
321221455Srmacklem		/* bang to our string pool, so we save a
322221455Srmacklem		 * malloc()+free() roundtrip */
323221455Srmacklem		if (fnam.len + 1U > w->pool.len) {
324221455Srmacklem			w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
325221455Srmacklem			w->pool.str = realloc(w->pool.str, w->pool.len);
326221455Srmacklem		}
327221455Srmacklem		memcpy(w->pool.str, fnam.str, fnam.len);
328221455Srmacklem		w->pool.str[fnam.len] = '\0';
329221455Srmacklem		/* let no one else know about the pool, it's a secret, shhh */
330221455Srmacklem		fnam.str = w->pool.str;
331221455Srmacklem
332221455Srmacklem		/* snarf mtime or deduce from rtime
33352493Sdillon		 * this is a custom header added by our writer, it's quite
33452493Sdillon		 * hard to believe anyone else would go through with it
33552493Sdillon		 * (apart from being part of some http responses of course) */
336221455Srmacklem		if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
337221455Srmacklem			mtime = rtime;
338221455Srmacklem		}
339221455Srmacklem		break;
340221455Srmacklem	default:
341221455Srmacklem		fnam.len = 0U;
342221455Srmacklem		fnam.str = NULL;
343221455Srmacklem		break;
344221455Srmacklem	}
345221455Srmacklem
346221455Srmacklem	/* now eat some of those delicious buffer bits */
347221455Srmacklem	__archive_read_consume(a, eoh - buf);
348221455Srmacklem
349221455Srmacklem	switch (ftyp) {
350221455Srmacklem	case WT_RSRC:
351221455Srmacklem	case WT_RSP:
352221455Srmacklem		if (fnam.len > 0U) {
353221455Srmacklem			/* populate entry object */
354221455Srmacklem			archive_entry_set_filetype(entry, AE_IFREG);
355221455Srmacklem			archive_entry_copy_pathname(entry, fnam.str);
35683653Speter			archive_entry_set_size(entry, cntlen);
35783653Speter			archive_entry_set_perm(entry, 0644);
358221455Srmacklem			/* rtime is the new ctime, mtime stays mtime */
359221455Srmacklem			archive_entry_set_ctime(entry, rtime, 0L);
360221455Srmacklem			archive_entry_set_mtime(entry, mtime, 0L);
361221455Srmacklem			break;
362221455Srmacklem		}
363221455Srmacklem		/* FALLTHROUGH */
364221455Srmacklem	default:
365221455Srmacklem		/* consume the content and start over */
366221455Srmacklem		_warc_skip(a);
367221455Srmacklem		goto start_over;
368221455Srmacklem	}
369221455Srmacklem	return (ARCHIVE_OK);
370221455Srmacklem}
371221455Srmacklem
37252493Sdillonstatic int
37352493Sdillon_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
37452493Sdillon{
37552493Sdillon	struct warc_s *w = a->format->data;
376221455Srmacklem	const char *rab;
377221455Srmacklem	ssize_t nrd;
378221455Srmacklem
379221455Srmacklem	if (w->cntoff >= w->cntlen) {
380221455Srmacklem	eof:
381221455Srmacklem		/* it's our lucky day, no work, we can leave early */
382221455Srmacklem		*buf = NULL;
383221455Srmacklem		*bsz = 0U;
384221455Srmacklem		*off = w->cntoff + 4U/*for \r\n\r\n separator*/;
385221455Srmacklem		w->unconsumed = 0U;
386221455Srmacklem		return (ARCHIVE_EOF);
387221455Srmacklem	}
388221455Srmacklem
389221455Srmacklem	rab = __archive_read_ahead(a, 1U, &nrd);
39052493Sdillon	if (nrd < 0) {
39152493Sdillon		*bsz = 0U;
39252493Sdillon		/* big catastrophe */
39352493Sdillon		return (int)nrd;
39452493Sdillon	} else if (nrd == 0) {
395221455Srmacklem		goto eof;
396221455Srmacklem	} else if ((size_t)nrd > w->cntlen - w->cntoff) {
397221455Srmacklem		/* clamp to content-length */
398221455Srmacklem		nrd = w->cntlen - w->cntoff;
399221455Srmacklem	}
400221455Srmacklem	*off = w->cntoff;
401221455Srmacklem	*bsz = nrd;
402221455Srmacklem	*buf = rab;
403221455Srmacklem
404221455Srmacklem	w->cntoff += nrd;
405221455Srmacklem	w->unconsumed = (size_t)nrd;
406221455Srmacklem	return (ARCHIVE_OK);
407221455Srmacklem}
408221455Srmacklem
409221455Srmacklemstatic int
410221455Srmacklem_warc_skip(struct archive_read *a)
411221455Srmacklem{
412221455Srmacklem	struct warc_s *w = a->format->data;
413221455Srmacklem
414221455Srmacklem	__archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
415221455Srmacklem	w->cntlen = 0U;
416221455Srmacklem	w->cntoff = 0U;
417221455Srmacklem	return (ARCHIVE_OK);
418221455Srmacklem}
419221455Srmacklem
42052493Sdillon
42152493Sdillon/* private routines */
/*
 * Strip the const qualifier from C without a cast (which would trip
 * cast-qual warnings) and without pointer arithmetic on made-up
 * addresses: both union members are object pointers of identical
 * representation, so reading the other member yields the same address.
 */
static void*
deconst(const void *c)
{
	union {
		const void *in;
		void *out;
	} u;

	u.in = c;
	return u.out;
}

/*
 * Find the first occurrence of NEEDLE (NEEDLESIZE bytes) within HAY
 * (HAYSIZE bytes); neither has to be NUL-terminated.
 *
 * Returns a pointer to the match inside HAY, HAY itself for an empty
 * needle, or NULL when there is no match.
 *
 * The search keeps a running XOR over a needle-sized window of the
 * haystack and only falls back to memcmp() when that XOR equals the XOR
 * of the needle.
 */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const eoh = hay + haysize;
	const char *const eon = needle + needlesize;
	const char *hp;
	const char *np;
	const char *cand;
	unsigned int hsum;
	unsigned int nsum;
	unsigned int eqp;

	/* trivial checks first
	 * a 0-sized needle is defined to be found anywhere in haystack
	 * then run memchr() to find a candidate in HAYSTACK (i.e. a portion
	 * that happens to begin with *NEEDLE) */
	if (needlesize == 0UL) {
		return deconst(hay);
	} else if ((hay = memchr(hay, *needle, haysize)) == NULL) {
		/* trivial */
		return NULL;
	}

	/* First characters of haystack and needle are the same now.  Both
	 * are guaranteed to be at least one character long.  Fold the rest
	 * of the needle and the equally long haystack prefix into their XOR
	 * sums and note on the way whether they are equal byte by byte. */
	hsum = *hay;
	nsum = *hay;
	eqp = 1U;
	for (hp = hay + 1U, np = needle + 1U; hp < eoh && np < eon; hp++, np++) {
		hsum ^= *hp;
		nsum ^= *np;
		eqp &= (*hp == *np);
	}

	/* HP now references the (NEEDLESIZE + 1)-th character. */
	if (np < eon) {
		/* haystack is smaller than needle, :O */
		return NULL;
	} else if (eqp) {
		/* found a match */
		return deconst(hay);
	}

	/* now loop through the rest of haystack, sliding the window by one
	 * byte per round: XOR the leaving byte out and the entering byte in */
	for (cand = hay; hp < eoh; hp++) {
		hsum ^= *cand++;
		hsum ^= *hp;

		/* With the XOR sums equal, NEEDLESIZE - 1 matching bytes
		 * imply that the last byte matches too,
		 * also CAND is by design < HP, so no need for range checks */
		if (hsum == nsum && memcmp(cand, needle, needlesize - 1U) == 0) {
			return deconst(cand);
		}
	}
	return NULL;
}
485221455Srmacklem
/*
 * Parse an unsigned decimal number from STR that is expected to lie
 * within [LLIM, ULIM].
 *
 * Digits are accepted as long as appending one more could still stay
 * below ULIM and the digit budget (one digit per decimal place of ULIM,
 * at least two) is not used up.  *EP is set to the first character that
 * was not accepted.
 *
 * Returns the parsed value, -1 when STR does not start with a digit and
 * -2 when the value falls outside the limits.
 */
static int
strtoi_lim(const char *str, const char **ep, int llim, int ulim)
{
	const char *cur = str;
	int val = 0;
	/* dividing by ten per accepted digit counts the digits down */
	int budget = (ulim > 10) ? ulim : 10;

	while (val * 10 <= ulim && budget != 0 &&
	    *cur >= '0' && *cur <= '9') {
		val = val * 10 + (*cur - '0');
		cur++;
		budget /= 10;
	}

	*ep = cur;
	if (cur == str)
		return -1;
	if (val < llim || val > ulim)
		return -2;
	return val;
}
508221455Srmacklem
/*
 * Convert the broken-down UTC time T into seconds since the epoch.
 *
 * Uses timegm() or _mkgmtime64() when the build configuration announces
 * them.  Otherwise mktime() is called only to have tm_yday filled in and
 * the result is computed by hand; that computation is carried out in
 * time_t because (tm_year - 70) * 31536000 no longer fits an int for
 * years after 2038.
 *
 * Returns (time_t)-1 when the fallback's mktime() call fails.
 */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE_TIMEGM
	/* Use platform timegm() if available. */
	return (timegm(t));
#elif HAVE__MKGMTIME64
	return (_mkgmtime64(t));
#else
	/* Else use direct calculation using POSIX assumptions. */
	/* First, fix up tm_yday based on the year/month/day. */
	if (mktime(t) == (time_t)-1)
		return ((time_t)-1);
	/* Then we can compute timegm() from first principles. */
	return ((time_t)t->tm_sec
	    + (time_t)t->tm_min * 60
	    + (time_t)t->tm_hour * 3600
	    + (time_t)t->tm_yday * 86400
	    + (time_t)(t->tm_year - 70) * 31536000
	    + (time_t)((t->tm_year - 69) / 4) * 86400
	    - (time_t)((t->tm_year - 1) / 100) * 86400
	    + (time_t)((t->tm_year + 299) / 400) * 86400);
#endif
}
533221455Srmacklem
/*
 * Like strptime() but strictly for ISO 8601 Zulu strings of the shape
 * YYYY-MM-DDThh:mm:ssZ, optionally preceded by blanks or tabs.
 *
 * Returns the time stamp, or (time_t)-1 when S does not parse.  When
 * ENDPTR is not NULL it receives the position parsing stopped at: just
 * behind the 'Z' on success, and on failure either the character a
 * number could not be read from or the position one past a wrong
 * separator.
 */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	struct tm tm;
	time_t res = (time_t)-1;

	/* make sure tm is clean */
	memset(&tm, 0, sizeof(tm));

	/* as a courtesy to our callers, and since this is a non-standard
	 * routine, we skip leading whitespace */
	for (; *s == ' ' || *s == '\t'; s++)
		;

	/* every field is a range-checked number followed by its separator;
	 * the separator test steps over the character it inspects */
	tm.tm_year = strtoi_lim(s, &s, 1583, 4095);
	if (tm.tm_year < 0 || *s++ != '-')
		goto out;
	tm.tm_mon = strtoi_lim(s, &s, 1, 12);
	if (tm.tm_mon < 0 || *s++ != '-')
		goto out;
	tm.tm_mday = strtoi_lim(s, &s, 1, 31);
	if (tm.tm_mday < 0 || *s++ != 'T')
		goto out;
	tm.tm_hour = strtoi_lim(s, &s, 0, 23);
	if (tm.tm_hour < 0 || *s++ != ':')
		goto out;
	tm.tm_min = strtoi_lim(s, &s, 0, 59);
	if (tm.tm_min < 0 || *s++ != ':')
		goto out;
	/* 60 admits a leap second */
	tm.tm_sec = strtoi_lim(s, &s, 0, 60);
	if (tm.tm_sec < 0 || *s++ != 'Z')
		goto out;

	/* massage TM to fulfill some of POSIX' constraints */
	tm.tm_year -= 1900;
	tm.tm_mon--;

	/* now convert our custom tm struct to a unix stamp using UTC */
	res = time_from_tm(&tm);

out:
	if (endptr != NULL)
		*endptr = deconst(s);
	return res;
}
58783653Speter
/*
 * Read the version from a "WARC/<major>.<minor>" record line.
 *
 * The major version is one digit, the minor version one or two digits.
 * The result is major * 10000 + minor * 100, where a two-digit minor
 * counts as is and a single digit D counts as D (so 1.0 gives 10000 and
 * 0.12 gives 1200).  Versions from 0.12 on must be followed by CRLF,
 * older ones by a blank or tab.
 *
 * Returns 0 when BSZ is below 12 or the line does not look like that.
 */
static unsigned int
_warc_rdver(const char *buf, size_t bsz)
{
	static const char magic[] = "WARC/";
	const size_t mlen = sizeof(magic) - 1U;
	const char *num;
	const char *term;
	unsigned int ver;

	if (bsz < 12 || memcmp(buf, magic, mlen) != 0) {
		/* buffer too small or invalid magic */
		return 0U;
	}
	/* looks good so far, read the version number for a laugh */
	num = buf + mlen;

	if (!isdigit((unsigned char)num[0U]) || num[1U] != '.' ||
	    !isdigit((unsigned char)num[2U])) {
		return 0U;
	}

	/* set up major version */
	ver = (num[0U] - '0') * 10000U;
	/* set up minor version, we support a maximum of 2 digits there */
	if (isdigit((unsigned char)num[3U])) {
		ver += (num[2U] - '0') * 1000U;
		ver += (num[3U] - '0') * 100U;
		term = num + 4U;
	} else {
		ver += (num[2U] - '0') * 100U;
		term = num + 3U;
	}

	/*
	 * WARC below version 0.12 has a space-separated header
	 * WARC 0.12 and above terminates the version with a CRLF
	 */
	if (ver >= 1200U) {
		if (memcmp(term, "\r\n", 2U) != 0)
			ver = 0U;
	} else {
		if (*term != ' ' && *term != '\t')
			ver = 0U;
	}
	return ver;
}
63152493Sdillon
63252493Sdillonstatic unsigned int
63352493Sdillon_warc_rdtyp(const char *buf, size_t bsz)
63483653Speter{
63552493Sdillon	static const char _key[] = "\r\nWARC-Type:";
63652493Sdillon	const char *val, *eol;
63752493Sdillon
63852493Sdillon	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
63983653Speter		/* no bother */
64083653Speter		return WT_NONE;
64183653Speter	}
64283653Speter	val += sizeof(_key) - 1U;
64383653Speter	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
64483653Speter		/* no end of line */
64583653Speter		return WT_NONE;
64683653Speter	}
64783653Speter
64852493Sdillon	/* overread whitespace */
64983653Speter	while (val < eol && (*val == ' ' || *val == '\t'))
65052493Sdillon		++val;
6511590Srgrimes
65252493Sdillon	if (val + 8U == eol) {
6531590Srgrimes		if (memcmp(val, "resource", 8U) == 0)
6541590Srgrimes			return WT_RSRC;
6551590Srgrimes		else if (memcmp(val, "response", 8U) == 0)
6561590Srgrimes			return WT_RSP;
6571590Srgrimes	}
65852493Sdillon	return WT_NONE;
6591590Srgrimes}
66052493Sdillon
66152493Sdillonstatic warc_string_t
66252493Sdillon_warc_rduri(const char *buf, size_t bsz)
66352493Sdillon{
66452493Sdillon	static const char _key[] = "\r\nWARC-Target-URI:";
66552493Sdillon	const char *val, *uri, *eol, *p;
66652493Sdillon	warc_string_t res = {0U, NULL};
66752493Sdillon
6681590Srgrimes	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
6691590Srgrimes		/* no bother */
6701590Srgrimes		return res;
6711590Srgrimes	}
672172759Sjhb	/* overread whitespace */
6731590Srgrimes	val += sizeof(_key) - 1U;
6741590Srgrimes	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
675243783Srmacklem		/* no end of line */
6761590Srgrimes		return res;
6771590Srgrimes	}
67852493Sdillon
67952493Sdillon	while (val < eol && (*val == ' ' || *val == '\t'))
68052493Sdillon		++val;
68152493Sdillon
68252493Sdillon	/* overread URL designators */
68352493Sdillon	if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
68452493Sdillon		/* not touching that! */
68552493Sdillon		return res;
68652493Sdillon	}
68752493Sdillon
68852493Sdillon	/* spaces inside uri are not allowed, CRLF should follow */
68952493Sdillon	for (p = val; p < eol; p++) {
69052493Sdillon		if (isspace((unsigned char)*p))
69152493Sdillon			return res;
69252493Sdillon	}
69352493Sdillon
69452493Sdillon	/* there must be at least space for ftp */
69552493Sdillon	if (uri < (val + 3U))
69652493Sdillon		return res;
69752493Sdillon
69852493Sdillon	/* move uri to point to after :// */
69952493Sdillon	uri += 3U;
70052493Sdillon
70152493Sdillon	/* now then, inspect the URI */
70252493Sdillon	if (memcmp(val, "file", 4U) == 0) {
70352493Sdillon		/* perfect, nothing left to do here */
70452493Sdillon
70552493Sdillon	} else if (memcmp(val, "http", 4U) == 0 ||
70652493Sdillon		   memcmp(val, "ftp", 3U) == 0) {
70752493Sdillon		/* overread domain, and the first / */
70852493Sdillon		while (uri < eol && *uri++ != '/');
70952493Sdillon	} else {
71052493Sdillon		/* not sure what to do? best to bugger off */
71152493Sdillon		return res;
712192762Srmacklem	}
713192762Srmacklem	res.str = uri;
714192762Srmacklem	res.len = eol - uri;
715192762Srmacklem	return res;
716192762Srmacklem}
717192762Srmacklem
/*
 * Read the Content-Length field from the header BUF of BSZ bytes.
 *
 * The value has to be a run of decimal digits, optionally preceded by
 * blanks or tabs, that extends right up to the end of the line.
 *
 * Returns the length, or -1 when the field is missing, unterminated,
 * malformed or too large to be represented in a long.
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nContent-Length:";
	const char *val, *eol;
	char *on = NULL;
	long int len;

	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
		/* no bother */
		return -1;
	}
	val += sizeof(_key) - 1U;
	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
		/* no end of line */
		return -1;
	}

	/* skip leading whitespace */
	while (val < eol && (*val == ' ' || *val == '\t'))
		val++;
	/* there must be at least one digit; this also keeps signs and
	 * the whitespace strtol() would skip on its own out of the value */
	if (!isdigit((unsigned char)*val))
		return -1;
	/* strtol() reports overflow only through errno, its return value
	 * is a perfectly plausible LONG_MAX in that case */
	errno = 0;
	len = strtol(val, &on, 10);
	if (errno != 0 || on != eol) {
		/* out of range, or the line does not end here */
		return -1;
	}

	return (ssize_t)len;
}
750192762Srmacklem
/*
 * Read the record time from the WARC-Date field of the header BUF.
 *
 * Returns the time stamp, or (time_t)-1 when the field is missing,
 * unterminated, or not an ISO 8601 Zulu time spanning the whole value.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Date:";
	const size_t klen = sizeof(_key) - 1U;
	const char *val, *eol;
	char *on = NULL;
	time_t res;

	val = xmemmem(buf, bsz, _key, klen);
	if (val == NULL) {
		/* no bother */
		return (time_t)-1;
	}
	val += klen;
	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that;
	 * the line must end where the time stamp does */
	res = xstrpisotime(val, &on);
	return (on == eol) ? res : (time_t)-1;
}
777192762Srmacklem
/*
 * Read the modification time from the Last-Modified field of the header
 * BUF.
 *
 * Returns the time stamp, or (time_t)-1 when the field is missing,
 * unterminated, or not an ISO 8601 Zulu time spanning the whole value.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nLast-Modified:";
	const size_t klen = sizeof(_key) - 1U;
	const char *val, *eol;
	char *on = NULL;
	time_t res;

	val = xmemmem(buf, bsz, _key, klen);
	if (val == NULL) {
		/* no bother */
		return (time_t)-1;
	}
	val += klen;
	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return (time_t)-1;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that;
	 * the line must end where the time stamp does */
	res = xstrpisotime(val, &on);
	return (on == eol) ? res : (time_t)-1;
}
804192762Srmacklem
/*
 * Find the end of the record header in BUF, i.e. the first empty line.
 * Returns a pointer just behind the terminating \r\n\r\n, or NULL when
 * BUF does not contain one.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n\r\n";
	const size_t mlen = sizeof(_marker) - 1U;
	const char *hit = xmemmem(buf, bsz, _marker, mlen);

	if (hit == NULL)
		return NULL;
	return hit + mlen;
}
816192762Srmacklem
/*
 * Find the end of the current line in BUF.
 * Returns a pointer to the first \r\n (not behind it), or NULL when BUF
 * does not contain one.
 */
static const char*
_warc_find_eol(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n";

	return xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
}
825192762Srmacklem/* archive_read_support_format_warc.c ends here */
826221454Srmacklem