1/*-
2 * Copyright (c) 2014 Sebastian Freundt
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 *    notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 *    notice, this list of conditions and the following disclaimer in the
12 *    documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "archive_platform.h"
27
28/**
29 * WARC is standardised by ISO TC46/SC4/WG12 and currently available as
30 * ISO 28500:2009.
31 * For the purposes of this file we used the final draft from:
32 * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf
33 *
34 * Todo:
35 * [ ] real-world warcs can contain resources at endpoints ending in /
36 *     e.g. http://bibnum.bnf.fr/warc/
37 *     if you're lucky their response contains a Content-Location: header
38 *     pointing to a unix-compliant filename, in the example above it's
39 *     Content-Location: http://bibnum.bnf.fr/warc/index.html
40 *     however, that's not mandated and github for example doesn't follow
41 *     this convention.
42 *     We need a set of archive options to control what to do with
43 *     entries like these, at the moment care is taken to skip them.
44 *
45 **/
46
47#ifdef HAVE_SYS_STAT_H
48#include <sys/stat.h>
49#endif
50#ifdef HAVE_ERRNO_H
51#include <errno.h>
52#endif
53#ifdef HAVE_STDLIB_H
54#include <stdlib.h>
55#endif
56#ifdef HAVE_STRING_H
57#include <string.h>
58#endif
59#ifdef HAVE_LIMITS_H
60#include <limits.h>
61#endif
62#ifdef HAVE_CTYPE_H
63#include <ctype.h>
64#endif
65#ifdef HAVE_TIME_H
66#include <time.h>
67#endif
68
69#include "archive.h"
70#include "archive_entry.h"
71#include "archive_private.h"
72#include "archive_read_private.h"
73
/*
 * Record types as announced by a record's WARC-Type header.
 * The header reader only turns resource and response records into
 * entries, records of any other type are skipped.
 */
typedef enum {
	WT_NONE = 0,	/* no type found, or a type we don't recognise */
	WT_INFO,	/* warcinfo */
	WT_META,	/* metadata */
	WT_RSRC,	/* resource */
	WT_REQ,		/* request, unsupported */
	WT_RSP,		/* response */
	WT_RVIS,	/* revisit, unsupported */
	WT_CONV,	/* conversion, unsupported */
	WT_CONT,	/* continuation, unsupported at the moment */
	LAST_WT		/* invalid type, keep this one last */
} warc_type_t;
95
/* Read-only view of LEN bytes starting at STR; the bytes are not
 * required to be NUL-terminated. */
typedef struct {
	size_t len;		/* number of bytes in the view */
	const char *str;	/* first byte, NULL for the empty view */
} warc_string_t;
100
/* Writable counterpart of warc_string_t, used for the string pool. */
typedef struct {
	size_t len;	/* size of the buffer in bytes */
	char *str;	/* the buffer itself */
} warc_strbuf_t;
105
106struct warc_s {
107	/* content length ahead */
108	size_t cntlen;
109	/* and how much we've processed so far */
110	size_t cntoff;
111	/* and how much we need to consume between calls */
112	size_t unconsumed;
113
114	/* string pool */
115	warc_strbuf_t pool;
116	/* previous version */
117	unsigned int pver;
118	/* stringified format name */
119	struct archive_string sver;
120};
121
122static int _warc_bid(struct archive_read *a, int);
123static int _warc_cleanup(struct archive_read *a);
124static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*);
125static int _warc_skip(struct archive_read *a);
126static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e);
127
128/* private routines */
129static unsigned int _warc_rdver(const char *buf, size_t bsz);
130static unsigned int _warc_rdtyp(const char *buf, size_t bsz);
131static warc_string_t _warc_rduri(const char *buf, size_t bsz);
132static ssize_t _warc_rdlen(const char *buf, size_t bsz);
133static time_t _warc_rdrtm(const char *buf, size_t bsz);
134static time_t _warc_rdmtm(const char *buf, size_t bsz);
135static const char *_warc_find_eoh(const char *buf, size_t bsz);
136static const char *_warc_find_eol(const char *buf, size_t bsz);
137
138int
139archive_read_support_format_warc(struct archive *_a)
140{
141	struct archive_read *a = (struct archive_read *)_a;
142	struct warc_s *w;
143	int r;
144
145	archive_check_magic(_a, ARCHIVE_READ_MAGIC,
146	    ARCHIVE_STATE_NEW, "archive_read_support_format_warc");
147
148	if ((w = calloc(1, sizeof(*w))) == NULL) {
149		archive_set_error(&a->archive, ENOMEM,
150		    "Can't allocate warc data");
151		return (ARCHIVE_FATAL);
152	}
153
154	r = __archive_read_register_format(
155		a, w, "warc",
156		_warc_bid, NULL, _warc_rdhdr, _warc_read,
157		_warc_skip, NULL, _warc_cleanup, NULL, NULL);
158
159	if (r != ARCHIVE_OK) {
160		free(w);
161		return (r);
162	}
163	return (ARCHIVE_OK);
164}
165
166static int
167_warc_cleanup(struct archive_read *a)
168{
169	struct warc_s *w = a->format->data;
170
171	if (w->pool.len > 0U) {
172		free(w->pool.str);
173	}
174	archive_string_free(&w->sver);
175	free(w);
176	a->format->data = NULL;
177	return (ARCHIVE_OK);
178}
179
/*
 * Bid on the input: 64 if it starts with a version line of WARC 0.12
 * up to 1.0, -1 in every other case (including too little input).
 */
static int
_warc_bid(struct archive_read *a, int best_bid)
{
	const char *probe;
	ssize_t avail;
	unsigned int v;

	(void)best_bid; /* UNUSED */

	/* the very first line of the file should be a record's version
	 * line, and that magic cookie is at least 12 bytes long */
	probe = __archive_read_ahead(a, 12U, &avail);
	if (probe == NULL || avail < 12) {
		return -1;
	}

	/* only WARC 0.12 to 1.0 is supported */
	v = _warc_rdver(probe, avail);
	if (v >= 1200U && v <= 10000U) {
		/* be confident */
		return (64);
	}
	return -1;
}
208
209static int
210_warc_rdhdr(struct archive_read *a, struct archive_entry *entry)
211{
212#define HDR_PROBE_LEN		(12U)
213	struct warc_s *w = a->format->data;
214	unsigned int ver;
215	const char *buf;
216	ssize_t nrd;
217	const char *eoh;
218	char *tmp;
219	/* for the file name, saves some strndup()'ing */
220	warc_string_t fnam;
221	/* warc record type, not that we really use it a lot */
222	warc_type_t ftyp;
223	/* content-length+error monad */
224	ssize_t cntlen;
225	/* record time is the WARC-Date time we reinterpret it as ctime */
226	time_t rtime;
227	/* mtime is the Last-Modified time which will be the entry's mtime */
228	time_t mtime;
229
230start_over:
231	/* just use read_ahead() they keep track of unconsumed
232	 * bits and bobs for us; no need to put an extra shift in
233	 * and reproduce that functionality here */
234	buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd);
235
236	if (nrd < 0) {
237		/* no good */
238		archive_set_error(
239			&a->archive, ARCHIVE_ERRNO_MISC,
240			"Bad record header");
241		return (ARCHIVE_FATAL);
242	} else if (buf == NULL) {
243		/* there should be room for at least WARC/bla\r\n
244		 * must be EOF therefore */
245		return (ARCHIVE_EOF);
246	}
247 	/* looks good so far, try and find the end of the header now */
248	eoh = _warc_find_eoh(buf, nrd);
249	if (eoh == NULL) {
250		/* still no good, the header end might be beyond the
251		 * probe we've requested, but then again who'd cram
252		 * so much stuff into the header *and* be 28500-compliant */
253		archive_set_error(
254			&a->archive, ARCHIVE_ERRNO_MISC,
255			"Bad record header");
256		return (ARCHIVE_FATAL);
257	}
258	ver = _warc_rdver(buf, eoh - buf);
259	/* we currently support WARC 0.12 to 1.0 */
260	if (ver == 0U) {
261		archive_set_error(
262			&a->archive, ARCHIVE_ERRNO_MISC,
263			"Invalid record version");
264		return (ARCHIVE_FATAL);
265	} else if (ver < 1200U || ver > 10000U) {
266		archive_set_error(
267			&a->archive, ARCHIVE_ERRNO_MISC,
268			"Unsupported record version: %u.%u",
269			ver / 10000, (ver % 10000) / 100);
270		return (ARCHIVE_FATAL);
271	}
272	cntlen = _warc_rdlen(buf, eoh - buf);
273	if (cntlen < 0) {
274		/* nightmare!  the specs say content-length is mandatory
275		 * so I don't feel overly bad stopping the reader here */
276		archive_set_error(
277			&a->archive, EINVAL,
278			"Bad content length");
279		return (ARCHIVE_FATAL);
280	}
281	rtime = _warc_rdrtm(buf, eoh - buf);
282	if (rtime == (time_t)-1) {
283		/* record time is mandatory as per WARC/1.0,
284		 * so just barf here, fast and loud */
285		archive_set_error(
286			&a->archive, EINVAL,
287			"Bad record time");
288		return (ARCHIVE_FATAL);
289	}
290
291	/* let the world know we're a WARC archive */
292	a->archive.archive_format = ARCHIVE_FORMAT_WARC;
293	if (ver != w->pver) {
294		/* stringify this entry's version */
295		archive_string_sprintf(&w->sver,
296			"WARC/%u.%u", ver / 10000, (ver % 10000) / 100);
297		/* remember the version */
298		w->pver = ver;
299	}
300	/* start off with the type */
301	ftyp = _warc_rdtyp(buf, eoh - buf);
302	/* and let future calls know about the content */
303	w->cntlen = cntlen;
304	w->cntoff = 0U;
305	mtime = 0;/* Avoid compiling error on some platform. */
306
307	switch (ftyp) {
308	case WT_RSRC:
309	case WT_RSP:
310		/* only try and read the filename in the cases that are
311		 * guaranteed to have one */
312		fnam = _warc_rduri(buf, eoh - buf);
313		/* check the last character in the URI to avoid creating
314		 * directory endpoints as files, see Todo above */
315		if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') {
316			/* break here for now */
317			fnam.len = 0U;
318			fnam.str = NULL;
319			break;
320		}
321		/* bang to our string pool, so we save a
322		 * malloc()+free() roundtrip */
323		if (fnam.len + 1U > w->pool.len) {
324			w->pool.len = ((fnam.len + 64U) / 64U) * 64U;
325			tmp = realloc(w->pool.str, w->pool.len);
326			if (tmp == NULL) {
327				archive_set_error(
328					&a->archive, ENOMEM,
329					"Out of memory");
330				return (ARCHIVE_FATAL);
331			}
332			w->pool.str = tmp;
333		}
334		memcpy(w->pool.str, fnam.str, fnam.len);
335		w->pool.str[fnam.len] = '\0';
336		/* let no one else know about the pool, it's a secret, shhh */
337		fnam.str = w->pool.str;
338
339		/* snarf mtime or deduce from rtime
340		 * this is a custom header added by our writer, it's quite
341		 * hard to believe anyone else would go through with it
342		 * (apart from being part of some http responses of course) */
343		if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) {
344			mtime = rtime;
345		}
346		break;
347	case WT_NONE:
348	case WT_INFO:
349	case WT_META:
350	case WT_REQ:
351	case WT_RVIS:
352	case WT_CONV:
353	case WT_CONT:
354	case LAST_WT:
355	default:
356		fnam.len = 0U;
357		fnam.str = NULL;
358		break;
359	}
360
361	/* now eat some of those delicious buffer bits */
362	__archive_read_consume(a, eoh - buf);
363
364	switch (ftyp) {
365	case WT_RSRC:
366	case WT_RSP:
367		if (fnam.len > 0U) {
368			/* populate entry object */
369			archive_entry_set_filetype(entry, AE_IFREG);
370			archive_entry_copy_pathname(entry, fnam.str);
371			archive_entry_set_size(entry, cntlen);
372			archive_entry_set_perm(entry, 0644);
373			/* rtime is the new ctime, mtime stays mtime */
374			archive_entry_set_ctime(entry, rtime, 0L);
375			archive_entry_set_mtime(entry, mtime, 0L);
376			break;
377		}
378		/* FALLTHROUGH */
379	case WT_NONE:
380	case WT_INFO:
381	case WT_META:
382	case WT_REQ:
383	case WT_RVIS:
384	case WT_CONV:
385	case WT_CONT:
386	case LAST_WT:
387	default:
388		/* consume the content and start over */
389		_warc_skip(a);
390		goto start_over;
391	}
392	return (ARCHIVE_OK);
393}
394
395static int
396_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off)
397{
398	struct warc_s *w = a->format->data;
399	const char *rab;
400	ssize_t nrd;
401
402	if (w->cntoff >= w->cntlen) {
403	eof:
404		/* it's our lucky day, no work, we can leave early */
405		*buf = NULL;
406		*bsz = 0U;
407		*off = w->cntoff + 4U/*for \r\n\r\n separator*/;
408		w->unconsumed = 0U;
409		return (ARCHIVE_EOF);
410	}
411
412	if (w->unconsumed) {
413		__archive_read_consume(a, w->unconsumed);
414		w->unconsumed = 0U;
415	}
416
417	rab = __archive_read_ahead(a, 1U, &nrd);
418	if (nrd < 0) {
419		*bsz = 0U;
420		/* big catastrophe */
421		return (int)nrd;
422	} else if (nrd == 0) {
423		goto eof;
424	} else if ((size_t)nrd > w->cntlen - w->cntoff) {
425		/* clamp to content-length */
426		nrd = w->cntlen - w->cntoff;
427	}
428	*off = w->cntoff;
429	*bsz = nrd;
430	*buf = rab;
431
432	w->cntoff += nrd;
433	w->unconsumed = (size_t)nrd;
434	return (ARCHIVE_OK);
435}
436
437static int
438_warc_skip(struct archive_read *a)
439{
440	struct warc_s *w = a->format->data;
441
442	__archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/);
443	w->cntlen = 0U;
444	w->cntoff = 0U;
445	return (ARCHIVE_OK);
446}
447
448
449/* private routines */
/* Strip the const qualifier off C; the detour through uintptr_t keeps
 * compilers from warning about the discarded qualifier. */
static void*
deconst(const void *c)
{
	const uintptr_t raw = (uintptr_t)c;

	return (void *)raw;
}
455
/*
 * Find the first occurrence of the NEEDLESIZE bytes at NEEDLE within the
 * HAYSIZE bytes at HAY, like memmem().
 *
 * Returns a pointer to the match, HAY itself for an empty needle, or
 * NULL if there is no match.  Candidates are filtered through a rolling
 * XOR checksum before bytes are actually compared.
 */
static char*
xmemmem(const char *hay, const size_t haysize,
	const char *needle, const size_t needlesize)
{
	const char *const hay_end = hay + haysize;
	const char *const ndl_end = needle + needlesize;
	const char *h;
	const char *n;
	const char *win;
	unsigned int hx;
	unsigned int nx;
	unsigned int same;

	/* an empty needle is defined to be found anywhere in haystack */
	if (needlesize == 0UL) {
		return deconst(hay);
	}
	/* jump ahead to the first byte that could start a match */
	hay = memchr(hay, *needle, haysize);
	if (hay == NULL) {
		return NULL;
	}

	/* HAY and NEEDLE agree in their first byte now.  XOR up the
	 * needle and the equally long first window of the haystack,
	 * noting on the way whether the two are identical. */
	hx = *hay;
	nx = *hay;
	same = 1U;
	h = hay + 1U;
	n = needle + 1U;
	while (h < hay_end && n < ndl_end) {
		hx ^= *h;
		nx ^= *n;
		same &= *h == *n;
		h++;
		n++;
	}

	if (n < ndl_end) {
		/* ran out of haystack before the needle was through */
		return NULL;
	}
	if (same) {
		/* the first window is a match already */
		return deconst(hay);
	}

	/* slide the window byte by byte: XOR the leaving byte out of the
	 * checksum and the entering byte in */
	win = hay;
	while (h < hay_end) {
		hx ^= *win++;
		hx ^= *h;

		/* with equal checksums, equality of the first
		 * NEEDLESIZE - 1 bytes implies equality of the last one;
		 * WIN always trails H so no range check is needed */
		if (hx == nx && memcmp(win, needle, needlesize - 1U) == 0) {
			return deconst(win);
		}
		h++;
	}
	return NULL;
}
513
/*
 * Read a short run of decimal digits at STR and check it against the
 * inclusive range [LLIM, ULIM].
 *
 * *EP is set to the first character not read.  Returns the value read,
 * -1 if STR does not start with a digit, or -2 if the value is outside
 * the range.
 */
static int
strtoi_lim(const char *str, const char **ep, int llim, int ulim)
{
	const char *cur = str;
	int acc = 0;
	/* loses one decimal place per digit read and thereby bounds the
	 * number of digits we accept */
	int budget = (ulim > 10) ? ulim : 10;

	while (acc * 10 <= ulim && budget != 0 &&
	       *cur >= '0' && *cur <= '9') {
		acc = acc * 10 + (*cur - '0');
		cur++;
		budget /= 10;
	}

	*ep = cur;
	if (cur == str) {
		/* not a single digit */
		return -1;
	}
	if (acc < llim || acc > ulim) {
		return -2;
	}
	return acc;
}
536
/*
 * Convert the broken-down UTC time T into seconds since the epoch.
 *
 * _mkgmtime() or timegm() is used where the build configuration says
 * one of them exists.  Otherwise mktime() is called to fill in tm_yday
 * (note that it may rewrite other fields of T as well) and the stamp is
 * computed by hand.  Returns (time_t)-1 if mktime() fails.
 */
static time_t
time_from_tm(struct tm *t)
{
#if HAVE__MKGMTIME
	return _mkgmtime(t);
#elif HAVE_TIMEGM
	/* Use platform timegm() if available. */
	return (timegm(t));
#else
	/* Else use direct calculation using POSIX assumptions. */
	/* First, fix up tm_yday based on the year/month/day. */
	if (mktime(t) == (time_t)-1)
		return ((time_t)-1);
	/* Then we can compute timegm() from first principles.
	 * Every term is widened to time_t first: evaluated in plain int
	 * the year term alone overflows for dates from 2038 on. */
	return ((time_t)t->tm_sec
	    + (time_t)t->tm_min * 60
	    + (time_t)t->tm_hour * 3600
	    + (time_t)t->tm_yday * 86400
	    + (time_t)(t->tm_year - 70) * 31536000
	    + (time_t)((t->tm_year - 69) / 4) * 86400
	    - (time_t)((t->tm_year - 1) / 100) * 86400
	    + (time_t)((t->tm_year + 299) / 400) * 86400);
#endif
}
561
/*
 * Like strptime() but strictly for ISO 8601 Zulu strings of the form
 * YYYY-MM-DDThh:mm:ssZ, leading blanks and tabs are skipped.
 *
 * Returns the corresponding stamp, or (time_t)-1 if S doesn't parse.
 * If ENDPTR is non-NULL, *ENDPTR receives the position where reading
 * stopped.
 */
static time_t
xstrpisotime(const char *s, char **endptr)
{
	/* the six numeric fields in order of appearance, each with its
	 * inclusive range and the character that has to follow it */
	static const struct {
		int lo;
		int hi;
		char sep;
	} spec[] = {
		{1583, 4095, '-'},	/* year */
		{1, 12, '-'},		/* month */
		{1, 31, 'T'},		/* day-of-month */
		{0, 23, ':'},		/* hour */
		{0, 59, ':'},		/* minute */
		{0, 60, 'Z'}		/* second */
	};
	int val[sizeof(spec) / sizeof(spec[0])];
	struct tm tm;
	time_t res = (time_t)-1;
	size_t i;

	/* make sure tm is clean */
	memset(&tm, 0, sizeof(tm));

	/* as a courtesy to our callers, and since this is a non-standard
	 * routine, we skip leading whitespace */
	while (*s == ' ' || *s == '\t')
		++s;

	for (i = 0U; i < sizeof(spec) / sizeof(spec[0]); i++) {
		val[i] = strtoi_lim(s, &s, spec[i].lo, spec[i].hi);
		/* the separator is only looked at (and stepped over)
		 * when the number itself was fine */
		if (val[i] < 0 || *s++ != spec[i].sep) {
			goto out;
		}
	}

	/* massage the values to fulfill some of POSIX' constraints */
	tm.tm_year = val[0] - 1900;
	tm.tm_mon = val[1] - 1;
	tm.tm_mday = val[2];
	tm.tm_hour = val[3];
	tm.tm_min = val[4];
	tm.tm_sec = val[5];

	/* now convert our custom tm struct to a unix stamp using UTC */
	res = time_from_tm(&tm);

out:
	if (endptr != NULL) {
		*endptr = deconst(s);
	}
	return res;
}
615
/*
 * Read the version off a "WARC/x.y" or "WARC/x.yy" line at BUF.
 *
 * The result is major * 10000 + minor * 100 where a one-digit minor
 * counts as is and a two-digit minor contributes 1000 for its first and
 * 100 for its second digit, so 1.0 reads as 10000 and 0.18 as 1800.
 * Returns 0 if BSZ is below 12, the magic or the digits are missing, or
 * the version isn't terminated properly.
 */
static unsigned int
_warc_rdver(const char *buf, size_t bsz)
{
	static const char magic[] = "WARC/";
	const size_t mlen = sizeof(magic) - 1U;
	const char *num;
	const char *term;
	unsigned int ver;
	unsigned int extra;

	if (bsz < 12 || memcmp(buf, magic, mlen) != 0) {
		/* buffer too small or invalid magic */
		return 0U;
	}
	/* step over the magic */
	num = buf + mlen;

	if (!isdigit((unsigned char)num[0U]) || num[1U] != '.' ||
	    !isdigit((unsigned char)num[2U])) {
		/* not of the form digit, dot, digit */
		return 0U;
	}

	/* we support a maximum of 2 digits in the minor version */
	extra = isdigit((unsigned char)num[3U]) ? 1U : 0U;

	ver = (num[0U] - '0') * 10000U;
	if (extra == 1U) {
		ver += (num[2U] - '0') * 1000U;
		ver += (num[3U] - '0') * 100U;
	} else {
		ver += (num[2U] - '0') * 100U;
	}

	/*
	 * WARC below version 0.12 has a space-separated header
	 * WARC 0.12 and above terminates the version with a CRLF
	 */
	term = num + 3U + extra;
	if (ver >= 1200U) {
		if (memcmp(term, "\r\n", 2U) != 0) {
			ver = 0U;
		}
	} else if (*term != ' ' && *term != '\t') {
		ver = 0U;
	}
	return ver;
}
660
661static unsigned int
662_warc_rdtyp(const char *buf, size_t bsz)
663{
664	static const char _key[] = "\r\nWARC-Type:";
665	const char *val, *eol;
666
667	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
668		/* no bother */
669		return WT_NONE;
670	}
671	val += sizeof(_key) - 1U;
672	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
673		/* no end of line */
674		return WT_NONE;
675	}
676
677	/* overread whitespace */
678	while (val < eol && (*val == ' ' || *val == '\t'))
679		++val;
680
681	if (val + 8U == eol) {
682		if (memcmp(val, "resource", 8U) == 0)
683			return WT_RSRC;
684		else if (memcmp(val, "response", 8U) == 0)
685			return WT_RSP;
686	}
687	return WT_NONE;
688}
689
690static warc_string_t
691_warc_rduri(const char *buf, size_t bsz)
692{
693	static const char _key[] = "\r\nWARC-Target-URI:";
694	const char *val, *uri, *eol, *p;
695	warc_string_t res = {0U, NULL};
696
697	if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) {
698		/* no bother */
699		return res;
700	}
701	/* overread whitespace */
702	val += sizeof(_key) - 1U;
703	if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) {
704		/* no end of line */
705		return res;
706	}
707
708	while (val < eol && (*val == ' ' || *val == '\t'))
709		++val;
710
711	/* overread URL designators */
712	if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) {
713		/* not touching that! */
714		return res;
715	}
716
717	/* spaces inside uri are not allowed, CRLF should follow */
718	for (p = val; p < eol; p++) {
719		if (isspace((unsigned char)*p))
720			return res;
721	}
722
723	/* there must be at least space for ftp */
724	if (uri < (val + 3U))
725		return res;
726
727	/* move uri to point to after :// */
728	uri += 3U;
729
730	/* now then, inspect the URI */
731	if (memcmp(val, "file", 4U) == 0) {
732		/* perfect, nothing left to do here */
733
734	} else if (memcmp(val, "http", 4U) == 0 ||
735		   memcmp(val, "ftp", 3U) == 0) {
736		/* overread domain, and the first / */
737		while (uri < eol && *uri++ != '/');
738	} else {
739		/* not sure what to do? best to bugger off */
740		return res;
741	}
742	res.str = uri;
743	res.len = eol - uri;
744	return res;
745}
746
/*
 * Read the value of the Content-Length header line.
 * Returns -1 if the line is missing, doesn't start with a digit (after
 * blanks and tabs), is out of range for strtol(), or has anything
 * between the number and the end of the line.
 */
static ssize_t
_warc_rdlen(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nContent-Length:";
	const size_t klen = sizeof(_key) - 1U;
	const char *val;
	const char *eol;
	char *stop = NULL;
	long int len;

	val = xmemmem(buf, bsz, _key, klen);
	if (val == NULL) {
		/* no bother */
		return -1;
	}
	val += klen;

	eol = _warc_find_eol(val, buf + bsz - val);
	if (eol == NULL) {
		/* no end of line */
		return -1;
	}

	/* skip leading whitespace */
	for (; val < eol && (*val == ' ' || *val == '\t'); val++)
		;

	/* there must be at least one digit, this also keeps signs out */
	if (!isdigit((unsigned char)*val)) {
		return -1;
	}

	errno = 0;
	len = strtol(val, &stop, 10);
	if (errno != 0) {
		return -1;
	}
	if (stop != eol) {
		/* line must end here */
		return -1;
	}
	return (size_t)len;
}
780
/*
 * Read the record time off the WARC-Date header line.
 * Returns (time_t)-1 if the line is missing or if the time stamp does
 * not extend exactly to the end of the line.
 */
static time_t
_warc_rdrtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nWARC-Date:";
	const time_t bad = (time_t)-1;
	const char *field;
	const char *line_end;
	char *stop = NULL;
	time_t stamp;

	field = xmemmem(buf, bsz, _key, sizeof(_key) - 1U);
	if (field == NULL) {
		/* no bother */
		return bad;
	}
	field += sizeof(_key) - 1U;

	line_end = _warc_find_eol(field, buf + bsz - field);
	if (line_end == NULL) {
		/* no end of line */
		return bad;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	stamp = xstrpisotime(field, &stop);

	/* line must end where the stamp does */
	return (stop == line_end) ? stamp : bad;
}
807
/*
 * Read the modification time off the Last-Modified header line.
 * Returns (time_t)-1 if the line is missing or if the time stamp does
 * not extend exactly to the end of the line.
 */
static time_t
_warc_rdmtm(const char *buf, size_t bsz)
{
	static const char _key[] = "\r\nLast-Modified:";
	const time_t bad = (time_t)-1;
	const char *field;
	const char *line_end;
	char *stop = NULL;
	time_t stamp;

	field = xmemmem(buf, bsz, _key, sizeof(_key) - 1U);
	if (field == NULL) {
		/* no bother */
		return bad;
	}
	field += sizeof(_key) - 1U;

	line_end = _warc_find_eol(field, buf + bsz - field);
	if (line_end == NULL) {
		/* no end of line */
		return bad;
	}

	/* xstrpisotime() kindly overreads whitespace for us, so use that */
	stamp = xstrpisotime(field, &stop);

	/* line must end where the stamp does */
	return (stop == line_end) ? stamp : bad;
}
834
/*
 * Find the end of the header, i.e. the first blank line in BUF.
 * Returns a pointer just past the \r\n\r\n sequence, or NULL if there
 * is none within BSZ bytes.
 */
static const char*
_warc_find_eoh(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n\r\n";
	const size_t mlen = sizeof(_marker) - 1U;
	const char *at = xmemmem(buf, bsz, _marker, mlen);

	return (at == NULL) ? NULL : at + mlen;
}
846
/*
 * Find the end of the line starting at BUF.
 * Returns a pointer to the first \r\n sequence (not past it), or NULL
 * if there is none within BSZ bytes.
 */
static const char*
_warc_find_eol(const char *buf, size_t bsz)
{
	static const char _marker[] = "\r\n";

	return xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U);
}
855/* archive_read_support_format_warc.c ends here */
856