archive_read_support_format_warc.c revision 315433
11590Srgrimes/*- 21590Srgrimes * Copyright (c) 2014 Sebastian Freundt 31590Srgrimes * All rights reserved. 41590Srgrimes * 51590Srgrimes * Redistribution and use in source and binary forms, with or without 61590Srgrimes * modification, are permitted provided that the following conditions 71590Srgrimes * are met: 81590Srgrimes * 1. Redistributions of source code must retain the above copyright 91590Srgrimes * notice, this list of conditions and the following disclaimer. 101590Srgrimes * 2. Redistributions in binary form must reproduce the above copyright 111590Srgrimes * notice, this list of conditions and the following disclaimer in the 121590Srgrimes * documentation and/or other materials provided with the distribution. 131590Srgrimes * 141590Srgrimes * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 151590Srgrimes * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 161590Srgrimes * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 171590Srgrimes * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 181590Srgrimes * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 191590Srgrimes * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 201590Srgrimes * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 211590Srgrimes * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 221590Srgrimes * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 231590Srgrimes * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 241590Srgrimes */ 251590Srgrimes 261590Srgrimes#include "archive_platform.h" 271590Srgrimes__FBSDID("$FreeBSD: stable/10/contrib/libarchive/libarchive/archive_read_support_format_warc.c 315433 2017-03-16 23:08:18Z mm $"); 281590Srgrimes 291590Srgrimes/** 301590Srgrimes * WARC is standardised by ISO TC46/SC4/WG12 and currently available as 311590Srgrimes * ISO 28500:2009. 321590Srgrimes * For the purposes of this file we used the final draft from: 331590Srgrimes * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf 34194792Sdelphij * 351590Srgrimes * Todo: 361590Srgrimes * [ ] real-world warcs can contain resources at endpoints ending in / 371590Srgrimes * e.g. http://bibnum.bnf.fr/warc/ 381590Srgrimes * if you're lucky their response contains a Content-Location: header 391590Srgrimes * pointing to a unix-compliant filename, in the example above it's 4032649Sbde * Content-Location: http://bibnum.bnf.fr/warc/index.html 4132649Sbde * however, that's not mandated and github for example doesn't follow 4232649Sbde * this convention. 433819Swollman * We need a set of archive options to control what to do with 4450477Speter * entries like these, at the moment care is taken to skip them. 451590Srgrimes * 461590Srgrimes **/ 471590Srgrimes 48194190Sed#ifdef HAVE_SYS_STAT_H 491590Srgrimes#include <sys/stat.h> 5011936Sphk#endif 513819Swollman#ifdef HAVE_ERRNO_H 529336Sdfr#include <errno.h> 5383653Speter#endif 5483653Speter#ifdef HAVE_STDLIB_H 55192762Srmacklem#include <stdlib.h> 56192762Srmacklem#endif 57192762Srmacklem#ifdef HAVE_STRING_H 58192762Srmacklem#include <string.h> 591590Srgrimes#endif 601590Srgrimes#ifdef HAVE_LIMITS_H 61200462Sdelphij#include <limits.h> 621590Srgrimes#endif 631590Srgrimes#ifdef HAVE_CTYPE_H 6477207Stmm#include <ctype.h> 651590Srgrimes#endif 661590Srgrimes#ifdef HAVE_TIME_H 671590Srgrimes#include <time.h> 681590Srgrimes#endif 691590Srgrimes 70200462Sdelphij#include "archive.h" 713819Swollman#include "archive_entry.h" 721590Srgrimes#include "archive_private.h" 731590Srgrimes#include "archive_read_private.h" 741590Srgrimes 75194792Sdelphijtypedef enum { 7683653Speter WT_NONE, 77194792Sdelphij /* warcinfo */ 78194792Sdelphij WT_INFO, 791590Srgrimes /* metadata */ 801590Srgrimes WT_META, 811590Srgrimes /* resource */ 823819Swollman WT_RSRC, 8352493Sdillon /* request, unsupported */ 84172759Sjhb WT_REQ, 85221455Srmacklem /* response, unsupported */ 86192762Srmacklem WT_RSP, 87192762Srmacklem /* revisit, unsupported */ 88221455Srmacklem WT_RVIS, 891590Srgrimes /* conversion, unsupported */ 9092921Simp WT_CONV, 9192921Simp /* continuation, unsupported at the moment */ 9292921Simp WT_CONT, 9392921Simp /* invalid type */ 9492921Simp LAST_WT 9592921Simp} warc_type_t; 96192762Srmacklem 97192762Srmacklemtypedef struct { 983819Swollman size_t len; 9952493Sdillon const char *str; 10052493Sdillon} warc_string_t; 101131990Sstefanf 102172759Sjhbtypedef struct { 1031590Srgrimes size_t len; 1041590Srgrimes char *str; 10552493Sdillon} warc_strbuf_t; 10652493Sdillon 1071590Srgrimesstruct warc_s { 1081590Srgrimes /* content length ahead */ 10977207Stmm size_t cntlen; 110243783Srmacklem /* and how much we've processed so far */ 111243783Srmacklem size_t cntoff; 112243783Srmacklem /* and how much we need to consume between calls */ 113243783Srmacklem size_t unconsumed; 1141590Srgrimes 1151590Srgrimes /* string pool */ 1161590Srgrimes warc_strbuf_t pool; 117243783Srmacklem /* previous version */ 1181590Srgrimes unsigned int pver; 1191590Srgrimes /* stringified format name */ 1201590Srgrimes struct archive_string sver; 1211590Srgrimes}; 122243783Srmacklem 123243783Srmacklemstatic int _warc_bid(struct archive_read *a, int); 124243783Srmacklemstatic int _warc_cleanup(struct archive_read *a); 125243783Srmacklemstatic int _warc_read(struct archive_read*, const void**, size_t*, int64_t*); 126243783Srmacklemstatic int _warc_skip(struct archive_read *a); 127243783Srmacklemstatic int _warc_rdhdr(struct archive_read *a, struct archive_entry *e); 128243783Srmacklem 129243783Srmacklem/* private routines */ 130243783Srmacklemstatic unsigned int _warc_rdver(const char buf[10], size_t bsz); 131243783Srmacklemstatic unsigned int _warc_rdtyp(const char *buf, size_t bsz); 132243783Srmacklemstatic warc_string_t _warc_rduri(const char *buf, size_t bsz); 133243783Srmacklemstatic ssize_t _warc_rdlen(const char *buf, size_t bsz); 134243783Srmacklemstatic time_t _warc_rdrtm(const char *buf, size_t bsz); 135243783Srmacklemstatic time_t _warc_rdmtm(const char *buf, size_t bsz); 136251585Srmacklemstatic const char *_warc_find_eoh(const char *buf, size_t bsz); 137251585Srmacklemstatic const char *_warc_find_eol(const char *buf, size_t bsz); 138251585Srmacklem 139243783Srmacklemint 140243783Srmacklemarchive_read_support_format_warc(struct archive *_a) 141243783Srmacklem{ 142243783Srmacklem struct archive_read *a = (struct archive_read *)_a; 1431590Srgrimes struct warc_s *w; 1441590Srgrimes int r; 1451590Srgrimes 14652493Sdillon archive_check_magic(_a, ARCHIVE_READ_MAGIC, 14752493Sdillon ARCHIVE_STATE_NEW, "archive_read_support_format_warc"); 14852493Sdillon 1491590Srgrimes if ((w = calloc(1, sizeof(*w))) == NULL) { 1501590Srgrimes archive_set_error(&a->archive, ENOMEM, 1511590Srgrimes "Can't allocate warc data"); 15252493Sdillon return (ARCHIVE_FATAL); 15352493Sdillon } 15452493Sdillon 15552493Sdillon r = __archive_read_register_format( 15652493Sdillon a, w, "warc", 15752493Sdillon _warc_bid, NULL, _warc_rdhdr, _warc_read, 15852493Sdillon _warc_skip, NULL, _warc_cleanup, NULL, NULL); 15952493Sdillon 16052493Sdillon if (r != ARCHIVE_OK) { 16152493Sdillon free(w); 162172759Sjhb return (r); 163172759Sjhb } 164172759Sjhb return (ARCHIVE_OK); 165221455Srmacklem} 166221455Srmacklem 167221455Srmacklemstatic int 168221455Srmacklem_warc_cleanup(struct archive_read *a) 169221455Srmacklem{ 170193258Srmacklem struct warc_s *w = a->format->data; 171221455Srmacklem 172221455Srmacklem if (w->pool.len > 0U) { 173221455Srmacklem free(w->pool.str); 174192762Srmacklem } 1751590Srgrimes archive_string_free(&w->sver); 1761590Srgrimes free(w); 1771590Srgrimes a->format->data = NULL; 1781590Srgrimes return (ARCHIVE_OK); 1791590Srgrimes} 1801590Srgrimes 1811590Srgrimesstatic int 1821590Srgrimes_warc_bid(struct archive_read *a, int best_bid) 1831590Srgrimes{ 1841590Srgrimes const char *hdr; 1851590Srgrimes ssize_t nrd; 1861590Srgrimes unsigned int ver; 1871590Srgrimes 1881590Srgrimes (void)best_bid; /* UNUSED */ 1891590Srgrimes 1901590Srgrimes /* check first line of file, it should be a record already */ 1911590Srgrimes if ((hdr = __archive_read_ahead(a, 12U, &nrd)) == NULL) { 1921590Srgrimes /* no idea what to do */ 193192762Srmacklem return -1; 194221455Srmacklem } else if (nrd < 12) { 195192762Srmacklem /* nah, not for us, our magic cookie is at least 12 bytes */ 196221440Srmacklem return -1; 1973819Swollman } 1981590Srgrimes 1993819Swollman /* otherwise snarf the record's version number */ 2003819Swollman ver = _warc_rdver(hdr, nrd); 2013819Swollman if (ver < 1200U || ver > 10000U) { 2023819Swollman /* we only support WARC 0.12 to 1.0 */ 2033819Swollman return -1; 2043819Swollman } 2053819Swollman 2061590Srgrimes /* otherwise be confident */ 2071590Srgrimes return (64); 208192762Srmacklem} 209192762Srmacklem 210192762Srmacklemstatic int 211192762Srmacklem_warc_rdhdr(struct archive_read *a, struct archive_entry *entry) 212192762Srmacklem{ 213192762Srmacklem#define HDR_PROBE_LEN (12U) 214221455Srmacklem struct warc_s *w = a->format->data; 215192762Srmacklem unsigned int ver; 216192762Srmacklem const char *buf; 217192762Srmacklem ssize_t nrd; 218192762Srmacklem const char *eoh; 2191590Srgrimes /* for the file name, saves some strndup()'ing */ 2201590Srgrimes warc_string_t fnam; 2211590Srgrimes /* warc record type, not that we really use it a lot */ 2221590Srgrimes warc_type_t ftyp; 2233819Swollman /* content-length+error monad */ 2243819Swollman ssize_t cntlen; 2253819Swollman /* record time is the WARC-Date time we reinterpret it as ctime */ 226194792Sdelphij time_t rtime; 227172759Sjhb /* mtime is the Last-Modified time which will be the entry's mtime */ 2283819Swollman time_t mtime; 229172759Sjhb 230172759Sjhbstart_over: 231172759Sjhb /* just use read_ahead() they keep track of unconsumed 232172759Sjhb * bits and bobs for us; no need to put an extra shift in 23383653Speter * and reproduce that functionality here */ 23483653Speter buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd); 23583653Speter 236172759Sjhb if (nrd < 0) { 237172759Sjhb /* no good */ 23883653Speter archive_set_error( 2393819Swollman &a->archive, ARCHIVE_ERRNO_MISC, 240172759Sjhb "Bad record header"); 241172759Sjhb return (ARCHIVE_FATAL); 242172759Sjhb } else if (buf == NULL) { 24383653Speter /* there should be room for at least WARC/bla\r\n 24483653Speter * must be EOF therefore */ 2453819Swollman return (ARCHIVE_EOF); 246172759Sjhb } 247172759Sjhb /* looks good so far, try and find the end of the header now */ 24883653Speter eoh = _warc_find_eoh(buf, nrd); 249221973Srmacklem if (eoh == NULL) { 250172759Sjhb /* still no good, the header end might be beyond the 251172759Sjhb * probe we've requested, but then again who'd cram 252221973Srmacklem * so much stuff into the header *and* be 28500-compliant */ 25383653Speter archive_set_error( 2543819Swollman &a->archive, ARCHIVE_ERRNO_MISC, 25583653Speter "Bad record header"); 256172759Sjhb return (ARCHIVE_FATAL); 257172759Sjhb } 258172759Sjhb ver = _warc_rdver(buf, eoh - buf); 259172759Sjhb /* we currently support WARC 0.12 to 1.0 */ 260172759Sjhb if (ver == 0U) { 26183653Speter archive_set_error( 26283653Speter &a->archive, ARCHIVE_ERRNO_MISC, 2633819Swollman "Invalid record version"); 2643819Swollman return (ARCHIVE_FATAL); 2653819Swollman } else if (ver < 1200U || ver > 10000U) { 2663819Swollman archive_set_error( 2671590Srgrimes &a->archive, ARCHIVE_ERRNO_MISC, 2681590Srgrimes "Unsupported record version: %u.%u", 2691590Srgrimes ver / 10000, (ver % 10000) / 100); 27052493Sdillon return (ARCHIVE_FATAL); 2711590Srgrimes } 27283653Speter cntlen = _warc_rdlen(buf, eoh - buf); 27383653Speter if (cntlen < 0) { 274221455Srmacklem /* nightmare! the specs say content-length is mandatory 2751590Srgrimes * so I don't feel overly bad stopping the reader here */ 276221455Srmacklem archive_set_error( 277221455Srmacklem &a->archive, EINVAL, 278221455Srmacklem "Bad content length"); 279221455Srmacklem return (ARCHIVE_FATAL); 280221455Srmacklem } 281221455Srmacklem rtime = _warc_rdrtm(buf, eoh - buf); 282221455Srmacklem if (rtime == (time_t)-1) { 283221455Srmacklem /* record time is mandatory as per WARC/1.0, 284221455Srmacklem * so just barf here, fast and loud */ 285221455Srmacklem archive_set_error( 286221455Srmacklem &a->archive, EINVAL, 287221455Srmacklem "Bad record time"); 288221455Srmacklem return (ARCHIVE_FATAL); 289221455Srmacklem } 290221455Srmacklem 291221455Srmacklem /* let the world know we're a WARC archive */ 292221455Srmacklem a->archive.archive_format = ARCHIVE_FORMAT_WARC; 293221455Srmacklem if (ver != w->pver) { 294221455Srmacklem /* stringify this entry's version */ 295221455Srmacklem archive_string_sprintf(&w->sver, 296221455Srmacklem "WARC/%u.%u", ver / 10000, (ver % 10000) / 100); 297221455Srmacklem /* remember the version */ 298221455Srmacklem w->pver = ver; 299221455Srmacklem } 300221455Srmacklem /* start off with the type */ 301221455Srmacklem ftyp = _warc_rdtyp(buf, eoh - buf); 302221455Srmacklem /* and let future calls know about the content */ 303221455Srmacklem w->cntlen = cntlen; 304221455Srmacklem w->cntoff = 0U; 305221455Srmacklem mtime = 0;/* Avoid compiling error on some platform. */ 30683653Speter 30752493Sdillon switch (ftyp) { 30852493Sdillon case WT_RSRC: 30952493Sdillon case WT_RSP: 31052493Sdillon /* only try and read the filename in the cases that are 31152493Sdillon * guaranteed to have one */ 31252493Sdillon fnam = _warc_rduri(buf, eoh - buf); 313221455Srmacklem /* check the last character in the URI to avoid creating 314221455Srmacklem * directory endpoints as files, see Todo above */ 315221455Srmacklem if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') { 316221455Srmacklem /* break here for now */ 317221455Srmacklem fnam.len = 0U; 318221455Srmacklem fnam.str = NULL; 319221455Srmacklem break; 320221455Srmacklem } 321221455Srmacklem /* bang to our string pool, so we save a 322221455Srmacklem * malloc()+free() roundtrip */ 323221455Srmacklem if (fnam.len + 1U > w->pool.len) { 324221455Srmacklem w->pool.len = ((fnam.len + 64U) / 64U) * 64U; 325221455Srmacklem w->pool.str = realloc(w->pool.str, w->pool.len); 326221455Srmacklem } 327221455Srmacklem memcpy(w->pool.str, fnam.str, fnam.len); 328221455Srmacklem w->pool.str[fnam.len] = '\0'; 329221455Srmacklem /* let no one else know about the pool, it's a secret, shhh */ 330221455Srmacklem fnam.str = w->pool.str; 331221455Srmacklem 332221455Srmacklem /* snarf mtime or deduce from rtime 33352493Sdillon * this is a custom header added by our writer, it's quite 33452493Sdillon * hard to believe anyone else would go through with it 33552493Sdillon * (apart from being part of some http responses of course) */ 336221455Srmacklem if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) { 337221455Srmacklem mtime = rtime; 338221455Srmacklem } 339221455Srmacklem break; 340221455Srmacklem default: 341221455Srmacklem fnam.len = 0U; 342221455Srmacklem fnam.str = NULL; 343221455Srmacklem break; 344221455Srmacklem } 345221455Srmacklem 346221455Srmacklem /* now eat some of those delicious buffer bits */ 347221455Srmacklem __archive_read_consume(a, eoh - buf); 348221455Srmacklem 349221455Srmacklem switch (ftyp) { 350221455Srmacklem case WT_RSRC: 351221455Srmacklem case WT_RSP: 352221455Srmacklem if (fnam.len > 0U) { 353221455Srmacklem /* populate entry object */ 354221455Srmacklem archive_entry_set_filetype(entry, AE_IFREG); 355221455Srmacklem archive_entry_copy_pathname(entry, fnam.str); 35683653Speter archive_entry_set_size(entry, cntlen); 35783653Speter archive_entry_set_perm(entry, 0644); 358221455Srmacklem /* rtime is the new ctime, mtime stays mtime */ 359221455Srmacklem archive_entry_set_ctime(entry, rtime, 0L); 360221455Srmacklem archive_entry_set_mtime(entry, mtime, 0L); 361221455Srmacklem break; 362221455Srmacklem } 363221455Srmacklem /* FALLTHROUGH */ 364221455Srmacklem default: 365221455Srmacklem /* consume the content and start over */ 366221455Srmacklem _warc_skip(a); 367221455Srmacklem goto start_over; 368221455Srmacklem } 369221455Srmacklem return (ARCHIVE_OK); 370221455Srmacklem} 371221455Srmacklem 37252493Sdillonstatic int 37352493Sdillon_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off) 37452493Sdillon{ 37552493Sdillon struct warc_s *w = a->format->data; 376221455Srmacklem const char *rab; 377221455Srmacklem ssize_t nrd; 378221455Srmacklem 379221455Srmacklem if (w->cntoff >= w->cntlen) { 380221455Srmacklem eof: 381221455Srmacklem /* it's our lucky day, no work, we can leave early */ 382221455Srmacklem *buf = NULL; 383221455Srmacklem *bsz = 0U; 384221455Srmacklem *off = w->cntoff + 4U/*for \r\n\r\n separator*/; 385221455Srmacklem w->unconsumed = 0U; 386221455Srmacklem return (ARCHIVE_EOF); 387221455Srmacklem } 388221455Srmacklem 389221455Srmacklem rab = __archive_read_ahead(a, 1U, &nrd); 39052493Sdillon if (nrd < 0) { 39152493Sdillon *bsz = 0U; 39252493Sdillon /* big catastrophe */ 39352493Sdillon return (int)nrd; 39452493Sdillon } else if (nrd == 0) { 395221455Srmacklem goto eof; 396221455Srmacklem } else if ((size_t)nrd > w->cntlen - w->cntoff) { 397221455Srmacklem /* clamp to content-length */ 398221455Srmacklem nrd = w->cntlen - w->cntoff; 399221455Srmacklem } 400221455Srmacklem *off = w->cntoff; 401221455Srmacklem *bsz = nrd; 402221455Srmacklem *buf = rab; 403221455Srmacklem 404221455Srmacklem w->cntoff += nrd; 405221455Srmacklem w->unconsumed = (size_t)nrd; 406221455Srmacklem return (ARCHIVE_OK); 407221455Srmacklem} 408221455Srmacklem 409221455Srmacklemstatic int 410221455Srmacklem_warc_skip(struct archive_read *a) 411221455Srmacklem{ 412221455Srmacklem struct warc_s *w = a->format->data; 413221455Srmacklem 414221455Srmacklem __archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/); 415221455Srmacklem w->cntlen = 0U; 416221455Srmacklem w->cntoff = 0U; 417221455Srmacklem return (ARCHIVE_OK); 418221455Srmacklem} 419221455Srmacklem 42052493Sdillon 42152493Sdillon/* private routines */ 422220596Srustatic void* 423221455Srmacklemdeconst(const void *c) 424221455Srmacklem{ 425221455Srmacklem return (char *)0x1 + (((const char *)c) - (const char *)0x1); 426221455Srmacklem} 427221455Srmacklem 428221455Srmacklemstatic char* 429221455Srmacklemxmemmem(const char *hay, const size_t haysize, 430221455Srmacklem const char *needle, const size_t needlesize) 431221455Srmacklem{ 432221455Srmacklem const char *const eoh = hay + haysize; 433221455Srmacklem const char *const eon = needle + needlesize; 434221455Srmacklem const char *hp; 435221455Srmacklem const char *np; 436221455Srmacklem const char *cand; 437221455Srmacklem unsigned int hsum; 438221455Srmacklem unsigned int nsum; 439221455Srmacklem unsigned int eqp; 440221455Srmacklem 441221455Srmacklem /* trivial checks first 442221455Srmacklem * a 0-sized needle is defined to be found anywhere in haystack 443221455Srmacklem * then run strchr() to find a candidate in HAYSTACK (i.e. a portion 444221455Srmacklem * that happens to begin with *NEEDLE) */ 445221455Srmacklem if (needlesize == 0UL) { 446221455Srmacklem return deconst(hay); 447221455Srmacklem } else if ((hay = memchr(hay, *needle, haysize)) == NULL) { 448221455Srmacklem /* trivial */ 449221455Srmacklem return NULL; 45052493Sdillon } 451221455Srmacklem 45283653Speter /* First characters of haystack and needle are the same now. Both are 45383653Speter * guaranteed to be at least one character long. Now computes the sum 45483653Speter * of characters values of needle together with the sum of the first 45552493Sdillon * needle_len characters of haystack. */ 45652493Sdillon for (hp = hay + 1U, np = needle + 1U, hsum = *hay, nsum = *hay, eqp = 1U; 45752493Sdillon hp < eoh && np < eon; 45852493Sdillon hsum ^= *hp, nsum ^= *np, eqp &= *hp == *np, hp++, np++); 45952493Sdillon 460221455Srmacklem /* HP now references the (NEEDLESIZE + 1)-th character. */ 461221455Srmacklem if (np < eon) { 462221455Srmacklem /* haystack is smaller than needle, :O */ 463221455Srmacklem return NULL; 464221455Srmacklem } else if (eqp) { 465221455Srmacklem /* found a match */ 466221455Srmacklem return deconst(hay); 467221455Srmacklem } 468221455Srmacklem 469221455Srmacklem /* now loop through the rest of haystack, 470221455Srmacklem * updating the sum iteratively */ 471221455Srmacklem for (cand = hay; hp < eoh; hp++) { 472225113Srmacklem hsum ^= *cand++; 473225113Srmacklem hsum ^= *hp; 474225113Srmacklem 475225113Srmacklem /* Since the sum of the characters is already known to be 476225113Srmacklem * equal at that point, it is enough to check just NEEDLESIZE - 1 477225113Srmacklem * characters for equality, 478225113Srmacklem * also CAND is by design < HP, so no need for range checks */ 479225113Srmacklem if (hsum == nsum && memcmp(cand, needle, needlesize - 1U) == 0) { 48052493Sdillon return deconst(cand); 48152493Sdillon } 48252493Sdillon } 483221455Srmacklem return NULL; 484221455Srmacklem} 485221455Srmacklem 486221455Srmacklemstatic int 487221455Srmacklemstrtoi_lim(const char *str, const char **ep, int llim, int ulim) 488221455Srmacklem{ 489221455Srmacklem int res = 0; 490221455Srmacklem const char *sp; 491221455Srmacklem /* we keep track of the number of digits via rulim */ 492221455Srmacklem int rulim; 493221455Srmacklem 494221455Srmacklem for (sp = str, rulim = ulim > 10 ? ulim : 10; 495225113Srmacklem res * 10 <= ulim && rulim && *sp >= '0' && *sp <= '9'; 496225113Srmacklem sp++, rulim /= 10) { 497225113Srmacklem res *= 10; 498225113Srmacklem res += *sp - '0'; 499225113Srmacklem } 500225113Srmacklem if (sp == str) { 501225113Srmacklem res = -1; 502225113Srmacklem } else if (res < llim || res > ulim) { 50383653Speter res = -2; 50483653Speter } 505221455Srmacklem *ep = (const char*)sp; 506221455Srmacklem return res; 507221455Srmacklem} 508221455Srmacklem 509221455Srmacklemstatic time_t 510221455Srmacklemtime_from_tm(struct tm *t) 511221455Srmacklem{ 512221455Srmacklem#if HAVE_TIMEGM 513221455Srmacklem /* Use platform timegm() if available. */ 514225113Srmacklem return (timegm(t)); 515225113Srmacklem#elif HAVE__MKGMTIME64 516225113Srmacklem return (_mkgmtime64(t)); 517225113Srmacklem#else 518225113Srmacklem /* Else use direct calculation using POSIX assumptions. */ 51952493Sdillon /* First, fix up tm_yday based on the year/month/day. */ 520221455Srmacklem if (mktime(t) == (time_t)-1) 521221455Srmacklem return ((time_t)-1); 522221455Srmacklem /* Then we can compute timegm() from first principles. */ 523221455Srmacklem return (t->tm_sec 52452493Sdillon + t->tm_min * 60 525221455Srmacklem + t->tm_hour * 3600 526221455Srmacklem + t->tm_yday * 86400 527221455Srmacklem + (t->tm_year - 70) * 31536000 528221455Srmacklem + ((t->tm_year - 69) / 4) * 86400 52952493Sdillon - ((t->tm_year - 1) / 100) * 86400 53052493Sdillon + ((t->tm_year + 299) / 400) * 86400); 53152493Sdillon#endif 532221455Srmacklem} 533221455Srmacklem 534221455Srmacklemstatic time_t 535221455Srmacklemxstrpisotime(const char *s, char **endptr) 536221455Srmacklem{ 537221455Srmacklem/** like strptime() but strictly for ISO 8601 Zulu strings */ 538221455Srmacklem struct tm tm; 539221455Srmacklem time_t res = (time_t)-1; 540221455Srmacklem 541221455Srmacklem /* make sure tm is clean */ 542221455Srmacklem memset(&tm, 0, sizeof(tm)); 543221455Srmacklem 54452493Sdillon /* as a courtesy to our callers, and since this is a non-standard 54552493Sdillon * routine, we skip leading whitespace */ 54652493Sdillon while (*s == ' ' || *s == '\t') 547221455Srmacklem ++s; 548221455Srmacklem 549221455Srmacklem /* read year */ 550221455Srmacklem if ((tm.tm_year = strtoi_lim(s, &s, 1583, 4095)) < 0 || *s++ != '-') { 551221455Srmacklem goto out; 552221455Srmacklem } 553221455Srmacklem /* read month */ 554221455Srmacklem if ((tm.tm_mon = strtoi_lim(s, &s, 1, 12)) < 0 || *s++ != '-') { 555221455Srmacklem goto out; 556221455Srmacklem } 557221455Srmacklem /* read day-of-month */ 558221455Srmacklem if ((tm.tm_mday = strtoi_lim(s, &s, 1, 31)) < 0 || *s++ != 'T') { 559225113Srmacklem goto out; 560225113Srmacklem } 56152493Sdillon /* read hour */ 5621590Srgrimes if ((tm.tm_hour = strtoi_lim(s, &s, 0, 23)) < 0 || *s++ != ':') { 5631590Srgrimes goto out; 5641590Srgrimes } 5651590Srgrimes /* read minute */ 5661590Srgrimes if ((tm.tm_min = strtoi_lim(s, &s, 0, 59)) < 0 || *s++ != ':') { 5671590Srgrimes goto out; 5681590Srgrimes } 5691590Srgrimes /* read second */ 5701590Srgrimes if ((tm.tm_sec = strtoi_lim(s, &s, 0, 60)) < 0 || *s++ != 'Z') { 5711590Srgrimes goto out; 5721590Srgrimes } 57352493Sdillon 5741590Srgrimes /* massage TM to fulfill some of POSIX' constraints */ 57583653Speter tm.tm_year -= 1900; 57683653Speter tm.tm_mon--; 57752493Sdillon 5781590Srgrimes /* now convert our custom tm struct to a unix stamp using UTC */ 57983653Speter res = time_from_tm(&tm); 58083653Speter 581172759Sjhbout: 58283653Speter if (endptr != NULL) { 58383653Speter *endptr = deconst(s); 58483653Speter } 58583653Speter return res; 58683653Speter} 58783653Speter 58883653Speterstatic unsigned int 58983653Speter_warc_rdver(const char *buf, size_t bsz) 59052493Sdillon{ 5911590Srgrimes static const char magic[] = "WARC/"; 59252493Sdillon const char *c; 59383653Speter unsigned int ver = 0U; 59483653Speter unsigned int end = 0U; 595172759Sjhb 59652493Sdillon if (bsz < 12 || memcmp(buf, magic, sizeof(magic) - 1U) != 0) { 59752493Sdillon /* buffer too small or invalid magic */ 59852493Sdillon return ver; 59952493Sdillon } 60052493Sdillon /* looks good so far, read the version number for a laugh */ 60152493Sdillon buf += sizeof(magic) - 1U; 60252493Sdillon 6031590Srgrimes if (isdigit((unsigned char)buf[0U]) && (buf[1U] == '.') && 60452493Sdillon isdigit((unsigned char)buf[2U])) { 60552493Sdillon /* we support a maximum of 2 digits in the minor version */ 60652493Sdillon if (isdigit((unsigned char)buf[3U])) 607262229Sjhb end = 1U; 608262229Sjhb /* set up major version */ 609262229Sjhb ver = (buf[0U] - '0') * 10000U; 610262229Sjhb /* set up minor version */ 611262229Sjhb if (end == 1U) { 612262229Sjhb ver += (buf[2U] - '0') * 1000U; 613262229Sjhb ver += (buf[3U] - '0') * 100U; 614262229Sjhb } else 615262229Sjhb ver += (buf[2U] - '0') * 100U; 61652493Sdillon /* 61752493Sdillon * WARC below version 0.12 has a space-separated header 61852493Sdillon * WARC 0.12 and above terminates the version with a CRLF 61952493Sdillon */ 62052493Sdillon c = buf + 3U + end; 62152493Sdillon if (ver >= 1200U) { 62252493Sdillon if (memcmp(c, "\r\n", 2U) != 0) 62352493Sdillon ver = 0U; 62452493Sdillon } else if (ver < 1200U) { 62552493Sdillon if (*c != ' ' && *c != '\t') 62652493Sdillon ver = 0U; 62752493Sdillon } 62852493Sdillon } 62952493Sdillon return ver; 63052493Sdillon} 63152493Sdillon 63252493Sdillonstatic unsigned int 63352493Sdillon_warc_rdtyp(const char *buf, size_t bsz) 63483653Speter{ 63552493Sdillon static const char _key[] = "\r\nWARC-Type:"; 63652493Sdillon const char *val, *eol; 63752493Sdillon 63852493Sdillon if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 63983653Speter /* no bother */ 64083653Speter return WT_NONE; 64183653Speter } 64283653Speter val += sizeof(_key) - 1U; 64383653Speter if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 64483653Speter /* no end of line */ 64583653Speter return WT_NONE; 64683653Speter } 64783653Speter 64852493Sdillon /* overread whitespace */ 64983653Speter while (val < eol && (*val == ' ' || *val == '\t')) 65052493Sdillon ++val; 6511590Srgrimes 65252493Sdillon if (val + 8U == eol) { 6531590Srgrimes if (memcmp(val, "resource", 8U) == 0) 6541590Srgrimes return WT_RSRC; 6551590Srgrimes else if (memcmp(val, "response", 8U) == 0) 6561590Srgrimes return WT_RSP; 6571590Srgrimes } 65852493Sdillon return WT_NONE; 6591590Srgrimes} 66052493Sdillon 66152493Sdillonstatic warc_string_t 66252493Sdillon_warc_rduri(const char *buf, size_t bsz) 66352493Sdillon{ 66452493Sdillon static const char _key[] = "\r\nWARC-Target-URI:"; 66552493Sdillon const char *val, *uri, *eol, *p; 66652493Sdillon warc_string_t res = {0U, NULL}; 66752493Sdillon 6681590Srgrimes if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 6691590Srgrimes /* no bother */ 6701590Srgrimes return res; 6711590Srgrimes } 672172759Sjhb /* overread whitespace */ 6731590Srgrimes val += sizeof(_key) - 1U; 6741590Srgrimes if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 675243783Srmacklem /* no end of line */ 6761590Srgrimes return res; 6771590Srgrimes } 67852493Sdillon 67952493Sdillon while (val < eol && (*val == ' ' || *val == '\t')) 68052493Sdillon ++val; 68152493Sdillon 68252493Sdillon /* overread URL designators */ 68352493Sdillon if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) { 68452493Sdillon /* not touching that! */ 68552493Sdillon return res; 68652493Sdillon } 68752493Sdillon 68852493Sdillon /* spaces inside uri are not allowed, CRLF should follow */ 68952493Sdillon for (p = val; p < eol; p++) { 69052493Sdillon if (isspace((unsigned char)*p)) 69152493Sdillon return res; 69252493Sdillon } 69352493Sdillon 69452493Sdillon /* there must be at least space for ftp */ 69552493Sdillon if (uri < (val + 3U)) 69652493Sdillon return res; 69752493Sdillon 69852493Sdillon /* move uri to point to after :// */ 69952493Sdillon uri += 3U; 70052493Sdillon 70152493Sdillon /* now then, inspect the URI */ 70252493Sdillon if (memcmp(val, "file", 4U) == 0) { 70352493Sdillon /* perfect, nothing left to do here */ 70452493Sdillon 70552493Sdillon } else if (memcmp(val, "http", 4U) == 0 || 70652493Sdillon memcmp(val, "ftp", 3U) == 0) { 70752493Sdillon /* overread domain, and the first / */ 70852493Sdillon while (uri < eol && *uri++ != '/'); 70952493Sdillon } else { 71052493Sdillon /* not sure what to do? best to bugger off */ 71152493Sdillon return res; 712192762Srmacklem } 713192762Srmacklem res.str = uri; 714192762Srmacklem res.len = eol - uri; 715192762Srmacklem return res; 716192762Srmacklem} 717192762Srmacklem 718221440Srmacklemstatic ssize_t 719192762Srmacklem_warc_rdlen(const char *buf, size_t bsz) 720221440Srmacklem{ 721221440Srmacklem static const char _key[] = "\r\nContent-Length:"; 722221440Srmacklem const char *val, *eol; 723221440Srmacklem char *on = NULL; 724221440Srmacklem long int len; 725221440Srmacklem 726221440Srmacklem if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 727221440Srmacklem /* no bother */ 728221440Srmacklem return -1; 729192762Srmacklem } 730192762Srmacklem val += sizeof(_key) - 1U; 731192762Srmacklem if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 732192762Srmacklem /* no end of line */ 733192762Srmacklem return -1; 734192762Srmacklem } 735192762Srmacklem 736192762Srmacklem /* skip leading whitespace */ 737192762Srmacklem while (val < eol && (*val == ' ' || *val == '\t')) 738192762Srmacklem val++; 739192762Srmacklem /* there must be at least one digit */ 740192762Srmacklem if (!isdigit((unsigned char)*val)) 741192762Srmacklem return -1; 742192762Srmacklem len = strtol(val, &on, 10); 743192762Srmacklem if (on != eol) { 744192762Srmacklem /* line must end here */ 745192762Srmacklem return -1; 746192762Srmacklem } 747192762Srmacklem 748192762Srmacklem return (size_t)len; 749192762Srmacklem} 750192762Srmacklem 751192762Srmacklemstatic time_t 752192762Srmacklem_warc_rdrtm(const char *buf, size_t bsz) 753192762Srmacklem{ 754192762Srmacklem static const char _key[] = "\r\nWARC-Date:"; 755192762Srmacklem const char *val, *eol; 756192762Srmacklem char *on = NULL; 757192762Srmacklem time_t res; 758192762Srmacklem 759192762Srmacklem if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 760192762Srmacklem /* no bother */ 761192762Srmacklem return (time_t)-1; 762192762Srmacklem } 763192762Srmacklem val += sizeof(_key) - 1U; 764192762Srmacklem if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL ) { 765192762Srmacklem /* no end of line */ 766192762Srmacklem return -1; 767192762Srmacklem } 768192762Srmacklem 769192762Srmacklem /* xstrpisotime() kindly overreads whitespace for us, so use that */ 770192762Srmacklem res = xstrpisotime(val, &on); 771192762Srmacklem if (on != eol) { 772192762Srmacklem /* line must end here */ 773192762Srmacklem return -1; 774192762Srmacklem } 775192762Srmacklem return res; 776192762Srmacklem} 777192762Srmacklem 778192762Srmacklemstatic time_t 779192762Srmacklem_warc_rdmtm(const char *buf, size_t bsz) 780192762Srmacklem{ 781192762Srmacklem static const char _key[] = "\r\nLast-Modified:"; 782192762Srmacklem const char *val, *eol; 783192762Srmacklem char *on = NULL; 784192762Srmacklem time_t res; 785192762Srmacklem 786192762Srmacklem if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 787192762Srmacklem /* no bother */ 788192762Srmacklem return (time_t)-1; 789192762Srmacklem } 790192762Srmacklem val += sizeof(_key) - 1U; 791192762Srmacklem if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL ) { 792192762Srmacklem /* no end of line */ 793192762Srmacklem return -1; 794192762Srmacklem } 795192762Srmacklem 796192762Srmacklem /* xstrpisotime() kindly overreads whitespace for us, so use that */ 797192762Srmacklem res = xstrpisotime(val, &on); 798192762Srmacklem if (on != eol) { 799192762Srmacklem /* line must end here */ 800192762Srmacklem return -1; 801192762Srmacklem } 802192762Srmacklem return res; 803192762Srmacklem} 804192762Srmacklem 805192762Srmacklemstatic const char* 806192762Srmacklem_warc_find_eoh(const char *buf, size_t bsz) 807192762Srmacklem{ 808192762Srmacklem static const char _marker[] = "\r\n\r\n"; 809192762Srmacklem const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U); 810192762Srmacklem 811192762Srmacklem if (hit != NULL) { 812192762Srmacklem hit += sizeof(_marker) - 1U; 813192762Srmacklem } 814192762Srmacklem return hit; 815192762Srmacklem} 816192762Srmacklem 817192762Srmacklemstatic const char* 818192762Srmacklem_warc_find_eol(const char *buf, size_t bsz) 819192762Srmacklem{ 820192762Srmacklem static const char _marker[] = "\r\n"; 821192762Srmacklem const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U); 822192762Srmacklem 823192762Srmacklem return hit; 824192762Srmacklem} 825192762Srmacklem/* archive_read_support_format_warc.c ends here */ 826221454Srmacklem