1299425Smm/*- 2299425Smm * Copyright (c) 2014 Sebastian Freundt 3299425Smm * All rights reserved. 4299425Smm * 5299425Smm * Redistribution and use in source and binary forms, with or without 6299425Smm * modification, are permitted provided that the following conditions 7299425Smm * are met: 8299425Smm * 1. Redistributions of source code must retain the above copyright 9299425Smm * notice, this list of conditions and the following disclaimer. 10299425Smm * 2. Redistributions in binary form must reproduce the above copyright 11299425Smm * notice, this list of conditions and the following disclaimer in the 12299425Smm * documentation and/or other materials provided with the distribution. 13299425Smm * 14299425Smm * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15299425Smm * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16299425Smm * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17299425Smm * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18299425Smm * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19299425Smm * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20299425Smm * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21299425Smm * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22299425Smm * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23299425Smm * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24299425Smm */ 25299425Smm 26299425Smm#include "archive_platform.h" 27299425Smm__FBSDID("$FreeBSD: stable/10/contrib/libarchive/libarchive/archive_read_support_format_warc.c 368708 2020-12-16 22:25:40Z mm $"); 28299425Smm 29299425Smm/** 30299425Smm * WARC is standardised by ISO TC46/SC4/WG12 and currently available as 31299425Smm * ISO 28500:2009. 32299425Smm * For the purposes of this file we used the final draft from: 33299425Smm * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf 34299425Smm * 35299425Smm * Todo: 36299425Smm * [ ] real-world warcs can contain resources at endpoints ending in / 37299425Smm * e.g. http://bibnum.bnf.fr/warc/ 38299425Smm * if you're lucky their response contains a Content-Location: header 39299425Smm * pointing to a unix-compliant filename, in the example above it's 40299425Smm * Content-Location: http://bibnum.bnf.fr/warc/index.html 41299425Smm * however, that's not mandated and github for example doesn't follow 42299425Smm * this convention. 43299425Smm * We need a set of archive options to control what to do with 44299425Smm * entries like these, at the moment care is taken to skip them. 45299425Smm * 46299425Smm **/ 47299425Smm 48299425Smm#ifdef HAVE_SYS_STAT_H 49299425Smm#include <sys/stat.h> 50299425Smm#endif 51299425Smm#ifdef HAVE_ERRNO_H 52299425Smm#include <errno.h> 53299425Smm#endif 54299425Smm#ifdef HAVE_STDLIB_H 55299425Smm#include <stdlib.h> 56299425Smm#endif 57299425Smm#ifdef HAVE_STRING_H 58299425Smm#include <string.h> 59299425Smm#endif 60299425Smm#ifdef HAVE_LIMITS_H 61299425Smm#include <limits.h> 62299425Smm#endif 63299425Smm#ifdef HAVE_CTYPE_H 64299425Smm#include <ctype.h> 65299425Smm#endif 66299425Smm#ifdef HAVE_TIME_H 67299425Smm#include <time.h> 68299425Smm#endif 69299425Smm 70299425Smm#include "archive.h" 71299425Smm#include "archive_entry.h" 72299425Smm#include "archive_private.h" 73299425Smm#include "archive_read_private.h" 74299425Smm 75299425Smmtypedef enum { 76299425Smm WT_NONE, 77299425Smm /* warcinfo */ 78299425Smm WT_INFO, 79299425Smm /* metadata */ 80299425Smm WT_META, 81299425Smm /* resource */ 82299425Smm WT_RSRC, 83299425Smm /* request, unsupported */ 84299425Smm WT_REQ, 85299425Smm /* response, unsupported */ 86299425Smm WT_RSP, 87299425Smm /* revisit, unsupported */ 88299425Smm WT_RVIS, 89299425Smm /* conversion, unsupported */ 90299425Smm WT_CONV, 91313571Smm /* continuation, unsupported at the moment */ 92299425Smm WT_CONT, 93299425Smm /* invalid type */ 94299425Smm LAST_WT 95299425Smm} warc_type_t; 96299425Smm 97299425Smmtypedef struct { 98299425Smm size_t len; 99299425Smm const char *str; 100299425Smm} warc_string_t; 101299425Smm 102299425Smmtypedef struct { 103299425Smm size_t len; 104299425Smm char *str; 105299425Smm} warc_strbuf_t; 106299425Smm 107299425Smmstruct warc_s { 108299425Smm /* content length ahead */ 109299425Smm size_t cntlen; 110299425Smm /* and how much we've processed so far */ 111299425Smm size_t cntoff; 112299425Smm /* and how much we need to consume between calls */ 113299425Smm size_t unconsumed; 114299425Smm 115299425Smm /* string pool */ 116299425Smm warc_strbuf_t pool; 117299425Smm /* previous version */ 118299425Smm unsigned int pver; 119299425Smm /* stringified format name */ 120299425Smm struct archive_string sver; 121299425Smm}; 122299425Smm 123299425Smmstatic int _warc_bid(struct archive_read *a, int); 124299425Smmstatic int _warc_cleanup(struct archive_read *a); 125299425Smmstatic int _warc_read(struct archive_read*, const void**, size_t*, int64_t*); 126299425Smmstatic int _warc_skip(struct archive_read *a); 127299425Smmstatic int _warc_rdhdr(struct archive_read *a, struct archive_entry *e); 128299425Smm 129299425Smm/* private routines */ 130299425Smmstatic unsigned int _warc_rdver(const char buf[10], size_t bsz); 131299425Smmstatic unsigned int _warc_rdtyp(const char *buf, size_t bsz); 132299425Smmstatic warc_string_t _warc_rduri(const char *buf, size_t bsz); 133299425Smmstatic ssize_t _warc_rdlen(const char *buf, size_t bsz); 134299425Smmstatic time_t _warc_rdrtm(const char *buf, size_t bsz); 135299425Smmstatic time_t _warc_rdmtm(const char *buf, size_t bsz); 136299425Smmstatic const char *_warc_find_eoh(const char *buf, size_t bsz); 137313929Smmstatic const char *_warc_find_eol(const char *buf, size_t bsz); 138299425Smm 139299425Smmint 140299425Smmarchive_read_support_format_warc(struct archive *_a) 141299425Smm{ 142299425Smm struct archive_read *a = (struct archive_read *)_a; 143299425Smm struct warc_s *w; 144299425Smm int r; 145299425Smm 146299425Smm archive_check_magic(_a, ARCHIVE_READ_MAGIC, 147299425Smm ARCHIVE_STATE_NEW, "archive_read_support_format_warc"); 148299425Smm 149311042Smm if ((w = calloc(1, sizeof(*w))) == NULL) { 150299425Smm archive_set_error(&a->archive, ENOMEM, 151299425Smm "Can't allocate warc data"); 152299425Smm return (ARCHIVE_FATAL); 153299425Smm } 154299425Smm 155299425Smm r = __archive_read_register_format( 156299425Smm a, w, "warc", 157299425Smm _warc_bid, NULL, _warc_rdhdr, _warc_read, 158299425Smm _warc_skip, NULL, _warc_cleanup, NULL, NULL); 159299425Smm 160299425Smm if (r != ARCHIVE_OK) { 161299425Smm free(w); 162299425Smm return (r); 163299425Smm } 164299425Smm return (ARCHIVE_OK); 165299425Smm} 166299425Smm 167299425Smmstatic int 168299425Smm_warc_cleanup(struct archive_read *a) 169299425Smm{ 170299425Smm struct warc_s *w = a->format->data; 171299425Smm 172299425Smm if (w->pool.len > 0U) { 173299425Smm free(w->pool.str); 174299425Smm } 175299425Smm archive_string_free(&w->sver); 176299425Smm free(w); 177299425Smm a->format->data = NULL; 178299425Smm return (ARCHIVE_OK); 179299425Smm} 180299425Smm 181299425Smmstatic int 182299425Smm_warc_bid(struct archive_read *a, int best_bid) 183299425Smm{ 184299425Smm const char *hdr; 185299425Smm ssize_t nrd; 186299425Smm unsigned int ver; 187299425Smm 188299425Smm (void)best_bid; /* UNUSED */ 189299425Smm 190299425Smm /* check first line of file, it should be a record already */ 191299425Smm if ((hdr = __archive_read_ahead(a, 12U, &nrd)) == NULL) { 192299425Smm /* no idea what to do */ 193299425Smm return -1; 194299425Smm } else if (nrd < 12) { 195299425Smm /* nah, not for us, our magic cookie is at least 12 bytes */ 196299425Smm return -1; 197299425Smm } 198299425Smm 199299425Smm /* otherwise snarf the record's version number */ 200299425Smm ver = _warc_rdver(hdr, nrd); 201313929Smm if (ver < 1200U || ver > 10000U) { 202313929Smm /* we only support WARC 0.12 to 1.0 */ 203299425Smm return -1; 204299425Smm } 205299425Smm 206299425Smm /* otherwise be confident */ 207299425Smm return (64); 208299425Smm} 209299425Smm 210299425Smmstatic int 211299425Smm_warc_rdhdr(struct archive_read *a, struct archive_entry *entry) 212299425Smm{ 213299425Smm#define HDR_PROBE_LEN (12U) 214299425Smm struct warc_s *w = a->format->data; 215299425Smm unsigned int ver; 216299425Smm const char *buf; 217299425Smm ssize_t nrd; 218299425Smm const char *eoh; 219299425Smm /* for the file name, saves some strndup()'ing */ 220299425Smm warc_string_t fnam; 221299425Smm /* warc record type, not that we really use it a lot */ 222299425Smm warc_type_t ftyp; 223299425Smm /* content-length+error monad */ 224299425Smm ssize_t cntlen; 225299425Smm /* record time is the WARC-Date time we reinterpret it as ctime */ 226299425Smm time_t rtime; 227299425Smm /* mtime is the Last-Modified time which will be the entry's mtime */ 228299425Smm time_t mtime; 229299425Smm 230299425Smmstart_over: 231299425Smm /* just use read_ahead() they keep track of unconsumed 232299425Smm * bits and bobs for us; no need to put an extra shift in 233299425Smm * and reproduce that functionality here */ 234299425Smm buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd); 235299425Smm 236299425Smm if (nrd < 0) { 237299425Smm /* no good */ 238299425Smm archive_set_error( 239299425Smm &a->archive, ARCHIVE_ERRNO_MISC, 240299425Smm "Bad record header"); 241299425Smm return (ARCHIVE_FATAL); 242299425Smm } else if (buf == NULL) { 243299425Smm /* there should be room for at least WARC/bla\r\n 244299425Smm * must be EOF therefore */ 245299425Smm return (ARCHIVE_EOF); 246299425Smm } 247299425Smm /* looks good so far, try and find the end of the header now */ 248299425Smm eoh = _warc_find_eoh(buf, nrd); 249299425Smm if (eoh == NULL) { 250299425Smm /* still no good, the header end might be beyond the 251299425Smm * probe we've requested, but then again who'd cram 252299425Smm * so much stuff into the header *and* be 28500-compliant */ 253299425Smm archive_set_error( 254299425Smm &a->archive, ARCHIVE_ERRNO_MISC, 255299425Smm "Bad record header"); 256299425Smm return (ARCHIVE_FATAL); 257313929Smm } 258313929Smm ver = _warc_rdver(buf, eoh - buf); 259313929Smm /* we currently support WARC 0.12 to 1.0 */ 260313929Smm if (ver == 0U) { 261299425Smm archive_set_error( 262299425Smm &a->archive, ARCHIVE_ERRNO_MISC, 263313929Smm "Invalid record version"); 264299425Smm return (ARCHIVE_FATAL); 265313929Smm } else if (ver < 1200U || ver > 10000U) { 266313929Smm archive_set_error( 267313929Smm &a->archive, ARCHIVE_ERRNO_MISC, 268313929Smm "Unsupported record version: %u.%u", 269313929Smm ver / 10000, (ver % 10000) / 100); 270313929Smm return (ARCHIVE_FATAL); 271313929Smm } 272313929Smm cntlen = _warc_rdlen(buf, eoh - buf); 273313929Smm if (cntlen < 0) { 274299425Smm /* nightmare! the specs say content-length is mandatory 275299425Smm * so I don't feel overly bad stopping the reader here */ 276299425Smm archive_set_error( 277299425Smm &a->archive, EINVAL, 278299425Smm "Bad content length"); 279299425Smm return (ARCHIVE_FATAL); 280313929Smm } 281313929Smm rtime = _warc_rdrtm(buf, eoh - buf); 282313929Smm if (rtime == (time_t)-1) { 283299425Smm /* record time is mandatory as per WARC/1.0, 284299425Smm * so just barf here, fast and loud */ 285299425Smm archive_set_error( 286299425Smm &a->archive, EINVAL, 287299425Smm "Bad record time"); 288299425Smm return (ARCHIVE_FATAL); 289299425Smm } 290299425Smm 291299425Smm /* let the world know we're a WARC archive */ 292299425Smm a->archive.archive_format = ARCHIVE_FORMAT_WARC; 293299425Smm if (ver != w->pver) { 294299425Smm /* stringify this entry's version */ 295299425Smm archive_string_sprintf(&w->sver, 296313929Smm "WARC/%u.%u", ver / 10000, (ver % 10000) / 100); 297299425Smm /* remember the version */ 298299425Smm w->pver = ver; 299299425Smm } 300299425Smm /* start off with the type */ 301299425Smm ftyp = _warc_rdtyp(buf, eoh - buf); 302299425Smm /* and let future calls know about the content */ 303299425Smm w->cntlen = cntlen; 304299425Smm w->cntoff = 0U; 305299425Smm mtime = 0;/* Avoid compiling error on some platform. */ 306299425Smm 307299425Smm switch (ftyp) { 308299425Smm case WT_RSRC: 309299425Smm case WT_RSP: 310299425Smm /* only try and read the filename in the cases that are 311299425Smm * guaranteed to have one */ 312299425Smm fnam = _warc_rduri(buf, eoh - buf); 313299425Smm /* check the last character in the URI to avoid creating 314299425Smm * directory endpoints as files, see Todo above */ 315299425Smm if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') { 316299425Smm /* break here for now */ 317299425Smm fnam.len = 0U; 318299425Smm fnam.str = NULL; 319299425Smm break; 320299425Smm } 321299425Smm /* bang to our string pool, so we save a 322299425Smm * malloc()+free() roundtrip */ 323299425Smm if (fnam.len + 1U > w->pool.len) { 324299425Smm w->pool.len = ((fnam.len + 64U) / 64U) * 64U; 325299425Smm w->pool.str = realloc(w->pool.str, w->pool.len); 326299425Smm } 327299425Smm memcpy(w->pool.str, fnam.str, fnam.len); 328299425Smm w->pool.str[fnam.len] = '\0'; 329305192Smm /* let no one else know about the pool, it's a secret, shhh */ 330299425Smm fnam.str = w->pool.str; 331299425Smm 332299425Smm /* snarf mtime or deduce from rtime 333299425Smm * this is a custom header added by our writer, it's quite 334299425Smm * hard to believe anyone else would go through with it 335299425Smm * (apart from being part of some http responses of course) */ 336299425Smm if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) { 337299425Smm mtime = rtime; 338299425Smm } 339299425Smm break; 340368708Smm case WT_NONE: 341368708Smm case WT_INFO: 342368708Smm case WT_META: 343368708Smm case WT_REQ: 344368708Smm case WT_RVIS: 345368708Smm case WT_CONV: 346368708Smm case WT_CONT: 347368708Smm case LAST_WT: 348299425Smm default: 349299425Smm fnam.len = 0U; 350299425Smm fnam.str = NULL; 351299425Smm break; 352299425Smm } 353299425Smm 354299425Smm /* now eat some of those delicious buffer bits */ 355299425Smm __archive_read_consume(a, eoh - buf); 356299425Smm 357299425Smm switch (ftyp) { 358299425Smm case WT_RSRC: 359299425Smm case WT_RSP: 360299425Smm if (fnam.len > 0U) { 361299425Smm /* populate entry object */ 362299425Smm archive_entry_set_filetype(entry, AE_IFREG); 363299425Smm archive_entry_copy_pathname(entry, fnam.str); 364299425Smm archive_entry_set_size(entry, cntlen); 365299425Smm archive_entry_set_perm(entry, 0644); 366299425Smm /* rtime is the new ctime, mtime stays mtime */ 367299425Smm archive_entry_set_ctime(entry, rtime, 0L); 368299425Smm archive_entry_set_mtime(entry, mtime, 0L); 369299425Smm break; 370299425Smm } 371299425Smm /* FALLTHROUGH */ 372368708Smm case WT_NONE: 373368708Smm case WT_INFO: 374368708Smm case WT_META: 375368708Smm case WT_REQ: 376368708Smm case WT_RVIS: 377368708Smm case WT_CONV: 378368708Smm case WT_CONT: 379368708Smm case LAST_WT: 380299425Smm default: 381299425Smm /* consume the content and start over */ 382299425Smm _warc_skip(a); 383299425Smm goto start_over; 384299425Smm } 385299425Smm return (ARCHIVE_OK); 386299425Smm} 387299425Smm 388299425Smmstatic int 389299425Smm_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off) 390299425Smm{ 391299425Smm struct warc_s *w = a->format->data; 392299425Smm const char *rab; 393299425Smm ssize_t nrd; 394299425Smm 395299425Smm if (w->cntoff >= w->cntlen) { 396299425Smm eof: 397299425Smm /* it's our lucky day, no work, we can leave early */ 398299425Smm *buf = NULL; 399299425Smm *bsz = 0U; 400299425Smm *off = w->cntoff + 4U/*for \r\n\r\n separator*/; 401299425Smm w->unconsumed = 0U; 402299425Smm return (ARCHIVE_EOF); 403299425Smm } 404299425Smm 405299425Smm rab = __archive_read_ahead(a, 1U, &nrd); 406299425Smm if (nrd < 0) { 407299425Smm *bsz = 0U; 408299425Smm /* big catastrophe */ 409299425Smm return (int)nrd; 410299425Smm } else if (nrd == 0) { 411299425Smm goto eof; 412299425Smm } else if ((size_t)nrd > w->cntlen - w->cntoff) { 413299425Smm /* clamp to content-length */ 414299425Smm nrd = w->cntlen - w->cntoff; 415299425Smm } 416299425Smm *off = w->cntoff; 417299425Smm *bsz = nrd; 418299425Smm *buf = rab; 419299425Smm 420299425Smm w->cntoff += nrd; 421299425Smm w->unconsumed = (size_t)nrd; 422299425Smm return (ARCHIVE_OK); 423299425Smm} 424299425Smm 425299425Smmstatic int 426299425Smm_warc_skip(struct archive_read *a) 427299425Smm{ 428299425Smm struct warc_s *w = a->format->data; 429299425Smm 430299425Smm __archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/); 431299425Smm w->cntlen = 0U; 432299425Smm w->cntoff = 0U; 433299425Smm return (ARCHIVE_OK); 434299425Smm} 435299425Smm 436299425Smm 437299425Smm/* private routines */ 438299425Smmstatic void* 439299425Smmdeconst(const void *c) 440299425Smm{ 441299425Smm return (char *)0x1 + (((const char *)c) - (const char *)0x1); 442299425Smm} 443299425Smm 444299425Smmstatic char* 445299425Smmxmemmem(const char *hay, const size_t haysize, 446299425Smm const char *needle, const size_t needlesize) 447299425Smm{ 448299425Smm const char *const eoh = hay + haysize; 449299425Smm const char *const eon = needle + needlesize; 450299425Smm const char *hp; 451299425Smm const char *np; 452299425Smm const char *cand; 453299425Smm unsigned int hsum; 454299425Smm unsigned int nsum; 455299425Smm unsigned int eqp; 456299425Smm 457299425Smm /* trivial checks first 458299425Smm * a 0-sized needle is defined to be found anywhere in haystack 459299425Smm * then run strchr() to find a candidate in HAYSTACK (i.e. a portion 460299425Smm * that happens to begin with *NEEDLE) */ 461299425Smm if (needlesize == 0UL) { 462299425Smm return deconst(hay); 463299425Smm } else if ((hay = memchr(hay, *needle, haysize)) == NULL) { 464299425Smm /* trivial */ 465299425Smm return NULL; 466299425Smm } 467299425Smm 468299425Smm /* First characters of haystack and needle are the same now. Both are 469299425Smm * guaranteed to be at least one character long. Now computes the sum 470299425Smm * of characters values of needle together with the sum of the first 471299425Smm * needle_len characters of haystack. */ 472299425Smm for (hp = hay + 1U, np = needle + 1U, hsum = *hay, nsum = *hay, eqp = 1U; 473299425Smm hp < eoh && np < eon; 474299425Smm hsum ^= *hp, nsum ^= *np, eqp &= *hp == *np, hp++, np++); 475299425Smm 476299425Smm /* HP now references the (NEEDLESIZE + 1)-th character. */ 477299425Smm if (np < eon) { 478299425Smm /* haystack is smaller than needle, :O */ 479299425Smm return NULL; 480299425Smm } else if (eqp) { 481299425Smm /* found a match */ 482299425Smm return deconst(hay); 483299425Smm } 484299425Smm 485299425Smm /* now loop through the rest of haystack, 486299425Smm * updating the sum iteratively */ 487299425Smm for (cand = hay; hp < eoh; hp++) { 488299425Smm hsum ^= *cand++; 489299425Smm hsum ^= *hp; 490299425Smm 491299425Smm /* Since the sum of the characters is already known to be 492299425Smm * equal at that point, it is enough to check just NEEDLESIZE - 1 493299425Smm * characters for equality, 494299425Smm * also CAND is by design < HP, so no need for range checks */ 495299425Smm if (hsum == nsum && memcmp(cand, needle, needlesize - 1U) == 0) { 496299425Smm return deconst(cand); 497299425Smm } 498299425Smm } 499299425Smm return NULL; 500299425Smm} 501299425Smm 502299425Smmstatic int 503299425Smmstrtoi_lim(const char *str, const char **ep, int llim, int ulim) 504299425Smm{ 505299425Smm int res = 0; 506299425Smm const char *sp; 507299425Smm /* we keep track of the number of digits via rulim */ 508299425Smm int rulim; 509299425Smm 510299425Smm for (sp = str, rulim = ulim > 10 ? ulim : 10; 511299425Smm res * 10 <= ulim && rulim && *sp >= '0' && *sp <= '9'; 512299425Smm sp++, rulim /= 10) { 513299425Smm res *= 10; 514299425Smm res += *sp - '0'; 515299425Smm } 516299425Smm if (sp == str) { 517299425Smm res = -1; 518299425Smm } else if (res < llim || res > ulim) { 519299425Smm res = -2; 520299425Smm } 521299425Smm *ep = (const char*)sp; 522299425Smm return res; 523299425Smm} 524299425Smm 525299425Smmstatic time_t 526299425Smmtime_from_tm(struct tm *t) 527299425Smm{ 528299425Smm#if HAVE_TIMEGM 529299425Smm /* Use platform timegm() if available. */ 530299425Smm return (timegm(t)); 531299425Smm#elif HAVE__MKGMTIME64 532299425Smm return (_mkgmtime64(t)); 533299425Smm#else 534299425Smm /* Else use direct calculation using POSIX assumptions. */ 535299425Smm /* First, fix up tm_yday based on the year/month/day. */ 536299425Smm if (mktime(t) == (time_t)-1) 537299425Smm return ((time_t)-1); 538299425Smm /* Then we can compute timegm() from first principles. */ 539299425Smm return (t->tm_sec 540299425Smm + t->tm_min * 60 541299425Smm + t->tm_hour * 3600 542299425Smm + t->tm_yday * 86400 543299425Smm + (t->tm_year - 70) * 31536000 544299425Smm + ((t->tm_year - 69) / 4) * 86400 545299425Smm - ((t->tm_year - 1) / 100) * 86400 546299425Smm + ((t->tm_year + 299) / 400) * 86400); 547299425Smm#endif 548299425Smm} 549299425Smm 550299425Smmstatic time_t 551299425Smmxstrpisotime(const char *s, char **endptr) 552299425Smm{ 553299425Smm/** like strptime() but strictly for ISO 8601 Zulu strings */ 554299425Smm struct tm tm; 555299425Smm time_t res = (time_t)-1; 556299425Smm 557299425Smm /* make sure tm is clean */ 558299425Smm memset(&tm, 0, sizeof(tm)); 559299425Smm 560299425Smm /* as a courtesy to our callers, and since this is a non-standard 561299425Smm * routine, we skip leading whitespace */ 562315433Smm while (*s == ' ' || *s == '\t') 563302295Smm ++s; 564299425Smm 565299425Smm /* read year */ 566299425Smm if ((tm.tm_year = strtoi_lim(s, &s, 1583, 4095)) < 0 || *s++ != '-') { 567299425Smm goto out; 568299425Smm } 569299425Smm /* read month */ 570299425Smm if ((tm.tm_mon = strtoi_lim(s, &s, 1, 12)) < 0 || *s++ != '-') { 571299425Smm goto out; 572299425Smm } 573299425Smm /* read day-of-month */ 574299425Smm if ((tm.tm_mday = strtoi_lim(s, &s, 1, 31)) < 0 || *s++ != 'T') { 575299425Smm goto out; 576299425Smm } 577299425Smm /* read hour */ 578299425Smm if ((tm.tm_hour = strtoi_lim(s, &s, 0, 23)) < 0 || *s++ != ':') { 579299425Smm goto out; 580299425Smm } 581299425Smm /* read minute */ 582299425Smm if ((tm.tm_min = strtoi_lim(s, &s, 0, 59)) < 0 || *s++ != ':') { 583299425Smm goto out; 584299425Smm } 585299425Smm /* read second */ 586299425Smm if ((tm.tm_sec = strtoi_lim(s, &s, 0, 60)) < 0 || *s++ != 'Z') { 587299425Smm goto out; 588299425Smm } 589299425Smm 590313571Smm /* massage TM to fulfill some of POSIX' constraints */ 591299425Smm tm.tm_year -= 1900; 592299425Smm tm.tm_mon--; 593299425Smm 594299425Smm /* now convert our custom tm struct to a unix stamp using UTC */ 595299425Smm res = time_from_tm(&tm); 596299425Smm 597299425Smmout: 598299425Smm if (endptr != NULL) { 599299425Smm *endptr = deconst(s); 600299425Smm } 601299425Smm return res; 602299425Smm} 603299425Smm 604299425Smmstatic unsigned int 605313929Smm_warc_rdver(const char *buf, size_t bsz) 606299425Smm{ 607299425Smm static const char magic[] = "WARC/"; 608315433Smm const char *c; 609313929Smm unsigned int ver = 0U; 610313929Smm unsigned int end = 0U; 611299425Smm 612313929Smm if (bsz < 12 || memcmp(buf, magic, sizeof(magic) - 1U) != 0) { 613313929Smm /* buffer too small or invalid magic */ 614313929Smm return ver; 615299425Smm } 616299425Smm /* looks good so far, read the version number for a laugh */ 617299425Smm buf += sizeof(magic) - 1U; 618299425Smm 619315433Smm if (isdigit((unsigned char)buf[0U]) && (buf[1U] == '.') && 620315433Smm isdigit((unsigned char)buf[2U])) { 621313929Smm /* we support a maximum of 2 digits in the minor version */ 622315433Smm if (isdigit((unsigned char)buf[3U])) 623313929Smm end = 1U; 624313929Smm /* set up major version */ 625313929Smm ver = (buf[0U] - '0') * 10000U; 626313929Smm /* set up minor version */ 627313929Smm if (end == 1U) { 628313929Smm ver += (buf[2U] - '0') * 1000U; 629313929Smm ver += (buf[3U] - '0') * 100U; 630313929Smm } else 631313929Smm ver += (buf[2U] - '0') * 100U; 632313929Smm /* 633313929Smm * WARC below version 0.12 has a space-separated header 634313929Smm * WARC 0.12 and above terminates the version with a CRLF 635313929Smm */ 636315433Smm c = buf + 3U + end; 637313929Smm if (ver >= 1200U) { 638315433Smm if (memcmp(c, "\r\n", 2U) != 0) 639313929Smm ver = 0U; 640358090Smm } else { 641358090Smm /* ver < 1200U */ 642315433Smm if (*c != ' ' && *c != '\t') 643313929Smm ver = 0U; 644299425Smm } 645299425Smm } 646299425Smm return ver; 647299425Smm} 648299425Smm 649299425Smmstatic unsigned int 650299425Smm_warc_rdtyp(const char *buf, size_t bsz) 651299425Smm{ 652299425Smm static const char _key[] = "\r\nWARC-Type:"; 653313929Smm const char *val, *eol; 654299425Smm 655299425Smm if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 656299425Smm /* no bother */ 657299425Smm return WT_NONE; 658299425Smm } 659313929Smm val += sizeof(_key) - 1U; 660313929Smm if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 661313929Smm /* no end of line */ 662313929Smm return WT_NONE; 663313929Smm } 664313929Smm 665299425Smm /* overread whitespace */ 666315433Smm while (val < eol && (*val == ' ' || *val == '\t')) 667302295Smm ++val; 668299425Smm 669313929Smm if (val + 8U == eol) { 670313929Smm if (memcmp(val, "resource", 8U) == 0) 671313929Smm return WT_RSRC; 672313929Smm else if (memcmp(val, "response", 8U) == 0) 673313929Smm return WT_RSP; 674299425Smm } 675299425Smm return WT_NONE; 676299425Smm} 677299425Smm 678299425Smmstatic warc_string_t 679299425Smm_warc_rduri(const char *buf, size_t bsz) 680299425Smm{ 681299425Smm static const char _key[] = "\r\nWARC-Target-URI:"; 682313929Smm const char *val, *uri, *eol, *p; 683299425Smm warc_string_t res = {0U, NULL}; 684299425Smm 685299425Smm if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 686299425Smm /* no bother */ 687299425Smm return res; 688299425Smm } 689299425Smm /* overread whitespace */ 690302295Smm val += sizeof(_key) - 1U; 691313929Smm if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 692313929Smm /* no end of line */ 693313929Smm return res; 694313929Smm } 695313929Smm 696315433Smm while (val < eol && (*val == ' ' || *val == '\t')) 697302295Smm ++val; 698299425Smm 699299425Smm /* overread URL designators */ 700313929Smm if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) { 701299425Smm /* not touching that! */ 702299425Smm return res; 703299425Smm } 704299425Smm 705313929Smm /* spaces inside uri are not allowed, CRLF should follow */ 706313929Smm for (p = val; p < eol; p++) { 707315433Smm if (isspace((unsigned char)*p)) 708313929Smm return res; 709313929Smm } 710313929Smm 711313929Smm /* there must be at least space for ftp */ 712313929Smm if (uri < (val + 3U)) 713313929Smm return res; 714313929Smm 715313929Smm /* move uri to point to after :// */ 716299425Smm uri += 3U; 717299425Smm 718299425Smm /* now then, inspect the URI */ 719299425Smm if (memcmp(val, "file", 4U) == 0) { 720299425Smm /* perfect, nothing left to do here */ 721299425Smm 722299425Smm } else if (memcmp(val, "http", 4U) == 0 || 723299425Smm memcmp(val, "ftp", 3U) == 0) { 724299425Smm /* overread domain, and the first / */ 725299425Smm while (uri < eol && *uri++ != '/'); 726299425Smm } else { 727299425Smm /* not sure what to do? best to bugger off */ 728299425Smm return res; 729299425Smm } 730299425Smm res.str = uri; 731299425Smm res.len = eol - uri; 732299425Smm return res; 733299425Smm} 734299425Smm 735299425Smmstatic ssize_t 736299425Smm_warc_rdlen(const char *buf, size_t bsz) 737299425Smm{ 738299425Smm static const char _key[] = "\r\nContent-Length:"; 739313929Smm const char *val, *eol; 740299425Smm char *on = NULL; 741299425Smm long int len; 742299425Smm 743299425Smm if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 744299425Smm /* no bother */ 745299425Smm return -1; 746299425Smm } 747313929Smm val += sizeof(_key) - 1U; 748313929Smm if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 749313929Smm /* no end of line */ 750313929Smm return -1; 751313929Smm } 752299425Smm 753313929Smm /* skip leading whitespace */ 754315433Smm while (val < eol && (*val == ' ' || *val == '\t')) 755313929Smm val++; 756313929Smm /* there must be at least one digit */ 757315433Smm if (!isdigit((unsigned char)*val)) 758313929Smm return -1; 759348608Smm errno = 0; 760299425Smm len = strtol(val, &on, 10); 761348608Smm if (errno != 0 || on != eol) { 762313929Smm /* line must end here */ 763299425Smm return -1; 764299425Smm } 765313929Smm 766299425Smm return (size_t)len; 767299425Smm} 768299425Smm 769299425Smmstatic time_t 770299425Smm_warc_rdrtm(const char *buf, size_t bsz) 771299425Smm{ 772299425Smm static const char _key[] = "\r\nWARC-Date:"; 773313929Smm const char *val, *eol; 774299425Smm char *on = NULL; 775299425Smm time_t res; 776299425Smm 777299425Smm if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 778299425Smm /* no bother */ 779299425Smm return (time_t)-1; 780299425Smm } 781313929Smm val += sizeof(_key) - 1U; 782313929Smm if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL ) { 783313929Smm /* no end of line */ 784313929Smm return -1; 785313929Smm } 786299425Smm 787299425Smm /* xstrpisotime() kindly overreads whitespace for us, so use that */ 788299425Smm res = xstrpisotime(val, &on); 789313929Smm if (on != eol) { 790313929Smm /* line must end here */ 791313929Smm return -1; 792299425Smm } 793299425Smm return res; 794299425Smm} 795299425Smm 796299425Smmstatic time_t 797299425Smm_warc_rdmtm(const char *buf, size_t bsz) 798299425Smm{ 799299425Smm static const char _key[] = "\r\nLast-Modified:"; 800313929Smm const char *val, *eol; 801299425Smm char *on = NULL; 802299425Smm time_t res; 803299425Smm 804299425Smm if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 805299425Smm /* no bother */ 806299425Smm return (time_t)-1; 807299425Smm } 808313929Smm val += sizeof(_key) - 1U; 809313929Smm if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL ) { 810313929Smm /* no end of line */ 811313929Smm return -1; 812313929Smm } 813299425Smm 814299425Smm /* xstrpisotime() kindly overreads whitespace for us, so use that */ 815299425Smm res = xstrpisotime(val, &on); 816313929Smm if (on != eol) { 817313929Smm /* line must end here */ 818313929Smm return -1; 819299425Smm } 820299425Smm return res; 821299425Smm} 822299425Smm 823299425Smmstatic const char* 824299425Smm_warc_find_eoh(const char *buf, size_t bsz) 825299425Smm{ 826299425Smm static const char _marker[] = "\r\n\r\n"; 827299425Smm const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U); 828299425Smm 829299425Smm if (hit != NULL) { 830299425Smm hit += sizeof(_marker) - 1U; 831299425Smm } 832299425Smm return hit; 833299425Smm} 834299425Smm 835313929Smmstatic const char* 836313929Smm_warc_find_eol(const char *buf, size_t bsz) 837313929Smm{ 838313929Smm static const char _marker[] = "\r\n"; 839313929Smm const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U); 840313929Smm 841313929Smm return hit; 842313929Smm} 843299425Smm/* archive_read_support_format_warc.c ends here */ 844