/* archive_read_support_format_warc.c revision 358090 */
1126209Sache/*- 2146040Stjr * Copyright (c) 2014 Sebastian Freundt 3250724Sjkim * All rights reserved. 4146040Stjr * 5146040Stjr * Redistribution and use in source and binary forms, with or without 6126209Sache * modification, are permitted provided that the following conditions 7126209Sache * are met: 8146040Stjr * 1. Redistributions of source code must retain the above copyright 9146040Stjr * notice, this list of conditions and the following disclaimer. 10146040Stjr * 2. Redistributions in binary form must reproduce the above copyright 11126209Sache * notice, this list of conditions and the following disclaimer in the 12126209Sache * documentation and/or other materials provided with the distribution. 13126209Sache * 14126209Sache * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15146040Stjr * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16126209Sache * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17146040Stjr * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18250724Sjkim * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19250724Sjkim * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20126209Sache * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21126209Sache * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22126209Sache * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23126209Sache * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24146040Stjr */ 25146040Stjr 26126209Sache#include "archive_platform.h" 27126209Sache__FBSDID("$FreeBSD: stable/10/contrib/libarchive/libarchive/archive_read_support_format_warc.c 358090 2020-02-19 01:51:44Z mm $"); 28126209Sache 29126209Sache/** 30126209Sache * WARC is standardised by ISO TC46/SC4/WG12 and currently available as 31126209Sache * ISO 28500:2009. 
32126209Sache * For the purposes of this file we used the final draft from: 33126209Sache * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf 34126209Sache * 35126209Sache * Todo: 36126209Sache * [ ] real-world warcs can contain resources at endpoints ending in / 37126209Sache * e.g. http://bibnum.bnf.fr/warc/ 38126209Sache * if you're lucky their response contains a Content-Location: header 39126209Sache * pointing to a unix-compliant filename, in the example above it's 40126209Sache * Content-Location: http://bibnum.bnf.fr/warc/index.html 41126209Sache * however, that's not mandated and github for example doesn't follow 42126209Sache * this convention. 43126209Sache * We need a set of archive options to control what to do with 44126209Sache * entries like these, at the moment care is taken to skip them. 45250724Sjkim * 46126209Sache **/ 47126209Sache 48250724Sjkim#ifdef HAVE_SYS_STAT_H 49126209Sache#include <sys/stat.h> 50126209Sache#endif 51126209Sache#ifdef HAVE_ERRNO_H 52126209Sache#include <errno.h> 53250724Sjkim#endif 54126209Sache#ifdef HAVE_STDLIB_H 55126209Sache#include <stdlib.h> 56126209Sache#endif 57126209Sache#ifdef HAVE_STRING_H 58126209Sache#include <string.h> 59250724Sjkim#endif 60126209Sache#ifdef HAVE_LIMITS_H 61126209Sache#include <limits.h> 62126209Sache#endif 63126209Sache#ifdef HAVE_CTYPE_H 64250724Sjkim#include <ctype.h> 65250724Sjkim#endif 66250724Sjkim#ifdef HAVE_TIME_H 67250724Sjkim#include <time.h> 68126209Sache#endif 69126209Sache 70126209Sache#include "archive.h" 71126209Sache#include "archive_entry.h" 72126209Sache#include "archive_private.h" 73250724Sjkim#include "archive_read_private.h" 74126209Sache 75126209Sachetypedef enum { 76126209Sache WT_NONE, 77126209Sache /* warcinfo */ 78126209Sache WT_INFO, 79126209Sache /* metadata */ 80126209Sache WT_META, 81250724Sjkim /* resource */ 82126209Sache WT_RSRC, 83126209Sache /* request, unsupported */ 84126209Sache WT_REQ, 85250724Sjkim /* response, unsupported */ 
86126209Sache WT_RSP, 87126209Sache /* revisit, unsupported */ 88126209Sache WT_RVIS, 89250724Sjkim /* conversion, unsupported */ 90126209Sache WT_CONV, 91126209Sache /* continuation, unsupported at the moment */ 92126209Sache WT_CONT, 93250724Sjkim /* invalid type */ 94126209Sache LAST_WT 95126209Sache} warc_type_t; 96126209Sache 97250724Sjkimtypedef struct { 98126209Sache size_t len; 99126209Sache const char *str; 100126209Sache} warc_string_t; 101126209Sache 102250724Sjkimtypedef struct { 103126209Sache size_t len; 104126209Sache char *str; 105126209Sache} warc_strbuf_t; 106250724Sjkim 107126209Sachestruct warc_s { 108126209Sache /* content length ahead */ 109126209Sache size_t cntlen; 110250724Sjkim /* and how much we've processed so far */ 111126209Sache size_t cntoff; 112126209Sache /* and how much we need to consume between calls */ 113126209Sache size_t unconsumed; 114126209Sache 115250724Sjkim /* string pool */ 116126209Sache warc_strbuf_t pool; 117126209Sache /* previous version */ 118126209Sache unsigned int pver; 119250724Sjkim /* stringified format name */ 120126209Sache struct archive_string sver; 121126209Sache}; 122126209Sache 123250724Sjkimstatic int _warc_bid(struct archive_read *a, int); 124126209Sachestatic int _warc_cleanup(struct archive_read *a); 125126209Sachestatic int _warc_read(struct archive_read*, const void**, size_t*, int64_t*); 126126209Sachestatic int _warc_skip(struct archive_read *a); 127250724Sjkimstatic int _warc_rdhdr(struct archive_read *a, struct archive_entry *e); 128126209Sache 129126209Sache/* private routines */ 130126209Sachestatic unsigned int _warc_rdver(const char buf[10], size_t bsz); 131126209Sachestatic unsigned int _warc_rdtyp(const char *buf, size_t bsz); 132126209Sachestatic warc_string_t _warc_rduri(const char *buf, size_t bsz); 133250724Sjkimstatic ssize_t _warc_rdlen(const char *buf, size_t bsz); 134126209Sachestatic time_t _warc_rdrtm(const char *buf, size_t bsz); 135126209Sachestatic time_t 
_warc_rdmtm(const char *buf, size_t bsz); 136126209Sachestatic const char *_warc_find_eoh(const char *buf, size_t bsz); 137250724Sjkimstatic const char *_warc_find_eol(const char *buf, size_t bsz); 138126209Sache 139126209Sacheint 140126209Sachearchive_read_support_format_warc(struct archive *_a) 141250724Sjkim{ 142126209Sache struct archive_read *a = (struct archive_read *)_a; 143126209Sache struct warc_s *w; 144126209Sache int r; 145250724Sjkim 146126209Sache archive_check_magic(_a, ARCHIVE_READ_MAGIC, 147126209Sache ARCHIVE_STATE_NEW, "archive_read_support_format_warc"); 148126209Sache 149126209Sache if ((w = calloc(1, sizeof(*w))) == NULL) { 150126209Sache archive_set_error(&a->archive, ENOMEM, 151126209Sache "Can't allocate warc data"); 152126209Sache return (ARCHIVE_FATAL); 153250724Sjkim } 154126209Sache 155131543Stjr r = __archive_read_register_format( 156131543Stjr a, w, "warc", 157131543Stjr _warc_bid, NULL, _warc_rdhdr, _warc_read, 158250724Sjkim _warc_skip, NULL, _warc_cleanup, NULL, NULL); 159131543Stjr 160146040Stjr if (r != ARCHIVE_OK) { 161146040Stjr free(w); 162250724Sjkim return (r); 163146040Stjr } 164146040Stjr return (ARCHIVE_OK); 165146040Stjr} 166146040Stjr 167250724Sjkimstatic int 168146040Stjr_warc_cleanup(struct archive_read *a) 169146040Stjr{ 170146040Stjr struct warc_s *w = a->format->data; 171250724Sjkim 172146040Stjr if (w->pool.len > 0U) { 173146040Stjr free(w->pool.str); 174146040Stjr } 175250724Sjkim archive_string_free(&w->sver); 176250724Sjkim free(w); 177146040Stjr a->format->data = NULL; 178126209Sache return (ARCHIVE_OK); 179126209Sache} 180126209Sache 181126209Sachestatic int 182126209Sache_warc_bid(struct archive_read *a, int best_bid) 183126209Sache{ 184250724Sjkim const char *hdr; 185126209Sache ssize_t nrd; 186126209Sache unsigned int ver; 187126209Sache 188126209Sache (void)best_bid; /* UNUSED */ 189126209Sache 190126209Sache /* check first line of file, it should be a record already */ 191126209Sache if ((hdr = 
__archive_read_ahead(a, 12U, &nrd)) == NULL) { 192126209Sache /* no idea what to do */ 193126209Sache return -1; 194126209Sache } else if (nrd < 12) { 195126209Sache /* nah, not for us, our magic cookie is at least 12 bytes */ 196250724Sjkim return -1; 197126209Sache } 198126209Sache 199126209Sache /* otherwise snarf the record's version number */ 200250724Sjkim ver = _warc_rdver(hdr, nrd); 201250724Sjkim if (ver < 1200U || ver > 10000U) { 202250724Sjkim /* we only support WARC 0.12 to 1.0 */ 203250724Sjkim return -1; 204126209Sache } 205250724Sjkim 206126209Sache /* otherwise be confident */ 207250724Sjkim return (64); 208250724Sjkim} 209126209Sache 210126209Sachestatic int 211126209Sache_warc_rdhdr(struct archive_read *a, struct archive_entry *entry) 212126209Sache{ 213126209Sache#define HDR_PROBE_LEN (12U) 214126209Sache struct warc_s *w = a->format->data; 215126209Sache unsigned int ver; 216126209Sache const char *buf; 217126209Sache ssize_t nrd; 218126209Sache const char *eoh; 219126209Sache /* for the file name, saves some strndup()'ing */ 220126209Sache warc_string_t fnam; 221126209Sache /* warc record type, not that we really use it a lot */ 222131543Stjr warc_type_t ftyp; 223131543Stjr /* content-length+error monad */ 224126209Sache ssize_t cntlen; 225126209Sache /* record time is the WARC-Date time we reinterpret it as ctime */ 226126209Sache time_t rtime; 227126209Sache /* mtime is the Last-Modified time which will be the entry's mtime */ 228126209Sache time_t mtime; 229126209Sache 230126209Sachestart_over: 231126209Sache /* just use read_ahead() they keep track of unconsumed 232126209Sache * bits and bobs for us; no need to put an extra shift in 233126209Sache * and reproduce that functionality here */ 234126209Sache buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd); 235126209Sache 236146040Stjr if (nrd < 0) { 237126209Sache /* no good */ 238126209Sache archive_set_error( 239126209Sache &a->archive, ARCHIVE_ERRNO_MISC, 240126209Sache "Bad record 
header"); 241126209Sache return (ARCHIVE_FATAL); 242126209Sache } else if (buf == NULL) { 243126209Sache /* there should be room for at least WARC/bla\r\n 244126209Sache * must be EOF therefore */ 245131543Stjr return (ARCHIVE_EOF); 246131543Stjr } 247131543Stjr /* looks good so far, try and find the end of the header now */ 248131543Stjr eoh = _warc_find_eoh(buf, nrd); 249126209Sache if (eoh == NULL) { 250131543Stjr /* still no good, the header end might be beyond the 251131543Stjr * probe we've requested, but then again who'd cram 252126209Sache * so much stuff into the header *and* be 28500-compliant */ 253126209Sache archive_set_error( 254126209Sache &a->archive, ARCHIVE_ERRNO_MISC, 255126209Sache "Bad record header"); 256126209Sache return (ARCHIVE_FATAL); 257126209Sache } 258126209Sache ver = _warc_rdver(buf, eoh - buf); 259126209Sache /* we currently support WARC 0.12 to 1.0 */ 260126209Sache if (ver == 0U) { 261126209Sache archive_set_error( 262250724Sjkim &a->archive, ARCHIVE_ERRNO_MISC, 263250724Sjkim "Invalid record version"); 264250724Sjkim return (ARCHIVE_FATAL); 265250724Sjkim } else if (ver < 1200U || ver > 10000U) { 266250724Sjkim archive_set_error( 267126209Sache &a->archive, ARCHIVE_ERRNO_MISC, 268126209Sache "Unsupported record version: %u.%u", 269126209Sache ver / 10000, (ver % 10000) / 100); 270126209Sache return (ARCHIVE_FATAL); 271126209Sache } 272126209Sache cntlen = _warc_rdlen(buf, eoh - buf); 273126209Sache if (cntlen < 0) { 274126209Sache /* nightmare! 
the specs say content-length is mandatory 275126209Sache * so I don't feel overly bad stopping the reader here */ 276126209Sache archive_set_error( 277126209Sache &a->archive, EINVAL, 278126209Sache "Bad content length"); 279126209Sache return (ARCHIVE_FATAL); 280126209Sache } 281126209Sache rtime = _warc_rdrtm(buf, eoh - buf); 282126209Sache if (rtime == (time_t)-1) { 283126209Sache /* record time is mandatory as per WARC/1.0, 284126209Sache * so just barf here, fast and loud */ 285126209Sache archive_set_error( 286126209Sache &a->archive, EINVAL, 287126209Sache "Bad record time"); 288126209Sache return (ARCHIVE_FATAL); 289126209Sache } 290126209Sache 291126209Sache /* let the world know we're a WARC archive */ 292126209Sache a->archive.archive_format = ARCHIVE_FORMAT_WARC; 293126209Sache if (ver != w->pver) { 294126209Sache /* stringify this entry's version */ 295126209Sache archive_string_sprintf(&w->sver, 296126209Sache "WARC/%u.%u", ver / 10000, (ver % 10000) / 100); 297126209Sache /* remember the version */ 298126209Sache w->pver = ver; 299126209Sache } 300126209Sache /* start off with the type */ 301126209Sache ftyp = _warc_rdtyp(buf, eoh - buf); 302146040Stjr /* and let future calls know about the content */ 303146040Stjr w->cntlen = cntlen; 304146040Stjr w->cntoff = 0U; 305126209Sache mtime = 0;/* Avoid compiling error on some platform. 
*/ 306146040Stjr 307126209Sache switch (ftyp) { 308126209Sache case WT_RSRC: 309126209Sache case WT_RSP: 310126209Sache /* only try and read the filename in the cases that are 311250724Sjkim * guaranteed to have one */ 312126209Sache fnam = _warc_rduri(buf, eoh - buf); 313126209Sache /* check the last character in the URI to avoid creating 314126209Sache * directory endpoints as files, see Todo above */ 315126209Sache if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') { 316126209Sache /* break here for now */ 317126209Sache fnam.len = 0U; 318126209Sache fnam.str = NULL; 319126209Sache break; 320126209Sache } 321146040Stjr /* bang to our string pool, so we save a 322126209Sache * malloc()+free() roundtrip */ 323126209Sache if (fnam.len + 1U > w->pool.len) { 324126209Sache w->pool.len = ((fnam.len + 64U) / 64U) * 64U; 325126209Sache w->pool.str = realloc(w->pool.str, w->pool.len); 326126209Sache } 327126209Sache memcpy(w->pool.str, fnam.str, fnam.len); 328126209Sache w->pool.str[fnam.len] = '\0'; 329126209Sache /* let no one else know about the pool, it's a secret, shhh */ 330126209Sache fnam.str = w->pool.str; 331126209Sache 332126209Sache /* snarf mtime or deduce from rtime 333126209Sache * this is a custom header added by our writer, it's quite 334126209Sache * hard to believe anyone else would go through with it 335126209Sache * (apart from being part of some http responses of course) */ 336126209Sache if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) { 337126209Sache mtime = rtime; 338126209Sache } 339126209Sache break; 340126209Sache default: 341250724Sjkim fnam.len = 0U; 342250724Sjkim fnam.str = NULL; 343250724Sjkim break; 344126209Sache } 345126209Sache 346250724Sjkim /* now eat some of those delicious buffer bits */ 347250724Sjkim __archive_read_consume(a, eoh - buf); 348250724Sjkim 349250724Sjkim switch (ftyp) { 350126209Sache case WT_RSRC: 351126209Sache case WT_RSP: 352250724Sjkim if (fnam.len > 0U) { 353250724Sjkim /* populate entry object */ 
354250724Sjkim archive_entry_set_filetype(entry, AE_IFREG); 355250724Sjkim archive_entry_copy_pathname(entry, fnam.str); 356250724Sjkim archive_entry_set_size(entry, cntlen); 357250724Sjkim archive_entry_set_perm(entry, 0644); 358126209Sache /* rtime is the new ctime, mtime stays mtime */ 359126209Sache archive_entry_set_ctime(entry, rtime, 0L); 360250724Sjkim archive_entry_set_mtime(entry, mtime, 0L); 361250724Sjkim break; 362250724Sjkim } 363250724Sjkim /* FALLTHROUGH */ 364126209Sache default: 365250724Sjkim /* consume the content and start over */ 366250724Sjkim _warc_skip(a); 367126209Sache goto start_over; 368250724Sjkim } 369250724Sjkim return (ARCHIVE_OK); 370126209Sache} 371250724Sjkim 372250724Sjkimstatic int 373126209Sache_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off) 374250724Sjkim{ 375250724Sjkim struct warc_s *w = a->format->data; 376250724Sjkim const char *rab; 377250724Sjkim ssize_t nrd; 378126209Sache 379250724Sjkim if (w->cntoff >= w->cntlen) { 380250724Sjkim eof: 381250724Sjkim /* it's our lucky day, no work, we can leave early */ 382250724Sjkim *buf = NULL; 383250724Sjkim *bsz = 0U; 384126209Sache *off = w->cntoff + 4U/*for \r\n\r\n separator*/; 385250724Sjkim w->unconsumed = 0U; 386126209Sache return (ARCHIVE_EOF); 387126209Sache } 388250724Sjkim 389250724Sjkim rab = __archive_read_ahead(a, 1U, &nrd); 390250724Sjkim if (nrd < 0) { 391250724Sjkim *bsz = 0U; 392250724Sjkim /* big catastrophe */ 393126209Sache return (int)nrd; 394250724Sjkim } else if (nrd == 0) { 395250724Sjkim goto eof; 396250724Sjkim } else if ((size_t)nrd > w->cntlen - w->cntoff) { 397250724Sjkim /* clamp to content-length */ 398250724Sjkim nrd = w->cntlen - w->cntoff; 399250724Sjkim } 400250724Sjkim *off = w->cntoff; 401250724Sjkim *bsz = nrd; 402250724Sjkim *buf = rab; 403250724Sjkim 404126209Sache w->cntoff += nrd; 405250724Sjkim w->unconsumed = (size_t)nrd; 406250724Sjkim return (ARCHIVE_OK); 407250724Sjkim} 408126209Sache 
409250724Sjkimstatic int 410250724Sjkim_warc_skip(struct archive_read *a) 411250724Sjkim{ 412126209Sache struct warc_s *w = a->format->data; 413250724Sjkim 414250724Sjkim __archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/); 415250724Sjkim w->cntlen = 0U; 416126209Sache w->cntoff = 0U; 417250724Sjkim return (ARCHIVE_OK); 418250724Sjkim} 419126209Sache 420250724Sjkim 421250724Sjkim/* private routines */ 422126209Sachestatic void* 423126209Sachedeconst(const void *c) 424126209Sache{ 425126209Sache return (char *)0x1 + (((const char *)c) - (const char *)0x1); 426126209Sache} 427126209Sache 428126209Sachestatic char* 429126209Sachexmemmem(const char *hay, const size_t haysize, 430250724Sjkim const char *needle, const size_t needlesize) 431126209Sache{ 432126209Sache const char *const eoh = hay + haysize; 433126209Sache const char *const eon = needle + needlesize; 434126209Sache const char *hp; 435126209Sache const char *np; 436126209Sache const char *cand; 437126209Sache unsigned int hsum; 438126209Sache unsigned int nsum; 439126209Sache unsigned int eqp; 440126209Sache 441126209Sache /* trivial checks first 442126209Sache * a 0-sized needle is defined to be found anywhere in haystack 443126209Sache * then run strchr() to find a candidate in HAYSTACK (i.e. a portion 444250724Sjkim * that happens to begin with *NEEDLE) */ 445250724Sjkim if (needlesize == 0UL) { 446250724Sjkim return deconst(hay); 447126209Sache } else if ((hay = memchr(hay, *needle, haysize)) == NULL) { 448126209Sache /* trivial */ 449126209Sache return NULL; 450126209Sache } 451126209Sache 452126209Sache /* First characters of haystack and needle are the same now. Both are 453126209Sache * guaranteed to be at least one character long. Now computes the sum 454126209Sache * of characters values of needle together with the sum of the first 455126209Sache * needle_len characters of haystack. 
*/ 456126209Sache for (hp = hay + 1U, np = needle + 1U, hsum = *hay, nsum = *hay, eqp = 1U; 457126209Sache hp < eoh && np < eon; 458126209Sache hsum ^= *hp, nsum ^= *np, eqp &= *hp == *np, hp++, np++); 459126209Sache 460126209Sache /* HP now references the (NEEDLESIZE + 1)-th character. */ 461250724Sjkim if (np < eon) { 462126209Sache /* haystack is smaller than needle, :O */ 463126209Sache return NULL; 464250724Sjkim } else if (eqp) { 465126209Sache /* found a match */ 466126209Sache return deconst(hay); 467126209Sache } 468250724Sjkim 469126209Sache /* now loop through the rest of haystack, 470250724Sjkim * updating the sum iteratively */ 471250724Sjkim for (cand = hay; hp < eoh; hp++) { 472250724Sjkim hsum ^= *cand++; 473250724Sjkim hsum ^= *hp; 474250724Sjkim 475250724Sjkim /* Since the sum of the characters is already known to be 476126209Sache * equal at that point, it is enough to check just NEEDLESIZE - 1 477250724Sjkim * characters for equality, 478126209Sache * also CAND is by design < HP, so no need for range checks */ 479126209Sache if (hsum == nsum && memcmp(cand, needle, needlesize - 1U) == 0) { 480126209Sache return deconst(cand); 481250724Sjkim } 482126209Sache } 483126209Sache return NULL; 484126209Sache} 485126209Sache 486126209Sachestatic int 487126209Sachestrtoi_lim(const char *str, const char **ep, int llim, int ulim) 488126209Sache{ 489250724Sjkim int res = 0; 490250724Sjkim const char *sp; 491250724Sjkim /* we keep track of the number of digits via rulim */ 492126209Sache int rulim; 493126209Sache 494126209Sache for (sp = str, rulim = ulim > 10 ? 
ulim : 10; 495126209Sache res * 10 <= ulim && rulim && *sp >= '0' && *sp <= '9'; 496250724Sjkim sp++, rulim /= 10) { 497250724Sjkim res *= 10; 498250724Sjkim res += *sp - '0'; 499250724Sjkim } 500126209Sache if (sp == str) { 501126209Sache res = -1; 502126209Sache } else if (res < llim || res > ulim) { 503126209Sache res = -2; 504250724Sjkim } 505250724Sjkim *ep = (const char*)sp; 506126209Sache return res; 507126209Sache} 508126209Sache 509250724Sjkimstatic time_t 510250724Sjkimtime_from_tm(struct tm *t) 511250724Sjkim{ 512250724Sjkim#if HAVE_TIMEGM 513126209Sache /* Use platform timegm() if available. */ 514126209Sache return (timegm(t)); 515126209Sache#elif HAVE__MKGMTIME64 516126209Sache return (_mkgmtime64(t)); 517126209Sache#else 518126209Sache /* Else use direct calculation using POSIX assumptions. */ 519126209Sache /* First, fix up tm_yday based on the year/month/day. */ 520126209Sache if (mktime(t) == (time_t)-1) 521126209Sache return ((time_t)-1); 522126209Sache /* Then we can compute timegm() from first principles. 
*/ 523126209Sache return (t->tm_sec 524126209Sache + t->tm_min * 60 525126209Sache + t->tm_hour * 3600 526126209Sache + t->tm_yday * 86400 527250724Sjkim + (t->tm_year - 70) * 31536000 528250724Sjkim + ((t->tm_year - 69) / 4) * 86400 529250724Sjkim - ((t->tm_year - 1) / 100) * 86400 530250724Sjkim + ((t->tm_year + 299) / 400) * 86400); 531250724Sjkim#endif 532126209Sache} 533250724Sjkim 534126209Sachestatic time_t 535126209Sachexstrpisotime(const char *s, char **endptr) 536250724Sjkim{ 537250724Sjkim/** like strptime() but strictly for ISO 8601 Zulu strings */ 538126209Sache struct tm tm; 539126209Sache time_t res = (time_t)-1; 540126209Sache 541131543Stjr /* make sure tm is clean */ 542131543Stjr memset(&tm, 0, sizeof(tm)); 543131543Stjr 544131543Stjr /* as a courtesy to our callers, and since this is a non-standard 545131543Stjr * routine, we skip leading whitespace */ 546131543Stjr while (*s == ' ' || *s == '\t') 547131543Stjr ++s; 548131543Stjr 549131543Stjr /* read year */ 550131543Stjr if ((tm.tm_year = strtoi_lim(s, &s, 1583, 4095)) < 0 || *s++ != '-') { 551131543Stjr goto out; 552146040Stjr } 553146040Stjr /* read month */ 554250724Sjkim if ((tm.tm_mon = strtoi_lim(s, &s, 1, 12)) < 0 || *s++ != '-') { 555250724Sjkim goto out; 556146040Stjr } 557146040Stjr /* read day-of-month */ 558146040Stjr if ((tm.tm_mday = strtoi_lim(s, &s, 1, 31)) < 0 || *s++ != 'T') { 559146040Stjr goto out; 560146040Stjr } 561131543Stjr /* read hour */ 562126209Sache if ((tm.tm_hour = strtoi_lim(s, &s, 0, 23)) < 0 || *s++ != ':') { 563250724Sjkim goto out; 564250724Sjkim } 565250724Sjkim /* read minute */ 566126209Sache if ((tm.tm_min = strtoi_lim(s, &s, 0, 59)) < 0 || *s++ != ':') { 567250724Sjkim goto out; 568250724Sjkim } 569250724Sjkim /* read second */ 570250724Sjkim if ((tm.tm_sec = strtoi_lim(s, &s, 0, 60)) < 0 || *s++ != 'Z') { 571126209Sache goto out; 572250724Sjkim } 573250724Sjkim 574126209Sache /* massage TM to fulfill some of POSIX' constraints */ 575250724Sjkim 
tm.tm_year -= 1900; 576126209Sache tm.tm_mon--; 577126209Sache 578126209Sache /* now convert our custom tm struct to a unix stamp using UTC */ 579126209Sache res = time_from_tm(&tm); 580126209Sache 581126209Sacheout: 582126209Sache if (endptr != NULL) { 583 *endptr = deconst(s); 584 } 585 return res; 586} 587 588static unsigned int 589_warc_rdver(const char *buf, size_t bsz) 590{ 591 static const char magic[] = "WARC/"; 592 const char *c; 593 unsigned int ver = 0U; 594 unsigned int end = 0U; 595 596 if (bsz < 12 || memcmp(buf, magic, sizeof(magic) - 1U) != 0) { 597 /* buffer too small or invalid magic */ 598 return ver; 599 } 600 /* looks good so far, read the version number for a laugh */ 601 buf += sizeof(magic) - 1U; 602 603 if (isdigit((unsigned char)buf[0U]) && (buf[1U] == '.') && 604 isdigit((unsigned char)buf[2U])) { 605 /* we support a maximum of 2 digits in the minor version */ 606 if (isdigit((unsigned char)buf[3U])) 607 end = 1U; 608 /* set up major version */ 609 ver = (buf[0U] - '0') * 10000U; 610 /* set up minor version */ 611 if (end == 1U) { 612 ver += (buf[2U] - '0') * 1000U; 613 ver += (buf[3U] - '0') * 100U; 614 } else 615 ver += (buf[2U] - '0') * 100U; 616 /* 617 * WARC below version 0.12 has a space-separated header 618 * WARC 0.12 and above terminates the version with a CRLF 619 */ 620 c = buf + 3U + end; 621 if (ver >= 1200U) { 622 if (memcmp(c, "\r\n", 2U) != 0) 623 ver = 0U; 624 } else { 625 /* ver < 1200U */ 626 if (*c != ' ' && *c != '\t') 627 ver = 0U; 628 } 629 } 630 return ver; 631} 632 633static unsigned int 634_warc_rdtyp(const char *buf, size_t bsz) 635{ 636 static const char _key[] = "\r\nWARC-Type:"; 637 const char *val, *eol; 638 639 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 640 /* no bother */ 641 return WT_NONE; 642 } 643 val += sizeof(_key) - 1U; 644 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 645 /* no end of line */ 646 return WT_NONE; 647 } 648 649 /* overread whitespace */ 650 while 
(val < eol && (*val == ' ' || *val == '\t')) 651 ++val; 652 653 if (val + 8U == eol) { 654 if (memcmp(val, "resource", 8U) == 0) 655 return WT_RSRC; 656 else if (memcmp(val, "response", 8U) == 0) 657 return WT_RSP; 658 } 659 return WT_NONE; 660} 661 662static warc_string_t 663_warc_rduri(const char *buf, size_t bsz) 664{ 665 static const char _key[] = "\r\nWARC-Target-URI:"; 666 const char *val, *uri, *eol, *p; 667 warc_string_t res = {0U, NULL}; 668 669 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 670 /* no bother */ 671 return res; 672 } 673 /* overread whitespace */ 674 val += sizeof(_key) - 1U; 675 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 676 /* no end of line */ 677 return res; 678 } 679 680 while (val < eol && (*val == ' ' || *val == '\t')) 681 ++val; 682 683 /* overread URL designators */ 684 if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) { 685 /* not touching that! */ 686 return res; 687 } 688 689 /* spaces inside uri are not allowed, CRLF should follow */ 690 for (p = val; p < eol; p++) { 691 if (isspace((unsigned char)*p)) 692 return res; 693 } 694 695 /* there must be at least space for ftp */ 696 if (uri < (val + 3U)) 697 return res; 698 699 /* move uri to point to after :// */ 700 uri += 3U; 701 702 /* now then, inspect the URI */ 703 if (memcmp(val, "file", 4U) == 0) { 704 /* perfect, nothing left to do here */ 705 706 } else if (memcmp(val, "http", 4U) == 0 || 707 memcmp(val, "ftp", 3U) == 0) { 708 /* overread domain, and the first / */ 709 while (uri < eol && *uri++ != '/'); 710 } else { 711 /* not sure what to do? 
best to bugger off */ 712 return res; 713 } 714 res.str = uri; 715 res.len = eol - uri; 716 return res; 717} 718 719static ssize_t 720_warc_rdlen(const char *buf, size_t bsz) 721{ 722 static const char _key[] = "\r\nContent-Length:"; 723 const char *val, *eol; 724 char *on = NULL; 725 long int len; 726 727 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 728 /* no bother */ 729 return -1; 730 } 731 val += sizeof(_key) - 1U; 732 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 733 /* no end of line */ 734 return -1; 735 } 736 737 /* skip leading whitespace */ 738 while (val < eol && (*val == ' ' || *val == '\t')) 739 val++; 740 /* there must be at least one digit */ 741 if (!isdigit((unsigned char)*val)) 742 return -1; 743 errno = 0; 744 len = strtol(val, &on, 10); 745 if (errno != 0 || on != eol) { 746 /* line must end here */ 747 return -1; 748 } 749 750 return (size_t)len; 751} 752 753static time_t 754_warc_rdrtm(const char *buf, size_t bsz) 755{ 756 static const char _key[] = "\r\nWARC-Date:"; 757 const char *val, *eol; 758 char *on = NULL; 759 time_t res; 760 761 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 762 /* no bother */ 763 return (time_t)-1; 764 } 765 val += sizeof(_key) - 1U; 766 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL ) { 767 /* no end of line */ 768 return -1; 769 } 770 771 /* xstrpisotime() kindly overreads whitespace for us, so use that */ 772 res = xstrpisotime(val, &on); 773 if (on != eol) { 774 /* line must end here */ 775 return -1; 776 } 777 return res; 778} 779 780static time_t 781_warc_rdmtm(const char *buf, size_t bsz) 782{ 783 static const char _key[] = "\r\nLast-Modified:"; 784 const char *val, *eol; 785 char *on = NULL; 786 time_t res; 787 788 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 789 /* no bother */ 790 return (time_t)-1; 791 } 792 val += sizeof(_key) - 1U; 793 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL ) { 794 /* no end of line 
*/ 795 return -1; 796 } 797 798 /* xstrpisotime() kindly overreads whitespace for us, so use that */ 799 res = xstrpisotime(val, &on); 800 if (on != eol) { 801 /* line must end here */ 802 return -1; 803 } 804 return res; 805} 806 807static const char* 808_warc_find_eoh(const char *buf, size_t bsz) 809{ 810 static const char _marker[] = "\r\n\r\n"; 811 const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U); 812 813 if (hit != NULL) { 814 hit += sizeof(_marker) - 1U; 815 } 816 return hit; 817} 818 819static const char* 820_warc_find_eol(const char *buf, size_t bsz) 821{ 822 static const char _marker[] = "\r\n"; 823 const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U); 824 825 return hit; 826} 827/* archive_read_support_format_warc.c ends here */ 828