1/*- 2 * Copyright (c) 2014 Sebastian Freundt 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR 15 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 16 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 17 * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, 18 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 19 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 23 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26#include "archive_platform.h" 27 28/** 29 * WARC is standardised by ISO TC46/SC4/WG12 and currently available as 30 * ISO 28500:2009. 31 * For the purposes of this file we used the final draft from: 32 * http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf 33 * 34 * Todo: 35 * [ ] real-world warcs can contain resources at endpoints ending in / 36 * e.g. http://bibnum.bnf.fr/warc/ 37 * if you're lucky their response contains a Content-Location: header 38 * pointing to a unix-compliant filename, in the example above it's 39 * Content-Location: http://bibnum.bnf.fr/warc/index.html 40 * however, that's not mandated and github for example doesn't follow 41 * this convention. 42 * We need a set of archive options to control what to do with 43 * entries like these, at the moment care is taken to skip them. 44 * 45 **/ 46 47#ifdef HAVE_SYS_STAT_H 48#include <sys/stat.h> 49#endif 50#ifdef HAVE_ERRNO_H 51#include <errno.h> 52#endif 53#ifdef HAVE_STDLIB_H 54#include <stdlib.h> 55#endif 56#ifdef HAVE_STRING_H 57#include <string.h> 58#endif 59#ifdef HAVE_LIMITS_H 60#include <limits.h> 61#endif 62#ifdef HAVE_CTYPE_H 63#include <ctype.h> 64#endif 65#ifdef HAVE_TIME_H 66#include <time.h> 67#endif 68 69#include "archive.h" 70#include "archive_entry.h" 71#include "archive_private.h" 72#include "archive_read_private.h" 73 74typedef enum { 75 WT_NONE, 76 /* warcinfo */ 77 WT_INFO, 78 /* metadata */ 79 WT_META, 80 /* resource */ 81 WT_RSRC, 82 /* request, unsupported */ 83 WT_REQ, 84 /* response, unsupported */ 85 WT_RSP, 86 /* revisit, unsupported */ 87 WT_RVIS, 88 /* conversion, unsupported */ 89 WT_CONV, 90 /* continuation, unsupported at the moment */ 91 WT_CONT, 92 /* invalid type */ 93 LAST_WT 94} warc_type_t; 95 96typedef struct { 97 size_t len; 98 const char *str; 99} warc_string_t; 100 101typedef struct { 102 size_t len; 103 char *str; 104} warc_strbuf_t; 105 106struct warc_s { 107 /* content length ahead */ 108 size_t cntlen; 109 /* and how much we've processed so far */ 110 size_t cntoff; 111 /* and how much we need to consume between calls */ 112 size_t unconsumed; 113 114 /* string pool */ 115 warc_strbuf_t pool; 116 /* previous version */ 117 unsigned int pver; 118 /* stringified format name */ 119 struct archive_string sver; 120}; 121 122static int _warc_bid(struct archive_read *a, int); 123static int _warc_cleanup(struct archive_read *a); 124static int _warc_read(struct archive_read*, const void**, size_t*, int64_t*); 125static int _warc_skip(struct archive_read *a); 126static int _warc_rdhdr(struct archive_read *a, struct archive_entry *e); 127 128/* private routines */ 129static unsigned int _warc_rdver(const char *buf, size_t bsz); 130static unsigned int _warc_rdtyp(const char *buf, size_t bsz); 131static warc_string_t _warc_rduri(const char *buf, size_t bsz); 132static ssize_t _warc_rdlen(const char *buf, size_t bsz); 133static time_t _warc_rdrtm(const char *buf, size_t bsz); 134static time_t _warc_rdmtm(const char *buf, size_t bsz); 135static const char *_warc_find_eoh(const char *buf, size_t bsz); 136static const char *_warc_find_eol(const char *buf, size_t bsz); 137 138int 139archive_read_support_format_warc(struct archive *_a) 140{ 141 struct archive_read *a = (struct archive_read *)_a; 142 struct warc_s *w; 143 int r; 144 145 archive_check_magic(_a, ARCHIVE_READ_MAGIC, 146 ARCHIVE_STATE_NEW, "archive_read_support_format_warc"); 147 148 if ((w = calloc(1, sizeof(*w))) == NULL) { 149 archive_set_error(&a->archive, ENOMEM, 150 "Can't allocate warc data"); 151 return (ARCHIVE_FATAL); 152 } 153 154 r = __archive_read_register_format( 155 a, w, "warc", 156 _warc_bid, NULL, _warc_rdhdr, _warc_read, 157 _warc_skip, NULL, _warc_cleanup, NULL, NULL); 158 159 if (r != ARCHIVE_OK) { 160 free(w); 161 return (r); 162 } 163 return (ARCHIVE_OK); 164} 165 166static int 167_warc_cleanup(struct archive_read *a) 168{ 169 struct warc_s *w = a->format->data; 170 171 if (w->pool.len > 0U) { 172 free(w->pool.str); 173 } 174 archive_string_free(&w->sver); 175 free(w); 176 a->format->data = NULL; 177 return (ARCHIVE_OK); 178} 179 180static int 181_warc_bid(struct archive_read *a, int best_bid) 182{ 183 const char *hdr; 184 ssize_t nrd; 185 unsigned int ver; 186 187 (void)best_bid; /* UNUSED */ 188 189 /* check first line of file, it should be a record already */ 190 if ((hdr = __archive_read_ahead(a, 12U, &nrd)) == NULL) { 191 /* no idea what to do */ 192 return -1; 193 } else if (nrd < 12) { 194 /* nah, not for us, our magic cookie is at least 12 bytes */ 195 return -1; 196 } 197 198 /* otherwise snarf the record's version number */ 199 ver = _warc_rdver(hdr, nrd); 200 if (ver < 1200U || ver > 10000U) { 201 /* we only support WARC 0.12 to 1.0 */ 202 return -1; 203 } 204 205 /* otherwise be confident */ 206 return (64); 207} 208 209static int 210_warc_rdhdr(struct archive_read *a, struct archive_entry *entry) 211{ 212#define HDR_PROBE_LEN (12U) 213 struct warc_s *w = a->format->data; 214 unsigned int ver; 215 const char *buf; 216 ssize_t nrd; 217 const char *eoh; 218 char *tmp; 219 /* for the file name, saves some strndup()'ing */ 220 warc_string_t fnam; 221 /* warc record type, not that we really use it a lot */ 222 warc_type_t ftyp; 223 /* content-length+error monad */ 224 ssize_t cntlen; 225 /* record time is the WARC-Date time we reinterpret it as ctime */ 226 time_t rtime; 227 /* mtime is the Last-Modified time which will be the entry's mtime */ 228 time_t mtime; 229 230start_over: 231 /* just use read_ahead() they keep track of unconsumed 232 * bits and bobs for us; no need to put an extra shift in 233 * and reproduce that functionality here */ 234 buf = __archive_read_ahead(a, HDR_PROBE_LEN, &nrd); 235 236 if (nrd < 0) { 237 /* no good */ 238 archive_set_error( 239 &a->archive, ARCHIVE_ERRNO_MISC, 240 "Bad record header"); 241 return (ARCHIVE_FATAL); 242 } else if (buf == NULL) { 243 /* there should be room for at least WARC/bla\r\n 244 * must be EOF therefore */ 245 return (ARCHIVE_EOF); 246 } 247 /* looks good so far, try and find the end of the header now */ 248 eoh = _warc_find_eoh(buf, nrd); 249 if (eoh == NULL) { 250 /* still no good, the header end might be beyond the 251 * probe we've requested, but then again who'd cram 252 * so much stuff into the header *and* be 28500-compliant */ 253 archive_set_error( 254 &a->archive, ARCHIVE_ERRNO_MISC, 255 "Bad record header"); 256 return (ARCHIVE_FATAL); 257 } 258 ver = _warc_rdver(buf, eoh - buf); 259 /* we currently support WARC 0.12 to 1.0 */ 260 if (ver == 0U) { 261 archive_set_error( 262 &a->archive, ARCHIVE_ERRNO_MISC, 263 "Invalid record version"); 264 return (ARCHIVE_FATAL); 265 } else if (ver < 1200U || ver > 10000U) { 266 archive_set_error( 267 &a->archive, ARCHIVE_ERRNO_MISC, 268 "Unsupported record version: %u.%u", 269 ver / 10000, (ver % 10000) / 100); 270 return (ARCHIVE_FATAL); 271 } 272 cntlen = _warc_rdlen(buf, eoh - buf); 273 if (cntlen < 0) { 274 /* nightmare! the specs say content-length is mandatory 275 * so I don't feel overly bad stopping the reader here */ 276 archive_set_error( 277 &a->archive, EINVAL, 278 "Bad content length"); 279 return (ARCHIVE_FATAL); 280 } 281 rtime = _warc_rdrtm(buf, eoh - buf); 282 if (rtime == (time_t)-1) { 283 /* record time is mandatory as per WARC/1.0, 284 * so just barf here, fast and loud */ 285 archive_set_error( 286 &a->archive, EINVAL, 287 "Bad record time"); 288 return (ARCHIVE_FATAL); 289 } 290 291 /* let the world know we're a WARC archive */ 292 a->archive.archive_format = ARCHIVE_FORMAT_WARC; 293 if (ver != w->pver) { 294 /* stringify this entry's version */ 295 archive_string_sprintf(&w->sver, 296 "WARC/%u.%u", ver / 10000, (ver % 10000) / 100); 297 /* remember the version */ 298 w->pver = ver; 299 } 300 /* start off with the type */ 301 ftyp = _warc_rdtyp(buf, eoh - buf); 302 /* and let future calls know about the content */ 303 w->cntlen = cntlen; 304 w->cntoff = 0U; 305 mtime = 0;/* Avoid compiling error on some platform. */ 306 307 switch (ftyp) { 308 case WT_RSRC: 309 case WT_RSP: 310 /* only try and read the filename in the cases that are 311 * guaranteed to have one */ 312 fnam = _warc_rduri(buf, eoh - buf); 313 /* check the last character in the URI to avoid creating 314 * directory endpoints as files, see Todo above */ 315 if (fnam.len == 0 || fnam.str[fnam.len - 1] == '/') { 316 /* break here for now */ 317 fnam.len = 0U; 318 fnam.str = NULL; 319 break; 320 } 321 /* bang to our string pool, so we save a 322 * malloc()+free() roundtrip */ 323 if (fnam.len + 1U > w->pool.len) { 324 w->pool.len = ((fnam.len + 64U) / 64U) * 64U; 325 tmp = realloc(w->pool.str, w->pool.len); 326 if (tmp == NULL) { 327 archive_set_error( 328 &a->archive, ENOMEM, 329 "Out of memory"); 330 return (ARCHIVE_FATAL); 331 } 332 w->pool.str = tmp; 333 } 334 memcpy(w->pool.str, fnam.str, fnam.len); 335 w->pool.str[fnam.len] = '\0'; 336 /* let no one else know about the pool, it's a secret, shhh */ 337 fnam.str = w->pool.str; 338 339 /* snarf mtime or deduce from rtime 340 * this is a custom header added by our writer, it's quite 341 * hard to believe anyone else would go through with it 342 * (apart from being part of some http responses of course) */ 343 if ((mtime = _warc_rdmtm(buf, eoh - buf)) == (time_t)-1) { 344 mtime = rtime; 345 } 346 break; 347 case WT_NONE: 348 case WT_INFO: 349 case WT_META: 350 case WT_REQ: 351 case WT_RVIS: 352 case WT_CONV: 353 case WT_CONT: 354 case LAST_WT: 355 default: 356 fnam.len = 0U; 357 fnam.str = NULL; 358 break; 359 } 360 361 /* now eat some of those delicious buffer bits */ 362 __archive_read_consume(a, eoh - buf); 363 364 switch (ftyp) { 365 case WT_RSRC: 366 case WT_RSP: 367 if (fnam.len > 0U) { 368 /* populate entry object */ 369 archive_entry_set_filetype(entry, AE_IFREG); 370 archive_entry_copy_pathname(entry, fnam.str); 371 archive_entry_set_size(entry, cntlen); 372 archive_entry_set_perm(entry, 0644); 373 /* rtime is the new ctime, mtime stays mtime */ 374 archive_entry_set_ctime(entry, rtime, 0L); 375 archive_entry_set_mtime(entry, mtime, 0L); 376 break; 377 } 378 /* FALLTHROUGH */ 379 case WT_NONE: 380 case WT_INFO: 381 case WT_META: 382 case WT_REQ: 383 case WT_RVIS: 384 case WT_CONV: 385 case WT_CONT: 386 case LAST_WT: 387 default: 388 /* consume the content and start over */ 389 _warc_skip(a); 390 goto start_over; 391 } 392 return (ARCHIVE_OK); 393} 394 395static int 396_warc_read(struct archive_read *a, const void **buf, size_t *bsz, int64_t *off) 397{ 398 struct warc_s *w = a->format->data; 399 const char *rab; 400 ssize_t nrd; 401 402 if (w->cntoff >= w->cntlen) { 403 eof: 404 /* it's our lucky day, no work, we can leave early */ 405 *buf = NULL; 406 *bsz = 0U; 407 *off = w->cntoff + 4U/*for \r\n\r\n separator*/; 408 w->unconsumed = 0U; 409 return (ARCHIVE_EOF); 410 } 411 412 if (w->unconsumed) { 413 __archive_read_consume(a, w->unconsumed); 414 w->unconsumed = 0U; 415 } 416 417 rab = __archive_read_ahead(a, 1U, &nrd); 418 if (nrd < 0) { 419 *bsz = 0U; 420 /* big catastrophe */ 421 return (int)nrd; 422 } else if (nrd == 0) { 423 goto eof; 424 } else if ((size_t)nrd > w->cntlen - w->cntoff) { 425 /* clamp to content-length */ 426 nrd = w->cntlen - w->cntoff; 427 } 428 *off = w->cntoff; 429 *bsz = nrd; 430 *buf = rab; 431 432 w->cntoff += nrd; 433 w->unconsumed = (size_t)nrd; 434 return (ARCHIVE_OK); 435} 436 437static int 438_warc_skip(struct archive_read *a) 439{ 440 struct warc_s *w = a->format->data; 441 442 __archive_read_consume(a, w->cntlen + 4U/*\r\n\r\n separator*/); 443 w->cntlen = 0U; 444 w->cntoff = 0U; 445 return (ARCHIVE_OK); 446} 447 448 449/* private routines */ 450static void* 451deconst(const void *c) 452{ 453 return (void *)(uintptr_t)c; 454} 455 456static char* 457xmemmem(const char *hay, const size_t haysize, 458 const char *needle, const size_t needlesize) 459{ 460 const char *const eoh = hay + haysize; 461 const char *const eon = needle + needlesize; 462 const char *hp; 463 const char *np; 464 const char *cand; 465 unsigned int hsum; 466 unsigned int nsum; 467 unsigned int eqp; 468 469 /* trivial checks first 470 * a 0-sized needle is defined to be found anywhere in haystack 471 * then run strchr() to find a candidate in HAYSTACK (i.e. a portion 472 * that happens to begin with *NEEDLE) */ 473 if (needlesize == 0UL) { 474 return deconst(hay); 475 } else if ((hay = memchr(hay, *needle, haysize)) == NULL) { 476 /* trivial */ 477 return NULL; 478 } 479 480 /* First characters of haystack and needle are the same now. Both are 481 * guaranteed to be at least one character long. Now computes the sum 482 * of characters values of needle together with the sum of the first 483 * needle_len characters of haystack. */ 484 for (hp = hay + 1U, np = needle + 1U, hsum = *hay, nsum = *hay, eqp = 1U; 485 hp < eoh && np < eon; 486 hsum ^= *hp, nsum ^= *np, eqp &= *hp == *np, hp++, np++); 487 488 /* HP now references the (NEEDLESIZE + 1)-th character. */ 489 if (np < eon) { 490 /* haystack is smaller than needle, :O */ 491 return NULL; 492 } else if (eqp) { 493 /* found a match */ 494 return deconst(hay); 495 } 496 497 /* now loop through the rest of haystack, 498 * updating the sum iteratively */ 499 for (cand = hay; hp < eoh; hp++) { 500 hsum ^= *cand++; 501 hsum ^= *hp; 502 503 /* Since the sum of the characters is already known to be 504 * equal at that point, it is enough to check just NEEDLESIZE - 1 505 * characters for equality, 506 * also CAND is by design < HP, so no need for range checks */ 507 if (hsum == nsum && memcmp(cand, needle, needlesize - 1U) == 0) { 508 return deconst(cand); 509 } 510 } 511 return NULL; 512} 513 514static int 515strtoi_lim(const char *str, const char **ep, int llim, int ulim) 516{ 517 int res = 0; 518 const char *sp; 519 /* we keep track of the number of digits via rulim */ 520 int rulim; 521 522 for (sp = str, rulim = ulim > 10 ? ulim : 10; 523 res * 10 <= ulim && rulim && *sp >= '0' && *sp <= '9'; 524 sp++, rulim /= 10) { 525 res *= 10; 526 res += *sp - '0'; 527 } 528 if (sp == str) { 529 res = -1; 530 } else if (res < llim || res > ulim) { 531 res = -2; 532 } 533 *ep = (const char*)sp; 534 return res; 535} 536 537static time_t 538time_from_tm(struct tm *t) 539{ 540#if HAVE__MKGMTIME 541 return _mkgmtime(t); 542#elif HAVE_TIMEGM 543 /* Use platform timegm() if available. */ 544 return (timegm(t)); 545#else 546 /* Else use direct calculation using POSIX assumptions. */ 547 /* First, fix up tm_yday based on the year/month/day. */ 548 if (mktime(t) == (time_t)-1) 549 return ((time_t)-1); 550 /* Then we can compute timegm() from first principles. */ 551 return (t->tm_sec 552 + t->tm_min * 60 553 + t->tm_hour * 3600 554 + t->tm_yday * 86400 555 + (t->tm_year - 70) * 31536000 556 + ((t->tm_year - 69) / 4) * 86400 557 - ((t->tm_year - 1) / 100) * 86400 558 + ((t->tm_year + 299) / 400) * 86400); 559#endif 560} 561 562static time_t 563xstrpisotime(const char *s, char **endptr) 564{ 565/** like strptime() but strictly for ISO 8601 Zulu strings */ 566 struct tm tm; 567 time_t res = (time_t)-1; 568 569 /* make sure tm is clean */ 570 memset(&tm, 0, sizeof(tm)); 571 572 /* as a courtesy to our callers, and since this is a non-standard 573 * routine, we skip leading whitespace */ 574 while (*s == ' ' || *s == '\t') 575 ++s; 576 577 /* read year */ 578 if ((tm.tm_year = strtoi_lim(s, &s, 1583, 4095)) < 0 || *s++ != '-') { 579 goto out; 580 } 581 /* read month */ 582 if ((tm.tm_mon = strtoi_lim(s, &s, 1, 12)) < 0 || *s++ != '-') { 583 goto out; 584 } 585 /* read day-of-month */ 586 if ((tm.tm_mday = strtoi_lim(s, &s, 1, 31)) < 0 || *s++ != 'T') { 587 goto out; 588 } 589 /* read hour */ 590 if ((tm.tm_hour = strtoi_lim(s, &s, 0, 23)) < 0 || *s++ != ':') { 591 goto out; 592 } 593 /* read minute */ 594 if ((tm.tm_min = strtoi_lim(s, &s, 0, 59)) < 0 || *s++ != ':') { 595 goto out; 596 } 597 /* read second */ 598 if ((tm.tm_sec = strtoi_lim(s, &s, 0, 60)) < 0 || *s++ != 'Z') { 599 goto out; 600 } 601 602 /* massage TM to fulfill some of POSIX' constraints */ 603 tm.tm_year -= 1900; 604 tm.tm_mon--; 605 606 /* now convert our custom tm struct to a unix stamp using UTC */ 607 res = time_from_tm(&tm); 608 609out: 610 if (endptr != NULL) { 611 *endptr = deconst(s); 612 } 613 return res; 614} 615 616static unsigned int 617_warc_rdver(const char *buf, size_t bsz) 618{ 619 static const char magic[] = "WARC/"; 620 const char *c; 621 unsigned int ver = 0U; 622 unsigned int end = 0U; 623 624 if (bsz < 12 || memcmp(buf, magic, sizeof(magic) - 1U) != 0) { 625 /* buffer too small or invalid magic */ 626 return ver; 627 } 628 /* looks good so far, read the version number for a laugh */ 629 buf += sizeof(magic) - 1U; 630 631 if (isdigit((unsigned char)buf[0U]) && (buf[1U] == '.') && 632 isdigit((unsigned char)buf[2U])) { 633 /* we support a maximum of 2 digits in the minor version */ 634 if (isdigit((unsigned char)buf[3U])) 635 end = 1U; 636 /* set up major version */ 637 ver = (buf[0U] - '0') * 10000U; 638 /* set up minor version */ 639 if (end == 1U) { 640 ver += (buf[2U] - '0') * 1000U; 641 ver += (buf[3U] - '0') * 100U; 642 } else 643 ver += (buf[2U] - '0') * 100U; 644 /* 645 * WARC below version 0.12 has a space-separated header 646 * WARC 0.12 and above terminates the version with a CRLF 647 */ 648 c = buf + 3U + end; 649 if (ver >= 1200U) { 650 if (memcmp(c, "\r\n", 2U) != 0) 651 ver = 0U; 652 } else { 653 /* ver < 1200U */ 654 if (*c != ' ' && *c != '\t') 655 ver = 0U; 656 } 657 } 658 return ver; 659} 660 661static unsigned int 662_warc_rdtyp(const char *buf, size_t bsz) 663{ 664 static const char _key[] = "\r\nWARC-Type:"; 665 const char *val, *eol; 666 667 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 668 /* no bother */ 669 return WT_NONE; 670 } 671 val += sizeof(_key) - 1U; 672 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 673 /* no end of line */ 674 return WT_NONE; 675 } 676 677 /* overread whitespace */ 678 while (val < eol && (*val == ' ' || *val == '\t')) 679 ++val; 680 681 if (val + 8U == eol) { 682 if (memcmp(val, "resource", 8U) == 0) 683 return WT_RSRC; 684 else if (memcmp(val, "response", 8U) == 0) 685 return WT_RSP; 686 } 687 return WT_NONE; 688} 689 690static warc_string_t 691_warc_rduri(const char *buf, size_t bsz) 692{ 693 static const char _key[] = "\r\nWARC-Target-URI:"; 694 const char *val, *uri, *eol, *p; 695 warc_string_t res = {0U, NULL}; 696 697 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 698 /* no bother */ 699 return res; 700 } 701 /* overread whitespace */ 702 val += sizeof(_key) - 1U; 703 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 704 /* no end of line */ 705 return res; 706 } 707 708 while (val < eol && (*val == ' ' || *val == '\t')) 709 ++val; 710 711 /* overread URL designators */ 712 if ((uri = xmemmem(val, eol - val, "://", 3U)) == NULL) { 713 /* not touching that! */ 714 return res; 715 } 716 717 /* spaces inside uri are not allowed, CRLF should follow */ 718 for (p = val; p < eol; p++) { 719 if (isspace((unsigned char)*p)) 720 return res; 721 } 722 723 /* there must be at least space for ftp */ 724 if (uri < (val + 3U)) 725 return res; 726 727 /* move uri to point to after :// */ 728 uri += 3U; 729 730 /* now then, inspect the URI */ 731 if (memcmp(val, "file", 4U) == 0) { 732 /* perfect, nothing left to do here */ 733 734 } else if (memcmp(val, "http", 4U) == 0 || 735 memcmp(val, "ftp", 3U) == 0) { 736 /* overread domain, and the first / */ 737 while (uri < eol && *uri++ != '/'); 738 } else { 739 /* not sure what to do? best to bugger off */ 740 return res; 741 } 742 res.str = uri; 743 res.len = eol - uri; 744 return res; 745} 746 747static ssize_t 748_warc_rdlen(const char *buf, size_t bsz) 749{ 750 static const char _key[] = "\r\nContent-Length:"; 751 const char *val, *eol; 752 char *on = NULL; 753 long int len; 754 755 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 756 /* no bother */ 757 return -1; 758 } 759 val += sizeof(_key) - 1U; 760 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL) { 761 /* no end of line */ 762 return -1; 763 } 764 765 /* skip leading whitespace */ 766 while (val < eol && (*val == ' ' || *val == '\t')) 767 val++; 768 /* there must be at least one digit */ 769 if (!isdigit((unsigned char)*val)) 770 return -1; 771 errno = 0; 772 len = strtol(val, &on, 10); 773 if (errno != 0 || on != eol) { 774 /* line must end here */ 775 return -1; 776 } 777 778 return (size_t)len; 779} 780 781static time_t 782_warc_rdrtm(const char *buf, size_t bsz) 783{ 784 static const char _key[] = "\r\nWARC-Date:"; 785 const char *val, *eol; 786 char *on = NULL; 787 time_t res; 788 789 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 790 /* no bother */ 791 return (time_t)-1; 792 } 793 val += sizeof(_key) - 1U; 794 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL ) { 795 /* no end of line */ 796 return -1; 797 } 798 799 /* xstrpisotime() kindly overreads whitespace for us, so use that */ 800 res = xstrpisotime(val, &on); 801 if (on != eol) { 802 /* line must end here */ 803 return -1; 804 } 805 return res; 806} 807 808static time_t 809_warc_rdmtm(const char *buf, size_t bsz) 810{ 811 static const char _key[] = "\r\nLast-Modified:"; 812 const char *val, *eol; 813 char *on = NULL; 814 time_t res; 815 816 if ((val = xmemmem(buf, bsz, _key, sizeof(_key) - 1U)) == NULL) { 817 /* no bother */ 818 return (time_t)-1; 819 } 820 val += sizeof(_key) - 1U; 821 if ((eol = _warc_find_eol(val, buf + bsz - val)) == NULL ) { 822 /* no end of line */ 823 return -1; 824 } 825 826 /* xstrpisotime() kindly overreads whitespace for us, so use that */ 827 res = xstrpisotime(val, &on); 828 if (on != eol) { 829 /* line must end here */ 830 return -1; 831 } 832 return res; 833} 834 835static const char* 836_warc_find_eoh(const char *buf, size_t bsz) 837{ 838 static const char _marker[] = "\r\n\r\n"; 839 const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U); 840 841 if (hit != NULL) { 842 hit += sizeof(_marker) - 1U; 843 } 844 return hit; 845} 846 847static const char* 848_warc_find_eol(const char *buf, size_t bsz) 849{ 850 static const char _marker[] = "\r\n"; 851 const char *hit = xmemmem(buf, bsz, _marker, sizeof(_marker) - 1U); 852 853 return hit; 854} 855/* archive_read_support_format_warc.c ends here */ 856