1/* URL handling. 2 Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 3 2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. 4 5This file is part of GNU Wget. 6 7GNU Wget is free software; you can redistribute it and/or modify 8it under the terms of the GNU General Public License as published by 9the Free Software Foundation; either version 3 of the License, or (at 10your option) any later version. 11 12GNU Wget is distributed in the hope that it will be useful, 13but WITHOUT ANY WARRANTY; without even the implied warranty of 14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15GNU General Public License for more details. 16 17You should have received a copy of the GNU General Public License 18along with Wget. If not, see <http://www.gnu.org/licenses/>. 19 20Additional permission under GNU GPL version 3 section 7 21 22If you modify this program, or any covered work, by linking or 23combining it with the OpenSSL project's OpenSSL library (or a 24modified version of that library), containing parts covered by the 25terms of the OpenSSL or SSLeay licenses, the Free Software Foundation 26grants you additional permission to convey the resulting work. 27Corresponding Source for a non-source form of such a combination 28shall include the source code for the parts of OpenSSL used as well 29as that of the covered work. */ 30 31#include "wget.h" 32 33#include <stdio.h> 34#include <stdlib.h> 35#include <string.h> 36#ifdef HAVE_UNISTD_H 37# include <unistd.h> 38#endif 39#include <errno.h> 40#include <assert.h> 41 42#include "utils.h" 43#include "url.h" 44#include "host.h" /* for is_valid_ipv6_address */ 45 46#ifdef __VMS 47#include "vms.h" 48#endif /* def __VMS */ 49 50#ifdef TESTING 51#include "test.h" 52#endif 53 54enum { 55 scm_disabled = 1, /* for https when OpenSSL fails to init. */ 56 scm_has_params = 2, /* whether scheme has ;params */ 57 scm_has_query = 4, /* whether scheme has ?query */ 58 scm_has_fragment = 8 /* whether scheme has #fragment */ 59}; 60 61struct scheme_data 62{ 63 /* Short name of the scheme, such as "http" or "ftp". */ 64 const char *name; 65 /* Leading string that identifies the scheme, such as "https://". */ 66 const char *leading_string; 67 /* Default port of the scheme when none is specified. */ 68 int default_port; 69 /* Various flags. */ 70 int flags; 71}; 72 73/* Supported schemes: */ 74static struct scheme_data supported_schemes[] = 75{ 76 { "http", "http://", DEFAULT_HTTP_PORT, scm_has_query|scm_has_fragment }, 77#ifdef HAVE_SSL 78 { "https", "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment }, 79#endif 80 { "ftp", "ftp://", DEFAULT_FTP_PORT, scm_has_params|scm_has_fragment }, 81 82 /* SCHEME_INVALID */ 83 { NULL, NULL, -1, 0 } 84}; 85 86/* Forward declarations: */ 87 88static bool path_simplify (enum url_scheme, char *); 89 90/* Support for escaping and unescaping of URL strings. */ 91 92/* Table of "reserved" and "unsafe" characters. Those terms are 93 rfc1738-speak, as such largely obsoleted by rfc2396 and later 94 specs, but the general idea remains. 95 96 A reserved character is the one that you can't decode without 97 changing the meaning of the URL. For example, you can't decode 98 "/foo/%2f/bar" into "/foo///bar" because the number and contents of 99 path components is different. Non-reserved characters can be 100 changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar". The 101 unsafe characters are loosely based on rfc1738, plus "$" and ",", 102 as recommended by rfc2396, and minus "~", which is very frequently 103 used (and sometimes unrecognized as %7E by broken servers). 104 105 An unsafe character is the one that should be encoded when URLs are 106 placed in foreign environments. E.g. space and newline are unsafe 107 in HTTP contexts because HTTP uses them as separator and line 108 terminator, so they must be encoded to %20 and %0A respectively. 109 "*" is unsafe in shell context, etc. 110 111 We determine whether a character is unsafe through static table 112 lookup. This code assumes ASCII character set and 8-bit chars. */ 113 114enum { 115 /* rfc1738 reserved chars + "$" and ",". */ 116 urlchr_reserved = 1, 117 118 /* rfc1738 unsafe chars, plus non-printables. */ 119 urlchr_unsafe = 2 120}; 121 122#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask)) 123#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved) 124#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe) 125 126/* Shorthands for the table: */ 127#define R urlchr_reserved 128#define U urlchr_unsafe 129#define RU R|U 130 131static const unsigned char urlchr_table[256] = 132{ 133 U, U, U, U, U, U, U, U, /* NUL SOH STX ETX EOT ENQ ACK BEL */ 134 U, U, U, U, U, U, U, U, /* BS HT LF VT FF CR SO SI */ 135 U, U, U, U, U, U, U, U, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ 136 U, U, U, U, U, U, U, U, /* CAN EM SUB ESC FS GS RS US */ 137 U, 0, U, RU, R, U, R, 0, /* SP ! " # $ % & ' */ 138 0, 0, 0, R, R, 0, 0, R, /* ( ) * + , - . / */ 139 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */ 140 0, 0, RU, R, U, R, U, R, /* 8 9 : ; < = > ? */ 141 RU, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */ 142 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */ 143 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */ 144 0, 0, 0, RU, U, RU, U, 0, /* X Y Z [ \ ] ^ _ */ 145 U, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */ 146 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */ 147 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ 148 0, 0, 0, U, U, U, 0, U, /* x y z { | } ~ DEL */ 149 150 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 151 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 152 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 153 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 154 155 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 156 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 157 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 158 U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, U, 159}; 160#undef R 161#undef U 162#undef RU 163 164/* URL-unescape the string S. 165 166 This is done by transforming the sequences "%HH" to the character 167 represented by the hexadecimal digits HH. If % is not followed by 168 two hexadecimal digits, it is inserted literally. 169 170 The transformation is done in place. If you need the original 171 string intact, make a copy before calling this function. */ 172 173static void 174url_unescape (char *s) 175{ 176 char *t = s; /* t - tortoise */ 177 char *h = s; /* h - hare */ 178 179 for (; *h; h++, t++) 180 { 181 if (*h != '%') 182 { 183 copychar: 184 *t = *h; 185 } 186 else 187 { 188 char c; 189 /* Do nothing if '%' is not followed by two hex digits. */ 190 if (!h[1] || !h[2] || !(c_isxdigit (h[1]) && c_isxdigit (h[2]))) 191 goto copychar; 192 c = X2DIGITS_TO_NUM (h[1], h[2]); 193 /* Don't unescape %00 because there is no way to insert it 194 into a C string without effectively truncating it. */ 195 if (c == '\0') 196 goto copychar; 197 *t = c; 198 h += 2; 199 } 200 } 201 *t = '\0'; 202} 203 204/* The core of url_escape_* functions. Escapes the characters that 205 match the provided mask in urlchr_table. 206 207 If ALLOW_PASSTHROUGH is true, a string with no unsafe chars will be 208 returned unchanged. If ALLOW_PASSTHROUGH is false, a freshly 209 allocated string will be returned in all cases. */ 210 211static char * 212url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough) 213{ 214 const char *p1; 215 char *p2, *newstr; 216 int newlen; 217 int addition = 0; 218 219 for (p1 = s; *p1; p1++) 220 if (urlchr_test (*p1, mask)) 221 addition += 2; /* Two more characters (hex digits) */ 222 223 if (!addition) 224 return allow_passthrough ? (char *)s : xstrdup (s); 225 226 newlen = (p1 - s) + addition; 227 newstr = xmalloc (newlen + 1); 228 229 p1 = s; 230 p2 = newstr; 231 while (*p1) 232 { 233 /* Quote the characters that match the test mask. */ 234 if (urlchr_test (*p1, mask)) 235 { 236 unsigned char c = *p1++; 237 *p2++ = '%'; 238 *p2++ = XNUM_TO_DIGIT (c >> 4); 239 *p2++ = XNUM_TO_DIGIT (c & 0xf); 240 } 241 else 242 *p2++ = *p1++; 243 } 244 assert (p2 - newstr == newlen); 245 *p2 = '\0'; 246 247 return newstr; 248} 249 250/* URL-escape the unsafe characters (see urlchr_table) in a given 251 string, returning a freshly allocated string. */ 252 253char * 254url_escape (const char *s) 255{ 256 return url_escape_1 (s, urlchr_unsafe, false); 257} 258 259/* URL-escape the unsafe and reserved characters (see urlchr_table) in 260 a given string, returning a freshly allocated string. */ 261 262char * 263url_escape_unsafe_and_reserved (const char *s) 264{ 265 return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false); 266} 267 268/* URL-escape the unsafe characters (see urlchr_table) in a given 269 string. If no characters are unsafe, S is returned. */ 270 271static char * 272url_escape_allow_passthrough (const char *s) 273{ 274 return url_escape_1 (s, urlchr_unsafe, true); 275} 276 277/* Decide whether the char at position P needs to be encoded. (It is 278 not enough to pass a single char *P because the function may need 279 to inspect the surrounding context.) 280 281 Return true if the char should be escaped as %XX, false otherwise. */ 282 283static inline bool 284char_needs_escaping (const char *p) 285{ 286 if (*p == '%') 287 { 288 if (c_isxdigit (*(p + 1)) && c_isxdigit (*(p + 2))) 289 return false; 290 else 291 /* Garbled %.. sequence: encode `%'. */ 292 return true; 293 } 294 else if (URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p)) 295 return true; 296 else 297 return false; 298} 299 300/* Translate a %-escaped (but possibly non-conformant) input string S 301 into a %-escaped (and conformant) output string. If no characters 302 are encoded or decoded, return the same string S; otherwise, return 303 a freshly allocated string with the new contents. 304 305 After a URL has been run through this function, the protocols that 306 use `%' as the quote character can use the resulting string as-is, 307 while those that don't can use url_unescape to get to the intended 308 data. This function is stable: once the input is transformed, 309 further transformations of the result yield the same output. 310 311 Let's discuss why this function is needed. 312 313 Imagine Wget is asked to retrieve `http://abc.xyz/abc def'. Since 314 a raw space character would mess up the HTTP request, it needs to 315 be quoted, like this: 316 317 GET /abc%20def HTTP/1.0 318 319 It would appear that the unsafe chars need to be quoted, for 320 example with url_escape. But what if we're requested to download 321 `abc%20def'? url_escape transforms "%" to "%25", which would leave 322 us with `abc%2520def'. This is incorrect -- since %-escapes are 323 part of URL syntax, "%20" is the correct way to denote a literal 324 space on the Wget command line. This leads to the conclusion that 325 in that case Wget should not call url_escape, but leave the `%20' 326 as is. This is clearly contradictory, but it only gets worse. 327 328 What if the requested URI is `abc%20 def'? If we call url_escape, 329 we end up with `/abc%2520%20def', which is almost certainly not 330 intended. If we don't call url_escape, we are left with the 331 embedded space and cannot complete the request. What the user 332 meant was for Wget to request `/abc%20%20def', and this is where 333 reencode_escapes kicks in. 334 335 Wget used to solve this by first decoding %-quotes, and then 336 encoding all the "unsafe" characters found in the resulting string. 337 This was wrong because it didn't preserve certain URL special 338 (reserved) characters. For instance, URI containing "a%2B+b" (0x2b 339 == '+') would get translated to "a%2B%2Bb" or "a++b" depending on 340 whether we considered `+' reserved (it is). One of these results 341 is inevitable because by the second step we would lose information 342 on whether the `+' was originally encoded or not. Both results 343 were wrong because in CGI parameters + means space, while %2B means 344 literal plus. reencode_escapes correctly translates the above to 345 "a%2B+b", i.e. returns the original string. 346 347 This function uses a modified version of the algorithm originally 348 proposed by Anon Sricharoenchai: 349 350 * Encode all "unsafe" characters, except those that are also 351 "reserved", to %XX. See urlchr_table for which characters are 352 unsafe and reserved. 353 354 * Encode the "%" characters not followed by two hex digits to 355 "%25". 356 357 * Pass through all other characters and %XX escapes as-is. (Up to 358 Wget 1.10 this decoded %XX escapes corresponding to "safe" 359 characters, but that was obtrusive and broke some servers.) 360 361 Anon's test case: 362 363 "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc" 364 -> 365 "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc" 366 367 Simpler test cases: 368 369 "foo bar" -> "foo%20bar" 370 "foo%20bar" -> "foo%20bar" 371 "foo %20bar" -> "foo%20%20bar" 372 "foo%%20bar" -> "foo%25%20bar" (0x25 == '%') 373 "foo%25%20bar" -> "foo%25%20bar" 374 "foo%2%20bar" -> "foo%252%20bar" 375 "foo+bar" -> "foo+bar" (plus is reserved!) 376 "foo%2b+bar" -> "foo%2b+bar" */ 377 378static char * 379reencode_escapes (const char *s) 380{ 381 const char *p1; 382 char *newstr, *p2; 383 int oldlen, newlen; 384 385 int encode_count = 0; 386 387 /* First pass: inspect the string to see if there's anything to do, 388 and to calculate the new length. */ 389 for (p1 = s; *p1; p1++) 390 if (char_needs_escaping (p1)) 391 ++encode_count; 392 393 if (!encode_count) 394 /* The string is good as it is. */ 395 return (char *) s; /* C const model sucks. */ 396 397 oldlen = p1 - s; 398 /* Each encoding adds two characters (hex digits). */ 399 newlen = oldlen + 2 * encode_count; 400 newstr = xmalloc (newlen + 1); 401 402 /* Second pass: copy the string to the destination address, encoding 403 chars when needed. */ 404 p1 = s; 405 p2 = newstr; 406 407 while (*p1) 408 if (char_needs_escaping (p1)) 409 { 410 unsigned char c = *p1++; 411 *p2++ = '%'; 412 *p2++ = XNUM_TO_DIGIT (c >> 4); 413 *p2++ = XNUM_TO_DIGIT (c & 0xf); 414 } 415 else 416 *p2++ = *p1++; 417 418 *p2 = '\0'; 419 assert (p2 - newstr == newlen); 420 return newstr; 421} 422 423/* Returns the scheme type if the scheme is supported, or 424 SCHEME_INVALID if not. */ 425 426enum url_scheme 427url_scheme (const char *url) 428{ 429 int i; 430 431 for (i = 0; supported_schemes[i].leading_string; i++) 432 if (0 == strncasecmp (url, supported_schemes[i].leading_string, 433 strlen (supported_schemes[i].leading_string))) 434 { 435 if (!(supported_schemes[i].flags & scm_disabled)) 436 return (enum url_scheme) i; 437 else 438 return SCHEME_INVALID; 439 } 440 441 return SCHEME_INVALID; 442} 443 444#define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+') 445 446/* Return 1 if the URL begins with any "scheme", 0 otherwise. As 447 currently implemented, it returns true if URL begins with 448 [-+a-zA-Z0-9]+: . */ 449 450bool 451url_has_scheme (const char *url) 452{ 453 const char *p = url; 454 455 /* The first char must be a scheme char. */ 456 if (!*p || !SCHEME_CHAR (*p)) 457 return false; 458 ++p; 459 /* Followed by 0 or more scheme chars. */ 460 while (*p && SCHEME_CHAR (*p)) 461 ++p; 462 /* Terminated by ':'. */ 463 return *p == ':'; 464} 465 466int 467scheme_default_port (enum url_scheme scheme) 468{ 469 return supported_schemes[scheme].default_port; 470} 471 472void 473scheme_disable (enum url_scheme scheme) 474{ 475 supported_schemes[scheme].flags |= scm_disabled; 476} 477 478/* Skip the username and password, if present in the URL. The 479 function should *not* be called with the complete URL, but with the 480 portion after the scheme. 481 482 If no username and password are found, return URL. */ 483 484static const char * 485url_skip_credentials (const char *url) 486{ 487 /* Look for '@' that comes before terminators, such as '/', '?', 488 '#', or ';'. */ 489 const char *p = (const char *)strpbrk (url, "@/?#;"); 490 if (!p || *p != '@') 491 return url; 492 return p + 1; 493} 494 495/* Parse credentials contained in [BEG, END). The region is expected 496 to have come from a URL and is unescaped. */ 497 498static bool 499parse_credentials (const char *beg, const char *end, char **user, char **passwd) 500{ 501 char *colon; 502 const char *userend; 503 504 if (beg == end) 505 return false; /* empty user name */ 506 507 colon = memchr (beg, ':', end - beg); 508 if (colon == beg) 509 return false; /* again empty user name */ 510 511 if (colon) 512 { 513 *passwd = strdupdelim (colon + 1, end); 514 userend = colon; 515 url_unescape (*passwd); 516 } 517 else 518 { 519 *passwd = NULL; 520 userend = end; 521 } 522 *user = strdupdelim (beg, userend); 523 url_unescape (*user); 524 return true; 525} 526 527/* Used by main.c: detect URLs written using the "shorthand" URL forms 528 originally popularized by Netscape and NcFTP. HTTP shorthands look 529 like this: 530 531 www.foo.com[:port]/dir/file -> http://www.foo.com[:port]/dir/file 532 www.foo.com[:port] -> http://www.foo.com[:port] 533 534 FTP shorthands look like this: 535 536 foo.bar.com:dir/file -> ftp://foo.bar.com/dir/file 537 foo.bar.com:/absdir/file -> ftp://foo.bar.com//absdir/file 538 539 If the URL needs not or cannot be rewritten, return NULL. */ 540 541char * 542rewrite_shorthand_url (const char *url) 543{ 544 const char *p; 545 char *ret; 546 547 if (url_scheme (url) != SCHEME_INVALID) 548 return NULL; 549 550 /* Look for a ':' or '/'. The former signifies NcFTP syntax, the 551 latter Netscape. */ 552 p = strpbrk (url, ":/"); 553 if (p == url) 554 return NULL; 555 556 /* If we're looking at "://", it means the URL uses a scheme we 557 don't support, which may include "https" when compiled without 558 SSL support. Don't bogusly rewrite such URLs. */ 559 if (p && p[0] == ':' && p[1] == '/' && p[2] == '/') 560 return NULL; 561 562 if (p && *p == ':') 563 { 564 /* Colon indicates ftp, as in foo.bar.com:path. Check for 565 special case of http port number ("localhost:10000"). */ 566 int digits = strspn (p + 1, "0123456789"); 567 if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0')) 568 goto http; 569 570 /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */ 571 ret = aprintf ("ftp://%s", url); 572 ret[6 + (p - url)] = '/'; 573 } 574 else 575 { 576 http: 577 /* Just prepend "http://" to URL. */ 578 ret = aprintf ("http://%s", url); 579 } 580 return ret; 581} 582 583static void split_path (const char *, char **, char **); 584 585/* Like strpbrk, with the exception that it returns the pointer to the 586 terminating zero (end-of-string aka "eos") if no matching character 587 is found. */ 588 589static inline char * 590strpbrk_or_eos (const char *s, const char *accept) 591{ 592 char *p = strpbrk (s, accept); 593 if (!p) 594 p = strchr (s, '\0'); 595 return p; 596} 597 598/* Turn STR into lowercase; return true if a character was actually 599 changed. */ 600 601static bool 602lowercase_str (char *str) 603{ 604 bool changed = false; 605 for (; *str; str++) 606 if (c_isupper (*str)) 607 { 608 changed = true; 609 *str = c_tolower (*str); 610 } 611 return changed; 612} 613 614static const char * 615init_seps (enum url_scheme scheme) 616{ 617 static char seps[8] = ":/"; 618 char *p = seps + 2; 619 int flags = supported_schemes[scheme].flags; 620 621 if (flags & scm_has_params) 622 *p++ = ';'; 623 if (flags & scm_has_query) 624 *p++ = '?'; 625 if (flags & scm_has_fragment) 626 *p++ = '#'; 627 *p++ = '\0'; 628 return seps; 629} 630 631static const char *parse_errors[] = { 632#define PE_NO_ERROR 0 633 N_("No error"), 634#define PE_UNSUPPORTED_SCHEME 1 635 N_("Unsupported scheme %s"), /* support for format token only here */ 636#define PE_MISSING_SCHEME 2 637 N_("Scheme missing"), 638#define PE_INVALID_HOST_NAME 3 639 N_("Invalid host name"), 640#define PE_BAD_PORT_NUMBER 4 641 N_("Bad port number"), 642#define PE_INVALID_USER_NAME 5 643 N_("Invalid user name"), 644#define PE_UNTERMINATED_IPV6_ADDRESS 6 645 N_("Unterminated IPv6 numeric address"), 646#define PE_IPV6_NOT_SUPPORTED 7 647 N_("IPv6 addresses not supported"), 648#define PE_INVALID_IPV6_ADDRESS 8 649 N_("Invalid IPv6 numeric address") 650}; 651 652/* Parse a URL. 653 654 Return a new struct url if successful, NULL on error. In case of 655 error, and if ERROR is not NULL, also set *ERROR to the appropriate 656 error code. */ 657struct url * 658url_parse (const char *url, int *error, struct iri *iri, bool percent_encode) 659{ 660 struct url *u; 661 const char *p; 662 bool path_modified, host_modified; 663 664 enum url_scheme scheme; 665 const char *seps; 666 667 const char *uname_b, *uname_e; 668 const char *host_b, *host_e; 669 const char *path_b, *path_e; 670 const char *params_b, *params_e; 671 const char *query_b, *query_e; 672 const char *fragment_b, *fragment_e; 673 674 int port; 675 char *user = NULL, *passwd = NULL; 676 677 const char *url_encoded = NULL; 678 char *new_url = NULL; 679 680 int error_code; 681 682 scheme = url_scheme (url); 683 if (scheme == SCHEME_INVALID) 684 { 685 if (url_has_scheme (url)) 686 error_code = PE_UNSUPPORTED_SCHEME; 687 else 688 error_code = PE_MISSING_SCHEME; 689 goto error; 690 } 691 692 if (iri && iri->utf8_encode) 693 { 694 iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url); 695 if (!iri->utf8_encode) 696 new_url = NULL; 697 else 698 iri->orig_url = xstrdup (url); 699 } 700 701 /* XXX XXX Could that change introduce (security) bugs ??? XXX XXX*/ 702 if (percent_encode) 703 url_encoded = reencode_escapes (new_url ? new_url : url); 704 else 705 url_encoded = new_url ? new_url : url; 706 707 p = url_encoded; 708 709 if (new_url && url_encoded != new_url) 710 xfree (new_url); 711 712 p += strlen (supported_schemes[scheme].leading_string); 713 uname_b = p; 714 p = url_skip_credentials (p); 715 uname_e = p; 716 717 /* scheme://user:pass@host[:port]... */ 718 /* ^ */ 719 720 /* We attempt to break down the URL into the components path, 721 params, query, and fragment. They are ordered like this: 722 723 scheme://host[:port][/path][;params][?query][#fragment] */ 724 725 path_b = path_e = NULL; 726 params_b = params_e = NULL; 727 query_b = query_e = NULL; 728 fragment_b = fragment_e = NULL; 729 730 /* Initialize separators for optional parts of URL, depending on the 731 scheme. For example, FTP has params, and HTTP and HTTPS have 732 query string and fragment. */ 733 seps = init_seps (scheme); 734 735 host_b = p; 736 737 if (*p == '[') 738 { 739 /* Handle IPv6 address inside square brackets. Ideally we'd 740 just look for the terminating ']', but rfc2732 mandates 741 rejecting invalid IPv6 addresses. */ 742 743 /* The address begins after '['. */ 744 host_b = p + 1; 745 host_e = strchr (host_b, ']'); 746 747 if (!host_e) 748 { 749 error_code = PE_UNTERMINATED_IPV6_ADDRESS; 750 goto error; 751 } 752 753#ifdef ENABLE_IPV6 754 /* Check if the IPv6 address is valid. */ 755 if (!is_valid_ipv6_address(host_b, host_e)) 756 { 757 error_code = PE_INVALID_IPV6_ADDRESS; 758 goto error; 759 } 760 761 /* Continue parsing after the closing ']'. */ 762 p = host_e + 1; 763#else 764 error_code = PE_IPV6_NOT_SUPPORTED; 765 goto error; 766#endif 767 768 /* The closing bracket must be followed by a separator or by the 769 null char. */ 770 /* http://[::1]... */ 771 /* ^ */ 772 if (!strchr (seps, *p)) 773 { 774 /* Trailing garbage after []-delimited IPv6 address. */ 775 error_code = PE_INVALID_HOST_NAME; 776 goto error; 777 } 778 } 779 else 780 { 781 p = strpbrk_or_eos (p, seps); 782 host_e = p; 783 } 784 ++seps; /* advance to '/' */ 785 786 if (host_b == host_e) 787 { 788 error_code = PE_INVALID_HOST_NAME; 789 goto error; 790 } 791 792 port = scheme_default_port (scheme); 793 if (*p == ':') 794 { 795 const char *port_b, *port_e, *pp; 796 797 /* scheme://host:port/tralala */ 798 /* ^ */ 799 ++p; 800 port_b = p; 801 p = strpbrk_or_eos (p, seps); 802 port_e = p; 803 804 /* Allow empty port, as per rfc2396. */ 805 if (port_b != port_e) 806 for (port = 0, pp = port_b; pp < port_e; pp++) 807 { 808 if (!c_isdigit (*pp)) 809 { 810 /* http://host:12randomgarbage/blah */ 811 /* ^ */ 812 error_code = PE_BAD_PORT_NUMBER; 813 goto error; 814 } 815 port = 10 * port + (*pp - '0'); 816 /* Check for too large port numbers here, before we have 817 a chance to overflow on bogus port values. */ 818 if (port > 0xffff) 819 { 820 error_code = PE_BAD_PORT_NUMBER; 821 goto error; 822 } 823 } 824 } 825 /* Advance to the first separator *after* '/' (either ';' or '?', 826 depending on the scheme). */ 827 ++seps; 828 829 /* Get the optional parts of URL, each part being delimited by 830 current location and the position of the next separator. */ 831#define GET_URL_PART(sepchar, var) do { \ 832 if (*p == sepchar) \ 833 var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps); \ 834 ++seps; \ 835} while (0) 836 837 GET_URL_PART ('/', path); 838 if (supported_schemes[scheme].flags & scm_has_params) 839 GET_URL_PART (';', params); 840 if (supported_schemes[scheme].flags & scm_has_query) 841 GET_URL_PART ('?', query); 842 if (supported_schemes[scheme].flags & scm_has_fragment) 843 GET_URL_PART ('#', fragment); 844 845#undef GET_URL_PART 846 assert (*p == 0); 847 848 if (uname_b != uname_e) 849 { 850 /* http://user:pass@host */ 851 /* ^ ^ */ 852 /* uname_b uname_e */ 853 if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd)) 854 { 855 error_code = PE_INVALID_USER_NAME; 856 goto error; 857 } 858 } 859 860 u = xnew0 (struct url); 861 u->scheme = scheme; 862 u->host = strdupdelim (host_b, host_e); 863 u->port = port; 864 u->user = user; 865 u->passwd = passwd; 866 867 u->path = strdupdelim (path_b, path_e); 868 path_modified = path_simplify (scheme, u->path); 869 split_path (u->path, &u->dir, &u->file); 870 871 host_modified = lowercase_str (u->host); 872 873 /* Decode %HH sequences in host name. This is important not so much 874 to support %HH sequences in host names (which other browser 875 don't), but to support binary characters (which will have been 876 converted to %HH by reencode_escapes). */ 877 if (strchr (u->host, '%')) 878 { 879 url_unescape (u->host); 880 host_modified = true; 881 882 /* Apply IDNA regardless of iri->utf8_encode status */ 883 if (opt.enable_iri && iri) 884 { 885 char *new = idn_encode (iri, u->host); 886 if (new) 887 { 888 xfree (u->host); 889 u->host = new; 890 host_modified = true; 891 } 892 } 893 } 894 895 if (params_b) 896 u->params = strdupdelim (params_b, params_e); 897 if (query_b) 898 u->query = strdupdelim (query_b, query_e); 899 if (fragment_b) 900 u->fragment = strdupdelim (fragment_b, fragment_e); 901 902 if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e) 903 { 904 /* If we suspect that a transformation has rendered what 905 url_string might return different from URL_ENCODED, rebuild 906 u->url using url_string. */ 907 u->url = url_string (u, URL_AUTH_SHOW); 908 909 if (url_encoded != url) 910 xfree ((char *) url_encoded); 911 } 912 else 913 { 914 if (url_encoded == url) 915 u->url = xstrdup (url); 916 else 917 u->url = (char *) url_encoded; 918 } 919 920 return u; 921 922 error: 923 /* Cleanup in case of error: */ 924 if (url_encoded && url_encoded != url) 925 xfree ((char *) url_encoded); 926 927 /* Transmit the error code to the caller, if the caller wants to 928 know. */ 929 if (error) 930 *error = error_code; 931 return NULL; 932} 933 934/* Return the error message string from ERROR_CODE, which should have 935 been retrieved from url_parse. The error message is translated. */ 936 937char * 938url_error (const char *url, int error_code) 939{ 940 assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors)); 941 942 if (error_code == PE_UNSUPPORTED_SCHEME) 943 { 944 char *error, *p; 945 char *scheme = xstrdup (url); 946 assert (url_has_scheme (url)); 947 948 if ((p = strchr (scheme, ':'))) 949 *p = '\0'; 950 if (!strcasecmp (scheme, "https")) 951 error = aprintf (_("HTTPS support not compiled in")); 952 else 953 error = aprintf (_(parse_errors[error_code]), quote (scheme)); 954 xfree (scheme); 955 956 return error; 957 } 958 else 959 return xstrdup (_(parse_errors[error_code])); 960} 961 962/* Split PATH into DIR and FILE. PATH comes from the URL and is 963 expected to be URL-escaped. 964 965 The path is split into directory (the part up to the last slash) 966 and file (the part after the last slash), which are subsequently 967 unescaped. Examples: 968 969 PATH DIR FILE 970 "foo/bar/baz" "foo/bar" "baz" 971 "foo/bar/" "foo/bar" "" 972 "foo" "" "foo" 973 "foo/bar/baz%2fqux" "foo/bar" "baz/qux" (!) 974 975 DIR and FILE are freshly allocated. */ 976 977static void 978split_path (const char *path, char **dir, char **file) 979{ 980 char *last_slash = strrchr (path, '/'); 981 if (!last_slash) 982 { 983 *dir = xstrdup (""); 984 *file = xstrdup (path); 985 } 986 else 987 { 988 *dir = strdupdelim (path, last_slash); 989 *file = xstrdup (last_slash + 1); 990 } 991 url_unescape (*dir); 992 url_unescape (*file); 993} 994 995/* Note: URL's "full path" is the path with the query string and 996 params appended. The "fragment" (#foo) is intentionally ignored, 997 but that might be changed. For example, if the original URL was 998 "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment", 999 the full path will be "/foo/bar/baz;bullshit?querystring". */ 1000 1001/* Return the length of the full path, without the terminating 1002 zero. */ 1003 1004static int 1005full_path_length (const struct url *url) 1006{ 1007 int len = 0; 1008 1009#define FROB(el) if (url->el) len += 1 + strlen (url->el) 1010 1011 FROB (path); 1012 FROB (params); 1013 FROB (query); 1014 1015#undef FROB 1016 1017 return len; 1018} 1019 1020/* Write out the full path. */ 1021 1022static void 1023full_path_write (const struct url *url, char *where) 1024{ 1025#define FROB(el, chr) do { \ 1026 char *f_el = url->el; \ 1027 if (f_el) { \ 1028 int l = strlen (f_el); \ 1029 *where++ = chr; \ 1030 memcpy (where, f_el, l); \ 1031 where += l; \ 1032 } \ 1033} while (0) 1034 1035 FROB (path, '/'); 1036 FROB (params, ';'); 1037 FROB (query, '?'); 1038 1039#undef FROB 1040} 1041 1042/* Public function for getting the "full path". E.g. if u->path is 1043 "foo/bar" and u->query is "param=value", full_path will be 1044 "/foo/bar?param=value". */ 1045 1046char * 1047url_full_path (const struct url *url) 1048{ 1049 int length = full_path_length (url); 1050 char *full_path = xmalloc (length + 1); 1051 1052 full_path_write (url, full_path); 1053 full_path[length] = '\0'; 1054 1055 return full_path; 1056} 1057 1058/* Unescape CHR in an otherwise escaped STR. Used to selectively 1059 escaping of certain characters, such as "/" and ":". Returns a 1060 count of unescaped chars. */ 1061 1062static void 1063unescape_single_char (char *str, char chr) 1064{ 1065 const char c1 = XNUM_TO_DIGIT (chr >> 4); 1066 const char c2 = XNUM_TO_DIGIT (chr & 0xf); 1067 char *h = str; /* hare */ 1068 char *t = str; /* tortoise */ 1069 for (; *h; h++, t++) 1070 { 1071 if (h[0] == '%' && h[1] == c1 && h[2] == c2) 1072 { 1073 *t = chr; 1074 h += 2; 1075 } 1076 else 1077 *t = *h; 1078 } 1079 *t = '\0'; 1080} 1081 1082/* Escape unsafe and reserved characters, except for the slash 1083 characters. */ 1084 1085static char * 1086url_escape_dir (const char *dir) 1087{ 1088 char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1); 1089 if (newdir == dir) 1090 return (char *)dir; 1091 1092 unescape_single_char (newdir, '/'); 1093 return newdir; 1094} 1095 1096/* Sync u->path and u->url with u->dir and u->file. Called after 1097 u->file or u->dir have been changed, typically by the FTP code. */ 1098 1099static void 1100sync_path (struct url *u) 1101{ 1102 char *newpath, *efile, *edir; 1103 1104 xfree (u->path); 1105 1106 /* u->dir and u->file are not escaped. URL-escape them before 1107 reassembling them into u->path. That way, if they contain 1108 separators like '?' or even if u->file contains slashes, the 1109 path will be correctly assembled. (u->file can contain slashes 1110 if the URL specifies it with %2f, or if an FTP server returns 1111 it.) */ 1112 edir = url_escape_dir (u->dir); 1113 efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1); 1114 1115 if (!*edir) 1116 newpath = xstrdup (efile); 1117 else 1118 { 1119 int dirlen = strlen (edir); 1120 int filelen = strlen (efile); 1121 1122 /* Copy "DIR/FILE" to newpath. */ 1123 char *p = newpath = xmalloc (dirlen + 1 + filelen + 1); 1124 memcpy (p, edir, dirlen); 1125 p += dirlen; 1126 *p++ = '/'; 1127 memcpy (p, efile, filelen); 1128 p += filelen; 1129 *p = '\0'; 1130 } 1131 1132 u->path = newpath; 1133 1134 if (edir != u->dir) 1135 xfree (edir); 1136 if (efile != u->file) 1137 xfree (efile); 1138 1139 /* Regenerate u->url as well. */ 1140 xfree (u->url); 1141 u->url = url_string (u, URL_AUTH_SHOW); 1142} 1143 1144/* Mutators. Code in ftp.c insists on changing u->dir and u->file. 1145 This way we can sync u->path and u->url when they get changed. */ 1146 1147void 1148url_set_dir (struct url *url, const char *newdir) 1149{ 1150 xfree (url->dir); 1151 url->dir = xstrdup (newdir); 1152 sync_path (url); 1153} 1154 1155void 1156url_set_file (struct url *url, const char *newfile) 1157{ 1158 xfree (url->file); 1159 url->file = xstrdup (newfile); 1160 sync_path (url); 1161} 1162 1163void 1164url_free (struct url *url) 1165{ 1166 xfree (url->host); 1167 xfree (url->path); 1168 xfree (url->url); 1169 1170 xfree_null (url->params); 1171 xfree_null (url->query); 1172 xfree_null (url->fragment); 1173 xfree_null (url->user); 1174 xfree_null (url->passwd); 1175 1176 xfree (url->dir); 1177 xfree (url->file); 1178 1179 xfree (url); 1180} 1181 1182/* Create all the necessary directories for PATH (a file). Calls 1183 make_directory internally. */ 1184int 1185mkalldirs (const char *path) 1186{ 1187 const char *p; 1188 char *t; 1189 struct_stat st; 1190 int res; 1191 1192 p = path + strlen (path); 1193 for (; *p != '/' && p != path; p--) 1194 ; 1195 1196 /* Don't create if it's just a file. */ 1197 if ((p == path) && (*p != '/')) 1198 return 0; 1199 t = strdupdelim (path, p); 1200 1201 /* Check whether the directory exists. */ 1202 if ((stat (t, &st) == 0)) 1203 { 1204 if (S_ISDIR (st.st_mode)) 1205 { 1206 xfree (t); 1207 return 0; 1208 } 1209 else 1210 { 1211 /* If the dir exists as a file name, remove it first. This 1212 is *only* for Wget to work with buggy old CERN http 1213 servers. Here is the scenario: When Wget tries to 1214 retrieve a directory without a slash, e.g. 1215 http://foo/bar (bar being a directory), CERN server will 1216 not redirect it too http://foo/bar/ -- it will generate a 1217 directory listing containing links to bar/file1, 1218 bar/file2, etc. Wget will lose because it saves this 1219 HTML listing to a file `bar', so it cannot create the 1220 directory. To work around this, if the file of the same 1221 name exists, we just remove it and create the directory 1222 anyway. */ 1223 DEBUGP (("Removing %s because of directory danger!\n", t)); 1224 unlink (t); 1225 } 1226 } 1227 res = make_directory (t); 1228 if (res != 0) 1229 logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno)); 1230 xfree (t); 1231 return res; 1232} 1233 1234/* Functions for constructing the file name out of URL components. */ 1235 1236/* A growable string structure, used by url_file_name and friends. 1237 This should perhaps be moved to utils.c. 1238 1239 The idea is to have a convenient and efficient way to construct a 1240 string by having various functions append data to it. Instead of 1241 passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the 1242 functions in questions, we pass the pointer to this struct. */ 1243 1244struct growable { 1245 char *base; 1246 int size; 1247 int tail; 1248}; 1249 1250/* Ensure that the string can accept APPEND_COUNT more characters past 1251 the current TAIL position. If necessary, this will grow the string 1252 and update its allocated size. If the string is already large 1253 enough to take TAIL+APPEND_COUNT characters, this does nothing. */ 1254#define GROW(g, append_size) do { \ 1255 struct growable *G_ = g; \ 1256 DO_REALLOC (G_->base, G_->size, G_->tail + append_size, char); \ 1257} while (0) 1258 1259/* Return the tail position of the string. */ 1260#define TAIL(r) ((r)->base + (r)->tail) 1261 1262/* Move the tail position by APPEND_COUNT characters. */ 1263#define TAIL_INCR(r, append_count) ((r)->tail += append_count) 1264 1265/* Append the string STR to DEST. NOTICE: the string in DEST is not 1266 terminated. */ 1267 1268static void 1269append_string (const char *str, struct growable *dest) 1270{ 1271 int l = strlen (str); 1272 GROW (dest, l); 1273 memcpy (TAIL (dest), str, l); 1274 TAIL_INCR (dest, l); 1275} 1276 1277/* Append CH to DEST. For example, append_char (0, DEST) 1278 zero-terminates DEST. */ 1279 1280static void 1281append_char (char ch, struct growable *dest) 1282{ 1283 GROW (dest, 1); 1284 *TAIL (dest) = ch; 1285 TAIL_INCR (dest, 1); 1286} 1287 1288enum { 1289 filechr_not_unix = 1, /* unusable on Unix, / and \0 */ 1290 filechr_not_windows = 2, /* unusable on Windows, one of \|/<>?:*" */ 1291 filechr_control = 4 /* a control character, e.g. 0-31 */ 1292}; 1293 1294#define FILE_CHAR_TEST(c, mask) \ 1295 ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \ 1296 (filechr_table[(unsigned char)(c)] & (mask))) 1297 1298/* Shorthands for the table: */ 1299#define U filechr_not_unix 1300#define W filechr_not_windows 1301#define C filechr_control 1302 1303#define UW U|W 1304#define UWC U|W|C 1305 1306/* Table of characters unsafe under various conditions (see above). 1307 1308 Arguably we could also claim `%' to be unsafe, since we use it as 1309 the escape character. If we ever want to be able to reliably 1310 translate file name back to URL, this would become important 1311 crucial. Right now, it's better to be minimal in escaping. */ 1312 1313static const unsigned char filechr_table[256] = 1314{ 1315UWC, C, C, C, C, C, C, C, /* NUL SOH STX ETX EOT ENQ ACK BEL */ 1316 C, C, C, C, C, C, C, C, /* BS HT LF VT FF CR SO SI */ 1317 C, C, C, C, C, C, C, C, /* DLE DC1 DC2 DC3 DC4 NAK SYN ETB */ 1318 C, C, C, C, C, C, C, C, /* CAN EM SUB ESC FS GS RS US */ 1319 0, 0, W, 0, 0, 0, 0, 0, /* SP ! " # $ % & ' */ 1320 0, 0, W, 0, 0, 0, 0, UW, /* ( ) * + , - . / */ 1321 0, 0, 0, 0, 0, 0, 0, 0, /* 0 1 2 3 4 5 6 7 */ 1322 0, 0, W, 0, W, 0, W, W, /* 8 9 : ; < = > ? */ 1323 0, 0, 0, 0, 0, 0, 0, 0, /* @ A B C D E F G */ 1324 0, 0, 0, 0, 0, 0, 0, 0, /* H I J K L M N O */ 1325 0, 0, 0, 0, 0, 0, 0, 0, /* P Q R S T U V W */ 1326 0, 0, 0, 0, W, 0, 0, 0, /* X Y Z [ \ ] ^ _ */ 1327 0, 0, 0, 0, 0, 0, 0, 0, /* ` a b c d e f g */ 1328 0, 0, 0, 0, 0, 0, 0, 0, /* h i j k l m n o */ 1329 0, 0, 0, 0, 0, 0, 0, 0, /* p q r s t u v w */ 1330 0, 0, 0, 0, W, 0, 0, C, /* x y z { | } ~ DEL */ 1331 1332 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 128-143 */ 1333 C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, /* 144-159 */ 1334 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1335 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1336 1337 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1338 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1339 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1340 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1341}; 1342#undef U 1343#undef W 1344#undef C 1345#undef UW 1346#undef UWC 1347 1348/* FN_PORT_SEP is the separator between host and port in file names 1349 for non-standard port numbers. On Unix this is normally ':', as in 1350 "www.xemacs.org:4001/index.html". Under Windows, we set it to + 1351 because Windows can't handle ':' in file names. */ 1352#define FN_PORT_SEP (opt.restrict_files_os != restrict_windows ? ':' : '+') 1353 1354/* FN_QUERY_SEP is the separator between the file name and the URL 1355 query, normally '?'. Since Windows cannot handle '?' as part of 1356 file name, we use '@' instead there. */ 1357#define FN_QUERY_SEP (opt.restrict_files_os != restrict_windows ? '?' : '@') 1358 1359/* Quote path element, characters in [b, e), as file name, and append 1360 the quoted string to DEST. Each character is quoted as per 1361 file_unsafe_char and the corresponding table. 1362 1363 If ESCAPED is true, the path element is considered to be 1364 URL-escaped and will be unescaped prior to inspection. */ 1365 1366static void 1367append_uri_pathel (const char *b, const char *e, bool escaped, 1368 struct growable *dest) 1369{ 1370 const char *p; 1371 int quoted, outlen; 1372 1373 int mask; 1374 if (opt.restrict_files_os == restrict_unix) 1375 mask = filechr_not_unix; 1376 else 1377 mask = filechr_not_windows; 1378 if (opt.restrict_files_ctrl) 1379 mask |= filechr_control; 1380 1381 /* Copy [b, e) to PATHEL and URL-unescape it. */ 1382 if (escaped) 1383 { 1384 char *unescaped; 1385 BOUNDED_TO_ALLOCA (b, e, unescaped); 1386 url_unescape (unescaped); 1387 b = unescaped; 1388 e = unescaped + strlen (unescaped); 1389 } 1390 1391 /* Defang ".." when found as component of path. Remember that path 1392 comes from the URL and might contain malicious input. */ 1393 if (e - b == 2 && b[0] == '.' && b[1] == '.') 1394 { 1395 b = "%2E%2E"; 1396 e = b + 6; 1397 } 1398 1399 /* Walk the PATHEL string and check how many characters we'll need 1400 to quote. */ 1401 quoted = 0; 1402 for (p = b; p < e; p++) 1403 if (FILE_CHAR_TEST (*p, mask)) 1404 ++quoted; 1405 1406 /* Calculate the length of the output string. e-b is the input 1407 string length. Each quoted char introduces two additional 1408 characters in the string, hence 2*quoted. */ 1409 outlen = (e - b) + (2 * quoted); 1410 GROW (dest, outlen); 1411 1412 if (!quoted) 1413 { 1414 /* If there's nothing to quote, we can simply append the string 1415 without processing it again. */ 1416 memcpy (TAIL (dest), b, outlen); 1417 } 1418 else 1419 { 1420 char *q = TAIL (dest); 1421 for (p = b; p < e; p++) 1422 { 1423 if (!FILE_CHAR_TEST (*p, mask)) 1424 *q++ = *p; 1425 else 1426 { 1427 unsigned char ch = *p; 1428 *q++ = '%'; 1429 *q++ = XNUM_TO_DIGIT (ch >> 4); 1430 *q++ = XNUM_TO_DIGIT (ch & 0xf); 1431 } 1432 } 1433 assert (q - TAIL (dest) == outlen); 1434 } 1435 1436 /* Perform inline case transformation if required. */ 1437 if (opt.restrict_files_case == restrict_lowercase 1438 || opt.restrict_files_case == restrict_uppercase) 1439 { 1440 char *q; 1441 for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q) 1442 { 1443 if (opt.restrict_files_case == restrict_lowercase) 1444 *q = c_tolower (*q); 1445 else 1446 *q = c_toupper (*q); 1447 } 1448 } 1449 1450 TAIL_INCR (dest, outlen); 1451} 1452 1453/* Append to DEST the directory structure that corresponds the 1454 directory part of URL's path. For example, if the URL is 1455 http://server/dir1/dir2/file, this appends "/dir1/dir2". 1456 1457 Each path element ("dir1" and "dir2" in the above example) is 1458 examined, url-unescaped, and re-escaped as file name element. 1459 1460 Additionally, it cuts as many directories from the path as 1461 specified by opt.cut_dirs. For example, if opt.cut_dirs is 1, it 1462 will produce "bar" for the above example. For 2 or more, it will 1463 produce "". 1464 1465 Each component of the path is quoted for use as file name. */ 1466 1467static void 1468append_dir_structure (const struct url *u, struct growable *dest) 1469{ 1470 char *pathel, *next; 1471 int cut = opt.cut_dirs; 1472 1473 /* Go through the path components, de-URL-quote them, and quote them 1474 (if necessary) as file names. */ 1475 1476 pathel = u->path; 1477 for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1) 1478 { 1479 if (cut-- > 0) 1480 continue; 1481 if (pathel == next) 1482 /* Ignore empty pathels. */ 1483 continue; 1484 1485 if (dest->tail) 1486 append_char ('/', dest); 1487 append_uri_pathel (pathel, next, true, dest); 1488 } 1489} 1490 1491/* Return a unique file name that matches the given URL as good as 1492 possible. Does not create directories on the file system. */ 1493 1494char * 1495url_file_name (const struct url *u) 1496{ 1497 struct growable fnres; /* stands for "file name result" */ 1498 1499 const char *u_file, *u_query; 1500 char *fname, *unique; 1501 char *index_filename = "index.html"; /* The default index file is index.html */ 1502 1503 fnres.base = NULL; 1504 fnres.size = 0; 1505 fnres.tail = 0; 1506 1507 /* If an alternative index file was defined, change index_filename */ 1508 if (opt.default_page) 1509 index_filename = opt.default_page; 1510 1511 1512 /* Start with the directory prefix, if specified. */ 1513 if (opt.dir_prefix) 1514 append_string (opt.dir_prefix, &fnres); 1515 1516 /* If "dirstruct" is turned on (typically the case with -r), add 1517 the host and port (unless those have been turned off) and 1518 directory structure. */ 1519 if (opt.dirstruct) 1520 { 1521 if (opt.protocol_directories) 1522 { 1523 if (fnres.tail) 1524 append_char ('/', &fnres); 1525 append_string (supported_schemes[u->scheme].name, &fnres); 1526 } 1527 if (opt.add_hostdir) 1528 { 1529 if (fnres.tail) 1530 append_char ('/', &fnres); 1531 if (0 != strcmp (u->host, "..")) 1532 append_string (u->host, &fnres); 1533 else 1534 /* Host name can come from the network; malicious DNS may 1535 allow ".." to be resolved, causing us to write to 1536 "../<file>". Defang such host names. */ 1537 append_string ("%2E%2E", &fnres); 1538 if (u->port != scheme_default_port (u->scheme)) 1539 { 1540 char portstr[24]; 1541 number_to_string (portstr, u->port); 1542 append_char (FN_PORT_SEP, &fnres); 1543 append_string (portstr, &fnres); 1544 } 1545 } 1546 1547 append_dir_structure (u, &fnres); 1548 } 1549 1550 /* Add the file name. */ 1551 if (fnres.tail) 1552 append_char ('/', &fnres); 1553 u_file = *u->file ? u->file : index_filename; 1554 append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres); 1555 1556 /* Append "?query" to the file name. */ 1557 u_query = u->query && *u->query ? u->query : NULL; 1558 if (u_query) 1559 { 1560 append_char (FN_QUERY_SEP, &fnres); 1561 append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres); 1562 } 1563 1564 /* Zero-terminate the file name. */ 1565 append_char ('\0', &fnres); 1566 1567 fname = fnres.base; 1568 1569 /* Check the cases in which the unique extensions are not used: 1570 1) Clobbering is turned off (-nc). 1571 2) Retrieval with regetting. 1572 3) Timestamping is used. 1573 4) Hierarchy is built. 1574 1575 The exception is the case when file does exist and is a 1576 directory (see `mkalldirs' for explanation). */ 1577 1578 if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct) 1579 && !(file_exists_p (fname) && !file_non_directory_p (fname))) 1580 { 1581 unique = fname; 1582 } 1583 else 1584 { 1585 unique = unique_name (fname, true); 1586 if (unique != fname) 1587 xfree (fname); 1588 } 1589 1590/* On VMS, alter the name as required. */ 1591#ifdef __VMS 1592 { 1593 char *unique2; 1594 1595 unique2 = ods_conform( unique); 1596 if (unique2 != unique) 1597 { 1598 xfree (unique); 1599 unique = unique2; 1600 } 1601 } 1602#endif /* def __VMS */ 1603 1604 return unique; 1605} 1606 1607/* Resolve "." and ".." elements of PATH by destructively modifying 1608 PATH and return true if PATH has been modified, false otherwise. 1609 1610 The algorithm is in spirit similar to the one described in rfc1808, 1611 although implemented differently, in one pass. To recap, path 1612 elements containing only "." are removed, and ".." is taken to mean 1613 "back up one element". Single leading and trailing slashes are 1614 preserved. 1615 1616 For example, "a/b/c/./../d/.." will yield "a/b/". More exhaustive 1617 test examples are provided below. If you change anything in this 1618 function, run test_path_simplify to make sure you haven't broken a 1619 test case. */ 1620 1621static bool 1622path_simplify (enum url_scheme scheme, char *path) 1623{ 1624 char *h = path; /* hare */ 1625 char *t = path; /* tortoise */ 1626 char *beg = path; 1627 char *end = strchr (path, '\0'); 1628 1629 while (h < end) 1630 { 1631 /* Hare should be at the beginning of a path element. */ 1632 1633 if (h[0] == '.' && (h[1] == '/' || h[1] == '\0')) 1634 { 1635 /* Ignore "./". */ 1636 h += 2; 1637 } 1638 else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0')) 1639 { 1640 /* Handle "../" by retreating the tortoise by one path 1641 element -- but not past beggining. */ 1642 if (t > beg) 1643 { 1644 /* Move backwards until T hits the beginning of the 1645 previous path element or the beginning of path. */ 1646 for (--t; t > beg && t[-1] != '/'; t--) 1647 ; 1648 } 1649 else if (scheme == SCHEME_FTP) 1650 { 1651 /* If we're at the beginning, copy the "../" literally 1652 and move the beginning so a later ".." doesn't remove 1653 it. This violates RFC 3986; but we do it for FTP 1654 anyway because there is otherwise no way to get at a 1655 parent directory, when the FTP server drops us in a 1656 non-root directory (which is not uncommon). */ 1657 beg = t + 3; 1658 goto regular; 1659 } 1660 h += 3; 1661 } 1662 else 1663 { 1664 regular: 1665 /* A regular path element. If H hasn't advanced past T, 1666 simply skip to the next path element. Otherwise, copy 1667 the path element until the next slash. */ 1668 if (t == h) 1669 { 1670 /* Skip the path element, including the slash. */ 1671 while (h < end && *h != '/') 1672 t++, h++; 1673 if (h < end) 1674 t++, h++; 1675 } 1676 else 1677 { 1678 /* Copy the path element, including the final slash. */ 1679 while (h < end && *h != '/') 1680 *t++ = *h++; 1681 if (h < end) 1682 *t++ = *h++; 1683 } 1684 } 1685 } 1686 1687 if (t != h) 1688 *t = '\0'; 1689 1690 return t != h; 1691} 1692 1693/* Return the length of URL's path. Path is considered to be 1694 terminated by one or more of the ?query or ;params or #fragment, 1695 depending on the scheme. */ 1696 1697static const char * 1698path_end (const char *url) 1699{ 1700 enum url_scheme scheme = url_scheme (url); 1701 const char *seps; 1702 if (scheme == SCHEME_INVALID) 1703 scheme = SCHEME_HTTP; /* use http semantics for rel links */ 1704 /* +2 to ignore the first two separators ':' and '/' */ 1705 seps = init_seps (scheme) + 2; 1706 return strpbrk_or_eos (url, seps); 1707} 1708 1709/* Find the last occurrence of character C in the range [b, e), or 1710 NULL, if none are present. */ 1711#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b)) 1712 1713/* Merge BASE with LINK and return the resulting URI. 1714 1715 Either of the URIs may be absolute or relative, complete with the 1716 host name, or path only. This tries to reasonably handle all 1717 foreseeable cases. It only employs minimal URL parsing, without 1718 knowledge of the specifics of schemes. 1719 1720 I briefly considered making this function call path_simplify after 1721 the merging process, as rfc1738 seems to suggest. This is a bad 1722 idea for several reasons: 1) it complexifies the code, and 2) 1723 url_parse has to simplify path anyway, so it's wasteful to boot. */ 1724 1725char * 1726uri_merge (const char *base, const char *link) 1727{ 1728 int linklength; 1729 const char *end; 1730 char *merge; 1731 1732 if (url_has_scheme (link)) 1733 return xstrdup (link); 1734 1735 /* We may not examine BASE past END. */ 1736 end = path_end (base); 1737 linklength = strlen (link); 1738 1739 if (!*link) 1740 { 1741 /* Empty LINK points back to BASE, query string and all. */ 1742 return xstrdup (base); 1743 } 1744 else if (*link == '?') 1745 { 1746 /* LINK points to the same location, but changes the query 1747 string. Examples: */ 1748 /* uri_merge("path", "?new") -> "path?new" */ 1749 /* uri_merge("path?foo", "?new") -> "path?new" */ 1750 /* uri_merge("path?foo#bar", "?new") -> "path?new" */ 1751 /* uri_merge("path#foo", "?new") -> "path?new" */ 1752 int baselength = end - base; 1753 merge = xmalloc (baselength + linklength + 1); 1754 memcpy (merge, base, baselength); 1755 memcpy (merge + baselength, link, linklength); 1756 merge[baselength + linklength] = '\0'; 1757 } 1758 else if (*link == '#') 1759 { 1760 /* uri_merge("path", "#new") -> "path#new" */ 1761 /* uri_merge("path#foo", "#new") -> "path#new" */ 1762 /* uri_merge("path?foo", "#new") -> "path?foo#new" */ 1763 /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */ 1764 int baselength; 1765 const char *end1 = strchr (base, '#'); 1766 if (!end1) 1767 end1 = base + strlen (base); 1768 baselength = end1 - base; 1769 merge = xmalloc (baselength + linklength + 1); 1770 memcpy (merge, base, baselength); 1771 memcpy (merge + baselength, link, linklength); 1772 merge[baselength + linklength] = '\0'; 1773 } 1774 else if (*link == '/' && *(link + 1) == '/') 1775 { 1776 /* LINK begins with "//" and so is a net path: we need to 1777 replace everything after (and including) the double slash 1778 with LINK. */ 1779 1780 /* uri_merge("foo", "//new/bar") -> "//new/bar" */ 1781 /* uri_merge("//old/foo", "//new/bar") -> "//new/bar" */ 1782 /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */ 1783 1784 int span; 1785 const char *slash; 1786 const char *start_insert; 1787 1788 /* Look for first slash. */ 1789 slash = memchr (base, '/', end - base); 1790 /* If found slash and it is a double slash, then replace 1791 from this point, else default to replacing from the 1792 beginning. */ 1793 if (slash && *(slash + 1) == '/') 1794 start_insert = slash; 1795 else 1796 start_insert = base; 1797 1798 span = start_insert - base; 1799 merge = xmalloc (span + linklength + 1); 1800 if (span) 1801 memcpy (merge, base, span); 1802 memcpy (merge + span, link, linklength); 1803 merge[span + linklength] = '\0'; 1804 } 1805 else if (*link == '/') 1806 { 1807 /* LINK is an absolute path: we need to replace everything 1808 after (and including) the FIRST slash with LINK. 1809 1810 So, if BASE is "http://host/whatever/foo/bar", and LINK is 1811 "/qux/xyzzy", our result should be 1812 "http://host/qux/xyzzy". */ 1813 int span; 1814 const char *slash; 1815 const char *start_insert = NULL; /* for gcc to shut up. */ 1816 const char *pos = base; 1817 bool seen_slash_slash = false; 1818 /* We're looking for the first slash, but want to ignore 1819 double slash. */ 1820 again: 1821 slash = memchr (pos, '/', end - pos); 1822 if (slash && !seen_slash_slash) 1823 if (*(slash + 1) == '/') 1824 { 1825 pos = slash + 2; 1826 seen_slash_slash = true; 1827 goto again; 1828 } 1829 1830 /* At this point, SLASH is the location of the first / after 1831 "//", or the first slash altogether. START_INSERT is the 1832 pointer to the location where LINK will be inserted. When 1833 examining the last two examples, keep in mind that LINK 1834 begins with '/'. */ 1835 1836 if (!slash && !seen_slash_slash) 1837 /* example: "foo" */ 1838 /* ^ */ 1839 start_insert = base; 1840 else if (!slash && seen_slash_slash) 1841 /* example: "http://foo" */ 1842 /* ^ */ 1843 start_insert = end; 1844 else if (slash && !seen_slash_slash) 1845 /* example: "foo/bar" */ 1846 /* ^ */ 1847 start_insert = base; 1848 else if (slash && seen_slash_slash) 1849 /* example: "http://something/" */ 1850 /* ^ */ 1851 start_insert = slash; 1852 1853 span = start_insert - base; 1854 merge = xmalloc (span + linklength + 1); 1855 if (span) 1856 memcpy (merge, base, span); 1857 memcpy (merge + span, link, linklength); 1858 merge[span + linklength] = '\0'; 1859 } 1860 else 1861 { 1862 /* LINK is a relative URL: we need to replace everything 1863 after last slash (possibly empty) with LINK. 1864 1865 So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy", 1866 our result should be "whatever/foo/qux/xyzzy". */ 1867 bool need_explicit_slash = false; 1868 int span; 1869 const char *start_insert; 1870 const char *last_slash = find_last_char (base, end, '/'); 1871 if (!last_slash) 1872 { 1873 /* No slash found at all. Replace what we have with LINK. */ 1874 start_insert = base; 1875 } 1876 else if (last_slash && last_slash >= base + 2 1877 && last_slash[-2] == ':' && last_slash[-1] == '/') 1878 { 1879 /* example: http://host" */ 1880 /* ^ */ 1881 start_insert = end + 1; 1882 need_explicit_slash = true; 1883 } 1884 else 1885 { 1886 /* example: "whatever/foo/bar" */ 1887 /* ^ */ 1888 start_insert = last_slash + 1; 1889 } 1890 1891 span = start_insert - base; 1892 merge = xmalloc (span + linklength + 1); 1893 if (span) 1894 memcpy (merge, base, span); 1895 if (need_explicit_slash) 1896 merge[span - 1] = '/'; 1897 memcpy (merge + span, link, linklength); 1898 merge[span + linklength] = '\0'; 1899 } 1900 1901 return merge; 1902} 1903 1904#define APPEND(p, s) do { \ 1905 int len = strlen (s); \ 1906 memcpy (p, s, len); \ 1907 p += len; \ 1908} while (0) 1909 1910/* Use this instead of password when the actual password is supposed 1911 to be hidden. We intentionally use a generic string without giving 1912 away the number of characters in the password, like previous 1913 versions did. */ 1914#define HIDDEN_PASSWORD "*password*" 1915 1916/* Recreate the URL string from the data in URL. 1917 1918 If HIDE is true (as it is when we're calling this on a URL we plan 1919 to print, but not when calling it to canonicalize a URL for use 1920 within the program), password will be hidden. Unsafe characters in 1921 the URL will be quoted. */ 1922 1923char * 1924url_string (const struct url *url, enum url_auth_mode auth_mode) 1925{ 1926 int size; 1927 char *result, *p; 1928 char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL; 1929 1930 int scheme_port = supported_schemes[url->scheme].default_port; 1931 const char *scheme_str = supported_schemes[url->scheme].leading_string; 1932 int fplen = full_path_length (url); 1933 1934 bool brackets_around_host; 1935 1936 assert (scheme_str != NULL); 1937 1938 /* Make sure the user name and password are quoted. */ 1939 if (url->user) 1940 { 1941 if (auth_mode != URL_AUTH_HIDE) 1942 { 1943 quoted_user = url_escape_allow_passthrough (url->user); 1944 if (url->passwd) 1945 { 1946 if (auth_mode == URL_AUTH_HIDE_PASSWD) 1947 quoted_passwd = HIDDEN_PASSWORD; 1948 else 1949 quoted_passwd = url_escape_allow_passthrough (url->passwd); 1950 } 1951 } 1952 } 1953 1954 /* In the unlikely event that the host name contains non-printable 1955 characters, quote it for displaying to the user. */ 1956 quoted_host = url_escape_allow_passthrough (url->host); 1957 1958 /* Undo the quoting of colons that URL escaping performs. IPv6 1959 addresses may legally contain colons, and in that case must be 1960 placed in square brackets. */ 1961 if (quoted_host != url->host) 1962 unescape_single_char (quoted_host, ':'); 1963 brackets_around_host = strchr (quoted_host, ':') != NULL; 1964 1965 size = (strlen (scheme_str) 1966 + strlen (quoted_host) 1967 + (brackets_around_host ? 2 : 0) 1968 + fplen 1969 + 1); 1970 if (url->port != scheme_port) 1971 size += 1 + numdigit (url->port); 1972 if (quoted_user) 1973 { 1974 size += 1 + strlen (quoted_user); 1975 if (quoted_passwd) 1976 size += 1 + strlen (quoted_passwd); 1977 } 1978 1979 p = result = xmalloc (size); 1980 1981 APPEND (p, scheme_str); 1982 if (quoted_user) 1983 { 1984 APPEND (p, quoted_user); 1985 if (quoted_passwd) 1986 { 1987 *p++ = ':'; 1988 APPEND (p, quoted_passwd); 1989 } 1990 *p++ = '@'; 1991 } 1992 1993 if (brackets_around_host) 1994 *p++ = '['; 1995 APPEND (p, quoted_host); 1996 if (brackets_around_host) 1997 *p++ = ']'; 1998 if (url->port != scheme_port) 1999 { 2000 *p++ = ':'; 2001 p = number_to_string (p, url->port); 2002 } 2003 2004 full_path_write (url, p); 2005 p += fplen; 2006 *p++ = '\0'; 2007 2008 assert (p - result == size); 2009 2010 if (quoted_user && quoted_user != url->user) 2011 xfree (quoted_user); 2012 if (quoted_passwd && auth_mode == URL_AUTH_SHOW 2013 && quoted_passwd != url->passwd) 2014 xfree (quoted_passwd); 2015 if (quoted_host != url->host) 2016 xfree (quoted_host); 2017 2018 return result; 2019} 2020 2021/* Return true if scheme a is similar to scheme b. 2022 2023 Schemes are similar if they are equal. If SSL is supported, schemes 2024 are also similar if one is http (SCHEME_HTTP) and the other is https 2025 (SCHEME_HTTPS). */ 2026bool 2027schemes_are_similar_p (enum url_scheme a, enum url_scheme b) 2028{ 2029 if (a == b) 2030 return true; 2031#ifdef HAVE_SSL 2032 if ((a == SCHEME_HTTP && b == SCHEME_HTTPS) 2033 || (a == SCHEME_HTTPS && b == SCHEME_HTTP)) 2034 return true; 2035#endif 2036 return false; 2037} 2038 2039static int 2040getchar_from_escaped_string (const char *str, char *c) 2041{ 2042 const char *p = str; 2043 2044 assert (str && *str); 2045 assert (c); 2046 2047 if (p[0] == '%') 2048 { 2049 if (!c_isxdigit(p[1]) || !c_isxdigit(p[2])) 2050 { 2051 *c = '%'; 2052 return 1; 2053 } 2054 else 2055 { 2056 if (p[2] == 0) 2057 return 0; /* error: invalid string */ 2058 2059 *c = X2DIGITS_TO_NUM (p[1], p[2]); 2060 if (URL_RESERVED_CHAR(*c)) 2061 { 2062 *c = '%'; 2063 return 1; 2064 } 2065 else 2066 return 3; 2067 } 2068 } 2069 else 2070 { 2071 *c = p[0]; 2072 } 2073 2074 return 1; 2075} 2076 2077bool 2078are_urls_equal (const char *u1, const char *u2) 2079{ 2080 const char *p, *q; 2081 int pp, qq; 2082 char ch1, ch2; 2083 assert(u1 && u2); 2084 2085 p = u1; 2086 q = u2; 2087 2088 while (*p && *q 2089 && (pp = getchar_from_escaped_string (p, &ch1)) 2090 && (qq = getchar_from_escaped_string (q, &ch2)) 2091 && (c_tolower(ch1) == c_tolower(ch2))) 2092 { 2093 p += pp; 2094 q += qq; 2095 } 2096 2097 return (*p == 0 && *q == 0 ? true : false); 2098} 2099 2100#ifdef TESTING 2101/* Debugging and testing support for path_simplify. */ 2102 2103#if 0 2104/* Debug: run path_simplify on PATH and return the result in a new 2105 string. Useful for calling from the debugger. */ 2106static char * 2107ps (char *path) 2108{ 2109 char *copy = xstrdup (path); 2110 path_simplify (copy); 2111 return copy; 2112} 2113#endif 2114 2115static const char * 2116run_test (char *test, char *expected_result, enum url_scheme scheme, 2117 bool expected_change) 2118{ 2119 char *test_copy = xstrdup (test); 2120 bool modified = path_simplify (scheme, test_copy); 2121 2122 if (0 != strcmp (test_copy, expected_result)) 2123 { 2124 printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n", 2125 test, expected_result, test_copy); 2126 mu_assert ("", 0); 2127 } 2128 if (modified != expected_change) 2129 { 2130 if (expected_change) 2131 printf ("Expected modification with path_simplify(\"%s\").\n", 2132 test); 2133 else 2134 printf ("Expected no modification with path_simplify(\"%s\").\n", 2135 test); 2136 } 2137 xfree (test_copy); 2138 mu_assert ("", modified == expected_change); 2139 return NULL; 2140} 2141 2142const char * 2143test_path_simplify (void) 2144{ 2145 static struct { 2146 char *test, *result; 2147 enum url_scheme scheme; 2148 bool should_modify; 2149 } tests[] = { 2150 { "", "", SCHEME_HTTP, false }, 2151 { ".", "", SCHEME_HTTP, true }, 2152 { "./", "", SCHEME_HTTP, true }, 2153 { "..", "", SCHEME_HTTP, true }, 2154 { "../", "", SCHEME_HTTP, true }, 2155 { "..", "..", SCHEME_FTP, false }, 2156 { "../", "../", SCHEME_FTP, false }, 2157 { "foo", "foo", SCHEME_HTTP, false }, 2158 { "foo/bar", "foo/bar", SCHEME_HTTP, false }, 2159 { "foo///bar", "foo///bar", SCHEME_HTTP, false }, 2160 { "foo/.", "foo/", SCHEME_HTTP, true }, 2161 { "foo/./", "foo/", SCHEME_HTTP, true }, 2162 { "foo./", "foo./", SCHEME_HTTP, false }, 2163 { "foo/../bar", "bar", SCHEME_HTTP, true }, 2164 { "foo/../bar/", "bar/", SCHEME_HTTP, true }, 2165 { "foo/bar/..", "foo/", SCHEME_HTTP, true }, 2166 { "foo/bar/../x", "foo/x", SCHEME_HTTP, true }, 2167 { "foo/bar/../x/", "foo/x/", SCHEME_HTTP, true }, 2168 { "foo/..", "", SCHEME_HTTP, true }, 2169 { "foo/../..", "", SCHEME_HTTP, true }, 2170 { "foo/../../..", "", SCHEME_HTTP, true }, 2171 { "foo/../../bar/../../baz", "baz", SCHEME_HTTP, true }, 2172 { "foo/../..", "..", SCHEME_FTP, true }, 2173 { "foo/../../..", "../..", SCHEME_FTP, true }, 2174 { "foo/../../bar/../../baz", "../../baz", SCHEME_FTP, true }, 2175 { "a/b/../../c", "c", SCHEME_HTTP, true }, 2176 { "./a/../b", "b", SCHEME_HTTP, true } 2177 }; 2178 int i; 2179 2180 for (i = 0; i < countof (tests); i++) 2181 { 2182 const char *message; 2183 char *test = tests[i].test; 2184 char *expected_result = tests[i].result; 2185 enum url_scheme scheme = tests[i].scheme; 2186 bool expected_change = tests[i].should_modify; 2187 message = run_test (test, expected_result, scheme, expected_change); 2188 if (message) return message; 2189 } 2190 return NULL; 2191} 2192 2193const char * 2194test_append_uri_pathel() 2195{ 2196 int i; 2197 struct { 2198 char *original_url; 2199 char *input; 2200 bool escaped; 2201 char *expected_result; 2202 } test_array[] = { 2203 { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" }, 2204 }; 2205 2206 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) 2207 { 2208 struct growable dest; 2209 const char *p = test_array[i].input; 2210 2211 memset (&dest, 0, sizeof (dest)); 2212 2213 append_string (test_array[i].original_url, &dest); 2214 append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest); 2215 append_char ('\0', &dest); 2216 2217 mu_assert ("test_append_uri_pathel: wrong result", 2218 strcmp (dest.base, test_array[i].expected_result) == 0); 2219 } 2220 2221 return NULL; 2222} 2223 2224const char* 2225test_are_urls_equal() 2226{ 2227 int i; 2228 struct { 2229 char *url1; 2230 char *url2; 2231 bool expected_result; 2232 } test_array[] = { 2233 { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/", true }, 2234 { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false }, 2235 { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/", false }, 2236 { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/", true }, 2237 { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/", false }, 2238 { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/", false }, 2239 }; 2240 2241 for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i) 2242 { 2243 mu_assert ("test_are_urls_equal: wrong result", 2244 are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result); 2245 } 2246 2247 return NULL; 2248} 2249 2250#endif /* TESTING */ 2251 2252/* 2253 * vim: et ts=2 sw=2 2254 */ 2255 2256