1/* URL handling.
2   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
3   2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
4
5This file is part of GNU Wget.
6
7GNU Wget is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 3 of the License, or (at
10your option) any later version.
11
12GNU Wget is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15GNU General Public License for more details.
16
17You should have received a copy of the GNU General Public License
18along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19
20Additional permission under GNU GPL version 3 section 7
21
22If you modify this program, or any covered work, by linking or
23combining it with the OpenSSL project's OpenSSL library (or a
24modified version of that library), containing parts covered by the
25terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26grants you additional permission to convey the resulting work.
27Corresponding Source for a non-source form of such a combination
28shall include the source code for the parts of OpenSSL used as well
29as that of the covered work.  */
30
31#include "wget.h"
32
33#include <stdio.h>
34#include <stdlib.h>
35#include <string.h>
36#ifdef HAVE_UNISTD_H
37# include <unistd.h>
38#endif
39#include <errno.h>
40#include <assert.h>
41
42#include "utils.h"
43#include "url.h"
44#include "host.h"  /* for is_valid_ipv6_address */
45
46#ifdef __VMS
47#include "vms.h"
48#endif /* def __VMS */
49
50#ifdef TESTING
51#include "test.h"
52#endif
53
/* Per-scheme flag bits, OR-ed together in scheme_data.flags.  */
enum {
  scm_disabled     = 1 << 0,    /* scheme is turned off, e.g. https when
                                   OpenSSL fails to init. */
  scm_has_params   = 1 << 1,    /* whether scheme has ;params */
  scm_has_query    = 1 << 2,    /* whether scheme has ?query */
  scm_has_fragment = 1 << 3     /* whether scheme has #fragment */
};
60
/* Description of one URL scheme; the entries of supported_schemes
   are of this type.  */
struct scheme_data
{
  /* Short name of the scheme, such as "http" or "ftp". */
  const char *name;
  /* Leading string that identifies the scheme, such as "https://".
     url_scheme matches it case-insensitively against the start of a
     URL; a NULL value marks the end of the scheme table. */
  const char *leading_string;
  /* Default port of the scheme when none is specified. */
  int default_port;
  /* Various flags: a bitwise OR of the scm_* values. */
  int flags;
};
72
/* Supported schemes.  url_scheme returns the index of the matching
   entry cast to enum url_scheme, so the order of the entries
   presumably has to follow the declaration of that enum -- TODO
   confirm against url.h.  The table is not const because
   scheme_disable sets scm_disabled in an entry's flags at run time.  */
static struct scheme_data supported_schemes[] =
{
  { "http",     "http://",  DEFAULT_HTTP_PORT,  scm_has_query|scm_has_fragment },
#ifdef HAVE_SSL
  { "https",    "https://", DEFAULT_HTTPS_PORT, scm_has_query|scm_has_fragment },
#endif
  { "ftp",      "ftp://",   DEFAULT_FTP_PORT,   scm_has_params|scm_has_fragment },

  /* SCHEME_INVALID: the NULL leading_string stops url_scheme's scan. */
  { NULL,       NULL,       -1,                 0 }
};
85
86/* Forward declarations: */
87
/* Called by url_parse on the freshly copied path; the boolean result
   is stored there as "path_modified".  */
static bool path_simplify (enum url_scheme, char *);
89
90/* Support for escaping and unescaping of URL strings.  */
91
92/* Table of "reserved" and "unsafe" characters.  Those terms are
93   rfc1738-speak, as such largely obsoleted by rfc2396 and later
94   specs, but the general idea remains.
95
   A reserved character is the one that you can't decode without
   changing the meaning of the URL.  For example, you can't decode
   "/foo/%2f/bar" into "/foo///bar" because the number and contents of
   path components are different.  Non-reserved characters can be
   changed, so "/foo/%78/bar" is safe to change to "/foo/x/bar".  The
   reserved characters are loosely based on rfc1738, plus "$" and ",",
   as recommended by rfc2396.  The unsafe characters are likewise
   based on rfc1738, minus "~", which is very frequently used (and
   sometimes unrecognized as %7E by broken servers).
104
105   An unsafe character is the one that should be encoded when URLs are
106   placed in foreign environments.  E.g. space and newline are unsafe
107   in HTTP contexts because HTTP uses them as separator and line
108   terminator, so they must be encoded to %20 and %0A respectively.
109   "*" is unsafe in shell context, etc.
110
111   We determine whether a character is unsafe through static table
112   lookup.  This code assumes ASCII character set and 8-bit chars.  */
113
/* Bit flags stored in urlchr_table, one byte per character.  */
enum {
  /* rfc1738 reserved chars + "$" and ",".  */
  urlchr_reserved = 1 << 0,

  /* rfc1738 unsafe chars, plus non-printables.  */
  urlchr_unsafe   = 1 << 1
};
121
/* Look up C in urlchr_table (C is converted to unsigned char so that
   any char value is a valid index) and test the entry against MASK.
   The result is non-zero if any bit of MASK is set for C.  */
#define urlchr_test(c, mask) (urlchr_table[(unsigned char)(c)] & (mask))
/* Non-zero if C is flagged reserved in urlchr_table.  */
#define URL_RESERVED_CHAR(c) urlchr_test(c, urlchr_reserved)
/* Non-zero if C is flagged unsafe in urlchr_table.  */
#define URL_UNSAFE_CHAR(c) urlchr_test(c, urlchr_unsafe)
125
/* Shorthands for the table (undefined again right after it): */
#define R  urlchr_reserved
#define U  urlchr_unsafe
#define RU R|U

/* Classification of every possible byte value, indexed by the
   character code.  0 means neither reserved nor unsafe.  */
static const unsigned char urlchr_table[256] =
{
  U,  U,  U,  U,   U,  U,  U,  U,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  U,  U,  U,  U,   U,  U,  U,  U,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  U,  U,  U,  U,   U,  U,  U,  U,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  U,  U,  U,  U,   U,  U,  U,  U,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  U,  0,  U, RU,   R,  U,  R,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  0,  R,   R,  0,  0,  R,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0, RU,  R,   U,  R,  U,  R,   /* 8   9   :   ;    <   =   >   ?   */
 RU,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0, RU,   U, RU,  U,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  U,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  U,   U,  U,  0,  U,   /* x   y   z   {    |   }   ~   DEL */

  /* 128-255: every byte with the high bit set is unsafe.  */
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,

  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
  U, U, U, U,  U, U, U, U,  U, U, U, U,  U, U, U, U,
};
#undef R
#undef U
#undef RU
163
/* URL-unescape the string S, in place.

   Every "%HH" sequence, HH being two hexadecimal digits, is replaced
   by the single character with that code.  A '%' that is not followed
   by two hex digits is copied literally, and so is the '%' of "%00":
   a decoded NUL would effectively truncate the C string.

   The result is never longer than the input.  Callers that need the
   original string intact must copy it before calling.  */

static void
url_unescape (char *s)
{
  const char *src = s;          /* read cursor, runs ahead */
  char *dst = s;                /* write cursor, never passes SRC */

  while (*src)
    {
      char decoded = '\0';

      /* The NUL checks keep us from looking past the terminator. */
      if (*src == '%' && src[1] && src[2]
          && c_isxdigit (src[1]) && c_isxdigit (src[2]))
        decoded = X2DIGITS_TO_NUM (src[1], src[2]);

      if (decoded != '\0')
        {
          /* A usable escape: emit one char, consume three. */
          *dst++ = decoded;
          src += 3;
        }
      else
        /* Ordinary char, malformed escape, or %00: copy one char. */
        *dst++ = *src++;
    }
  *dst = '\0';
}
203
/* The core of url_escape_* functions.  Replaces each character of S
   whose urlchr_table entry has a bit in common with MASK by its
   "%XX" escape.

   If nothing needs escaping, the result is S itself when
   ALLOW_PASSTHROUGH is true and a fresh copy of S otherwise.  When
   something is escaped, the result is always freshly allocated.  */

static char *
url_escape_1 (const char *s, unsigned char mask, bool allow_passthrough)
{
  const char *src;
  char *result, *dst;
  int quoted = 0;               /* number of chars that will be escaped */
  int result_len;

  /* Counting pass; SRC is left on the terminating NUL. */
  for (src = s; *src; src++)
    if (urlchr_test (*src, mask))
      ++quoted;

  if (quoted == 0)
    {
      if (allow_passthrough)
        return (char *) s;
      return xstrdup (s);
    }

  /* Each escaped char grows from one byte to three. */
  result_len = (src - s) + 2 * quoted;
  result = xmalloc (result_len + 1);

  dst = result;
  for (src = s; *src; src++)
    {
      if (urlchr_test (*src, mask))
        {
          unsigned char c = *src;
          *dst++ = '%';
          *dst++ = XNUM_TO_DIGIT (c >> 4);
          *dst++ = XNUM_TO_DIGIT (c & 0xf);
        }
      else
        *dst++ = *src;
    }
  assert (dst - result == result_len);
  *dst = '\0';

  return result;
}
249
250/* URL-escape the unsafe characters (see urlchr_table) in a given
251   string, returning a freshly allocated string.  */
252
253char *
254url_escape (const char *s)
255{
256  return url_escape_1 (s, urlchr_unsafe, false);
257}
258
259/* URL-escape the unsafe and reserved characters (see urlchr_table) in
260   a given string, returning a freshly allocated string.  */
261
262char *
263url_escape_unsafe_and_reserved (const char *s)
264{
265  return url_escape_1 (s, urlchr_unsafe|urlchr_reserved, false);
266}
267
268/* URL-escape the unsafe characters (see urlchr_table) in a given
269   string.  If no characters are unsafe, S is returned.  */
270
271static char *
272url_escape_allow_passthrough (const char *s)
273{
274  return url_escape_1 (s, urlchr_unsafe, true);
275}
276
/* Decide whether the char at position P must be written as %XX.  A
   pointer is taken rather than a char because the answer for '%'
   depends on the two characters that follow it.

   Return true for a '%' that does not start a valid %HH escape, and
   for any other character that is unsafe without also being
   reserved; return false otherwise.  */

static inline bool
char_needs_escaping (const char *p)
{
  if (*p != '%')
    return URL_UNSAFE_CHAR (*p) && !URL_RESERVED_CHAR (*p);

  /* '%': a well-formed escape is kept as is; a garbled one gets its
     '%' encoded.  The && stops at the terminating NUL, so P is never
     read past the end of the string. */
  return !(c_isxdigit (p[1]) && c_isxdigit (p[2]));
}
299
300/* Translate a %-escaped (but possibly non-conformant) input string S
301   into a %-escaped (and conformant) output string.  If no characters
302   are encoded or decoded, return the same string S; otherwise, return
303   a freshly allocated string with the new contents.
304
305   After a URL has been run through this function, the protocols that
306   use `%' as the quote character can use the resulting string as-is,
307   while those that don't can use url_unescape to get to the intended
308   data.  This function is stable: once the input is transformed,
309   further transformations of the result yield the same output.
310
311   Let's discuss why this function is needed.
312
313   Imagine Wget is asked to retrieve `http://abc.xyz/abc def'.  Since
314   a raw space character would mess up the HTTP request, it needs to
315   be quoted, like this:
316
317       GET /abc%20def HTTP/1.0
318
319   It would appear that the unsafe chars need to be quoted, for
320   example with url_escape.  But what if we're requested to download
321   `abc%20def'?  url_escape transforms "%" to "%25", which would leave
322   us with `abc%2520def'.  This is incorrect -- since %-escapes are
323   part of URL syntax, "%20" is the correct way to denote a literal
324   space on the Wget command line.  This leads to the conclusion that
325   in that case Wget should not call url_escape, but leave the `%20'
326   as is.  This is clearly contradictory, but it only gets worse.
327
328   What if the requested URI is `abc%20 def'?  If we call url_escape,
329   we end up with `/abc%2520%20def', which is almost certainly not
330   intended.  If we don't call url_escape, we are left with the
331   embedded space and cannot complete the request.  What the user
332   meant was for Wget to request `/abc%20%20def', and this is where
333   reencode_escapes kicks in.
334
335   Wget used to solve this by first decoding %-quotes, and then
336   encoding all the "unsafe" characters found in the resulting string.
337   This was wrong because it didn't preserve certain URL special
338   (reserved) characters.  For instance, URI containing "a%2B+b" (0x2b
339   == '+') would get translated to "a%2B%2Bb" or "a++b" depending on
340   whether we considered `+' reserved (it is).  One of these results
341   is inevitable because by the second step we would lose information
342   on whether the `+' was originally encoded or not.  Both results
343   were wrong because in CGI parameters + means space, while %2B means
344   literal plus.  reencode_escapes correctly translates the above to
345   "a%2B+b", i.e. returns the original string.
346
347   This function uses a modified version of the algorithm originally
348   proposed by Anon Sricharoenchai:
349
350   * Encode all "unsafe" characters, except those that are also
351     "reserved", to %XX.  See urlchr_table for which characters are
352     unsafe and reserved.
353
354   * Encode the "%" characters not followed by two hex digits to
355     "%25".
356
357   * Pass through all other characters and %XX escapes as-is.  (Up to
358     Wget 1.10 this decoded %XX escapes corresponding to "safe"
359     characters, but that was obtrusive and broke some servers.)
360
361   Anon's test case:
362
363   "http://abc.xyz/%20%3F%%36%31%25aa% a?a=%61+a%2Ba&b=b%26c%3Dc"
364   ->
365   "http://abc.xyz/%20%3F%25%36%31%25aa%25%20a?a=%61+a%2Ba&b=b%26c%3Dc"
366
367   Simpler test cases:
368
369   "foo bar"         -> "foo%20bar"
370   "foo%20bar"       -> "foo%20bar"
371   "foo %20bar"      -> "foo%20%20bar"
372   "foo%%20bar"      -> "foo%25%20bar"       (0x25 == '%')
373   "foo%25%20bar"    -> "foo%25%20bar"
374   "foo%2%20bar"     -> "foo%252%20bar"
375   "foo+bar"         -> "foo+bar"            (plus is reserved!)
376   "foo%2b+bar"      -> "foo%2b+bar"  */
377
static char *
reencode_escapes (const char *s)
{
  const char *src;
  char *result, *dst;
  int to_encode = 0;            /* chars for which char_needs_escaping holds */
  int result_len;

  /* Pass 1: count what has to be encoded.  SRC ends up on the
     terminating NUL, which gives us the input length for free.  */
  for (src = s; *src; src++)
    if (char_needs_escaping (src))
      to_encode++;

  /* Nothing to do: hand back the very same string.  Callers tell
     this case apart by comparing the result with S.  */
  if (to_encode == 0)
    return (char *) s;

  /* One char becomes three ("%XX"), i.e. two extra per encoding.  */
  result_len = (src - s) + 2 * to_encode;
  result = xmalloc (result_len + 1);

  /* Pass 2: copy, encoding where needed.  */
  dst = result;
  for (src = s; *src; src++)
    {
      if (char_needs_escaping (src))
        {
          unsigned char c = *src;
          *dst++ = '%';
          *dst++ = XNUM_TO_DIGIT (c >> 4);
          *dst++ = XNUM_TO_DIGIT (c & 0xf);
        }
      else
        *dst++ = *src;
    }

  *dst = '\0';
  assert (dst - result == result_len);
  return result;
}
422
423/* Returns the scheme type if the scheme is supported, or
424   SCHEME_INVALID if not.  */
425
426enum url_scheme
427url_scheme (const char *url)
428{
429  int i;
430
431  for (i = 0; supported_schemes[i].leading_string; i++)
432    if (0 == strncasecmp (url, supported_schemes[i].leading_string,
433                          strlen (supported_schemes[i].leading_string)))
434      {
435        if (!(supported_schemes[i].flags & scm_disabled))
436          return (enum url_scheme) i;
437        else
438          return SCHEME_INVALID;
439      }
440
441  return SCHEME_INVALID;
442}
443
#define SCHEME_CHAR(ch) (c_isalnum (ch) || (ch) == '-' || (ch) == '+')

/* Return true if URL begins with any "scheme", false otherwise.  As
   currently implemented, that means URL begins with
   [-+a-zA-Z0-9]+: .  */

bool
url_has_scheme (const char *url)
{
  const char *p;

  /* Skip the run of scheme chars.  NUL is not a scheme char, so the
     scan cannot run off the end of the string. */
  for (p = url; SCHEME_CHAR (*p); p++)
    ;

  /* At least one scheme char, terminated by ':'. */
  return p != url && *p == ':';
}
465
466int
467scheme_default_port (enum url_scheme scheme)
468{
469  return supported_schemes[scheme].default_port;
470}
471
472void
473scheme_disable (enum url_scheme scheme)
474{
475  supported_schemes[scheme].flags |= scm_disabled;
476}
477
/* Skip the "user[:password]@" prefix of URL, if there is one.  URL
   must be the portion that follows the scheme, *not* the complete
   URL.

   An '@' counts only if it comes before any of the terminators '/',
   '?', '#' and ';'.  Return a pointer to the char after that '@', or
   URL itself if no credentials are found.  */

static const char *
url_skip_credentials (const char *url)
{
  /* First char that is either '@' or one of the terminators. */
  const char *stop = strpbrk (url, "@/?#;");

  if (stop != NULL && *stop == '@')
    return stop + 1;
  return url;
}
494
/* Parse the credentials "user[:password]" contained in [BEG, END).
   The region is expected to have come from a URL; both parts are
   URL-unescaped after being copied.

   On success return true, store a freshly allocated user name in
   *USER and a freshly allocated password in *PASSWD (NULL if the
   region has no ':').  If the user name is empty return false and
   leave *USER and *PASSWD untouched.  */

static bool
parse_credentials (const char *beg, const char *end, char **user, char **passwd)
{
  const char *sep;
  const char *user_end = end;   /* whole region unless a ':' is found */

  if (beg == end)
    return false;               /* empty user name */

  sep = memchr (beg, ':', end - beg);
  if (sep == beg)
    return false;               /* again empty user name */

  if (sep == NULL)
    *passwd = NULL;
  else
    {
      *passwd = strdupdelim (sep + 1, end);
      url_unescape (*passwd);
      user_end = sep;
    }

  *user = strdupdelim (beg, user_end);
  url_unescape (*user);
  return true;
}
526
527/* Used by main.c: detect URLs written using the "shorthand" URL forms
528   originally popularized by Netscape and NcFTP.  HTTP shorthands look
529   like this:
530
531   www.foo.com[:port]/dir/file   -> http://www.foo.com[:port]/dir/file
532   www.foo.com[:port]            -> http://www.foo.com[:port]
533
534   FTP shorthands look like this:
535
536   foo.bar.com:dir/file          -> ftp://foo.bar.com/dir/file
537   foo.bar.com:/absdir/file      -> ftp://foo.bar.com//absdir/file
538
   If the URL need not or cannot be rewritten, return NULL.  */
540
541char *
542rewrite_shorthand_url (const char *url)
543{
544  const char *p;
545  char *ret;
546
547  if (url_scheme (url) != SCHEME_INVALID)
548    return NULL;
549
550  /* Look for a ':' or '/'.  The former signifies NcFTP syntax, the
551     latter Netscape.  */
552  p = strpbrk (url, ":/");
553  if (p == url)
554    return NULL;
555
556  /* If we're looking at "://", it means the URL uses a scheme we
557     don't support, which may include "https" when compiled without
558     SSL support.  Don't bogusly rewrite such URLs.  */
559  if (p && p[0] == ':' && p[1] == '/' && p[2] == '/')
560    return NULL;
561
562  if (p && *p == ':')
563    {
564      /* Colon indicates ftp, as in foo.bar.com:path.  Check for
565         special case of http port number ("localhost:10000").  */
566      int digits = strspn (p + 1, "0123456789");
567      if (digits && (p[1 + digits] == '/' || p[1 + digits] == '\0'))
568        goto http;
569
570      /* Turn "foo.bar.com:path" to "ftp://foo.bar.com/path". */
571      ret = aprintf ("ftp://%s", url);
572      ret[6 + (p - url)] = '/';
573    }
574  else
575    {
576    http:
577      /* Just prepend "http://" to URL. */
578      ret = aprintf ("http://%s", url);
579    }
580  return ret;
581}
582
/* Forward declaration: url_parse calls split_path before its
   definition appears.  */
static void split_path (const char *, char **, char **);
584
/* Like strpbrk, except that a pointer to the terminating zero
   (end-of-string aka "eos") is returned, instead of NULL, when S
   contains none of the characters in ACCEPT.  */

static inline char *
strpbrk_or_eos (const char *s, const char *accept)
{
  /* strcspn yields the length of the leading part of S that is free
     of ACCEPT chars: the offset of the first match, or of the
     terminator if there is no match. */
  return (char *) s + strcspn (s, accept);
}
597
/* Convert the upper-case characters of STR to lower case, in place.
   Return true if at least one character was changed.  */

static bool
lowercase_str (char *str)
{
  bool changed = false;
  char *p = str;

  while (*p)
    {
      if (c_isupper (*p))
        {
          *p = c_tolower (*p);
          changed = true;
        }
      p++;
    }
  return changed;
}
613
614static const char *
615init_seps (enum url_scheme scheme)
616{
617  static char seps[8] = ":/";
618  char *p = seps + 2;
619  int flags = supported_schemes[scheme].flags;
620
621  if (flags & scm_has_params)
622    *p++ = ';';
623  if (flags & scm_has_query)
624    *p++ = '?';
625  if (flags & scm_has_fragment)
626    *p++ = '#';
627  *p++ = '\0';
628  return seps;
629}
630
/* Messages for the error codes set by url_parse.  Each PE_* macro is
   defined right before its message and gives that message's index in
   the array; the two must be kept in step.  The strings are marked
   with N_ here and passed through _() in url_error.  */
static const char *parse_errors[] = {
#define PE_NO_ERROR                     0
  N_("No error"),
#define PE_UNSUPPORTED_SCHEME           1
  N_("Unsupported scheme %s"), /* support for format token only here */
#define PE_MISSING_SCHEME               2
  N_("Scheme missing"),
#define PE_INVALID_HOST_NAME            3
  N_("Invalid host name"),
#define PE_BAD_PORT_NUMBER              4
  N_("Bad port number"),
#define PE_INVALID_USER_NAME            5
  N_("Invalid user name"),
#define PE_UNTERMINATED_IPV6_ADDRESS    6
  N_("Unterminated IPv6 numeric address"),
#define PE_IPV6_NOT_SUPPORTED           7
  N_("IPv6 addresses not supported"),
#define PE_INVALID_IPV6_ADDRESS         8
  N_("Invalid IPv6 numeric address")
};
651
/* Parse a URL.

   URL is the string to parse.  If IRI is non-NULL and its utf8_encode
   flag is set, the URL is first passed through remote_to_utf8.  If
   PERCENT_ENCODE is true, the (possibly converted) string is then run
   through reencode_escapes before being broken down into components.

   Return a new struct url if successful, NULL on error.  In case of
   error, and if ERROR is not NULL, also set *ERROR to the appropriate
   error code (one of the PE_* values, suitable for url_error). */
struct url *
url_parse (const char *url, int *error, struct iri *iri, bool percent_encode)
{
  struct url *u;
  const char *p;
  bool path_modified, host_modified;

  enum url_scheme scheme;
  const char *seps;

  /* Begin/end pointers of the URL components; they all point into
     URL_ENCODED.  */
  const char *uname_b,     *uname_e;
  const char *host_b,      *host_e;
  const char *path_b,      *path_e;
  const char *params_b,    *params_e;
  const char *query_b,     *query_e;
  const char *fragment_b,  *fragment_e;

  int port;
  char *user = NULL, *passwd = NULL;

  /* The string that is actually parsed: URL itself, NEW_URL, or the
     result of reencode_escapes.  */
  const char *url_encoded = NULL;
  /* Output of remote_to_utf8, if that conversion took place.  */
  char *new_url = NULL;

  int error_code;

  scheme = url_scheme (url);
  if (scheme == SCHEME_INVALID)
    {
      if (url_has_scheme (url))
        error_code = PE_UNSUPPORTED_SCHEME;
      else
        error_code = PE_MISSING_SCHEME;
      goto error;
    }

  /* NOTE(review): when the conversion succeeds, iri->orig_url is
     overwritten below without its previous value being freed in this
     function -- confirm who owns that string.  */
  if (iri && iri->utf8_encode)
    {
      iri->utf8_encode = remote_to_utf8 (iri, iri->orig_url ? iri->orig_url : url, (const char **) &new_url);
      if (!iri->utf8_encode)
        new_url = NULL;
      else
        iri->orig_url = xstrdup (url);
    }

  /* XXX XXX Could that change introduce (security) bugs ???  XXX XXX*/
  if (percent_encode)
    url_encoded = reencode_escapes (new_url ? new_url : url);
  else
    url_encoded = new_url ? new_url : url;

  p = url_encoded;

  /* reencode_escapes returned a different string, so the converted
     URL it was built from is no longer needed.  */
  if (new_url && url_encoded != new_url)
    xfree (new_url);

  p += strlen (supported_schemes[scheme].leading_string);
  uname_b = p;
  p = url_skip_credentials (p);
  uname_e = p;

  /* scheme://user:pass@host[:port]... */
  /*                    ^              */

  /* We attempt to break down the URL into the components path,
     params, query, and fragment.  They are ordered like this:

       scheme://host[:port][/path][;params][?query][#fragment]  */

  path_b     = path_e     = NULL;
  params_b   = params_e   = NULL;
  query_b    = query_e    = NULL;
  fragment_b = fragment_e = NULL;

  /* Initialize separators for optional parts of URL, depending on the
     scheme.  For example, FTP has params, and HTTP and HTTPS have
     query string and fragment. */
  seps = init_seps (scheme);

  host_b = p;

  if (*p == '[')
    {
      /* Handle IPv6 address inside square brackets.  Ideally we'd
         just look for the terminating ']', but rfc2732 mandates
         rejecting invalid IPv6 addresses.  */

      /* The address begins after '['. */
      host_b = p + 1;
      host_e = strchr (host_b, ']');

      if (!host_e)
        {
          error_code = PE_UNTERMINATED_IPV6_ADDRESS;
          goto error;
        }

#ifdef ENABLE_IPV6
      /* Check if the IPv6 address is valid. */
      if (!is_valid_ipv6_address(host_b, host_e))
        {
          error_code = PE_INVALID_IPV6_ADDRESS;
          goto error;
        }

      /* Continue parsing after the closing ']'. */
      p = host_e + 1;
#else
      error_code = PE_IPV6_NOT_SUPPORTED;
      goto error;
#endif

      /* The closing bracket must be followed by a separator or by the
         null char.  */
      /* http://[::1]... */
      /*             ^   */
      if (!strchr (seps, *p))
        {
          /* Trailing garbage after []-delimited IPv6 address. */
          error_code = PE_INVALID_HOST_NAME;
          goto error;
        }
    }
  else
    {
      p = strpbrk_or_eos (p, seps);
      host_e = p;
    }
  /* From here on SEPS is advanced past each separator that can no
     longer occur, so that later scans stop only at the separators
     still to come.  */
  ++seps;                       /* advance to '/' */

  if (host_b == host_e)
    {
      error_code = PE_INVALID_HOST_NAME;
      goto error;
    }

  port = scheme_default_port (scheme);
  if (*p == ':')
    {
      const char *port_b, *port_e, *pp;

      /* scheme://host:port/tralala */
      /*              ^             */
      ++p;
      port_b = p;
      p = strpbrk_or_eos (p, seps);
      port_e = p;

      /* Allow empty port, as per rfc2396. */
      if (port_b != port_e)
        for (port = 0, pp = port_b; pp < port_e; pp++)
          {
            if (!c_isdigit (*pp))
              {
                /* http://host:12randomgarbage/blah */
                /*               ^                  */
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
            port = 10 * port + (*pp - '0');
            /* Check for too large port numbers here, before we have
               a chance to overflow on bogus port values.  */
            if (port > 0xffff)
              {
                error_code = PE_BAD_PORT_NUMBER;
                goto error;
              }
          }
    }
  /* Advance to the first separator *after* '/' (either ';' or '?',
     depending on the scheme).  */
  ++seps;

  /* Get the optional parts of URL, each part being delimited by
     current location and the position of the next separator.  The
     macro advances SEPS whether or not the part is present.  */
#define GET_URL_PART(sepchar, var) do {                         \
  if (*p == sepchar)                                            \
    var##_b = ++p, var##_e = p = strpbrk_or_eos (p, seps);      \
  ++seps;                                                       \
} while (0)

  GET_URL_PART ('/', path);
  if (supported_schemes[scheme].flags & scm_has_params)
    GET_URL_PART (';', params);
  if (supported_schemes[scheme].flags & scm_has_query)
    GET_URL_PART ('?', query);
  if (supported_schemes[scheme].flags & scm_has_fragment)
    GET_URL_PART ('#', fragment);

#undef GET_URL_PART
  /* Every character of the URL must have been consumed by now.  */
  assert (*p == 0);

  if (uname_b != uname_e)
    {
      /* http://user:pass@host */
      /*        ^         ^    */
      /*     uname_b   uname_e */
      /* uname_e points just past the '@', hence the "- 1".  */
      if (!parse_credentials (uname_b, uname_e - 1, &user, &passwd))
        {
          error_code = PE_INVALID_USER_NAME;
          goto error;
        }
    }

  /* No error is raised past this point; build the result.  */
  u = xnew0 (struct url);
  u->scheme = scheme;
  u->host   = strdupdelim (host_b, host_e);
  u->port   = port;
  u->user   = user;
  u->passwd = passwd;

  /* Note that path_b and path_e are both NULL here when the URL has
     no '/' after the host part.  */
  u->path = strdupdelim (path_b, path_e);
  path_modified = path_simplify (scheme, u->path);
  split_path (u->path, &u->dir, &u->file);

  host_modified = lowercase_str (u->host);

  /* Decode %HH sequences in host name.  This is important not so much
     to support %HH sequences in host names (which other browsers
     don't), but to support binary characters (which will have been
     converted to %HH by reencode_escapes).  */
  if (strchr (u->host, '%'))
    {
      url_unescape (u->host);
      host_modified = true;

      /* Apply IDNA regardless of iri->utf8_encode status */
      if (opt.enable_iri && iri)
        {
          char *new = idn_encode (iri, u->host);
          if (new)
            {
              xfree (u->host);
              u->host = new;
              host_modified = true;
            }
        }
    }

  if (params_b)
    u->params = strdupdelim (params_b, params_e);
  if (query_b)
    u->query = strdupdelim (query_b, query_e);
  if (fragment_b)
    u->fragment = strdupdelim (fragment_b, fragment_e);

  if (opt.enable_iri || path_modified || u->fragment || host_modified || path_b == path_e)
    {
      /* If we suspect that a transformation has rendered what
         url_string might return different from URL_ENCODED, rebuild
         u->url using url_string.  */
      u->url = url_string (u, URL_AUTH_SHOW);

      /* URL_ENCODED is not kept in this case; free it unless it is
         the caller's own string.  */
      if (url_encoded != url)
        xfree ((char *) url_encoded);
    }
  else
    {
      /* Keep URL_ENCODED as u->url: copy it if it is the caller's
         string, otherwise take over the allocated one.  */
      if (url_encoded == url)
        u->url = xstrdup (url);
      else
        u->url = (char *) url_encoded;
    }

  return u;

 error:
  /* Cleanup in case of error: */
  if (url_encoded && url_encoded != url)
    xfree ((char *) url_encoded);

  /* Transmit the error code to the caller, if the caller wants to
     know.  */
  if (error)
    *error = error_code;
  return NULL;
}
933
934/* Return the error message string from ERROR_CODE, which should have
935   been retrieved from url_parse.  The error message is translated.  */
936
937char *
938url_error (const char *url, int error_code)
939{
940  assert (error_code >= 0 && ((size_t) error_code) < countof (parse_errors));
941
942  if (error_code == PE_UNSUPPORTED_SCHEME)
943    {
944      char *error, *p;
945      char *scheme = xstrdup (url);
946      assert (url_has_scheme (url));
947
948      if ((p = strchr (scheme, ':')))
949        *p = '\0';
950      if (!strcasecmp (scheme, "https"))
951        error = aprintf (_("HTTPS support not compiled in"));
952      else
953        error = aprintf (_(parse_errors[error_code]), quote (scheme));
954      xfree (scheme);
955
956      return error;
957    }
958  else
959    return xstrdup (_(parse_errors[error_code]));
960}
961
/* Break the URL-escaped PATH into its directory and file parts and
   store unescaped copies of them in *DIR and *FILE.

   The directory is everything before the last slash, the file is
   everything after it.  Splitting happens before unescaping, so an
   escaped slash ends up inside the file part.  Examples:

   PATH                 DIR           FILE
   "foo/bar/baz"        "foo/bar"     "baz"
   "foo/bar/"           "foo/bar"     ""
   "foo"                ""            "foo"
   "foo/bar/baz%2fqux"  "foo/bar"     "baz/qux" (!)

   Both results are freshly allocated.  */

static void
split_path (const char *path, char **dir, char **file)
{
  const char *slash = strrchr (path, '/');

  if (slash)
    {
      *dir = strdupdelim (path, slash);
      *file = xstrdup (slash + 1);
    }
  else
    {
      /* No slash at all: the whole path is the file name.  */
      *dir = xstrdup ("");
      *file = xstrdup (path);
    }

  url_unescape (*dir);
  url_unescape (*file);
}
994
995/* Note: URL's "full path" is the path with the query string and
996   params appended.  The "fragment" (#foo) is intentionally ignored,
997   but that might be changed.  For example, if the original URL was
998   "http://host:port/foo/bar/baz;bullshit?querystring#uselessfragment",
999   the full path will be "/foo/bar/baz;bullshit?querystring".  */
1000
1001/* Return the length of the full path, without the terminating
1002   zero.  */
1003
1004static int
1005full_path_length (const struct url *url)
1006{
1007  int len = 0;
1008
1009#define FROB(el) if (url->el) len += 1 + strlen (url->el)
1010
1011  FROB (path);
1012  FROB (params);
1013  FROB (query);
1014
1015#undef FROB
1016
1017  return len;
1018}
1019
1020/* Write out the full path. */
1021
1022static void
1023full_path_write (const struct url *url, char *where)
1024{
1025#define FROB(el, chr) do {                      \
1026  char *f_el = url->el;                         \
1027  if (f_el) {                                   \
1028    int l = strlen (f_el);                      \
1029    *where++ = chr;                             \
1030    memcpy (where, f_el, l);                    \
1031    where += l;                                 \
1032  }                                             \
1033} while (0)
1034
1035  FROB (path, '/');
1036  FROB (params, ';');
1037  FROB (query, '?');
1038
1039#undef FROB
1040}
1041
/* Public entry point for obtaining the "full path" of URL as a newly
   allocated, zero-terminated string.  For instance, with u->path
   "foo/bar" and u->query "param=value" the result is
   "/foo/bar?param=value". */

char *
url_full_path (const struct url *url)
{
  int len = full_path_length (url);
  char *buf = xmalloc (len + 1);

  /* full_path_write does not terminate the string; do it here.  */
  full_path_write (url, buf);
  buf[len] = '\0';

  return buf;
}
1057
/* Unescape CHR in an otherwise escaped STR, in place.  Used to
   selectively undo the escaping of certain characters, such as "/"
   and ":".  Every "%XY" sequence whose two digits match the digits
   XNUM_TO_DIGIT yields for CHR is replaced by CHR itself; everything
   else is left untouched.  Nothing is returned.  */

static void
unescape_single_char (char *str, char chr)
{
  /* Work on the unsigned value of CHR: where plain char is signed, a
     CHR above 0x7f would otherwise produce a negative nibble from the
     right shift.  */
  const unsigned char uchr = (unsigned char) chr;
  const char c1 = XNUM_TO_DIGIT (uchr >> 4);
  const char c2 = XNUM_TO_DIGIT (uchr & 0xf);
  char *h = str;                /* hare */
  char *t = str;                /* tortoise */
  for (; *h; h++, t++)
    {
      /* The comparisons short-circuit, so h[2] is only read when h[1]
         matched C1 and is therefore not the terminating zero.  */
      if (h[0] == '%' && h[1] == c1 && h[2] == c2)
        {
          *t = chr;
          h += 2;
        }
      else
        *t = *h;
    }
  *t = '\0';
}
1081
1082/* Escape unsafe and reserved characters, except for the slash
1083   characters.  */
1084
1085static char *
1086url_escape_dir (const char *dir)
1087{
1088  char *newdir = url_escape_1 (dir, urlchr_unsafe | urlchr_reserved, 1);
1089  if (newdir == dir)
1090    return (char *)dir;
1091
1092  unescape_single_char (newdir, '/');
1093  return newdir;
1094}
1095
1096/* Sync u->path and u->url with u->dir and u->file.  Called after
1097   u->file or u->dir have been changed, typically by the FTP code.  */
1098
1099static void
1100sync_path (struct url *u)
1101{
1102  char *newpath, *efile, *edir;
1103
1104  xfree (u->path);
1105
1106  /* u->dir and u->file are not escaped.  URL-escape them before
1107     reassembling them into u->path.  That way, if they contain
1108     separators like '?' or even if u->file contains slashes, the
1109     path will be correctly assembled.  (u->file can contain slashes
1110     if the URL specifies it with %2f, or if an FTP server returns
1111     it.)  */
1112  edir = url_escape_dir (u->dir);
1113  efile = url_escape_1 (u->file, urlchr_unsafe | urlchr_reserved, 1);
1114
1115  if (!*edir)
1116    newpath = xstrdup (efile);
1117  else
1118    {
1119      int dirlen = strlen (edir);
1120      int filelen = strlen (efile);
1121
1122      /* Copy "DIR/FILE" to newpath. */
1123      char *p = newpath = xmalloc (dirlen + 1 + filelen + 1);
1124      memcpy (p, edir, dirlen);
1125      p += dirlen;
1126      *p++ = '/';
1127      memcpy (p, efile, filelen);
1128      p += filelen;
1129      *p = '\0';
1130    }
1131
1132  u->path = newpath;
1133
1134  if (edir != u->dir)
1135    xfree (edir);
1136  if (efile != u->file)
1137    xfree (efile);
1138
1139  /* Regenerate u->url as well.  */
1140  xfree (u->url);
1141  u->url = url_string (u, URL_AUTH_SHOW);
1142}
1143
1144/* Mutators.  Code in ftp.c insists on changing u->dir and u->file.
1145   This way we can sync u->path and u->url when they get changed.  */
1146
1147void
1148url_set_dir (struct url *url, const char *newdir)
1149{
1150  xfree (url->dir);
1151  url->dir = xstrdup (newdir);
1152  sync_path (url);
1153}
1154
1155void
1156url_set_file (struct url *url, const char *newfile)
1157{
1158  xfree (url->file);
1159  url->file = xstrdup (newfile);
1160  sync_path (url);
1161}
1162
1163void
1164url_free (struct url *url)
1165{
1166  xfree (url->host);
1167  xfree (url->path);
1168  xfree (url->url);
1169
1170  xfree_null (url->params);
1171  xfree_null (url->query);
1172  xfree_null (url->fragment);
1173  xfree_null (url->user);
1174  xfree_null (url->passwd);
1175
1176  xfree (url->dir);
1177  xfree (url->file);
1178
1179  xfree (url);
1180}
1181
1182/* Create all the necessary directories for PATH (a file).  Calls
1183   make_directory internally.  */
1184int
1185mkalldirs (const char *path)
1186{
1187  const char *p;
1188  char *t;
1189  struct_stat st;
1190  int res;
1191
1192  p = path + strlen (path);
1193  for (; *p != '/' && p != path; p--)
1194    ;
1195
1196  /* Don't create if it's just a file.  */
1197  if ((p == path) && (*p != '/'))
1198    return 0;
1199  t = strdupdelim (path, p);
1200
1201  /* Check whether the directory exists.  */
1202  if ((stat (t, &st) == 0))
1203    {
1204      if (S_ISDIR (st.st_mode))
1205        {
1206          xfree (t);
1207          return 0;
1208        }
1209      else
1210        {
1211          /* If the dir exists as a file name, remove it first.  This
1212             is *only* for Wget to work with buggy old CERN http
1213             servers.  Here is the scenario: When Wget tries to
1214             retrieve a directory without a slash, e.g.
1215             http://foo/bar (bar being a directory), CERN server will
1216             not redirect it too http://foo/bar/ -- it will generate a
1217             directory listing containing links to bar/file1,
1218             bar/file2, etc.  Wget will lose because it saves this
1219             HTML listing to a file `bar', so it cannot create the
1220             directory.  To work around this, if the file of the same
1221             name exists, we just remove it and create the directory
1222             anyway.  */
1223          DEBUGP (("Removing %s because of directory danger!\n", t));
1224          unlink (t);
1225        }
1226    }
1227  res = make_directory (t);
1228  if (res != 0)
1229    logprintf (LOG_NOTQUIET, "%s: %s", t, strerror (errno));
1230  xfree (t);
1231  return res;
1232}
1233
1234/* Functions for constructing the file name out of URL components.  */
1235
/* A growable string structure, used by url_file_name and friends.
   This should perhaps be moved to utils.c.

   The idea is to have a convenient and efficient way to construct a
   string by having various functions append data to it.  Instead of
   passing the obligatory BASEVAR, SIZEVAR and TAILPOS to all the
   functions in question, we pass the pointer to this struct.  */

struct growable {
  char *base;                   /* start of the buffer; NULL until first grown */
  int size;                     /* allocated size, maintained by GROW */
  int tail;                     /* offset from BASE where the next append goes */
};
1249
/* Ensure that the string can accept APPEND_SIZE more characters past
   the current TAIL position.  If necessary, this will grow the string
   and update its allocated size.  If the string is already large
   enough to take TAIL+APPEND_SIZE characters, this does nothing.
   G is evaluated once; APPEND_SIZE is parenthesized so that any
   expression may be passed.  */
#define GROW(g, append_size) do {                                       \
  struct growable *G_ = (g);                                            \
  DO_REALLOC (G_->base, G_->size, G_->tail + (append_size), char);      \
} while (0)

/* Return the tail position of the string. */
#define TAIL(r) ((r)->base + (r)->tail)

/* Move the tail position by APPEND_COUNT characters. */
#define TAIL_INCR(r, append_count) ((r)->tail += (append_count))
1264
/* Copy the characters of STR (without its terminating zero) to the
   tail of DEST, growing DEST first.  NOTICE: DEST is left
   unterminated.  */

static void
append_string (const char *str, struct growable *dest)
{
  int count = strlen (str);

  /* Make room before taking the tail pointer: growing may move the
     buffer.  */
  GROW (dest, count);
  memcpy (TAIL (dest), str, count);
  TAIL_INCR (dest, count);
}
1276
1277/* Append CH to DEST.  For example, append_char (0, DEST)
1278   zero-terminates DEST.  */
1279
1280static void
1281append_char (char ch, struct growable *dest)
1282{
1283  GROW (dest, 1);
1284  *TAIL (dest) = ch;
1285  TAIL_INCR (dest, 1);
1286}
1287
/* Bit flags stored per character in filechr_table.  A character may
   carry several of them at once.  */
enum {
  filechr_not_unix    = 1,      /* unusable on Unix, / and \0 */
  filechr_not_windows = 2,      /* unusable on Windows, one of \|/<>?:*" */
  filechr_control     = 4       /* a control character, e.g. 0-31 */
};
1293
/* Nonzero if character C has to be quoted in a file name: either C is
   not ASCII while opt.restrict_files_nonascii is set, or its entry in
   filechr_table has at least one of the bits in MASK.  C is evaluated
   twice.  */
#define FILE_CHAR_TEST(c, mask) \
    ((opt.restrict_files_nonascii && !c_isascii ((unsigned char)(c))) || \
    (filechr_table[(unsigned char)(c)] & (mask)))
1297
/* Shorthands for the table.  They are only meant to be used inside
   the initializer below and are undefined right after it.  */
#define U filechr_not_unix
#define W filechr_not_windows
#define C filechr_control

#define UW U|W
#define UWC U|W|C

/* Table of characters unsafe under various conditions (see above),
   indexed by the character's value as an unsigned char.

   Arguably we could also claim `%' to be unsafe, since we use it as
   the escape character.  If we ever want to be able to reliably
   translate file name back to URL, this would become crucial.  Right
   now, it's better to be minimal in escaping.  */

static const unsigned char filechr_table[256] =
{
UWC,  C,  C,  C,   C,  C,  C,  C,   /* NUL SOH STX ETX  EOT ENQ ACK BEL */
  C,  C,  C,  C,   C,  C,  C,  C,   /* BS  HT  LF  VT   FF  CR  SO  SI  */
  C,  C,  C,  C,   C,  C,  C,  C,   /* DLE DC1 DC2 DC3  DC4 NAK SYN ETB */
  C,  C,  C,  C,   C,  C,  C,  C,   /* CAN EM  SUB ESC  FS  GS  RS  US  */
  0,  0,  W,  0,   0,  0,  0,  0,   /* SP  !   "   #    $   %   &   '   */
  0,  0,  W,  0,   0,  0,  0, UW,   /* (   )   *   +    ,   -   .   /   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* 0   1   2   3    4   5   6   7   */
  0,  0,  W,  0,   W,  0,  W,  W,   /* 8   9   :   ;    <   =   >   ?   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* @   A   B   C    D   E   F   G   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* H   I   J   K    L   M   N   O   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* P   Q   R   S    T   U   V   W   */
  0,  0,  0,  0,   W,  0,  0,  0,   /* X   Y   Z   [    \   ]   ^   _   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* `   a   b   c    d   e   f   g   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* h   i   j   k    l   m   n   o   */
  0,  0,  0,  0,   0,  0,  0,  0,   /* p   q   r   s    t   u   v   w   */
  0,  0,  0,  0,   W,  0,  0,  C,   /* x   y   z   {    |   }   ~   DEL */

  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 128-143 */
  C, C, C, C,  C, C, C, C,  C, C, C, C,  C, C, C, C, /* 144-159 */
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,

  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,  0, 0, 0, 0,
};
#undef U
#undef W
#undef C
#undef UW
#undef UWC
1347
/* FN_PORT_SEP separates host and port in file names when the port is
   not the standard one.  It is normally ':', as in
   "www.xemacs.org:4001/index.html"; when file names are restricted
   for Windows it becomes '+', because Windows can't handle ':' in
   file names.  */
#define FN_PORT_SEP  (opt.restrict_files_os == restrict_windows ? '+' : ':')

/* FN_QUERY_SEP separates the file name from the URL query.  It is
   normally '?'; when file names are restricted for Windows, which
   cannot handle '?' in a file name, '@' is used instead.  */
#define FN_QUERY_SEP (opt.restrict_files_os == restrict_windows ? '@' : '?')
1358
1359/* Quote path element, characters in [b, e), as file name, and append
1360   the quoted string to DEST.  Each character is quoted as per
1361   file_unsafe_char and the corresponding table.
1362
1363   If ESCAPED is true, the path element is considered to be
1364   URL-escaped and will be unescaped prior to inspection.  */
1365
1366static void
1367append_uri_pathel (const char *b, const char *e, bool escaped,
1368                   struct growable *dest)
1369{
1370  const char *p;
1371  int quoted, outlen;
1372
1373  int mask;
1374  if (opt.restrict_files_os == restrict_unix)
1375    mask = filechr_not_unix;
1376  else
1377    mask = filechr_not_windows;
1378  if (opt.restrict_files_ctrl)
1379    mask |= filechr_control;
1380
1381  /* Copy [b, e) to PATHEL and URL-unescape it. */
1382  if (escaped)
1383    {
1384      char *unescaped;
1385      BOUNDED_TO_ALLOCA (b, e, unescaped);
1386      url_unescape (unescaped);
1387      b = unescaped;
1388      e = unescaped + strlen (unescaped);
1389    }
1390
1391  /* Defang ".." when found as component of path.  Remember that path
1392     comes from the URL and might contain malicious input.  */
1393  if (e - b == 2 && b[0] == '.' && b[1] == '.')
1394    {
1395      b = "%2E%2E";
1396      e = b + 6;
1397    }
1398
1399  /* Walk the PATHEL string and check how many characters we'll need
1400     to quote.  */
1401  quoted = 0;
1402  for (p = b; p < e; p++)
1403    if (FILE_CHAR_TEST (*p, mask))
1404      ++quoted;
1405
1406  /* Calculate the length of the output string.  e-b is the input
1407     string length.  Each quoted char introduces two additional
1408     characters in the string, hence 2*quoted.  */
1409  outlen = (e - b) + (2 * quoted);
1410  GROW (dest, outlen);
1411
1412  if (!quoted)
1413    {
1414      /* If there's nothing to quote, we can simply append the string
1415         without processing it again.  */
1416      memcpy (TAIL (dest), b, outlen);
1417    }
1418  else
1419    {
1420      char *q = TAIL (dest);
1421      for (p = b; p < e; p++)
1422        {
1423          if (!FILE_CHAR_TEST (*p, mask))
1424            *q++ = *p;
1425          else
1426            {
1427              unsigned char ch = *p;
1428              *q++ = '%';
1429              *q++ = XNUM_TO_DIGIT (ch >> 4);
1430              *q++ = XNUM_TO_DIGIT (ch & 0xf);
1431            }
1432        }
1433      assert (q - TAIL (dest) == outlen);
1434    }
1435
1436  /* Perform inline case transformation if required.  */
1437  if (opt.restrict_files_case == restrict_lowercase
1438      || opt.restrict_files_case == restrict_uppercase)
1439    {
1440      char *q;
1441      for (q = TAIL (dest); q < TAIL (dest) + outlen; ++q)
1442        {
1443          if (opt.restrict_files_case == restrict_lowercase)
1444            *q = c_tolower (*q);
1445          else
1446            *q = c_toupper (*q);
1447        }
1448    }
1449
1450  TAIL_INCR (dest, outlen);
1451}
1452
1453/* Append to DEST the directory structure that corresponds the
1454   directory part of URL's path.  For example, if the URL is
1455   http://server/dir1/dir2/file, this appends "/dir1/dir2".
1456
1457   Each path element ("dir1" and "dir2" in the above example) is
1458   examined, url-unescaped, and re-escaped as file name element.
1459
1460   Additionally, it cuts as many directories from the path as
1461   specified by opt.cut_dirs.  For example, if opt.cut_dirs is 1, it
1462   will produce "bar" for the above example.  For 2 or more, it will
1463   produce "".
1464
1465   Each component of the path is quoted for use as file name.  */
1466
1467static void
1468append_dir_structure (const struct url *u, struct growable *dest)
1469{
1470  char *pathel, *next;
1471  int cut = opt.cut_dirs;
1472
1473  /* Go through the path components, de-URL-quote them, and quote them
1474     (if necessary) as file names.  */
1475
1476  pathel = u->path;
1477  for (; (next = strchr (pathel, '/')) != NULL; pathel = next + 1)
1478    {
1479      if (cut-- > 0)
1480        continue;
1481      if (pathel == next)
1482        /* Ignore empty pathels.  */
1483        continue;
1484
1485      if (dest->tail)
1486        append_char ('/', dest);
1487      append_uri_pathel (pathel, next, true, dest);
1488    }
1489}
1490
/* Return a unique file name that matches the given URL as good as
   possible.  Does not create directories on the file system.

   The name is assembled in this order: opt.dir_prefix; then, only
   when opt.dirstruct is set, the scheme name (opt.protocol_directories),
   the host with any non-default port (opt.add_hostdir) and the
   directory part of the path; then the file name, or the default
   index file name when u->file is empty; and finally the query
   string, if there is a non-empty one.  */

char *
url_file_name (const struct url *u)
{
  struct growable fnres;        /* stands for "file name result" */

  const char *u_file, *u_query;
  char *fname, *unique;
  char *index_filename = "index.html"; /* The default index file is index.html */

  fnres.base = NULL;
  fnres.size = 0;
  fnres.tail = 0;

  /* If an alternative index file was defined, change index_filename */
  if (opt.default_page)
    index_filename = opt.default_page;


  /* Start with the directory prefix, if specified. */
  if (opt.dir_prefix)
    append_string (opt.dir_prefix, &fnres);

  /* If "dirstruct" is turned on (typically the case with -r), add
     the host and port (unless those have been turned off) and
     directory structure.  */
  if (opt.dirstruct)
    {
      if (opt.protocol_directories)
        {
          /* A separator is only needed when something precedes us.  */
          if (fnres.tail)
            append_char ('/', &fnres);
          append_string (supported_schemes[u->scheme].name, &fnres);
        }
      if (opt.add_hostdir)
        {
          if (fnres.tail)
            append_char ('/', &fnres);
          if (0 != strcmp (u->host, ".."))
            append_string (u->host, &fnres);
          else
            /* Host name can come from the network; malicious DNS may
               allow ".." to be resolved, causing us to write to
               "../<file>".  Defang such host names.  */
            append_string ("%2E%2E", &fnres);
          /* Only a port that differs from the scheme's default is
             made part of the name.  */
          if (u->port != scheme_default_port (u->scheme))
            {
              char portstr[24];
              number_to_string (portstr, u->port);
              append_char (FN_PORT_SEP, &fnres);
              append_string (portstr, &fnres);
            }
        }

      append_dir_structure (u, &fnres);
    }

  /* Add the file name. */
  if (fnres.tail)
    append_char ('/', &fnres);
  u_file = *u->file ? u->file : index_filename;
  /* u->file is passed as not escaped (ESCAPED is false).  */
  append_uri_pathel (u_file, u_file + strlen (u_file), false, &fnres);

  /* Append "?query" to the file name. */
  u_query = u->query && *u->query ? u->query : NULL;
  if (u_query)
    {
      append_char (FN_QUERY_SEP, &fnres);
      /* The query is passed as escaped (ESCAPED is true).  */
      append_uri_pathel (u_query, u_query + strlen (u_query), true, &fnres);
    }

  /* Zero-terminate the file name. */
  append_char ('\0', &fnres);

  fname = fnres.base;

  /* Check the cases in which the unique extensions are not used:
     1) Clobbering is turned off (-nc).
     2) Retrieval with regetting.
     3) Timestamping is used.
     4) Hierarchy is built.

     The exception is the case when file does exist and is a
     directory (see `mkalldirs' for explanation).  */

  if ((opt.noclobber || opt.always_rest || opt.timestamping || opt.dirstruct)
      && !(file_exists_p (fname) && !file_non_directory_p (fname)))
    {
      unique = fname;
    }
  else
    {
      /* unique_name may hand back FNAME itself; only release FNAME
         when a different string was returned.  */
      unique = unique_name (fname, true);
      if (unique != fname)
        xfree (fname);
    }

/* On VMS, alter the name as required. */
#ifdef __VMS
  {
    char *unique2;

    unique2 = ods_conform( unique);
    if (unique2 != unique)
      {
        xfree (unique);
        unique = unique2;
      }
  }
#endif /* def __VMS */

  return unique;
}
1606
/* Resolve "." and ".." elements of PATH by destructively modifying
   PATH and return true if PATH has been modified, false otherwise.

   The algorithm is in spirit similar to the one described in rfc1808,
   although implemented differently, in one pass.  To recap, path
   elements containing only "." are removed, and ".." is taken to mean
   "back up one element".  Single leading and trailing slashes are
   preserved.

   For SCHEME_FTP, ".." elements that would back up past the start of
   the path are kept literally instead of being dropped.

   For example, "a/b/c/./../d/.." will yield "a/b/".  More exhaustive
   test examples are provided below.  If you change anything in this
   function, run test_path_simplify to make sure you haven't broken a
   test case.  */

static bool
path_simplify (enum url_scheme scheme, char *path)
{
  char *h = path;               /* hare */
  char *t = path;               /* tortoise */
  char *beg = path;             /* the tortoise never retreats before this */
  char *end = strchr (path, '\0');

  while (h < end)
    {
      /* Hare should be at the beginning of a path element. */

      if (h[0] == '.' && (h[1] == '/' || h[1] == '\0'))
        {
          /* Ignore "./". */
          h += 2;
        }
      else if (h[0] == '.' && h[1] == '.' && (h[2] == '/' || h[2] == '\0'))
        {
          /* Handle "../" by retreating the tortoise by one path
             element -- but not past beginning.  */
          if (t > beg)
            {
              /* Move backwards until T hits the beginning of the
                 previous path element or the beginning of path. */
              for (--t; t > beg && t[-1] != '/'; t--)
                ;
            }
          else if (scheme == SCHEME_FTP)
            {
              /* If we're at the beginning, copy the "../" literally
                 and move the beginning so a later ".." doesn't remove
                 it.  This violates RFC 3986; but we do it for FTP
                 anyway because there is otherwise no way to get at a
                 parent directory, when the FTP server drops us in a
                 non-root directory (which is not uncommon). */
              beg = t + 3;
              goto regular;
            }
          h += 3;
        }
      else
        {
        regular:
          /* A regular path element.  If H hasn't advanced past T,
             simply skip to the next path element.  Otherwise, copy
             the path element until the next slash.  */
          if (t == h)
            {
              /* Skip the path element, including the slash.  */
              while (h < end && *h != '/')
                t++, h++;
              if (h < end)
                t++, h++;
            }
          else
            {
              /* Copy the path element, including the final slash.  */
              while (h < end && *h != '/')
                *t++ = *h++;
              if (h < end)
                *t++ = *h++;
            }
        }
    }

  /* T differs from H exactly when something was dropped; in that case
     the shortened path still has to be terminated.  */
  if (t != h)
    *t = '\0';

  return t != h;
}
1692
1693/* Return the length of URL's path.  Path is considered to be
1694   terminated by one or more of the ?query or ;params or #fragment,
1695   depending on the scheme.  */
1696
1697static const char *
1698path_end (const char *url)
1699{
1700  enum url_scheme scheme = url_scheme (url);
1701  const char *seps;
1702  if (scheme == SCHEME_INVALID)
1703    scheme = SCHEME_HTTP;       /* use http semantics for rel links */
1704  /* +2 to ignore the first two separators ':' and '/' */
1705  seps = init_seps (scheme) + 2;
1706  return strpbrk_or_eos (url, seps);
1707}
1708
/* Find the last occurrence of character C in the range [b, e), or
   NULL, if none are present.  Implemented with memrchr over the
   (e) - (b) bytes starting at B; B and E are each evaluated once.  */
#define find_last_char(b, e, c) memrchr ((b), (c), (e) - (b))
1712
/* Return a freshly allocated, zero-terminated string that consists of
   the first SPAN bytes of BASE followed by the LINKLENGTH bytes of
   LINK.  */

static char *
merge_prefix_with_link (const char *base, int span,
                        const char *link, int linklength)
{
  char *merge = xmalloc (span + linklength + 1);
  if (span)
    memcpy (merge, base, span);
  memcpy (merge + span, link, linklength);
  merge[span + linklength] = '\0';
  return merge;
}

/* Merge BASE with LINK and return the resulting URI as a freshly
   allocated string.

   Either of the URIs may be absolute or relative, complete with the
   host name, or path only.  This tries to reasonably handle all
   foreseeable cases.  It only employs minimal URL parsing, without
   knowledge of the specifics of schemes.

   I briefly considered making this function call path_simplify after
   the merging process, as rfc1738 seems to suggest.  This is a bad
   idea for several reasons: 1) it complexifies the code, and 2)
   url_parse has to simplify path anyway, so it's wasteful to boot.  */

char *
uri_merge (const char *base, const char *link)
{
  int linklength;
  const char *end;

  /* A LINK with its own scheme is already absolute.  */
  if (url_has_scheme (link))
    return xstrdup (link);

  /* We may not examine BASE past END. */
  end = path_end (base);
  linklength = strlen (link);

  if (!*link)
    {
      /* Empty LINK points back to BASE, query string and all. */
      return xstrdup (base);
    }

  if (*link == '?')
    {
      /* LINK points to the same location, but changes the query
         string.  Examples: */
      /* uri_merge("path",         "?new") -> "path?new"     */
      /* uri_merge("path?foo",     "?new") -> "path?new"     */
      /* uri_merge("path?foo#bar", "?new") -> "path?new"     */
      /* uri_merge("path#foo",     "?new") -> "path?new"     */
      return merge_prefix_with_link (base, end - base, link, linklength);
    }

  if (*link == '#')
    {
      /* uri_merge("path",         "#new") -> "path#new"     */
      /* uri_merge("path#foo",     "#new") -> "path#new"     */
      /* uri_merge("path?foo",     "#new") -> "path?foo#new" */
      /* uri_merge("path?foo#bar", "#new") -> "path?foo#new" */
      const char *hash = strchr (base, '#');
      if (!hash)
        hash = base + strlen (base);
      return merge_prefix_with_link (base, hash - base, link, linklength);
    }

  if (link[0] == '/' && link[1] == '/')
    {
      /* LINK begins with "//" and so is a net path: we need to
         replace everything after (and including) the double slash
         with LINK. */

      /* uri_merge("foo", "//new/bar")            -> "//new/bar"      */
      /* uri_merge("//old/foo", "//new/bar")      -> "//new/bar"      */
      /* uri_merge("http://old/foo", "//new/bar") -> "http://new/bar" */

      /* Look for first slash.  If it starts a double slash, replace
         from that point, else replace from the beginning.  */
      const char *slash = memchr (base, '/', end - base);
      const char *start_insert = (slash && slash[1] == '/') ? slash : base;

      return merge_prefix_with_link (base, start_insert - base,
                                     link, linklength);
    }

  if (*link == '/')
    {
      /* LINK is an absolute path: we need to replace everything
         after (and including) the FIRST slash with LINK.

         So, if BASE is "http://host/whatever/foo/bar", and LINK is
         "/qux/xyzzy", our result should be
         "http://host/qux/xyzzy".  */
      const char *start_insert;
      bool seen_slash_slash = false;

      /* We're looking for the first slash, but want to ignore one
         leading double slash.  */
      const char *slash = memchr (base, '/', end - base);
      if (slash && slash[1] == '/')
        {
          const char *pos = slash + 2;
          seen_slash_slash = true;
          slash = memchr (pos, '/', end - pos);
        }

      /* At this point, SLASH is the location of the first / after
         "//", or the first slash altogether.  START_INSERT is the
         location where LINK will be inserted; keep in mind that LINK
         begins with '/'.

           "foo"               no slash, no "//"  -> from BASE
           "foo/bar"           slash, no "//"     -> from BASE
           "http://foo"        "//", no slash     -> at END
           "http://something/" "//" and slash     -> at that slash  */
      if (!seen_slash_slash)
        start_insert = base;
      else if (slash)
        start_insert = slash;
      else
        start_insert = end;

      return merge_prefix_with_link (base, start_insert - base,
                                     link, linklength);
    }

  /* LINK is a relative URL: we need to replace everything
     after last slash (possibly empty) with LINK.

     So, if BASE is "whatever/foo/bar", and LINK is "qux/xyzzy",
     our result should be "whatever/foo/qux/xyzzy".  */
  {
    bool need_explicit_slash = false;
    int span;
    char *merge;
    const char *start_insert;
    const char *last_slash = find_last_char (base, end, '/');

    if (!last_slash)
      {
        /* No slash found at all.  Replace what we have with LINK. */
        start_insert = base;
      }
    else if (last_slash >= base + 2
             && last_slash[-2] == ':' && last_slash[-1] == '/')
      {
        /* example: "http://host"  */
        /*                       ^ */
        /* The last slash belongs to "://", so BASE has no path.  Keep
           all of it and reserve one more byte for the slash that has
           to separate host and LINK.  */
        start_insert = end + 1;
        need_explicit_slash = true;
      }
    else
      {
        /* example: "whatever/foo/bar" */
        /*                        ^    */
        start_insert = last_slash + 1;
      }

    span = start_insert - base;
    merge = merge_prefix_with_link (base, span, link, linklength);
    /* The reserved byte currently holds whatever followed the path in
       BASE; turn it into the separating slash.  */
    if (need_explicit_slash)
      merge[span - 1] = '/';
    return merge;
  }
}
1903
/* Copy the NUL-terminated string S (without its terminating NUL) to
   the location P points at, then advance P past the copied bytes.

   P must be a modifiable `char *' lvalue with enough room behind it.
   S is evaluated exactly once.  The temporaries carry a trailing
   underscore so that they cannot capture an identifier used in the
   caller's S expression, and the length is kept in a size_t, which is
   what strlen returns and memcpy expects.  */
#define APPEND(p, s) do {                               \
  const char *append_src_ = (s);                        \
  size_t append_len_ = strlen (append_src_);            \
  memcpy ((p), append_src_, append_len_);               \
  (p) += append_len_;                                   \
} while (0)

/* Placeholder that stands in for the real password whenever the
   password must not be shown.  It is a fixed, generic string on
   purpose: unlike what previous versions did, it does not give away
   the number of characters in the password.  */
#define HIDDEN_PASSWORD "*password*"
1915
1916/* Recreate the URL string from the data in URL.
1917
1918   If HIDE is true (as it is when we're calling this on a URL we plan
1919   to print, but not when calling it to canonicalize a URL for use
1920   within the program), password will be hidden.  Unsafe characters in
1921   the URL will be quoted.  */
1922
1923char *
1924url_string (const struct url *url, enum url_auth_mode auth_mode)
1925{
1926  int size;
1927  char *result, *p;
1928  char *quoted_host, *quoted_user = NULL, *quoted_passwd = NULL;
1929
1930  int scheme_port = supported_schemes[url->scheme].default_port;
1931  const char *scheme_str = supported_schemes[url->scheme].leading_string;
1932  int fplen = full_path_length (url);
1933
1934  bool brackets_around_host;
1935
1936  assert (scheme_str != NULL);
1937
1938  /* Make sure the user name and password are quoted. */
1939  if (url->user)
1940    {
1941      if (auth_mode != URL_AUTH_HIDE)
1942        {
1943          quoted_user = url_escape_allow_passthrough (url->user);
1944          if (url->passwd)
1945            {
1946              if (auth_mode == URL_AUTH_HIDE_PASSWD)
1947                quoted_passwd = HIDDEN_PASSWORD;
1948              else
1949                quoted_passwd = url_escape_allow_passthrough (url->passwd);
1950            }
1951        }
1952    }
1953
1954  /* In the unlikely event that the host name contains non-printable
1955     characters, quote it for displaying to the user.  */
1956  quoted_host = url_escape_allow_passthrough (url->host);
1957
1958  /* Undo the quoting of colons that URL escaping performs.  IPv6
1959     addresses may legally contain colons, and in that case must be
1960     placed in square brackets.  */
1961  if (quoted_host != url->host)
1962    unescape_single_char (quoted_host, ':');
1963  brackets_around_host = strchr (quoted_host, ':') != NULL;
1964
1965  size = (strlen (scheme_str)
1966          + strlen (quoted_host)
1967          + (brackets_around_host ? 2 : 0)
1968          + fplen
1969          + 1);
1970  if (url->port != scheme_port)
1971    size += 1 + numdigit (url->port);
1972  if (quoted_user)
1973    {
1974      size += 1 + strlen (quoted_user);
1975      if (quoted_passwd)
1976        size += 1 + strlen (quoted_passwd);
1977    }
1978
1979  p = result = xmalloc (size);
1980
1981  APPEND (p, scheme_str);
1982  if (quoted_user)
1983    {
1984      APPEND (p, quoted_user);
1985      if (quoted_passwd)
1986        {
1987          *p++ = ':';
1988          APPEND (p, quoted_passwd);
1989        }
1990      *p++ = '@';
1991    }
1992
1993  if (brackets_around_host)
1994    *p++ = '[';
1995  APPEND (p, quoted_host);
1996  if (brackets_around_host)
1997    *p++ = ']';
1998  if (url->port != scheme_port)
1999    {
2000      *p++ = ':';
2001      p = number_to_string (p, url->port);
2002    }
2003
2004  full_path_write (url, p);
2005  p += fplen;
2006  *p++ = '\0';
2007
2008  assert (p - result == size);
2009
2010  if (quoted_user && quoted_user != url->user)
2011    xfree (quoted_user);
2012  if (quoted_passwd && auth_mode == URL_AUTH_SHOW
2013      && quoted_passwd != url->passwd)
2014    xfree (quoted_passwd);
2015  if (quoted_host != url->host)
2016    xfree (quoted_host);
2017
2018  return result;
2019}
2020
2021/* Return true if scheme a is similar to scheme b.
2022
2023   Schemes are similar if they are equal.  If SSL is supported, schemes
2024   are also similar if one is http (SCHEME_HTTP) and the other is https
2025   (SCHEME_HTTPS).  */
2026bool
2027schemes_are_similar_p (enum url_scheme a, enum url_scheme b)
2028{
2029  if (a == b)
2030    return true;
2031#ifdef HAVE_SSL
2032  if ((a == SCHEME_HTTP && b == SCHEME_HTTPS)
2033      || (a == SCHEME_HTTPS && b == SCHEME_HTTP))
2034    return true;
2035#endif
2036  return false;
2037}
2038
/* Fetch the next logical character of the non-empty string STR and
   store it in *C, decoding a leading %XX escape when appropriate.

   Returns the number of bytes of STR that make up that character:

     3  STR starts with '%' followed by two hex digits, and the decoded
        value is not one for which URL_RESERVED_CHAR is true; *C holds
        the decoded character.
     1  in every other case.  *C is STR[0]; in particular a '%' that
        does not start a valid escape, or that escapes a reserved
        character, is returned as a literal '%' so that the rest of
        the sequence is processed character by character.

   The function never returns 0.  */
static int
getchar_from_escaped_string (const char *str, char *c)
{
  assert (str && *str);
  assert (c);

  if (str[0] != '%')
    {
      *c = str[0];
      return 1;
    }

  /* The terminating NUL is not a hex digit, so this test also rejects
     escapes truncated by the end of the string, and thanks to the
     short-circuit it never reads past the terminator.  No separate
     check of str[2] against NUL is needed afterwards.  */
  if (!c_isxdigit (str[1]) || !c_isxdigit (str[2]))
    {
      *c = '%';
      return 1;
    }

  *c = X2DIGITS_TO_NUM (str[1], str[2]);
  if (URL_RESERVED_CHAR (*c))
    {
      /* Keep escaped reserved characters distinct from their literal
         form: report the '%' itself and consume a single byte.  */
      *c = '%';
      return 1;
    }

  return 3;
}
2076
/* Compare the URL strings U1 and U2 for equivalence.

   Both strings are walked in parallel, one logical character at a
   time as delivered by getchar_from_escaped_string, and the characters
   are compared after folding with c_tolower.  The URLs are equal when
   both strings are exhausted at the same time without a mismatch.  */
bool
are_urls_equal (const char *u1, const char *u2)
{
  const char *left, *right;

  assert(u1 && u2);

  left = u1;
  right = u2;

  for (;;)
    {
      char lch, rch;
      int lstep, rstep;

      if (!*left || !*right)
        break;

      lstep = getchar_from_escaped_string (left, &lch);
      if (!lstep)
        break;

      rstep = getchar_from_escaped_string (right, &rch);
      if (!rstep)
        break;

      if (c_tolower(lch) != c_tolower(rch))
        break;

      left += lstep;
      right += rstep;
    }

  return *left == 0 && *right == 0;
}
2099
2100#ifdef TESTING
2101/* Debugging and testing support for path_simplify. */
2102
#if 0
/* Debug: run path_simplify on a copy of PATH and return the copy,
   which is allocated with xstrdup.  Useful for calling from the
   debugger.

   path_simplify takes the URL scheme as its first argument (see
   run_test below); SCHEME_HTTP is used here so that the helper keeps
   its one-argument, debugger-friendly signature.  */
static char *
ps (char *path)
{
  char *copy = xstrdup (path);
  path_simplify (SCHEME_HTTP, copy);
  return copy;
}
#endif
2114
2115static const char *
2116run_test (char *test, char *expected_result, enum url_scheme scheme,
2117          bool expected_change)
2118{
2119  char *test_copy = xstrdup (test);
2120  bool modified = path_simplify (scheme, test_copy);
2121
2122  if (0 != strcmp (test_copy, expected_result))
2123    {
2124      printf ("Failed path_simplify(\"%s\"): expected \"%s\", got \"%s\".\n",
2125              test, expected_result, test_copy);
2126      mu_assert ("", 0);
2127    }
2128  if (modified != expected_change)
2129    {
2130      if (expected_change)
2131        printf ("Expected modification with path_simplify(\"%s\").\n",
2132                test);
2133      else
2134        printf ("Expected no modification with path_simplify(\"%s\").\n",
2135                test);
2136    }
2137  xfree (test_copy);
2138  mu_assert ("", modified == expected_change);
2139  return NULL;
2140}
2141
2142const char *
2143test_path_simplify (void)
2144{
2145  static struct {
2146    char *test, *result;
2147    enum url_scheme scheme;
2148    bool should_modify;
2149  } tests[] = {
2150    { "",                       "",             SCHEME_HTTP, false },
2151    { ".",                      "",             SCHEME_HTTP, true },
2152    { "./",                     "",             SCHEME_HTTP, true },
2153    { "..",                     "",             SCHEME_HTTP, true },
2154    { "../",                    "",             SCHEME_HTTP, true },
2155    { "..",                     "..",           SCHEME_FTP,  false },
2156    { "../",                    "../",          SCHEME_FTP,  false },
2157    { "foo",                    "foo",          SCHEME_HTTP, false },
2158    { "foo/bar",                "foo/bar",      SCHEME_HTTP, false },
2159    { "foo///bar",              "foo///bar",    SCHEME_HTTP, false },
2160    { "foo/.",                  "foo/",         SCHEME_HTTP, true },
2161    { "foo/./",                 "foo/",         SCHEME_HTTP, true },
2162    { "foo./",                  "foo./",        SCHEME_HTTP, false },
2163    { "foo/../bar",             "bar",          SCHEME_HTTP, true },
2164    { "foo/../bar/",            "bar/",         SCHEME_HTTP, true },
2165    { "foo/bar/..",             "foo/",         SCHEME_HTTP, true },
2166    { "foo/bar/../x",           "foo/x",        SCHEME_HTTP, true },
2167    { "foo/bar/../x/",          "foo/x/",       SCHEME_HTTP, true },
2168    { "foo/..",                 "",             SCHEME_HTTP, true },
2169    { "foo/../..",              "",             SCHEME_HTTP, true },
2170    { "foo/../../..",           "",             SCHEME_HTTP, true },
2171    { "foo/../../bar/../../baz", "baz",         SCHEME_HTTP, true },
2172    { "foo/../..",              "..",           SCHEME_FTP,  true },
2173    { "foo/../../..",           "../..",        SCHEME_FTP,  true },
2174    { "foo/../../bar/../../baz", "../../baz",   SCHEME_FTP,  true },
2175    { "a/b/../../c",            "c",            SCHEME_HTTP, true },
2176    { "./a/../b",               "b",            SCHEME_HTTP, true }
2177  };
2178  int i;
2179
2180  for (i = 0; i < countof (tests); i++)
2181    {
2182      const char *message;
2183      char *test = tests[i].test;
2184      char *expected_result = tests[i].result;
2185      enum url_scheme scheme = tests[i].scheme;
2186      bool  expected_change = tests[i].should_modify;
2187      message = run_test (test, expected_result, scheme, expected_change);
2188      if (message) return message;
2189    }
2190  return NULL;
2191}
2192
2193const char *
2194test_append_uri_pathel()
2195{
2196  int i;
2197  struct {
2198    char *original_url;
2199    char *input;
2200    bool escaped;
2201    char *expected_result;
2202  } test_array[] = {
2203    { "http://www.yoyodyne.com/path/", "somepage.html", false, "http://www.yoyodyne.com/path/somepage.html" },
2204  };
2205
2206  for (i = 0; i < sizeof(test_array)/sizeof(test_array[0]); ++i)
2207    {
2208      struct growable dest;
2209      const char *p = test_array[i].input;
2210
2211      memset (&dest, 0, sizeof (dest));
2212
2213      append_string (test_array[i].original_url, &dest);
2214      append_uri_pathel (p, p + strlen(p), test_array[i].escaped, &dest);
2215      append_char ('\0', &dest);
2216
2217      mu_assert ("test_append_uri_pathel: wrong result",
2218                 strcmp (dest.base, test_array[i].expected_result) == 0);
2219    }
2220
2221  return NULL;
2222}
2223
/* Table-driven test of are_urls_equal.

   Each entry holds two URL strings and the expected verdict.  The
   table covers identical URLs, different paths, different hosts, an
   escaped non-reserved character ("%7e" against "~", expected equal),
   a URL that merely ends like the other one, and an escaped slash
   ("%2f" against "/", expected different).

   Returns NULL when every entry yields the expected result; otherwise
   the failure is reported through mu_assert.  */
const char *
test_are_urls_equal (void)
{
  size_t i;
  static const struct {
    const char *url1;
    const char *url2;
    bool expected_result;
  } test_array[] = {
    { "http://www.adomain.com/apath/", "http://www.adomain.com/apath/",       true },
    { "http://www.adomain.com/apath/", "http://www.adomain.com/anotherpath/", false },
    { "http://www.adomain.com/apath/", "http://www.anotherdomain.com/path/",  false },
    { "http://www.adomain.com/~path/", "http://www.adomain.com/%7epath/",     true },
    { "http://www.adomain.com/longer-path/", "http://www.adomain.com/path/",  false },
    { "http://www.adomain.com/path%2f", "http://www.adomain.com/path/",       false },
  };

  for (i = 0; i < countof (test_array); ++i)
    {
      mu_assert ("test_are_urls_equal: wrong result",
                 are_urls_equal (test_array[i].url1, test_array[i].url2) == test_array[i].expected_result);
    }

  return NULL;
}
2249
2250#endif /* TESTING */
2251
2252/*
2253 * vim: et ts=2 sw=2
2254 */
2255
2256