1/* Support for cookies.
2   Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
3   Free Software Foundation, Inc.
4
5This file is part of GNU Wget.
6
7GNU Wget is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 3 of the License, or (at
10your option) any later version.
11
12GNU Wget is distributed in the hope that it will be useful, but
13WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15General Public License for more details.
16
17You should have received a copy of the GNU General Public License
18along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19
20Additional permission under GNU GPL version 3 section 7
21
22If you modify this program, or any covered work, by linking or
23combining it with the OpenSSL project's OpenSSL library (or a
24modified version of that library), containing parts covered by the
25terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26grants you additional permission to convey the resulting work.
27Corresponding Source for a non-source form of such a combination
28shall include the source code for the parts of OpenSSL used as well
29as that of the covered work.  */
30
31/* Written by Hrvoje Niksic.  Parts are loosely inspired by the
32   cookie patch submitted by Tomasz Wegrzanowski.
33
34   This implements the client-side cookie support, as specified
35   (loosely) by Netscape's "preliminary specification", currently
36   available at:
37
38       http://wp.netscape.com/newsref/std/cookie_spec.html
39
40   rfc2109 is not supported because of its incompatibilities with the
41   above widely-used specification.  rfc2965 is entirely ignored,
42   since popular client software doesn't implement it, and even the
43   sites that do send Set-Cookie2 also emit Set-Cookie for
44   compatibility.  */
45
46#include "wget.h"
47
48#include <stdio.h>
49#include <string.h>
50#include <stdlib.h>
51#include <assert.h>
52#include <errno.h>
53#include <time.h>
54#include "utils.h"
55#include "hash.h"
56#include "cookies.h"
57#include "http.h"               /* for http_atotm */
58
59/* Declarations of `struct cookie' and the most basic functions. */
60
/* Cookie jar serves as cookie storage and a means of retrieving
   cookies efficiently.  All cookies with the same domain are stored
   in a linked list called "chain".  A cookie chain can be reached by
   looking up the domain in the cookie jar's `chains' table.

   For example, to reach all the cookies under google.com, one must
   execute hash_table_get(jar->chains, "google.com").  Of
   course, when sending a cookie to `www.google.com', one must search
   for cookies that belong to either `www.google.com' or `google.com'
   -- but the point is that the code doesn't need to go through *all*
   the cookies.  */
72
/* In-memory cookie store.  Cookies are grouped into per-domain linked
   lists ("chains"); CHAINS maps a domain name to the head of its
   chain.  */
struct cookie_jar {
  /* Hash table: domain name -> head of that domain's cookie chain.  */
  struct hash_table *chains;

  /* Running total of cookies held in the jar; maintained by the
     functions that add and remove cookies.  */
  int cookie_count;
};
79
/* Timestamp sampled by the entry point functions.  The low-level
   routines compare against this cached value instead of calling
   time() themselves.  */
static time_t cookies_now;
83
84struct cookie_jar *
85cookie_jar_new (void)
86{
87  struct cookie_jar *jar = xnew (struct cookie_jar);
88  jar->chains = make_nocase_string_hash_table (0);
89  jar->cookie_count = 0;
90  return jar;
91}
92
/* A single cookie, as kept in a cookie jar chain.  */
struct cookie {
  char *domain;                 /* domain the cookie belongs to */
  int port;                     /* port number, or PORT_ANY */
  char *path;                   /* path prefix the cookie applies to */

  unsigned discard_requested :1; /* set when this cookie only exists
                                    to request that a matching stored
                                    cookie be discarded. */

  unsigned secure :1;           /* set when the cookie must not be
                                   sent over insecure (non-https)
                                   connections. */
  unsigned domain_exact :1;     /* set when DOMAIN has to equal the
                                   host as a whole. */

  unsigned permanent :1;        /* set when the cookie should outlive
                                   the session. */
  time_t expiry_time;           /* expiration time; 0 means
                                   undetermined. */

  char *attr;                   /* name of the cookie attribute */
  char *value;                  /* value of the cookie attribute */

  struct cookie *next;          /* next cookie in the same domain's
                                   chain. */
};

/* Value of `port' in struct cookie meaning "not bound to a port".  */
#define PORT_ANY (-1)
121
122/* Allocate and return a new, empty cookie structure. */
123
124static struct cookie *
125cookie_new (void)
126{
127  struct cookie *cookie = xnew0 (struct cookie);
128
129  /* Both cookie->permanent and cookie->expiry_time are now 0.  This
130     means that the cookie doesn't expire, but is only valid for this
131     session (i.e. not written out to disk).  */
132
133  cookie->port = PORT_ANY;
134  return cookie;
135}
136
137/* Non-zero if the cookie has expired.  Assumes cookies_now has been
138   set by one of the entry point functions.  */
139
140static bool
141cookie_expired_p (const struct cookie *c)
142{
143  return c->expiry_time != 0 && c->expiry_time < cookies_now;
144}
145
146/* Deallocate COOKIE and its components. */
147
148static void
149delete_cookie (struct cookie *cookie)
150{
151  xfree_null (cookie->domain);
152  xfree_null (cookie->path);
153  xfree_null (cookie->attr);
154  xfree_null (cookie->value);
155  xfree (cookie);
156}
157
158/* Functions for storing cookies.
159
160   All cookies can be reached beginning with jar->chains.  The key in
161   that table is the domain name, and the value is a linked list of
162   all cookies from that domain.  Every new cookie is placed on the
163   head of the list.  */
164
165/* Find and return a cookie in JAR whose domain, path, and attribute
166   name correspond to COOKIE.  If found, PREVPTR will point to the
167   location of the cookie previous in chain, or NULL if the found
168   cookie is the head of a chain.
169
170   If no matching cookie is found, return NULL. */
171
172static struct cookie *
173find_matching_cookie (struct cookie_jar *jar, struct cookie *cookie,
174                      struct cookie **prevptr)
175{
176  struct cookie *chain, *prev;
177
178  chain = hash_table_get (jar->chains, cookie->domain);
179  if (!chain)
180    goto nomatch;
181
182  prev = NULL;
183  for (; chain; prev = chain, chain = chain->next)
184    if (0 == strcmp (cookie->path, chain->path)
185        && 0 == strcmp (cookie->attr, chain->attr)
186        && cookie->port == chain->port)
187      {
188        *prevptr = prev;
189        return chain;
190      }
191
192 nomatch:
193  *prevptr = NULL;
194  return NULL;
195}
196
197/* Store COOKIE to the jar.
198
199   This is done by placing COOKIE at the head of its chain.  However,
200   if COOKIE matches a cookie already in memory, as determined by
201   find_matching_cookie, the old cookie is unlinked and destroyed.
202
203   The key of each chain's hash table entry is allocated only the
204   first time; next hash_table_put's reuse the same key.  */
205
206static void
207store_cookie (struct cookie_jar *jar, struct cookie *cookie)
208{
209  struct cookie *chain_head;
210  char *chain_key;
211
212  if (hash_table_get_pair (jar->chains, cookie->domain,
213                           &chain_key, &chain_head))
214    {
215      /* A chain of cookies in this domain already exists.  Check for
216         duplicates -- if an extant cookie exactly matches our domain,
217         port, path, and name, replace it.  */
218      struct cookie *prev;
219      struct cookie *victim = find_matching_cookie (jar, cookie, &prev);
220
221      if (victim)
222        {
223          /* Remove VICTIM from the chain.  COOKIE will be placed at
224             the head. */
225          if (prev)
226            {
227              prev->next = victim->next;
228              cookie->next = chain_head;
229            }
230          else
231            {
232              /* prev is NULL; apparently VICTIM was at the head of
233                 the chain.  This place will be taken by COOKIE, so
234                 all we need to do is:  */
235              cookie->next = victim->next;
236            }
237          delete_cookie (victim);
238          --jar->cookie_count;
239          DEBUGP (("Deleted old cookie (to be replaced.)\n"));
240        }
241      else
242        cookie->next = chain_head;
243    }
244  else
245    {
246      /* We are now creating the chain.  Use a copy of cookie->domain
247         as the key for the life-time of the chain.  Using
248         cookie->domain would be unsafe because the life-time of the
249         chain may exceed the life-time of the cookie.  (Cookies may
250         be deleted from the chain by this very function.)  */
251      cookie->next = NULL;
252      chain_key = xstrdup (cookie->domain);
253    }
254
255  hash_table_put (jar->chains, chain_key, cookie);
256  ++jar->cookie_count;
257
258  IF_DEBUG
259    {
260      time_t exptime = cookie->expiry_time;
261      DEBUGP (("\nStored cookie %s %d%s %s <%s> <%s> [expiry %s] %s %s\n",
262               cookie->domain, cookie->port,
263               cookie->port == PORT_ANY ? " (ANY)" : "",
264               cookie->path,
265               cookie->permanent ? "permanent" : "session",
266               cookie->secure ? "secure" : "insecure",
267               cookie->expiry_time ? datetime_str (exptime) : "none",
268               cookie->attr, cookie->value));
269    }
270}
271
272/* Discard a cookie matching COOKIE's domain, port, path, and
273   attribute name.  This gets called when we encounter a cookie whose
274   expiry date is in the past, or whose max-age is set to 0.  The
275   former corresponds to netscape cookie spec, while the latter is
276   specified by rfc2109.  */
277
278static void
279discard_matching_cookie (struct cookie_jar *jar, struct cookie *cookie)
280{
281  struct cookie *prev, *victim;
282
283  if (!hash_table_count (jar->chains))
284    /* No elements == nothing to discard. */
285    return;
286
287  victim = find_matching_cookie (jar, cookie, &prev);
288  if (victim)
289    {
290      if (prev)
291        /* Simply unchain the victim. */
292        prev->next = victim->next;
293      else
294        {
295          /* VICTIM was head of its chain.  We need to place a new
296             cookie at the head.  */
297          char *chain_key = NULL;
298          int res;
299
300          res = hash_table_get_pair (jar->chains, victim->domain,
301                                     &chain_key, NULL);
302          assert (res != 0);
303          if (!victim->next)
304            {
305              /* VICTIM was the only cookie in the chain.  Destroy the
306                 chain and deallocate the chain key.  */
307              hash_table_remove (jar->chains, victim->domain);
308              xfree (chain_key);
309            }
310          else
311            hash_table_put (jar->chains, chain_key, victim->next);
312        }
313      delete_cookie (victim);
314      DEBUGP (("Discarded old cookie.\n"));
315    }
316}
317
318/* Functions for parsing the `Set-Cookie' header, and creating new
319   cookies from the wire.  */
320
321#define TOKEN_IS(token, string_literal)                         \
322  BOUNDED_EQUAL_NO_CASE (token.b, token.e, string_literal)
323
324#define TOKEN_NON_EMPTY(token) (token.b != NULL && token.b != token.e)
325
326/* Parse the contents of the `Set-Cookie' header.  The header looks
327   like this:
328
329   name1=value1; name2=value2; ...
330
331   Trailing semicolon is optional; spaces are allowed between all
332   tokens.  Additionally, values may be quoted.
333
334   A new cookie is returned upon success, NULL otherwise.
335
336   The first name-value pair will be used to set the cookie's
337   attribute name and value.  Subsequent parameters will be checked
338   against field names such as `domain', `path', etc.  Recognized
339   fields will be parsed and the corresponding members of COOKIE
340   filled.  */
341
342static struct cookie *
343parse_set_cookie (const char *set_cookie, bool silent)
344{
345  const char *ptr = set_cookie;
346  struct cookie *cookie = cookie_new ();
347  param_token name, value;
348
349  if (!extract_param (&ptr, &name, &value, ';'))
350    goto error;
351  if (!value.b)
352    goto error;
353  cookie->attr = strdupdelim (name.b, name.e);
354  cookie->value = strdupdelim (value.b, value.e);
355
356  while (extract_param (&ptr, &name, &value, ';'))
357    {
358      if (TOKEN_IS (name, "domain"))
359        {
360          if (!TOKEN_NON_EMPTY (value))
361            goto error;
362          xfree_null (cookie->domain);
363          /* Strictly speaking, we should set cookie->domain_exact if the
364             domain doesn't begin with a dot.  But many sites set the
365             domain to "foo.com" and expect "subhost.foo.com" to get the
366             cookie, and it apparently works in browsers.  */
367          if (*value.b == '.')
368            ++value.b;
369          cookie->domain = strdupdelim (value.b, value.e);
370        }
371      else if (TOKEN_IS (name, "path"))
372        {
373          if (!TOKEN_NON_EMPTY (value))
374            goto error;
375          xfree_null (cookie->path);
376          cookie->path = strdupdelim (value.b, value.e);
377        }
378      else if (TOKEN_IS (name, "expires"))
379        {
380          char *value_copy;
381          time_t expires;
382
383          if (!TOKEN_NON_EMPTY (value))
384            goto error;
385          BOUNDED_TO_ALLOCA (value.b, value.e, value_copy);
386
387          expires = http_atotm (value_copy);
388          if (expires != (time_t) -1)
389            {
390              cookie->permanent = 1;
391              cookie->expiry_time = expires;
392              /* According to netscape's specification, expiry time in
393                 the past means that discarding of a matching cookie
394                 is requested.  */
395              if (cookie->expiry_time < cookies_now)
396                cookie->discard_requested = 1;
397            }
398          else
399            /* Error in expiration spec.  Assume default (cookie doesn't
400               expire, but valid only for this session.)  */
401            ;
402        }
403      else if (TOKEN_IS (name, "max-age"))
404        {
405          double maxage = -1;
406          char *value_copy;
407
408          if (!TOKEN_NON_EMPTY (value))
409            goto error;
410          BOUNDED_TO_ALLOCA (value.b, value.e, value_copy);
411
412          sscanf (value_copy, "%lf", &maxage);
413          if (maxage == -1)
414            /* something went wrong. */
415            goto error;
416          cookie->permanent = 1;
417          cookie->expiry_time = cookies_now + maxage;
418
419          /* According to rfc2109, a cookie with max-age of 0 means that
420             discarding of a matching cookie is requested.  */
421          if (maxage == 0)
422            cookie->discard_requested = 1;
423        }
424      else if (TOKEN_IS (name, "secure"))
425        {
426          /* ignore value completely */
427          cookie->secure = 1;
428        }
429      else
430        /* Ignore unrecognized attribute. */
431        ;
432    }
433  if (*ptr)
434    /* extract_param has encountered a syntax error */
435    goto error;
436
437  /* The cookie has been successfully constructed; return it. */
438  return cookie;
439
440 error:
441  if (!silent)
442    logprintf (LOG_NOTQUIET,
443               _("Syntax error in Set-Cookie: %s at position %d.\n"),
444               quotearg_style (escape_quoting_style, set_cookie),
445               (int) (ptr - set_cookie));
446  delete_cookie (cookie);
447  return NULL;
448}
449
450#undef TOKEN_IS
451#undef TOKEN_NON_EMPTY
452
/* Sanity checks.  These are important, otherwise it is possible for
   malicious attackers to destroy important cookie information and/or
   violate your privacy.  */
456
457
/* Advance P over a run of one or more digits.  Makes the *enclosing
   function* return false if P does not point at a digit.  */
#define REQUIRE_DIGITS(p) do {                  \
  if (!c_isdigit (*(p)))                        \
    return false;                               \
  do                                            \
    ++(p);                                      \
  while (c_isdigit (*(p)));                     \
} while (0)

/* Advance P over a single '.'.  Makes the *enclosing function* return
   false if P does not point at one.  */
#define REQUIRE_DOT(p) do {                     \
  if (*(p)++ != '.')                            \
    return false;                               \
} while (0)

/* Return true if ADDR has the form <digits>.<digits>.<digits>.<digits>
   and nothing else.  Only the shape is checked; the numbers are not
   range-checked.

   We don't want to call network functions like inet_addr() because
   all we need is a check, preferably one that is small, fast, and
   well-defined.  */

static bool
numeric_address_p (const char *addr)
{
  const char *p = addr;
  int group;

  /* Four digit groups, with a dot in front of all but the first.
     The REQUIRE_* macros return false from here on a mismatch.  */
  for (group = 0; group < 4; group++)
    {
      if (group > 0)
        REQUIRE_DOT (p);
      REQUIRE_DIGITS (p);
    }

  /* Trailing garbage disqualifies the address.  */
  return *p == '\0';
}
493
/* Decide whether a server named HOST may set a cookie for
   COOKIE_DOMAIN.  Returns true if so.

   Originally I tried to make the check compliant with rfc2109, but
   the sites deviated too often, so I had to fall back to "tail
   matching", as defined by the original Netscape's cookie spec.

   The numbered DEBUGP markers trace how far the check got.  */

static bool
check_domain_match (const char *cookie_domain, const char *host)
{
  const char *p = cookie_domain;
  int dccount = 1;              /* number of domain components */
  int ldcl = 0;                 /* length of the component being scanned */
  int nldcl = 0;                /* length of the component before it */

  DEBUGP (("cdm: 1"));

  /* Numeric address requires exact match.  It also requires HOST to
     be an IP address.  */
  if (numeric_address_p (cookie_domain))
    return 0 == strcmp (cookie_domain, host);

  DEBUGP ((" 2"));

  /* For the sake of efficiency, check for exact match first. */
  if (0 == strcasecmp (cookie_domain, host))
    return true;

  DEBUGP ((" 3"));

  /* COOKIE_DOMAIN has to be a tail of HOST (presumably match_tail
     takes the string first and the tail second -- confirm against
     its definition).  */
  if (!match_tail (host, cookie_domain, true))
    return false;

  /* We know that COOKIE_DOMAIN is a subset of HOST; however, we must
     make sure that somebody is not trying to set the cookie for a
     subdomain shared by many entities.  For example, "company.co.uk"
     must not be allowed to set a cookie for ".co.uk".  On the other
     hand, "sso.redhat.de" should be able to set a cookie for
     ".redhat.de".

     The only marginally sane way to handle this I can think of is to
     reject on the basis of the length of the second-level domain name
     (but when the top-level domain is unknown), with the assumption
     that those of three or less characters could be reserved.  For
     example:

          .co.org -> works because the TLD is known
           .co.uk -> doesn't work because "co" is only two chars long
          .com.au -> doesn't work because "com" is only 3 chars long
          .cnn.uk -> doesn't work because "cnn" is also only 3 chars long (ugh)
          .cnn.de -> doesn't work for the same reason (ugh!!)
         .abcd.de -> works because "abcd" is 4 chars long
      .img.cnn.de -> works because it's not trying to set the 2nd level domain
       .cnn.co.uk -> works for the same reason

    That should prevent misuse, while allowing reasonable usage.  If
    someone knows of a better way to handle this, please let me
    know.  */

  if (*p == '.')
    /* Ignore leading period in this calculation. */
    ++p;

  DEBUGP ((" 4"));

  /* Walk the domain, counting components and remembering the lengths
     of the last two.  */
  for (; *p != '\0'; p++)
    {
      if (*p != '.')
        {
          ++ldcl;
          continue;
        }

      if (ldcl == 0)
        /* Empty domain component found -- the domain is invalid. */
        return false;

      if (p[1] == '\0')
        /* Tolerate trailing '.' by not treating the domain as
           one ending with an empty domain component.  */
        break;

      nldcl = ldcl;
      ldcl = 0;
      ++dccount;
    }

  DEBUGP ((" 5"));

  if (dccount < 2)
    return false;

  DEBUGP ((" 6"));

  if (dccount == 2)
    {
      static const char *known_toplevel_domains[] = {
        ".com", ".edu", ".net", ".org", ".gov", ".mil", ".int"
      };
      bool known_toplevel = false;
      size_t i = 0;

      while (i < countof (known_toplevel_domains) && !known_toplevel)
        {
          if (match_tail (cookie_domain, known_toplevel_domains[i], true))
            known_toplevel = true;
          ++i;
        }

      /* Unknown TLD with a short second-level name: could be a
         shared registry suffix, so refuse.  */
      if (!known_toplevel && nldcl <= 3)
        return false;
    }

  DEBUGP ((" 7"));

  /* Don't allow the host "foobar.com" to set a cookie for domain
     "bar.com".  */
  if (*cookie_domain != '.')
    {
      int domain_len = strlen (cookie_domain);
      int host_len = strlen (host);
      /* cookie host:    hostname.foobar.com */
      /* desired domain:             bar.com */
      /* '.' must be here in host-> ^        */
      if (host_len > domain_len && host[host_len - domain_len - 1] != '.')
        return false;
    }

  DEBUGP ((" 8"));

  return true;
}
625
static int path_matches (const char *full_path, const char *prefix);

/* Return true if PATH begins with COOKIE_PATH, as judged by
   path_matches.  */

static bool
check_path_match (const char *cookie_path, const char *path)
{
  const int goodness = path_matches (path, cookie_path);

  return goodness != 0;
}
635
/* Replace S with a copy of itself that has '/' put in front.  The
   copy lives in alloca'd (stack) storage, so it disappears when the
   calling function returns.  S must be an assignable lvalue and is
   evaluated more than once.  */

#define PREPEND_SLASH(s) do {                                   \
  const size_t PS_len = strlen (s);                             \
  char *PS_newstr = (char *) alloca (PS_len + 2);               \
  PS_newstr[0] = '/';                                           \
  memcpy (PS_newstr + 1, s, PS_len + 1);                        \
  s = PS_newstr;                                                \
} while (0)
645
646
647/* Process the HTTP `Set-Cookie' header.  This results in storing the
648   cookie or discarding a matching one, or ignoring it completely, all
649   depending on the contents.  */
650
651void
652cookie_handle_set_cookie (struct cookie_jar *jar,
653                          const char *host, int port,
654                          const char *path, const char *set_cookie)
655{
656  struct cookie *cookie;
657  cookies_now = time (NULL);
658
659  /* Wget's paths don't begin with '/' (blame rfc1808), but cookie
660     usage assumes /-prefixed paths.  Until the rest of Wget is fixed,
661     simply prepend slash to PATH.  */
662  PREPEND_SLASH (path);
663
664  cookie = parse_set_cookie (set_cookie, false);
665  if (!cookie)
666    goto out;
667
668  /* Sanitize parts of cookie. */
669
670  if (!cookie->domain)
671    {
672    copy_domain:
673      /* If the domain was not provided, we use the one we're talking
674         to, and set exact match.  */
675      cookie->domain = xstrdup (host);
676      cookie->domain_exact = 1;
677      /* Set the port, but only if it's non-default. */
678      if (port != 80 && port != 443)
679        cookie->port = port;
680    }
681  else
682    {
683      if (!check_domain_match (cookie->domain, host))
684        {
685          logprintf (LOG_NOTQUIET,
686                     _("Cookie coming from %s attempted to set domain to %s\n"),
687                     quotearg_style (escape_quoting_style, host),
688                     quotearg_style (escape_quoting_style, cookie->domain));
689          xfree (cookie->domain);
690          goto copy_domain;
691        }
692    }
693
694  if (!cookie->path)
695    {
696      /* The cookie doesn't set path: set it to the URL path, sans the
697         file part ("/dir/file" truncated to "/dir/").  */
698      char *trailing_slash = strrchr (path, '/');
699      if (trailing_slash)
700        cookie->path = strdupdelim (path, trailing_slash + 1);
701      else
702        /* no slash in the string -- can this even happen? */
703        cookie->path = xstrdup (path);
704    }
705  else
706    {
707      /* The cookie sets its own path; verify that it is legal. */
708      if (!check_path_match (cookie->path, path))
709        {
710          DEBUGP (("Attempt to fake the path: %s, %s\n",
711                   cookie->path, path));
712          goto out;
713        }
714    }
715
716  /* Now store the cookie, or discard an existing cookie, if
717     discarding was requested.  */
718
719  if (cookie->discard_requested)
720    {
721      discard_matching_cookie (jar, cookie);
722      goto out;
723    }
724
725  store_cookie (jar, cookie);
726  return;
727
728 out:
729  if (cookie)
730    delete_cookie (cookie);
731}
732
733/* Support for sending out cookies in HTTP requests, based on
734   previously stored cookies.  Entry point is
735   `build_cookies_request'.  */
736
/* Count the occurrences of CHR in the NUL-terminated STRING and
   return that number.  The terminating NUL is never counted.  */

static int
count_char (const char *string, char chr)
{
  int hits = 0;

  while (*string != '\0')
    {
      if (*string == chr)
        ++hits;
      ++string;
    }

  return hits;
}
749
750/* Find the cookie chains whose domains match HOST and store them to
751   DEST.
752
753   A cookie chain is the head of a list of cookies that belong to a
754   host/domain.  Given HOST "img.search.xemacs.org", this function
755   will return the chains for "img.search.xemacs.org",
756   "search.xemacs.org", and "xemacs.org" -- those of them that exist
757   (if any), that is.
758
759   DEST should be large enough to accept (in the worst case) as many
760   elements as there are domain components of HOST.  */
761
762static int
763find_chains_of_host (struct cookie_jar *jar, const char *host,
764                     struct cookie *dest[])
765{
766  int dest_count = 0;
767  int passes, passcnt;
768
769  /* Bail out quickly if there are no cookies in the jar.  */
770  if (!hash_table_count (jar->chains))
771    return 0;
772
773  if (numeric_address_p (host))
774    /* If host is an IP address, only check for the exact match. */
775    passes = 1;
776  else
777    /* Otherwise, check all the subdomains except the top-level (last)
778       one.  As a domain with N components has N-1 dots, the number of
779       passes equals the number of dots.  */
780    passes = count_char (host, '.');
781
782  passcnt = 0;
783
784  /* Find chains that match HOST, starting with exact match and
785     progressing to less specific domains.  For instance, given HOST
786     fly.srk.fer.hr, first look for fly.srk.fer.hr's chain, then
787     srk.fer.hr's, then fer.hr's.  */
788  while (1)
789    {
790      struct cookie *chain = hash_table_get (jar->chains, host);
791      if (chain)
792        dest[dest_count++] = chain;
793      if (++passcnt >= passes)
794        break;
795      host = strchr (host, '.') + 1;
796    }
797
798  return dest_count;
799}
800
/* Test whether FULL_PATH begins with PREFIX.

   Returns 0 if it does not.  If it does, returns the length of
   PREFIX plus one, so that even an empty PREFIX produces a non-zero
   result; a larger value means a longer, better-quality match.  */

static int
path_matches (const char *full_path, const char *prefix)
{
  int prefix_len = strlen (prefix);

  return strncmp (full_path, prefix, prefix_len) == 0 ? prefix_len + 1 : 0;
}
816
817/* Return true iff COOKIE matches the provided parameters of the URL
818   being downloaded: HOST, PORT, PATH, and SECFLAG.
819
820   If PATH_GOODNESS is non-NULL, store the "path goodness" value
821   there.  That value is a measure of how closely COOKIE matches PATH,
822   used for ordering cookies.  */
823
824static bool
825cookie_matches_url (const struct cookie *cookie,
826                    const char *host, int port, const char *path,
827                    bool secflag, int *path_goodness)
828{
829  int pg;
830
831  if (cookie_expired_p (cookie))
832    /* Ignore stale cookies.  Don't bother unchaining the cookie at
833       this point -- Wget is a relatively short-lived application, and
834       stale cookies will not be saved by `save_cookies'.  On the
835       other hand, this function should be as efficient as
836       possible.  */
837    return false;
838
839  if (cookie->secure && !secflag)
840    /* Don't transmit secure cookies over insecure connections.  */
841    return false;
842  if (cookie->port != PORT_ANY && cookie->port != port)
843    return false;
844
845  /* If exact domain match is required, verify that cookie's domain is
846     equal to HOST.  If not, assume success on the grounds of the
847     cookie's chain having been found by find_chains_of_host.  */
848  if (cookie->domain_exact
849      && 0 != strcasecmp (host, cookie->domain))
850    return false;
851
852  pg = path_matches (path, cookie->path);
853  if (pg == 0)
854    return false;
855
856  if (path_goodness)
857    /* If the caller requested path_goodness, we return it.  This is
858       an optimization, so that the caller doesn't need to call
859       path_matches() again.  */
860    *path_goodness = pg;
861  return true;
862}
863
/* Pairs a cookie with its "goodness" scores, so that the cookies
   returned to the server can be sorted, as required by the spec.  */

struct weighed_cookie {
  struct cookie *cookie;        /* the cookie being ranked */
  int domain_goodness;          /* quality of the domain match */
  int path_goodness;            /* quality of the path match */
};
874
875/* Comparator used for uniquifying the list. */
876
877static int
878equality_comparator (const void *p1, const void *p2)
879{
880  struct weighed_cookie *wc1 = (struct weighed_cookie *)p1;
881  struct weighed_cookie *wc2 = (struct weighed_cookie *)p2;
882
883  int namecmp  = strcmp (wc1->cookie->attr, wc2->cookie->attr);
884  int valuecmp = strcmp (wc1->cookie->value, wc2->cookie->value);
885
886  /* We only really care whether both name and value are equal.  We
887     return them in this order only for consistency...  */
888  return namecmp ? namecmp : valuecmp;
889}
890
891/* Eliminate duplicate cookies.  "Duplicate cookies" are any two
892   cookies with the same attr name and value.  Whenever a duplicate
893   pair is found, one of the cookies is removed.  */
894
895static int
896eliminate_dups (struct weighed_cookie *outgoing, int count)
897{
898  struct weighed_cookie *h;     /* hare */
899  struct weighed_cookie *t;     /* tortoise */
900  struct weighed_cookie *end = outgoing + count;
901
902  /* We deploy a simple uniquify algorithm: first sort the array
903     according to our sort criteria, then copy it to itself, comparing
904     each cookie to its neighbor and ignoring the duplicates.  */
905
906  qsort (outgoing, count, sizeof (struct weighed_cookie), equality_comparator);
907
908  /* "Hare" runs through all the entries in the array, followed by
909     "tortoise".  If a duplicate is found, the hare skips it.
910     Non-duplicate entries are copied to the tortoise ptr.  */
911
912  for (h = t = outgoing; h < end; h++)
913    {
914      if (h != end - 1)
915        {
916          struct cookie *c0 = h[0].cookie;
917          struct cookie *c1 = h[1].cookie;
918          if (!strcmp (c0->attr, c1->attr) && !strcmp (c0->value, c1->value))
919            continue;           /* ignore the duplicate */
920        }
921
922      /* If the hare has advanced past the tortoise (because of
923         previous dups), make sure the values get copied.  Otherwise,
924         no copying is necessary.  */
925      if (h != t)
926        *t++ = *h;
927      else
928        t++;
929    }
930  return t - outgoing;
931}
932
933/* Comparator used for sorting by quality. */
934
935static int
936goodness_comparator (const void *p1, const void *p2)
937{
938  struct weighed_cookie *wc1 = (struct weighed_cookie *)p1;
939  struct weighed_cookie *wc2 = (struct weighed_cookie *)p2;
940
941  /* Subtractions take `wc2' as the first argument becauase we want a
942     sort in *decreasing* order of goodness.  */
943  int dgdiff = wc2->domain_goodness - wc1->domain_goodness;
944  int pgdiff = wc2->path_goodness - wc1->path_goodness;
945
946  /* Sort by domain goodness; if these are the same, sort by path
947     goodness.  (The sorting order isn't really specified; maybe it
948     should be the other way around.)  */
949  return dgdiff ? dgdiff : pgdiff;
950}
951
952/* Generate a `Cookie' header for a request that goes to HOST:PORT and
953   requests PATH from the server.  The resulting string is allocated
954   with `malloc', and the caller is responsible for freeing it.  If no
955   cookies pertain to this request, i.e. no cookie header should be
956   generated, NULL is returned.  */
957
958char *
959cookie_header (struct cookie_jar *jar, const char *host,
960               int port, const char *path, bool secflag)
961{
962  struct cookie **chains;
963  int chain_count;
964
965  struct cookie *cookie;
966  struct weighed_cookie *outgoing;
967  int count, i, ocnt;
968  char *result;
969  int result_size, pos;
970  PREPEND_SLASH (path);         /* see cookie_handle_set_cookie */
971
972  /* First, find the cookie chains whose domains match HOST. */
973
974  /* Allocate room for find_chains_of_host to write to.  The number of
975     chains can at most equal the number of subdomains, hence
976     1+<number of dots>.  */
977  chains = alloca_array (struct cookie *, 1 + count_char (host, '.'));
978  chain_count = find_chains_of_host (jar, host, chains);
979
980  /* No cookies for this host. */
981  if (!chain_count)
982    return NULL;
983
984  cookies_now = time (NULL);
985
986  /* Now extract from the chains those cookies that match our host
987     (for domain_exact cookies), port (for cookies with port other
988     than PORT_ANY), etc.  See matching_cookie for details.  */
989
990  /* Count the number of matching cookies. */
991  count = 0;
992  for (i = 0; i < chain_count; i++)
993    for (cookie = chains[i]; cookie; cookie = cookie->next)
994      if (cookie_matches_url (cookie, host, port, path, secflag, NULL))
995        ++count;
996  if (!count)
997    return NULL;                /* no cookies matched */
998
999  /* Allocate the array. */
1000  outgoing = alloca_array (struct weighed_cookie, count);
1001
1002  /* Fill the array with all the matching cookies from the chains that
1003     match HOST. */
1004  ocnt = 0;
1005  for (i = 0; i < chain_count; i++)
1006    for (cookie = chains[i]; cookie; cookie = cookie->next)
1007      {
1008        int pg;
1009        if (!cookie_matches_url (cookie, host, port, path, secflag, &pg))
1010          continue;
1011        outgoing[ocnt].cookie = cookie;
1012        outgoing[ocnt].domain_goodness = strlen (cookie->domain);
1013        outgoing[ocnt].path_goodness   = pg;
1014        ++ocnt;
1015      }
1016  assert (ocnt == count);
1017
1018  /* Eliminate duplicate cookies; that is, those whose name and value
1019     are the same.  */
1020  count = eliminate_dups (outgoing, count);
1021
1022  /* Sort the array so that best-matching domains come first, and
1023     that, within one domain, best-matching paths come first. */
1024  qsort (outgoing, count, sizeof (struct weighed_cookie), goodness_comparator);
1025
1026  /* Count the space the name=value pairs will take. */
1027  result_size = 0;
1028  for (i = 0; i < count; i++)
1029    {
1030      struct cookie *c = outgoing[i].cookie;
1031      /* name=value */
1032      result_size += strlen (c->attr) + 1 + strlen (c->value);
1033    }
1034
1035  /* Allocate output buffer:
1036     name=value pairs -- result_size
1037     "; " separators  -- (count - 1) * 2
1038     \0 terminator    -- 1 */
1039  result_size = result_size + (count - 1) * 2 + 1;
1040  result = xmalloc (result_size);
1041  pos = 0;
1042  for (i = 0; i < count; i++)
1043    {
1044      struct cookie *c = outgoing[i].cookie;
1045      int namlen = strlen (c->attr);
1046      int vallen = strlen (c->value);
1047
1048      memcpy (result + pos, c->attr, namlen);
1049      pos += namlen;
1050      result[pos++] = '=';
1051      memcpy (result + pos, c->value, vallen);
1052      pos += vallen;
1053      if (i < count - 1)
1054        {
1055          result[pos++] = ';';
1056          result[pos++] = ' ';
1057        }
1058    }
1059  result[pos++] = '\0';
1060  assert (pos == result_size);
1061  return result;
1062}
1063
1064/* Support for loading and saving cookies.  The format used for
1065   loading and saving should be the format of the `cookies.txt' file
1066   used by Netscape and Mozilla, at least the Unix versions.
1067   (Apparently IE can export cookies in that format as well.)  The
1068   format goes like this:
1069
1070       DOMAIN DOMAIN-FLAG PATH SECURE-FLAG TIMESTAMP ATTR-NAME ATTR-VALUE
1071
1072     DOMAIN      -- cookie domain, optionally followed by :PORT
1073     DOMAIN-FLAG -- whether all hosts in the domain match
1074     PATH        -- cookie path
1075     SECURE-FLAG -- whether cookie requires secure connection
1076     TIMESTAMP   -- expiry timestamp, number of seconds since epoch
1077     ATTR-NAME   -- name of the cookie attribute
1078     ATTR-VALUE  -- value of the cookie attribute (empty if absent)
1079
   The fields are separated by TABs.  All fields are mandatory, except
   for ATTR-VALUE.  The `-FLAG' fields are boolean, their legal values
   being "TRUE" and "FALSE".  Empty lines, lines consisting of
   whitespace only, and comment lines (beginning with # optionally
   preceded by whitespace) are ignored.
1085
1086   Example line from cookies.txt (split in two lines for readability):
1087
1088       .google.com      TRUE    /       FALSE   2147368447      \
1089       PREF     ID=34bb47565bbcd47b:LD=en:NR=20:TM=985172580:LM=985739012
1090
1091*/
1092
/* If the region [DOMAIN_B, DOMAIN_E) contains a `:' and everything
   after the first `:' up to DOMAIN_E is digits, parse those digits as
   a decimal port number, return it, and store the new boundary (the
   location of the `:') to DOMAIN_E_PTR.  A `:' with nothing after it
   yields 0 but still moves the boundary to the `:'.

   Return 0 and leave DOMAIN_E_PTR untouched if there is no `:', if
   anything other than digits follows it, or if the number exceeds
   65535.  The last check also keeps an overlong digit string in a
   cookie file from overflowing the accumulator.  */

static int
domain_port (const char *domain_b, const char *domain_e,
             const char **domain_e_ptr)
{
  enum { MAX_PORT = 65535 };    /* largest valid TCP port */
  int port = 0;
  const char *p;
  const char *colon = memchr (domain_b, ':', domain_e - domain_b);
  if (!colon)
    return 0;
  for (p = colon + 1; p < domain_e && c_isdigit (*p); p++)
    {
      port = 10 * port + (*p - '0');
      if (port > MAX_PORT)
        /* Not a port number; treat like any other garbage.  Bailing
           out here bounds PORT well below INT_MAX.  */
        return 0;
    }
  if (p < domain_e)
    /* Garbage following port number. */
    return 0;
  *domain_e_ptr = colon;
  return port;
}
1114
/* Extract the TAB-terminated field that starts at P.  B is set to
   the start of the field and E to the TAB that ends it; P is then
   advanced past that TAB.  If the field is empty, or the string ends
   before a TAB is found, control jumps to the label `next', which the
   enclosing function must provide (B and E are still assigned in that
   case).  P, B and E are evaluated more than once and must be plain
   pointer lvalues.  */
#define GET_WORD(p, b, e) do {                  \
  for (b = p; *p != '\0' && *p != '\t'; ++p)    \
    ;                                           \
  e = p;                                        \
  if (e == b || *p == '\0')                     \
    goto next;                                  \
  ++p;                                          \
} while (0)
1124
1125/* Load cookies from FILE.  */
1126
1127void
1128cookie_jar_load (struct cookie_jar *jar, const char *file)
1129{
1130  char *line;
1131  FILE *fp = fopen (file, "r");
1132  if (!fp)
1133    {
1134      logprintf (LOG_NOTQUIET, _("Cannot open cookies file %s: %s\n"),
1135                 quote (file), strerror (errno));
1136      return;
1137    }
1138  cookies_now = time (NULL);
1139
1140  for (; ((line = read_whole_line (fp)) != NULL); xfree (line))
1141    {
1142      struct cookie *cookie;
1143      char *p = line;
1144
1145      double expiry;
1146      int port;
1147
1148      char *domain_b  = NULL, *domain_e  = NULL;
1149      char *domflag_b = NULL, *domflag_e = NULL;
1150      char *path_b    = NULL, *path_e    = NULL;
1151      char *secure_b  = NULL, *secure_e  = NULL;
1152      char *expires_b = NULL, *expires_e = NULL;
1153      char *name_b    = NULL, *name_e    = NULL;
1154      char *value_b   = NULL, *value_e   = NULL;
1155
1156      /* Skip leading white-space. */
1157      while (*p && c_isspace (*p))
1158        ++p;
1159      /* Ignore empty lines.  */
1160      if (!*p || *p == '#')
1161        continue;
1162
1163      GET_WORD (p, domain_b,  domain_e);
1164      GET_WORD (p, domflag_b, domflag_e);
1165      GET_WORD (p, path_b,    path_e);
1166      GET_WORD (p, secure_b,  secure_e);
1167      GET_WORD (p, expires_b, expires_e);
1168      GET_WORD (p, name_b,    name_e);
1169
1170      /* Don't use GET_WORD for value because it ends with newline,
1171         not TAB.  */
1172      value_b = p;
1173      value_e = p + strlen (p);
1174      if (value_e > value_b && value_e[-1] == '\n')
1175        --value_e;
1176      if (value_e > value_b && value_e[-1] == '\r')
1177        --value_e;
1178      /* Empty values are legal (I think), so don't bother checking. */
1179
1180      cookie = cookie_new ();
1181
1182      cookie->attr    = strdupdelim (name_b, name_e);
1183      cookie->value   = strdupdelim (value_b, value_e);
1184      cookie->path    = strdupdelim (path_b, path_e);
1185      cookie->secure  = BOUNDED_EQUAL (secure_b, secure_e, "TRUE");
1186
1187      /* Curl source says, quoting Andre Garcia: "flag: A TRUE/FALSE
1188         value indicating if all machines within a given domain can
1189         access the variable.  This value is set automatically by the
1190         browser, depending on the value set for the domain."  */
1191      cookie->domain_exact = !BOUNDED_EQUAL (domflag_b, domflag_e, "TRUE");
1192
1193      /* DOMAIN needs special treatment because we might need to
1194         extract the port.  */
1195      port = domain_port (domain_b, domain_e, (const char **)&domain_e);
1196      if (port)
1197        cookie->port = port;
1198
1199      if (*domain_b == '.')
1200        ++domain_b;             /* remove leading dot internally */
1201      cookie->domain  = strdupdelim (domain_b, domain_e);
1202
1203      /* safe default in case EXPIRES field is garbled. */
1204      expiry = (double)cookies_now - 1;
1205
1206      /* I don't like changing the line, but it's safe here.  (line is
1207         malloced.)  */
1208      *expires_e = '\0';
1209      sscanf (expires_b, "%lf", &expiry);
1210
1211      if (expiry == 0)
1212        {
1213          /* EXPIRY can be 0 for session cookies saved because the
1214             user specified `--keep-session-cookies' in the past.
1215             They remain session cookies, and will be saved only if
1216             the user has specified `keep-session-cookies' again.  */
1217        }
1218      else
1219        {
1220          if (expiry < cookies_now)
1221            goto abort_cookie;  /* ignore stale cookie. */
1222          cookie->expiry_time = expiry;
1223          cookie->permanent = 1;
1224        }
1225
1226      store_cookie (jar, cookie);
1227
1228    next:
1229      continue;
1230
1231    abort_cookie:
1232      delete_cookie (cookie);
1233    }
1234  fclose (fp);
1235}
1236
1237/* Save cookies, in format described above, to FILE. */
1238
1239void
1240cookie_jar_save (struct cookie_jar *jar, const char *file)
1241{
1242  FILE *fp;
1243  hash_table_iterator iter;
1244
1245  DEBUGP (("Saving cookies to %s.\n", file));
1246
1247  cookies_now = time (NULL);
1248
1249  fp = fopen (file, "w");
1250  if (!fp)
1251    {
1252      logprintf (LOG_NOTQUIET, _("Cannot open cookies file %s: %s\n"),
1253                 quote (file), strerror (errno));
1254      return;
1255    }
1256
1257  fputs ("# HTTP cookie file.\n", fp);
1258  fprintf (fp, "# Generated by Wget on %s.\n", datetime_str (cookies_now));
1259  fputs ("# Edit at your own risk.\n\n", fp);
1260
1261  for (hash_table_iterate (jar->chains, &iter);
1262       hash_table_iter_next (&iter);
1263       )
1264    {
1265      const char *domain = iter.key;
1266      struct cookie *cookie = iter.value;
1267      for (; cookie; cookie = cookie->next)
1268        {
1269          if (!cookie->permanent && !opt.keep_session_cookies)
1270            continue;
1271          if (cookie_expired_p (cookie))
1272            continue;
1273          if (!cookie->domain_exact)
1274            fputc ('.', fp);
1275          fputs (domain, fp);
1276          if (cookie->port != PORT_ANY)
1277            fprintf (fp, ":%d", cookie->port);
1278          fprintf (fp, "\t%s\t%s\t%s\t%.0f\t%s\t%s\n",
1279                   cookie->domain_exact ? "FALSE" : "TRUE",
1280                   cookie->path, cookie->secure ? "TRUE" : "FALSE",
1281                   (double)cookie->expiry_time,
1282                   cookie->attr, cookie->value);
1283          if (ferror (fp))
1284            goto out;
1285        }
1286    }
1287 out:
1288  if (ferror (fp))
1289    logprintf (LOG_NOTQUIET, _("Error writing to %s: %s\n"),
1290               quote (file), strerror (errno));
1291  if (fclose (fp) < 0)
1292    logprintf (LOG_NOTQUIET, _("Error closing %s: %s\n"),
1293               quote (file), strerror (errno));
1294
1295  DEBUGP (("Done saving cookies.\n"));
1296}
1297
1298/* Clean up cookie-related data. */
1299
1300void
1301cookie_jar_delete (struct cookie_jar *jar)
1302{
1303  /* Iterate over chains (indexed by domain) and free them. */
1304  hash_table_iterator iter;
1305  for (hash_table_iterate (jar->chains, &iter); hash_table_iter_next (&iter); )
1306    {
1307      struct cookie *chain = iter.value;
1308      xfree (iter.key);
1309      /* Then all cookies in this chain. */
1310      while (chain)
1311        {
1312          struct cookie *next = chain->next;
1313          delete_cookie (chain);
1314          chain = next;
1315        }
1316    }
1317  hash_table_destroy (jar->chains);
1318  xfree (jar);
1319}
1320
/* Test cases.  Currently this only tests parse_set_cookie and
   extract_param.  To use, recompile Wget with -DTEST_COOKIES and
   call test_cookies() from main.  Failures are reported with printf;
   nothing is returned.  */

#ifdef TEST_COOKIES
void
test_cookies (void)
{
  /* Tests expected to succeed.  RESULTS lists the expected
     name/value strings in order, terminated by NULL.  */
  static struct {
    const char *data;
    const char *results[10];
  } tests_succ[] = {
    { "arg=value", {"arg", "value", NULL} },
    { "arg1=value1;arg2=value2", {"arg1", "value1", "arg2", "value2", NULL} },
    { "arg1=value1; arg2=value2", {"arg1", "value1", "arg2", "value2", NULL} },
    { "arg1=value1;  arg2=value2;", {"arg1", "value1", "arg2", "value2", NULL} },
    { "arg1=value1;  arg2=value2;  ", {"arg1", "value1", "arg2", "value2", NULL} },
    { "arg1=\"value1\"; arg2=\"\"", {"arg1", "value1", "arg2", "", NULL} },
    { "arg=", {"arg", "", NULL} },
    { "arg1=; arg2=", {"arg1", "", "arg2", "", NULL} },
    { "arg1 = ; arg2= ", {"arg1", "", "arg2", "", NULL} },
  };

  /* Tests expected to fail: */
  static const char *tests_fail[] = {
    ";",
    "arg=\"unterminated",
    "=empty-name",
    "arg1=;=another-empty-name",
  };
  int i;

  for (i = 0; i < countof (tests_succ); i++)
    {
      const char *data = tests_succ[i].data;
      const char **expected = tests_succ[i].results;
      struct cookie *c;

      c = parse_set_cookie (data, true);
      if (!c)
        {
          printf ("NULL cookie returned for valid data: %s\n", data);
          continue;
        }
      /* Only the fact that parsing succeeded is of interest; release
         the cookie right away so the test does not leak it.  */
      delete_cookie (c);

      /* Test whether extract_param handles these cases correctly. */
      {
        param_token name, value;
        const char *ptr = data;
        int j = 0;
        while (extract_param (&ptr, &name, &value, ';'))
          {
            char *n, *v;

            /* Check for a surplus parameter before copying anything,
               so that bailing out leaves nothing to free.  */
            if (!expected[j])
              {
                printf ("Too many parameters for '%s'\n", data);
                break;
              }

            n = strdupdelim (name.b, name.e);
            v = strdupdelim (value.b, value.e);
            if (0 != strcmp (expected[j], n))
              printf ("Invalid name %d for '%s' (expected '%s', got '%s')\n",
                      j / 2 + 1, data, expected[j], n);
            if (0 != strcmp (expected[j + 1], v))
              printf ("Invalid value %d for '%s' (expected '%s', got '%s')\n",
                      j / 2 + 1, data, expected[j + 1], v);
            j += 2;
            free (n);
            free (v);
          }
        if (expected[j])
          printf ("Too few parameters for '%s'\n", data);
      }
    }

  for (i = 0; i < countof (tests_fail); i++)
    {
      struct cookie *c;
      const char *data = tests_fail[i];
      c = parse_set_cookie (data, true);
      if (c)
        {
          printf ("Failed to report error on invalid data: %s\n", data);
          delete_cookie (c);
        }
    }
}
#endif /* TEST_COOKIES */
1407