1/* File retrieval.
2   Copyright (C) 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
3   2005, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
4
5This file is part of GNU Wget.
6
7GNU Wget is free software; you can redistribute it and/or modify
8it under the terms of the GNU General Public License as published by
9the Free Software Foundation; either version 3 of the License, or (at
10your option) any later version.
11
12GNU Wget is distributed in the hope that it will be useful,
13but WITHOUT ANY WARRANTY; without even the implied warranty of
14MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15GNU General Public License for more details.
16
17You should have received a copy of the GNU General Public License
18along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19
20Additional permission under GNU GPL version 3 section 7
21
22If you modify this program, or any covered work, by linking or
23combining it with the OpenSSL project's OpenSSL library (or a
24modified version of that library), containing parts covered by the
25terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26grants you additional permission to convey the resulting work.
27Corresponding Source for a non-source form of such a combination
28shall include the source code for the parts of OpenSSL used as well
29as that of the covered work.  */
30
31#include "wget.h"
32
33#include <stdio.h>
34#include <stdlib.h>
35#ifdef HAVE_UNISTD_H
36# include <unistd.h>
37#endif /* HAVE_UNISTD_H */
38#include <errno.h>
39#include <string.h>
40#include <assert.h>
41
42#include "exits.h"
43#include "utils.h"
44#include "retr.h"
45#include "progress.h"
46#include "url.h"
47#include "recur.h"
48#include "ftp.h"
49#include "http.h"
50#include "host.h"
51#include "connect.h"
52#include "hash.h"
53#include "convert.h"
54#include "ptimer.h"
55#include "html-url.h"
56#include "iri.h"
57
/* Total size of downloaded files.  Used to enforce quota.  */
SUM_SIZE_INT total_downloaded_bytes;

/* Total download time in seconds. */
double total_download_time;

/* If non-NULL, the stream to which output should be written.  This
   stream is initialized when `-O' is used.  */
FILE *output_stream;

/* Whether output_document is a regular file we can manipulate,
   i.e. not `-' or a device file. */
bool output_stream_regular;

/* Throttling state used by limit_bandwidth to keep the download at
   or below opt.limit_rate bytes per second.  */
static struct {
  wgint chunk_bytes;            /* bytes received since chunk_start */
  double chunk_start;           /* timer reading when this chunk began */
  double sleep_adjust;          /* carry-over correction for scheduling
                                   slop in the previous sleep */
} limit_data;
77
78static void
79limit_bandwidth_reset (void)
80{
81  xzero (limit_data);
82}
83
/* Limit the bandwidth by pausing the download for an amount of time.
   BYTES is the number of bytes received from the network, and TIMER
   is the timer that started at the beginning of download.  */

static void
limit_bandwidth (wgint bytes, struct ptimer *timer)
{
  /* Time elapsed since the current accounting chunk began.  */
  double delta_t = ptimer_read (timer) - limit_data.chunk_start;
  double expected;

  limit_data.chunk_bytes += bytes;

  /* Calculate the amount of time we expect downloading the chunk
     should take.  If in reality it took less time, sleep to
     compensate for the difference.  */
  expected = (double) limit_data.chunk_bytes / opt.limit_rate;

  if (expected > delta_t)
    {
      double slp = expected - delta_t + limit_data.sleep_adjust;
      double t0, t1;
      if (slp < 0.2)
        {
          /* Sleeping for very short intervals is inaccurate and
             wasteful; defer until more sleep time accumulates.  */
          DEBUGP (("deferring a %.2f ms sleep (%s/%.2f).\n",
                   slp * 1000, number_to_static_string (limit_data.chunk_bytes),
                   delta_t));
          return;
        }
      DEBUGP (("\nsleeping %.2f ms for %s bytes, adjust %.2f ms\n",
               slp * 1000, number_to_static_string (limit_data.chunk_bytes),
               limit_data.sleep_adjust));

      t0 = ptimer_read (timer);
      xsleep (slp);
      t1 = ptimer_measure (timer);

      /* Due to scheduling, we probably slept slightly longer (or
         shorter) than desired.  Calculate the difference between the
         desired and the actual sleep, and adjust the next sleep by
         that amount.  */
      limit_data.sleep_adjust = slp - (t1 - t0);
      /* If sleep_adjust is very large, it's likely due to suspension
         and not clock inaccuracy.  Don't enforce those.  */
      if (limit_data.sleep_adjust > 0.5)
        limit_data.sleep_adjust = 0.5;
      else if (limit_data.sleep_adjust < -0.5)
        limit_data.sleep_adjust = -0.5;
    }

  /* Start a fresh accounting chunk.  (Not reached when the sleep was
     deferred above.)  */
  limit_data.chunk_bytes = 0;
  limit_data.chunk_start = ptimer_read (timer);
}
136
/* Minimum of two values.  Note that this macro evaluates its
   arguments more than once, so avoid side effects in I and J.  */
#ifndef MIN
# define MIN(i, j) ((i) <= (j) ? (i) : (j))
#endif
140
/* Write data in BUF to OUT.  However, if *SKIP is non-zero, skip that
   amount of data and decrease SKIP.  Increment *WRITTEN by the amount
   of data actually written.  Return non-zero on success, zero on
   write error.  */
144
145static int
146write_data (FILE *out, const char *buf, int bufsize, wgint *skip,
147            wgint *written)
148{
149  if (!out)
150    return 1;
151  if (*skip > bufsize)
152    {
153      *skip -= bufsize;
154      return 1;
155    }
156  if (*skip)
157    {
158      buf += *skip;
159      bufsize -= *skip;
160      *skip = 0;
161      if (bufsize == 0)
162        return 1;
163    }
164
165  fwrite (buf, 1, bufsize, out);
166  *written += bufsize;
167
168  /* Immediately flush the downloaded data.  This should not hinder
169     performance: fast downloads will arrive in large 16K chunks
170     (which stdio would write out immediately anyway), and slow
171     downloads wouldn't be limited by disk speed.  */
172
173  /* 2005-04-20 SMS.
174     Perhaps it shouldn't hinder performance, but it sure does, at least
175     on VMS (more than 2X).  Rather than speculate on what it should or
176     shouldn't do, it might make more sense to test it.  Even better, it
177     might be nice to explain what possible benefit it could offer, as
178     it appears to be a clear invitation to poor performance with no
179     actual justification.  (Also, why 16K?  Anyone test other values?)
180  */
181#ifndef __VMS
182  fflush (out);
183#endif /* ndef __VMS */
184  return !ferror (out);
185}
186
/* Read the contents of file descriptor FD until the connection
188   terminates or a read error occurs.  The data is read in portions of
189   up to 16K and written to OUT as it arrives.  If opt.verbose is set,
190   the progress is shown.
191
192   TOREAD is the amount of data expected to arrive, normally only used
193   by the progress gauge.
194
195   STARTPOS is the position from which the download starts, used by
196   the progress gauge.  If QTYREAD is non-NULL, the value it points to
197   is incremented by the amount of data read from the network.  If
198   QTYWRITTEN is non-NULL, the value it points to is incremented by
199   the amount of data written to disk.  The time it took to download
200   the data is stored to ELAPSED.
201
202   The function exits and returns the amount of data read.  In case of
203   error while reading data, -1 is returned.  In case of error while
204   writing data, -2 is returned.  */
205
int
fd_read_body (int fd, FILE *out, wgint toread, wgint startpos,
              wgint *qtyread, wgint *qtywritten, double *elapsed, int flags)
{
  int ret = 0;

  /* Download buffer shared by all calls; Wget is single-threaded, so
     a static buffer is safe here.  */
  static char dlbuf[16384];
  int dlbufsize = sizeof (dlbuf);

  struct ptimer *timer = NULL;
  double last_successful_read_tm = 0;

  /* The progress gauge, set according to the user preferences. */
  void *progress = NULL;

  /* Non-zero if the progress gauge is interactive, i.e. if it can
     continually update the display.  When true, smaller timeout
     values are used so that the gauge can update the display when
     data arrives slowly. */
  bool progress_interactive = false;

  bool exact = !!(flags & rb_read_exactly);
  wgint skip = 0;

  /* How much data we've read/written.  */
  wgint sum_read = 0;
  wgint sum_written = 0;

  if (flags & rb_skip_startpos)
    skip = startpos;

  if (opt.verbose)
    {
      /* If we're skipping STARTPOS bytes, pass 0 as the INITIAL
         argument to progress_create because the indicator doesn't
         (yet) know about "skipping" data.  */
      wgint start = skip ? 0 : startpos;
      progress = progress_create (start, start + toread);
      progress_interactive = progress_interactive_p (progress);
    }

  if (opt.limit_rate)
    limit_bandwidth_reset ();

  /* A timer is needed for tracking progress, for throttling, and for
     tracking elapsed time.  If either of these are requested, start
     the timer.  */
  if (progress || opt.limit_rate || elapsed)
    {
      timer = ptimer_new ();
      last_successful_read_tm = 0;
    }

  /* Use a smaller buffer for low requested bandwidths.  For example,
     with --limit-rate=2k, it doesn't make sense to slurp in 16K of
     data and then sleep for 8s.  With buffer size equal to the limit,
     we never have to sleep for more than one second.  */
  if (opt.limit_rate && opt.limit_rate < dlbufsize)
    dlbufsize = opt.limit_rate;

  /* Read from FD while there is data to read.  Normally toread==0
     means that it is unknown how much data is to arrive.  However, if
     EXACT is set, then toread==0 means what it says: that no data
     should be read.  */
  while (!exact || (sum_read < toread))
    {
      /* When reading exactly, never ask for more than what remains.  */
      int rdsize = exact ? MIN (toread - sum_read, dlbufsize) : dlbufsize;
      double tmout = opt.read_timeout;
      if (progress_interactive)
        {
          /* For interactive progress gauges, always specify a ~1s
             timeout, so that the gauge can be updated regularly even
             when the data arrives very slowly or stalls.  */
          tmout = 0.95;
          if (opt.read_timeout)
            {
              double waittm;
              waittm = ptimer_read (timer) - last_successful_read_tm;
              if (waittm + tmout > opt.read_timeout)
                {
                  /* Don't let total idle time exceed read timeout. */
                  tmout = opt.read_timeout - waittm;
                  if (tmout < 0)
                    {
                      /* We've already exceeded the timeout. */
                      ret = -1, errno = ETIMEDOUT;
                      break;
                    }
                }
            }
        }
      ret = fd_read (fd, dlbuf, rdsize, tmout);

      if (progress_interactive && ret < 0 && errno == ETIMEDOUT)
        ret = 0;                /* interactive timeout, handled above */
      else if (ret <= 0)
        break;                  /* EOF or read error */

      if (progress || opt.limit_rate)
        {
          ptimer_measure (timer);
          if (ret > 0)
            last_successful_read_tm = ptimer_read (timer);
        }

      if (ret > 0)
        {
          sum_read += ret;
          if (!write_data (out, dlbuf, ret, &skip, &sum_written))
            {
              /* Disk-write failure: report -2 and jump past the
                 read-error normalization below.  */
              ret = -2;
              goto out;
            }
        }

      if (opt.limit_rate)
        limit_bandwidth (ret, timer);

      if (progress)
        progress_update (progress, ret, ptimer_read (timer));
#ifdef WINDOWS
      if (toread > 0 && !opt.quiet)
        ws_percenttitle (100.0 *
                         (startpos + sum_read) / (startpos + toread));
#endif
    }
  /* Normalize any other negative fd_read result to the documented
     read-error return of -1.  (The write-error path skips this.)  */
  if (ret < -1)
    ret = -1;

 out:
  if (progress)
    progress_finish (progress, ptimer_read (timer));

  if (elapsed)
    *elapsed = ptimer_read (timer);
  if (timer)
    ptimer_destroy (timer);

  if (qtyread)
    *qtyread += sum_read;
  if (qtywritten)
    *qtywritten += sum_written;

  return ret;
}
351
352/* Read a hunk of data from FD, up until a terminator.  The hunk is
353   limited by whatever the TERMINATOR callback chooses as its
354   terminator.  For example, if terminator stops at newline, the hunk
355   will consist of a line of data; if terminator stops at two
356   newlines, it can be used to read the head of an HTTP response.
357   Upon determining the boundary, the function returns the data (up to
358   the terminator) in malloc-allocated storage.
359
360   In case of read error, NULL is returned.  In case of EOF and no
361   data read, NULL is returned and errno set to 0.  In case of having
362   read some data, but encountering EOF before seeing the terminator,
363   the data that has been read is returned, but it will (obviously)
364   not contain the terminator.
365
366   The TERMINATOR function is called with three arguments: the
367   beginning of the data read so far, the beginning of the current
368   block of peeked-at data, and the length of the current block.
369   Depending on its needs, the function is free to choose whether to
370   analyze all data or just the newly arrived data.  If TERMINATOR
371   returns NULL, it means that the terminator has not been seen.
   Otherwise it should return a pointer to the character immediately
373   following the terminator.
374
375   The idea is to be able to read a line of input, or otherwise a hunk
376   of text, such as the head of an HTTP request, without crossing the
377   boundary, so that the next call to fd_read etc. reads the data
378   after the hunk.  To achieve that, this function does the following:
379
380   1. Peek at incoming data.
381
382   2. Determine whether the peeked data, along with the previously
383      read data, includes the terminator.
384
385      2a. If yes, read the data until the end of the terminator, and
386          exit.
387
388      2b. If no, read the peeked data and goto 1.
389
390   The function is careful to assume as little as possible about the
391   implementation of peeking.  For example, every peek is followed by
392   a read.  If the read returns a different amount of data, the
393   process is retried until all data arrives safely.
394
395   SIZEHINT is the buffer size sufficient to hold all the data in the
396   typical case (it is used as the initial buffer size).  MAXSIZE is
397   the maximum amount of memory this function is allowed to allocate,
398   or 0 if no upper limit is to be enforced.
399
400   This function should be used as a building block for other
401   functions -- see fd_read_line as a simple example.  */
402
403char *
404fd_read_hunk (int fd, hunk_terminator_t terminator, long sizehint, long maxsize)
405{
406  long bufsize = sizehint;
407  char *hunk = xmalloc (bufsize);
408  int tail = 0;                 /* tail position in HUNK */
409
410  assert (!maxsize || maxsize >= bufsize);
411
412  while (1)
413    {
414      const char *end;
415      int pklen, rdlen, remain;
416
417      /* First, peek at the available data. */
418
419      pklen = fd_peek (fd, hunk + tail, bufsize - 1 - tail, -1);
420      if (pklen < 0)
421        {
422          xfree (hunk);
423          return NULL;
424        }
425      end = terminator (hunk, hunk + tail, pklen);
426      if (end)
427        {
428          /* The data contains the terminator: we'll drain the data up
429             to the end of the terminator.  */
430          remain = end - (hunk + tail);
431          assert (remain >= 0);
432          if (remain == 0)
433            {
434              /* No more data needs to be read. */
435              hunk[tail] = '\0';
436              return hunk;
437            }
438          if (bufsize - 1 < tail + remain)
439            {
440              bufsize = tail + remain + 1;
441              hunk = xrealloc (hunk, bufsize);
442            }
443        }
444      else
445        /* No terminator: simply read the data we know is (or should
446           be) available.  */
447        remain = pklen;
448
449      /* Now, read the data.  Note that we make no assumptions about
450         how much data we'll get.  (Some TCP stacks are notorious for
451         read returning less data than the previous MSG_PEEK.)  */
452
453      rdlen = fd_read (fd, hunk + tail, remain, 0);
454      if (rdlen < 0)
455        {
456          xfree_null (hunk);
457          return NULL;
458        }
459      tail += rdlen;
460      hunk[tail] = '\0';
461
462      if (rdlen == 0)
463        {
464          if (tail == 0)
465            {
466              /* EOF without anything having been read */
467              xfree (hunk);
468              errno = 0;
469              return NULL;
470            }
471          else
472            /* EOF seen: return the data we've read. */
473            return hunk;
474        }
475      if (end && rdlen == remain)
476        /* The terminator was seen and the remaining data drained --
477           we got what we came for.  */
478        return hunk;
479
480      /* Keep looping until all the data arrives. */
481
482      if (tail == bufsize - 1)
483        {
484          /* Double the buffer size, but refuse to allocate more than
485             MAXSIZE bytes.  */
486          if (maxsize && bufsize >= maxsize)
487            {
488              xfree (hunk);
489              errno = ENOMEM;
490              return NULL;
491            }
492          bufsize <<= 1;
493          if (maxsize && bufsize > maxsize)
494            bufsize = maxsize;
495          hunk = xrealloc (hunk, bufsize);
496        }
497    }
498}
499
/* Terminator callback for fd_read_line: the hunk ends immediately
   after the first newline in the newly peeked data, or nowhere if no
   newline has arrived yet.  START is unused; only the fresh PEEKED
   block needs scanning.  */

static const char *
line_terminator (const char *start, const char *peeked, int peeklen)
{
  const char *nl = memchr (peeked, '\n', peeklen);
  /* Include the '\n' itself in the returned line.  */
  return nl ? nl + 1 : NULL;
}
509
510/* The maximum size of the single line we agree to accept.  This is
511   not meant to impose an arbitrary limit, but to protect the user
512   from Wget slurping up available memory upon encountering malicious
513   or buggy server output.  Define it to 0 to remove the limit.  */
514#define FD_READ_LINE_MAX 4096
515
516/* Read one line from FD and return it.  The line is allocated using
517   malloc, but is never larger than FD_READ_LINE_MAX.
518
   If an error occurs, or if no data can be read, NULL is returned.
   In the former case errno indicates the error condition, and in the
   latter case, errno is set to 0.  */
522
523char *
524fd_read_line (int fd)
525{
526  return fd_read_hunk (fd, line_terminator, 128, FD_READ_LINE_MAX);
527}
528
529/* Return a printed representation of the download rate, along with
530   the units appropriate for the download speed.  */
531
532const char *
533retr_rate (wgint bytes, double secs)
534{
535  static char res[20];
536  static const char *rate_names[] = {"B/s", "KB/s", "MB/s", "GB/s" };
537  int units;
538
539  double dlrate = calc_rate (bytes, secs, &units);
540  /* Use more digits for smaller numbers (regardless of unit used),
541     e.g. "1022", "247", "12.5", "2.38".  */
542  sprintf (res, "%.*f %s",
543           dlrate >= 99.95 ? 0 : dlrate >= 9.995 ? 1 : 2,
544           dlrate, rate_names[units]);
545
546  return res;
547}
548
549/* Calculate the download rate and trim it as appropriate for the
550   speed.  Appropriate means that if rate is greater than 1K/s,
551   kilobytes are used, and if rate is greater than 1MB/s, megabytes
552   are used.
553
554   UNITS is zero for B/s, one for KB/s, two for MB/s, and three for
555   GB/s.  */
556
557double
558calc_rate (wgint bytes, double secs, int *units)
559{
560  double dlrate;
561
562  assert (secs >= 0);
563  assert (bytes >= 0);
564
565  if (secs == 0)
566    /* If elapsed time is exactly zero, it means we're under the
567       resolution of the timer.  This can easily happen on systems
568       that use time() for the timer.  Since the interval lies between
569       0 and the timer's resolution, assume half the resolution.  */
570    secs = ptimer_resolution () / 2.0;
571
572  dlrate = bytes / secs;
573  if (dlrate < 1024.0)
574    *units = 0;
575  else if (dlrate < 1024.0 * 1024.0)
576    *units = 1, dlrate /= 1024.0;
577  else if (dlrate < 1024.0 * 1024.0 * 1024.0)
578    *units = 2, dlrate /= (1024.0 * 1024.0);
579  else
580    /* Maybe someone will need this, one day. */
581    *units = 3, dlrate /= (1024.0 * 1024.0 * 1024.0);
582
583  return dlrate;
584}
585
586
/* Temporarily detach POST data from OPT so that redirections are
   followed with GET rather than re-POSTing; see retrieve_url.  */
#define SUSPEND_POST_DATA do {                  \
  post_data_suspended = true;                   \
  saved_post_data = opt.post_data;              \
  saved_post_file_name = opt.post_file_name;    \
  opt.post_data = NULL;                         \
  opt.post_file_name = NULL;                    \
} while (0)

/* Undo SUSPEND_POST_DATA, if it was invoked.  */
#define RESTORE_POST_DATA do {                          \
  if (post_data_suspended)                              \
    {                                                   \
      opt.post_data = saved_post_data;                  \
      opt.post_file_name = saved_post_file_name;        \
      post_data_suspended = false;                      \
    }                                                   \
} while (0)

/* Forward declaration: return the proxy to use for a URL, or NULL.  */
static char *getproxy (struct url *);
605
/* Retrieve the given URL.  Decides which loop to call -- HTTP, FTP,
   proxy, etc.  */
608
609/* #### This function should be rewritten so it doesn't return from
610   multiple points. */
611
612uerr_t
613retrieve_url (struct url * orig_parsed, const char *origurl, char **file,
614              char **newloc, const char *refurl, int *dt, bool recursive,
615              struct iri *iri, bool register_status)
616{
617  uerr_t result;
618  char *url;
619  bool location_changed;
620  bool iri_fallbacked = 0;
621  int dummy;
622  char *mynewloc, *proxy;
623  struct url *u = orig_parsed, *proxy_url;
624  int up_error_code;            /* url parse error code */
625  char *local_file;
626  int redirection_count = 0;
627
628  bool post_data_suspended = false;
629  char *saved_post_data = NULL;
630  char *saved_post_file_name = NULL;
631
632  /* If dt is NULL, use local storage.  */
633  if (!dt)
634    {
635      dt = &dummy;
636      dummy = 0;
637    }
638  url = xstrdup (origurl);
639  if (newloc)
640    *newloc = NULL;
641  if (file)
642    *file = NULL;
643
644  if (!refurl)
645    refurl = opt.referer;
646
647 redirected:
648  /* (also for IRI fallbacking) */
649
650  result = NOCONERROR;
651  mynewloc = NULL;
652  local_file = NULL;
653  proxy_url = NULL;
654
655  proxy = getproxy (u);
656  if (proxy)
657    {
658      struct iri *pi = iri_new ();
659      set_uri_encoding (pi, opt.locale, true);
660      pi->utf8_encode = false;
661
662      /* Parse the proxy URL.  */
663      proxy_url = url_parse (proxy, &up_error_code, NULL, true);
664      if (!proxy_url)
665        {
666          char *error = url_error (proxy, up_error_code);
667          logprintf (LOG_NOTQUIET, _("Error parsing proxy URL %s: %s.\n"),
668                     proxy, error);
669          xfree (url);
670          xfree (error);
671          RESTORE_POST_DATA;
672          result = PROXERR;
673          goto bail;
674        }
675      if (proxy_url->scheme != SCHEME_HTTP && proxy_url->scheme != u->scheme)
676        {
677          logprintf (LOG_NOTQUIET, _("Error in proxy URL %s: Must be HTTP.\n"), proxy);
678          url_free (proxy_url);
679          xfree (url);
680          RESTORE_POST_DATA;
681          result = PROXERR;
682          goto bail;
683        }
684    }
685
686  if (u->scheme == SCHEME_HTTP
687#ifdef HAVE_SSL
688      || u->scheme == SCHEME_HTTPS
689#endif
690      || (proxy_url && proxy_url->scheme == SCHEME_HTTP))
691    {
692      result = http_loop (u, &mynewloc, &local_file, refurl, dt, proxy_url, iri);
693    }
694  else if (u->scheme == SCHEME_FTP)
695    {
696      /* If this is a redirection, temporarily turn off opt.ftp_glob
697         and opt.recursive, both being undesirable when following
698         redirects.  */
699      bool oldrec = recursive, glob = opt.ftp_glob;
700      if (redirection_count)
701        oldrec = glob = false;
702
703      result = ftp_loop (u, dt, proxy_url, recursive, glob);
704      recursive = oldrec;
705
706      /* There is a possibility of having HTTP being redirected to
707         FTP.  In these cases we must decide whether the text is HTML
708         according to the suffix.  The HTML suffixes are `.html',
709         `.htm' and a few others, case-insensitive.  */
710      if (redirection_count && local_file && u->scheme == SCHEME_FTP)
711        {
712          if (has_html_suffix_p (local_file))
713            *dt |= TEXTHTML;
714        }
715    }
716
717  if (proxy_url)
718    {
719      url_free (proxy_url);
720      proxy_url = NULL;
721    }
722
723  location_changed = (result == NEWLOCATION);
724  if (location_changed)
725    {
726      char *construced_newloc;
727      struct url *newloc_parsed;
728
729      assert (mynewloc != NULL);
730
731      if (local_file)
732        xfree (local_file);
733
734      /* The HTTP specs only allow absolute URLs to appear in
735         redirects, but a ton of boneheaded webservers and CGIs out
736         there break the rules and use relative URLs, and popular
737         browsers are lenient about this, so wget should be too. */
738      construced_newloc = uri_merge (url, mynewloc);
739      xfree (mynewloc);
740      mynewloc = construced_newloc;
741
742      /* Reset UTF-8 encoding state, keep the URI encoding and reset
743         the content encoding. */
744      iri->utf8_encode = opt.enable_iri;
745      set_content_encoding (iri, NULL);
746      xfree_null (iri->orig_url);
747
748      /* Now, see if this new location makes sense. */
749      newloc_parsed = url_parse (mynewloc, &up_error_code, iri, true);
750      if (!newloc_parsed)
751        {
752          char *error = url_error (mynewloc, up_error_code);
753          logprintf (LOG_NOTQUIET, "%s: %s.\n", escnonprint_uri (mynewloc),
754                     error);
755          if (orig_parsed != u)
756            {
757              url_free (u);
758            }
759          xfree (url);
760          xfree (mynewloc);
761          xfree (error);
762          RESTORE_POST_DATA;
763          goto bail;
764        }
765
766      /* Now mynewloc will become newloc_parsed->url, because if the
767         Location contained relative paths like .././something, we
768         don't want that propagating as url.  */
769      xfree (mynewloc);
770      mynewloc = xstrdup (newloc_parsed->url);
771
772      /* Check for max. number of redirections.  */
773      if (++redirection_count > opt.max_redirect)
774        {
775          logprintf (LOG_NOTQUIET, _("%d redirections exceeded.\n"),
776                     opt.max_redirect);
777          url_free (newloc_parsed);
778          if (orig_parsed != u)
779            {
780              url_free (u);
781            }
782          xfree (url);
783          xfree (mynewloc);
784          RESTORE_POST_DATA;
785          result = WRONGCODE;
786          goto bail;
787        }
788
789      xfree (url);
790      url = mynewloc;
791      if (orig_parsed != u)
792        {
793          url_free (u);
794        }
795      u = newloc_parsed;
796
797      /* If we're being redirected from POST, we don't want to POST
798         again.  Many requests answer POST with a redirection to an
799         index page; that redirection is clearly a GET.  We "suspend"
800         POST data for the duration of the redirections, and restore
801         it when we're done. */
802      if (!post_data_suspended)
803        SUSPEND_POST_DATA;
804
805      goto redirected;
806    }
807
808  /* Try to not encode in UTF-8 if fetching failed */
809  if (!(*dt & RETROKF) && iri->utf8_encode)
810    {
811      iri->utf8_encode = false;
812      if (orig_parsed != u)
813        {
814          url_free (u);
815        }
816      u = url_parse (origurl, NULL, iri, true);
817      if (u)
818        {
819          DEBUGP (("[IRI fallbacking to non-utf8 for %s\n", quote (url)));
820          url = xstrdup (u->url);
821          iri_fallbacked = 1;
822          goto redirected;
823        }
824      else
825          DEBUGP (("[Couldn't fallback to non-utf8 for %s\n", quote (url)));
826    }
827
828  if (local_file && *dt & RETROKF)
829    {
830      register_download (u->url, local_file);
831      if (redirection_count && 0 != strcmp (origurl, u->url))
832        register_redirection (origurl, u->url);
833      if (*dt & TEXTHTML)
834        register_html (u->url, local_file);
835      if (*dt & RETROKF)
836        {
837          register_download (u->url, local_file);
838          if (redirection_count && 0 != strcmp (origurl, u->url))
839            register_redirection (origurl, u->url);
840          if (*dt & TEXTHTML)
841            register_html (u->url, local_file);
842          if (*dt & TEXTCSS)
843            register_css (u->url, local_file);
844        }
845    }
846
847  if (file)
848    *file = local_file ? local_file : NULL;
849  else
850    xfree_null (local_file);
851
852  if (orig_parsed != u)
853    {
854      url_free (u);
855    }
856
857  if (redirection_count || iri_fallbacked)
858    {
859      if (newloc)
860        *newloc = url;
861      else
862        xfree (url);
863    }
864  else
865    {
866      if (newloc)
867        *newloc = NULL;
868      xfree (url);
869    }
870
871  RESTORE_POST_DATA;
872
873bail:
874  if (register_status)
875    inform_exit_status (result);
876  return result;
877}
878
879/* Find the URLs in the file and call retrieve_url() for each of them.
880   If HTML is true, treat the file as HTML, and construct the URLs
881   accordingly.
882
883   If opt.recursive is set, call retrieve_tree() for each file.  */
884
885uerr_t
886retrieve_from_file (const char *file, bool html, int *count)
887{
888  uerr_t status;
889  struct urlpos *url_list, *cur_url;
890  struct iri *iri = iri_new();
891
892  char *input_file = NULL;
893  const char *url = file;
894
895  status = RETROK;             /* Suppose everything is OK.  */
896  *count = 0;                  /* Reset the URL count.  */
897
898  /* sXXXav : Assume filename and links in the file are in the locale */
899  set_uri_encoding (iri, opt.locale, true);
900  set_content_encoding (iri, opt.locale);
901
902  if (url_has_scheme (url))
903    {
904      int dt,url_err;
905      uerr_t status;
906      struct url * url_parsed = url_parse(url, &url_err, iri, true);
907
908      if (!url_parsed)
909        {
910          char *error = url_error (url, url_err);
911          logprintf (LOG_NOTQUIET, "%s: %s.\n", url, error);
912          xfree (error);
913          return URLERROR;
914        }
915
916      if (!opt.base_href)
917        opt.base_href = xstrdup (url);
918
919      status = retrieve_url (url_parsed, url, &input_file, NULL, NULL, &dt,
920                             false, iri, true);
921      if (status != RETROK)
922        return status;
923
924      if (dt & TEXTHTML)
925        html = true;
926
927      /* If we have a found a content encoding, use it.
928       * ( == is okay, because we're checking for identical object) */
929      if (iri->content_encoding != opt.locale)
930	  set_uri_encoding (iri, iri->content_encoding, false);
931
932      /* Reset UTF-8 encode status */
933      iri->utf8_encode = opt.enable_iri;
934      xfree_null (iri->orig_url);
935      iri->orig_url = NULL;
936    }
937  else
938    input_file = (char *) file;
939
940  url_list = (html ? get_urls_html (input_file, NULL, NULL, iri)
941              : get_urls_file (input_file));
942
943  for (cur_url = url_list; cur_url; cur_url = cur_url->next, ++*count)
944    {
945      char *filename = NULL, *new_file = NULL;
946      int dt;
947      struct iri *tmpiri = iri_dup (iri);
948      struct url *parsed_url = NULL;
949
950      if (cur_url->ignore_when_downloading)
951        continue;
952
953      if (opt.quota && total_downloaded_bytes > opt.quota)
954        {
955          status = QUOTEXC;
956          break;
957        }
958
959      /* Need to reparse the url, since it didn't have iri information. */
960      if (opt.enable_iri)
961          parsed_url = url_parse (cur_url->url->url, NULL, tmpiri, true);
962
963      if ((opt.recursive || opt.page_requisites)
964          && (cur_url->url->scheme != SCHEME_FTP || getproxy (cur_url->url)))
965        {
966          int old_follow_ftp = opt.follow_ftp;
967
968          /* Turn opt.follow_ftp on in case of recursive FTP retrieval */
969          if (cur_url->url->scheme == SCHEME_FTP)
970            opt.follow_ftp = 1;
971
972          status = retrieve_tree (parsed_url ? parsed_url : cur_url->url,
973                                  tmpiri);
974
975          opt.follow_ftp = old_follow_ftp;
976        }
977      else
978        status = retrieve_url (parsed_url ? parsed_url : cur_url->url,
979                               cur_url->url->url, &filename,
980                               &new_file, NULL, &dt, opt.recursive, tmpiri,
981                               true);
982
983      if (parsed_url)
984          url_free (parsed_url);
985
986      if (filename && opt.delete_after && file_exists_p (filename))
987        {
988          DEBUGP (("\
989Removing file due to --delete-after in retrieve_from_file():\n"));
990          logprintf (LOG_VERBOSE, _("Removing %s.\n"), filename);
991          if (unlink (filename))
992            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
993          dt &= ~RETROKF;
994        }
995
996      xfree_null (new_file);
997      xfree_null (filename);
998      iri_free (tmpiri);
999    }
1000
1001  /* Free the linked list of URL-s.  */
1002  free_urlpos (url_list);
1003
1004  iri_free (iri);
1005
1006  return status;
1007}
1008
1009/* Print `giving up', or `retrying', depending on the impending
1010   action.  N1 and N2 are the attempt number and the attempt limit.  */
1011void
1012printwhat (int n1, int n2)
1013{
1014  logputs (LOG_VERBOSE, (n1 == n2) ? _("Giving up.\n\n") : _("Retrying.\n\n"));
1015}
1016
1017/* If opt.wait or opt.waitretry are specified, and if certain
1018   conditions are met, sleep the appropriate number of seconds.  See
1019   the documentation of --wait and --waitretry for more information.
1020
1021   COUNT is the count of current retrieval, beginning with 1. */
1022
1023void
1024sleep_between_retrievals (int count)
1025{
1026  static bool first_retrieval = true;
1027
1028  if (first_retrieval)
1029    {
1030      /* Don't sleep before the very first retrieval. */
1031      first_retrieval = false;
1032      return;
1033    }
1034
1035  if (opt.waitretry && count > 1)
1036    {
1037      /* If opt.waitretry is specified and this is a retry, wait for
1038         COUNT-1 number of seconds, or for opt.waitretry seconds.  */
1039      if (count <= opt.waitretry)
1040        xsleep (count - 1);
1041      else
1042        xsleep (opt.waitretry);
1043    }
1044  else if (opt.wait)
1045    {
1046      if (!opt.random_wait || count > 1)
1047        /* If random-wait is not specified, or if we are sleeping
1048           between retries of the same download, sleep the fixed
1049           interval.  */
1050        xsleep (opt.wait);
1051      else
1052        {
1053          /* Sleep a random amount of time averaging in opt.wait
1054             seconds.  The sleeping amount ranges from 0.5*opt.wait to
1055             1.5*opt.wait.  */
1056          double waitsecs = (0.5 + random_float ()) * opt.wait;
1057          DEBUGP (("sleep_between_retrievals: avg=%f,sleep=%f\n",
1058                   opt.wait, waitsecs));
1059          xsleep (waitsecs);
1060        }
1061    }
1062}
1063
1064/* Free the linked list of urlpos.  */
1065void
1066free_urlpos (struct urlpos *l)
1067{
1068  while (l)
1069    {
1070      struct urlpos *next = l->next;
1071      if (l->url)
1072        url_free (l->url);
1073      xfree_null (l->local_name);
1074      xfree (l);
1075      l = next;
1076    }
1077}
1078
1079/* Rotate FNAME opt.backups times */
1080void
1081rotate_backups(const char *fname)
1082{
1083  int maxlen = strlen (fname) + 1 + numdigit (opt.backups) + 1;
1084  char *from = (char *)alloca (maxlen);
1085  char *to = (char *)alloca (maxlen);
1086  struct_stat sb;
1087  int i;
1088
1089  if (stat (fname, &sb) == 0)
1090    if (S_ISREG (sb.st_mode) == 0)
1091      return;
1092
1093  for (i = opt.backups; i > 1; i--)
1094    {
1095      sprintf (from, "%s.%d", fname, i - 1);
1096      sprintf (to, "%s.%d", fname, i);
1097      rename (from, to);
1098    }
1099
1100  sprintf (to, "%s.%d", fname, 1);
1101  rename(fname, to);
1102}
1103
1104static bool no_proxy_match (const char *, const char **);
1105
1106/* Return the URL of the proxy appropriate for url U.  */
1107
1108static char *
1109getproxy (struct url *u)
1110{
1111  char *proxy = NULL;
1112  char *rewritten_url;
1113  static char rewritten_storage[1024];
1114
1115  if (!opt.use_proxy)
1116    return NULL;
1117  if (no_proxy_match (u->host, (const char **)opt.no_proxy))
1118    return NULL;
1119
1120  switch (u->scheme)
1121    {
1122    case SCHEME_HTTP:
1123      proxy = opt.http_proxy ? opt.http_proxy : getenv ("http_proxy");
1124      break;
1125#ifdef HAVE_SSL
1126    case SCHEME_HTTPS:
1127      proxy = opt.https_proxy ? opt.https_proxy : getenv ("https_proxy");
1128      break;
1129#endif
1130    case SCHEME_FTP:
1131      proxy = opt.ftp_proxy ? opt.ftp_proxy : getenv ("ftp_proxy");
1132      break;
1133    case SCHEME_INVALID:
1134      break;
1135    }
1136  if (!proxy || !*proxy)
1137    return NULL;
1138
1139  /* Handle shorthands.  `rewritten_storage' is a kludge to allow
1140     getproxy() to return static storage. */
1141  rewritten_url = rewrite_shorthand_url (proxy);
1142  if (rewritten_url)
1143    {
1144      strncpy (rewritten_storage, rewritten_url, sizeof (rewritten_storage));
1145      rewritten_storage[sizeof (rewritten_storage) - 1] = '\0';
1146      proxy = rewritten_storage;
1147    }
1148
1149  return proxy;
1150}
1151
1152/* Returns true if URL would be downloaded through a proxy. */
1153
1154bool
1155url_uses_proxy (struct url * u)
1156{
1157  bool ret;
1158  if (!u)
1159    return false;
1160  ret = getproxy (u) != NULL;
1161  return ret;
1162}
1163
1164/* Should a host be accessed through proxy, concerning no_proxy?  */
1165static bool
1166no_proxy_match (const char *host, const char **no_proxy)
1167{
1168  if (!no_proxy)
1169    return false;
1170  else
1171    return sufmatch (no_proxy, host);
1172}
1173
1174/* Set the file parameter to point to the local file string.  */
1175void
1176set_local_file (const char **file, const char *default_file)
1177{
1178  if (opt.output_document)
1179    {
1180      if (output_stream_regular)
1181        *file = opt.output_document;
1182    }
1183  else
1184    *file = default_file;
1185}
1186