subversion/libsvn_diff/parse-diff.c

/*
 * parse-diff.c: functions for parsing diff files
 *
 * ====================================================================
 *    Licensed to the Apache Software Foundation (ASF) under one
 *    or more contributor license agreements.  See the NOTICE file
 *    distributed with this work for additional information
 *    regarding copyright ownership.  The ASF licenses this file
 *    to you under the Apache License, Version 2.0 (the
 *    "License"); you may not use this file except in compliance
 *    with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing,
 *    software distributed under the License is distributed on an
 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *    KIND, either express or implied.  See the License for the
 *    specific language governing permissions and limitations
 *    under the License.
 * ====================================================================
 */

#include <stdlib.h>
#include <stddef.h>
#include <string.h>

#include "svn_hash.h"
#include "svn_types.h"
#include "svn_error.h"
#include "svn_io.h"
#include "svn_pools.h"
#include "svn_props.h"
#include "svn_string.h"
#include "svn_utf.h"
#include "svn_dirent_uri.h"
#include "svn_diff.h"

#include "private/svn_eol_private.h"
#include "private/svn_dep_compat.h"

/* Helper macro for readability */
#define starts_with(str, start)  \
  (strncmp((str), (start), strlen(start)) == 0)

/* Like strlen() but for string literals. */
#define STRLEN_LITERAL(str) (sizeof(str) - 1)

/* This struct describes a range within a file, as well as the
 * current cursor position within the range. All numbers are in bytes. */
struct svn_diff__hunk_range {
  apr_off_t start;
  apr_off_t end;
  apr_off_t current;
};

struct svn_diff_hunk_t {
  /* The patch this hunk belongs to. */
  svn_patch_t *patch;

  /* APR file handle to the patch file this hunk came from. */
  apr_file_t *apr_file;

  /* Ranges used to keep track of this hunk's texts positions within
   * the patch file. */
  struct svn_diff__hunk_range diff_text_range;
  struct svn_diff__hunk_range original_text_range;
  struct svn_diff__hunk_range modified_text_range;

  /* Hunk ranges as they appeared in the patch file.
   * All numbers are lines, not bytes. */
  svn_linenum_t original_start;
  svn_linenum_t original_length;
  svn_linenum_t modified_start;
  svn_linenum_t modified_length;

  /* Number of lines of leading and trailing hunk context. */
  svn_linenum_t leading_context;
  svn_linenum_t trailing_context;
};

void
svn_diff_hunk_reset_diff_text(svn_diff_hunk_t *hunk)
{
  hunk->diff_text_range.current = hunk->diff_text_range.start;
}

void
svn_diff_hunk_reset_original_text(svn_diff_hunk_t *hunk)
{
  if (hunk->patch->reverse)
    hunk->modified_text_range.current = hunk->modified_text_range.start;
  else
    hunk->original_text_range.current = hunk->original_text_range.start;
}

void
svn_diff_hunk_reset_modified_text(svn_diff_hunk_t *hunk)
{
  if (hunk->patch->reverse)
    hunk->original_text_range.current = hunk->original_text_range.start;
  else
    hunk->modified_text_range.current = hunk->modified_text_range.start;
}

svn_linenum_t
svn_diff_hunk_get_original_start(const svn_diff_hunk_t *hunk)
{
  return hunk->patch->reverse ? hunk->modified_start : hunk->original_start;
}

svn_linenum_t
svn_diff_hunk_get_original_length(const svn_diff_hunk_t *hunk)
{
  return hunk->patch->reverse ? hunk->modified_length : hunk->original_length;
}

svn_linenum_t
svn_diff_hunk_get_modified_start(const svn_diff_hunk_t *hunk)
{
  return hunk->patch->reverse ? hunk->original_start : hunk->modified_start;
}

svn_linenum_t
svn_diff_hunk_get_modified_length(const svn_diff_hunk_t *hunk)
{
  return hunk->patch->reverse ? hunk->original_length : hunk->modified_length;
}

svn_linenum_t
svn_diff_hunk_get_leading_context(const svn_diff_hunk_t *hunk)
{
  return hunk->leading_context;
}

svn_linenum_t
svn_diff_hunk_get_trailing_context(const svn_diff_hunk_t *hunk)
{
  return hunk->trailing_context;
}

/* Try to parse a positive number from a decimal number encoded
 * in the string NUMBER. Return parsed number in OFFSET, and return
 * TRUE if parsing was successful. */
static svn_boolean_t
parse_offset(svn_linenum_t *offset, const char *number)
{
  svn_error_t *err;
  apr_uint64_t val;

  err = svn_cstring_strtoui64(&val, number, 0, SVN_LINENUM_MAX_VALUE, 10);
  if (err)
    {
      svn_error_clear(err);
      return FALSE;
    }

  *offset = (svn_linenum_t)val;

  return TRUE;
}

/* Try to parse a hunk range specification from the string RANGE.
 * Return parsed information in *START and *LENGTH, and return TRUE
 * if the range parsed correctly. Note: This function may modify the
 * input value RANGE. */
static svn_boolean_t
parse_range(svn_linenum_t *start, svn_linenum_t *length, char *range)
{
  char *comma;

  if (*range == 0)
    return FALSE;

  comma = strstr(range, ",");
  if (comma)
    {
      if (strlen(comma + 1) > 0)
        {
          /* Try to parse the length. */
          if (! parse_offset(length, comma + 1))
            return FALSE;

          /* Snip off the end of the string,
           * so we can comfortably parse the line
           * number the hunk starts at. */
          *comma = '\0';
        }
       else
         /* A comma but no length? */
         return FALSE;
    }
  else
    {
      *length = 1;
    }

  /* Try to parse the line number the hunk starts at. */
  return parse_offset(start, range);
}

/* Try to parse a hunk header in string HEADER, putting parsed information
 * into HUNK. Return TRUE if the header parsed correctly. ATAT is the
 * character string used to delimit the hunk header.
 * Do all allocations in POOL. */
static svn_boolean_t
parse_hunk_header(const char *header, svn_diff_hunk_t *hunk,
                  const char *atat, apr_pool_t *pool)
{
  const char *p;
  const char *start;
  svn_stringbuf_t *range;

  p = header + strlen(atat);
  if (*p != ' ')
    /* No. */
    return FALSE;
  p++;
  if (*p != '-')
    /* Nah... */
    return FALSE;
  /* OK, this may be worth allocating some memory for... */
  range = svn_stringbuf_create_ensure(31, pool);
  start = ++p;
  while (*p && *p != ' ')
    {
      p++;
    }

  if (*p != ' ')
    /* No no no... */
    return FALSE;

  svn_stringbuf_appendbytes(range, start, p - start);

  /* Try to parse the first range. */
  if (! parse_range(&hunk->original_start, &hunk->original_length, range->data))
    return FALSE;

  /* Clear the stringbuf so we can reuse it for the second range. */
  svn_stringbuf_setempty(range);
  p++;
  if (*p != '+')
    /* Eeek! */
    return FALSE;
  /* OK, this may be worth copying... */
  start = ++p;
  while (*p && *p != ' ')
    {
      p++;
    }
  if (*p != ' ')
    /* No no no... */
    return FALSE;

  svn_stringbuf_appendbytes(range, start, p - start);

  /* Check for trailing @@ */
  p++;
  if (! starts_with(p, atat))
    return FALSE;

  /* There may be stuff like C-function names after the trailing @@,
   * but we ignore that. */

  /* Try to parse the second range. */
  if (! parse_range(&hunk->modified_start, &hunk->modified_length, range->data))
    return FALSE;

  /* Hunk header is good. */
  return TRUE;
}

/* Read a line of original or modified hunk text from the specified
 * RANGE within FILE. FILE is expected to contain unidiff text.
 * Leading unidiff symbols ('+', '-', and ' ') are removed from the line,
 * Any lines commencing with the VERBOTEN character are discarded.
 * VERBOTEN should be '+' or '-', depending on which form of hunk text
 * is being read.
 *
 * All other parameters are as in svn_diff_hunk_readline_original_text()
 * and svn_diff_hunk_readline_modified_text().
 */
static svn_error_t *
hunk_readline_original_or_modified(apr_file_t *file,
                                   struct svn_diff__hunk_range *range,
                                   svn_stringbuf_t **stringbuf,
                                   const char **eol,
                                   svn_boolean_t *eof,
                                   char verboten,
                                   apr_pool_t *result_pool,
                                   apr_pool_t *scratch_pool)
{
  apr_size_t max_len;
  svn_boolean_t filtered;
  apr_off_t pos;
  svn_stringbuf_t *str;

  if (range->current >= range->end)
    {
      /* We're past the range. Indicate that no bytes can be read. */
      *eof = TRUE;
      if (eol)
        *eol = NULL;
      *stringbuf = svn_stringbuf_create_empty(result_pool);
      return SVN_NO_ERROR;
    }

  pos = 0;
  SVN_ERR(svn_io_file_seek(file, APR_CUR, &pos,  scratch_pool));
  SVN_ERR(svn_io_file_seek(file, APR_SET, &range->current, scratch_pool));
  do
    {
      max_len = range->end - range->current;
      SVN_ERR(svn_io_file_readline(file, &str, eol, eof, max_len,
                                   result_pool, scratch_pool));
      range->current = 0;
      SVN_ERR(svn_io_file_seek(file, APR_CUR, &range->current, scratch_pool));
      filtered = (str->data[0] == verboten || str->data[0] == '\\');
    }
  while (filtered && ! *eof);

  if (filtered)
    {
      /* EOF, return an empty string. */
      *stringbuf = svn_stringbuf_create_ensure(0, result_pool);
    }
  else if (str->data[0] == '+' || str->data[0] == '-' || str->data[0] == ' ')
    {
      /* Shave off leading unidiff symbols. */
      *stringbuf = svn_stringbuf_create(str->data + 1, result_pool);
    }
  else
    {
      /* Return the line as-is. */
      *stringbuf = svn_stringbuf_dup(str, result_pool);
    }

  SVN_ERR(svn_io_file_seek(file, APR_SET, &pos, scratch_pool));

  return SVN_NO_ERROR;
}

svn_error_t *
svn_diff_hunk_readline_original_text(svn_diff_hunk_t *hunk,
                                     svn_stringbuf_t **stringbuf,
                                     const char **eol,
                                     svn_boolean_t *eof,
                                     apr_pool_t *result_pool,
                                     apr_pool_t *scratch_pool)
{
  return svn_error_trace(
    hunk_readline_original_or_modified(hunk->apr_file,
                                       hunk->patch->reverse ?
                                         &hunk->modified_text_range :
                                         &hunk->original_text_range,
                                       stringbuf, eol, eof,
                                       hunk->patch->reverse ? '-' : '+',
                                       result_pool, scratch_pool));
}

svn_error_t *
svn_diff_hunk_readline_modified_text(svn_diff_hunk_t *hunk,
                                     svn_stringbuf_t **stringbuf,
                                     const char **eol,
                                     svn_boolean_t *eof,
                                     apr_pool_t *result_pool,
                                     apr_pool_t *scratch_pool)
{
  return svn_error_trace(
    hunk_readline_original_or_modified(hunk->apr_file,
                                       hunk->patch->reverse ?
                                         &hunk->original_text_range :
                                         &hunk->modified_text_range,
                                       stringbuf, eol, eof,
                                       hunk->patch->reverse ? '+' : '-',
                                       result_pool, scratch_pool));
}

svn_error_t *
svn_diff_hunk_readline_diff_text(svn_diff_hunk_t *hunk,
                                 svn_stringbuf_t **stringbuf,
                                 const char **eol,
                                 svn_boolean_t *eof,
                                 apr_pool_t *result_pool,
                                 apr_pool_t *scratch_pool)
{
  svn_diff_hunk_t dummy;
  svn_stringbuf_t *line;
  apr_size_t max_len;
  apr_off_t pos;

  if (hunk->diff_text_range.current >= hunk->diff_text_range.end)
    {
      /* We're past the range. Indicate that no bytes can be read. */
      *eof = TRUE;
      if (eol)
        *eol = NULL;
      *stringbuf = svn_stringbuf_create_empty(result_pool);
      return SVN_NO_ERROR;
    }

  pos = 0;
  SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_CUR, &pos, scratch_pool));
  SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_SET,
                           &hunk->diff_text_range.current, scratch_pool));
  max_len = hunk->diff_text_range.end - hunk->diff_text_range.current;
  SVN_ERR(svn_io_file_readline(hunk->apr_file, &line, eol, eof, max_len,
                               result_pool,
                   scratch_pool));
  hunk->diff_text_range.current = 0;
  SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_CUR,
                           &hunk->diff_text_range.current, scratch_pool));
  SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_SET, &pos, scratch_pool));

  if (hunk->patch->reverse)
    {
      if (parse_hunk_header(line->data, &dummy, "@@", scratch_pool))
        {
          /* Line is a hunk header, reverse it. */
          line = svn_stringbuf_createf(result_pool,
                                       "@@ -%lu,%lu +%lu,%lu @@",
                                       hunk->modified_start,
                                       hunk->modified_length,
                                       hunk->original_start,
                                       hunk->original_length);
        }
      else if (parse_hunk_header(line->data, &dummy, "##", scratch_pool))
        {
          /* Line is a hunk header, reverse it. */
          line = svn_stringbuf_createf(result_pool,
                                       "## -%lu,%lu +%lu,%lu ##",
                                       hunk->modified_start,
                                       hunk->modified_length,
                                       hunk->original_start,
                                       hunk->original_length);
        }
      else
        {
          if (line->data[0] == '+')
            line->data[0] = '-';
          else if (line->data[0] == '-')
            line->data[0] = '+';
        }
    }

  *stringbuf = line;

  return SVN_NO_ERROR;
}

/* Parse *PROP_NAME from HEADER as the part after the INDICATOR line.
 * Allocate *PROP_NAME in RESULT_POOL.
 * Set *PROP_NAME to NULL if no valid property name was found. */
static svn_error_t *
parse_prop_name(const char **prop_name, const char *header,
                const char *indicator, apr_pool_t *result_pool)
{
  SVN_ERR(svn_utf_cstring_to_utf8(prop_name,
                                  header + strlen(indicator),
                                  result_pool));
  if (**prop_name == '\0')
    *prop_name = NULL;
  else if (! svn_prop_name_is_valid(*prop_name))
    {
      svn_stringbuf_t *buf = svn_stringbuf_create(*prop_name, result_pool);
      svn_stringbuf_strip_whitespace(buf);
      *prop_name = (svn_prop_name_is_valid(buf->data) ? buf->data : NULL);
    }

  return SVN_NO_ERROR;
}

/* Return the next *HUNK from a PATCH in APR_FILE.
 * If no hunk can be found, set *HUNK to NULL.
 * Set IS_PROPERTY to TRUE if we have a property hunk. If the returned HUNK
 * is the first belonging to a certain property, then PROP_NAME and
 * PROP_OPERATION will be set too. If we have a text hunk, PROP_NAME will be
 * NULL.  If IGNORE_WHITESPACE is TRUE, lines without leading spaces will be
 * treated as context lines.  Allocate results in RESULT_POOL.
 * Use SCRATCH_POOL for all other allocations. */
static svn_error_t *
parse_next_hunk(svn_diff_hunk_t **hunk,
                svn_boolean_t *is_property,
                const char **prop_name,
                svn_diff_operation_kind_t *prop_operation,
                svn_patch_t *patch,
                apr_file_t *apr_file,
                svn_boolean_t ignore_whitespace,
                apr_pool_t *result_pool,
                apr_pool_t *scratch_pool)
{
  static const char * const minus = "--- ";
  static const char * const text_atat = "@@";
  static const char * const prop_atat = "##";
  svn_stringbuf_t *line;
  svn_boolean_t eof, in_hunk, hunk_seen;
  apr_off_t pos, last_line;
  apr_off_t start, end;
  apr_off_t original_end;
  apr_off_t modified_end;
  svn_linenum_t original_lines;
  svn_linenum_t modified_lines;
  svn_linenum_t leading_context;
  svn_linenum_t trailing_context;
  svn_boolean_t changed_line_seen;
  enum {
    noise_line,
    original_line,
    modified_line,
    context_line
  } last_line_type;
  apr_pool_t *iterpool;

  *prop_operation = svn_diff_op_unchanged;

  /* We only set this if we have a property hunk header. */
  *prop_name = NULL;
  *is_property = FALSE;

  if (apr_file_eof(apr_file) == APR_EOF)
    {
      /* No more hunks here. */
      *hunk = NULL;
      return SVN_NO_ERROR;
    }

  in_hunk = FALSE;
  hunk_seen = FALSE;
  leading_context = 0;
  trailing_context = 0;
  changed_line_seen = FALSE;
  original_end = 0;
  modified_end = 0;
  *hunk = apr_pcalloc(result_pool, sizeof(**hunk));

  /* Get current seek position -- APR has no ftell() :( */
  pos = 0;
  SVN_ERR(svn_io_file_seek(apr_file, APR_CUR, &pos, scratch_pool));

  /* Start out assuming noise. */
  last_line_type = noise_line;

  iterpool = svn_pool_create(scratch_pool);
  do
    {

      svn_pool_clear(iterpool);

      /* Remember the current line's offset, and read the line. */
      last_line = pos;
      SVN_ERR(svn_io_file_readline(apr_file, &line, NULL, &eof, APR_SIZE_MAX,
                                   iterpool, iterpool));

      /* Update line offset for next iteration. */
      pos = 0;
      SVN_ERR(svn_io_file_seek(apr_file, APR_CUR, &pos, iterpool));

      /* Lines starting with a backslash indicate a missing EOL:
       * "\ No newline at end of file" or "end of property". */
      if (line->data[0] == '\\')
        {
          if (in_hunk)
            {
              char eolbuf[2];
              apr_size_t len;
              apr_off_t off;
              apr_off_t hunk_text_end;

              /* Comment terminates the hunk text and says the hunk text
               * has no trailing EOL. Snip off trailing EOL which is part
               * of the patch file but not part of the hunk text. */
              off = last_line - 2;
              SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &off, iterpool));
              len = sizeof(eolbuf);
              SVN_ERR(svn_io_file_read_full2(apr_file, eolbuf, len, &len,
                                             &eof, iterpool));
              if (eolbuf[0] == '\r' && eolbuf[1] == '\n')
                hunk_text_end = last_line - 2;
              else if (eolbuf[1] == '\n' || eolbuf[1] == '\r')
                hunk_text_end = last_line - 1;
              else
                hunk_text_end = last_line;

              if (last_line_type == original_line && original_end == 0)
                original_end = hunk_text_end;
              else if (last_line_type == modified_line && modified_end == 0)
                modified_end = hunk_text_end;
              else if (last_line_type == context_line)
                {
                  if (original_end == 0)
                    original_end = hunk_text_end;
                  if (modified_end == 0)
                    modified_end = hunk_text_end;
                }

              SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &pos, iterpool));
            }

          continue;
        }

      if (in_hunk)
        {
          char c;
          static const char add = '+';
          static const char del = '-';

          if (! hunk_seen)
            {
              /* We're reading the first line of the hunk, so the start
               * of the line just read is the hunk text's byte offset. */
              start = last_line;
            }

          c = line->data[0];
          if (original_lines > 0 && modified_lines > 0 &&
              ((c == ' ')
               /* Tolerate chopped leading spaces on empty lines. */
               || (! eof && line->len == 0)
               /* Maybe tolerate chopped leading spaces on non-empty lines. */
               || (ignore_whitespace && c != del && c != add)))
            {
              /* It's a "context" line in the hunk. */
              hunk_seen = TRUE;
              original_lines--;
              modified_lines--;
              if (changed_line_seen)
                trailing_context++;
              else
                leading_context++;
              last_line_type = context_line;
            }
          else if (original_lines > 0 && c == del)
            {
              /* It's a "deleted" line in the hunk. */
              hunk_seen = TRUE;
              changed_line_seen = TRUE;

              /* A hunk may have context in the middle. We only want
                 trailing lines of context. */
              if (trailing_context > 0)
                trailing_context = 0;

              original_lines--;
              last_line_type = original_line;
            }
          else if (modified_lines > 0 && c == add)
            {
              /* It's an "added" line in the hunk. */
              hunk_seen = TRUE;
              changed_line_seen = TRUE;

              /* A hunk may have context in the middle. We only want
                 trailing lines of context. */
              if (trailing_context > 0)
                trailing_context = 0;

              modified_lines--;
              last_line_type = modified_line;
            }
          else
            {
              if (eof)
                {
                  /* The hunk ends at EOF. */
                  end = pos;
                }
              else
                {
                  /* The start of the current line marks the first byte
                   * after the hunk text. */
                  end = last_line;
                }

              if (original_end == 0)
                original_end = end;
              if (modified_end == 0)
                modified_end = end;
              break; /* Hunk was empty or has been read. */
            }
        }
      else
        {
          if (starts_with(line->data, text_atat))
            {
              /* Looks like we have a hunk header, try to rip it apart. */
              in_hunk = parse_hunk_header(line->data, *hunk, text_atat,
                                          iterpool);
              if (in_hunk)
                {
                  original_lines = (*hunk)->original_length;
                  modified_lines = (*hunk)->modified_length;
                  *is_property = FALSE;
                }
              }
          else if (starts_with(line->data, prop_atat))
            {
              /* Looks like we have a property hunk header, try to rip it
               * apart. */
              in_hunk = parse_hunk_header(line->data, *hunk, prop_atat,
                                          iterpool);
              if (in_hunk)
                {
                  original_lines = (*hunk)->original_length;
                  modified_lines = (*hunk)->modified_length;
                  *is_property = TRUE;
                }
            }
          else if (starts_with(line->data, "Added: "))
            {
              SVN_ERR(parse_prop_name(prop_name, line->data, "Added: ",
                                      result_pool));
              if (*prop_name)
                *prop_operation = svn_diff_op_added;
            }
          else if (starts_with(line->data, "Deleted: "))
            {
              SVN_ERR(parse_prop_name(prop_name, line->data, "Deleted: ",
                                      result_pool));
              if (*prop_name)
                *prop_operation = svn_diff_op_deleted;
            }
          else if (starts_with(line->data, "Modified: "))
            {
              SVN_ERR(parse_prop_name(prop_name, line->data, "Modified: ",
                                      result_pool));
              if (*prop_name)
                *prop_operation = svn_diff_op_modified;
            }
          else if (starts_with(line->data, minus)
                   || starts_with(line->data, "diff --git "))
            /* This could be a header of another patch. Bail out. */
            break;
        }
    }
  /* Check for the line length since a file may not have a newline at the
   * end and we depend upon the last line to be an empty one. */
  while (! eof || line->len > 0);
  svn_pool_destroy(iterpool);

  if (! eof)
    /* Rewind to the start of the line just read, so subsequent calls
     * to this function or svn_diff_parse_next_patch() don't end
     * up skipping the line -- it may contain a patch or hunk header. */
    SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &last_line, scratch_pool));

  if (hunk_seen && start < end)
    {
      (*hunk)->patch = patch;
      (*hunk)->apr_file = apr_file;
      (*hunk)->leading_context = leading_context;
      (*hunk)->trailing_context = trailing_context;
      (*hunk)->diff_text_range.start = start;
      (*hunk)->diff_text_range.current = start;
      (*hunk)->diff_text_range.end = end;
      (*hunk)->original_text_range.start = start;
      (*hunk)->original_text_range.current = start;
      (*hunk)->original_text_range.end = original_end;
      (*hunk)->modified_text_range.start = start;
      (*hunk)->modified_text_range.current = start;
      (*hunk)->modified_text_range.end = modified_end;
    }
  else
    /* Something went wrong, just discard the result. */
    *hunk = NULL;

  return SVN_NO_ERROR;
}

/* Compare function for sorting hunks after parsing.
 * We sort hunks by their original line offset. */
static int
compare_hunks(const void *a, const void *b)
{
  const svn_diff_hunk_t *ha = *((const svn_diff_hunk_t *const *)a);
  const svn_diff_hunk_t *hb = *((const svn_diff_hunk_t *const *)b);

  if (ha->original_start < hb->original_start)
    return -1;
  if (ha->original_start > hb->original_start)
    return 1;
  return 0;
}

/* Possible states of the diff header parser. */
enum parse_state
{
   state_start,           /* initial */
   state_git_diff_seen,   /* diff --git */
   state_git_tree_seen,   /* a tree operation, rather then content change */
   state_git_minus_seen,  /* --- /dev/null; or --- a/ */
   state_git_plus_seen,   /* +++ /dev/null; or +++ a/ */
   state_move_from_seen,  /* rename from foo.c */
   state_copy_from_seen,  /* copy from foo.c */
   state_minus_seen,      /* --- foo.c */
   state_unidiff_found,   /* valid start of a regular unidiff header */
   state_git_header_found /* valid start of a --git diff header */
};

/* Data type describing a valid state transition of the parser. */
struct transition
{
  const char *expected_input;
  enum parse_state required_state;

  /* A callback called upon each parser state transition. */
  svn_error_t *(*fn)(enum parse_state *new_state, char *input,
                     svn_patch_t *patch, apr_pool_t *result_pool,
                     apr_pool_t *scratch_pool);
};

/* UTF-8 encode and canonicalize the content of LINE as FILE_NAME. */
static svn_error_t *
grab_filename(const char **file_name, const char *line, apr_pool_t *result_pool,
              apr_pool_t *scratch_pool)
{
  const char *utf8_path;
  const char *canon_path;

  /* Grab the filename and encode it in UTF-8. */
  /* TODO: Allow specifying the patch file's encoding.
   *       For now, we assume its encoding is native. */
  /* ### This can fail if the filename cannot be represented in the current
   * ### locale's encoding. */
  SVN_ERR(svn_utf_cstring_to_utf8(&utf8_path,
                                  line,
                                  scratch_pool));

  /* Canonicalize the path name. */
  canon_path = svn_dirent_canonicalize(utf8_path, scratch_pool);

  *file_name = apr_pstrdup(result_pool, canon_path);

  return SVN_NO_ERROR;
}

/* Parse the '--- ' line of a regular unidiff. */
static svn_error_t *
diff_minus(enum parse_state *new_state, char *line, svn_patch_t *patch,
           apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  /* If we can find a tab, it separates the filename from
   * the rest of the line which we can discard. */
  char *tab = strchr(line, '\t');
  if (tab)
    *tab = '\0';

  SVN_ERR(grab_filename(&patch->old_filename, line + STRLEN_LITERAL("--- "),
                        result_pool, scratch_pool));

  *new_state = state_minus_seen;

  return SVN_NO_ERROR;
}

/* Parse the '+++ ' line of a regular unidiff. */
static svn_error_t *
diff_plus(enum parse_state *new_state, char *line, svn_patch_t *patch,
           apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  /* If we can find a tab, it separates the filename from
   * the rest of the line which we can discard. */
  char *tab = strchr(line, '\t');
  if (tab)
    *tab = '\0';

  SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("+++ "),
                        result_pool, scratch_pool));

  *new_state = state_unidiff_found;

  return SVN_NO_ERROR;
}

/* Parse the first line of a git extended unidiff. */
static svn_error_t *
git_start(enum parse_state *new_state, char *line, svn_patch_t *patch,
          apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  const char *old_path_start;
  char *old_path_end;
  const char *new_path_start;
  const char *new_path_end;
  char *new_path_marker;
  const char *old_path_marker;

  /* ### Add handling of escaped paths
   * http://www.kernel.org/pub/software/scm/git/docs/git-diff.html:
   *
   * TAB, LF, double quote and backslash characters in pathnames are
   * represented as \t, \n, \" and \\, respectively. If there is need for
   * such substitution then the whole pathname is put in double quotes.
   */

  /* Our line should look like this: 'diff --git a/path b/path'.
   *
   * If we find any deviations from that format, we return with state reset
   * to start.
   */
  old_path_marker = strstr(line, " a/");

  if (! old_path_marker)
    {
      *new_state = state_start;
      return SVN_NO_ERROR;
    }

  if (! *(old_path_marker + 3))
    {
      *new_state = state_start;
      return SVN_NO_ERROR;
    }

  new_path_marker = strstr(old_path_marker, " b/");

  if (! new_path_marker)
    {
      *new_state = state_start;
      return SVN_NO_ERROR;
    }

  if (! *(new_path_marker + 3))
    {
      *new_state = state_start;
      return SVN_NO_ERROR;
    }

  /* By now, we know that we have a line on the form '--git diff a/.+ b/.+'
   * We only need the filenames when we have deleted or added empty
   * files. In those cases the old_path and new_path is identical on the
   * 'diff --git' line.  For all other cases we fetch the filenames from
   * other header lines. */
  old_path_start = line + STRLEN_LITERAL("diff --git a/");
  new_path_end = line + strlen(line);
  new_path_start = old_path_start;

  while (TRUE)
    {
      ptrdiff_t len_old;
      ptrdiff_t len_new;

      new_path_marker = strstr(new_path_start, " b/");

      /* No new path marker, bail out. */
      if (! new_path_marker)
        break;

      old_path_end = new_path_marker;
      new_path_start = new_path_marker + STRLEN_LITERAL(" b/");

      /* No path after the marker. */
      if (! *new_path_start)
        break;

      len_old = old_path_end - old_path_start;
      len_new = new_path_end - new_path_start;

      /* Are the paths before and after the " b/" marker the same? */
      if (len_old == len_new
          && ! strncmp(old_path_start, new_path_start, len_old))
        {
          *old_path_end = '\0';
          SVN_ERR(grab_filename(&patch->old_filename, old_path_start,
                                result_pool, scratch_pool));

          SVN_ERR(grab_filename(&patch->new_filename, new_path_start,
                                result_pool, scratch_pool));
          break;
        }
    }

  /* We assume that the path is only modified until we've found a 'tree'
   * header */
  patch->operation = svn_diff_op_modified;

  *new_state = state_git_diff_seen;
  return SVN_NO_ERROR;
}

/* Parse the '--- ' line of a git extended unidiff. */
static svn_error_t *
git_minus(enum parse_state *new_state, char *line, svn_patch_t *patch,
          apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  /* If we can find a tab, it separates the filename from
   * the rest of the line which we can discard. */
  char *tab = strchr(line, '\t');
  if (tab)
    *tab = '\0';

  if (starts_with(line, "--- /dev/null"))
    SVN_ERR(grab_filename(&patch->old_filename, "/dev/null",
                          result_pool, scratch_pool));
  else
    SVN_ERR(grab_filename(&patch->old_filename, line + STRLEN_LITERAL("--- a/"),
                          result_pool, scratch_pool));

  *new_state = state_git_minus_seen;
  return SVN_NO_ERROR;
}

/* Parse the '+++ ' line of a git extended unidiff. */
static svn_error_t *
git_plus(enum parse_state *new_state, char *line, svn_patch_t *patch,
          apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  /* If we can find a tab, it separates the filename from
   * the rest of the line which we can discard. */
  char *tab = strchr(line, '\t');
  if (tab)
    *tab = '\0';

  if (starts_with(line, "+++ /dev/null"))
    SVN_ERR(grab_filename(&patch->new_filename, "/dev/null",
                          result_pool, scratch_pool));
  else
    SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("+++ b/"),
                          result_pool, scratch_pool));

  *new_state = state_git_header_found;
  return SVN_NO_ERROR;
}

/* Parse the 'rename from ' line of a git extended unidiff. */
static svn_error_t *
git_move_from(enum parse_state *new_state, char *line, svn_patch_t *patch,
              apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  SVN_ERR(grab_filename(&patch->old_filename,
                        line + STRLEN_LITERAL("rename from "),
                        result_pool, scratch_pool));

  *new_state = state_move_from_seen;
  return SVN_NO_ERROR;
}

/* Parse the 'rename to ' line of a git extended unidiff. */
static svn_error_t *
git_move_to(enum parse_state *new_state, char *line, svn_patch_t *patch,
            apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  SVN_ERR(grab_filename(&patch->new_filename,
                        line + STRLEN_LITERAL("rename to "),
                        result_pool, scratch_pool));

  patch->operation = svn_diff_op_moved;

  *new_state = state_git_tree_seen;
  return SVN_NO_ERROR;
}

/* Parse the 'copy from ' line of a git extended unidiff. */
static svn_error_t *
git_copy_from(enum parse_state *new_state, char *line, svn_patch_t *patch,
              apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  SVN_ERR(grab_filename(&patch->old_filename,
                        line + STRLEN_LITERAL("copy from "),
                        result_pool, scratch_pool));

  *new_state = state_copy_from_seen;
  return SVN_NO_ERROR;
}

/* Parse the 'copy to ' line of a git extended unidiff. */
static svn_error_t *
git_copy_to(enum parse_state *new_state, char *line, svn_patch_t *patch,
            apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("copy to "),
                        result_pool, scratch_pool));

  patch->operation = svn_diff_op_copied;

  *new_state = state_git_tree_seen;
  return SVN_NO_ERROR;
}

/* Parse the 'new file ' line of a git extended unidiff. */
static svn_error_t *
git_new_file(enum parse_state *new_state, char *line, svn_patch_t *patch,
             apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  patch->operation = svn_diff_op_added;

  /* Filename already retrieved from diff --git header. */

  *new_state = state_git_tree_seen;
  return SVN_NO_ERROR;
}

/* Parse the 'deleted file ' line of a git extended unidiff. */
static svn_error_t *
git_deleted_file(enum parse_state *new_state, char *line, svn_patch_t *patch,
                 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  patch->operation = svn_diff_op_deleted;

  /* Filename already retrieved from diff --git header. */

  *new_state = state_git_tree_seen;
  return SVN_NO_ERROR;
}

/* Add a HUNK associated with the property PROP_NAME to PATCH. */
static svn_error_t *
add_property_hunk(svn_patch_t *patch, const char *prop_name,
                  svn_diff_hunk_t *hunk, svn_diff_operation_kind_t operation,
                  apr_pool_t *result_pool)
{
  svn_prop_patch_t *prop_patch;

  prop_patch = svn_hash_gets(patch->prop_patches, prop_name);

  if (! prop_patch)
    {
      prop_patch = apr_palloc(result_pool, sizeof(svn_prop_patch_t));
      prop_patch->name = prop_name;
      prop_patch->operation = operation;
      prop_patch->hunks = apr_array_make(result_pool, 1,
                                         sizeof(svn_diff_hunk_t *));

      svn_hash_sets(patch->prop_patches, prop_name, prop_patch);
    }

  APR_ARRAY_PUSH(prop_patch->hunks, svn_diff_hunk_t *) = hunk;

  return SVN_NO_ERROR;
}

struct svn_patch_file_t
{
  /* The APR file handle to the patch file. */
  apr_file_t *apr_file;

  /* The file offset at which the next patch is expected. */
  apr_off_t next_patch_offset;
};

svn_error_t *
svn_diff_open_patch_file(svn_patch_file_t **patch_file,
                         const char *local_abspath,
                         apr_pool_t *result_pool)
{
  svn_patch_file_t *p;

  p = apr_palloc(result_pool, sizeof(*p));
  SVN_ERR(svn_io_file_open(&p->apr_file, local_abspath,
                           APR_READ | APR_BUFFERED, APR_OS_DEFAULT,
                           result_pool));
  p->next_patch_offset = 0;
  *patch_file = p;

  return SVN_NO_ERROR;
}

/* Parse hunks from APR_FILE and store them in PATCH->HUNKS.
 * Parsing stops if no valid next hunk can be found.
 * If IGNORE_WHITESPACE is TRUE, lines without
 * leading spaces will be treated as context lines.
 * Allocate results in RESULT_POOL.
 * Use SCRATCH_POOL for temporary allocations. */
static svn_error_t *
parse_hunks(svn_patch_t *patch, apr_file_t *apr_file,
            svn_boolean_t ignore_whitespace,
            apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  svn_diff_hunk_t *hunk;
  svn_boolean_t is_property;
  const char *last_prop_name;
  const char *prop_name;
  svn_diff_operation_kind_t prop_operation;
  apr_pool_t *iterpool;

  last_prop_name = NULL;

  patch->hunks = apr_array_make(result_pool, 10, sizeof(svn_diff_hunk_t *));
  patch->prop_patches = apr_hash_make(result_pool);
  iterpool = svn_pool_create(scratch_pool);
  do
    {
      svn_pool_clear(iterpool);

      SVN_ERR(parse_next_hunk(&hunk, &is_property, &prop_name, &prop_operation,
                              patch, apr_file, ignore_whitespace, result_pool,
                              iterpool));

      if (hunk && is_property)
        {
          if (! prop_name)
            prop_name = last_prop_name;
          else
            last_prop_name = prop_name;
          SVN_ERR(add_property_hunk(patch, prop_name, hunk, prop_operation,
                                    result_pool));
        }
      else if (hunk)
        {
          APR_ARRAY_PUSH(patch->hunks, svn_diff_hunk_t *) = hunk;
          last_prop_name = NULL;
        }

    }
  while (hunk);
  svn_pool_destroy(iterpool);

  return SVN_NO_ERROR;
}

/* State machine for the diff header parser.
 * Expected Input   Required state          Function to call */
static struct transition transitions[] =
{
  {"--- ",          state_start,            diff_minus},
  {"+++ ",          state_minus_seen,       diff_plus},
  {"diff --git",    state_start,            git_start},
  {"--- a/",        state_git_diff_seen,    git_minus},
  {"--- a/",        state_git_tree_seen,    git_minus},
  {"--- /dev/null", state_git_tree_seen,    git_minus},
  {"+++ b/",        state_git_minus_seen,   git_plus},
  {"+++ /dev/null", state_git_minus_seen,   git_plus},
  {"rename from ",  state_git_diff_seen,    git_move_from},
  {"rename to ",    state_move_from_seen,   git_move_to},
  {"copy from ",    state_git_diff_seen,    git_copy_from},
  {"copy to ",      state_copy_from_seen,   git_copy_to},
  {"new file ",     state_git_diff_seen,    git_new_file},
  {"deleted file ", state_git_diff_seen,    git_deleted_file},
};

svn_error_t *
svn_diff_parse_next_patch(svn_patch_t **patch,
                          svn_patch_file_t *patch_file,
                          svn_boolean_t reverse,
                          svn_boolean_t ignore_whitespace,
                          apr_pool_t *result_pool,
                          apr_pool_t *scratch_pool)
{
  apr_off_t pos, last_line;
  svn_boolean_t eof;
  svn_boolean_t line_after_tree_header_read = FALSE;
  apr_pool_t *iterpool;
  enum parse_state state = state_start;

  if (apr_file_eof(patch_file->apr_file) == APR_EOF)
    {
      /* No more patches here. */
      *patch = NULL;
      return SVN_NO_ERROR;
    }

  *patch = apr_pcalloc(result_pool, sizeof(**patch));

  pos = patch_file->next_patch_offset;
  SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &pos, scratch_pool));

  iterpool = svn_pool_create(scratch_pool);
  do
    {
      svn_stringbuf_t *line;
      svn_boolean_t valid_header_line = FALSE;
      int i;

      svn_pool_clear(iterpool);

      /* Remember the current line's offset, and read the line. */
      last_line = pos;
      SVN_ERR(svn_io_file_readline(patch_file->apr_file, &line, NULL, &eof,
                                   APR_SIZE_MAX, iterpool, iterpool));

      if (! eof)
        {
          /* Update line offset for next iteration. */
          pos = 0;
          SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_CUR, &pos,
                                   iterpool));
        }

      /* Run the state machine. */
      for (i = 0; i < (sizeof(transitions) / sizeof(transitions[0])); i++)
        {
          if (starts_with(line->data, transitions[i].expected_input)
              && state == transitions[i].required_state)
            {
              SVN_ERR(transitions[i].fn(&state, line->data, *patch,
                                        result_pool, iterpool));
              valid_header_line = TRUE;
              break;
            }
        }

      if (state == state_unidiff_found || state == state_git_header_found)
        {
          /* We have a valid diff header, yay! */
          break;
        }
      else if (state == state_git_tree_seen && line_after_tree_header_read)
        {
          /* git patches can contain an index line after the file mode line */
          if (!starts_with(line->data, "index "))
          {
            /* We have a valid diff header for a patch with only tree changes.
             * Rewind to the start of the line just read, so subsequent calls
             * to this function don't end up skipping the line -- it may
             * contain a patch. */
            SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &last_line,
                    scratch_pool));
            break;
          }
        }
      else if (state == state_git_tree_seen)
        {
          line_after_tree_header_read = TRUE;
        }
      else if (! valid_header_line && state != state_start
               && state != state_git_diff_seen
               && !starts_with(line->data, "index "))
        {
          /* We've encountered an invalid diff header.
           *
           * Rewind to the start of the line just read - it may be a new
           * header that begins there. */
          SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &last_line,
                                   scratch_pool));
          state = state_start;
        }

    }
  while (! eof);

  (*patch)->reverse = reverse;
  if (reverse)
    {
      const char *temp;
      temp = (*patch)->old_filename;
      (*patch)->old_filename = (*patch)->new_filename;
      (*patch)->new_filename = temp;
    }

  if ((*patch)->old_filename == NULL || (*patch)->new_filename == NULL)
    {
      /* Something went wrong, just discard the result. */
      *patch = NULL;
    }
  else
    SVN_ERR(parse_hunks(*patch, patch_file->apr_file, ignore_whitespace,
                        result_pool, iterpool));

  svn_pool_destroy(iterpool);

  patch_file->next_patch_offset = 0;
  SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_CUR,
                           &patch_file->next_patch_offset, scratch_pool));

  if (*patch)
    {
      /* Usually, hunks appear in the patch sorted by their original line
       * offset. But just in case they weren't parsed in this order for
       * some reason, we sort them so that our caller can assume that hunks
       * are sorted as if parsed from a usual patch. */
      qsort((*patch)->hunks->elts, (*patch)->hunks->nelts,
            (*patch)->hunks->elt_size, compare_hunks);
    }

  return SVN_NO_ERROR;
}

svn_error_t *
svn_diff_close_patch_file(svn_patch_file_t *patch_file,
                          apr_pool_t *scratch_pool)
{
  return svn_error_trace(svn_io_file_close(patch_file->apr_file,
                                           scratch_pool));
}