parse-diff.c revision 289166
1/*
2 * parse-diff.c: functions for parsing diff files
3 *
4 * ====================================================================
5 *    Licensed to the Apache Software Foundation (ASF) under one
6 *    or more contributor license agreements.  See the NOTICE file
7 *    distributed with this work for additional information
8 *    regarding copyright ownership.  The ASF licenses this file
9 *    to you under the Apache License, Version 2.0 (the
10 *    "License"); you may not use this file except in compliance
11 *    with the License.  You may obtain a copy of the License at
12 *
13 *      http://www.apache.org/licenses/LICENSE-2.0
14 *
15 *    Unless required by applicable law or agreed to in writing,
16 *    software distributed under the License is distributed on an
17 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 *    KIND, either express or implied.  See the License for the
19 *    specific language governing permissions and limitations
20 *    under the License.
21 * ====================================================================
22 */
23
24#include <stdlib.h>
25#include <stddef.h>
26#include <string.h>
27
28#include "svn_hash.h"
29#include "svn_types.h"
30#include "svn_error.h"
31#include "svn_io.h"
32#include "svn_pools.h"
33#include "svn_props.h"
34#include "svn_string.h"
35#include "svn_utf.h"
36#include "svn_dirent_uri.h"
37#include "svn_diff.h"
38
39#include "private/svn_eol_private.h"
40#include "private/svn_dep_compat.h"
41
/* Evaluate to non-zero iff the C string STR begins with the C string START.
 * START is expanded twice, so it must not have side effects. */
#define starts_with(str, start) \
  (! strncmp((str), (start), strlen(start)))
45
/* Compile-time strlen() for string literals: the size of the literal's
 * array minus the size of its terminating NUL. */
#define STRLEN_LITERAL(str) (sizeof(str) - sizeof(""))
48
49/* This struct describes a range within a file, as well as the
50 * current cursor position within the range. All numbers are in bytes. */
51struct svn_diff__hunk_range {
52  apr_off_t start;
53  apr_off_t end;
54  apr_off_t current;
55};
56
57struct svn_diff_hunk_t {
58  /* The patch this hunk belongs to. */
59  svn_patch_t *patch;
60
61  /* APR file handle to the patch file this hunk came from. */
62  apr_file_t *apr_file;
63
64  /* Ranges used to keep track of this hunk's texts positions within
65   * the patch file. */
66  struct svn_diff__hunk_range diff_text_range;
67  struct svn_diff__hunk_range original_text_range;
68  struct svn_diff__hunk_range modified_text_range;
69
70  /* Hunk ranges as they appeared in the patch file.
71   * All numbers are lines, not bytes. */
72  svn_linenum_t original_start;
73  svn_linenum_t original_length;
74  svn_linenum_t modified_start;
75  svn_linenum_t modified_length;
76
77  /* Number of lines of leading and trailing hunk context. */
78  svn_linenum_t leading_context;
79  svn_linenum_t trailing_context;
80};
81
82void
83svn_diff_hunk_reset_diff_text(svn_diff_hunk_t *hunk)
84{
85  hunk->diff_text_range.current = hunk->diff_text_range.start;
86}
87
88void
89svn_diff_hunk_reset_original_text(svn_diff_hunk_t *hunk)
90{
91  if (hunk->patch->reverse)
92    hunk->modified_text_range.current = hunk->modified_text_range.start;
93  else
94    hunk->original_text_range.current = hunk->original_text_range.start;
95}
96
97void
98svn_diff_hunk_reset_modified_text(svn_diff_hunk_t *hunk)
99{
100  if (hunk->patch->reverse)
101    hunk->original_text_range.current = hunk->original_text_range.start;
102  else
103    hunk->modified_text_range.current = hunk->modified_text_range.start;
104}
105
106svn_linenum_t
107svn_diff_hunk_get_original_start(const svn_diff_hunk_t *hunk)
108{
109  return hunk->patch->reverse ? hunk->modified_start : hunk->original_start;
110}
111
112svn_linenum_t
113svn_diff_hunk_get_original_length(const svn_diff_hunk_t *hunk)
114{
115  return hunk->patch->reverse ? hunk->modified_length : hunk->original_length;
116}
117
118svn_linenum_t
119svn_diff_hunk_get_modified_start(const svn_diff_hunk_t *hunk)
120{
121  return hunk->patch->reverse ? hunk->original_start : hunk->modified_start;
122}
123
124svn_linenum_t
125svn_diff_hunk_get_modified_length(const svn_diff_hunk_t *hunk)
126{
127  return hunk->patch->reverse ? hunk->original_length : hunk->modified_length;
128}
129
130svn_linenum_t
131svn_diff_hunk_get_leading_context(const svn_diff_hunk_t *hunk)
132{
133  return hunk->leading_context;
134}
135
136svn_linenum_t
137svn_diff_hunk_get_trailing_context(const svn_diff_hunk_t *hunk)
138{
139  return hunk->trailing_context;
140}
141
142/* Try to parse a positive number from a decimal number encoded
143 * in the string NUMBER. Return parsed number in OFFSET, and return
144 * TRUE if parsing was successful. */
145static svn_boolean_t
146parse_offset(svn_linenum_t *offset, const char *number)
147{
148  svn_error_t *err;
149  apr_uint64_t val;
150
151  err = svn_cstring_strtoui64(&val, number, 0, SVN_LINENUM_MAX_VALUE, 10);
152  if (err)
153    {
154      svn_error_clear(err);
155      return FALSE;
156    }
157
158  *offset = (svn_linenum_t)val;
159
160  return TRUE;
161}
162
163/* Try to parse a hunk range specification from the string RANGE.
164 * Return parsed information in *START and *LENGTH, and return TRUE
165 * if the range parsed correctly. Note: This function may modify the
166 * input value RANGE. */
167static svn_boolean_t
168parse_range(svn_linenum_t *start, svn_linenum_t *length, char *range)
169{
170  char *comma;
171
172  if (*range == 0)
173    return FALSE;
174
175  comma = strstr(range, ",");
176  if (comma)
177    {
178      if (strlen(comma + 1) > 0)
179        {
180          /* Try to parse the length. */
181          if (! parse_offset(length, comma + 1))
182            return FALSE;
183
184          /* Snip off the end of the string,
185           * so we can comfortably parse the line
186           * number the hunk starts at. */
187          *comma = '\0';
188        }
189       else
190         /* A comma but no length? */
191         return FALSE;
192    }
193  else
194    {
195      *length = 1;
196    }
197
198  /* Try to parse the line number the hunk starts at. */
199  return parse_offset(start, range);
200}
201
202/* Try to parse a hunk header in string HEADER, putting parsed information
203 * into HUNK. Return TRUE if the header parsed correctly. ATAT is the
204 * character string used to delimit the hunk header.
205 * Do all allocations in POOL. */
206static svn_boolean_t
207parse_hunk_header(const char *header, svn_diff_hunk_t *hunk,
208                  const char *atat, apr_pool_t *pool)
209{
210  const char *p;
211  const char *start;
212  svn_stringbuf_t *range;
213
214  p = header + strlen(atat);
215  if (*p != ' ')
216    /* No. */
217    return FALSE;
218  p++;
219  if (*p != '-')
220    /* Nah... */
221    return FALSE;
222  /* OK, this may be worth allocating some memory for... */
223  range = svn_stringbuf_create_ensure(31, pool);
224  start = ++p;
225  while (*p && *p != ' ')
226    {
227      p++;
228    }
229
230  if (*p != ' ')
231    /* No no no... */
232    return FALSE;
233
234  svn_stringbuf_appendbytes(range, start, p - start);
235
236  /* Try to parse the first range. */
237  if (! parse_range(&hunk->original_start, &hunk->original_length, range->data))
238    return FALSE;
239
240  /* Clear the stringbuf so we can reuse it for the second range. */
241  svn_stringbuf_setempty(range);
242  p++;
243  if (*p != '+')
244    /* Eeek! */
245    return FALSE;
246  /* OK, this may be worth copying... */
247  start = ++p;
248  while (*p && *p != ' ')
249    {
250      p++;
251    }
252  if (*p != ' ')
253    /* No no no... */
254    return FALSE;
255
256  svn_stringbuf_appendbytes(range, start, p - start);
257
258  /* Check for trailing @@ */
259  p++;
260  if (! starts_with(p, atat))
261    return FALSE;
262
263  /* There may be stuff like C-function names after the trailing @@,
264   * but we ignore that. */
265
266  /* Try to parse the second range. */
267  if (! parse_range(&hunk->modified_start, &hunk->modified_length, range->data))
268    return FALSE;
269
270  /* Hunk header is good. */
271  return TRUE;
272}
273
274/* Read a line of original or modified hunk text from the specified
275 * RANGE within FILE. FILE is expected to contain unidiff text.
276 * Leading unidiff symbols ('+', '-', and ' ') are removed from the line,
277 * Any lines commencing with the VERBOTEN character are discarded.
278 * VERBOTEN should be '+' or '-', depending on which form of hunk text
279 * is being read.
280 *
281 * All other parameters are as in svn_diff_hunk_readline_original_text()
282 * and svn_diff_hunk_readline_modified_text().
283 */
284static svn_error_t *
285hunk_readline_original_or_modified(apr_file_t *file,
286                                   struct svn_diff__hunk_range *range,
287                                   svn_stringbuf_t **stringbuf,
288                                   const char **eol,
289                                   svn_boolean_t *eof,
290                                   char verboten,
291                                   apr_pool_t *result_pool,
292                                   apr_pool_t *scratch_pool)
293{
294  apr_size_t max_len;
295  svn_boolean_t filtered;
296  apr_off_t pos;
297  svn_stringbuf_t *str;
298
299  if (range->current >= range->end)
300    {
301      /* We're past the range. Indicate that no bytes can be read. */
302      *eof = TRUE;
303      if (eol)
304        *eol = NULL;
305      *stringbuf = svn_stringbuf_create_empty(result_pool);
306      return SVN_NO_ERROR;
307    }
308
309  pos = 0;
310  SVN_ERR(svn_io_file_seek(file, APR_CUR, &pos,  scratch_pool));
311  SVN_ERR(svn_io_file_seek(file, APR_SET, &range->current, scratch_pool));
312  do
313    {
314      max_len = range->end - range->current;
315      SVN_ERR(svn_io_file_readline(file, &str, eol, eof, max_len,
316                                   result_pool, scratch_pool));
317      range->current = 0;
318      SVN_ERR(svn_io_file_seek(file, APR_CUR, &range->current, scratch_pool));
319      filtered = (str->data[0] == verboten || str->data[0] == '\\');
320    }
321  while (filtered && ! *eof);
322
323  if (filtered)
324    {
325      /* EOF, return an empty string. */
326      *stringbuf = svn_stringbuf_create_ensure(0, result_pool);
327    }
328  else if (str->data[0] == '+' || str->data[0] == '-' || str->data[0] == ' ')
329    {
330      /* Shave off leading unidiff symbols. */
331      *stringbuf = svn_stringbuf_create(str->data + 1, result_pool);
332    }
333  else
334    {
335      /* Return the line as-is. */
336      *stringbuf = svn_stringbuf_dup(str, result_pool);
337    }
338
339  SVN_ERR(svn_io_file_seek(file, APR_SET, &pos, scratch_pool));
340
341  return SVN_NO_ERROR;
342}
343
344svn_error_t *
345svn_diff_hunk_readline_original_text(svn_diff_hunk_t *hunk,
346                                     svn_stringbuf_t **stringbuf,
347                                     const char **eol,
348                                     svn_boolean_t *eof,
349                                     apr_pool_t *result_pool,
350                                     apr_pool_t *scratch_pool)
351{
352  return svn_error_trace(
353    hunk_readline_original_or_modified(hunk->apr_file,
354                                       hunk->patch->reverse ?
355                                         &hunk->modified_text_range :
356                                         &hunk->original_text_range,
357                                       stringbuf, eol, eof,
358                                       hunk->patch->reverse ? '-' : '+',
359                                       result_pool, scratch_pool));
360}
361
362svn_error_t *
363svn_diff_hunk_readline_modified_text(svn_diff_hunk_t *hunk,
364                                     svn_stringbuf_t **stringbuf,
365                                     const char **eol,
366                                     svn_boolean_t *eof,
367                                     apr_pool_t *result_pool,
368                                     apr_pool_t *scratch_pool)
369{
370  return svn_error_trace(
371    hunk_readline_original_or_modified(hunk->apr_file,
372                                       hunk->patch->reverse ?
373                                         &hunk->original_text_range :
374                                         &hunk->modified_text_range,
375                                       stringbuf, eol, eof,
376                                       hunk->patch->reverse ? '+' : '-',
377                                       result_pool, scratch_pool));
378}
379
380svn_error_t *
381svn_diff_hunk_readline_diff_text(svn_diff_hunk_t *hunk,
382                                 svn_stringbuf_t **stringbuf,
383                                 const char **eol,
384                                 svn_boolean_t *eof,
385                                 apr_pool_t *result_pool,
386                                 apr_pool_t *scratch_pool)
387{
388  svn_diff_hunk_t dummy;
389  svn_stringbuf_t *line;
390  apr_size_t max_len;
391  apr_off_t pos;
392
393  if (hunk->diff_text_range.current >= hunk->diff_text_range.end)
394    {
395      /* We're past the range. Indicate that no bytes can be read. */
396      *eof = TRUE;
397      if (eol)
398        *eol = NULL;
399      *stringbuf = svn_stringbuf_create_empty(result_pool);
400      return SVN_NO_ERROR;
401    }
402
403  pos = 0;
404  SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_CUR, &pos, scratch_pool));
405  SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_SET,
406                           &hunk->diff_text_range.current, scratch_pool));
407  max_len = hunk->diff_text_range.end - hunk->diff_text_range.current;
408  SVN_ERR(svn_io_file_readline(hunk->apr_file, &line, eol, eof, max_len,
409                               result_pool,
410                   scratch_pool));
411  hunk->diff_text_range.current = 0;
412  SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_CUR,
413                           &hunk->diff_text_range.current, scratch_pool));
414  SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_SET, &pos, scratch_pool));
415
416  if (hunk->patch->reverse)
417    {
418      if (parse_hunk_header(line->data, &dummy, "@@", scratch_pool))
419        {
420          /* Line is a hunk header, reverse it. */
421          line = svn_stringbuf_createf(result_pool,
422                                       "@@ -%lu,%lu +%lu,%lu @@",
423                                       hunk->modified_start,
424                                       hunk->modified_length,
425                                       hunk->original_start,
426                                       hunk->original_length);
427        }
428      else if (parse_hunk_header(line->data, &dummy, "##", scratch_pool))
429        {
430          /* Line is a hunk header, reverse it. */
431          line = svn_stringbuf_createf(result_pool,
432                                       "## -%lu,%lu +%lu,%lu ##",
433                                       hunk->modified_start,
434                                       hunk->modified_length,
435                                       hunk->original_start,
436                                       hunk->original_length);
437        }
438      else
439        {
440          if (line->data[0] == '+')
441            line->data[0] = '-';
442          else if (line->data[0] == '-')
443            line->data[0] = '+';
444        }
445    }
446
447  *stringbuf = line;
448
449  return SVN_NO_ERROR;
450}
451
452/* Parse *PROP_NAME from HEADER as the part after the INDICATOR line.
453 * Allocate *PROP_NAME in RESULT_POOL.
454 * Set *PROP_NAME to NULL if no valid property name was found. */
455static svn_error_t *
456parse_prop_name(const char **prop_name, const char *header,
457                const char *indicator, apr_pool_t *result_pool)
458{
459  SVN_ERR(svn_utf_cstring_to_utf8(prop_name,
460                                  header + strlen(indicator),
461                                  result_pool));
462  if (**prop_name == '\0')
463    *prop_name = NULL;
464  else if (! svn_prop_name_is_valid(*prop_name))
465    {
466      svn_stringbuf_t *buf = svn_stringbuf_create(*prop_name, result_pool);
467      svn_stringbuf_strip_whitespace(buf);
468      *prop_name = (svn_prop_name_is_valid(buf->data) ? buf->data : NULL);
469    }
470
471  return SVN_NO_ERROR;
472}
473
474/* Return the next *HUNK from a PATCH in APR_FILE.
475 * If no hunk can be found, set *HUNK to NULL.
476 * Set IS_PROPERTY to TRUE if we have a property hunk. If the returned HUNK
477 * is the first belonging to a certain property, then PROP_NAME and
478 * PROP_OPERATION will be set too. If we have a text hunk, PROP_NAME will be
479 * NULL.  If IGNORE_WHITESPACE is TRUE, lines without leading spaces will be
480 * treated as context lines.  Allocate results in RESULT_POOL.
481 * Use SCRATCH_POOL for all other allocations. */
482static svn_error_t *
483parse_next_hunk(svn_diff_hunk_t **hunk,
484                svn_boolean_t *is_property,
485                const char **prop_name,
486                svn_diff_operation_kind_t *prop_operation,
487                svn_patch_t *patch,
488                apr_file_t *apr_file,
489                svn_boolean_t ignore_whitespace,
490                apr_pool_t *result_pool,
491                apr_pool_t *scratch_pool)
492{
493  static const char * const minus = "--- ";
494  static const char * const text_atat = "@@";
495  static const char * const prop_atat = "##";
496  svn_stringbuf_t *line;
497  svn_boolean_t eof, in_hunk, hunk_seen;
498  apr_off_t pos, last_line;
499  apr_off_t start, end;
500  apr_off_t original_end;
501  apr_off_t modified_end;
502  svn_linenum_t original_lines;
503  svn_linenum_t modified_lines;
504  svn_linenum_t leading_context;
505  svn_linenum_t trailing_context;
506  svn_boolean_t changed_line_seen;
507  enum {
508    noise_line,
509    original_line,
510    modified_line,
511    context_line
512  } last_line_type;
513  apr_pool_t *iterpool;
514
515  *prop_operation = svn_diff_op_unchanged;
516
517  /* We only set this if we have a property hunk header. */
518  *prop_name = NULL;
519  *is_property = FALSE;
520
521  if (apr_file_eof(apr_file) == APR_EOF)
522    {
523      /* No more hunks here. */
524      *hunk = NULL;
525      return SVN_NO_ERROR;
526    }
527
528  in_hunk = FALSE;
529  hunk_seen = FALSE;
530  leading_context = 0;
531  trailing_context = 0;
532  changed_line_seen = FALSE;
533  original_end = 0;
534  modified_end = 0;
535  *hunk = apr_pcalloc(result_pool, sizeof(**hunk));
536
537  /* Get current seek position -- APR has no ftell() :( */
538  pos = 0;
539  SVN_ERR(svn_io_file_seek(apr_file, APR_CUR, &pos, scratch_pool));
540
541  /* Start out assuming noise. */
542  last_line_type = noise_line;
543
544  iterpool = svn_pool_create(scratch_pool);
545  do
546    {
547
548      svn_pool_clear(iterpool);
549
550      /* Remember the current line's offset, and read the line. */
551      last_line = pos;
552      SVN_ERR(svn_io_file_readline(apr_file, &line, NULL, &eof, APR_SIZE_MAX,
553                                   iterpool, iterpool));
554
555      /* Update line offset for next iteration. */
556      pos = 0;
557      SVN_ERR(svn_io_file_seek(apr_file, APR_CUR, &pos, iterpool));
558
559      /* Lines starting with a backslash indicate a missing EOL:
560       * "\ No newline at end of file" or "end of property". */
561      if (line->data[0] == '\\')
562        {
563          if (in_hunk)
564            {
565              char eolbuf[2];
566              apr_size_t len;
567              apr_off_t off;
568              apr_off_t hunk_text_end;
569
570              /* Comment terminates the hunk text and says the hunk text
571               * has no trailing EOL. Snip off trailing EOL which is part
572               * of the patch file but not part of the hunk text. */
573              off = last_line - 2;
574              SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &off, iterpool));
575              len = sizeof(eolbuf);
576              SVN_ERR(svn_io_file_read_full2(apr_file, eolbuf, len, &len,
577                                             &eof, iterpool));
578              if (eolbuf[0] == '\r' && eolbuf[1] == '\n')
579                hunk_text_end = last_line - 2;
580              else if (eolbuf[1] == '\n' || eolbuf[1] == '\r')
581                hunk_text_end = last_line - 1;
582              else
583                hunk_text_end = last_line;
584
585              if (last_line_type == original_line && original_end == 0)
586                original_end = hunk_text_end;
587              else if (last_line_type == modified_line && modified_end == 0)
588                modified_end = hunk_text_end;
589              else if (last_line_type == context_line)
590                {
591                  if (original_end == 0)
592                    original_end = hunk_text_end;
593                  if (modified_end == 0)
594                    modified_end = hunk_text_end;
595                }
596
597              SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &pos, iterpool));
598            }
599
600          continue;
601        }
602
603      if (in_hunk)
604        {
605          char c;
606          static const char add = '+';
607          static const char del = '-';
608
609          if (! hunk_seen)
610            {
611              /* We're reading the first line of the hunk, so the start
612               * of the line just read is the hunk text's byte offset. */
613              start = last_line;
614            }
615
616          c = line->data[0];
617          if (original_lines > 0 && modified_lines > 0 &&
618              ((c == ' ')
619               /* Tolerate chopped leading spaces on empty lines. */
620               || (! eof && line->len == 0)
621               /* Maybe tolerate chopped leading spaces on non-empty lines. */
622               || (ignore_whitespace && c != del && c != add)))
623            {
624              /* It's a "context" line in the hunk. */
625              hunk_seen = TRUE;
626              original_lines--;
627              modified_lines--;
628              if (changed_line_seen)
629                trailing_context++;
630              else
631                leading_context++;
632              last_line_type = context_line;
633            }
634          else if (original_lines > 0 && c == del)
635            {
636              /* It's a "deleted" line in the hunk. */
637              hunk_seen = TRUE;
638              changed_line_seen = TRUE;
639
640              /* A hunk may have context in the middle. We only want
641                 trailing lines of context. */
642              if (trailing_context > 0)
643                trailing_context = 0;
644
645              original_lines--;
646              last_line_type = original_line;
647            }
648          else if (modified_lines > 0 && c == add)
649            {
650              /* It's an "added" line in the hunk. */
651              hunk_seen = TRUE;
652              changed_line_seen = TRUE;
653
654              /* A hunk may have context in the middle. We only want
655                 trailing lines of context. */
656              if (trailing_context > 0)
657                trailing_context = 0;
658
659              modified_lines--;
660              last_line_type = modified_line;
661            }
662          else
663            {
664              if (eof)
665                {
666                  /* The hunk ends at EOF. */
667                  end = pos;
668                }
669              else
670                {
671                  /* The start of the current line marks the first byte
672                   * after the hunk text. */
673                  end = last_line;
674                }
675
676              if (original_end == 0)
677                original_end = end;
678              if (modified_end == 0)
679                modified_end = end;
680              break; /* Hunk was empty or has been read. */
681            }
682        }
683      else
684        {
685          if (starts_with(line->data, text_atat))
686            {
687              /* Looks like we have a hunk header, try to rip it apart. */
688              in_hunk = parse_hunk_header(line->data, *hunk, text_atat,
689                                          iterpool);
690              if (in_hunk)
691                {
692                  original_lines = (*hunk)->original_length;
693                  modified_lines = (*hunk)->modified_length;
694                  *is_property = FALSE;
695                }
696              }
697          else if (starts_with(line->data, prop_atat))
698            {
699              /* Looks like we have a property hunk header, try to rip it
700               * apart. */
701              in_hunk = parse_hunk_header(line->data, *hunk, prop_atat,
702                                          iterpool);
703              if (in_hunk)
704                {
705                  original_lines = (*hunk)->original_length;
706                  modified_lines = (*hunk)->modified_length;
707                  *is_property = TRUE;
708                }
709            }
710          else if (starts_with(line->data, "Added: "))
711            {
712              SVN_ERR(parse_prop_name(prop_name, line->data, "Added: ",
713                                      result_pool));
714              if (*prop_name)
715                *prop_operation = svn_diff_op_added;
716            }
717          else if (starts_with(line->data, "Deleted: "))
718            {
719              SVN_ERR(parse_prop_name(prop_name, line->data, "Deleted: ",
720                                      result_pool));
721              if (*prop_name)
722                *prop_operation = svn_diff_op_deleted;
723            }
724          else if (starts_with(line->data, "Modified: "))
725            {
726              SVN_ERR(parse_prop_name(prop_name, line->data, "Modified: ",
727                                      result_pool));
728              if (*prop_name)
729                *prop_operation = svn_diff_op_modified;
730            }
731          else if (starts_with(line->data, minus)
732                   || starts_with(line->data, "diff --git "))
733            /* This could be a header of another patch. Bail out. */
734            break;
735        }
736    }
737  /* Check for the line length since a file may not have a newline at the
738   * end and we depend upon the last line to be an empty one. */
739  while (! eof || line->len > 0);
740  svn_pool_destroy(iterpool);
741
742  if (! eof)
743    /* Rewind to the start of the line just read, so subsequent calls
744     * to this function or svn_diff_parse_next_patch() don't end
745     * up skipping the line -- it may contain a patch or hunk header. */
746    SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &last_line, scratch_pool));
747
748  if (hunk_seen && start < end)
749    {
750      (*hunk)->patch = patch;
751      (*hunk)->apr_file = apr_file;
752      (*hunk)->leading_context = leading_context;
753      (*hunk)->trailing_context = trailing_context;
754      (*hunk)->diff_text_range.start = start;
755      (*hunk)->diff_text_range.current = start;
756      (*hunk)->diff_text_range.end = end;
757      (*hunk)->original_text_range.start = start;
758      (*hunk)->original_text_range.current = start;
759      (*hunk)->original_text_range.end = original_end;
760      (*hunk)->modified_text_range.start = start;
761      (*hunk)->modified_text_range.current = start;
762      (*hunk)->modified_text_range.end = modified_end;
763    }
764  else
765    /* Something went wrong, just discard the result. */
766    *hunk = NULL;
767
768  return SVN_NO_ERROR;
769}
770
771/* Compare function for sorting hunks after parsing.
772 * We sort hunks by their original line offset. */
773static int
774compare_hunks(const void *a, const void *b)
775{
776  const svn_diff_hunk_t *ha = *((const svn_diff_hunk_t *const *)a);
777  const svn_diff_hunk_t *hb = *((const svn_diff_hunk_t *const *)b);
778
779  if (ha->original_start < hb->original_start)
780    return -1;
781  if (ha->original_start > hb->original_start)
782    return 1;
783  return 0;
784}
785
/* The states the diff header parser can be in. */
enum parse_state
{
  /* Initial state. */
  state_start,
  /* A 'diff --git' line has been seen. */
  state_git_diff_seen,
  /* A tree operation, rather than a content change, has been seen. */
  state_git_tree_seen,
  /* The '--- ' line of a git diff has been seen,
   * e.g. '--- /dev/null' or '--- a/'. */
  state_git_minus_seen,
  /* The '+++ ' line of a git diff has been seen. */
  state_git_plus_seen,
  /* A 'rename from foo.c' line has been seen. */
  state_move_from_seen,
  /* A 'copy from foo.c' line has been seen. */
  state_copy_from_seen,
  /* The '--- foo.c' line of a regular unidiff has been seen. */
  state_minus_seen,
  /* Valid start of a regular unidiff header. */
  state_unidiff_found,
  /* Valid start of a --git diff header. */
  state_git_header_found
};
800
801/* Data type describing a valid state transition of the parser. */
802struct transition
803{
804  const char *expected_input;
805  enum parse_state required_state;
806
807  /* A callback called upon each parser state transition. */
808  svn_error_t *(*fn)(enum parse_state *new_state, char *input,
809                     svn_patch_t *patch, apr_pool_t *result_pool,
810                     apr_pool_t *scratch_pool);
811};
812
813/* UTF-8 encode and canonicalize the content of LINE as FILE_NAME. */
814static svn_error_t *
815grab_filename(const char **file_name, const char *line, apr_pool_t *result_pool,
816              apr_pool_t *scratch_pool)
817{
818  const char *utf8_path;
819  const char *canon_path;
820
821  /* Grab the filename and encode it in UTF-8. */
822  /* TODO: Allow specifying the patch file's encoding.
823   *       For now, we assume its encoding is native. */
824  /* ### This can fail if the filename cannot be represented in the current
825   * ### locale's encoding. */
826  SVN_ERR(svn_utf_cstring_to_utf8(&utf8_path,
827                                  line,
828                                  scratch_pool));
829
830  /* Canonicalize the path name. */
831  canon_path = svn_dirent_canonicalize(utf8_path, scratch_pool);
832
833  *file_name = apr_pstrdup(result_pool, canon_path);
834
835  return SVN_NO_ERROR;
836}
837
838/* Parse the '--- ' line of a regular unidiff. */
839static svn_error_t *
840diff_minus(enum parse_state *new_state, char *line, svn_patch_t *patch,
841           apr_pool_t *result_pool, apr_pool_t *scratch_pool)
842{
843  /* If we can find a tab, it separates the filename from
844   * the rest of the line which we can discard. */
845  char *tab = strchr(line, '\t');
846  if (tab)
847    *tab = '\0';
848
849  SVN_ERR(grab_filename(&patch->old_filename, line + STRLEN_LITERAL("--- "),
850                        result_pool, scratch_pool));
851
852  *new_state = state_minus_seen;
853
854  return SVN_NO_ERROR;
855}
856
857/* Parse the '+++ ' line of a regular unidiff. */
858static svn_error_t *
859diff_plus(enum parse_state *new_state, char *line, svn_patch_t *patch,
860           apr_pool_t *result_pool, apr_pool_t *scratch_pool)
861{
862  /* If we can find a tab, it separates the filename from
863   * the rest of the line which we can discard. */
864  char *tab = strchr(line, '\t');
865  if (tab)
866    *tab = '\0';
867
868  SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("+++ "),
869                        result_pool, scratch_pool));
870
871  *new_state = state_unidiff_found;
872
873  return SVN_NO_ERROR;
874}
875
876/* Parse the first line of a git extended unidiff. */
877static svn_error_t *
878git_start(enum parse_state *new_state, char *line, svn_patch_t *patch,
879          apr_pool_t *result_pool, apr_pool_t *scratch_pool)
880{
881  const char *old_path_start;
882  char *old_path_end;
883  const char *new_path_start;
884  const char *new_path_end;
885  char *new_path_marker;
886  const char *old_path_marker;
887
888  /* ### Add handling of escaped paths
889   * http://www.kernel.org/pub/software/scm/git/docs/git-diff.html:
890   *
891   * TAB, LF, double quote and backslash characters in pathnames are
892   * represented as \t, \n, \" and \\, respectively. If there is need for
893   * such substitution then the whole pathname is put in double quotes.
894   */
895
896  /* Our line should look like this: 'diff --git a/path b/path'.
897   *
898   * If we find any deviations from that format, we return with state reset
899   * to start.
900   */
901  old_path_marker = strstr(line, " a/");
902
903  if (! old_path_marker)
904    {
905      *new_state = state_start;
906      return SVN_NO_ERROR;
907    }
908
909  if (! *(old_path_marker + 3))
910    {
911      *new_state = state_start;
912      return SVN_NO_ERROR;
913    }
914
915  new_path_marker = strstr(old_path_marker, " b/");
916
917  if (! new_path_marker)
918    {
919      *new_state = state_start;
920      return SVN_NO_ERROR;
921    }
922
923  if (! *(new_path_marker + 3))
924    {
925      *new_state = state_start;
926      return SVN_NO_ERROR;
927    }
928
929  /* By now, we know that we have a line on the form '--git diff a/.+ b/.+'
930   * We only need the filenames when we have deleted or added empty
931   * files. In those cases the old_path and new_path is identical on the
932   * 'diff --git' line.  For all other cases we fetch the filenames from
933   * other header lines. */
934  old_path_start = line + STRLEN_LITERAL("diff --git a/");
935  new_path_end = line + strlen(line);
936  new_path_start = old_path_start;
937
938  while (TRUE)
939    {
940      ptrdiff_t len_old;
941      ptrdiff_t len_new;
942
943      new_path_marker = strstr(new_path_start, " b/");
944
945      /* No new path marker, bail out. */
946      if (! new_path_marker)
947        break;
948
949      old_path_end = new_path_marker;
950      new_path_start = new_path_marker + STRLEN_LITERAL(" b/");
951
952      /* No path after the marker. */
953      if (! *new_path_start)
954        break;
955
956      len_old = old_path_end - old_path_start;
957      len_new = new_path_end - new_path_start;
958
959      /* Are the paths before and after the " b/" marker the same? */
960      if (len_old == len_new
961          && ! strncmp(old_path_start, new_path_start, len_old))
962        {
963          *old_path_end = '\0';
964          SVN_ERR(grab_filename(&patch->old_filename, old_path_start,
965                                result_pool, scratch_pool));
966
967          SVN_ERR(grab_filename(&patch->new_filename, new_path_start,
968                                result_pool, scratch_pool));
969          break;
970        }
971    }
972
973  /* We assume that the path is only modified until we've found a 'tree'
974   * header */
975  patch->operation = svn_diff_op_modified;
976
977  *new_state = state_git_diff_seen;
978  return SVN_NO_ERROR;
979}
980
981/* Parse the '--- ' line of a git extended unidiff. */
982static svn_error_t *
983git_minus(enum parse_state *new_state, char *line, svn_patch_t *patch,
984          apr_pool_t *result_pool, apr_pool_t *scratch_pool)
985{
986  /* If we can find a tab, it separates the filename from
987   * the rest of the line which we can discard. */
988  char *tab = strchr(line, '\t');
989  if (tab)
990    *tab = '\0';
991
992  if (starts_with(line, "--- /dev/null"))
993    SVN_ERR(grab_filename(&patch->old_filename, "/dev/null",
994                          result_pool, scratch_pool));
995  else
996    SVN_ERR(grab_filename(&patch->old_filename, line + STRLEN_LITERAL("--- a/"),
997                          result_pool, scratch_pool));
998
999  *new_state = state_git_minus_seen;
1000  return SVN_NO_ERROR;
1001}
1002
1003/* Parse the '+++ ' line of a git extended unidiff. */
1004static svn_error_t *
1005git_plus(enum parse_state *new_state, char *line, svn_patch_t *patch,
1006          apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1007{
1008  /* If we can find a tab, it separates the filename from
1009   * the rest of the line which we can discard. */
1010  char *tab = strchr(line, '\t');
1011  if (tab)
1012    *tab = '\0';
1013
1014  if (starts_with(line, "+++ /dev/null"))
1015    SVN_ERR(grab_filename(&patch->new_filename, "/dev/null",
1016                          result_pool, scratch_pool));
1017  else
1018    SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("+++ b/"),
1019                          result_pool, scratch_pool));
1020
1021  *new_state = state_git_header_found;
1022  return SVN_NO_ERROR;
1023}
1024
1025/* Parse the 'rename from ' line of a git extended unidiff. */
1026static svn_error_t *
1027git_move_from(enum parse_state *new_state, char *line, svn_patch_t *patch,
1028              apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1029{
1030  SVN_ERR(grab_filename(&patch->old_filename,
1031                        line + STRLEN_LITERAL("rename from "),
1032                        result_pool, scratch_pool));
1033
1034  *new_state = state_move_from_seen;
1035  return SVN_NO_ERROR;
1036}
1037
1038/* Parse the 'rename to ' line of a git extended unidiff. */
1039static svn_error_t *
1040git_move_to(enum parse_state *new_state, char *line, svn_patch_t *patch,
1041            apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1042{
1043  SVN_ERR(grab_filename(&patch->new_filename,
1044                        line + STRLEN_LITERAL("rename to "),
1045                        result_pool, scratch_pool));
1046
1047  patch->operation = svn_diff_op_moved;
1048
1049  *new_state = state_git_tree_seen;
1050  return SVN_NO_ERROR;
1051}
1052
1053/* Parse the 'copy from ' line of a git extended unidiff. */
1054static svn_error_t *
1055git_copy_from(enum parse_state *new_state, char *line, svn_patch_t *patch,
1056              apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1057{
1058  SVN_ERR(grab_filename(&patch->old_filename,
1059                        line + STRLEN_LITERAL("copy from "),
1060                        result_pool, scratch_pool));
1061
1062  *new_state = state_copy_from_seen;
1063  return SVN_NO_ERROR;
1064}
1065
1066/* Parse the 'copy to ' line of a git extended unidiff. */
1067static svn_error_t *
1068git_copy_to(enum parse_state *new_state, char *line, svn_patch_t *patch,
1069            apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1070{
1071  SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("copy to "),
1072                        result_pool, scratch_pool));
1073
1074  patch->operation = svn_diff_op_copied;
1075
1076  *new_state = state_git_tree_seen;
1077  return SVN_NO_ERROR;
1078}
1079
1080/* Parse the 'new file ' line of a git extended unidiff. */
1081static svn_error_t *
1082git_new_file(enum parse_state *new_state, char *line, svn_patch_t *patch,
1083             apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1084{
1085  patch->operation = svn_diff_op_added;
1086
1087  /* Filename already retrieved from diff --git header. */
1088
1089  *new_state = state_git_tree_seen;
1090  return SVN_NO_ERROR;
1091}
1092
1093/* Parse the 'deleted file ' line of a git extended unidiff. */
1094static svn_error_t *
1095git_deleted_file(enum parse_state *new_state, char *line, svn_patch_t *patch,
1096                 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1097{
1098  patch->operation = svn_diff_op_deleted;
1099
1100  /* Filename already retrieved from diff --git header. */
1101
1102  *new_state = state_git_tree_seen;
1103  return SVN_NO_ERROR;
1104}
1105
1106/* Add a HUNK associated with the property PROP_NAME to PATCH. */
1107static svn_error_t *
1108add_property_hunk(svn_patch_t *patch, const char *prop_name,
1109                  svn_diff_hunk_t *hunk, svn_diff_operation_kind_t operation,
1110                  apr_pool_t *result_pool)
1111{
1112  svn_prop_patch_t *prop_patch;
1113
1114  prop_patch = svn_hash_gets(patch->prop_patches, prop_name);
1115
1116  if (! prop_patch)
1117    {
1118      prop_patch = apr_palloc(result_pool, sizeof(svn_prop_patch_t));
1119      prop_patch->name = prop_name;
1120      prop_patch->operation = operation;
1121      prop_patch->hunks = apr_array_make(result_pool, 1,
1122                                         sizeof(svn_diff_hunk_t *));
1123
1124      svn_hash_sets(patch->prop_patches, prop_name, prop_patch);
1125    }
1126
1127  APR_ARRAY_PUSH(prop_patch->hunks, svn_diff_hunk_t *) = hunk;
1128
1129  return SVN_NO_ERROR;
1130}
1131
1132struct svn_patch_file_t
1133{
1134  /* The APR file handle to the patch file. */
1135  apr_file_t *apr_file;
1136
1137  /* The file offset at which the next patch is expected. */
1138  apr_off_t next_patch_offset;
1139};
1140
1141svn_error_t *
1142svn_diff_open_patch_file(svn_patch_file_t **patch_file,
1143                         const char *local_abspath,
1144                         apr_pool_t *result_pool)
1145{
1146  svn_patch_file_t *p;
1147
1148  p = apr_palloc(result_pool, sizeof(*p));
1149  SVN_ERR(svn_io_file_open(&p->apr_file, local_abspath,
1150                           APR_READ | APR_BUFFERED, APR_OS_DEFAULT,
1151                           result_pool));
1152  p->next_patch_offset = 0;
1153  *patch_file = p;
1154
1155  return SVN_NO_ERROR;
1156}
1157
1158/* Parse hunks from APR_FILE and store them in PATCH->HUNKS.
1159 * Parsing stops if no valid next hunk can be found.
1160 * If IGNORE_WHITESPACE is TRUE, lines without
1161 * leading spaces will be treated as context lines.
1162 * Allocate results in RESULT_POOL.
1163 * Use SCRATCH_POOL for temporary allocations. */
1164static svn_error_t *
1165parse_hunks(svn_patch_t *patch, apr_file_t *apr_file,
1166            svn_boolean_t ignore_whitespace,
1167            apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1168{
1169  svn_diff_hunk_t *hunk;
1170  svn_boolean_t is_property;
1171  const char *last_prop_name;
1172  const char *prop_name;
1173  svn_diff_operation_kind_t prop_operation;
1174  apr_pool_t *iterpool;
1175
1176  last_prop_name = NULL;
1177
1178  patch->hunks = apr_array_make(result_pool, 10, sizeof(svn_diff_hunk_t *));
1179  patch->prop_patches = apr_hash_make(result_pool);
1180  iterpool = svn_pool_create(scratch_pool);
1181  do
1182    {
1183      svn_pool_clear(iterpool);
1184
1185      SVN_ERR(parse_next_hunk(&hunk, &is_property, &prop_name, &prop_operation,
1186                              patch, apr_file, ignore_whitespace, result_pool,
1187                              iterpool));
1188
1189      if (hunk && is_property)
1190        {
1191          if (! prop_name)
1192            prop_name = last_prop_name;
1193          else
1194            last_prop_name = prop_name;
1195          SVN_ERR(add_property_hunk(patch, prop_name, hunk, prop_operation,
1196                                    result_pool));
1197        }
1198      else if (hunk)
1199        {
1200          APR_ARRAY_PUSH(patch->hunks, svn_diff_hunk_t *) = hunk;
1201          last_prop_name = NULL;
1202        }
1203
1204    }
1205  while (hunk);
1206  svn_pool_destroy(iterpool);
1207
1208  return SVN_NO_ERROR;
1209}
1210
1211/* State machine for the diff header parser.
1212 * Expected Input   Required state          Function to call */
1213static struct transition transitions[] =
1214{
1215  {"--- ",          state_start,            diff_minus},
1216  {"+++ ",          state_minus_seen,       diff_plus},
1217  {"diff --git",    state_start,            git_start},
1218  {"--- a/",        state_git_diff_seen,    git_minus},
1219  {"--- a/",        state_git_tree_seen,    git_minus},
1220  {"--- /dev/null", state_git_tree_seen,    git_minus},
1221  {"+++ b/",        state_git_minus_seen,   git_plus},
1222  {"+++ /dev/null", state_git_minus_seen,   git_plus},
1223  {"rename from ",  state_git_diff_seen,    git_move_from},
1224  {"rename to ",    state_move_from_seen,   git_move_to},
1225  {"copy from ",    state_git_diff_seen,    git_copy_from},
1226  {"copy to ",      state_copy_from_seen,   git_copy_to},
1227  {"new file ",     state_git_diff_seen,    git_new_file},
1228  {"deleted file ", state_git_diff_seen,    git_deleted_file},
1229};
1230
1231svn_error_t *
1232svn_diff_parse_next_patch(svn_patch_t **patch,
1233                          svn_patch_file_t *patch_file,
1234                          svn_boolean_t reverse,
1235                          svn_boolean_t ignore_whitespace,
1236                          apr_pool_t *result_pool,
1237                          apr_pool_t *scratch_pool)
1238{
1239  apr_off_t pos, last_line;
1240  svn_boolean_t eof;
1241  svn_boolean_t line_after_tree_header_read = FALSE;
1242  apr_pool_t *iterpool;
1243  enum parse_state state = state_start;
1244
1245  if (apr_file_eof(patch_file->apr_file) == APR_EOF)
1246    {
1247      /* No more patches here. */
1248      *patch = NULL;
1249      return SVN_NO_ERROR;
1250    }
1251
1252  *patch = apr_pcalloc(result_pool, sizeof(**patch));
1253
1254  pos = patch_file->next_patch_offset;
1255  SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &pos, scratch_pool));
1256
1257  iterpool = svn_pool_create(scratch_pool);
1258  do
1259    {
1260      svn_stringbuf_t *line;
1261      svn_boolean_t valid_header_line = FALSE;
1262      int i;
1263
1264      svn_pool_clear(iterpool);
1265
1266      /* Remember the current line's offset, and read the line. */
1267      last_line = pos;
1268      SVN_ERR(svn_io_file_readline(patch_file->apr_file, &line, NULL, &eof,
1269                                   APR_SIZE_MAX, iterpool, iterpool));
1270
1271      if (! eof)
1272        {
1273          /* Update line offset for next iteration. */
1274          pos = 0;
1275          SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_CUR, &pos,
1276                                   iterpool));
1277        }
1278
1279      /* Run the state machine. */
1280      for (i = 0; i < (sizeof(transitions) / sizeof(transitions[0])); i++)
1281        {
1282          if (starts_with(line->data, transitions[i].expected_input)
1283              && state == transitions[i].required_state)
1284            {
1285              SVN_ERR(transitions[i].fn(&state, line->data, *patch,
1286                                        result_pool, iterpool));
1287              valid_header_line = TRUE;
1288              break;
1289            }
1290        }
1291
1292      if (state == state_unidiff_found || state == state_git_header_found)
1293        {
1294          /* We have a valid diff header, yay! */
1295          break;
1296        }
1297      else if (state == state_git_tree_seen && line_after_tree_header_read)
1298        {
1299          /* git patches can contain an index line after the file mode line */
1300          if (!starts_with(line->data, "index "))
1301          {
1302            /* We have a valid diff header for a patch with only tree changes.
1303             * Rewind to the start of the line just read, so subsequent calls
1304             * to this function don't end up skipping the line -- it may
1305             * contain a patch. */
1306            SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &last_line,
1307                    scratch_pool));
1308            break;
1309          }
1310        }
1311      else if (state == state_git_tree_seen)
1312        {
1313          line_after_tree_header_read = TRUE;
1314        }
1315      else if (! valid_header_line && state != state_start
1316               && state != state_git_diff_seen
1317               && !starts_with(line->data, "index "))
1318        {
1319          /* We've encountered an invalid diff header.
1320           *
1321           * Rewind to the start of the line just read - it may be a new
1322           * header that begins there. */
1323          SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &last_line,
1324                                   scratch_pool));
1325          state = state_start;
1326        }
1327
1328    }
1329  while (! eof);
1330
1331  (*patch)->reverse = reverse;
1332  if (reverse)
1333    {
1334      const char *temp;
1335      temp = (*patch)->old_filename;
1336      (*patch)->old_filename = (*patch)->new_filename;
1337      (*patch)->new_filename = temp;
1338    }
1339
1340  if ((*patch)->old_filename == NULL || (*patch)->new_filename == NULL)
1341    {
1342      /* Something went wrong, just discard the result. */
1343      *patch = NULL;
1344    }
1345  else
1346    SVN_ERR(parse_hunks(*patch, patch_file->apr_file, ignore_whitespace,
1347                        result_pool, iterpool));
1348
1349  svn_pool_destroy(iterpool);
1350
1351  patch_file->next_patch_offset = 0;
1352  SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_CUR,
1353                           &patch_file->next_patch_offset, scratch_pool));
1354
1355  if (*patch)
1356    {
1357      /* Usually, hunks appear in the patch sorted by their original line
1358       * offset. But just in case they weren't parsed in this order for
1359       * some reason, we sort them so that our caller can assume that hunks
1360       * are sorted as if parsed from a usual patch. */
1361      qsort((*patch)->hunks->elts, (*patch)->hunks->nelts,
1362            (*patch)->hunks->elt_size, compare_hunks);
1363    }
1364
1365  return SVN_NO_ERROR;
1366}
1367
1368svn_error_t *
1369svn_diff_close_patch_file(svn_patch_file_t *patch_file,
1370                          apr_pool_t *scratch_pool)
1371{
1372  return svn_error_trace(svn_io_file_close(patch_file->apr_file,
1373                                           scratch_pool));
1374}
1375