1/* Analyze file differences for GNU DIFF.
2
3   Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002
4   Free Software Foundation, Inc.
5
6   This file is part of GNU DIFF.
7
8   GNU DIFF is free software; you can redistribute it and/or modify
9   it under the terms of the GNU General Public License as published by
10   the Free Software Foundation; either version 2, or (at your option)
11   any later version.
12
13   GNU DIFF is distributed in the hope that it will be useful,
14   but WITHOUT ANY WARRANTY; without even the implied warranty of
15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16   GNU General Public License for more details.
17
18   You should have received a copy of the GNU General Public License
19   along with this program; see the file COPYING.
20   If not, write to the Free Software Foundation,
21   59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
22
23/* The basic algorithm is described in:
24   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
25   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
26   see especially section 4.2, which describes the variation used below.
27   Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
28   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
29   at the price of producing suboptimal output for large inputs with
30   many differences.
31
32   The basic algorithm was independently discovered as described in:
33   "Algorithms for Approximate String Matching", E. Ukkonen,
34   Information and Control Vol. 64, 1985, pp. 100-118.  */
35
36#include "diff.h"
37#include <cmpbuf.h>
38#include <error.h>
39#include <regex.h>
40#include <xalloc.h>
41
42static lin *xvec, *yvec;	/* Vectors being compared. */
43static lin *fdiag;		/* Vector, indexed by diagonal, containing
44				   1 + the X coordinate of the point furthest
45				   along the given diagonal in the forward
46				   search of the edit matrix. */
47static lin *bdiag;		/* Vector, indexed by diagonal, containing
48				   the X coordinate of the point furthest
49				   along the given diagonal in the backward
50				   search of the edit matrix. */
51static lin too_expensive;	/* Edit scripts longer than this are too
52				   expensive to compute.  */
53
54#define SNAKE_LIMIT 20	/* Snakes bigger than this are considered `big'.  */
55
56struct partition
57{
58  lin xmid, ymid;	/* Midpoints of this partition.  */
59  bool lo_minimal;	/* Nonzero if low half will be analyzed minimally.  */
60  bool hi_minimal;	/* Likewise for high half.  */
61};
62
63/* Find the midpoint of the shortest edit script for a specified
64   portion of the two files.
65
66   Scan from the beginnings of the files, and simultaneously from the ends,
67   doing a breadth-first search through the space of edit-sequence.
68   When the two searches meet, we have found the midpoint of the shortest
69   edit sequence.
70
71   If FIND_MINIMAL is nonzero, find the minimal edit script regardless
72   of expense.  Otherwise, if the search is too expensive, use
73   heuristics to stop the search and report a suboptimal answer.
74
75   Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
76   XMID - YMID equals the number of inserted lines minus the number
77   of deleted lines (counting only lines before the midpoint).
78   Return the approximate edit cost; this is the total number of
79   lines inserted or deleted (counting only lines before the midpoint),
80   unless a heuristic is used to terminate the search prematurely.
81
82   Set PART->lo_minimal to true iff the minimal edit script for the
83   left half of the partition is known; similarly for PART->hi_minimal.
84
85   This function assumes that the first lines of the specified portions
86   of the two files do not match, and likewise that the last lines do not
87   match.  The caller must trim matching lines from the beginning and end
88   of the portions it is going to specify.
89
90   If we return the "wrong" partitions,
91   the worst this can do is cause suboptimal diff output.
92   It cannot cause incorrect diff output.  */
93
94static lin
95diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal,
96      struct partition *part)
97{
98  lin *const fd = fdiag;	/* Give the compiler a chance. */
99  lin *const bd = bdiag;	/* Additional help for the compiler. */
100  lin const *const xv = xvec;	/* Still more help for the compiler. */
101  lin const *const yv = yvec;	/* And more and more . . . */
102  lin const dmin = xoff - ylim;	/* Minimum valid diagonal. */
103  lin const dmax = xlim - yoff;	/* Maximum valid diagonal. */
104  lin const fmid = xoff - yoff;	/* Center diagonal of top-down search. */
105  lin const bmid = xlim - ylim;	/* Center diagonal of bottom-up search. */
106  lin fmin = fmid, fmax = fmid;	/* Limits of top-down search. */
107  lin bmin = bmid, bmax = bmid;	/* Limits of bottom-up search. */
108  lin c;			/* Cost. */
109  bool odd = (fmid - bmid) & 1;	/* True if southeast corner is on an odd
110				   diagonal with respect to the northwest. */
111
112  fd[fmid] = xoff;
113  bd[bmid] = xlim;
114
115  for (c = 1;; ++c)
116    {
117      lin d;			/* Active diagonal. */
118      bool big_snake = 0;
119
120      /* Extend the top-down search by an edit step in each diagonal. */
121      fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
122      fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
123      for (d = fmax; d >= fmin; d -= 2)
124	{
125	  lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
126
127	  if (tlo >= thi)
128	    x = tlo + 1;
129	  else
130	    x = thi;
131	  oldx = x;
132	  y = x - d;
133	  while (x < xlim && y < ylim && xv[x] == yv[y])
134	    ++x, ++y;
135	  if (x - oldx > SNAKE_LIMIT)
136	    big_snake = 1;
137	  fd[d] = x;
138	  if (odd && bmin <= d && d <= bmax && bd[d] <= x)
139	    {
140	      part->xmid = x;
141	      part->ymid = y;
142	      part->lo_minimal = part->hi_minimal = 1;
143	      return 2 * c - 1;
144	    }
145	}
146
147      /* Similarly extend the bottom-up search.  */
148      bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin;
149      bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax;
150      for (d = bmax; d >= bmin; d -= 2)
151	{
152	  lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
153
154	  if (tlo < thi)
155	    x = tlo;
156	  else
157	    x = thi - 1;
158	  oldx = x;
159	  y = x - d;
160	  while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
161	    --x, --y;
162	  if (oldx - x > SNAKE_LIMIT)
163	    big_snake = 1;
164	  bd[d] = x;
165	  if (!odd && fmin <= d && d <= fmax && x <= fd[d])
166	    {
167	      part->xmid = x;
168	      part->ymid = y;
169	      part->lo_minimal = part->hi_minimal = 1;
170	      return 2 * c;
171	    }
172	}
173
174      if (find_minimal)
175	continue;
176
177      /* Heuristic: check occasionally for a diagonal that has made
178	 lots of progress compared with the edit distance.
179	 If we have any such, find the one that has made the most
180	 progress and return it as if it had succeeded.
181
182	 With this heuristic, for files with a constant small density
183	 of changes, the algorithm is linear in the file size.  */
184
185      if (200 < c && big_snake && speed_large_files)
186	{
187	  lin best;
188
189	  best = 0;
190	  for (d = fmax; d >= fmin; d -= 2)
191	    {
192	      lin dd = d - fmid;
193	      lin x = fd[d];
194	      lin y = x - d;
195	      lin v = (x - xoff) * 2 - dd;
196	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
197		{
198		  if (v > best
199		      && xoff + SNAKE_LIMIT <= x && x < xlim
200		      && yoff + SNAKE_LIMIT <= y && y < ylim)
201		    {
202		      /* We have a good enough best diagonal;
203			 now insist that it end with a significant snake.  */
204		      int k;
205
206		      for (k = 1; xv[x - k] == yv[y - k]; k++)
207			if (k == SNAKE_LIMIT)
208			  {
209			    best = v;
210			    part->xmid = x;
211			    part->ymid = y;
212			    break;
213			  }
214		    }
215		}
216	    }
217	  if (best > 0)
218	    {
219	      part->lo_minimal = 1;
220	      part->hi_minimal = 0;
221	      return 2 * c - 1;
222	    }
223
224	  best = 0;
225	  for (d = bmax; d >= bmin; d -= 2)
226	    {
227	      lin dd = d - bmid;
228	      lin x = bd[d];
229	      lin y = x - d;
230	      lin v = (xlim - x) * 2 + dd;
231	      if (v > 12 * (c + (dd < 0 ? -dd : dd)))
232		{
233		  if (v > best
234		      && xoff < x && x <= xlim - SNAKE_LIMIT
235		      && yoff < y && y <= ylim - SNAKE_LIMIT)
236		    {
237		      /* We have a good enough best diagonal;
238			 now insist that it end with a significant snake.  */
239		      int k;
240
241		      for (k = 0; xv[x + k] == yv[y + k]; k++)
242			if (k == SNAKE_LIMIT - 1)
243			  {
244			    best = v;
245			    part->xmid = x;
246			    part->ymid = y;
247			    break;
248			  }
249		    }
250		}
251	    }
252	  if (best > 0)
253	    {
254	      part->lo_minimal = 0;
255	      part->hi_minimal = 1;
256	      return 2 * c - 1;
257	    }
258	}
259
260      /* Heuristic: if we've gone well beyond the call of duty,
261	 give up and report halfway between our best results so far.  */
262      if (c >= too_expensive)
263	{
264	  lin fxybest, fxbest;
265	  lin bxybest, bxbest;
266
267	  fxbest = bxbest = 0;  /* Pacify `gcc -Wall'.  */
268
269	  /* Find forward diagonal that maximizes X + Y.  */
270	  fxybest = -1;
271	  for (d = fmax; d >= fmin; d -= 2)
272	    {
273	      lin x = MIN (fd[d], xlim);
274	      lin y = x - d;
275	      if (ylim < y)
276		x = ylim + d, y = ylim;
277	      if (fxybest < x + y)
278		{
279		  fxybest = x + y;
280		  fxbest = x;
281		}
282	    }
283
284	  /* Find backward diagonal that minimizes X + Y.  */
285	  bxybest = LIN_MAX;
286	  for (d = bmax; d >= bmin; d -= 2)
287	    {
288	      lin x = MAX (xoff, bd[d]);
289	      lin y = x - d;
290	      if (y < yoff)
291		x = yoff + d, y = yoff;
292	      if (x + y < bxybest)
293		{
294		  bxybest = x + y;
295		  bxbest = x;
296		}
297	    }
298
299	  /* Use the better of the two diagonals.  */
300	  if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
301	    {
302	      part->xmid = fxbest;
303	      part->ymid = fxybest - fxbest;
304	      part->lo_minimal = 1;
305	      part->hi_minimal = 0;
306	    }
307	  else
308	    {
309	      part->xmid = bxbest;
310	      part->ymid = bxybest - bxbest;
311	      part->lo_minimal = 0;
312	      part->hi_minimal = 1;
313	    }
314	  return 2 * c - 1;
315	}
316    }
317}
318
319/* Compare in detail contiguous subsequences of the two files
320   which are known, as a whole, to match each other.
321
322   The results are recorded in the vectors files[N].changed, by
323   storing 1 in the element for each line that is an insertion or deletion.
324
325   The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
326
327   Note that XLIM, YLIM are exclusive bounds.
328   All line numbers are origin-0 and discarded lines are not counted.
329
330   If FIND_MINIMAL, find a minimal difference no matter how
331   expensive it is.  */
332
333static void
334compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal)
335{
336  lin * const xv = xvec; /* Help the compiler.  */
337  lin * const yv = yvec;
338
339  /* Slide down the bottom initial diagonal. */
340  while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
341    ++xoff, ++yoff;
342  /* Slide up the top initial diagonal. */
343  while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
344    --xlim, --ylim;
345
346  /* Handle simple cases. */
347  if (xoff == xlim)
348    while (yoff < ylim)
349      files[1].changed[files[1].realindexes[yoff++]] = 1;
350  else if (yoff == ylim)
351    while (xoff < xlim)
352      files[0].changed[files[0].realindexes[xoff++]] = 1;
353  else
354    {
355      lin c;
356      struct partition part;
357
358      /* Find a point of correspondence in the middle of the files.  */
359
360      c = diag (xoff, xlim, yoff, ylim, find_minimal, &part);
361
362      if (c == 1)
363	{
364	  /* This should be impossible, because it implies that
365	     one of the two subsequences is empty,
366	     and that case was handled above without calling `diag'.
367	     Let's verify that this is true.  */
368	  abort ();
369#if 0
370	  /* The two subsequences differ by a single insert or delete;
371	     record it and we are done.  */
372	  if (part.xmid - part.ymid < xoff - yoff)
373	    files[1].changed[files[1].realindexes[part.ymid - 1]] = 1;
374	  else
375	    files[0].changed[files[0].realindexes[part.xmid]] = 1;
376#endif
377	}
378      else
379	{
380	  /* Use the partitions to split this problem into subproblems.  */
381	  compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
382	  compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
383	}
384    }
385}
386
387/* Discard lines from one file that have no matches in the other file.
388
389   A line which is discarded will not be considered by the actual
390   comparison algorithm; it will be as if that line were not in the file.
391   The file's `realindexes' table maps virtual line numbers
392   (which don't count the discarded lines) into real line numbers;
393   this is how the actual comparison algorithm produces results
394   that are comprehensible when the discarded lines are counted.
395
396   When we discard a line, we also mark it as a deletion or insertion
397   so that it will be printed in the output.  */
398
399static void
400discard_confusing_lines (struct file_data filevec[])
401{
402  int f;
403  lin i;
404  char *discarded[2];
405  lin *equiv_count[2];
406  lin *p;
407
408  /* Allocate our results.  */
409  p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines)
410	       * (2 * sizeof *p));
411  for (f = 0; f < 2; f++)
412    {
413      filevec[f].undiscarded = p;  p += filevec[f].buffered_lines;
414      filevec[f].realindexes = p;  p += filevec[f].buffered_lines;
415    }
416
417  /* Set up equiv_count[F][I] as the number of lines in file F
418     that fall in equivalence class I.  */
419
420  p = zalloc (filevec[0].equiv_max * (2 * sizeof *p));
421  equiv_count[0] = p;
422  equiv_count[1] = p + filevec[0].equiv_max;
423
424  for (i = 0; i < filevec[0].buffered_lines; ++i)
425    ++equiv_count[0][filevec[0].equivs[i]];
426  for (i = 0; i < filevec[1].buffered_lines; ++i)
427    ++equiv_count[1][filevec[1].equivs[i]];
428
429  /* Set up tables of which lines are going to be discarded.  */
430
431  discarded[0] = zalloc (filevec[0].buffered_lines
432			 + filevec[1].buffered_lines);
433  discarded[1] = discarded[0] + filevec[0].buffered_lines;
434
435  /* Mark to be discarded each line that matches no line of the other file.
436     If a line matches many lines, mark it as provisionally discardable.  */
437
438  for (f = 0; f < 2; f++)
439    {
440      size_t end = filevec[f].buffered_lines;
441      char *discards = discarded[f];
442      lin *counts = equiv_count[1 - f];
443      lin *equivs = filevec[f].equivs;
444      size_t many = 5;
445      size_t tem = end / 64;
446
447      /* Multiply MANY by approximate square root of number of lines.
448	 That is the threshold for provisionally discardable lines.  */
449      while ((tem = tem >> 2) > 0)
450	many *= 2;
451
452      for (i = 0; i < end; i++)
453	{
454	  lin nmatch;
455	  if (equivs[i] == 0)
456	    continue;
457	  nmatch = counts[equivs[i]];
458	  if (nmatch == 0)
459	    discards[i] = 1;
460	  else if (nmatch > many)
461	    discards[i] = 2;
462	}
463    }
464
465  /* Don't really discard the provisional lines except when they occur
466     in a run of discardables, with nonprovisionals at the beginning
467     and end.  */
468
469  for (f = 0; f < 2; f++)
470    {
471      lin end = filevec[f].buffered_lines;
472      register char *discards = discarded[f];
473
474      for (i = 0; i < end; i++)
475	{
476	  /* Cancel provisional discards not in middle of run of discards.  */
477	  if (discards[i] == 2)
478	    discards[i] = 0;
479	  else if (discards[i] != 0)
480	    {
481	      /* We have found a nonprovisional discard.  */
482	      register lin j;
483	      lin length;
484	      lin provisional = 0;
485
486	      /* Find end of this run of discardable lines.
487		 Count how many are provisionally discardable.  */
488	      for (j = i; j < end; j++)
489		{
490		  if (discards[j] == 0)
491		    break;
492		  if (discards[j] == 2)
493		    ++provisional;
494		}
495
496	      /* Cancel provisional discards at end, and shrink the run.  */
497	      while (j > i && discards[j - 1] == 2)
498		discards[--j] = 0, --provisional;
499
500	      /* Now we have the length of a run of discardable lines
501		 whose first and last are not provisional.  */
502	      length = j - i;
503
504	      /* If 1/4 of the lines in the run are provisional,
505		 cancel discarding of all provisional lines in the run.  */
506	      if (provisional * 4 > length)
507		{
508		  while (j > i)
509		    if (discards[--j] == 2)
510		      discards[j] = 0;
511		}
512	      else
513		{
514		  register lin consec;
515		  lin minimum = 1;
516		  lin tem = length >> 2;
517
518		  /* MINIMUM is approximate square root of LENGTH/4.
519		     A subrun of two or more provisionals can stand
520		     when LENGTH is at least 16.
521		     A subrun of 4 or more can stand when LENGTH >= 64.  */
522		  while (0 < (tem >>= 2))
523		    minimum <<= 1;
524		  minimum++;
525
526		  /* Cancel any subrun of MINIMUM or more provisionals
527		     within the larger run.  */
528		  for (j = 0, consec = 0; j < length; j++)
529		    if (discards[i + j] != 2)
530		      consec = 0;
531		    else if (minimum == ++consec)
532		      /* Back up to start of subrun, to cancel it all.  */
533		      j -= consec;
534		    else if (minimum < consec)
535		      discards[i + j] = 0;
536
537		  /* Scan from beginning of run
538		     until we find 3 or more nonprovisionals in a row
539		     or until the first nonprovisional at least 8 lines in.
540		     Until that point, cancel any provisionals.  */
541		  for (j = 0, consec = 0; j < length; j++)
542		    {
543		      if (j >= 8 && discards[i + j] == 1)
544			break;
545		      if (discards[i + j] == 2)
546			consec = 0, discards[i + j] = 0;
547		      else if (discards[i + j] == 0)
548			consec = 0;
549		      else
550			consec++;
551		      if (consec == 3)
552			break;
553		    }
554
555		  /* I advances to the last line of the run.  */
556		  i += length - 1;
557
558		  /* Same thing, from end.  */
559		  for (j = 0, consec = 0; j < length; j++)
560		    {
561		      if (j >= 8 && discards[i - j] == 1)
562			break;
563		      if (discards[i - j] == 2)
564			consec = 0, discards[i - j] = 0;
565		      else if (discards[i - j] == 0)
566			consec = 0;
567		      else
568			consec++;
569		      if (consec == 3)
570			break;
571		    }
572		}
573	    }
574	}
575    }
576
577  /* Actually discard the lines. */
578  for (f = 0; f < 2; f++)
579    {
580      char *discards = discarded[f];
581      lin end = filevec[f].buffered_lines;
582      lin j = 0;
583      for (i = 0; i < end; ++i)
584	if (minimal || discards[i] == 0)
585	  {
586	    filevec[f].undiscarded[j] = filevec[f].equivs[i];
587	    filevec[f].realindexes[j++] = i;
588	  }
589	else
590	  filevec[f].changed[i] = 1;
591      filevec[f].nondiscarded_lines = j;
592    }
593
594  free (discarded[0]);
595  free (equiv_count[0]);
596}
597
598/* Adjust inserts/deletes of identical lines to join changes
599   as much as possible.
600
601   We do something when a run of changed lines include a
602   line at one end and have an excluded, identical line at the other.
603   We are free to choose which identical line is included.
604   `compareseq' usually chooses the one at the beginning,
605   but usually it is cleaner to consider the following identical line
606   to be the "change".  */
607
608static void
609shift_boundaries (struct file_data filevec[])
610{
611  int f;
612
613  for (f = 0; f < 2; f++)
614    {
615      bool *changed = filevec[f].changed;
616      bool const *other_changed = filevec[1 - f].changed;
617      lin const *equivs = filevec[f].equivs;
618      lin i = 0;
619      lin j = 0;
620      lin i_end = filevec[f].buffered_lines;
621
622      while (1)
623	{
624	  lin runlength, start, corresponding;
625
626	  /* Scan forwards to find beginning of another run of changes.
627	     Also keep track of the corresponding point in the other file.  */
628
629	  while (i < i_end && !changed[i])
630	    {
631	      while (other_changed[j++])
632		continue;
633	      i++;
634	    }
635
636	  if (i == i_end)
637	    break;
638
639	  start = i;
640
641	  /* Find the end of this run of changes.  */
642
643	  while (changed[++i])
644	    continue;
645	  while (other_changed[j])
646	    j++;
647
648	  do
649	    {
650	      /* Record the length of this run of changes, so that
651		 we can later determine whether the run has grown.  */
652	      runlength = i - start;
653
654	      /* Move the changed region back, so long as the
655		 previous unchanged line matches the last changed one.
656		 This merges with previous changed regions.  */
657
658	      while (start && equivs[start - 1] == equivs[i - 1])
659		{
660		  changed[--start] = 1;
661		  changed[--i] = 0;
662		  while (changed[start - 1])
663		    start--;
664		  while (other_changed[--j])
665		    continue;
666		}
667
668	      /* Set CORRESPONDING to the end of the changed run, at the last
669		 point where it corresponds to a changed run in the other file.
670		 CORRESPONDING == I_END means no such point has been found.  */
671	      corresponding = other_changed[j - 1] ? i : i_end;
672
673	      /* Move the changed region forward, so long as the
674		 first changed line matches the following unchanged one.
675		 This merges with following changed regions.
676		 Do this second, so that if there are no merges,
677		 the changed region is moved forward as far as possible.  */
678
679	      while (i != i_end && equivs[start] == equivs[i])
680		{
681		  changed[start++] = 0;
682		  changed[i++] = 1;
683		  while (changed[i])
684		    i++;
685		  while (other_changed[++j])
686		    corresponding = i;
687		}
688	    }
689	  while (runlength != i - start);
690
691	  /* If possible, move the fully-merged run of changes
692	     back to a corresponding run in the other file.  */
693
694	  while (corresponding < i)
695	    {
696	      changed[--start] = 1;
697	      changed[--i] = 0;
698	      while (other_changed[--j])
699		continue;
700	    }
701	}
702    }
703}
704
705/* Cons an additional entry onto the front of an edit script OLD.
706   LINE0 and LINE1 are the first affected lines in the two files (origin 0).
707   DELETED is the number of lines deleted here from file 0.
708   INSERTED is the number of lines inserted here in file 1.
709
710   If DELETED is 0 then LINE0 is the number of the line before
711   which the insertion was done; vice versa for INSERTED and LINE1.  */
712
713static struct change *
714add_change (lin line0, lin line1, lin deleted, lin inserted,
715	    struct change *old)
716{
717  struct change *new = xmalloc (sizeof *new);
718
719  new->line0 = line0;
720  new->line1 = line1;
721  new->inserted = inserted;
722  new->deleted = deleted;
723  new->link = old;
724  return new;
725}
726
727/* Scan the tables of which lines are inserted and deleted,
728   producing an edit script in reverse order.  */
729
730static struct change *
731build_reverse_script (struct file_data const filevec[])
732{
733  struct change *script = 0;
734  bool *changed0 = filevec[0].changed;
735  bool *changed1 = filevec[1].changed;
736  lin len0 = filevec[0].buffered_lines;
737  lin len1 = filevec[1].buffered_lines;
738
739  /* Note that changedN[len0] does exist, and is 0.  */
740
741  lin i0 = 0, i1 = 0;
742
743  while (i0 < len0 || i1 < len1)
744    {
745      if (changed0[i0] | changed1[i1])
746	{
747	  lin line0 = i0, line1 = i1;
748
749	  /* Find # lines changed here in each file.  */
750	  while (changed0[i0]) ++i0;
751	  while (changed1[i1]) ++i1;
752
753	  /* Record this change.  */
754	  script = add_change (line0, line1, i0 - line0, i1 - line1, script);
755	}
756
757      /* We have reached lines in the two files that match each other.  */
758      i0++, i1++;
759    }
760
761  return script;
762}
763
764/* Scan the tables of which lines are inserted and deleted,
765   producing an edit script in forward order.  */
766
767static struct change *
768build_script (struct file_data const filevec[])
769{
770  struct change *script = 0;
771  bool *changed0 = filevec[0].changed;
772  bool *changed1 = filevec[1].changed;
773  lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
774
775  /* Note that changedN[-1] does exist, and is 0.  */
776
777  while (i0 >= 0 || i1 >= 0)
778    {
779      if (changed0[i0 - 1] | changed1[i1 - 1])
780	{
781	  lin line0 = i0, line1 = i1;
782
783	  /* Find # lines changed here in each file.  */
784	  while (changed0[i0 - 1]) --i0;
785	  while (changed1[i1 - 1]) --i1;
786
787	  /* Record this change.  */
788	  script = add_change (i0, i1, line0 - i0, line1 - i1, script);
789	}
790
791      /* We have reached lines in the two files that match each other.  */
792      i0--, i1--;
793    }
794
795  return script;
796}
797
798/* If CHANGES, briefly report that two files differed.
799   Return 2 if trouble, CHANGES otherwise.  */
800static int
801briefly_report (int changes, struct file_data const filevec[])
802{
803  if (changes)
804    {
805      char const *label0 = file_label[0] ? file_label[0] : filevec[0].name;
806      char const *label1 = file_label[1] ? file_label[1] : filevec[1].name;
807
808      if (brief)
809	message ("Files %s and %s differ\n", label0, label1);
810      else
811	{
812	  message ("Binary files %s and %s differ\n", label0, label1);
813	  changes = 2;
814	}
815    }
816
817  return changes;
818}
819
820/* Report the differences of two files.  */
821int
822diff_2_files (struct comparison *cmp)
823{
824  lin diags;
825  int f;
826  struct change *e, *p;
827  struct change *script;
828  int changes;
829
830
831  /* If we have detected that either file is binary,
832     compare the two files as binary.  This can happen
833     only when the first chunk is read.
834     Also, --brief without any --ignore-* options means
835     we can speed things up by treating the files as binary.  */
836
837  if (read_files (cmp->file, files_can_be_treated_as_binary))
838    {
839      /* Files with different lengths must be different.  */
840      if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
841	  && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
842	  && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
843	changes = 1;
844
845      /* Standard input equals itself.  */
846      else if (cmp->file[0].desc == cmp->file[1].desc)
847	changes = 0;
848
849      else
850	/* Scan both files, a buffer at a time, looking for a difference.  */
851	{
852	  /* Allocate same-sized buffers for both files.  */
853	  size_t lcm_max = PTRDIFF_MAX - 1;
854	  size_t buffer_size =
855	    buffer_lcm (sizeof (word),
856			buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat),
857				    STAT_BLOCKSIZE (cmp->file[1].stat),
858				    lcm_max),
859			lcm_max);
860	  for (f = 0; f < 2; f++)
861	    cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size);
862
863	  for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0)
864	    {
865	      /* Read a buffer's worth from both files.  */
866	      for (f = 0; f < 2; f++)
867		if (0 <= cmp->file[f].desc)
868		  file_block_read (&cmp->file[f],
869				   buffer_size - cmp->file[f].buffered);
870
871	      /* If the buffers differ, the files differ.  */
872	      if (cmp->file[0].buffered != cmp->file[1].buffered
873		  || memcmp (cmp->file[0].buffer,
874			     cmp->file[1].buffer,
875			     cmp->file[0].buffered))
876		{
877		  changes = 1;
878		  break;
879		}
880
881	      /* If we reach end of file, the files are the same.  */
882	      if (cmp->file[0].buffered != buffer_size)
883		{
884		  changes = 0;
885		  break;
886		}
887	    }
888	}
889
890      changes = briefly_report (changes, cmp->file);
891    }
892  else
893    {
894      /* Allocate vectors for the results of comparison:
895	 a flag for each line of each file, saying whether that line
896	 is an insertion or deletion.
897	 Allocate an extra element, always 0, at each end of each vector.  */
898
899      size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
900      bool *flag_space = zalloc (s * sizeof *flag_space);
901      cmp->file[0].changed = flag_space + 1;
902      cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
903
904      /* Some lines are obviously insertions or deletions
905	 because they don't match anything.  Detect them now, and
906	 avoid even thinking about them in the main comparison algorithm.  */
907
908      discard_confusing_lines (cmp->file);
909
910      /* Now do the main comparison algorithm, considering just the
911	 undiscarded lines.  */
912
913      xvec = cmp->file[0].undiscarded;
914      yvec = cmp->file[1].undiscarded;
915      diags = (cmp->file[0].nondiscarded_lines
916	       + cmp->file[1].nondiscarded_lines + 3);
917      fdiag = xmalloc (diags * (2 * sizeof *fdiag));
918      bdiag = fdiag + diags;
919      fdiag += cmp->file[1].nondiscarded_lines + 1;
920      bdiag += cmp->file[1].nondiscarded_lines + 1;
921
922      /* Set TOO_EXPENSIVE to be approximate square root of input size,
923	 bounded below by 256.  */
924      too_expensive = 1;
925      for (;  diags != 0;  diags >>= 2)
926	too_expensive <<= 1;
927      too_expensive = MAX (256, too_expensive);
928
929      files[0] = cmp->file[0];
930      files[1] = cmp->file[1];
931
932      compareseq (0, cmp->file[0].nondiscarded_lines,
933		  0, cmp->file[1].nondiscarded_lines, minimal);
934
935      free (fdiag - (cmp->file[1].nondiscarded_lines + 1));
936
937      /* Modify the results slightly to make them prettier
938	 in cases where that can validly be done.  */
939
940      shift_boundaries (cmp->file);
941
942      /* Get the results of comparison in the form of a chain
943	 of `struct change's -- an edit script.  */
944
945      if (output_style == OUTPUT_ED)
946	script = build_reverse_script (cmp->file);
947      else
948	script = build_script (cmp->file);
949
950      /* Set CHANGES if we had any diffs.
951	 If some changes are ignored, we must scan the script to decide.  */
952      if (ignore_blank_lines || ignore_regexp.fastmap)
953	{
954	  struct change *next = script;
955	  changes = 0;
956
957	  while (next && changes == 0)
958	    {
959	      struct change *this, *end;
960	      lin first0, last0, first1, last1;
961
962	      /* Find a set of changes that belong together.  */
963	      this = next;
964	      end = find_change (next);
965
966	      /* Disconnect them from the rest of the changes, making them
967		 a hunk, and remember the rest for next iteration.  */
968	      next = end->link;
969	      end->link = 0;
970
971	      /* Determine whether this hunk is really a difference.  */
972	      if (analyze_hunk (this, &first0, &last0, &first1, &last1))
973		changes = 1;
974
975	      /* Reconnect the script so it will all be freed properly.  */
976	      end->link = next;
977	    }
978	}
979      else
980	changes = (script != 0);
981
982      if (brief)
983	changes = briefly_report (changes, cmp->file);
984      else
985	{
986	  if (changes | !no_diff_means_no_output)
987	    {
988	      /* Record info for starting up output,
989		 to be used if and when we have some output to print.  */
990	      setup_output (file_label[0] ? file_label[0] : cmp->file[0].name,
991			    file_label[1] ? file_label[1] : cmp->file[1].name,
992			    cmp->parent != 0);
993
994	      switch (output_style)
995		{
996		case OUTPUT_CONTEXT:
997		  print_context_script (script, 0);
998		  break;
999
1000		case OUTPUT_UNIFIED:
1001		  print_context_script (script, 1);
1002		  break;
1003
1004		case OUTPUT_ED:
1005		  print_ed_script (script);
1006		  break;
1007
1008		case OUTPUT_FORWARD_ED:
1009		  pr_forward_ed_script (script);
1010		  break;
1011
1012		case OUTPUT_RCS:
1013		  print_rcs_script (script);
1014		  break;
1015
1016		case OUTPUT_NORMAL:
1017		  print_normal_script (script);
1018		  break;
1019
1020		case OUTPUT_IFDEF:
1021		  print_ifdef_script (script);
1022		  break;
1023
1024		case OUTPUT_SDIFF:
1025		  print_sdiff_script (script);
1026		  break;
1027
1028		default:
1029		  abort ();
1030		}
1031
1032	      finish_output ();
1033	    }
1034	}
1035
1036      free (cmp->file[0].undiscarded);
1037
1038      free (flag_space);
1039
1040      for (f = 0; f < 2; f++)
1041	{
1042	  free (cmp->file[f].equivs);
1043	  free (cmp->file[f].linbuf + cmp->file[f].linbuf_base);
1044	}
1045
1046      for (e = script; e; e = p)
1047	{
1048	  p = e->link;
1049	  free (e);
1050	}
1051
1052      if (! ROBUST_OUTPUT_STYLE (output_style))
1053	for (f = 0; f < 2; ++f)
1054	  if (cmp->file[f].missing_newline)
1055	    {
1056	      error (0, 0, "%s: %s\n",
1057		     file_label[f] ? file_label[f] : cmp->file[f].name,
1058		     _("No newline at end of file"));
1059	      changes = 2;
1060	    }
1061    }
1062
1063  if (cmp->file[0].buffer != cmp->file[1].buffer)
1064    free (cmp->file[0].buffer);
1065  free (cmp->file[1].buffer);
1066
1067  return changes;
1068}
1069