1/* Analyze file differences for GNU DIFF. 2 3 Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002 4 Free Software Foundation, Inc. 5 6 This file is part of GNU DIFF. 7 8 GNU DIFF is free software; you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation; either version 2, or (at your option) 11 any later version. 12 13 GNU DIFF is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; see the file COPYING. 20 If not, write to the Free Software Foundation, 21 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ 22 23/* The basic algorithm is described in: 24 "An O(ND) Difference Algorithm and its Variations", Eugene Myers, 25 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; 26 see especially section 4.2, which describes the variation used below. 27 Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE 28 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) 29 at the price of producing suboptimal output for large inputs with 30 many differences. 31 32 The basic algorithm was independently discovered as described in: 33 "Algorithms for Approximate String Matching", E. Ukkonen, 34 Information and Control Vol. 64, 1985, pp. 100-118. */ 35 36#include "diff.h" 37#include <cmpbuf.h> 38#include <error.h> 39#include <regex.h> 40#include <xalloc.h> 41 42static lin *xvec, *yvec; /* Vectors being compared. */ 43static lin *fdiag; /* Vector, indexed by diagonal, containing 44 1 + the X coordinate of the point furthest 45 along the given diagonal in the forward 46 search of the edit matrix. */ 47static lin *bdiag; /* Vector, indexed by diagonal, containing 48 the X coordinate of the point furthest 49 along the given diagonal in the backward 50 search of the edit matrix. */ 51static lin too_expensive; /* Edit scripts longer than this are too 52 expensive to compute. */ 53 54#define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */ 55 56struct partition 57{ 58 lin xmid, ymid; /* Midpoints of this partition. */ 59 bool lo_minimal; /* Nonzero if low half will be analyzed minimally. */ 60 bool hi_minimal; /* Likewise for high half. */ 61}; 62 63/* Find the midpoint of the shortest edit script for a specified 64 portion of the two files. 65 66 Scan from the beginnings of the files, and simultaneously from the ends, 67 doing a breadth-first search through the space of edit-sequence. 68 When the two searches meet, we have found the midpoint of the shortest 69 edit sequence. 70 71 If FIND_MINIMAL is nonzero, find the minimal edit script regardless 72 of expense. Otherwise, if the search is too expensive, use 73 heuristics to stop the search and report a suboptimal answer. 74 75 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number 76 XMID - YMID equals the number of inserted lines minus the number 77 of deleted lines (counting only lines before the midpoint). 78 Return the approximate edit cost; this is the total number of 79 lines inserted or deleted (counting only lines before the midpoint), 80 unless a heuristic is used to terminate the search prematurely. 81 82 Set PART->lo_minimal to true iff the minimal edit script for the 83 left half of the partition is known; similarly for PART->hi_minimal. 84 85 This function assumes that the first lines of the specified portions 86 of the two files do not match, and likewise that the last lines do not 87 match. The caller must trim matching lines from the beginning and end 88 of the portions it is going to specify. 89 90 If we return the "wrong" partitions, 91 the worst this can do is cause suboptimal diff output. 92 It cannot cause incorrect diff output. */ 93 94static lin 95diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal, 96 struct partition *part) 97{ 98 lin *const fd = fdiag; /* Give the compiler a chance. */ 99 lin *const bd = bdiag; /* Additional help for the compiler. */ 100 lin const *const xv = xvec; /* Still more help for the compiler. */ 101 lin const *const yv = yvec; /* And more and more . . . */ 102 lin const dmin = xoff - ylim; /* Minimum valid diagonal. */ 103 lin const dmax = xlim - yoff; /* Maximum valid diagonal. */ 104 lin const fmid = xoff - yoff; /* Center diagonal of top-down search. */ 105 lin const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ 106 lin fmin = fmid, fmax = fmid; /* Limits of top-down search. */ 107 lin bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */ 108 lin c; /* Cost. */ 109 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd 110 diagonal with respect to the northwest. */ 111 112 fd[fmid] = xoff; 113 bd[bmid] = xlim; 114 115 for (c = 1;; ++c) 116 { 117 lin d; /* Active diagonal. */ 118 bool big_snake = 0; 119 120 /* Extend the top-down search by an edit step in each diagonal. */ 121 fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin; 122 fmax < dmax ? fd[++fmax + 1] = -1 : --fmax; 123 for (d = fmax; d >= fmin; d -= 2) 124 { 125 lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1]; 126 127 if (tlo >= thi) 128 x = tlo + 1; 129 else 130 x = thi; 131 oldx = x; 132 y = x - d; 133 while (x < xlim && y < ylim && xv[x] == yv[y]) 134 ++x, ++y; 135 if (x - oldx > SNAKE_LIMIT) 136 big_snake = 1; 137 fd[d] = x; 138 if (odd && bmin <= d && d <= bmax && bd[d] <= x) 139 { 140 part->xmid = x; 141 part->ymid = y; 142 part->lo_minimal = part->hi_minimal = 1; 143 return 2 * c - 1; 144 } 145 } 146 147 /* Similarly extend the bottom-up search. */ 148 bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin; 149 bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax; 150 for (d = bmax; d >= bmin; d -= 2) 151 { 152 lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1]; 153 154 if (tlo < thi) 155 x = tlo; 156 else 157 x = thi - 1; 158 oldx = x; 159 y = x - d; 160 while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) 161 --x, --y; 162 if (oldx - x > SNAKE_LIMIT) 163 big_snake = 1; 164 bd[d] = x; 165 if (!odd && fmin <= d && d <= fmax && x <= fd[d]) 166 { 167 part->xmid = x; 168 part->ymid = y; 169 part->lo_minimal = part->hi_minimal = 1; 170 return 2 * c; 171 } 172 } 173 174 if (find_minimal) 175 continue; 176 177 /* Heuristic: check occasionally for a diagonal that has made 178 lots of progress compared with the edit distance. 179 If we have any such, find the one that has made the most 180 progress and return it as if it had succeeded. 181 182 With this heuristic, for files with a constant small density 183 of changes, the algorithm is linear in the file size. */ 184 185 if (200 < c && big_snake && speed_large_files) 186 { 187 lin best; 188 189 best = 0; 190 for (d = fmax; d >= fmin; d -= 2) 191 { 192 lin dd = d - fmid; 193 lin x = fd[d]; 194 lin y = x - d; 195 lin v = (x - xoff) * 2 - dd; 196 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 197 { 198 if (v > best 199 && xoff + SNAKE_LIMIT <= x && x < xlim 200 && yoff + SNAKE_LIMIT <= y && y < ylim) 201 { 202 /* We have a good enough best diagonal; 203 now insist that it end with a significant snake. */ 204 int k; 205 206 for (k = 1; xv[x - k] == yv[y - k]; k++) 207 if (k == SNAKE_LIMIT) 208 { 209 best = v; 210 part->xmid = x; 211 part->ymid = y; 212 break; 213 } 214 } 215 } 216 } 217 if (best > 0) 218 { 219 part->lo_minimal = 1; 220 part->hi_minimal = 0; 221 return 2 * c - 1; 222 } 223 224 best = 0; 225 for (d = bmax; d >= bmin; d -= 2) 226 { 227 lin dd = d - bmid; 228 lin x = bd[d]; 229 lin y = x - d; 230 lin v = (xlim - x) * 2 + dd; 231 if (v > 12 * (c + (dd < 0 ? -dd : dd))) 232 { 233 if (v > best 234 && xoff < x && x <= xlim - SNAKE_LIMIT 235 && yoff < y && y <= ylim - SNAKE_LIMIT) 236 { 237 /* We have a good enough best diagonal; 238 now insist that it end with a significant snake. */ 239 int k; 240 241 for (k = 0; xv[x + k] == yv[y + k]; k++) 242 if (k == SNAKE_LIMIT - 1) 243 { 244 best = v; 245 part->xmid = x; 246 part->ymid = y; 247 break; 248 } 249 } 250 } 251 } 252 if (best > 0) 253 { 254 part->lo_minimal = 0; 255 part->hi_minimal = 1; 256 return 2 * c - 1; 257 } 258 } 259 260 /* Heuristic: if we've gone well beyond the call of duty, 261 give up and report halfway between our best results so far. */ 262 if (c >= too_expensive) 263 { 264 lin fxybest, fxbest; 265 lin bxybest, bxbest; 266 267 fxbest = bxbest = 0; /* Pacify `gcc -Wall'. */ 268 269 /* Find forward diagonal that maximizes X + Y. */ 270 fxybest = -1; 271 for (d = fmax; d >= fmin; d -= 2) 272 { 273 lin x = MIN (fd[d], xlim); 274 lin y = x - d; 275 if (ylim < y) 276 x = ylim + d, y = ylim; 277 if (fxybest < x + y) 278 { 279 fxybest = x + y; 280 fxbest = x; 281 } 282 } 283 284 /* Find backward diagonal that minimizes X + Y. */ 285 bxybest = LIN_MAX; 286 for (d = bmax; d >= bmin; d -= 2) 287 { 288 lin x = MAX (xoff, bd[d]); 289 lin y = x - d; 290 if (y < yoff) 291 x = yoff + d, y = yoff; 292 if (x + y < bxybest) 293 { 294 bxybest = x + y; 295 bxbest = x; 296 } 297 } 298 299 /* Use the better of the two diagonals. */ 300 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) 301 { 302 part->xmid = fxbest; 303 part->ymid = fxybest - fxbest; 304 part->lo_minimal = 1; 305 part->hi_minimal = 0; 306 } 307 else 308 { 309 part->xmid = bxbest; 310 part->ymid = bxybest - bxbest; 311 part->lo_minimal = 0; 312 part->hi_minimal = 1; 313 } 314 return 2 * c - 1; 315 } 316 } 317} 318 319/* Compare in detail contiguous subsequences of the two files 320 which are known, as a whole, to match each other. 321 322 The results are recorded in the vectors files[N].changed, by 323 storing 1 in the element for each line that is an insertion or deletion. 324 325 The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. 326 327 Note that XLIM, YLIM are exclusive bounds. 328 All line numbers are origin-0 and discarded lines are not counted. 329 330 If FIND_MINIMAL, find a minimal difference no matter how 331 expensive it is. */ 332 333static void 334compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal) 335{ 336 lin * const xv = xvec; /* Help the compiler. */ 337 lin * const yv = yvec; 338 339 /* Slide down the bottom initial diagonal. */ 340 while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) 341 ++xoff, ++yoff; 342 /* Slide up the top initial diagonal. */ 343 while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) 344 --xlim, --ylim; 345 346 /* Handle simple cases. */ 347 if (xoff == xlim) 348 while (yoff < ylim) 349 files[1].changed[files[1].realindexes[yoff++]] = 1; 350 else if (yoff == ylim) 351 while (xoff < xlim) 352 files[0].changed[files[0].realindexes[xoff++]] = 1; 353 else 354 { 355 lin c; 356 struct partition part; 357 358 /* Find a point of correspondence in the middle of the files. */ 359 360 c = diag (xoff, xlim, yoff, ylim, find_minimal, &part); 361 362 if (c == 1) 363 { 364 /* This should be impossible, because it implies that 365 one of the two subsequences is empty, 366 and that case was handled above without calling `diag'. 367 Let's verify that this is true. */ 368 abort (); 369#if 0 370 /* The two subsequences differ by a single insert or delete; 371 record it and we are done. */ 372 if (part.xmid - part.ymid < xoff - yoff) 373 files[1].changed[files[1].realindexes[part.ymid - 1]] = 1; 374 else 375 files[0].changed[files[0].realindexes[part.xmid]] = 1; 376#endif 377 } 378 else 379 { 380 /* Use the partitions to split this problem into subproblems. */ 381 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal); 382 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal); 383 } 384 } 385} 386 387/* Discard lines from one file that have no matches in the other file. 388 389 A line which is discarded will not be considered by the actual 390 comparison algorithm; it will be as if that line were not in the file. 391 The file's `realindexes' table maps virtual line numbers 392 (which don't count the discarded lines) into real line numbers; 393 this is how the actual comparison algorithm produces results 394 that are comprehensible when the discarded lines are counted. 395 396 When we discard a line, we also mark it as a deletion or insertion 397 so that it will be printed in the output. */ 398 399static void 400discard_confusing_lines (struct file_data filevec[]) 401{ 402 int f; 403 lin i; 404 char *discarded[2]; 405 lin *equiv_count[2]; 406 lin *p; 407 408 /* Allocate our results. */ 409 p = xmalloc ((filevec[0].buffered_lines + filevec[1].buffered_lines) 410 * (2 * sizeof *p)); 411 for (f = 0; f < 2; f++) 412 { 413 filevec[f].undiscarded = p; p += filevec[f].buffered_lines; 414 filevec[f].realindexes = p; p += filevec[f].buffered_lines; 415 } 416 417 /* Set up equiv_count[F][I] as the number of lines in file F 418 that fall in equivalence class I. */ 419 420 p = zalloc (filevec[0].equiv_max * (2 * sizeof *p)); 421 equiv_count[0] = p; 422 equiv_count[1] = p + filevec[0].equiv_max; 423 424 for (i = 0; i < filevec[0].buffered_lines; ++i) 425 ++equiv_count[0][filevec[0].equivs[i]]; 426 for (i = 0; i < filevec[1].buffered_lines; ++i) 427 ++equiv_count[1][filevec[1].equivs[i]]; 428 429 /* Set up tables of which lines are going to be discarded. */ 430 431 discarded[0] = zalloc (filevec[0].buffered_lines 432 + filevec[1].buffered_lines); 433 discarded[1] = discarded[0] + filevec[0].buffered_lines; 434 435 /* Mark to be discarded each line that matches no line of the other file. 436 If a line matches many lines, mark it as provisionally discardable. */ 437 438 for (f = 0; f < 2; f++) 439 { 440 size_t end = filevec[f].buffered_lines; 441 char *discards = discarded[f]; 442 lin *counts = equiv_count[1 - f]; 443 lin *equivs = filevec[f].equivs; 444 size_t many = 5; 445 size_t tem = end / 64; 446 447 /* Multiply MANY by approximate square root of number of lines. 448 That is the threshold for provisionally discardable lines. */ 449 while ((tem = tem >> 2) > 0) 450 many *= 2; 451 452 for (i = 0; i < end; i++) 453 { 454 lin nmatch; 455 if (equivs[i] == 0) 456 continue; 457 nmatch = counts[equivs[i]]; 458 if (nmatch == 0) 459 discards[i] = 1; 460 else if (nmatch > many) 461 discards[i] = 2; 462 } 463 } 464 465 /* Don't really discard the provisional lines except when they occur 466 in a run of discardables, with nonprovisionals at the beginning 467 and end. */ 468 469 for (f = 0; f < 2; f++) 470 { 471 lin end = filevec[f].buffered_lines; 472 register char *discards = discarded[f]; 473 474 for (i = 0; i < end; i++) 475 { 476 /* Cancel provisional discards not in middle of run of discards. */ 477 if (discards[i] == 2) 478 discards[i] = 0; 479 else if (discards[i] != 0) 480 { 481 /* We have found a nonprovisional discard. */ 482 register lin j; 483 lin length; 484 lin provisional = 0; 485 486 /* Find end of this run of discardable lines. 487 Count how many are provisionally discardable. */ 488 for (j = i; j < end; j++) 489 { 490 if (discards[j] == 0) 491 break; 492 if (discards[j] == 2) 493 ++provisional; 494 } 495 496 /* Cancel provisional discards at end, and shrink the run. */ 497 while (j > i && discards[j - 1] == 2) 498 discards[--j] = 0, --provisional; 499 500 /* Now we have the length of a run of discardable lines 501 whose first and last are not provisional. */ 502 length = j - i; 503 504 /* If 1/4 of the lines in the run are provisional, 505 cancel discarding of all provisional lines in the run. */ 506 if (provisional * 4 > length) 507 { 508 while (j > i) 509 if (discards[--j] == 2) 510 discards[j] = 0; 511 } 512 else 513 { 514 register lin consec; 515 lin minimum = 1; 516 lin tem = length >> 2; 517 518 /* MINIMUM is approximate square root of LENGTH/4. 519 A subrun of two or more provisionals can stand 520 when LENGTH is at least 16. 521 A subrun of 4 or more can stand when LENGTH >= 64. */ 522 while (0 < (tem >>= 2)) 523 minimum <<= 1; 524 minimum++; 525 526 /* Cancel any subrun of MINIMUM or more provisionals 527 within the larger run. */ 528 for (j = 0, consec = 0; j < length; j++) 529 if (discards[i + j] != 2) 530 consec = 0; 531 else if (minimum == ++consec) 532 /* Back up to start of subrun, to cancel it all. */ 533 j -= consec; 534 else if (minimum < consec) 535 discards[i + j] = 0; 536 537 /* Scan from beginning of run 538 until we find 3 or more nonprovisionals in a row 539 or until the first nonprovisional at least 8 lines in. 540 Until that point, cancel any provisionals. */ 541 for (j = 0, consec = 0; j < length; j++) 542 { 543 if (j >= 8 && discards[i + j] == 1) 544 break; 545 if (discards[i + j] == 2) 546 consec = 0, discards[i + j] = 0; 547 else if (discards[i + j] == 0) 548 consec = 0; 549 else 550 consec++; 551 if (consec == 3) 552 break; 553 } 554 555 /* I advances to the last line of the run. */ 556 i += length - 1; 557 558 /* Same thing, from end. */ 559 for (j = 0, consec = 0; j < length; j++) 560 { 561 if (j >= 8 && discards[i - j] == 1) 562 break; 563 if (discards[i - j] == 2) 564 consec = 0, discards[i - j] = 0; 565 else if (discards[i - j] == 0) 566 consec = 0; 567 else 568 consec++; 569 if (consec == 3) 570 break; 571 } 572 } 573 } 574 } 575 } 576 577 /* Actually discard the lines. */ 578 for (f = 0; f < 2; f++) 579 { 580 char *discards = discarded[f]; 581 lin end = filevec[f].buffered_lines; 582 lin j = 0; 583 for (i = 0; i < end; ++i) 584 if (minimal || discards[i] == 0) 585 { 586 filevec[f].undiscarded[j] = filevec[f].equivs[i]; 587 filevec[f].realindexes[j++] = i; 588 } 589 else 590 filevec[f].changed[i] = 1; 591 filevec[f].nondiscarded_lines = j; 592 } 593 594 free (discarded[0]); 595 free (equiv_count[0]); 596} 597 598/* Adjust inserts/deletes of identical lines to join changes 599 as much as possible. 600 601 We do something when a run of changed lines include a 602 line at one end and have an excluded, identical line at the other. 603 We are free to choose which identical line is included. 604 `compareseq' usually chooses the one at the beginning, 605 but usually it is cleaner to consider the following identical line 606 to be the "change". */ 607 608static void 609shift_boundaries (struct file_data filevec[]) 610{ 611 int f; 612 613 for (f = 0; f < 2; f++) 614 { 615 bool *changed = filevec[f].changed; 616 bool const *other_changed = filevec[1 - f].changed; 617 lin const *equivs = filevec[f].equivs; 618 lin i = 0; 619 lin j = 0; 620 lin i_end = filevec[f].buffered_lines; 621 622 while (1) 623 { 624 lin runlength, start, corresponding; 625 626 /* Scan forwards to find beginning of another run of changes. 627 Also keep track of the corresponding point in the other file. */ 628 629 while (i < i_end && !changed[i]) 630 { 631 while (other_changed[j++]) 632 continue; 633 i++; 634 } 635 636 if (i == i_end) 637 break; 638 639 start = i; 640 641 /* Find the end of this run of changes. */ 642 643 while (changed[++i]) 644 continue; 645 while (other_changed[j]) 646 j++; 647 648 do 649 { 650 /* Record the length of this run of changes, so that 651 we can later determine whether the run has grown. */ 652 runlength = i - start; 653 654 /* Move the changed region back, so long as the 655 previous unchanged line matches the last changed one. 656 This merges with previous changed regions. */ 657 658 while (start && equivs[start - 1] == equivs[i - 1]) 659 { 660 changed[--start] = 1; 661 changed[--i] = 0; 662 while (changed[start - 1]) 663 start--; 664 while (other_changed[--j]) 665 continue; 666 } 667 668 /* Set CORRESPONDING to the end of the changed run, at the last 669 point where it corresponds to a changed run in the other file. 670 CORRESPONDING == I_END means no such point has been found. */ 671 corresponding = other_changed[j - 1] ? i : i_end; 672 673 /* Move the changed region forward, so long as the 674 first changed line matches the following unchanged one. 675 This merges with following changed regions. 676 Do this second, so that if there are no merges, 677 the changed region is moved forward as far as possible. */ 678 679 while (i != i_end && equivs[start] == equivs[i]) 680 { 681 changed[start++] = 0; 682 changed[i++] = 1; 683 while (changed[i]) 684 i++; 685 while (other_changed[++j]) 686 corresponding = i; 687 } 688 } 689 while (runlength != i - start); 690 691 /* If possible, move the fully-merged run of changes 692 back to a corresponding run in the other file. */ 693 694 while (corresponding < i) 695 { 696 changed[--start] = 1; 697 changed[--i] = 0; 698 while (other_changed[--j]) 699 continue; 700 } 701 } 702 } 703} 704 705/* Cons an additional entry onto the front of an edit script OLD. 706 LINE0 and LINE1 are the first affected lines in the two files (origin 0). 707 DELETED is the number of lines deleted here from file 0. 708 INSERTED is the number of lines inserted here in file 1. 709 710 If DELETED is 0 then LINE0 is the number of the line before 711 which the insertion was done; vice versa for INSERTED and LINE1. */ 712 713static struct change * 714add_change (lin line0, lin line1, lin deleted, lin inserted, 715 struct change *old) 716{ 717 struct change *new = xmalloc (sizeof *new); 718 719 new->line0 = line0; 720 new->line1 = line1; 721 new->inserted = inserted; 722 new->deleted = deleted; 723 new->link = old; 724 return new; 725} 726 727/* Scan the tables of which lines are inserted and deleted, 728 producing an edit script in reverse order. */ 729 730static struct change * 731build_reverse_script (struct file_data const filevec[]) 732{ 733 struct change *script = 0; 734 bool *changed0 = filevec[0].changed; 735 bool *changed1 = filevec[1].changed; 736 lin len0 = filevec[0].buffered_lines; 737 lin len1 = filevec[1].buffered_lines; 738 739 /* Note that changedN[len0] does exist, and is 0. */ 740 741 lin i0 = 0, i1 = 0; 742 743 while (i0 < len0 || i1 < len1) 744 { 745 if (changed0[i0] | changed1[i1]) 746 { 747 lin line0 = i0, line1 = i1; 748 749 /* Find # lines changed here in each file. */ 750 while (changed0[i0]) ++i0; 751 while (changed1[i1]) ++i1; 752 753 /* Record this change. */ 754 script = add_change (line0, line1, i0 - line0, i1 - line1, script); 755 } 756 757 /* We have reached lines in the two files that match each other. */ 758 i0++, i1++; 759 } 760 761 return script; 762} 763 764/* Scan the tables of which lines are inserted and deleted, 765 producing an edit script in forward order. */ 766 767static struct change * 768build_script (struct file_data const filevec[]) 769{ 770 struct change *script = 0; 771 bool *changed0 = filevec[0].changed; 772 bool *changed1 = filevec[1].changed; 773 lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines; 774 775 /* Note that changedN[-1] does exist, and is 0. */ 776 777 while (i0 >= 0 || i1 >= 0) 778 { 779 if (changed0[i0 - 1] | changed1[i1 - 1]) 780 { 781 lin line0 = i0, line1 = i1; 782 783 /* Find # lines changed here in each file. */ 784 while (changed0[i0 - 1]) --i0; 785 while (changed1[i1 - 1]) --i1; 786 787 /* Record this change. */ 788 script = add_change (i0, i1, line0 - i0, line1 - i1, script); 789 } 790 791 /* We have reached lines in the two files that match each other. */ 792 i0--, i1--; 793 } 794 795 return script; 796} 797 798/* If CHANGES, briefly report that two files differed. 799 Return 2 if trouble, CHANGES otherwise. */ 800static int 801briefly_report (int changes, struct file_data const filevec[]) 802{ 803 if (changes) 804 { 805 char const *label0 = file_label[0] ? file_label[0] : filevec[0].name; 806 char const *label1 = file_label[1] ? file_label[1] : filevec[1].name; 807 808 if (brief) 809 message ("Files %s and %s differ\n", label0, label1); 810 else 811 { 812 message ("Binary files %s and %s differ\n", label0, label1); 813 changes = 2; 814 } 815 } 816 817 return changes; 818} 819 820/* Report the differences of two files. */ 821int 822diff_2_files (struct comparison *cmp) 823{ 824 lin diags; 825 int f; 826 struct change *e, *p; 827 struct change *script; 828 int changes; 829 830 831 /* If we have detected that either file is binary, 832 compare the two files as binary. This can happen 833 only when the first chunk is read. 834 Also, --brief without any --ignore-* options means 835 we can speed things up by treating the files as binary. */ 836 837 if (read_files (cmp->file, files_can_be_treated_as_binary)) 838 { 839 /* Files with different lengths must be different. */ 840 if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size 841 && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode)) 842 && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode))) 843 changes = 1; 844 845 /* Standard input equals itself. */ 846 else if (cmp->file[0].desc == cmp->file[1].desc) 847 changes = 0; 848 849 else 850 /* Scan both files, a buffer at a time, looking for a difference. */ 851 { 852 /* Allocate same-sized buffers for both files. */ 853 size_t lcm_max = PTRDIFF_MAX - 1; 854 size_t buffer_size = 855 buffer_lcm (sizeof (word), 856 buffer_lcm (STAT_BLOCKSIZE (cmp->file[0].stat), 857 STAT_BLOCKSIZE (cmp->file[1].stat), 858 lcm_max), 859 lcm_max); 860 for (f = 0; f < 2; f++) 861 cmp->file[f].buffer = xrealloc (cmp->file[f].buffer, buffer_size); 862 863 for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0) 864 { 865 /* Read a buffer's worth from both files. */ 866 for (f = 0; f < 2; f++) 867 if (0 <= cmp->file[f].desc) 868 file_block_read (&cmp->file[f], 869 buffer_size - cmp->file[f].buffered); 870 871 /* If the buffers differ, the files differ. */ 872 if (cmp->file[0].buffered != cmp->file[1].buffered 873 || memcmp (cmp->file[0].buffer, 874 cmp->file[1].buffer, 875 cmp->file[0].buffered)) 876 { 877 changes = 1; 878 break; 879 } 880 881 /* If we reach end of file, the files are the same. */ 882 if (cmp->file[0].buffered != buffer_size) 883 { 884 changes = 0; 885 break; 886 } 887 } 888 } 889 890 changes = briefly_report (changes, cmp->file); 891 } 892 else 893 { 894 /* Allocate vectors for the results of comparison: 895 a flag for each line of each file, saying whether that line 896 is an insertion or deletion. 897 Allocate an extra element, always 0, at each end of each vector. */ 898 899 size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4; 900 bool *flag_space = zalloc (s * sizeof *flag_space); 901 cmp->file[0].changed = flag_space + 1; 902 cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3; 903 904 /* Some lines are obviously insertions or deletions 905 because they don't match anything. Detect them now, and 906 avoid even thinking about them in the main comparison algorithm. */ 907 908 discard_confusing_lines (cmp->file); 909 910 /* Now do the main comparison algorithm, considering just the 911 undiscarded lines. */ 912 913 xvec = cmp->file[0].undiscarded; 914 yvec = cmp->file[1].undiscarded; 915 diags = (cmp->file[0].nondiscarded_lines 916 + cmp->file[1].nondiscarded_lines + 3); 917 fdiag = xmalloc (diags * (2 * sizeof *fdiag)); 918 bdiag = fdiag + diags; 919 fdiag += cmp->file[1].nondiscarded_lines + 1; 920 bdiag += cmp->file[1].nondiscarded_lines + 1; 921 922 /* Set TOO_EXPENSIVE to be approximate square root of input size, 923 bounded below by 256. */ 924 too_expensive = 1; 925 for (; diags != 0; diags >>= 2) 926 too_expensive <<= 1; 927 too_expensive = MAX (256, too_expensive); 928 929 files[0] = cmp->file[0]; 930 files[1] = cmp->file[1]; 931 932 compareseq (0, cmp->file[0].nondiscarded_lines, 933 0, cmp->file[1].nondiscarded_lines, minimal); 934 935 free (fdiag - (cmp->file[1].nondiscarded_lines + 1)); 936 937 /* Modify the results slightly to make them prettier 938 in cases where that can validly be done. */ 939 940 shift_boundaries (cmp->file); 941 942 /* Get the results of comparison in the form of a chain 943 of `struct change's -- an edit script. */ 944 945 if (output_style == OUTPUT_ED) 946 script = build_reverse_script (cmp->file); 947 else 948 script = build_script (cmp->file); 949 950 /* Set CHANGES if we had any diffs. 951 If some changes are ignored, we must scan the script to decide. */ 952 if (ignore_blank_lines || ignore_regexp.fastmap) 953 { 954 struct change *next = script; 955 changes = 0; 956 957 while (next && changes == 0) 958 { 959 struct change *this, *end; 960 lin first0, last0, first1, last1; 961 962 /* Find a set of changes that belong together. */ 963 this = next; 964 end = find_change (next); 965 966 /* Disconnect them from the rest of the changes, making them 967 a hunk, and remember the rest for next iteration. */ 968 next = end->link; 969 end->link = 0; 970 971 /* Determine whether this hunk is really a difference. */ 972 if (analyze_hunk (this, &first0, &last0, &first1, &last1)) 973 changes = 1; 974 975 /* Reconnect the script so it will all be freed properly. */ 976 end->link = next; 977 } 978 } 979 else 980 changes = (script != 0); 981 982 if (brief) 983 changes = briefly_report (changes, cmp->file); 984 else 985 { 986 if (changes | !no_diff_means_no_output) 987 { 988 /* Record info for starting up output, 989 to be used if and when we have some output to print. */ 990 setup_output (file_label[0] ? file_label[0] : cmp->file[0].name, 991 file_label[1] ? file_label[1] : cmp->file[1].name, 992 cmp->parent != 0); 993 994 switch (output_style) 995 { 996 case OUTPUT_CONTEXT: 997 print_context_script (script, 0); 998 break; 999 1000 case OUTPUT_UNIFIED: 1001 print_context_script (script, 1); 1002 break; 1003 1004 case OUTPUT_ED: 1005 print_ed_script (script); 1006 break; 1007 1008 case OUTPUT_FORWARD_ED: 1009 pr_forward_ed_script (script); 1010 break; 1011 1012 case OUTPUT_RCS: 1013 print_rcs_script (script); 1014 break; 1015 1016 case OUTPUT_NORMAL: 1017 print_normal_script (script); 1018 break; 1019 1020 case OUTPUT_IFDEF: 1021 print_ifdef_script (script); 1022 break; 1023 1024 case OUTPUT_SDIFF: 1025 print_sdiff_script (script); 1026 break; 1027 1028 default: 1029 abort (); 1030 } 1031 1032 finish_output (); 1033 } 1034 } 1035 1036 free (cmp->file[0].undiscarded); 1037 1038 free (flag_space); 1039 1040 for (f = 0; f < 2; f++) 1041 { 1042 free (cmp->file[f].equivs); 1043 free (cmp->file[f].linbuf + cmp->file[f].linbuf_base); 1044 } 1045 1046 for (e = script; e; e = p) 1047 { 1048 p = e->link; 1049 free (e); 1050 } 1051 1052 if (! ROBUST_OUTPUT_STYLE (output_style)) 1053 for (f = 0; f < 2; ++f) 1054 if (cmp->file[f].missing_newline) 1055 { 1056 error (0, 0, "%s: %s\n", 1057 file_label[f] ? file_label[f] : cmp->file[f].name, 1058 _("No newline at end of file")); 1059 changes = 2; 1060 } 1061 } 1062 1063 if (cmp->file[0].buffer != cmp->file[1].buffer) 1064 free (cmp->file[0].buffer); 1065 free (cmp->file[1].buffer); 1066 1067 return changes; 1068} 1069