parse-diff.c revision 289166
1/* 2 * parse-diff.c: functions for parsing diff files 3 * 4 * ==================================================================== 5 * Licensed to the Apache Software Foundation (ASF) under one 6 * or more contributor license agreements. See the NOTICE file 7 * distributed with this work for additional information 8 * regarding copyright ownership. The ASF licenses this file 9 * to you under the Apache License, Version 2.0 (the 10 * "License"); you may not use this file except in compliance 11 * with the License. You may obtain a copy of the License at 12 * 13 * http://www.apache.org/licenses/LICENSE-2.0 14 * 15 * Unless required by applicable law or agreed to in writing, 16 * software distributed under the License is distributed on an 17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 18 * KIND, either express or implied. See the License for the 19 * specific language governing permissions and limitations 20 * under the License. 21 * ==================================================================== 22 */ 23 24#include <stdlib.h> 25#include <stddef.h> 26#include <string.h> 27 28#include "svn_hash.h" 29#include "svn_types.h" 30#include "svn_error.h" 31#include "svn_io.h" 32#include "svn_pools.h" 33#include "svn_props.h" 34#include "svn_string.h" 35#include "svn_utf.h" 36#include "svn_dirent_uri.h" 37#include "svn_diff.h" 38 39#include "private/svn_eol_private.h" 40#include "private/svn_dep_compat.h" 41 42/* Helper macro for readability */ 43#define starts_with(str, start) \ 44 (strncmp((str), (start), strlen(start)) == 0) 45 46/* Like strlen() but for string literals. */ 47#define STRLEN_LITERAL(str) (sizeof(str) - 1) 48 49/* This struct describes a range within a file, as well as the 50 * current cursor position within the range. All numbers are in bytes. */ 51struct svn_diff__hunk_range { 52 apr_off_t start; 53 apr_off_t end; 54 apr_off_t current; 55}; 56 57struct svn_diff_hunk_t { 58 /* The patch this hunk belongs to. 
*/ 59 svn_patch_t *patch; 60 61 /* APR file handle to the patch file this hunk came from. */ 62 apr_file_t *apr_file; 63 64 /* Ranges used to keep track of this hunk's texts positions within 65 * the patch file. */ 66 struct svn_diff__hunk_range diff_text_range; 67 struct svn_diff__hunk_range original_text_range; 68 struct svn_diff__hunk_range modified_text_range; 69 70 /* Hunk ranges as they appeared in the patch file. 71 * All numbers are lines, not bytes. */ 72 svn_linenum_t original_start; 73 svn_linenum_t original_length; 74 svn_linenum_t modified_start; 75 svn_linenum_t modified_length; 76 77 /* Number of lines of leading and trailing hunk context. */ 78 svn_linenum_t leading_context; 79 svn_linenum_t trailing_context; 80}; 81 82void 83svn_diff_hunk_reset_diff_text(svn_diff_hunk_t *hunk) 84{ 85 hunk->diff_text_range.current = hunk->diff_text_range.start; 86} 87 88void 89svn_diff_hunk_reset_original_text(svn_diff_hunk_t *hunk) 90{ 91 if (hunk->patch->reverse) 92 hunk->modified_text_range.current = hunk->modified_text_range.start; 93 else 94 hunk->original_text_range.current = hunk->original_text_range.start; 95} 96 97void 98svn_diff_hunk_reset_modified_text(svn_diff_hunk_t *hunk) 99{ 100 if (hunk->patch->reverse) 101 hunk->original_text_range.current = hunk->original_text_range.start; 102 else 103 hunk->modified_text_range.current = hunk->modified_text_range.start; 104} 105 106svn_linenum_t 107svn_diff_hunk_get_original_start(const svn_diff_hunk_t *hunk) 108{ 109 return hunk->patch->reverse ? hunk->modified_start : hunk->original_start; 110} 111 112svn_linenum_t 113svn_diff_hunk_get_original_length(const svn_diff_hunk_t *hunk) 114{ 115 return hunk->patch->reverse ? hunk->modified_length : hunk->original_length; 116} 117 118svn_linenum_t 119svn_diff_hunk_get_modified_start(const svn_diff_hunk_t *hunk) 120{ 121 return hunk->patch->reverse ? 
hunk->original_start : hunk->modified_start; 122} 123 124svn_linenum_t 125svn_diff_hunk_get_modified_length(const svn_diff_hunk_t *hunk) 126{ 127 return hunk->patch->reverse ? hunk->original_length : hunk->modified_length; 128} 129 130svn_linenum_t 131svn_diff_hunk_get_leading_context(const svn_diff_hunk_t *hunk) 132{ 133 return hunk->leading_context; 134} 135 136svn_linenum_t 137svn_diff_hunk_get_trailing_context(const svn_diff_hunk_t *hunk) 138{ 139 return hunk->trailing_context; 140} 141 142/* Try to parse a positive number from a decimal number encoded 143 * in the string NUMBER. Return parsed number in OFFSET, and return 144 * TRUE if parsing was successful. */ 145static svn_boolean_t 146parse_offset(svn_linenum_t *offset, const char *number) 147{ 148 svn_error_t *err; 149 apr_uint64_t val; 150 151 err = svn_cstring_strtoui64(&val, number, 0, SVN_LINENUM_MAX_VALUE, 10); 152 if (err) 153 { 154 svn_error_clear(err); 155 return FALSE; 156 } 157 158 *offset = (svn_linenum_t)val; 159 160 return TRUE; 161} 162 163/* Try to parse a hunk range specification from the string RANGE. 164 * Return parsed information in *START and *LENGTH, and return TRUE 165 * if the range parsed correctly. Note: This function may modify the 166 * input value RANGE. */ 167static svn_boolean_t 168parse_range(svn_linenum_t *start, svn_linenum_t *length, char *range) 169{ 170 char *comma; 171 172 if (*range == 0) 173 return FALSE; 174 175 comma = strstr(range, ","); 176 if (comma) 177 { 178 if (strlen(comma + 1) > 0) 179 { 180 /* Try to parse the length. */ 181 if (! parse_offset(length, comma + 1)) 182 return FALSE; 183 184 /* Snip off the end of the string, 185 * so we can comfortably parse the line 186 * number the hunk starts at. */ 187 *comma = '\0'; 188 } 189 else 190 /* A comma but no length? */ 191 return FALSE; 192 } 193 else 194 { 195 *length = 1; 196 } 197 198 /* Try to parse the line number the hunk starts at. 
*/ 199 return parse_offset(start, range); 200} 201 202/* Try to parse a hunk header in string HEADER, putting parsed information 203 * into HUNK. Return TRUE if the header parsed correctly. ATAT is the 204 * character string used to delimit the hunk header. 205 * Do all allocations in POOL. */ 206static svn_boolean_t 207parse_hunk_header(const char *header, svn_diff_hunk_t *hunk, 208 const char *atat, apr_pool_t *pool) 209{ 210 const char *p; 211 const char *start; 212 svn_stringbuf_t *range; 213 214 p = header + strlen(atat); 215 if (*p != ' ') 216 /* No. */ 217 return FALSE; 218 p++; 219 if (*p != '-') 220 /* Nah... */ 221 return FALSE; 222 /* OK, this may be worth allocating some memory for... */ 223 range = svn_stringbuf_create_ensure(31, pool); 224 start = ++p; 225 while (*p && *p != ' ') 226 { 227 p++; 228 } 229 230 if (*p != ' ') 231 /* No no no... */ 232 return FALSE; 233 234 svn_stringbuf_appendbytes(range, start, p - start); 235 236 /* Try to parse the first range. */ 237 if (! parse_range(&hunk->original_start, &hunk->original_length, range->data)) 238 return FALSE; 239 240 /* Clear the stringbuf so we can reuse it for the second range. */ 241 svn_stringbuf_setempty(range); 242 p++; 243 if (*p != '+') 244 /* Eeek! */ 245 return FALSE; 246 /* OK, this may be worth copying... */ 247 start = ++p; 248 while (*p && *p != ' ') 249 { 250 p++; 251 } 252 if (*p != ' ') 253 /* No no no... */ 254 return FALSE; 255 256 svn_stringbuf_appendbytes(range, start, p - start); 257 258 /* Check for trailing @@ */ 259 p++; 260 if (! starts_with(p, atat)) 261 return FALSE; 262 263 /* There may be stuff like C-function names after the trailing @@, 264 * but we ignore that. */ 265 266 /* Try to parse the second range. */ 267 if (! parse_range(&hunk->modified_start, &hunk->modified_length, range->data)) 268 return FALSE; 269 270 /* Hunk header is good. */ 271 return TRUE; 272} 273 274/* Read a line of original or modified hunk text from the specified 275 * RANGE within FILE. 
FILE is expected to contain unidiff text. 276 * Leading unidiff symbols ('+', '-', and ' ') are removed from the line, 277 * Any lines commencing with the VERBOTEN character are discarded. 278 * VERBOTEN should be '+' or '-', depending on which form of hunk text 279 * is being read. 280 * 281 * All other parameters are as in svn_diff_hunk_readline_original_text() 282 * and svn_diff_hunk_readline_modified_text(). 283 */ 284static svn_error_t * 285hunk_readline_original_or_modified(apr_file_t *file, 286 struct svn_diff__hunk_range *range, 287 svn_stringbuf_t **stringbuf, 288 const char **eol, 289 svn_boolean_t *eof, 290 char verboten, 291 apr_pool_t *result_pool, 292 apr_pool_t *scratch_pool) 293{ 294 apr_size_t max_len; 295 svn_boolean_t filtered; 296 apr_off_t pos; 297 svn_stringbuf_t *str; 298 299 if (range->current >= range->end) 300 { 301 /* We're past the range. Indicate that no bytes can be read. */ 302 *eof = TRUE; 303 if (eol) 304 *eol = NULL; 305 *stringbuf = svn_stringbuf_create_empty(result_pool); 306 return SVN_NO_ERROR; 307 } 308 309 pos = 0; 310 SVN_ERR(svn_io_file_seek(file, APR_CUR, &pos, scratch_pool)); 311 SVN_ERR(svn_io_file_seek(file, APR_SET, &range->current, scratch_pool)); 312 do 313 { 314 max_len = range->end - range->current; 315 SVN_ERR(svn_io_file_readline(file, &str, eol, eof, max_len, 316 result_pool, scratch_pool)); 317 range->current = 0; 318 SVN_ERR(svn_io_file_seek(file, APR_CUR, &range->current, scratch_pool)); 319 filtered = (str->data[0] == verboten || str->data[0] == '\\'); 320 } 321 while (filtered && ! *eof); 322 323 if (filtered) 324 { 325 /* EOF, return an empty string. */ 326 *stringbuf = svn_stringbuf_create_ensure(0, result_pool); 327 } 328 else if (str->data[0] == '+' || str->data[0] == '-' || str->data[0] == ' ') 329 { 330 /* Shave off leading unidiff symbols. */ 331 *stringbuf = svn_stringbuf_create(str->data + 1, result_pool); 332 } 333 else 334 { 335 /* Return the line as-is. 
*/ 336 *stringbuf = svn_stringbuf_dup(str, result_pool); 337 } 338 339 SVN_ERR(svn_io_file_seek(file, APR_SET, &pos, scratch_pool)); 340 341 return SVN_NO_ERROR; 342} 343 344svn_error_t * 345svn_diff_hunk_readline_original_text(svn_diff_hunk_t *hunk, 346 svn_stringbuf_t **stringbuf, 347 const char **eol, 348 svn_boolean_t *eof, 349 apr_pool_t *result_pool, 350 apr_pool_t *scratch_pool) 351{ 352 return svn_error_trace( 353 hunk_readline_original_or_modified(hunk->apr_file, 354 hunk->patch->reverse ? 355 &hunk->modified_text_range : 356 &hunk->original_text_range, 357 stringbuf, eol, eof, 358 hunk->patch->reverse ? '-' : '+', 359 result_pool, scratch_pool)); 360} 361 362svn_error_t * 363svn_diff_hunk_readline_modified_text(svn_diff_hunk_t *hunk, 364 svn_stringbuf_t **stringbuf, 365 const char **eol, 366 svn_boolean_t *eof, 367 apr_pool_t *result_pool, 368 apr_pool_t *scratch_pool) 369{ 370 return svn_error_trace( 371 hunk_readline_original_or_modified(hunk->apr_file, 372 hunk->patch->reverse ? 373 &hunk->original_text_range : 374 &hunk->modified_text_range, 375 stringbuf, eol, eof, 376 hunk->patch->reverse ? '+' : '-', 377 result_pool, scratch_pool)); 378} 379 380svn_error_t * 381svn_diff_hunk_readline_diff_text(svn_diff_hunk_t *hunk, 382 svn_stringbuf_t **stringbuf, 383 const char **eol, 384 svn_boolean_t *eof, 385 apr_pool_t *result_pool, 386 apr_pool_t *scratch_pool) 387{ 388 svn_diff_hunk_t dummy; 389 svn_stringbuf_t *line; 390 apr_size_t max_len; 391 apr_off_t pos; 392 393 if (hunk->diff_text_range.current >= hunk->diff_text_range.end) 394 { 395 /* We're past the range. Indicate that no bytes can be read. 
*/ 396 *eof = TRUE; 397 if (eol) 398 *eol = NULL; 399 *stringbuf = svn_stringbuf_create_empty(result_pool); 400 return SVN_NO_ERROR; 401 } 402 403 pos = 0; 404 SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_CUR, &pos, scratch_pool)); 405 SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_SET, 406 &hunk->diff_text_range.current, scratch_pool)); 407 max_len = hunk->diff_text_range.end - hunk->diff_text_range.current; 408 SVN_ERR(svn_io_file_readline(hunk->apr_file, &line, eol, eof, max_len, 409 result_pool, 410 scratch_pool)); 411 hunk->diff_text_range.current = 0; 412 SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_CUR, 413 &hunk->diff_text_range.current, scratch_pool)); 414 SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_SET, &pos, scratch_pool)); 415 416 if (hunk->patch->reverse) 417 { 418 if (parse_hunk_header(line->data, &dummy, "@@", scratch_pool)) 419 { 420 /* Line is a hunk header, reverse it. */ 421 line = svn_stringbuf_createf(result_pool, 422 "@@ -%lu,%lu +%lu,%lu @@", 423 hunk->modified_start, 424 hunk->modified_length, 425 hunk->original_start, 426 hunk->original_length); 427 } 428 else if (parse_hunk_header(line->data, &dummy, "##", scratch_pool)) 429 { 430 /* Line is a hunk header, reverse it. */ 431 line = svn_stringbuf_createf(result_pool, 432 "## -%lu,%lu +%lu,%lu ##", 433 hunk->modified_start, 434 hunk->modified_length, 435 hunk->original_start, 436 hunk->original_length); 437 } 438 else 439 { 440 if (line->data[0] == '+') 441 line->data[0] = '-'; 442 else if (line->data[0] == '-') 443 line->data[0] = '+'; 444 } 445 } 446 447 *stringbuf = line; 448 449 return SVN_NO_ERROR; 450} 451 452/* Parse *PROP_NAME from HEADER as the part after the INDICATOR line. 453 * Allocate *PROP_NAME in RESULT_POOL. 454 * Set *PROP_NAME to NULL if no valid property name was found. 
*/
static svn_error_t *
parse_prop_name(const char **prop_name, const char *header,
                const char *indicator, apr_pool_t *result_pool)
{
  /* The candidate name is whatever follows the indicator prefix. */
  const char *candidate = header + strlen(indicator);

  SVN_ERR(svn_utf_cstring_to_utf8(prop_name, candidate, result_pool));

  if (**prop_name == '\0')
    {
      /* Nothing after the indicator. */
      *prop_name = NULL;
      return SVN_NO_ERROR;
    }

  if (! svn_prop_name_is_valid(*prop_name))
    {
      /* Give the name a second chance with surrounding whitespace
       * stripped before rejecting it. */
      svn_stringbuf_t *trimmed = svn_stringbuf_create(*prop_name,
                                                      result_pool);

      svn_stringbuf_strip_whitespace(trimmed);
      if (svn_prop_name_is_valid(trimmed->data))
        *prop_name = trimmed->data;
      else
        *prop_name = NULL;
    }

  return SVN_NO_ERROR;
}

/* Return the next *HUNK from a PATCH in APR_FILE.
 * If no hunk can be found, set *HUNK to NULL.
 * Set IS_PROPERTY to TRUE if we have a property hunk. If the returned HUNK
 * is the first belonging to a certain property, then PROP_NAME and
 * PROP_OPERATION will be set too. If we have a text hunk, PROP_NAME will be
 * NULL. If IGNORE_WHITESPACE is TRUE, lines without leading spaces will be
 * treated as context lines. Allocate results in RESULT_POOL.
 * Use SCRATCH_POOL for all other allocations.
*/ 482static svn_error_t * 483parse_next_hunk(svn_diff_hunk_t **hunk, 484 svn_boolean_t *is_property, 485 const char **prop_name, 486 svn_diff_operation_kind_t *prop_operation, 487 svn_patch_t *patch, 488 apr_file_t *apr_file, 489 svn_boolean_t ignore_whitespace, 490 apr_pool_t *result_pool, 491 apr_pool_t *scratch_pool) 492{ 493 static const char * const minus = "--- "; 494 static const char * const text_atat = "@@"; 495 static const char * const prop_atat = "##"; 496 svn_stringbuf_t *line; 497 svn_boolean_t eof, in_hunk, hunk_seen; 498 apr_off_t pos, last_line; 499 apr_off_t start, end; 500 apr_off_t original_end; 501 apr_off_t modified_end; 502 svn_linenum_t original_lines; 503 svn_linenum_t modified_lines; 504 svn_linenum_t leading_context; 505 svn_linenum_t trailing_context; 506 svn_boolean_t changed_line_seen; 507 enum { 508 noise_line, 509 original_line, 510 modified_line, 511 context_line 512 } last_line_type; 513 apr_pool_t *iterpool; 514 515 *prop_operation = svn_diff_op_unchanged; 516 517 /* We only set this if we have a property hunk header. */ 518 *prop_name = NULL; 519 *is_property = FALSE; 520 521 if (apr_file_eof(apr_file) == APR_EOF) 522 { 523 /* No more hunks here. */ 524 *hunk = NULL; 525 return SVN_NO_ERROR; 526 } 527 528 in_hunk = FALSE; 529 hunk_seen = FALSE; 530 leading_context = 0; 531 trailing_context = 0; 532 changed_line_seen = FALSE; 533 original_end = 0; 534 modified_end = 0; 535 *hunk = apr_pcalloc(result_pool, sizeof(**hunk)); 536 537 /* Get current seek position -- APR has no ftell() :( */ 538 pos = 0; 539 SVN_ERR(svn_io_file_seek(apr_file, APR_CUR, &pos, scratch_pool)); 540 541 /* Start out assuming noise. */ 542 last_line_type = noise_line; 543 544 iterpool = svn_pool_create(scratch_pool); 545 do 546 { 547 548 svn_pool_clear(iterpool); 549 550 /* Remember the current line's offset, and read the line. 
*/ 551 last_line = pos; 552 SVN_ERR(svn_io_file_readline(apr_file, &line, NULL, &eof, APR_SIZE_MAX, 553 iterpool, iterpool)); 554 555 /* Update line offset for next iteration. */ 556 pos = 0; 557 SVN_ERR(svn_io_file_seek(apr_file, APR_CUR, &pos, iterpool)); 558 559 /* Lines starting with a backslash indicate a missing EOL: 560 * "\ No newline at end of file" or "end of property". */ 561 if (line->data[0] == '\\') 562 { 563 if (in_hunk) 564 { 565 char eolbuf[2]; 566 apr_size_t len; 567 apr_off_t off; 568 apr_off_t hunk_text_end; 569 570 /* Comment terminates the hunk text and says the hunk text 571 * has no trailing EOL. Snip off trailing EOL which is part 572 * of the patch file but not part of the hunk text. */ 573 off = last_line - 2; 574 SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &off, iterpool)); 575 len = sizeof(eolbuf); 576 SVN_ERR(svn_io_file_read_full2(apr_file, eolbuf, len, &len, 577 &eof, iterpool)); 578 if (eolbuf[0] == '\r' && eolbuf[1] == '\n') 579 hunk_text_end = last_line - 2; 580 else if (eolbuf[1] == '\n' || eolbuf[1] == '\r') 581 hunk_text_end = last_line - 1; 582 else 583 hunk_text_end = last_line; 584 585 if (last_line_type == original_line && original_end == 0) 586 original_end = hunk_text_end; 587 else if (last_line_type == modified_line && modified_end == 0) 588 modified_end = hunk_text_end; 589 else if (last_line_type == context_line) 590 { 591 if (original_end == 0) 592 original_end = hunk_text_end; 593 if (modified_end == 0) 594 modified_end = hunk_text_end; 595 } 596 597 SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &pos, iterpool)); 598 } 599 600 continue; 601 } 602 603 if (in_hunk) 604 { 605 char c; 606 static const char add = '+'; 607 static const char del = '-'; 608 609 if (! hunk_seen) 610 { 611 /* We're reading the first line of the hunk, so the start 612 * of the line just read is the hunk text's byte offset. 
*/ 613 start = last_line; 614 } 615 616 c = line->data[0]; 617 if (original_lines > 0 && modified_lines > 0 && 618 ((c == ' ') 619 /* Tolerate chopped leading spaces on empty lines. */ 620 || (! eof && line->len == 0) 621 /* Maybe tolerate chopped leading spaces on non-empty lines. */ 622 || (ignore_whitespace && c != del && c != add))) 623 { 624 /* It's a "context" line in the hunk. */ 625 hunk_seen = TRUE; 626 original_lines--; 627 modified_lines--; 628 if (changed_line_seen) 629 trailing_context++; 630 else 631 leading_context++; 632 last_line_type = context_line; 633 } 634 else if (original_lines > 0 && c == del) 635 { 636 /* It's a "deleted" line in the hunk. */ 637 hunk_seen = TRUE; 638 changed_line_seen = TRUE; 639 640 /* A hunk may have context in the middle. We only want 641 trailing lines of context. */ 642 if (trailing_context > 0) 643 trailing_context = 0; 644 645 original_lines--; 646 last_line_type = original_line; 647 } 648 else if (modified_lines > 0 && c == add) 649 { 650 /* It's an "added" line in the hunk. */ 651 hunk_seen = TRUE; 652 changed_line_seen = TRUE; 653 654 /* A hunk may have context in the middle. We only want 655 trailing lines of context. */ 656 if (trailing_context > 0) 657 trailing_context = 0; 658 659 modified_lines--; 660 last_line_type = modified_line; 661 } 662 else 663 { 664 if (eof) 665 { 666 /* The hunk ends at EOF. */ 667 end = pos; 668 } 669 else 670 { 671 /* The start of the current line marks the first byte 672 * after the hunk text. */ 673 end = last_line; 674 } 675 676 if (original_end == 0) 677 original_end = end; 678 if (modified_end == 0) 679 modified_end = end; 680 break; /* Hunk was empty or has been read. */ 681 } 682 } 683 else 684 { 685 if (starts_with(line->data, text_atat)) 686 { 687 /* Looks like we have a hunk header, try to rip it apart. 
*/ 688 in_hunk = parse_hunk_header(line->data, *hunk, text_atat, 689 iterpool); 690 if (in_hunk) 691 { 692 original_lines = (*hunk)->original_length; 693 modified_lines = (*hunk)->modified_length; 694 *is_property = FALSE; 695 } 696 } 697 else if (starts_with(line->data, prop_atat)) 698 { 699 /* Looks like we have a property hunk header, try to rip it 700 * apart. */ 701 in_hunk = parse_hunk_header(line->data, *hunk, prop_atat, 702 iterpool); 703 if (in_hunk) 704 { 705 original_lines = (*hunk)->original_length; 706 modified_lines = (*hunk)->modified_length; 707 *is_property = TRUE; 708 } 709 } 710 else if (starts_with(line->data, "Added: ")) 711 { 712 SVN_ERR(parse_prop_name(prop_name, line->data, "Added: ", 713 result_pool)); 714 if (*prop_name) 715 *prop_operation = svn_diff_op_added; 716 } 717 else if (starts_with(line->data, "Deleted: ")) 718 { 719 SVN_ERR(parse_prop_name(prop_name, line->data, "Deleted: ", 720 result_pool)); 721 if (*prop_name) 722 *prop_operation = svn_diff_op_deleted; 723 } 724 else if (starts_with(line->data, "Modified: ")) 725 { 726 SVN_ERR(parse_prop_name(prop_name, line->data, "Modified: ", 727 result_pool)); 728 if (*prop_name) 729 *prop_operation = svn_diff_op_modified; 730 } 731 else if (starts_with(line->data, minus) 732 || starts_with(line->data, "diff --git ")) 733 /* This could be a header of another patch. Bail out. */ 734 break; 735 } 736 } 737 /* Check for the line length since a file may not have a newline at the 738 * end and we depend upon the last line to be an empty one. */ 739 while (! eof || line->len > 0); 740 svn_pool_destroy(iterpool); 741 742 if (! eof) 743 /* Rewind to the start of the line just read, so subsequent calls 744 * to this function or svn_diff_parse_next_patch() don't end 745 * up skipping the line -- it may contain a patch or hunk header. 
*/ 746 SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &last_line, scratch_pool)); 747 748 if (hunk_seen && start < end) 749 { 750 (*hunk)->patch = patch; 751 (*hunk)->apr_file = apr_file; 752 (*hunk)->leading_context = leading_context; 753 (*hunk)->trailing_context = trailing_context; 754 (*hunk)->diff_text_range.start = start; 755 (*hunk)->diff_text_range.current = start; 756 (*hunk)->diff_text_range.end = end; 757 (*hunk)->original_text_range.start = start; 758 (*hunk)->original_text_range.current = start; 759 (*hunk)->original_text_range.end = original_end; 760 (*hunk)->modified_text_range.start = start; 761 (*hunk)->modified_text_range.current = start; 762 (*hunk)->modified_text_range.end = modified_end; 763 } 764 else 765 /* Something went wrong, just discard the result. */ 766 *hunk = NULL; 767 768 return SVN_NO_ERROR; 769} 770 771/* Compare function for sorting hunks after parsing. 772 * We sort hunks by their original line offset. */ 773static int 774compare_hunks(const void *a, const void *b) 775{ 776 const svn_diff_hunk_t *ha = *((const svn_diff_hunk_t *const *)a); 777 const svn_diff_hunk_t *hb = *((const svn_diff_hunk_t *const *)b); 778 779 if (ha->original_start < hb->original_start) 780 return -1; 781 if (ha->original_start > hb->original_start) 782 return 1; 783 return 0; 784} 785 786/* Possible states of the diff header parser. 
*/ 787enum parse_state 788{ 789 state_start, /* initial */ 790 state_git_diff_seen, /* diff --git */ 791 state_git_tree_seen, /* a tree operation, rather then content change */ 792 state_git_minus_seen, /* --- /dev/null; or --- a/ */ 793 state_git_plus_seen, /* +++ /dev/null; or +++ a/ */ 794 state_move_from_seen, /* rename from foo.c */ 795 state_copy_from_seen, /* copy from foo.c */ 796 state_minus_seen, /* --- foo.c */ 797 state_unidiff_found, /* valid start of a regular unidiff header */ 798 state_git_header_found /* valid start of a --git diff header */ 799}; 800 801/* Data type describing a valid state transition of the parser. */ 802struct transition 803{ 804 const char *expected_input; 805 enum parse_state required_state; 806 807 /* A callback called upon each parser state transition. */ 808 svn_error_t *(*fn)(enum parse_state *new_state, char *input, 809 svn_patch_t *patch, apr_pool_t *result_pool, 810 apr_pool_t *scratch_pool); 811}; 812 813/* UTF-8 encode and canonicalize the content of LINE as FILE_NAME. */ 814static svn_error_t * 815grab_filename(const char **file_name, const char *line, apr_pool_t *result_pool, 816 apr_pool_t *scratch_pool) 817{ 818 const char *utf8_path; 819 const char *canon_path; 820 821 /* Grab the filename and encode it in UTF-8. */ 822 /* TODO: Allow specifying the patch file's encoding. 823 * For now, we assume its encoding is native. */ 824 /* ### This can fail if the filename cannot be represented in the current 825 * ### locale's encoding. */ 826 SVN_ERR(svn_utf_cstring_to_utf8(&utf8_path, 827 line, 828 scratch_pool)); 829 830 /* Canonicalize the path name. */ 831 canon_path = svn_dirent_canonicalize(utf8_path, scratch_pool); 832 833 *file_name = apr_pstrdup(result_pool, canon_path); 834 835 return SVN_NO_ERROR; 836} 837 838/* Parse the '--- ' line of a regular unidiff. 
*/ 839static svn_error_t * 840diff_minus(enum parse_state *new_state, char *line, svn_patch_t *patch, 841 apr_pool_t *result_pool, apr_pool_t *scratch_pool) 842{ 843 /* If we can find a tab, it separates the filename from 844 * the rest of the line which we can discard. */ 845 char *tab = strchr(line, '\t'); 846 if (tab) 847 *tab = '\0'; 848 849 SVN_ERR(grab_filename(&patch->old_filename, line + STRLEN_LITERAL("--- "), 850 result_pool, scratch_pool)); 851 852 *new_state = state_minus_seen; 853 854 return SVN_NO_ERROR; 855} 856 857/* Parse the '+++ ' line of a regular unidiff. */ 858static svn_error_t * 859diff_plus(enum parse_state *new_state, char *line, svn_patch_t *patch, 860 apr_pool_t *result_pool, apr_pool_t *scratch_pool) 861{ 862 /* If we can find a tab, it separates the filename from 863 * the rest of the line which we can discard. */ 864 char *tab = strchr(line, '\t'); 865 if (tab) 866 *tab = '\0'; 867 868 SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("+++ "), 869 result_pool, scratch_pool)); 870 871 *new_state = state_unidiff_found; 872 873 return SVN_NO_ERROR; 874} 875 876/* Parse the first line of a git extended unidiff. */ 877static svn_error_t * 878git_start(enum parse_state *new_state, char *line, svn_patch_t *patch, 879 apr_pool_t *result_pool, apr_pool_t *scratch_pool) 880{ 881 const char *old_path_start; 882 char *old_path_end; 883 const char *new_path_start; 884 const char *new_path_end; 885 char *new_path_marker; 886 const char *old_path_marker; 887 888 /* ### Add handling of escaped paths 889 * http://www.kernel.org/pub/software/scm/git/docs/git-diff.html: 890 * 891 * TAB, LF, double quote and backslash characters in pathnames are 892 * represented as \t, \n, \" and \\, respectively. If there is need for 893 * such substitution then the whole pathname is put in double quotes. 894 */ 895 896 /* Our line should look like this: 'diff --git a/path b/path'. 
897 * 898 * If we find any deviations from that format, we return with state reset 899 * to start. 900 */ 901 old_path_marker = strstr(line, " a/"); 902 903 if (! old_path_marker) 904 { 905 *new_state = state_start; 906 return SVN_NO_ERROR; 907 } 908 909 if (! *(old_path_marker + 3)) 910 { 911 *new_state = state_start; 912 return SVN_NO_ERROR; 913 } 914 915 new_path_marker = strstr(old_path_marker, " b/"); 916 917 if (! new_path_marker) 918 { 919 *new_state = state_start; 920 return SVN_NO_ERROR; 921 } 922 923 if (! *(new_path_marker + 3)) 924 { 925 *new_state = state_start; 926 return SVN_NO_ERROR; 927 } 928 929 /* By now, we know that we have a line on the form '--git diff a/.+ b/.+' 930 * We only need the filenames when we have deleted or added empty 931 * files. In those cases the old_path and new_path is identical on the 932 * 'diff --git' line. For all other cases we fetch the filenames from 933 * other header lines. */ 934 old_path_start = line + STRLEN_LITERAL("diff --git a/"); 935 new_path_end = line + strlen(line); 936 new_path_start = old_path_start; 937 938 while (TRUE) 939 { 940 ptrdiff_t len_old; 941 ptrdiff_t len_new; 942 943 new_path_marker = strstr(new_path_start, " b/"); 944 945 /* No new path marker, bail out. */ 946 if (! new_path_marker) 947 break; 948 949 old_path_end = new_path_marker; 950 new_path_start = new_path_marker + STRLEN_LITERAL(" b/"); 951 952 /* No path after the marker. */ 953 if (! *new_path_start) 954 break; 955 956 len_old = old_path_end - old_path_start; 957 len_new = new_path_end - new_path_start; 958 959 /* Are the paths before and after the " b/" marker the same? */ 960 if (len_old == len_new 961 && ! 
strncmp(old_path_start, new_path_start, len_old)) 962 { 963 *old_path_end = '\0'; 964 SVN_ERR(grab_filename(&patch->old_filename, old_path_start, 965 result_pool, scratch_pool)); 966 967 SVN_ERR(grab_filename(&patch->new_filename, new_path_start, 968 result_pool, scratch_pool)); 969 break; 970 } 971 } 972 973 /* We assume that the path is only modified until we've found a 'tree' 974 * header */ 975 patch->operation = svn_diff_op_modified; 976 977 *new_state = state_git_diff_seen; 978 return SVN_NO_ERROR; 979} 980 981/* Parse the '--- ' line of a git extended unidiff. */ 982static svn_error_t * 983git_minus(enum parse_state *new_state, char *line, svn_patch_t *patch, 984 apr_pool_t *result_pool, apr_pool_t *scratch_pool) 985{ 986 /* If we can find a tab, it separates the filename from 987 * the rest of the line which we can discard. */ 988 char *tab = strchr(line, '\t'); 989 if (tab) 990 *tab = '\0'; 991 992 if (starts_with(line, "--- /dev/null")) 993 SVN_ERR(grab_filename(&patch->old_filename, "/dev/null", 994 result_pool, scratch_pool)); 995 else 996 SVN_ERR(grab_filename(&patch->old_filename, line + STRLEN_LITERAL("--- a/"), 997 result_pool, scratch_pool)); 998 999 *new_state = state_git_minus_seen; 1000 return SVN_NO_ERROR; 1001} 1002 1003/* Parse the '+++ ' line of a git extended unidiff. */ 1004static svn_error_t * 1005git_plus(enum parse_state *new_state, char *line, svn_patch_t *patch, 1006 apr_pool_t *result_pool, apr_pool_t *scratch_pool) 1007{ 1008 /* If we can find a tab, it separates the filename from 1009 * the rest of the line which we can discard. 
*/ 1010 char *tab = strchr(line, '\t'); 1011 if (tab) 1012 *tab = '\0'; 1013 1014 if (starts_with(line, "+++ /dev/null")) 1015 SVN_ERR(grab_filename(&patch->new_filename, "/dev/null", 1016 result_pool, scratch_pool)); 1017 else 1018 SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("+++ b/"), 1019 result_pool, scratch_pool)); 1020 1021 *new_state = state_git_header_found; 1022 return SVN_NO_ERROR; 1023} 1024 1025/* Parse the 'rename from ' line of a git extended unidiff. */ 1026static svn_error_t * 1027git_move_from(enum parse_state *new_state, char *line, svn_patch_t *patch, 1028 apr_pool_t *result_pool, apr_pool_t *scratch_pool) 1029{ 1030 SVN_ERR(grab_filename(&patch->old_filename, 1031 line + STRLEN_LITERAL("rename from "), 1032 result_pool, scratch_pool)); 1033 1034 *new_state = state_move_from_seen; 1035 return SVN_NO_ERROR; 1036} 1037 1038/* Parse the 'rename to ' line of a git extended unidiff. */ 1039static svn_error_t * 1040git_move_to(enum parse_state *new_state, char *line, svn_patch_t *patch, 1041 apr_pool_t *result_pool, apr_pool_t *scratch_pool) 1042{ 1043 SVN_ERR(grab_filename(&patch->new_filename, 1044 line + STRLEN_LITERAL("rename to "), 1045 result_pool, scratch_pool)); 1046 1047 patch->operation = svn_diff_op_moved; 1048 1049 *new_state = state_git_tree_seen; 1050 return SVN_NO_ERROR; 1051} 1052 1053/* Parse the 'copy from ' line of a git extended unidiff. */ 1054static svn_error_t * 1055git_copy_from(enum parse_state *new_state, char *line, svn_patch_t *patch, 1056 apr_pool_t *result_pool, apr_pool_t *scratch_pool) 1057{ 1058 SVN_ERR(grab_filename(&patch->old_filename, 1059 line + STRLEN_LITERAL("copy from "), 1060 result_pool, scratch_pool)); 1061 1062 *new_state = state_copy_from_seen; 1063 return SVN_NO_ERROR; 1064} 1065 1066/* Parse the 'copy to ' line of a git extended unidiff. 
*/ 1067static svn_error_t * 1068git_copy_to(enum parse_state *new_state, char *line, svn_patch_t *patch, 1069 apr_pool_t *result_pool, apr_pool_t *scratch_pool) 1070{ 1071 SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("copy to "), 1072 result_pool, scratch_pool)); 1073 1074 patch->operation = svn_diff_op_copied; 1075 1076 *new_state = state_git_tree_seen; 1077 return SVN_NO_ERROR; 1078} 1079 1080/* Parse the 'new file ' line of a git extended unidiff. */ 1081static svn_error_t * 1082git_new_file(enum parse_state *new_state, char *line, svn_patch_t *patch, 1083 apr_pool_t *result_pool, apr_pool_t *scratch_pool) 1084{ 1085 patch->operation = svn_diff_op_added; 1086 1087 /* Filename already retrieved from diff --git header. */ 1088 1089 *new_state = state_git_tree_seen; 1090 return SVN_NO_ERROR; 1091} 1092 1093/* Parse the 'deleted file ' line of a git extended unidiff. */ 1094static svn_error_t * 1095git_deleted_file(enum parse_state *new_state, char *line, svn_patch_t *patch, 1096 apr_pool_t *result_pool, apr_pool_t *scratch_pool) 1097{ 1098 patch->operation = svn_diff_op_deleted; 1099 1100 /* Filename already retrieved from diff --git header. */ 1101 1102 *new_state = state_git_tree_seen; 1103 return SVN_NO_ERROR; 1104} 1105 1106/* Add a HUNK associated with the property PROP_NAME to PATCH. */ 1107static svn_error_t * 1108add_property_hunk(svn_patch_t *patch, const char *prop_name, 1109 svn_diff_hunk_t *hunk, svn_diff_operation_kind_t operation, 1110 apr_pool_t *result_pool) 1111{ 1112 svn_prop_patch_t *prop_patch; 1113 1114 prop_patch = svn_hash_gets(patch->prop_patches, prop_name); 1115 1116 if (! 
prop_patch) 1117 { 1118 prop_patch = apr_palloc(result_pool, sizeof(svn_prop_patch_t)); 1119 prop_patch->name = prop_name; 1120 prop_patch->operation = operation; 1121 prop_patch->hunks = apr_array_make(result_pool, 1, 1122 sizeof(svn_diff_hunk_t *)); 1123 1124 svn_hash_sets(patch->prop_patches, prop_name, prop_patch); 1125 } 1126 1127 APR_ARRAY_PUSH(prop_patch->hunks, svn_diff_hunk_t *) = hunk; 1128 1129 return SVN_NO_ERROR; 1130} 1131 1132struct svn_patch_file_t 1133{ 1134 /* The APR file handle to the patch file. */ 1135 apr_file_t *apr_file; 1136 1137 /* The file offset at which the next patch is expected. */ 1138 apr_off_t next_patch_offset; 1139}; 1140 1141svn_error_t * 1142svn_diff_open_patch_file(svn_patch_file_t **patch_file, 1143 const char *local_abspath, 1144 apr_pool_t *result_pool) 1145{ 1146 svn_patch_file_t *p; 1147 1148 p = apr_palloc(result_pool, sizeof(*p)); 1149 SVN_ERR(svn_io_file_open(&p->apr_file, local_abspath, 1150 APR_READ | APR_BUFFERED, APR_OS_DEFAULT, 1151 result_pool)); 1152 p->next_patch_offset = 0; 1153 *patch_file = p; 1154 1155 return SVN_NO_ERROR; 1156} 1157 1158/* Parse hunks from APR_FILE and store them in PATCH->HUNKS. 1159 * Parsing stops if no valid next hunk can be found. 1160 * If IGNORE_WHITESPACE is TRUE, lines without 1161 * leading spaces will be treated as context lines. 1162 * Allocate results in RESULT_POOL. 1163 * Use SCRATCH_POOL for temporary allocations. 
*/
static svn_error_t *
parse_hunks(svn_patch_t *patch, apr_file_t *apr_file,
            svn_boolean_t ignore_whitespace,
            apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  svn_diff_hunk_t *hunk;
  svn_boolean_t is_property;
  const char *prop_name;
  /* Name of the property the most recent property hunk belonged to;
   * reset to NULL whenever a text hunk is seen. */
  const char *last_prop_name = NULL;
  svn_diff_operation_kind_t prop_operation;
  apr_pool_t *iterpool;

  patch->hunks = apr_array_make(result_pool, 10, sizeof(svn_diff_hunk_t *));
  patch->prop_patches = apr_hash_make(result_pool);

  iterpool = svn_pool_create(scratch_pool);
  for (;;)
    {
      svn_pool_clear(iterpool);

      SVN_ERR(parse_next_hunk(&hunk, &is_property, &prop_name, &prop_operation,
                              patch, apr_file, ignore_whitespace, result_pool,
                              iterpool));

      /* No further hunk: we are done. */
      if (hunk == NULL)
        break;

      if (! is_property)
        {
          /* Ordinary text hunk. */
          APR_ARRAY_PUSH(patch->hunks, svn_diff_hunk_t *) = hunk;
          last_prop_name = NULL;
          continue;
        }

      /* Property hunk.  A hunk that comes without a name is filed under
       * the name of the preceding property hunk.
       * NOTE(review): if the very first property hunk has no name,
       * PROP_NAME is still NULL below -- confirm that parse_next_hunk()
       * never reports such a hunk. */
      if (prop_name)
        last_prop_name = prop_name;
      else
        prop_name = last_prop_name;

      SVN_ERR(add_property_hunk(patch, prop_name, hunk, prop_operation,
                                result_pool));
    }
  svn_pool_destroy(iterpool);

  return SVN_NO_ERROR;
}

/* State machine for the diff header parser.
1212 * Expected Input Required state Function to call */ 1213static struct transition transitions[] = 1214{ 1215 {"--- ", state_start, diff_minus}, 1216 {"+++ ", state_minus_seen, diff_plus}, 1217 {"diff --git", state_start, git_start}, 1218 {"--- a/", state_git_diff_seen, git_minus}, 1219 {"--- a/", state_git_tree_seen, git_minus}, 1220 {"--- /dev/null", state_git_tree_seen, git_minus}, 1221 {"+++ b/", state_git_minus_seen, git_plus}, 1222 {"+++ /dev/null", state_git_minus_seen, git_plus}, 1223 {"rename from ", state_git_diff_seen, git_move_from}, 1224 {"rename to ", state_move_from_seen, git_move_to}, 1225 {"copy from ", state_git_diff_seen, git_copy_from}, 1226 {"copy to ", state_copy_from_seen, git_copy_to}, 1227 {"new file ", state_git_diff_seen, git_new_file}, 1228 {"deleted file ", state_git_diff_seen, git_deleted_file}, 1229}; 1230 1231svn_error_t * 1232svn_diff_parse_next_patch(svn_patch_t **patch, 1233 svn_patch_file_t *patch_file, 1234 svn_boolean_t reverse, 1235 svn_boolean_t ignore_whitespace, 1236 apr_pool_t *result_pool, 1237 apr_pool_t *scratch_pool) 1238{ 1239 apr_off_t pos, last_line; 1240 svn_boolean_t eof; 1241 svn_boolean_t line_after_tree_header_read = FALSE; 1242 apr_pool_t *iterpool; 1243 enum parse_state state = state_start; 1244 1245 if (apr_file_eof(patch_file->apr_file) == APR_EOF) 1246 { 1247 /* No more patches here. */ 1248 *patch = NULL; 1249 return SVN_NO_ERROR; 1250 } 1251 1252 *patch = apr_pcalloc(result_pool, sizeof(**patch)); 1253 1254 pos = patch_file->next_patch_offset; 1255 SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &pos, scratch_pool)); 1256 1257 iterpool = svn_pool_create(scratch_pool); 1258 do 1259 { 1260 svn_stringbuf_t *line; 1261 svn_boolean_t valid_header_line = FALSE; 1262 int i; 1263 1264 svn_pool_clear(iterpool); 1265 1266 /* Remember the current line's offset, and read the line. 
*/ 1267 last_line = pos; 1268 SVN_ERR(svn_io_file_readline(patch_file->apr_file, &line, NULL, &eof, 1269 APR_SIZE_MAX, iterpool, iterpool)); 1270 1271 if (! eof) 1272 { 1273 /* Update line offset for next iteration. */ 1274 pos = 0; 1275 SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_CUR, &pos, 1276 iterpool)); 1277 } 1278 1279 /* Run the state machine. */ 1280 for (i = 0; i < (sizeof(transitions) / sizeof(transitions[0])); i++) 1281 { 1282 if (starts_with(line->data, transitions[i].expected_input) 1283 && state == transitions[i].required_state) 1284 { 1285 SVN_ERR(transitions[i].fn(&state, line->data, *patch, 1286 result_pool, iterpool)); 1287 valid_header_line = TRUE; 1288 break; 1289 } 1290 } 1291 1292 if (state == state_unidiff_found || state == state_git_header_found) 1293 { 1294 /* We have a valid diff header, yay! */ 1295 break; 1296 } 1297 else if (state == state_git_tree_seen && line_after_tree_header_read) 1298 { 1299 /* git patches can contain an index line after the file mode line */ 1300 if (!starts_with(line->data, "index ")) 1301 { 1302 /* We have a valid diff header for a patch with only tree changes. 1303 * Rewind to the start of the line just read, so subsequent calls 1304 * to this function don't end up skipping the line -- it may 1305 * contain a patch. */ 1306 SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &last_line, 1307 scratch_pool)); 1308 break; 1309 } 1310 } 1311 else if (state == state_git_tree_seen) 1312 { 1313 line_after_tree_header_read = TRUE; 1314 } 1315 else if (! valid_header_line && state != state_start 1316 && state != state_git_diff_seen 1317 && !starts_with(line->data, "index ")) 1318 { 1319 /* We've encountered an invalid diff header. 1320 * 1321 * Rewind to the start of the line just read - it may be a new 1322 * header that begins there. */ 1323 SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &last_line, 1324 scratch_pool)); 1325 state = state_start; 1326 } 1327 1328 } 1329 while (! 
eof); 1330 1331 (*patch)->reverse = reverse; 1332 if (reverse) 1333 { 1334 const char *temp; 1335 temp = (*patch)->old_filename; 1336 (*patch)->old_filename = (*patch)->new_filename; 1337 (*patch)->new_filename = temp; 1338 } 1339 1340 if ((*patch)->old_filename == NULL || (*patch)->new_filename == NULL) 1341 { 1342 /* Something went wrong, just discard the result. */ 1343 *patch = NULL; 1344 } 1345 else 1346 SVN_ERR(parse_hunks(*patch, patch_file->apr_file, ignore_whitespace, 1347 result_pool, iterpool)); 1348 1349 svn_pool_destroy(iterpool); 1350 1351 patch_file->next_patch_offset = 0; 1352 SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_CUR, 1353 &patch_file->next_patch_offset, scratch_pool)); 1354 1355 if (*patch) 1356 { 1357 /* Usually, hunks appear in the patch sorted by their original line 1358 * offset. But just in case they weren't parsed in this order for 1359 * some reason, we sort them so that our caller can assume that hunks 1360 * are sorted as if parsed from a usual patch. */ 1361 qsort((*patch)->hunks->elts, (*patch)->hunks->nelts, 1362 (*patch)->hunks->elt_size, compare_hunks); 1363 } 1364 1365 return SVN_NO_ERROR; 1366} 1367 1368svn_error_t * 1369svn_diff_close_patch_file(svn_patch_file_t *patch_file, 1370 apr_pool_t *scratch_pool) 1371{ 1372 return svn_error_trace(svn_io_file_close(patch_file->apr_file, 1373 scratch_pool)); 1374} 1375