1139743Simp/* 243412Snewton * lcs.c : routines for creating an lcs 343412Snewton * 443412Snewton * ==================================================================== 543412Snewton * Licensed to the Apache Software Foundation (ASF) under one 643412Snewton * or more contributor license agreements. See the NOTICE file 743412Snewton * distributed with this work for additional information 843412Snewton * regarding copyright ownership. The ASF licenses this file 943412Snewton * to you under the Apache License, Version 2.0 (the 1043412Snewton * "License"); you may not use this file except in compliance 1143412Snewton * with the License. You may obtain a copy of the License at 1243412Snewton * 1343412Snewton * http://www.apache.org/licenses/LICENSE-2.0 1443412Snewton * 1543412Snewton * Unless required by applicable law or agreed to in writing, 1643412Snewton * software distributed under the License is distributed on an 1743412Snewton * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 1843412Snewton * KIND, either express or implied. See the License for the 1943412Snewton * specific language governing permissions and limitations 2043412Snewton * under the License. 2143412Snewton * ==================================================================== 2243412Snewton */ 2343412Snewton 2443412Snewton 2543412Snewton#include <apr.h> 2643412Snewton#include <apr_pools.h> 2743412Snewton#include <apr_general.h> 2843412Snewton 29116174Sobrien#include "diff.h" 30116174Sobrien 31116174Sobrien 3243412Snewton/* 3343412Snewton * Calculate the Longest Common Subsequence (LCS) between two datasources. 3443412Snewton * This function is what makes the diff code tick. 3543412Snewton * 3643412Snewton * The LCS algorithm implemented here is based on the approach described 3790002Salfred * by Sun Wu, Udi Manber and Gene Meyers in "An O(NP) Sequence Comparison 3843412Snewton * Algorithm", but has been modified for better performance. 3943412Snewton * 4043412Snewton * Let M and N be the lengths (number of tokens) of the two sources 4143412Snewton * ('files'). The goal is to reach the end of both sources (files) with the 4290002Salfred * minimum number of insertions + deletions. Since there is a known length 4343412Snewton * difference N-M between the files, that is equivalent to just the minimum 4443412Snewton * number of deletions, or equivalently the minimum number of insertions. 4543412Snewton * For symmetry, we use the lesser number - deletions if M<N, insertions if 4665302Sobrien * M>N. 4765302Sobrien * 4865302Sobrien * Let 'k' be the difference in remaining length between the files, i.e. 4965302Sobrien * if we're at the beginning of both files, k=N-M, whereas k=0 for the 5065302Sobrien * 'end state', at the end of both files. An insertion will increase k by 5165302Sobrien * one, while a deletion decreases k by one. If k<0, then insertions are 5265302Sobrien * 'free' - we need those to reach the end state k=0 anyway - but deletions 5343412Snewton * are costly: Adding a deletion means that we will have to add an additional 5443412Snewton * insertion later to reach the end state, so it doesn't matter if we count 5543412Snewton * deletions or insertions. Similarly, deletions are free for k>0. 5643412Snewton * 5783366Sjulian * Let a 'state' be a given position in each file {pos1, pos2}. An array 5883366Sjulian * 'fp' keeps track of the best possible state (largest values of 5943412Snewton * {pos1, pos2}) that can be achieved for a given cost 'p' (# moves away 6043412Snewton * from k=0), as well as a linked list of what matches were used to reach 6143412Snewton * that state. For each new value of p, we find for each value of k the 6243412Snewton * best achievable state for that k - either by doing a costly operation 6343412Snewton * (deletion if k<0) from a state achieved at a lower p, or doing a free 6443412Snewton * operation (insertion if k<0) from a state achieved at the same p - 6543412Snewton * and in both cases advancing past any matching regions found. This is 6643412Snewton * handled by running loops over k in order of descending absolute value. 67210197Strasz * 68125454Sjhb * A recent improvement of the algorithm is to ignore tokens that are unique 69121275Stjr * to one file or the other, as those are known from the start to be 70107849Salfred * impossible to match. 71107849Salfred */ 72107849Salfred 7343412Snewtontypedef struct svn_diff__snake_t svn_diff__snake_t; 74107849Salfred 75111119Simpstruct svn_diff__snake_t 7643412Snewton{ 7783366Sjulian apr_off_t y; 7843412Snewton svn_diff__lcs_t *lcs; 79107849Salfred svn_diff__position_t *position[2]; 8043412Snewton}; 8143412Snewton 8243412Snewtonstatic APR_INLINE void 8343412Snewtonsvn_diff__snake(svn_diff__snake_t *fp_k, 84107849Salfred svn_diff__token_index_t *token_counts[2], 8543412Snewton svn_diff__lcs_t **freelist, 8643412Snewton apr_pool_t *pool) 8743412Snewton{ 8843412Snewton svn_diff__position_t *start_position[2]; 89107849Salfred svn_diff__position_t *position[2]; 9043412Snewton svn_diff__lcs_t *lcs; 9143412Snewton svn_diff__lcs_t *previous_lcs; 9243412Snewton 9343412Snewton /* The previous entry at fp[k] is going to be replaced. See if we 9443412Snewton * can mark that lcs node for reuse, because the sequence up to this 9543412Snewton * point was a dead end. 9643412Snewton */ 9743412Snewton lcs = fp_k[0].lcs; 9843412Snewton while (lcs) 9943412Snewton { 10043412Snewton lcs->refcount--; 10183366Sjulian if (lcs->refcount) 10283366Sjulian break; 10343412Snewton 10443412Snewton previous_lcs = lcs->next; 10543412Snewton lcs->next = *freelist; 10643412Snewton *freelist = lcs; 10743412Snewton lcs = previous_lcs; 10843412Snewton } 10943412Snewton 11043412Snewton if (fp_k[-1].y >= fp_k[1].y) 11143412Snewton { 112107849Salfred start_position[0] = fp_k[-1].position[0]; 113107849Salfred start_position[1] = fp_k[-1].position[1]->next; 114107849Salfred 11543412Snewton previous_lcs = fp_k[-1].lcs; 11689319Salfred } 11743412Snewton else 11843412Snewton { 11943412Snewton start_position[0] = fp_k[1].position[0]->next; 12043412Snewton start_position[1] = fp_k[1].position[1]; 12143412Snewton 122109153Sdillon previous_lcs = fp_k[1].lcs; 123107849Salfred } 12497658Stanimura 125107849Salfred 12697658Stanimura if (previous_lcs) 12797658Stanimura { 12897658Stanimura previous_lcs->refcount++; 12996972Stanimura } 13096972Stanimura 13196972Stanimura /* ### Optimization, skip all positions that don't have matchpoints 13243412Snewton * ### anyway. Beware of the sentinel, don't skip it! 13343412Snewton */ 13483366Sjulian 13543412Snewton position[0] = start_position[0]; 13643412Snewton position[1] = start_position[1]; 137107849Salfred 13843412Snewton while (1) 139114983Sjhb { 140114983Sjhb while (position[0]->token_index == position[1]->token_index) 141114983Sjhb { 142114983Sjhb position[0] = position[0]->next; 143114983Sjhb position[1] = position[1]->next; 144114983Sjhb } 145114983Sjhb 146112888Sjeff if (position[1] != start_position[1]) 147114983Sjhb { 148114983Sjhb lcs = *freelist; 149112888Sjeff if (lcs) 150114983Sjhb { 151114983Sjhb *freelist = lcs->next; 152114983Sjhb } 153114983Sjhb else 15443412Snewton { 15543412Snewton lcs = apr_palloc(pool, sizeof(*lcs)); 15643412Snewton } 15743412Snewton 15843412Snewton lcs->position[0] = start_position[0]; 15943412Snewton lcs->position[1] = start_position[1]; 16043412Snewton lcs->length = position[1]->offset - start_position[1]->offset; 16189306Salfred lcs->next = previous_lcs; 16243412Snewton lcs->refcount = 1; 16343412Snewton previous_lcs = lcs; 16443412Snewton start_position[0] = position[0]; 16543412Snewton start_position[1] = position[1]; 16643412Snewton } 16743412Snewton 16843412Snewton /* Skip any and all tokens that only occur in one of the files */ 16983366Sjulian if (position[0]->token_index >= 0 17083366Sjulian && token_counts[1][position[0]->token_index] == 0) 17143412Snewton start_position[0] = position[0] = position[0]->next; 17243412Snewton else if (position[1]->token_index >= 0 17343412Snewton && token_counts[0][position[1]->token_index] == 0) 17443412Snewton start_position[1] = position[1] = position[1]->next; 17543412Snewton else 17643412Snewton break; 177107849Salfred } 178107849Salfred 179107849Salfred fp_k[0].lcs = previous_lcs; 18043412Snewton fp_k[0].position[0] = position[0]; 18183366Sjulian fp_k[0].position[1] = position[1]; 18243412Snewton 18343412Snewton fp_k[0].y = position[1]->offset; 184107849Salfred} 18543412Snewton 18643412Snewton 18743412Snewtonstatic svn_diff__lcs_t * 18843412Snewtonsvn_diff__lcs_reverse(svn_diff__lcs_t *lcs) 18943412Snewton{ 19043412Snewton svn_diff__lcs_t *next; 19183366Sjulian svn_diff__lcs_t *prev; 19243412Snewton 19383366Sjulian next = NULL; 19443412Snewton while (lcs != NULL) 19543412Snewton { 19643412Snewton prev = lcs->next; 19743412Snewton lcs->next = next; 19843412Snewton next = lcs; 19943412Snewton lcs = prev; 20043412Snewton } 20183366Sjulian 20243412Snewton return next; 20343412Snewton} 20443412Snewton 20543412Snewton 20643412Snewton/* Prepends a new lcs chunk for the amount of LINES at the given positions 207168355Srwatson * POS0_OFFSET and POS1_OFFSET to the given LCS chain, and returns it. 20843412Snewton * This function assumes LINES > 0. */ 209168355Srwatsonstatic svn_diff__lcs_t * 21043412Snewtonprepend_lcs(svn_diff__lcs_t *lcs, apr_off_t lines, 21143412Snewton apr_off_t pos0_offset, apr_off_t pos1_offset, 21243412Snewton apr_pool_t *pool) 213168355Srwatson{ 21443412Snewton svn_diff__lcs_t *new_lcs; 215168355Srwatson 21643412Snewton SVN_ERR_ASSERT_NO_RETURN(lines > 0); 21743412Snewton 21843412Snewton new_lcs = apr_palloc(pool, sizeof(*new_lcs)); 21943412Snewton new_lcs->position[0] = apr_pcalloc(pool, sizeof(*new_lcs->position[0])); 22043412Snewton new_lcs->position[0]->offset = pos0_offset; 22143412Snewton new_lcs->position[1] = apr_pcalloc(pool, sizeof(*new_lcs->position[1])); 22243412Snewton new_lcs->position[1]->offset = pos1_offset; 22343412Snewton new_lcs->length = lines; 22443412Snewton new_lcs->refcount = 1; 22543412Snewton new_lcs->next = lcs; 22643412Snewton 22743412Snewton return new_lcs; 22843412Snewton} 22943412Snewton 23043412Snewton 23143412Snewtonsvn_diff__lcs_t * 23243412Snewtonsvn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) */ 23343412Snewton svn_diff__position_t *position_list2, /* pointer to tail (ring) */ 23443412Snewton svn_diff__token_index_t *token_counts_list1, /* array of counts */ 23543412Snewton svn_diff__token_index_t *token_counts_list2, /* array of counts */ 23643412Snewton svn_diff__token_index_t num_tokens, 237102003Srwatson apr_off_t prefix_lines, 23843412Snewton apr_off_t suffix_lines, 23943412Snewton apr_pool_t *pool) 24043412Snewton{ 24143412Snewton apr_off_t length[2]; 24243412Snewton svn_diff__token_index_t *token_counts[2]; 24343412Snewton svn_diff__token_index_t unique_count[2]; 24443412Snewton svn_diff__token_index_t token_index; 24543412Snewton svn_diff__snake_t *fp; 24643412Snewton apr_off_t d; 24743412Snewton apr_off_t k; 24843412Snewton apr_off_t p = 0; 249 svn_diff__lcs_t *lcs, *lcs_freelist = NULL; 250 251 svn_diff__position_t sentinel_position[2]; 252 253 /* Since EOF is always a sync point we tack on an EOF link 254 * with sentinel positions 255 */ 256 lcs = apr_palloc(pool, sizeof(*lcs)); 257 lcs->position[0] = apr_pcalloc(pool, sizeof(*lcs->position[0])); 258 lcs->position[0]->offset = position_list1 259 ? position_list1->offset + suffix_lines + 1 260 : prefix_lines + suffix_lines + 1; 261 lcs->position[1] = apr_pcalloc(pool, sizeof(*lcs->position[1])); 262 lcs->position[1]->offset = position_list2 263 ? position_list2->offset + suffix_lines + 1 264 : prefix_lines + suffix_lines + 1; 265 lcs->length = 0; 266 lcs->refcount = 1; 267 lcs->next = NULL; 268 269 if (position_list1 == NULL || position_list2 == NULL) 270 { 271 if (suffix_lines) 272 lcs = prepend_lcs(lcs, suffix_lines, 273 lcs->position[0]->offset - suffix_lines, 274 lcs->position[1]->offset - suffix_lines, 275 pool); 276 if (prefix_lines) 277 lcs = prepend_lcs(lcs, prefix_lines, 1, 1, pool); 278 279 return lcs; 280 } 281 282 unique_count[1] = unique_count[0] = 0; 283 for (token_index = 0; token_index < num_tokens; token_index++) 284 { 285 if (token_counts_list1[token_index] == 0) 286 unique_count[1] += token_counts_list2[token_index]; 287 if (token_counts_list2[token_index] == 0) 288 unique_count[0] += token_counts_list1[token_index]; 289 } 290 291 /* Calculate lengths M and N of the sequences to be compared. Do not 292 * count tokens unique to one file, as those are ignored in __snake. 293 */ 294 length[0] = position_list1->offset - position_list1->next->offset + 1 295 - unique_count[0]; 296 length[1] = position_list2->offset - position_list2->next->offset + 1 297 - unique_count[1]; 298 299 /* strikerXXX: here we allocate the furthest point array, which is 300 * strikerXXX: sized M + N + 3 (!) 301 */ 302 fp = apr_pcalloc(pool, 303 sizeof(*fp) * (apr_size_t)(length[0] + length[1] + 3)); 304 305 /* The origo of fp corresponds to the end state, where we are 306 * at the end of both files. The valid states thus span from 307 * -N (at end of first file and at the beginning of the second 308 * file) to +M (the opposite :). Finally, svn_diff__snake needs 309 * 1 extra slot on each side to work. 310 */ 311 fp += length[1] + 1; 312 313 sentinel_position[0].next = position_list1->next; 314 position_list1->next = &sentinel_position[0]; 315 sentinel_position[0].offset = position_list1->offset + 1; 316 token_counts[0] = token_counts_list1; 317 318 sentinel_position[1].next = position_list2->next; 319 position_list2->next = &sentinel_position[1]; 320 sentinel_position[1].offset = position_list2->offset + 1; 321 token_counts[1] = token_counts_list2; 322 323 /* Negative indices will not be used elsewhere 324 */ 325 sentinel_position[0].token_index = -1; 326 sentinel_position[1].token_index = -2; 327 328 /* position d = M - N corresponds to the initial state, where 329 * we are at the beginning of both files. 330 */ 331 d = length[0] - length[1]; 332 333 /* k = d - 1 will be the first to be used to get previous 334 * position information from, make sure it holds sane 335 * data 336 */ 337 fp[d - 1].position[0] = sentinel_position[0].next; 338 fp[d - 1].position[1] = &sentinel_position[1]; 339 340 p = 0; 341 do 342 { 343 /* For k < 0, insertions are free */ 344 for (k = (d < 0 ? d : 0) - p; k < 0; k++) 345 { 346 svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool); 347 } 348 /* for k > 0, deletions are free */ 349 for (k = (d > 0 ? d : 0) + p; k >= 0; k--) 350 { 351 svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool); 352 } 353 354 p++; 355 } 356 while (fp[0].position[1] != &sentinel_position[1]); 357 358 if (suffix_lines) 359 lcs->next = prepend_lcs(fp[0].lcs, suffix_lines, 360 lcs->position[0]->offset - suffix_lines, 361 lcs->position[1]->offset - suffix_lines, 362 pool); 363 else 364 lcs->next = fp[0].lcs; 365 366 lcs = svn_diff__lcs_reverse(lcs); 367 368 position_list1->next = sentinel_position[0].next; 369 position_list2->next = sentinel_position[1].next; 370 371 if (prefix_lines) 372 return prepend_lcs(lcs, prefix_lines, 1, 1, pool); 373 else 374 return lcs; 375} 376