1139743Simp/*
243412Snewton * lcs.c :  routines for creating an lcs
343412Snewton *
443412Snewton * ====================================================================
543412Snewton *    Licensed to the Apache Software Foundation (ASF) under one
643412Snewton *    or more contributor license agreements.  See the NOTICE file
743412Snewton *    distributed with this work for additional information
843412Snewton *    regarding copyright ownership.  The ASF licenses this file
943412Snewton *    to you under the Apache License, Version 2.0 (the
1043412Snewton *    "License"); you may not use this file except in compliance
1143412Snewton *    with the License.  You may obtain a copy of the License at
1243412Snewton *
1343412Snewton *      http://www.apache.org/licenses/LICENSE-2.0
1443412Snewton *
1543412Snewton *    Unless required by applicable law or agreed to in writing,
1643412Snewton *    software distributed under the License is distributed on an
1743412Snewton *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
1843412Snewton *    KIND, either express or implied.  See the License for the
1943412Snewton *    specific language governing permissions and limitations
2043412Snewton *    under the License.
2143412Snewton * ====================================================================
2243412Snewton */
2343412Snewton
2443412Snewton
2543412Snewton#include <apr.h>
2643412Snewton#include <apr_pools.h>
2743412Snewton#include <apr_general.h>
2843412Snewton
29116174Sobrien#include "diff.h"
30116174Sobrien
31116174Sobrien
3243412Snewton/*
3343412Snewton * Calculate the Longest Common Subsequence (LCS) between two datasources.
3443412Snewton * This function is what makes the diff code tick.
3543412Snewton *
3643412Snewton * The LCS algorithm implemented here is based on the approach described
3790002Salfred * by Sun Wu, Udi Manber and Gene Meyers in "An O(NP) Sequence Comparison
3843412Snewton * Algorithm", but has been modified for better performance.
3943412Snewton *
4043412Snewton * Let M and N be the lengths (number of tokens) of the two sources
4143412Snewton * ('files'). The goal is to reach the end of both sources (files) with the
4290002Salfred * minimum number of insertions + deletions. Since there is a known length
4343412Snewton * difference N-M between the files, that is equivalent to just the minimum
4443412Snewton * number of deletions, or equivalently the minimum number of insertions.
4543412Snewton * For symmetry, we use the lesser number - deletions if M<N, insertions if
4665302Sobrien * M>N.
4765302Sobrien *
4865302Sobrien * Let 'k' be the difference in remaining length between the files, i.e.
4965302Sobrien * if we're at the beginning of both files, k=N-M, whereas k=0 for the
5065302Sobrien * 'end state', at the end of both files. An insertion will increase k by
5165302Sobrien * one, while a deletion decreases k by one. If k<0, then insertions are
5265302Sobrien * 'free' - we need those to reach the end state k=0 anyway - but deletions
5343412Snewton * are costly: Adding a deletion means that we will have to add an additional
5443412Snewton * insertion later to reach the end state, so it doesn't matter if we count
5543412Snewton * deletions or insertions. Similarly, deletions are free for k>0.
5643412Snewton *
5783366Sjulian * Let a 'state' be a given position in each file {pos1, pos2}. An array
5883366Sjulian * 'fp' keeps track of the best possible state (largest values of
5943412Snewton * {pos1, pos2}) that can be achieved for a given cost 'p' (# moves away
6043412Snewton * from k=0), as well as a linked list of what matches were used to reach
6143412Snewton * that state. For each new value of p, we find for each value of k the
6243412Snewton * best achievable state for that k - either by doing a costly operation
6343412Snewton * (deletion if k<0) from a state achieved at a lower p, or doing a free
6443412Snewton * operation (insertion if k<0) from a state achieved at the same p -
6543412Snewton * and in both cases advancing past any matching regions found. This is
6643412Snewton * handled by running loops over k in order of descending absolute value.
67210197Strasz *
68125454Sjhb * A recent improvement of the algorithm is to ignore tokens that are unique
69121275Stjr * to one file or the other, as those are known from the start to be
70107849Salfred * impossible to match.
71107849Salfred */
72107849Salfred
7343412Snewtontypedef struct svn_diff__snake_t svn_diff__snake_t;
74107849Salfred
75111119Simpstruct svn_diff__snake_t
7643412Snewton{
7783366Sjulian    apr_off_t             y;
7843412Snewton    svn_diff__lcs_t      *lcs;
79107849Salfred    svn_diff__position_t *position[2];
8043412Snewton};
8143412Snewton
8243412Snewtonstatic APR_INLINE void
8343412Snewtonsvn_diff__snake(svn_diff__snake_t *fp_k,
84107849Salfred                svn_diff__token_index_t *token_counts[2],
8543412Snewton                svn_diff__lcs_t **freelist,
8643412Snewton                apr_pool_t *pool)
8743412Snewton{
8843412Snewton  svn_diff__position_t *start_position[2];
89107849Salfred  svn_diff__position_t *position[2];
9043412Snewton  svn_diff__lcs_t *lcs;
9143412Snewton  svn_diff__lcs_t *previous_lcs;
9243412Snewton
9343412Snewton  /* The previous entry at fp[k] is going to be replaced.  See if we
9443412Snewton   * can mark that lcs node for reuse, because the sequence up to this
9543412Snewton   * point was a dead end.
9643412Snewton   */
9743412Snewton  lcs = fp_k[0].lcs;
9843412Snewton  while (lcs)
9943412Snewton    {
10043412Snewton      lcs->refcount--;
10183366Sjulian      if (lcs->refcount)
10283366Sjulian        break;
10343412Snewton
10443412Snewton      previous_lcs = lcs->next;
10543412Snewton      lcs->next = *freelist;
10643412Snewton      *freelist = lcs;
10743412Snewton      lcs = previous_lcs;
10843412Snewton    }
10943412Snewton
11043412Snewton  if (fp_k[-1].y >= fp_k[1].y)
11143412Snewton    {
112107849Salfred      start_position[0] = fp_k[-1].position[0];
113107849Salfred      start_position[1] = fp_k[-1].position[1]->next;
114107849Salfred
11543412Snewton      previous_lcs = fp_k[-1].lcs;
11689319Salfred    }
11743412Snewton  else
11843412Snewton    {
11943412Snewton      start_position[0] = fp_k[1].position[0]->next;
12043412Snewton      start_position[1] = fp_k[1].position[1];
12143412Snewton
122109153Sdillon      previous_lcs = fp_k[1].lcs;
123107849Salfred    }
12497658Stanimura
125107849Salfred
12697658Stanimura  if (previous_lcs)
12797658Stanimura    {
12897658Stanimura      previous_lcs->refcount++;
12996972Stanimura    }
13096972Stanimura
13196972Stanimura  /* ### Optimization, skip all positions that don't have matchpoints
13243412Snewton   * ### anyway. Beware of the sentinel, don't skip it!
13343412Snewton   */
13483366Sjulian
13543412Snewton  position[0] = start_position[0];
13643412Snewton  position[1] = start_position[1];
137107849Salfred
13843412Snewton  while (1)
139114983Sjhb    {
140114983Sjhb      while (position[0]->token_index == position[1]->token_index)
141114983Sjhb        {
142114983Sjhb          position[0] = position[0]->next;
143114983Sjhb          position[1] = position[1]->next;
144114983Sjhb        }
145114983Sjhb
146112888Sjeff      if (position[1] != start_position[1])
147114983Sjhb        {
148114983Sjhb          lcs = *freelist;
149112888Sjeff          if (lcs)
150114983Sjhb            {
151114983Sjhb              *freelist = lcs->next;
152114983Sjhb            }
153114983Sjhb          else
15443412Snewton            {
15543412Snewton              lcs = apr_palloc(pool, sizeof(*lcs));
15643412Snewton            }
15743412Snewton
15843412Snewton          lcs->position[0] = start_position[0];
15943412Snewton          lcs->position[1] = start_position[1];
16043412Snewton          lcs->length = position[1]->offset - start_position[1]->offset;
16189306Salfred          lcs->next = previous_lcs;
16243412Snewton          lcs->refcount = 1;
16343412Snewton          previous_lcs = lcs;
16443412Snewton          start_position[0] = position[0];
16543412Snewton          start_position[1] = position[1];
16643412Snewton        }
16743412Snewton
16843412Snewton      /* Skip any and all tokens that only occur in one of the files */
16983366Sjulian      if (position[0]->token_index >= 0
17083366Sjulian          && token_counts[1][position[0]->token_index] == 0)
17143412Snewton        start_position[0] = position[0] = position[0]->next;
17243412Snewton      else if (position[1]->token_index >= 0
17343412Snewton               && token_counts[0][position[1]->token_index] == 0)
17443412Snewton        start_position[1] = position[1] = position[1]->next;
17543412Snewton      else
17643412Snewton        break;
177107849Salfred    }
178107849Salfred
179107849Salfred  fp_k[0].lcs = previous_lcs;
18043412Snewton  fp_k[0].position[0] = position[0];
18183366Sjulian  fp_k[0].position[1] = position[1];
18243412Snewton
18343412Snewton  fp_k[0].y = position[1]->offset;
184107849Salfred}
18543412Snewton
18643412Snewton
18743412Snewtonstatic svn_diff__lcs_t *
18843412Snewtonsvn_diff__lcs_reverse(svn_diff__lcs_t *lcs)
18943412Snewton{
19043412Snewton  svn_diff__lcs_t *next;
19183366Sjulian  svn_diff__lcs_t *prev;
19243412Snewton
19383366Sjulian  next = NULL;
19443412Snewton  while (lcs != NULL)
19543412Snewton    {
19643412Snewton      prev = lcs->next;
19743412Snewton      lcs->next = next;
19843412Snewton      next = lcs;
19943412Snewton      lcs = prev;
20043412Snewton    }
20183366Sjulian
20243412Snewton  return next;
20343412Snewton}
20443412Snewton
20543412Snewton
20643412Snewton/* Prepends a new lcs chunk for the amount of LINES at the given positions
207168355Srwatson * POS0_OFFSET and POS1_OFFSET to the given LCS chain, and returns it.
20843412Snewton * This function assumes LINES > 0. */
209168355Srwatsonstatic svn_diff__lcs_t *
21043412Snewtonprepend_lcs(svn_diff__lcs_t *lcs, apr_off_t lines,
21143412Snewton            apr_off_t pos0_offset, apr_off_t pos1_offset,
21243412Snewton            apr_pool_t *pool)
213168355Srwatson{
21443412Snewton  svn_diff__lcs_t *new_lcs;
215168355Srwatson
21643412Snewton  SVN_ERR_ASSERT_NO_RETURN(lines > 0);
21743412Snewton
21843412Snewton  new_lcs = apr_palloc(pool, sizeof(*new_lcs));
21943412Snewton  new_lcs->position[0] = apr_pcalloc(pool, sizeof(*new_lcs->position[0]));
22043412Snewton  new_lcs->position[0]->offset = pos0_offset;
22143412Snewton  new_lcs->position[1] = apr_pcalloc(pool, sizeof(*new_lcs->position[1]));
22243412Snewton  new_lcs->position[1]->offset = pos1_offset;
22343412Snewton  new_lcs->length = lines;
22443412Snewton  new_lcs->refcount = 1;
22543412Snewton  new_lcs->next = lcs;
22643412Snewton
22743412Snewton  return new_lcs;
22843412Snewton}
22943412Snewton
23043412Snewton
23143412Snewtonsvn_diff__lcs_t *
23243412Snewtonsvn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) */
23343412Snewton              svn_diff__position_t *position_list2, /* pointer to tail (ring) */
23443412Snewton              svn_diff__token_index_t *token_counts_list1, /* array of counts */
23543412Snewton              svn_diff__token_index_t *token_counts_list2, /* array of counts */
23643412Snewton              svn_diff__token_index_t num_tokens,
237102003Srwatson              apr_off_t prefix_lines,
23843412Snewton              apr_off_t suffix_lines,
23943412Snewton              apr_pool_t *pool)
24043412Snewton{
24143412Snewton  apr_off_t length[2];
24243412Snewton  svn_diff__token_index_t *token_counts[2];
24343412Snewton  svn_diff__token_index_t unique_count[2];
24443412Snewton  svn_diff__token_index_t token_index;
24543412Snewton  svn_diff__snake_t *fp;
24643412Snewton  apr_off_t d;
24743412Snewton  apr_off_t k;
24843412Snewton  apr_off_t p = 0;
249  svn_diff__lcs_t *lcs, *lcs_freelist = NULL;
250
251  svn_diff__position_t sentinel_position[2];
252
253  /* Since EOF is always a sync point we tack on an EOF link
254   * with sentinel positions
255   */
256  lcs = apr_palloc(pool, sizeof(*lcs));
257  lcs->position[0] = apr_pcalloc(pool, sizeof(*lcs->position[0]));
258  lcs->position[0]->offset = position_list1
259                             ? position_list1->offset + suffix_lines + 1
260                             : prefix_lines + suffix_lines + 1;
261  lcs->position[1] = apr_pcalloc(pool, sizeof(*lcs->position[1]));
262  lcs->position[1]->offset = position_list2
263                             ? position_list2->offset + suffix_lines + 1
264                             : prefix_lines + suffix_lines + 1;
265  lcs->length = 0;
266  lcs->refcount = 1;
267  lcs->next = NULL;
268
269  if (position_list1 == NULL || position_list2 == NULL)
270    {
271      if (suffix_lines)
272        lcs = prepend_lcs(lcs, suffix_lines,
273                          lcs->position[0]->offset - suffix_lines,
274                          lcs->position[1]->offset - suffix_lines,
275                          pool);
276      if (prefix_lines)
277        lcs = prepend_lcs(lcs, prefix_lines, 1, 1, pool);
278
279      return lcs;
280    }
281
282  unique_count[1] = unique_count[0] = 0;
283  for (token_index = 0; token_index < num_tokens; token_index++)
284    {
285      if (token_counts_list1[token_index] == 0)
286        unique_count[1] += token_counts_list2[token_index];
287      if (token_counts_list2[token_index] == 0)
288        unique_count[0] += token_counts_list1[token_index];
289    }
290
291  /* Calculate lengths M and N of the sequences to be compared. Do not
292   * count tokens unique to one file, as those are ignored in __snake.
293   */
294  length[0] = position_list1->offset - position_list1->next->offset + 1
295              - unique_count[0];
296  length[1] = position_list2->offset - position_list2->next->offset + 1
297              - unique_count[1];
298
299  /* strikerXXX: here we allocate the furthest point array, which is
300   * strikerXXX: sized M + N + 3 (!)
301   */
302  fp = apr_pcalloc(pool,
303                   sizeof(*fp) * (apr_size_t)(length[0] + length[1] + 3));
304
305  /* The origo of fp corresponds to the end state, where we are
306   * at the end of both files. The valid states thus span from
307   * -N (at end of first file and at the beginning of the second
308   * file) to +M (the opposite :). Finally, svn_diff__snake needs
309   * 1 extra slot on each side to work.
310   */
311  fp += length[1] + 1;
312
313  sentinel_position[0].next = position_list1->next;
314  position_list1->next = &sentinel_position[0];
315  sentinel_position[0].offset = position_list1->offset + 1;
316  token_counts[0] = token_counts_list1;
317
318  sentinel_position[1].next = position_list2->next;
319  position_list2->next = &sentinel_position[1];
320  sentinel_position[1].offset = position_list2->offset + 1;
321  token_counts[1] = token_counts_list2;
322
323  /* Negative indices will not be used elsewhere
324   */
325  sentinel_position[0].token_index = -1;
326  sentinel_position[1].token_index = -2;
327
328  /* position d = M - N corresponds to the initial state, where
329   * we are at the beginning of both files.
330   */
331  d = length[0] - length[1];
332
333  /* k = d - 1 will be the first to be used to get previous
334   * position information from, make sure it holds sane
335   * data
336   */
337  fp[d - 1].position[0] = sentinel_position[0].next;
338  fp[d - 1].position[1] = &sentinel_position[1];
339
340  p = 0;
341  do
342    {
343      /* For k < 0, insertions are free */
344      for (k = (d < 0 ? d : 0) - p; k < 0; k++)
345        {
346          svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool);
347        }
348	  /* for k > 0, deletions are free */
349      for (k = (d > 0 ? d : 0) + p; k >= 0; k--)
350        {
351          svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool);
352        }
353
354      p++;
355    }
356  while (fp[0].position[1] != &sentinel_position[1]);
357
358  if (suffix_lines)
359    lcs->next = prepend_lcs(fp[0].lcs, suffix_lines,
360                            lcs->position[0]->offset - suffix_lines,
361                            lcs->position[1]->offset - suffix_lines,
362                            pool);
363  else
364    lcs->next = fp[0].lcs;
365
366  lcs = svn_diff__lcs_reverse(lcs);
367
368  position_list1->next = sentinel_position[0].next;
369  position_list2->next = sentinel_position[1].next;
370
371  if (prefix_lines)
372    return prepend_lcs(lcs, prefix_lines, 1, 1, pool);
373  else
374    return lcs;
375}
376