subversion/libsvn_diff/lcs.c

/*
 * lcs.c :  routines for creating an lcs
 *
 * ====================================================================
 *    Licensed to the Apache Software Foundation (ASF) under one
 *    or more contributor license agreements.  See the NOTICE file
 *    distributed with this work for additional information
 *    regarding copyright ownership.  The ASF licenses this file
 *    to you under the Apache License, Version 2.0 (the
 *    "License"); you may not use this file except in compliance
 *    with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing,
 *    software distributed under the License is distributed on an
 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *    KIND, either express or implied.  See the License for the
 *    specific language governing permissions and limitations
 *    under the License.
 * ====================================================================
 */
43412Snewton
43412Snewton
43412Snewton#include <apr.h>
43412Snewton#include <apr_pools.h>
43412Snewton#include <apr_general.h>
43412Snewton
116174Sobrien#include "diff.h"
116174Sobrien
116174Sobrien
/*
 * Calculate the Longest Common Subsequence (LCS) between two datasources.
 * This function is what makes the diff code tick.
 *
 * The LCS algorithm implemented here is based on the approach described
 * by Sun Wu, Udi Manber and Gene Myers in "An O(NP) Sequence Comparison
 * Algorithm", but has been modified for better performance.
 *
 * Let M and N be the lengths (number of tokens) of the two sources
 * ('files'). The goal is to reach the end of both sources (files) with the
 * minimum number of insertions + deletions. Since there is a known length
 * difference N-M between the files, that is equivalent to just the minimum
 * number of deletions, or equivalently the minimum number of insertions.
 * For symmetry, we use the lesser number - deletions if M<N, insertions if
 * M>N.
 *
 * Let 'k' be the difference in remaining length between the files, i.e.
 * if we're at the beginning of both files, k=M-N, whereas k=0 for the
 * 'end state', at the end of both files. An insertion will increase k by
 * one, while a deletion decreases k by one. If k<0, then insertions are
 * 'free' - we need those to reach the end state k=0 anyway - but deletions
 * are costly: Adding a deletion means that we will have to add an additional
 * insertion later to reach the end state, so it doesn't matter if we count
 * deletions or insertions. Similarly, deletions are free for k>0.
 *
 * Let a 'state' be a given position in each file {pos1, pos2}. An array
 * 'fp' keeps track of the best possible state (largest values of
 * {pos1, pos2}) that can be achieved for a given cost 'p' (# moves away
 * from k=0), as well as a linked list of what matches were used to reach
 * that state. For each new value of p, we find for each value of k the
 * best achievable state for that k - either by doing a costly operation
 * (deletion if k<0) from a state achieved at a lower p, or doing a free
 * operation (insertion if k<0) from a state achieved at the same p -
 * and in both cases advancing past any matching regions found. This is
 * handled by running loops over k in order of descending absolute value.
 *
 * A recent improvement of the algorithm is to ignore tokens that are unique
 * to one file or the other, as those are known from the start to be
 * impossible to match.
 */
107849Salfred
43412Snewtontypedef struct svn_diff__snake_t svn_diff__snake_t;
107849Salfred
111119Simpstruct svn_diff__snake_t
43412Snewton{
83366Sjulian    apr_off_t             y;
43412Snewton    svn_diff__lcs_t      *lcs;
107849Salfred    svn_diff__position_t *position[2];
43412Snewton};
43412Snewton
43412Snewtonstatic APR_INLINE void
43412Snewtonsvn_diff__snake(svn_diff__snake_t *fp_k,
107849Salfred                svn_diff__token_index_t *token_counts[2],
43412Snewton                svn_diff__lcs_t **freelist,
43412Snewton                apr_pool_t *pool)
43412Snewton{
43412Snewton  svn_diff__position_t *start_position[2];
107849Salfred  svn_diff__position_t *position[2];
43412Snewton  svn_diff__lcs_t *lcs;
43412Snewton  svn_diff__lcs_t *previous_lcs;
43412Snewton
43412Snewton  /* The previous entry at fp[k] is going to be replaced.  See if we
43412Snewton   * can mark that lcs node for reuse, because the sequence up to this
43412Snewton   * point was a dead end.
43412Snewton   */
43412Snewton  lcs = fp_k[0].lcs;
43412Snewton  while (lcs)
43412Snewton    {
43412Snewton      lcs->refcount--;
83366Sjulian      if (lcs->refcount)
83366Sjulian        break;
43412Snewton
43412Snewton      previous_lcs = lcs->next;
43412Snewton      lcs->next = *freelist;
43412Snewton      *freelist = lcs;
43412Snewton      lcs = previous_lcs;
43412Snewton    }
43412Snewton
43412Snewton  if (fp_k[-1].y >= fp_k[1].y)
43412Snewton    {
107849Salfred      start_position[0] = fp_k[-1].position[0];
107849Salfred      start_position[1] = fp_k[-1].position[1]->next;
107849Salfred
43412Snewton      previous_lcs = fp_k[-1].lcs;
89319Salfred    }
43412Snewton  else
43412Snewton    {
43412Snewton      start_position[0] = fp_k[1].position[0]->next;
43412Snewton      start_position[1] = fp_k[1].position[1];
43412Snewton
109153Sdillon      previous_lcs = fp_k[1].lcs;
107849Salfred    }
97658Stanimura
107849Salfred
97658Stanimura  if (previous_lcs)
97658Stanimura    {
97658Stanimura      previous_lcs->refcount++;
96972Stanimura    }
96972Stanimura
96972Stanimura  /* ### Optimization, skip all positions that don't have matchpoints
43412Snewton   * ### anyway. Beware of the sentinel, don't skip it!
43412Snewton   */
83366Sjulian
43412Snewton  position[0] = start_position[0];
43412Snewton  position[1] = start_position[1];
107849Salfred
43412Snewton  while (1)
114983Sjhb    {
114983Sjhb      while (position[0]->token_index == position[1]->token_index)
114983Sjhb        {
114983Sjhb          position[0] = position[0]->next;
114983Sjhb          position[1] = position[1]->next;
114983Sjhb        }
114983Sjhb
112888Sjeff      if (position[1] != start_position[1])
114983Sjhb        {
114983Sjhb          lcs = *freelist;
112888Sjeff          if (lcs)
114983Sjhb            {
114983Sjhb              *freelist = lcs->next;
114983Sjhb            }
114983Sjhb          else
43412Snewton            {
43412Snewton              lcs = apr_palloc(pool, sizeof(*lcs));
43412Snewton            }
43412Snewton
43412Snewton          lcs->position[0] = start_position[0];
43412Snewton          lcs->position[1] = start_position[1];
43412Snewton          lcs->length = position[1]->offset - start_position[1]->offset;
89306Salfred          lcs->next = previous_lcs;
43412Snewton          lcs->refcount = 1;
43412Snewton          previous_lcs = lcs;
43412Snewton          start_position[0] = position[0];
43412Snewton          start_position[1] = position[1];
43412Snewton        }
43412Snewton
43412Snewton      /* Skip any and all tokens that only occur in one of the files */
83366Sjulian      if (position[0]->token_index >= 0
83366Sjulian          && token_counts[1][position[0]->token_index] == 0)
43412Snewton        start_position[0] = position[0] = position[0]->next;
43412Snewton      else if (position[1]->token_index >= 0
43412Snewton               && token_counts[0][position[1]->token_index] == 0)
43412Snewton        start_position[1] = position[1] = position[1]->next;
43412Snewton      else
43412Snewton        break;
107849Salfred    }
107849Salfred
107849Salfred  fp_k[0].lcs = previous_lcs;
43412Snewton  fp_k[0].position[0] = position[0];
83366Sjulian  fp_k[0].position[1] = position[1];
43412Snewton
43412Snewton  fp_k[0].y = position[1]->offset;
107849Salfred}
43412Snewton
43412Snewton
43412Snewtonstatic svn_diff__lcs_t *
43412Snewtonsvn_diff__lcs_reverse(svn_diff__lcs_t *lcs)
43412Snewton{
43412Snewton  svn_diff__lcs_t *next;
83366Sjulian  svn_diff__lcs_t *prev;
43412Snewton
83366Sjulian  next = NULL;
43412Snewton  while (lcs != NULL)
43412Snewton    {
43412Snewton      prev = lcs->next;
43412Snewton      lcs->next = next;
43412Snewton      next = lcs;
43412Snewton      lcs = prev;
43412Snewton    }
83366Sjulian
43412Snewton  return next;
43412Snewton}
43412Snewton
43412Snewton
43412Snewton/* Prepends a new lcs chunk for the amount of LINES at the given positions
168355Srwatson * POS0_OFFSET and POS1_OFFSET to the given LCS chain, and returns it.
43412Snewton * This function assumes LINES > 0. */
168355Srwatsonstatic svn_diff__lcs_t *
43412Snewtonprepend_lcs(svn_diff__lcs_t *lcs, apr_off_t lines,
43412Snewton            apr_off_t pos0_offset, apr_off_t pos1_offset,
43412Snewton            apr_pool_t *pool)
168355Srwatson{
43412Snewton  svn_diff__lcs_t *new_lcs;
168355Srwatson
43412Snewton  SVN_ERR_ASSERT_NO_RETURN(lines > 0);
43412Snewton
43412Snewton  new_lcs = apr_palloc(pool, sizeof(*new_lcs));
43412Snewton  new_lcs->position[0] = apr_pcalloc(pool, sizeof(*new_lcs->position[0]));
43412Snewton  new_lcs->position[0]->offset = pos0_offset;
43412Snewton  new_lcs->position[1] = apr_pcalloc(pool, sizeof(*new_lcs->position[1]));
43412Snewton  new_lcs->position[1]->offset = pos1_offset;
43412Snewton  new_lcs->length = lines;
43412Snewton  new_lcs->refcount = 1;
43412Snewton  new_lcs->next = lcs;
43412Snewton
43412Snewton  return new_lcs;
43412Snewton}
43412Snewton
43412Snewton
43412Snewtonsvn_diff__lcs_t *
43412Snewtonsvn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) */
43412Snewton              svn_diff__position_t *position_list2, /* pointer to tail (ring) */
43412Snewton              svn_diff__token_index_t *token_counts_list1, /* array of counts */
43412Snewton              svn_diff__token_index_t *token_counts_list2, /* array of counts */
43412Snewton              svn_diff__token_index_t num_tokens,
102003Srwatson              apr_off_t prefix_lines,
43412Snewton              apr_off_t suffix_lines,
43412Snewton              apr_pool_t *pool)
43412Snewton{
43412Snewton  apr_off_t length[2];
43412Snewton  svn_diff__token_index_t *token_counts[2];
43412Snewton  svn_diff__token_index_t unique_count[2];
43412Snewton  svn_diff__token_index_t token_index;
43412Snewton  svn_diff__snake_t *fp;
43412Snewton  apr_off_t d;
43412Snewton  apr_off_t k;
43412Snewton  apr_off_t p = 0;
  svn_diff__lcs_t *lcs, *lcs_freelist = NULL;

  svn_diff__position_t sentinel_position[2];

  /* Since EOF is always a sync point we tack on an EOF link
   * with sentinel positions
   */
  lcs = apr_palloc(pool, sizeof(*lcs));
  lcs->position[0] = apr_pcalloc(pool, sizeof(*lcs->position[0]));
  lcs->position[0]->offset = position_list1
                             ? position_list1->offset + suffix_lines + 1
                             : prefix_lines + suffix_lines + 1;
  lcs->position[1] = apr_pcalloc(pool, sizeof(*lcs->position[1]));
  lcs->position[1]->offset = position_list2
                             ? position_list2->offset + suffix_lines + 1
                             : prefix_lines + suffix_lines + 1;
  lcs->length = 0;
  lcs->refcount = 1;
  lcs->next = NULL;

  if (position_list1 == NULL || position_list2 == NULL)
    {
      if (suffix_lines)
        lcs = prepend_lcs(lcs, suffix_lines,
                          lcs->position[0]->offset - suffix_lines,
                          lcs->position[1]->offset - suffix_lines,
                          pool);
      if (prefix_lines)
        lcs = prepend_lcs(lcs, prefix_lines, 1, 1, pool);

      return lcs;
    }

  unique_count[1] = unique_count[0] = 0;
  for (token_index = 0; token_index < num_tokens; token_index++)
    {
      if (token_counts_list1[token_index] == 0)
        unique_count[1] += token_counts_list2[token_index];
      if (token_counts_list2[token_index] == 0)
        unique_count[0] += token_counts_list1[token_index];
    }

  /* Calculate lengths M and N of the sequences to be compared. Do not
   * count tokens unique to one file, as those are ignored in __snake.
   */
  length[0] = position_list1->offset - position_list1->next->offset + 1
              - unique_count[0];
  length[1] = position_list2->offset - position_list2->next->offset + 1
              - unique_count[1];

  /* strikerXXX: here we allocate the furthest point array, which is
   * strikerXXX: sized M + N + 3 (!)
   */
  fp = apr_pcalloc(pool,
                   sizeof(*fp) * (apr_size_t)(length[0] + length[1] + 3));

  /* The origo of fp corresponds to the end state, where we are
   * at the end of both files. The valid states thus span from
   * -N (at end of first file and at the beginning of the second
   * file) to +M (the opposite :). Finally, svn_diff__snake needs
   * 1 extra slot on each side to work.
   */
  fp += length[1] + 1;

  sentinel_position[0].next = position_list1->next;
  position_list1->next = &sentinel_position[0];
  sentinel_position[0].offset = position_list1->offset + 1;
  token_counts[0] = token_counts_list1;

  sentinel_position[1].next = position_list2->next;
  position_list2->next = &sentinel_position[1];
  sentinel_position[1].offset = position_list2->offset + 1;
  token_counts[1] = token_counts_list2;

  /* Negative indices will not be used elsewhere
   */
  sentinel_position[0].token_index = -1;
  sentinel_position[1].token_index = -2;

  /* position d = M - N corresponds to the initial state, where
   * we are at the beginning of both files.
   */
  d = length[0] - length[1];

  /* k = d - 1 will be the first to be used to get previous
   * position information from, make sure it holds sane
   * data
   */
  fp[d - 1].position[0] = sentinel_position[0].next;
  fp[d - 1].position[1] = &sentinel_position[1];

  p = 0;
  do
    {
      /* For k < 0, insertions are free */
      for (k = (d < 0 ? d : 0) - p; k < 0; k++)
        {
          svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool);
        }
	  /* for k > 0, deletions are free */
      for (k = (d > 0 ? d : 0) + p; k >= 0; k--)
        {
          svn_diff__snake(fp + k, token_counts, &lcs_freelist, pool);
        }

      p++;
    }
  while (fp[0].position[1] != &sentinel_position[1]);

  if (suffix_lines)
    lcs->next = prepend_lcs(fp[0].lcs, suffix_lines,
                            lcs->position[0]->offset - suffix_lines,
                            lcs->position[1]->offset - suffix_lines,
                            pool);
  else
    lcs->next = fp[0].lcs;

  lcs = svn_diff__lcs_reverse(lcs);

  position_list1->next = sentinel_position[0].next;
  position_list2->next = sentinel_position[1].next;

  if (prefix_lines)
    return prepend_lcs(lcs, prefix_lines, 1, 1, pool);
  else
    return lcs;
}