1 /* Analyze differences between two vectors.
3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006, 2007 Free
4 Software Foundation, Inc.
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
21 /* The basic idea is to consider two vectors as similar if, when
22 transforming the first vector into the second vector through a
23 sequence of edits (inserts and deletes of one element each),
24 this sequence is short - or equivalently, if the ordered list
25 of elements that are untouched by these edits is long. For a
26 good introduction to the subject, read about the "Levenshtein
27 distance" in Wikipedia.
29 The basic algorithm is described in:
30 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
31 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
32 see especially section 4.2, which describes the variation used below.
34 The basic algorithm was independently discovered as described in:
35 "Algorithms for Approximate String Matching", E. Ukkonen,
36 Information and Control Vol. 64, 1985, pp. 100-118.
38 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
39 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
40 at the price of producing suboptimal output for large inputs with
43 /* Before including this file, you need to define:
44 ELEMENT The element type of the vectors being compared.
45 EQUAL A two-argument macro that tests two elements for equality.
47 OFFSET A signed integer type sufficient to hold the
48 difference between two indices. Usually
49 something like ssize_t.
50 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'.
51 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
52 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
53 USE_HEURISTIC (Optional) Define if you want to support the
54 heuristic for large vectors. */
56 /* Maximum value of type OFFSET. */
58 ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
60 /* Use this to suppress gcc's "...may be used uninitialized" warnings. */
63 # define IF_LINT(Code) Code
65 # define IF_LINT(Code) /* empty */
70 * Context of comparison operation.
74 /* Vectors being compared. */
81 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
82 furthest along the given diagonal in the forward search of the edit
86 /* Vector, indexed by diagonal, containing the X coordinate of the point
87 furthest along the given diagonal in the backward search of the edit
92 /* This corresponds to the diff -H flag. With this heuristic, for
93 vectors with a constant small density of changes, the algorithm is
94 linear in the vector size. */
98 /* Edit scripts longer than this are too expensive to compute. */
101 /* Snakes bigger than this are considered `big'. */
102 #define SNAKE_LIMIT 20
107 /* Midpoints of this partition. */
111 /* True if low half will be analyzed minimally. */
114 /* Likewise for high half. */
119 /* Find the midpoint of the shortest edit script for a specified portion of the two vectors.
122 Scan from the beginnings of the vectors, and simultaneously from the ends,
123 doing a breadth-first search through the space of edit sequences.
124 When the two searches meet, we have found the midpoint of the shortest edit script.
127 If FIND_MINIMAL is true, find the minimal edit script regardless of
128 expense. Otherwise, if the search is too expensive, use heuristics to
129 stop the search and report a suboptimal answer.
131 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
132 XMID - YMID equals XOFF - YOFF plus the number of deleted elements minus
133 the number of inserted elements (counting only elements before the midpoint).
135 Set PART->lo_minimal to true iff the minimal edit script for the
136 left half of the partition is known; similarly for PART->hi_minimal.
138 This function assumes that the first elements of the specified portions
139 of the two vectors do not match, and likewise that the last elements do not
140 match. The caller must trim matching elements from the beginning and end
141 of the portions it is going to specify.
143 If we return the "wrong" partitions, the worst this can do is cause
144 suboptimal diff output. It cannot cause incorrect diff output. */
147 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
148 struct partition *part, struct context *ctxt)
150 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
151 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */
152 const ELEMENT *const xv = ctxt->xvec; /* Still more help for the compiler. */
153 const ELEMENT *const yv = ctxt->yvec; /* And more and more . . . */
154 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */
155 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */
156 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
157 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
159 OFFSET fmax = fmid; /* Limits of top-down search. */
161 OFFSET bmax = bmid; /* Limits of bottom-up search. */
162 OFFSET c; /* Cost. */
163 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
164 diagonal with respect to the northwest. */
171 OFFSET d; /* Active diagonal. */
172 bool big_snake = false;
174 /* Extend the top-down search by an edit step in each diagonal. */
183 for (d = fmax; d >= fmin; d -= 2)
187 OFFSET tlo = fd[d - 1];
188 OFFSET thi = fd[d + 1];
189 OFFSET x0 = tlo < thi ? thi : tlo + 1;
191 for (x = x0, y = x0 - d;
192 x < xlim && y < ylim && EQUAL (xv[x], yv[y]);
195 if (x - x0 > SNAKE_LIMIT)
198 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
202 part->lo_minimal = part->hi_minimal = true;
207 /* Similarly extend the bottom-up search. */
209 bd[--bmin - 1] = OFFSET_MAX;
213 bd[++bmax + 1] = OFFSET_MAX;
216 for (d = bmax; d >= bmin; d -= 2)
220 OFFSET tlo = bd[d - 1];
221 OFFSET thi = bd[d + 1];
222 OFFSET x0 = tlo < thi ? tlo : thi - 1;
224 for (x = x0, y = x0 - d;
225 xoff < x && yoff < y && EQUAL (xv[x - 1], yv[y - 1]);
228 if (x0 - x > SNAKE_LIMIT)
231 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
235 part->lo_minimal = part->hi_minimal = true;
244 /* Heuristic: check occasionally for a diagonal that has made lots
245 of progress compared with the edit distance. If we have any
246 such, find the one that has made the most progress and return it
247 as if it had succeeded.
249 With this heuristic, for vectors with a constant small density
250 of changes, the algorithm is linear in the vector size. */
252 if (200 < c && big_snake && ctxt->heuristic)
256 for (d = fmax; d >= fmin; d -= 2)
258 OFFSET dd = d - fmid;
261 OFFSET v = (x - xoff) * 2 - dd;
263 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
266 && xoff + SNAKE_LIMIT <= x && x < xlim
267 && yoff + SNAKE_LIMIT <= y && y < ylim)
269 /* We have a good enough best diagonal; now insist
270 that it end with a significant snake. */
273 for (k = 1; EQUAL (xv[x - k], yv[y - k]); k++)
274 if (k == SNAKE_LIMIT)
286 part->lo_minimal = true;
287 part->hi_minimal = false;
292 for (d = bmax; d >= bmin; d -= 2)
294 OFFSET dd = d - bmid;
297 OFFSET v = (xlim - x) * 2 + dd;
299 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
302 && xoff < x && x <= xlim - SNAKE_LIMIT
303 && yoff < y && y <= ylim - SNAKE_LIMIT)
305 /* We have a good enough best diagonal; now insist
306 that it end with a significant snake. */
309 for (k = 0; EQUAL (xv[x + k], yv[y + k]); k++)
310 if (k == SNAKE_LIMIT - 1)
322 part->lo_minimal = false;
323 part->hi_minimal = true;
327 #endif /* USE_HEURISTIC */
329 /* Heuristic: if we've gone well beyond the call of duty, give up
330 and report halfway between our best results so far. */
331 if (c >= ctxt->too_expensive)
334 OFFSET fxbest IF_LINT (= 0);
336 OFFSET bxbest IF_LINT (= 0);
338 /* Find forward diagonal that maximizes X + Y. */
340 for (d = fmax; d >= fmin; d -= 2)
342 OFFSET x = MIN (fd[d], xlim);
356 /* Find backward diagonal that minimizes X + Y. */
357 bxybest = OFFSET_MAX;
358 for (d = bmax; d >= bmin; d -= 2)
360 OFFSET x = MAX (xoff, bd[d]);
374 /* Use the better of the two diagonals. */
375 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
378 part->ymid = fxybest - fxbest;
379 part->lo_minimal = true;
380 part->hi_minimal = false;
385 part->ymid = bxybest - bxbest;
386 part->lo_minimal = false;
387 part->hi_minimal = true;
395 /* Compare in detail contiguous subsequences of the two vectors
396 which are known, as a whole, to match each other.
398 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
400 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors
403 If FIND_MINIMAL, find a minimal difference no matter how expensive it is.
406 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT. */
409 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
410 bool find_minimal, struct context *ctxt)
412 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
413 ELEMENT const *yv = ctxt->yvec;
415 /* Slide down the bottom initial diagonal. */
416 while (xoff < xlim && yoff < ylim && EQUAL (xv[xoff], yv[yoff]))
422 /* Slide up the top initial diagonal. */
423 while (xoff < xlim && yoff < ylim && EQUAL (xv[xlim - 1], yv[ylim - 1]))
429 /* Handle simple cases. */
433 NOTE_INSERT (ctxt, yoff);
436 else if (yoff == ylim)
439 NOTE_DELETE (ctxt, xoff);
444 struct partition part;
446 /* Find a point of correspondence in the middle of the vectors. */
447 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
449 /* Use the partitions to split this problem into subproblems. */
450 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt);
451 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt);
458 #undef EXTRA_CONTEXT_FIELDS