1 /* Analyze differences between two vectors.
3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2010 Free Software
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 /* The basic idea is to consider two vectors as similar if, when
21 transforming the first vector into the second vector through a
22 sequence of edits (inserts and deletes of one element each),
23 this sequence is short - or equivalently, if the ordered list
24 of elements that are untouched by these edits is long. For a
25 good introduction to the subject, read about the "Levenshtein
26 distance" in Wikipedia.
28 The basic algorithm is described in:
29 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
30 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
31 see especially section 4.2, which describes the variation used below.
33 The basic algorithm was independently discovered as described in:
34 "Algorithms for Approximate String Matching", E. Ukkonen,
35 Information and Control Vol. 64, 1985, pp. 100-118.
37 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
38 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
39 at the price of producing suboptimal output for large inputs with many differences. */
42 /* Before including this file, you need to define:
43 ELEMENT The element type of the vectors being compared.
44 EQUAL A two-argument macro that tests two elements for equality.
46 OFFSET A signed integer type sufficient to hold the
47 difference between two indices. Usually
48 something like ssize_t.
49 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'.
50 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
51 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
52 EARLY_ABORT(ctxt) (Optional) A boolean expression that triggers an
53 early abort of the computation.
54 USE_HEURISTIC (Optional) Define if you want to support the
55 heuristic for large vectors.
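56 Before including this file, you also need to include <limits.h> (for CHAR_BIT), <stdbool.h> (for bool), and a header that defines MIN and MAX (for example gnulib's "minmax.h"). */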
62 /* Maximum value of type OFFSET. The expression below shifts 1 left by
two bits fewer than the width of OFFSET, subtracts 1, then doubles the
result and adds 1. Building the value this way sets every value bit
without any intermediate result exceeding the range of a signed OFFSET
(assuming OFFSET has no padding bits). */
64 ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
66 /* Default to no early abort: EARLY_ABORT (ctxt) expands to false. */
68 # define EARLY_ABORT(ctxt) false
71 /* Use this to suppress gcc's `...may be used before initialized' warnings.
72 Beware: The Code argument must not contain commas.
Two alternative definitions follow: the first passes Code through
unchanged, the second discards it. */
75 # define IF_LINT(Code) Code
77 # define IF_LINT(Code) /* empty */
81 /* As above, but when Code must contain one comma. The two halves are
passed as separate macro arguments and rejoined with a comma by the
first definition; the second definition discards both. */
84 # define IF_LINT2(Code1, Code2) Code1, Code2
86 # define IF_LINT2(Code1, Code2) /* empty */
91 * Context of comparison operation.
95 /* Vectors being compared. */
102 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
103 furthest along the given diagonal in the forward search of the edit matrix. */
107 /* Vector, indexed by diagonal, containing the X coordinate of the point
108 furthest along the given diagonal in the backward search of the edit matrix. */
113 /* This corresponds to the diff -H flag. With this heuristic, for
114 vectors with a constant small density of changes, the algorithm is
115 linear in the vector size. */
119 /* Edit scripts longer than this are too expensive to compute. */
120 OFFSET too_expensive;
122 /* Snakes bigger than this are considered `big'. */
123 #define SNAKE_LIMIT 20
128 /* Midpoints of this partition. */
132 /* True if low half will be analyzed minimally. */
135 /* Likewise for high half. */
140 /* Find the midpoint of the shortest edit script for a specified portion of the two vectors.
143 Scan from the beginnings of the vectors, and simultaneously from the ends,
144 doing a breadth-first search through the space of edit sequences.
145 When the two searches meet, we have found the midpoint of the shortest edit sequence.
148 If FIND_MINIMAL is true, find the minimal edit script regardless of
149 expense. Otherwise, if the search is too expensive, use heuristics to
150 stop the search and report a suboptimal answer.
152 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
153 XMID - YMID, taken relative to the starting diagonal XOFF - YOFF, equals the
154 number of deleted elements minus the number of inserted elements (counting only elements before the midpoint).
156 Set PART->lo_minimal to true iff the minimal edit script for the
157 left half of the partition is known; similarly for PART->hi_minimal.
159 This function assumes that the first elements of the specified portions
160 of the two vectors do not match, and likewise that the last elements do not
161 match. The caller must trim matching elements from the beginning and end
162 of the portions it is going to specify.
164 If we return the "wrong" partitions, the worst this can do is cause
165 suboptimal diff output. It cannot cause incorrect diff output. */
168 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
169 struct partition *part, struct context *ctxt)
171 OFFSET *const fd = ctxt->fdiag; /* Forward-search diagonal vector, cached in a local. */
172 OFFSET *const bd = ctxt->bdiag; /* Backward-search diagonal vector, cached in a local. */
173 ELEMENT const *const xv = ctxt->xvec; /* First vector being compared. */
174 ELEMENT const *const yv = ctxt->yvec; /* Second vector being compared. */
175 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal (X - Y at X = XOFF, Y = YLIM). */
176 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal (X - Y at X = XLIM, Y = YOFF). */
177 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
178 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
180 OFFSET fmax = fmid; /* Limits of top-down search. */
182 OFFSET bmax = bmid; /* Limits of bottom-up search. */
183 OFFSET c; /* Cost so far; tested against the heuristic thresholds below. */
184 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
185 diagonal with respect to the northwest. */
192 OFFSET d; /* Active diagonal. */
193 bool big_snake = false; /* Consulted by the progress heuristic below. */
195 /* Extend the top-down search by an edit step in each diagonal. */
204 for (d = fmax; d >= fmin; d -= 2)
208 OFFSET tlo = fd[d - 1];
209 OFFSET thi = fd[d + 1];
210 OFFSET x0 = tlo < thi ? thi : tlo + 1; /* Equivalent to max (tlo + 1, thi): coming from diagonal D - 1 advances X by one, coming from D + 1 keeps X. */
212 for (x = x0, y = x0 - d;
213 x < xlim && y < ylim && EQUAL (xv[x], yv[y]);
216 if (x - x0 > SNAKE_LIMIT) /* The run of matches just followed exceeds SNAKE_LIMIT. */
219 if (odd && bmin <= d && d <= bmax && bd[d] <= x) /* Forward path has reached the backward path on diagonal D. */
223 part->lo_minimal = part->hi_minimal = true;
228 /* Similarly extend the bottom-up search. */
230 bd[--bmin - 1] = OFFSET_MAX; /* Widen the range downward; OFFSET_MAX in the slot just outside it is never chosen by the tlo/thi selection below. */
234 bd[++bmax + 1] = OFFSET_MAX; /* Likewise for the upper edge of the range. */
237 for (d = bmax; d >= bmin; d -= 2)
241 OFFSET tlo = bd[d - 1];
242 OFFSET thi = bd[d + 1];
243 OFFSET x0 = tlo < thi ? tlo : thi - 1; /* Equivalent to min (tlo, thi - 1), the mirror image of the forward case. */
245 for (x = x0, y = x0 - d;
246 xoff < x && yoff < y && EQUAL (xv[x - 1], yv[y - 1]);
249 if (x0 - x > SNAKE_LIMIT) /* The run of matches just followed exceeds SNAKE_LIMIT. */
252 if (!odd && fmin <= d && d <= fmax && x <= fd[d]) /* Backward path has reached the forward path on diagonal D. */
256 part->lo_minimal = part->hi_minimal = true;
265 /* Heuristic: check occasionally for a diagonal that has made lots
266 of progress compared with the edit distance. If we have any
267 such, find the one that has made the most progress and return it
268 as if it had succeeded.
270 With this heuristic, for vectors with a constant small density
271 of changes, the algorithm is linear in the vector size. */
/* Tried only once the cost exceeds 200, a big snake has been seen,
and the caller has enabled ctxt->heuristic. */
273 if (200 < c && big_snake && ctxt->heuristic)
278 for (d = fmax; d >= fmin; d -= 2)
280 OFFSET dd = d - fmid; /* Signed distance of D from the forward center diagonal. */
283 OFFSET v = (x - xoff) * 2 - dd; /* With Y = X - D this equals (X - XOFF) + (Y - YOFF), the forward progress. */
285 if (v > 12 * (c + (dd < 0 ? -dd : dd))) /* Progress must exceed 12 times (cost + |dd|). */
288 && xoff + SNAKE_LIMIT <= x && x < xlim
289 && yoff + SNAKE_LIMIT <= y && y < ylim)
291 /* We have a good enough best diagonal; now insist
292 that it end with a significant snake. */
295 for (k = 1; EQUAL (xv[x - k], yv[y - k]); k++) /* Walk backward from (X, Y) while elements match. */
296 if (k == SNAKE_LIMIT)
308 part->lo_minimal = true;
309 part->hi_minimal = false;
317 for (d = bmax; d >= bmin; d -= 2)
319 OFFSET dd = d - bmid; /* Signed distance of D from the backward center diagonal. */
322 OFFSET v = (xlim - x) * 2 + dd; /* With Y = X - D this equals (XLIM - X) + (YLIM - Y), the backward progress. */
324 if (v > 12 * (c + (dd < 0 ? -dd : dd))) /* Progress must exceed 12 times (cost + |dd|). */
327 && xoff < x && x <= xlim - SNAKE_LIMIT
328 && yoff < y && y <= ylim - SNAKE_LIMIT)
330 /* We have a good enough best diagonal; now insist
331 that it end with a significant snake. */
334 for (k = 0; EQUAL (xv[x + k], yv[y + k]); k++) /* Walk forward from (X, Y) while elements match. */
335 if (k == SNAKE_LIMIT - 1)
347 part->lo_minimal = false;
348 part->hi_minimal = true;
353 #endif /* USE_HEURISTIC */
355 /* Heuristic: if we've gone well beyond the call of duty, give up
356 and report halfway between our best results so far. */
357 if (c >= ctxt->too_expensive)
360 OFFSET fxbest IF_LINT (= 0); /* Given an initializer only via IF_LINT, to quiet uninitialized-use warnings. */
362 OFFSET bxbest IF_LINT (= 0); /* Likewise. */
364 /* Find forward diagonal that maximizes X + Y. */
366 for (d = fmax; d >= fmin; d -= 2)
368 OFFSET x = MIN (fd[d], xlim); /* Clamp to the upper X bound. */
382 /* Find backward diagonal that minimizes X + Y. */
383 bxybest = OFFSET_MAX;
384 for (d = bmax; d >= bmin; d -= 2)
386 OFFSET x = MAX (xoff, bd[d]); /* Clamp to the lower X bound. */
400 /* Use the better of the two diagonals. The left side is the backward
search's progress from (XLIM, YLIM); the right side is the forward
search's progress from (XOFF, YOFF). */
401 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
404 part->ymid = fxybest - fxbest; /* Y = (X + Y) - X on the best forward diagonal. */
405 part->lo_minimal = true;
406 part->hi_minimal = false;
411 part->ymid = bxybest - bxbest; /* Y = (X + Y) - X on the best backward diagonal. */
412 part->lo_minimal = false;
413 part->hi_minimal = true;
421 /* Compare in detail contiguous subsequences of the two vectors
422 which are known, as a whole, to match each other.
424 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
426 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors are origin-0.
429 If FIND_MINIMAL, find a minimal difference no matter how expensive it is.
432 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT.
434 Return false if terminated normally, or true if terminated through early abort. */
438 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
439 bool find_minimal, struct context *ctxt)
441 ELEMENT const *xv = ctxt->xvec; /* Local copies of the two vector pointers. */
442 ELEMENT const *yv = ctxt->yvec;
444 /* Slide down the bottom initial diagonal: skip the leading elements
that match in both ranges. */
445 while (xoff < xlim && yoff < ylim && EQUAL (xv[xoff], yv[yoff]))
451 /* Slide up the top initial diagonal: skip the trailing elements
that match in both ranges. */
452 while (xoff < xlim && yoff < ylim && EQUAL (xv[xlim - 1], yv[ylim - 1]))
458 /* Handle simple cases. */
462 NOTE_INSERT (ctxt, yoff); /* Record yvec[yoff] as an insertion. */
463 if (EARLY_ABORT (ctxt)) /* Give the includer's hook a chance to stop early. */
467 else if (yoff == ylim) /* The Y range is exhausted. */
470 NOTE_DELETE (ctxt, xoff); /* Record xvec[xoff] as a deletion. */
471 if (EARLY_ABORT (ctxt)) /* Give the includer's hook a chance to stop early. */
477 struct partition part IF_LINT2 (= { .xmid = 0, .ymid = 0 }); /* IF_LINT2 because the initializer contains a comma. */
479 /* Find a point of correspondence in the middle of the vectors.
diag stores the midpoint and the per-half minimality flags in PART. */
480 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
482 /* Use the partitions to split this problem into subproblems:
first the part before the midpoint, then the part after it. */
483 if (compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt))
485 if (compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt))
495 #undef EXTRA_CONTEXT_FIELDS