1 /* Analyze differences between two vectors.
3 Copyright (C) 1988-1989, 1992-1995, 2001-2004, 2006-2008 Free
4 Software Foundation, Inc.
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <http://www.gnu.org/licenses/>. */
20 /* The basic idea is to consider two vectors as similar if, when
21 transforming the first vector into the second vector through a
22 sequence of edits (inserts and deletes of one element each),
23 this sequence is short - or equivalently, if the ordered list
24 of elements that are untouched by these edits is long. For a
25 good introduction to the subject, read about the "Levenshtein
26 distance" in Wikipedia.
28 The basic algorithm is described in:
29 "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
30 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
31 see especially section 4.2, which describes the variation used below.
33 The basic algorithm was independently discovered as described in:
34 "Algorithms for Approximate String Matching", E. Ukkonen,
35 Information and Control Vol. 64, 1985, pp. 100-118.
37 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE
38 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
39 at the price of producing suboptimal output for large inputs with
42 /* Before including this file, you need to define:
43 ELEMENT The element type of the vectors being compared.
44 EQUAL A two-argument macro that tests two elements for
46 OFFSET A signed integer type sufficient to hold the
47 difference between two indices. Usually
48 something like ssize_t.
49 EXTRA_CONTEXT_FIELDS Declarations of fields for 'struct context'.
50 NOTE_DELETE(ctxt, xoff) Record the removal of the object xvec[xoff].
51 NOTE_INSERT(ctxt, yoff) Record the insertion of the object yvec[yoff].
52 USE_HEURISTIC (Optional) Define if you want to support the
53 heuristic for large vectors.
54 Before including this file, you also need to include:
60 /* Maximum value of type OFFSET. */
62 ((((OFFSET)1 << (sizeof (OFFSET) * CHAR_BIT - 2)) - 1) * 2 + 1)
64 /* Use this to suppress gcc's "... may be used uninitialized" warnings. */
67 # define IF_LINT(Code) Code
69 # define IF_LINT(Code) /* empty */
74 * Context of comparison operation.
78 /* Vectors being compared. */
85 /* Vector, indexed by diagonal, containing 1 + the X coordinate of the point
86 furthest along the given diagonal in the forward search of the edit
90 /* Vector, indexed by diagonal, containing the X coordinate of the point
91 furthest along the given diagonal in the backward search of the edit
96 /* This corresponds to the diff -H flag. With this heuristic, for
97 vectors with a constant small density of changes, the algorithm is
98 linear in the vector size. */
102 /* Edit scripts longer than this are too expensive to compute. */
103 OFFSET too_expensive;
105 /* Snakes bigger than this are considered `big'. */
106 #define SNAKE_LIMIT 20
111 /* Midpoints of this partition. */
115 /* True if low half will be analyzed minimally. */
118 /* Likewise for high half. */
123 /* Find the midpoint of the shortest edit script for a specified portion
126 Scan from the beginnings of the vectors, and simultaneously from the ends,
127 doing a breadth-first search through the space of edit sequences.
128 When the two searches meet, we have found the midpoint of the shortest
131 If FIND_MINIMAL is true, find the minimal edit script regardless of
132 expense. Otherwise, if the search is too expensive, use heuristics to
133 stop the search and report a suboptimal answer.
135 Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number
136 XMID - YMID, measured from the starting diagonal XOFF - YOFF, equals the
137 number of deleted elements minus the number of inserted elements (counting only elements before the midpoint).
139 Set PART->lo_minimal to true iff the minimal edit script for the
140 left half of the partition is known; similarly for PART->hi_minimal.
142 This function assumes that the first elements of the specified portions
143 of the two vectors do not match, and likewise that the last elements do not
144 match. The caller must trim matching elements from the beginning and end
145 of the portions it is going to specify.
147 If we return the "wrong" partitions, the worst this can do is cause
148 suboptimal diff output. It cannot cause incorrect diff output. */
151 diag (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim, bool find_minimal,
152 struct partition *part, struct context *ctxt)
154 OFFSET *const fd = ctxt->fdiag; /* Give the compiler a chance. */
155 OFFSET *const bd = ctxt->bdiag; /* Additional help for the compiler. */
156 ELEMENT const *const xv = ctxt->xvec; /* Still more help for the compiler. */
157 ELEMENT const *const yv = ctxt->yvec; /* And more and more . . . */
158 const OFFSET dmin = xoff - ylim; /* Minimum valid diagonal. */
159 const OFFSET dmax = xlim - yoff; /* Maximum valid diagonal. */
160 const OFFSET fmid = xoff - yoff; /* Center diagonal of top-down search. */
161 const OFFSET bmid = xlim - ylim; /* Center diagonal of bottom-up search. */
163 OFFSET fmax = fmid; /* Limits of top-down search. */
165 OFFSET bmax = bmid; /* Limits of bottom-up search. */
166 OFFSET c; /* Cost. */
167 bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd
168 diagonal with respect to the northwest. */
175 OFFSET d; /* Active diagonal. */
176 bool big_snake = false;
178 /* Extend the top-down search by an edit step in each diagonal. */
187 for (d = fmax; d >= fmin; d -= 2)
191 OFFSET tlo = fd[d - 1];
192 OFFSET thi = fd[d + 1];
193 OFFSET x0 = tlo < thi ? thi : tlo + 1;
195 for (x = x0, y = x0 - d;
196 x < xlim && y < ylim && EQUAL (xv[x], yv[y]);
199 if (x - x0 > SNAKE_LIMIT)
202 if (odd && bmin <= d && d <= bmax && bd[d] <= x)
206 part->lo_minimal = part->hi_minimal = true;
211 /* Similarly extend the bottom-up search. */
213 bd[--bmin - 1] = OFFSET_MAX;
217 bd[++bmax + 1] = OFFSET_MAX;
220 for (d = bmax; d >= bmin; d -= 2)
224 OFFSET tlo = bd[d - 1];
225 OFFSET thi = bd[d + 1];
226 OFFSET x0 = tlo < thi ? tlo : thi - 1;
228 for (x = x0, y = x0 - d;
229 xoff < x && yoff < y && EQUAL (xv[x - 1], yv[y - 1]);
232 if (x0 - x > SNAKE_LIMIT)
235 if (!odd && fmin <= d && d <= fmax && x <= fd[d])
239 part->lo_minimal = part->hi_minimal = true;
248 /* Heuristic: check occasionally for a diagonal that has made lots
249 of progress compared with the edit distance. If we have any
250 such, find the one that has made the most progress and return it
251 as if it had succeeded.
253 With this heuristic, for vectors with a constant small density
254 of changes, the algorithm is linear in the vector size. */
256 if (200 < c && big_snake && ctxt->heuristic)
260 for (d = fmax; d >= fmin; d -= 2)
262 OFFSET dd = d - fmid;
265 OFFSET v = (x - xoff) * 2 - dd;
267 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
270 && xoff + SNAKE_LIMIT <= x && x < xlim
271 && yoff + SNAKE_LIMIT <= y && y < ylim)
273 /* We have a good enough best diagonal; now insist
274 that it end with a significant snake. */
277 for (k = 1; EQUAL (xv[x - k], yv[y - k]); k++)
278 if (k == SNAKE_LIMIT)
290 part->lo_minimal = true;
291 part->hi_minimal = false;
296 for (d = bmax; d >= bmin; d -= 2)
298 OFFSET dd = d - bmid;
301 OFFSET v = (xlim - x) * 2 + dd;
303 if (v > 12 * (c + (dd < 0 ? -dd : dd)))
306 && xoff < x && x <= xlim - SNAKE_LIMIT
307 && yoff < y && y <= ylim - SNAKE_LIMIT)
309 /* We have a good enough best diagonal; now insist
310 that it end with a significant snake. */
313 for (k = 0; EQUAL (xv[x + k], yv[y + k]); k++)
314 if (k == SNAKE_LIMIT - 1)
326 part->lo_minimal = false;
327 part->hi_minimal = true;
331 #endif /* USE_HEURISTIC */
333 /* Heuristic: if we've gone well beyond the call of duty, give up
334 and report halfway between our best results so far. */
335 if (c >= ctxt->too_expensive)
338 OFFSET fxbest IF_LINT (= 0);
340 OFFSET bxbest IF_LINT (= 0);
342 /* Find forward diagonal that maximizes X + Y. */
344 for (d = fmax; d >= fmin; d -= 2)
346 OFFSET x = MIN (fd[d], xlim);
360 /* Find backward diagonal that minimizes X + Y. */
361 bxybest = OFFSET_MAX;
362 for (d = bmax; d >= bmin; d -= 2)
364 OFFSET x = MAX (xoff, bd[d]);
378 /* Use the better of the two diagonals. */
379 if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
382 part->ymid = fxybest - fxbest;
383 part->lo_minimal = true;
384 part->hi_minimal = false;
389 part->ymid = bxybest - bxbest;
390 part->lo_minimal = false;
391 part->hi_minimal = true;
399 /* Compare in detail contiguous subsequences of the two vectors
400 which are known, as a whole, to match each other.
402 The subsequence of vector 0 is [XOFF, XLIM) and likewise for vector 1.
404 Note that XLIM, YLIM are exclusive bounds. All indices into the vectors
407 If FIND_MINIMAL, find a minimal difference no matter how
410 The results are recorded by invoking NOTE_DELETE and NOTE_INSERT. */
413 compareseq (OFFSET xoff, OFFSET xlim, OFFSET yoff, OFFSET ylim,
414 bool find_minimal, struct context *ctxt)
416 ELEMENT const *xv = ctxt->xvec; /* Help the compiler. */
417 ELEMENT const *yv = ctxt->yvec;
419 /* Slide down the bottom initial diagonal. */
420 while (xoff < xlim && yoff < ylim && EQUAL (xv[xoff], yv[yoff]))
426 /* Slide up the top initial diagonal. */
427 while (xoff < xlim && yoff < ylim && EQUAL (xv[xlim - 1], yv[ylim - 1]))
433 /* Handle simple cases. */
437 NOTE_INSERT (ctxt, yoff);
440 else if (yoff == ylim)
443 NOTE_DELETE (ctxt, xoff);
448 struct partition part;
450 /* Find a point of correspondence in the middle of the vectors. */
451 diag (xoff, xlim, yoff, ylim, find_minimal, &part, ctxt);
453 /* Use the partitions to split this problem into subproblems. */
454 compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal, ctxt);
455 compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal, ctxt);
462 #undef EXTRA_CONTEXT_FIELDS