1 package Algorithm::Diff;
2 # Skip to first "=head" line for documentation.
5 use integer; # see below in _replaceNextLargerWith() for mod to make
6 # if you don't use this
7 use vars qw( $VERSION @EXPORT_OK );
9 # ^ ^^ ^^-- Incremented at will
10 # | \+----- Incremented for non-trivial changes to features
11 # \-------- Incremented for fundamental changes
13 *import = \&Exporter::import;
15 prepare LCS LCDidx LCS_length
16 diff sdiff compact_diff
17 traverse_sequences traverse_balanced
20 # McIlroy-Hunt diff algorithm
21 # Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com>
22 # by Ned Konz, perl@bike-nomad.com
23 # Updates by Tye McQueen, http://perlmonks.org/?node=tye
25 # Create a hash that maps each element of $aCollection to the set of
26 # positions it occupies in $aCollection, restricted to the elements
27 # within the range of indexes specified by $start and $end.
28 # The fourth parameter is a subroutine reference that will be called to
29 # generate a string to use as a key.
30 # Additional parameters, if any, will be passed to this subroutine.
32 # my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
34 sub _withPositionsOfInInterval
36 my $aCollection = shift; # array ref
42 for ( $index = $start ; $index <= $end ; $index++ )
44 my $element = $aCollection->[$index];
45 my $key = &$keyGen( $element, @_ );
46 if ( exists( $d{$key} ) )
48 unshift ( @{ $d{$key} }, $index );
55 return wantarray ? %d : \%d;
58 # Find the place at which aValue would normally be inserted into the
59 # array. If that place is already occupied by aValue, do nothing, and
60 # return undef. If the place does not exist (i.e., it is off the end of
61 # the array), add it to the end, otherwise replace the element at that
62 # point with aValue. It is assumed that the array's values are numeric.
63 # This is where the bulk (75%) of the time is spent in this module, so
64 # try to make it fast!
66 sub _replaceNextLargerWith
68 my ( $array, $aValue, $high ) = @_;
72 if ( $high == -1 || $aValue > $array->[-1] )
74 push ( @$array, $aValue );
78 # binary search for insertion point...
82 while ( $low <= $high )
84 $index = ( $high + $low ) / 2;
86 # $index = int(( $high + $low ) / 2); # without 'use integer'
87 $found = $array->[$index];
89 if ( $aValue == $found )
93 elsif ( $aValue > $found )
103 # now insertion point is in $low.
104 $array->[$low] = $aValue; # overwrite next larger
108 # This method computes the longest common subsequence in $a and $b.
110 # Result is array or ref, whose contents is such that
111 # $a->[ $i ] == $b->[ $result[ $i ] ]
112 # foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.
114 # An additional argument may be passed; this is a hash or key generating
115 # function that should return a string that uniquely identifies the given
116 # element. It should be the case that if the key is the same, the elements
117 # will compare the same. If this parameter is undef or missing, the key
118 # will be the element as a string.
120 # By default, comparisons will use "eq" and elements will be turned into keys
121 # using the default stringizing operator '""'.
123 # Additional parameters, if any, will be passed to the key generation
126 sub _longestCommonSubsequence
128 my $a = shift; # array ref or hash ref
129 my $b = shift; # array ref or hash ref
130 my $counting = shift; # scalar
131 my $keyGen = shift; # code ref
132 my $compare; # code ref
134 if ( ref($a) eq 'HASH' )
135 { # prepared hash must be in $b
141 # Check for bogus (non-ref) argument values
142 if ( !ref($a) || !ref($b) )
144 my @callerInfo = caller(1);
145 die 'error: must pass array or hash references to ' . $callerInfo[3];
149 # Note that these are optimized.
150 if ( !defined($keyGen) ) # optimize for strings
152 $keyGen = sub { $_[0] };
153 $compare = sub { my ( $a, $b ) = @_; $a eq $b };
160 &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
164 my ( $aStart, $aFinish, $matchVector ) = ( 0, $#$a, [] );
165 my ( $prunedCount, $bMatches ) = ( 0, {} );
167 if ( ref($b) eq 'HASH' ) # was $bMatches prepared for us?
173 my ( $bStart, $bFinish ) = ( 0, $#$b );
175 # First we prune off any common elements at the beginning
176 while ( $aStart <= $aFinish
177 and $bStart <= $bFinish
178 and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
180 $matchVector->[ $aStart++ ] = $bStart++;
185 while ( $aStart <= $aFinish
186 and $bStart <= $bFinish
187 and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
189 $matchVector->[ $aFinish-- ] = $bFinish--;
193 # Now compute the equivalence classes of positions of elements
195 _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
200 my ( $i, $ai, $j, $k );
201 for ( $i = $aStart ; $i <= $aFinish ; $i++ )
203 $ai = &$keyGen( $a->[$i], @_ );
204 if ( exists( $bMatches->{$ai} ) )
207 for $j ( @{ $bMatches->{$ai} } )
210 # optimization: most of the time this will be true
211 if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
217 $k = _replaceNextLargerWith( $thresh, $j, $k );
220 # oddly, it's faster to always test this (CPU cache?).
224 [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
232 return $prunedCount + @$thresh if $counting;
233 for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
235 $matchVector->[ $link->[1] ] = $link->[2];
243 return wantarray ? @$matchVector : $matchVector;
246 sub traverse_sequences
248 my $a = shift; # array ref
249 my $b = shift; # array ref
250 my $callbacks = shift || {};
252 my $matchCallback = $callbacks->{'MATCH'} || sub { };
253 my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
254 my $finishedACallback = $callbacks->{'A_FINISHED'};
255 my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
256 my $finishedBCallback = $callbacks->{'B_FINISHED'};
257 my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
259 # Process all the lines in @$matchVector
265 for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
267 my $bLine = $matchVector->[$ai];
268 if ( defined($bLine) ) # matched
270 &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
271 &$matchCallback( $ai, $bi++, @_ );
275 &$discardACallback( $ai, $bi, @_ );
279 # The last entry (if any) processed was a match.
280 # $ai and $bi point just past the last matching lines in their sequences.
282 while ( $ai <= $lastA or $bi <= $lastB )
286 if ( $ai == $lastA + 1 and $bi <= $lastB )
288 if ( defined($finishedACallback) )
290 &$finishedACallback( $lastA, @_ );
291 $finishedACallback = undef;
295 &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
300 if ( $bi == $lastB + 1 and $ai <= $lastA )
302 if ( defined($finishedBCallback) )
304 &$finishedBCallback( $lastB, @_ );
305 $finishedBCallback = undef;
309 &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
313 &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
314 &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
320 sub traverse_balanced
322 my $a = shift; # array ref
323 my $b = shift; # array ref
324 my $callbacks = shift || {};
326 my $matchCallback = $callbacks->{'MATCH'} || sub { };
327 my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
328 my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
329 my $changeCallback = $callbacks->{'CHANGE'};
330 my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
332 # Process all the lines in match vector
343 # Find next match indices $ma and $mb
347 $ma <= $#$matchVector
348 && !defined $matchVector->[$ma]
351 last if $ma > $#$matchVector; # end of matchVector?
352 $mb = $matchVector->[$ma];
354 # Proceed with discard a/b or change events until
356 while ( $ai < $ma || $bi < $mb )
359 if ( $ai < $ma && $bi < $mb )
363 if ( defined $changeCallback )
365 &$changeCallback( $ai++, $bi++, @_ );
369 &$discardACallback( $ai++, $bi, @_ );
370 &$discardBCallback( $ai, $bi++, @_ );
375 &$discardACallback( $ai++, $bi, @_ );
381 &$discardBCallback( $ai, $bi++, @_ );
386 &$matchCallback( $ai++, $bi++, @_ );
389 while ( $ai <= $lastA || $bi <= $lastB )
391 if ( $ai <= $lastA && $bi <= $lastB )
395 if ( defined $changeCallback )
397 &$changeCallback( $ai++, $bi++, @_ );
401 &$discardACallback( $ai++, $bi, @_ );
402 &$discardBCallback( $ai, $bi++, @_ );
405 elsif ( $ai <= $lastA )
407 &$discardACallback( $ai++, $bi, @_ );
413 &$discardBCallback( $ai, $bi++, @_ );
422 my $a = shift; # array ref
423 my $keyGen = shift; # code ref
426 $keyGen = sub { $_[0] } unless defined($keyGen);
428 return scalar _withPositionsOfInInterval( $a, 0, $#$a, $keyGen, @_ );
433 my $a = shift; # array ref
434 my $b = shift; # array ref or hash ref
435 my $matchVector = _longestCommonSubsequence( $a, $b, 0, @_ );
438 for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
440 if ( defined( $matchVector->[$i] ) )
442 push ( @retval, $a->[$i] );
445 return wantarray ? @retval : \@retval;
450 my $a = shift; # array ref
451 my $b = shift; # array ref or hash ref
452 return _longestCommonSubsequence( $a, $b, 1, @_ );
459 my $match= _longestCommonSubsequence( $a, $b, 0, @_ );
460 my @am= grep defined $match->[$_], 0..$#$match;
461 my @bm= @{$match}[@am];
469 my( $am, $bm )= LCSidx( $a, $b, @_ );
471 my( $ai, $bi )= ( 0, 0 );
472 push @cdiff, $ai, $bi;
474 while( @$am && $ai == $am->[0] && $bi == $bm->[0] ) {
479 push @cdiff, $ai, $bi;
483 push @cdiff, $ai, $bi;
485 push @cdiff, 0+@$a, 0+@$b
486 if $ai < @$a || $bi < @$b;
487 return wantarray ? @cdiff : \@cdiff;
492 my $a = shift; # array ref
493 my $b = shift; # array ref
497 push @$hunk, [ '-', $_[0], $a->[ $_[0] ] ];
500 push @$hunk, [ '+', $_[1], $b->[ $_[1] ] ];
507 traverse_sequences( $a, $b,
508 { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
510 return wantarray ? @$retval : $retval;
515 my $a = shift; # array ref
516 my $b = shift; # array ref
518 my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) };
519 my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) };
521 push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] );
524 push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] );
531 DISCARD_A => $discard,
537 return wantarray ? @$retval : $retval;
540 ########################################
541 my $Root= __PACKAGE__;
542 package Algorithm::Diff::_impl;
545 sub _Idx() { 0 } # $me->[_Idx]: Ref to array of hunk indices
546 # 1 # $me->[1]: Ref to first sequence
547 # 2 # $me->[2]: Ref to second sequence
548 sub _End() { 3 } # $me->[_End]: Diff between forward and reverse pos
549 sub _Same() { 4 } # $me->[_Same]: 1 if pos 1 contains unchanged items
550 sub _Base() { 5 } # $me->[_Base]: Added to range's min and max
551 sub _Pos() { 6 } # $me->[_Pos]: Which hunk is currently selected
552 sub _Off() { 7 } # $me->[_Off]: Offset into _Idx for current position
553 sub _Min() { -2 } # Added to _Off to get min instead of max+1
564 return if $me->[_Pos];
565 my $meth= ( caller(1) )[3];
566 Die( "Called $meth on 'reset' object" );
572 return $seq + $me->[_Off]
573 if 1 == $seq || 2 == $seq;
574 my $meth= ( caller(1) )[3];
575 Die( "$meth: Invalid sequence number ($seq); must be 1 or 2" );
581 return ref $us if ref $us;
582 return $us . "::_obj";
587 my( $us, $seq1, $seq2, $opts ) = @_;
589 for( $opts->{keyGen} ) {
590 push @args, $_ if $_;
592 for( $opts->{keyGenArgs} ) {
593 push @args, @$_ if $_;
595 my $cdif= Algorithm::Diff::compact_diff( $seq1, $seq2, @args );
597 if( 0 == $cdif->[2] && 0 == $cdif->[3] ) {
601 my @obj= ( $cdif, $seq1, $seq2 );
602 $obj[_End] = (1+@$cdif)/2;
605 my $me = bless \@obj, $us->getObjPkg();
613 $pos= int( $pos || 0 );
617 if $pos < 0 || $me->[_End] <= $pos;
618 $me->[_Pos]= $pos || !1;
619 $me->[_Off]= 2*$pos - 1;
625 my( $me, $base )= @_;
626 my $oldBase= $me->[_Base];
627 $me->[_Base]= 0+$base if defined $base;
633 my( $me, $pos, $base )= @_;
635 my $you= bless \@obj, ref($me);
636 $you->Reset( $pos ) if defined $pos;
642 my( $me, $steps )= @_;
643 $steps= 1 if ! defined $steps;
645 my $pos= $me->[_Pos];
646 my $new= $pos + $steps;
647 $new= 0 if $pos && $new < 0;
654 my( $me, $steps )= @_;
655 $steps= 1 if ! defined $steps;
656 my $pos= $me->Next(-$steps);
657 $pos -= $me->[_End] if $pos;
664 return 0 if $me->[_Same] == ( 1 & $me->[_Pos] );
666 my $off= $me->[_Off];
667 for my $seq ( 1, 2 ) {
669 if $me->[_Idx][ $off + $seq + _Min ]
670 < $me->[_Idx][ $off + $seq ];
676 my( $me, $seq, $base )= @_;
678 my $off= $me->_ChkSeq($seq);
679 $base= $me->[_Base] if !defined $base;
680 return $base + $me->[_Idx][ $off + _Min ];
684 my( $me, $seq, $base )= @_;
686 my $off= $me->_ChkSeq($seq);
687 $base= $me->[_Base] if !defined $base;
688 return $base + $me->[_Idx][ $off ] -1;
692 my( $me, $seq, $base )= @_;
694 my $off = $me->_ChkSeq($seq);
696 return $me->[_Idx][ $off ]
697 - $me->[_Idx][ $off + _Min ];
699 $base= $me->[_Base] if !defined $base;
700 return ( $base + $me->[_Idx][ $off + _Min ] )
701 .. ( $base + $me->[_Idx][ $off ] - 1 );
707 my $off = $me->_ChkSeq($seq);
709 return $me->[_Idx][ $off ]
710 - $me->[_Idx][ $off + _Min ];
714 $me->[_Idx][ $off + _Min ]
715 .. ( $me->[_Idx][ $off ] - 1 )
722 return wantarray ? () : 0
723 if $me->[_Same] != ( 1 & $me->[_Pos] );
724 return $me->Items(1);
736 items=> \&Items, # same thing
746 for my $word ( split ' ', $arg ) {
748 if( $word !~ /^(-?\d+)?([a-zA-Z]+)([12])?$/
749 || not $meth= $getName{ lc $2 }
751 Die( $Root, ", Get: Invalid request ($word)" );
753 my( $base, $name, $seq )= ( $1, $2, $3 );
757 : $meth->( $me, $seq, $base )
763 } elsif( 1 == @value ) {
766 Die( 0+@value, " values requested from ",
767 $Root, "'s Get in scalar context" );
771 my $Obj= getObjPkg($Root);
774 for my $meth ( qw( new getObjPkg ) ) {
775 *{$Root."::".$meth} = \&{$meth};
776 *{$Obj ."::".$meth} = \&{$meth};
779 Next Prev Reset Copy Base Diff
780 Same Items Range Min Max Get
783 *{$Obj."::".$meth} = \&{$meth};
791 Algorithm::Diff - Compute `intelligent' differences between two files / lists
795 require Algorithm::Diff;
797 # This example produces traditional 'diff' output:
799 my $diff = Algorithm::Diff->new( \@seq1, \@seq2 );
801 $diff->Base( 1 ); # Return line numbers, not indices
802 while( $diff->Next() ) {
803 next if $diff->Same();
805 if( ! $diff->Items(2) ) {
806 sprintf "%d,%dd%d\n",
807 $diff->Get(qw( Min1 Max1 Max2 ));
808 } elsif( ! $diff->Items(1) ) {
810 $diff->Get(qw( Max1 Min2 Max2 ));
813 sprintf "%d,%dc%d,%d\n",
814 $diff->Get(qw( Min1 Max1 Min2 Max2 ));
816 print "< $_" for $diff->Items(1);
818 print "> $_" for $diff->Items(2);
822 # Alternate interfaces:
824 use Algorithm::Diff qw(
825 LCS LCS_length LCSidx
826 diff sdiff compact_diff
827 traverse_sequences traverse_balanced );
829 @lcs = LCS( \@seq1, \@seq2 );
830 $lcsref = LCS( \@seq1, \@seq2 );
831 $count = LCS_length( \@seq1, \@seq2 );
833 ( $seq1idxref, $seq2idxref ) = LCSidx( \@seq1, \@seq2 );
836 # Complicated interfaces:
838 @diffs = diff( \@seq1, \@seq2 );
840 @sdiffs = sdiff( \@seq1, \@seq2 );
842 @cdiffs = compact_diff( \@seq1, \@seq2 );
847 { MATCH => \&callback1,
848 DISCARD_A => \&callback2,
849 DISCARD_B => \&callback3,
858 { MATCH => \&callback1,
859 DISCARD_A => \&callback2,
860 DISCARD_B => \&callback3,
861 CHANGE => \&callback4,
870 (by Mark-Jason Dominus)
872 I once read an article written by the authors of C<diff>; they said
873 that they worked very hard on the algorithm until they found the
876 I think what they ended up using (and I hope someone will correct me,
877 because I am not very confident about this) was the `longest common
878 subsequence' method. In the LCS problem, you have two sequences of
883 a b c d e f g i j k r x y z
885 and you want to find the longest sequence of items that is present in
886 both original sequences in the same order. That is, you want to find
887 a new sequence I<S> which can be obtained from the first sequence by
888 deleting some items, and from the secend sequence by deleting other
889 items. You also want I<S> to be as long as possible. In this case I<S>
894 From there it's only a small step to get diff-like output:
899 This module solves the LCS problem. It also includes a canned function
900 to generate C<diff>-like output.
902 It might seem from the example above that the LCS of two sequences is
903 always pretty obvious, but that's not always the case, especially when
904 the two sequences have many repeated elements. For example, consider
909 A naive approach might start by matching up the C<a> and C<b> that
910 appear at the beginning of each sequence, like this:
915 This finds the common subsequence C<a b c z>. But actually, the LCS
928 (See also the README file and several example
929 scripts include with this module.)
931 This module now provides an object-oriented interface that uses less
932 memory and is easier to use than most of the previous procedural
933 interfaces. It also still provides several exportable functions. We'll
934 deal with these in ascending order of difficulty: C<LCS>,
935 C<LCS_length>, C<LCSidx>, OO interface, C<prepare>, C<diff>, C<sdiff>,
936 C<traverse_sequences>, and C<traverse_balanced>.
940 Given references to two lists of items, LCS returns an array containing
941 their longest common subsequence. In scalar context, it returns a
942 reference to such a list.
944 @lcs = LCS( \@seq1, \@seq2 );
945 $lcsref = LCS( \@seq1, \@seq2 );
947 C<LCS> may be passed an optional third parameter; this is a CODE
948 reference to a key generation function. See L</KEY GENERATION
951 @lcs = LCS( \@seq1, \@seq2, \&keyGen, @args );
952 $lcsref = LCS( \@seq1, \@seq2, \&keyGen, @args );
954 Additional parameters, if any, will be passed to the key generation
959 This is just like C<LCS> except it only returns the length of the
960 longest common subsequence. This provides a performance gain of about
961 9% compared to C<LCS>.
965 Like C<LCS> except it returns references to two arrays. The first array
966 contains the indices into @seq1 where the LCS items are located. The
967 second array contains the indices into @seq2 where the LCS items are located.
969 Therefore, the following three lists will contain the same values:
971 my( $idx1, $idx2 ) = LCSidx( \@seq1, \@seq2 );
972 my @list1 = @seq1[ @$idx1 ];
973 my @list2 = @seq2[ @$idx2 ];
974 my @list3 = LCS( \@seq1, \@seq2 );
978 $diff = Algorithm::Diffs->new( \@seq1, \@seq2 );
979 $diff = Algorithm::Diffs->new( \@seq1, \@seq2, \%opts );
981 C<new> computes the smallest set of additions and deletions necessary
982 to turn the first sequence into the second and compactly records them
985 You use the object to iterate over I<hunks>, where each hunk represents
986 a contiguous section of items which should be added, deleted, replaced,
991 The following summary of all of the methods looks a lot like Perl code
992 but some of the symbols have different meanings:
994 [ ] Encloses optional arguments
995 : Is followed by the default value for an optional argument
996 | Separates alternate return results
1000 $obj = Algorithm::Diff->new( \@seq1, \@seq2, [ \%opts ] );
1001 $pos = $obj->Next( [ $count : 1 ] );
1002 $revPos = $obj->Prev( [ $count : 1 ] );
1003 $obj = $obj->Reset( [ $pos : 0 ] );
1004 $copy = $obj->Copy( [ $pos, [ $newBase ] ] );
1005 $oldBase = $obj->Base( [ $newBase ] );
1007 Note that all of the following methods C<die> if used on an object that
1008 is "reset" (not currently pointing at any hunk).
1010 $bits = $obj->Diff( );
1011 @items|$cnt = $obj->Same( );
1012 @items|$cnt = $obj->Items( $seqNum );
1013 @idxs |$cnt = $obj->Range( $seqNum, [ $base ] );
1014 $minIdx = $obj->Min( $seqNum, [ $base ] );
1015 $maxIdx = $obj->Max( $seqNum, [ $base ] );
1016 @values = $obj->Get( @names );
1018 Passing in C<undef> for an optional argument is always treated the same
1019 as if no argument were passed in.
1023 $pos = $diff->Next(); # Move forward 1 hunk
1024 $pos = $diff->Next( 2 ); # Move forward 2 hunks
1025 $pos = $diff->Next(-5); # Move backward 5 hunks
1027 C<Next> moves the object to point at the next hunk. The object starts
1028 out "reset", which means it isn't pointing at any hunk. If the object
1029 is reset, then C<Next()> moves to the first hunk.
1031 C<Next> returns a true value iff the move didn't go past the last hunk.
1032 So C<Next(0)> will return true iff the object is not reset.
1034 Actually, C<Next> returns the object's new position, which is a number
1035 between 1 and the number of hunks (inclusive), or returns a false value.
1039 C<Prev($N)> is almost identical to C<Next(-$N)>; it moves to the $Nth
1040 previous hunk. On a 'reset' object, C<Prev()> [and C<Next(-1)>] move
1043 The position returned by C<Prev> is relative to the I<end> of the
1044 hunks; -1 for the last hunk, -2 for the second-to-last, etc.
1048 $diff->Reset(); # Reset the object's position
1049 $diff->Reset($pos); # Move to the specified hunk
1050 $diff->Reset(1); # Move to the first hunk
1051 $diff->Reset(-1); # Move to the last hunk
1053 C<Reset> returns the object, so, for example, you could use
1054 C<< $diff->Reset()->Next(-1) >> to get the number of hunks.
1058 $copy = $diff->Copy( $newPos, $newBase );
1060 C<Copy> returns a copy of the object. The copy and the orignal object
1061 share most of their data, so making copies takes very little memory.
1062 The copy maintains its own position (separate from the original), which
1063 is the main purpose of copies. It also maintains its own base.
1065 By default, the copy's position starts out the same as the original
1066 object's position. But C<Copy> takes an optional first argument to set the
1067 new position, so the following three snippets are equivalent:
1069 $copy = $diff->Copy($pos);
1071 $copy = $diff->Copy();
1074 $copy = $diff->Copy()->Reset($pos);
1076 C<Copy> takes an optional second argument to set the base for
1077 the copy. If you wish to change the base of the copy but leave
1078 the position the same as in the original, here are two
1081 $copy = $diff->Copy();
1084 $copy = $diff->Copy(undef,0);
1086 Here are two equivalent way to get a "reset" copy:
1088 $copy = $diff->Copy(0);
1090 $copy = $diff->Copy()->Reset();
1094 $bits = $obj->Diff();
1096 C<Diff> returns a true value iff the current hunk contains items that are
1097 different between the two sequences. It actually returns one of the
1104 C<3==(1|2)>. This hunk contains items from @seq1 and the items
1105 from @seq2 that should replace them. Both sequence 1 and 2
1106 contain changed items so both the 1 and 2 bits are set.
1110 This hunk only contains items from @seq2 that should be inserted (not
1111 items from @seq1). Only sequence 2 contains changed items so only the 2
1116 This hunk only contains items from @seq1 that should be deleted (not
1117 items from @seq2). Only sequence 1 contains changed items so only the 1
1122 This means that the items in this hunk are the same in both sequences.
1123 Neither sequence 1 nor 2 contain changed items so neither the 1 nor the
1130 C<Same> returns a true value iff the current hunk contains items that
1131 are the same in both sequences. It actually returns the list of items
1132 if they are the same or an emty list if they aren't. In a scalar
1133 context, it returns the size of the list.
1137 $count = $diff->Items(2);
1138 @items = $diff->Items($seqNum);
1140 C<Items> returns the (number of) items from the specified sequence that
1141 are part of the current hunk.
1143 If the current hunk contains only insertions, then
1144 C<< $diff->Items(1) >> will return an empty list (0 in a scalar conext).
1145 If the current hunk contains only deletions, then C<< $diff->Items(2) >>
1146 will return an empty list (0 in a scalar conext).
1148 If the hunk contains replacements, then both C<< $diff->Items(1) >> and
1149 C<< $diff->Items(2) >> will return different, non-empty lists.
1151 Otherwise, the hunk contains identical items and all of the following
1152 will return the same lists:
1154 @items = $diff->Items(1);
1155 @items = $diff->Items(2);
1156 @items = $diff->Same();
1160 $count = $diff->Range( $seqNum );
1161 @indices = $diff->Range( $seqNum );
1162 @indices = $diff->Range( $seqNum, $base );
1164 C<Range> is like C<Items> except that it returns a list of I<indices> to
1165 the items rather than the items themselves. By default, the index of
1166 the first item (in each sequence) is 0 but this can be changed by
1167 calling the C<Base> method. So, by default, the following two snippets
1168 return the same lists:
1170 @list = $diff->Items(2);
1171 @list = @seq2[ $diff->Range(2) ];
1173 You can also specify the base to use as the second argument. So the
1174 following two snippets I<always> return the same lists:
1176 @list = $diff->Items(1);
1177 @list = @seq1[ $diff->Range(1,0) ];
1181 $curBase = $diff->Base();
1182 $oldBase = $diff->Base($newBase);
1184 C<Base> sets and/or returns the current base (usually 0 or 1) that is
1185 used when you request range information. The base defaults to 0 so
1186 that range information is returned as array indices. You can set the
1187 base to 1 if you want to report traditional line numbers instead.
1191 $min1 = $diff->Min(1);
1192 $min = $diff->Min( $seqNum, $base );
1194 C<Min> returns the first value that C<Range> would return (given the
1195 same arguments) or returns C<undef> if C<Range> would return an empty
1200 C<Max> returns the last value that C<Range> would return or C<undef>.
1204 ( $n, $x, $r ) = $diff->Get(qw( min1 max1 range1 ));
1205 @values = $diff->Get(qw( 0min2 1max2 range2 same base ));
1207 C<Get> returns one or more scalar values. You pass in a list of the
1208 names of the values you want returned. Each name must match one of the
1211 /^(-?\d+)?(min|max)[12]$/i
1212 /^(range[12]|same|diff|base)$/i
1214 The 1 or 2 after a name says which sequence you want the information
1215 for (and where allowed, it is required). The optional number before
1216 "min" or "max" is the base to use. So the following equalities hold:
1218 $diff->Get('min1') == $diff->Min(1)
1219 $diff->Get('0min2') == $diff->Min(2,0)
1221 Using C<Get> in a scalar context when you've passed in more than one
1222 name is a fatal error (C<die> is called).
1228 Given a reference to a list of items, C<prepare> returns a reference
1229 to a hash which can be used when comparing this sequence to other
1230 sequences with C<LCS> or C<LCS_length>.
1232 $prep = prepare( \@seq1 );
1233 for $i ( 0 .. 10_000 )
1235 @lcs = LCS( $prep, $seq[$i] );
1236 # do something useful with @lcs
1239 C<prepare> may be passed an optional third parameter; this is a CODE
1240 reference to a key generation function. See L</KEY GENERATION
1243 $prep = prepare( \@seq1, \&keyGen );
1244 for $i ( 0 .. 10_000 )
1246 @lcs = LCS( $seq[$i], $prep, \&keyGen );
1247 # do something useful with @lcs
1250 Using C<prepare> provides a performance gain of about 50% when calling LCS
1251 many times compared with not preparing.
1255 @diffs = diff( \@seq1, \@seq2 );
1256 $diffs_ref = diff( \@seq1, \@seq2 );
1258 C<diff> computes the smallest set of additions and deletions necessary
1259 to turn the first sequence into the second, and returns a description
1260 of these changes. The description is a list of I<hunks>; each hunk
1261 represents a contiguous section of items which should be added,
1262 deleted, or replaced. (Hunks containing unchanged items are not
1265 The return value of C<diff> is a list of hunks, or, in scalar context, a
1266 reference to such a list. If there are no differences, the list will be
1269 Here is an example. Calling C<diff> for the following two sequences:
1272 b c d e f j k l m r s t
1274 would produce the following list:
1277 [ [ '-', 0, 'a' ] ],
1279 [ [ '+', 2, 'd' ] ],
1284 [ [ '+', 6, 'k' ] ],
1293 There are five hunks here. The first hunk says that the C<a> at
1294 position 0 of the first sequence should be deleted (C<->). The second
1295 hunk says that the C<d> at position 2 of the second sequence should
1296 be inserted (C<+>). The third hunk says that the C<h> at position 4
1297 of the first sequence should be removed and replaced with the C<f>
1298 from position 4 of the second sequence. And so on.
1300 C<diff> may be passed an optional third parameter; this is a CODE
1301 reference to a key generation function. See L</KEY GENERATION
1304 Additional parameters, if any, will be passed to the key generation
1309 @sdiffs = sdiff( \@seq1, \@seq2 );
1310 $sdiffs_ref = sdiff( \@seq1, \@seq2 );
1312 C<sdiff> computes all necessary components to show two sequences
1313 and their minimized differences side by side, just like the
1314 Unix-utility I<sdiff> does:
1321 It returns a list of array refs, each pointing to an array of
1322 display instructions. In scalar context it returns a reference
1323 to such a list. If there are no differences, the list will have one
1324 entry per item, each indicating that the item was unchanged.
1326 Display instructions consist of three elements: A modifier indicator
1327 (C<+>: Element added, C<->: Element removed, C<u>: Element unmodified,
1328 C<c>: Element changed) and the value of the old and new elements, to
1329 be displayed side-by-side.
1331 An C<sdiff> of the following two sequences:
1334 b c d e f j k l m r s t
1353 C<sdiff> may be passed an optional third parameter; this is a CODE
1354 reference to a key generation function. See L</KEY GENERATION
1357 Additional parameters, if any, will be passed to the key generation
1360 =head2 C<compact_diff>
1362 C<compact_diff> is much like C<sdiff> except it returns a much more
1363 compact description consisting of just one flat list of indices. An
1364 example helps explain the format:
1366 my @a = qw( a b c e h j l m n p );
1367 my @b = qw( b c d e f j k l m r s t );
1368 @cdiff = compact_diff( \@a, \@b );
1371 # start start values values
1385 The 0th, 2nd, 4th, etc. entries are all indices into @seq1 (@a in the
1386 above example) indicating where a hunk begins. The 1st, 3rd, 5th, etc.
1387 entries are all indices into @seq2 (@b in the above example) indicating
1388 where the same hunk begins.
1390 So each pair of indices (except the last pair) describes where a hunk
1391 begins (in each sequence). Since each hunk must end at the item just
1392 before the item that starts the next hunk, the next pair of indices can
1393 be used to determine where the hunk ends.
1395 So, the first 4 entries (0..3) describe the first hunk. Entries 0 and 1
1396 describe where the first hunk begins (and so are always both 0).
1397 Entries 2 and 3 describe where the next hunk begins, so subtracting 1
1398 from each tells us where the first hunk ends. That is, the first hunk
1399 contains items C<$diff[0]> through C<$diff[2] - 1> of the first sequence
1400 and contains items C<$diff[1]> through C<$diff[3] - 1> of the second
1403 In other words, the first hunk consists of the following two lists of items:
1406 # of indices of indices
1407 @list1 = @a[ $cdiff[0] .. $cdiff[2]-1 ];
1408 @list2 = @b[ $cdiff[1] .. $cdiff[3]-1 ];
1409 # Hunk start Hunk end
1411 Note that the hunks will always alternate between those that are part of
1412 the LCS (those that contain unchanged items) and those that contain
1413 changes. This means that all we need to be told is whether the first
1414 hunk is a 'same' or 'diff' hunk and we can determine which of the other
1415 hunks contain 'same' items or 'diff' items.
1417 By convention, we always make the first hunk contain unchanged items.
1418 So the 1st, 3rd, 5th, etc. hunks (all odd-numbered hunks if you start
1419 counting from 1) all contain unchanged items. And the 2nd, 4th, 6th,
1420 etc. hunks (all even-numbered hunks if you start counting from 1) all
1421 contain changed items.
1423 Since @a and @b don't begin with the same value, the first hunk in our
1424 example is empty (otherwise we'd violate the above convention). Note
1425 that the first 4 index values in our example are all zero. Plug these
1426 values into our previous code block and we get:
1428 @hunk1a = @a[ 0 .. 0-1 ];
1429 @hunk1b = @b[ 0 .. 0-1 ];
1431 And C<0..-1> returns the empty list.
1433 Move down one pair of indices (2..5) and we get the offset ranges for
1434 the second hunk, which contains changed items.
1436 Since C<@diff[2..5]> contains (0,0,1,0) in our example, the second hunk
1437 consists of these two lists of items:
1439 @hunk2a = @a[ $cdiff[2] .. $cdiff[4]-1 ];
1440 @hunk2b = @b[ $cdiff[3] .. $cdiff[5]-1 ];
1442 @hunk2a = @a[ 0 .. 1-1 ];
1443 @hunk2b = @b[ 0 .. 0-1 ];
1445 @hunk2a = @a[ 0 .. 0 ];
1446 @hunk2b = @b[ 0 .. -1 ];
1451 That is, we would delete item 0 ('a') from @a.
1453 Since C<@diff[4..7]> contains (1,0,3,2) in our example, the third hunk
1454 consists of these two lists of items:
1456 @hunk3a = @a[ $cdiff[4] .. $cdiff[6]-1 ];
1457 @hunk3a = @b[ $cdiff[5] .. $cdiff[7]-1 ];
1459 @hunk3a = @a[ 1 .. 3-1 ];
1460 @hunk3a = @b[ 0 .. 2-1 ];
1462 @hunk3a = @a[ 1 .. 2 ];
1463 @hunk3a = @b[ 0 .. 1 ];
1465 @hunk3a = qw( b c );
1466 @hunk3a = qw( b c );
1468 Note that this third hunk contains unchanged items as our convention demands.
1470 You can continue this process until you reach the last two indices,
1471 which will always be the number of items in each sequence. This is
1472 required so that subtracting one from each will give you the indices to
1473 the last items in each sequence.
1475 =head2 C<traverse_sequences>
1477 C<traverse_sequences> used to be the most general facility provided by
1478 this module (the new OO interface is more powerful and much easier to
1481 Imagine that there are two arrows. Arrow A points to an element of
1482 sequence A, and arrow B points to an element of the sequence B.
1483 Initially, the arrows point to the first elements of the respective
1484 sequences. C<traverse_sequences> will advance the arrows through the
1485 sequences one element at a time, calling an appropriate user-specified
1486 callback function before each advance. It willadvance the arrows in
1487 such a way that if there are equal elements C<$A[$i]> and C<$B[$j]>
1488 which are equal and which are part of the LCS, there will be some moment
1489 during the execution of C<traverse_sequences> when arrow A is pointing
1490 to C<$A[$i]> and arrow B is pointing to C<$B[$j]>. When this happens,
1491 C<traverse_sequences> will call the C<MATCH> callback function and then
1492 it will advance both arrows.
1494 Otherwise, one of the arrows is pointing to an element of its sequence
1495 that is not part of the LCS. C<traverse_sequences> will advance that
1496 arrow and will call the C<DISCARD_A> or the C<DISCARD_B> callback,
1497 depending on which arrow it advanced. If both arrows point to elements
1498 that are not part of the LCS, then C<traverse_sequences> will advance
1499 one of them and call the appropriate callback, but it is not specified
1502 The arguments to C<traverse_sequences> are the two sequences to
1503 traverse, and a hash which specifies the callback functions, like this:
1507 { MATCH => $callback_1,
1508 DISCARD_A => $callback_2,
1509 DISCARD_B => $callback_3,
1513 Callbacks for MATCH, DISCARD_A, and DISCARD_B are invoked with at least
1514 the indices of the two arrows as their arguments. They are not expected
1515 to return any values. If a callback is omitted from the table, it is
1518 Callbacks for A_FINISHED and B_FINISHED are invoked with at least the
1519 corresponding index in A or B.
1521 If arrow A reaches the end of its sequence, before arrow B does,
1522 C<traverse_sequences> will call the C<A_FINISHED> callback when it
1523 advances arrow B, if there is such a function; if not it will call
1524 C<DISCARD_B> instead. Similarly if arrow B finishes first.
1525 C<traverse_sequences> returns when both arrows are at the ends of their
1526 respective sequences. It returns true on success and false on failure.
1527 At present there is no way to fail.
1529 C<traverse_sequences> may be passed an optional fourth parameter; this
1530 is a CODE reference to a key generation function. See L</KEY GENERATION
1533 Additional parameters, if any, will be passed to the key generation function.
1535 If you want to pass additional parameters to your callbacks, but don't
1536 need a custom key generation function, you can get the default by
1541 { MATCH => $callback_1,
1542 DISCARD_A => $callback_2,
1543 DISCARD_B => $callback_3,
1545 undef, # default key-gen
1551 C<traverse_sequences> does not have a useful return value; you are
1552 expected to plug in the appropriate behavior with the callback
1555 =head2 C<traverse_balanced>
1557 C<traverse_balanced> is an alternative to C<traverse_sequences>. It
1558 uses a different algorithm to iterate through the entries in the
1559 computed LCS. Instead of sticking to one side and showing element changes
1560 as insertions and deletions only, it will jump back and forth between
1561 the two sequences and report I<changes> occurring as deletions on one
1562 side followed immediatly by an insertion on the other side.
1564 In addition to the C<DISCARD_A>, C<DISCARD_B>, and C<MATCH> callbacks
1565 supported by C<traverse_sequences>, C<traverse_balanced> supports
1566 a C<CHANGE> callback indicating that one element got C<replaced> by another:
1570 { MATCH => $callback_1,
1571 DISCARD_A => $callback_2,
1572 DISCARD_B => $callback_3,
1573 CHANGE => $callback_4,
1577 If no C<CHANGE> callback is specified, C<traverse_balanced>
1578 will map C<CHANGE> events to C<DISCARD_A> and C<DISCARD_B> actions,
1579 therefore resulting in a similar behaviour as C<traverse_sequences>
1580 with different order of events.
1582 C<traverse_balanced> might be a bit slower than C<traverse_sequences>,
1583 noticable only while processing huge amounts of data.
1585 The C<sdiff> function of this module
1586 is implemented as call to C<traverse_balanced>.
1588 C<traverse_balanced> does not have a useful return value; you are expected to
1589 plug in the appropriate behavior with the callback functions.
1591 =head1 KEY GENERATION FUNCTIONS
1593 Most of the functions accept an optional extra parameter. This is a
1594 CODE reference to a key generating (hashing) function that should return
1595 a string that uniquely identifies a given element. It should be the
1596 case that if two elements are to be considered equal, their keys should
1597 be the same (and the other way around). If no key generation function
1598 is provided, the key will be the element as a string.
1600 By default, comparisons will use "eq" and elements will be turned into keys
1601 using the default stringizing operator '""'.
1603 Where this is important is when you're comparing something other than
1604 strings. If it is the case that you have multiple different objects
1605 that should be considered to be equal, you should supply a key
1606 generation function. Otherwise, you have to make sure that your arrays
1607 contain unique references.
1609 For instance, consider this example:
1615 my $package = shift;
1616 return bless { name => '', ssn => '', @_ }, $package;
1622 my $new = bless { %$old }, ref($old);
1627 return shift()->{'ssn'};
1630 my $person1 = Person->new( name => 'Joe', ssn => '123-45-6789' );
1631 my $person2 = Person->new( name => 'Mary', ssn => '123-47-0000' );
1632 my $person3 = Person->new( name => 'Pete', ssn => '999-45-2222' );
1633 my $person4 = Person->new( name => 'Peggy', ssn => '123-45-9999' );
1634 my $person5 = Person->new( name => 'Frank', ssn => '000-45-9999' );
1638 my $array1 = [ $person1, $person2, $person4 ];
1639 my $array2 = [ $person1, $person3, $person4, $person5 ];
1640 Algorithm::Diff::diff( $array1, $array2 );
1642 everything would work out OK (each of the objects would be converted
1643 into a string like "Person=HASH(0x82425b0)" for comparison).
1645 But if you did this:
1647 my $array1 = [ $person1, $person2, $person4 ];
1648 my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
1649 Algorithm::Diff::diff( $array1, $array2 );
1651 $person4 and $person4->clone() (which have the same name and SSN)
1652 would be seen as different objects. If you wanted them to be considered
1653 equivalent, you would have to pass in a key generation function:
1655 my $array1 = [ $person1, $person2, $person4 ];
1656 my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
1657 Algorithm::Diff::diff( $array1, $array2, \&Person::hash );
1659 This would use the 'ssn' field in each Person as a comparison key, and
1660 so would consider $person4 and $person4->clone() as equal.
1662 You may also pass additional parameters to the key generation function
1665 =head1 ERROR CHECKING
1667 If you pass these routines a non-reference and they expect a reference,
1668 they will die with a message.
1672 This version released by Tye McQueen (http://perlmonks.org/?node=tye).
1676 Parts Copyright (c) 2000-2004 Ned Konz. All rights reserved.
1677 Parts by Tye McQueen.
1679 This program is free software; you can redistribute it and/or modify it
1680 under the same terms as Perl.
1684 Mark-Jason still maintains a mailing list. To join a low-volume mailing
1685 list for announcements related to diff and Algorithm::Diff, send an
1686 empty mail message to mjd-perl-diff-request@plover.com.
1690 Versions through 0.59 (and much of this documentation) were written by:
1692 Mark-Jason Dominus, mjd-perl-diff@plover.com
1694 This version borrows some documentation and routine names from
1695 Mark-Jason's, but Diff.pm's code was completely replaced.
1697 This code was adapted from the Smalltalk code of Mario Wolczko
1698 <mario@wolczko.com>, which is available at
1699 ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st
1701 C<sdiff> and C<traverse_balanced> were written by Mike Schilli
1702 <m@perlmeister.com>.
1704 The algorithm is that described in
1705 I<A Fast Algorithm for Computing Longest Common Subsequences>,
1706 CACM, vol.20, no.5, pp.350-353, May 1977, with a few
1707 minor improvements to improve the speed.
1709 Much work was done by Ned Konz (perl@bike-nomad.com).
1711 The OO interface and some other changes are by Tye McQueen.