pintos-os.org Git - pintos-anon/blob - src/tests/Algorithm/Diff.pm

   1 package Algorithm::Diff;
   2 # Skip to first "=head" line for documentation.
   3 use strict;
   4
   5 use integer;    # see below in _replaceNextLargerWith() for mod to make
   6                 # if you don't use this
   7 use vars qw( $VERSION @EXPORT_OK );
   8 $VERSION = 1.19_01;
   9 #          ^ ^^ ^^-- Incremented at will
  10 #          | \+----- Incremented for non-trivial changes to features
  11 #          \-------- Incremented for fundamental changes
  12 require Exporter;
  13 *import    = \&Exporter::import;
  14 @EXPORT_OK = qw(
  15     prepare LCS LCDidx LCS_length
  16     diff sdiff compact_diff
  17     traverse_sequences traverse_balanced
  18 );
  19
  20 # McIlroy-Hunt diff algorithm
  21 # Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com>
  22 # by Ned Konz, perl@bike-nomad.com
  23 # Updates by Tye McQueen, http://perlmonks.org/?node=tye
  24
  25 # Create a hash that maps each element of $aCollection to the set of
  26 # positions it occupies in $aCollection, restricted to the elements
  27 # within the range of indexes specified by $start and $end.
  28 # The fourth parameter is a subroutine reference that will be called to
  29 # generate a string to use as a key.
  30 # Additional parameters, if any, will be passed to this subroutine.
  31 #
  32 # my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
  33
  34 sub _withPositionsOfInInterval
  35 {
  36     my $aCollection = shift;    # array ref
  37     my $start       = shift;
  38     my $end         = shift;
  39     my $keyGen      = shift;
  40     my %d;
  41     my $index;
  42     for ( $index = $start ; $index <= $end ; $index++ )
  43     {
  44         my $element = $aCollection->[$index];
  45         my $key = &$keyGen( $element, @_ );
  46         if ( exists( $d{$key} ) )
  47         {
  48             unshift ( @{ $d{$key} }, $index );
  49         }
  50         else
  51         {
  52             $d{$key} = [$index];
  53         }
  54     }
  55     return wantarray ? %d : \%d;
  56 }
  57
  58 # Find the place at which aValue would normally be inserted into the
  59 # array. If that place is already occupied by aValue, do nothing, and
  60 # return undef. If the place does not exist (i.e., it is off the end of
  61 # the array), add it to the end, otherwise replace the element at that
  62 # point with aValue.  It is assumed that the array's values are numeric.
  63 # This is where the bulk (75%) of the time is spent in this module, so
  64 # try to make it fast!
  65
  66 sub _replaceNextLargerWith
  67 {
  68     my ( $array, $aValue, $high ) = @_;
  69     $high ||= $#$array;
  70
  71     # off the end?
  72     if ( $high == -1 || $aValue > $array->[-1] )
  73     {
  74         push ( @$array, $aValue );
  75         return $high + 1;
  76     }
  77
  78     # binary search for insertion point...
  79     my $low = 0;
  80     my $index;
  81     my $found;
  82     while ( $low <= $high )
  83     {
  84         $index = ( $high + $low ) / 2;
  85
  86         # $index = int(( $high + $low ) / 2);  # without 'use integer'
  87         $found = $array->[$index];
  88
  89         if ( $aValue == $found )
  90         {
  91             return undef;
  92         }
  93         elsif ( $aValue > $found )
  94         {
  95             $low = $index + 1;
  96         }
  97         else
  98         {
  99             $high = $index - 1;
 100         }
 101     }
 102
 103     # now insertion point is in $low.
 104     $array->[$low] = $aValue;    # overwrite next larger
 105     return $low;
 106 }
 107
 108 # This method computes the longest common subsequence in $a and $b.
 109
 110 # Result is array or ref, whose contents is such that
 111 #   $a->[ $i ] == $b->[ $result[ $i ] ]
 112 # foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined.
 113
 114 # An additional argument may be passed; this is a hash or key generating
 115 # function that should return a string that uniquely identifies the given
 116 # element.  It should be the case that if the key is the same, the elements
 117 # will compare the same. If this parameter is undef or missing, the key
 118 # will be the element as a string.
 119
 120 # By default, comparisons will use "eq" and elements will be turned into keys
 121 # using the default stringizing operator '""'.
 122
 123 # Additional parameters, if any, will be passed to the key generation
 124 # routine.
 125
 126 sub _longestCommonSubsequence
 127 {
 128     my $a        = shift;    # array ref or hash ref
 129     my $b        = shift;    # array ref or hash ref
 130     my $counting = shift;    # scalar
 131     my $keyGen   = shift;    # code ref
 132     my $compare;             # code ref
 133
 134     if ( ref($a) eq 'HASH' )
 135     {                        # prepared hash must be in $b
 136         my $tmp = $b;
 137         $b = $a;
 138         $a = $tmp;
 139     }
 140
 141     # Check for bogus (non-ref) argument values
 142     if ( !ref($a) || !ref($b) )
 143     {
 144         my @callerInfo = caller(1);
 145         die 'error: must pass array or hash references to ' . $callerInfo[3];
 146     }
 147
 148     # set up code refs
 149     # Note that these are optimized.
 150     if ( !defined($keyGen) )    # optimize for strings
 151     {
 152         $keyGen = sub { $_[0] };
 153         $compare = sub { my ( $a, $b ) = @_; $a eq $b };
 154     }
 155     else
 156     {
 157         $compare = sub {
 158             my $a = shift;
 159             my $b = shift;
 160             &$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
 161         };
 162     }
 163
 164     my ( $aStart, $aFinish, $matchVector ) = ( 0, $#$a, [] );
 165     my ( $prunedCount, $bMatches ) = ( 0, {} );
 166
 167     if ( ref($b) eq 'HASH' )    # was $bMatches prepared for us?
 168     {
 169         $bMatches = $b;
 170     }
 171     else
 172     {
 173         my ( $bStart, $bFinish ) = ( 0, $#$b );
 174
 175         # First we prune off any common elements at the beginning
 176         while ( $aStart <= $aFinish
 177             and $bStart <= $bFinish
 178             and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
 179         {
 180             $matchVector->[ $aStart++ ] = $bStart++;
 181             $prunedCount++;
 182         }
 183
 184         # now the end
 185         while ( $aStart <= $aFinish
 186             and $bStart <= $bFinish
 187             and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
 188         {
 189             $matchVector->[ $aFinish-- ] = $bFinish--;
 190             $prunedCount++;
 191         }
 192
 193         # Now compute the equivalence classes of positions of elements
 194         $bMatches =
 195           _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
 196     }
 197     my $thresh = [];
 198     my $links  = [];
 199
 200     my ( $i, $ai, $j, $k );
 201     for ( $i = $aStart ; $i <= $aFinish ; $i++ )
 202     {
 203         $ai = &$keyGen( $a->[$i], @_ );
 204         if ( exists( $bMatches->{$ai} ) )
 205         {
 206             $k = 0;
 207             for $j ( @{ $bMatches->{$ai} } )
 208             {
 209
 210                 # optimization: most of the time this will be true
 211                 if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
 212                 {
 213                     $thresh->[$k] = $j;
 214                 }
 215                 else
 216                 {
 217                     $k = _replaceNextLargerWith( $thresh, $j, $k );
 218                 }
 219
 220                 # oddly, it's faster to always test this (CPU cache?).
 221                 if ( defined($k) )
 222                 {
 223                     $links->[$k] =
 224                       [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
 225                 }
 226             }
 227         }
 228     }
 229
 230     if (@$thresh)
 231     {
 232         return $prunedCount + @$thresh if $counting;
 233         for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
 234         {
 235             $matchVector->[ $link->[1] ] = $link->[2];
 236         }
 237     }
 238     elsif ($counting)
 239     {
 240         return $prunedCount;
 241     }
 242
 243     return wantarray ? @$matchVector : $matchVector;
 244 }
 245
 246 sub traverse_sequences
 247 {
 248     my $a                 = shift;          # array ref
 249     my $b                 = shift;          # array ref
 250     my $callbacks         = shift || {};
 251     my $keyGen            = shift;
 252     my $matchCallback     = $callbacks->{'MATCH'} || sub { };
 253     my $discardACallback  = $callbacks->{'DISCARD_A'} || sub { };
 254     my $finishedACallback = $callbacks->{'A_FINISHED'};
 255     my $discardBCallback  = $callbacks->{'DISCARD_B'} || sub { };
 256     my $finishedBCallback = $callbacks->{'B_FINISHED'};
 257     my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
 258
 259     # Process all the lines in @$matchVector
 260     my $lastA = $#$a;
 261     my $lastB = $#$b;
 262     my $bi    = 0;
 263     my $ai;
 264
 265     for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
 266     {
 267         my $bLine = $matchVector->[$ai];
 268         if ( defined($bLine) )    # matched
 269         {
 270             &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
 271             &$matchCallback( $ai,    $bi++, @_ );
 272         }
 273         else
 274         {
 275             &$discardACallback( $ai, $bi, @_ );
 276         }
 277     }
 278
 279     # The last entry (if any) processed was a match.
 280     # $ai and $bi point just past the last matching lines in their sequences.
 281
 282     while ( $ai <= $lastA or $bi <= $lastB )
 283     {
 284
 285         # last A?
 286         if ( $ai == $lastA + 1 and $bi <= $lastB )
 287         {
 288             if ( defined($finishedACallback) )
 289             {
 290                 &$finishedACallback( $lastA, @_ );
 291                 $finishedACallback = undef;
 292             }
 293             else
 294             {
 295                 &$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
 296             }
 297         }
 298
 299         # last B?
 300         if ( $bi == $lastB + 1 and $ai <= $lastA )
 301         {
 302             if ( defined($finishedBCallback) )
 303             {
 304                 &$finishedBCallback( $lastB, @_ );
 305                 $finishedBCallback = undef;
 306             }
 307             else
 308             {
 309                 &$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
 310             }
 311         }
 312
 313         &$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
 314         &$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
 315     }
 316
 317     return 1;
 318 }
 319
 320 sub traverse_balanced
 321 {
 322     my $a                 = shift;              # array ref
 323     my $b                 = shift;              # array ref
 324     my $callbacks         = shift || {};
 325     my $keyGen            = shift;
 326     my $matchCallback     = $callbacks->{'MATCH'} || sub { };
 327     my $discardACallback  = $callbacks->{'DISCARD_A'} || sub { };
 328     my $discardBCallback  = $callbacks->{'DISCARD_B'} || sub { };
 329     my $changeCallback    = $callbacks->{'CHANGE'};
 330     my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
 331
 332     # Process all the lines in match vector
 333     my $lastA = $#$a;
 334     my $lastB = $#$b;
 335     my $bi    = 0;
 336     my $ai    = 0;
 337     my $ma    = -1;
 338     my $mb;
 339
 340     while (1)
 341     {
 342
 343         # Find next match indices $ma and $mb
 344         do {
 345             $ma++;
 346         } while(
 347                 $ma <= $#$matchVector
 348             &&  !defined $matchVector->[$ma]
 349         );
 350
 351         last if $ma > $#$matchVector;    # end of matchVector?
 352         $mb = $matchVector->[$ma];
 353
 354         # Proceed with discard a/b or change events until
 355         # next match
 356         while ( $ai < $ma || $bi < $mb )
 357         {
 358
 359             if ( $ai < $ma && $bi < $mb )
 360             {
 361
 362                 # Change
 363                 if ( defined $changeCallback )
 364                 {
 365                     &$changeCallback( $ai++, $bi++, @_ );
 366                 }
 367                 else
 368                 {
 369                     &$discardACallback( $ai++, $bi, @_ );
 370                     &$discardBCallback( $ai, $bi++, @_ );
 371                 }
 372             }
 373             elsif ( $ai < $ma )
 374             {
 375                 &$discardACallback( $ai++, $bi, @_ );
 376             }
 377             else
 378             {
 379
 380                 # $bi < $mb
 381                 &$discardBCallback( $ai, $bi++, @_ );
 382             }
 383         }
 384
 385         # Match
 386         &$matchCallback( $ai++, $bi++, @_ );
 387     }
 388
 389     while ( $ai <= $lastA || $bi <= $lastB )
 390     {
 391         if ( $ai <= $lastA && $bi <= $lastB )
 392         {
 393
 394             # Change
 395             if ( defined $changeCallback )
 396             {
 397                 &$changeCallback( $ai++, $bi++, @_ );
 398             }
 399             else
 400             {
 401                 &$discardACallback( $ai++, $bi, @_ );
 402                 &$discardBCallback( $ai, $bi++, @_ );
 403             }
 404         }
 405         elsif ( $ai <= $lastA )
 406         {
 407             &$discardACallback( $ai++, $bi, @_ );
 408         }
 409         else
 410         {
 411
 412             # $bi <= $lastB
 413             &$discardBCallback( $ai, $bi++, @_ );
 414         }
 415     }
 416
 417     return 1;
 418 }
 419
 420 sub prepare
 421 {
 422     my $a       = shift;    # array ref
 423     my $keyGen  = shift;    # code ref
 424
 425     # set up code ref
 426     $keyGen = sub { $_[0] } unless defined($keyGen);
 427
 428     return scalar _withPositionsOfInInterval( $a, 0, $#$a, $keyGen, @_ );
 429 }
 430
 431 sub LCS
 432 {
 433     my $a = shift;                  # array ref
 434     my $b = shift;                  # array ref or hash ref
 435     my $matchVector = _longestCommonSubsequence( $a, $b, 0, @_ );
 436     my @retval;
 437     my $i;
 438     for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
 439     {
 440         if ( defined( $matchVector->[$i] ) )
 441         {
 442             push ( @retval, $a->[$i] );
 443         }
 444     }
 445     return wantarray ? @retval : \@retval;
 446 }
 447
 448 sub LCS_length
 449 {
 450     my $a = shift;                          # array ref
 451     my $b = shift;                          # array ref or hash ref
 452     return _longestCommonSubsequence( $a, $b, 1, @_ );
 453 }
 454
 455 sub LCSidx
 456 {
 457     my $a= shift @_;
 458     my $b= shift @_;
 459     my $match= _longestCommonSubsequence( $a, $b, 0, @_ );
 460     my @am= grep defined $match->[$_], 0..$#$match;
 461     my @bm= @{$match}[@am];
 462     return \@am, \@bm;
 463 }
 464
 465 sub compact_diff
 466 {
 467     my $a= shift @_;
 468     my $b= shift @_;
 469     my( $am, $bm )= LCSidx( $a, $b, @_ );
 470     my @cdiff;
 471     my( $ai, $bi )= ( 0, 0 );
 472     push @cdiff, $ai, $bi;
 473     while( 1 ) {
 474         while(  @$am  &&  $ai == $am->[0]  &&  $bi == $bm->[0]  ) {
 475             shift @$am;
 476             shift @$bm;
 477             ++$ai, ++$bi;
 478         }
 479         push @cdiff, $ai, $bi;
 480         last   if  ! @$am;
 481         $ai = $am->[0];
 482         $bi = $bm->[0];
 483         push @cdiff, $ai, $bi;
 484     }
 485     push @cdiff, 0+@$a, 0+@$b
 486         if  $ai < @$a || $bi < @$b;
 487     return wantarray ? @cdiff : \@cdiff;
 488 }
 489
 490 sub diff
 491 {
 492     my $a      = shift;    # array ref
 493     my $b      = shift;    # array ref
 494     my $retval = [];
 495     my $hunk   = [];
 496     my $discard = sub {
 497         push @$hunk, [ '-', $_[0], $a->[ $_[0] ] ];
 498     };
 499     my $add = sub {
 500         push @$hunk, [ '+', $_[1], $b->[ $_[1] ] ];
 501     };
 502     my $match = sub {
 503         push @$retval, $hunk
 504             if 0 < @$hunk;
 505         $hunk = []
 506     };
 507     traverse_sequences( $a, $b,
 508         { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
 509     &$match();
 510     return wantarray ? @$retval : $retval;
 511 }
 512
 513 sub sdiff
 514 {
 515     my $a      = shift;    # array ref
 516     my $b      = shift;    # array ref
 517     my $retval = [];
 518     my $discard = sub { push ( @$retval, [ '-', $a->[ $_[0] ], "" ] ) };
 519     my $add = sub { push ( @$retval, [ '+', "", $b->[ $_[1] ] ] ) };
 520     my $change = sub {
 521         push ( @$retval, [ 'c', $a->[ $_[0] ], $b->[ $_[1] ] ] );
 522     };
 523     my $match = sub {
 524         push ( @$retval, [ 'u', $a->[ $_[0] ], $b->[ $_[1] ] ] );
 525     };
 526     traverse_balanced(
 527         $a,
 528         $b,
 529         {
 530             MATCH     => $match,
 531             DISCARD_A => $discard,
 532             DISCARD_B => $add,
 533             CHANGE    => $change,
 534         },
 535         @_
 536     );
 537     return wantarray ? @$retval : $retval;
 538 }
 539
 540 ########################################
 541 my $Root= __PACKAGE__;
 542 package Algorithm::Diff::_impl;
 543 use strict;
 544
 545 sub _Idx()  { 0 } # $me->[_Idx]: Ref to array of hunk indices
 546             # 1   # $me->[1]: Ref to first sequence
 547             # 2   # $me->[2]: Ref to second sequence
 548 sub _End()  { 3 } # $me->[_End]: Diff between forward and reverse pos
 549 sub _Same() { 4 } # $me->[_Same]: 1 if pos 1 contains unchanged items
 550 sub _Base() { 5 } # $me->[_Base]: Added to range's min and max
 551 sub _Pos()  { 6 } # $me->[_Pos]: Which hunk is currently selected
 552 sub _Off()  { 7 } # $me->[_Off]: Offset into _Idx for current position
 553 sub _Min() { -2 } # Added to _Off to get min instead of max+1
 554
 555 sub Die
 556 {
 557     require Carp;
 558     Carp::confess( @_ );
 559 }
 560
 561 sub _ChkPos
 562 {
 563     my( $me )= @_;
 564     return   if  $me->[_Pos];
 565     my $meth= ( caller(1) )[3];
 566     Die( "Called $meth on 'reset' object" );
 567 }
 568
 569 sub _ChkSeq
 570 {
 571     my( $me, $seq )= @_;
 572     return $seq + $me->[_Off]
 573         if  1 == $seq  ||  2 == $seq;
 574     my $meth= ( caller(1) )[3];
 575     Die( "$meth: Invalid sequence number ($seq); must be 1 or 2" );
 576 }
 577
 578 sub getObjPkg
 579 {
 580     my( $us )= @_;
 581     return ref $us   if  ref $us;
 582     return $us . "::_obj";
 583 }
 584
 585 sub new
 586 {
 587     my( $us, $seq1, $seq2, $opts ) = @_;
 588     my @args;
 589     for( $opts->{keyGen} ) {
 590         push @args, $_   if  $_;
 591     }
 592     for( $opts->{keyGenArgs} ) {
 593         push @args, @$_   if  $_;
 594     }
 595     my $cdif= Algorithm::Diff::compact_diff( $seq1, $seq2, @args );
 596     my $same= 1;
 597     if(  0 == $cdif->[2]  &&  0 == $cdif->[3]  ) {
 598         $same= 0;
 599         splice @$cdif, 0, 2;
 600     }
 601     my @obj= ( $cdif, $seq1, $seq2 );
 602     $obj[_End] = (1+@$cdif)/2;
 603     $obj[_Same] = $same;
 604     $obj[_Base] = 0;
 605     my $me = bless \@obj, $us->getObjPkg();
 606     $me->Reset( 0 );
 607     return $me;
 608 }
 609
 610 sub Reset
 611 {
 612     my( $me, $pos )= @_;
 613     $pos= int( $pos || 0 );
 614     $pos += $me->[_End]
 615         if  $pos < 0;
 616     $pos= 0
 617         if  $pos < 0  ||  $me->[_End] <= $pos;
 618     $me->[_Pos]= $pos || !1;
 619     $me->[_Off]= 2*$pos - 1;
 620     return $me;
 621 }
 622
 623 sub Base
 624 {
 625     my( $me, $base )= @_;
 626     my $oldBase= $me->[_Base];
 627     $me->[_Base]= 0+$base   if  defined $base;
 628     return $oldBase;
 629 }
 630
 631 sub Copy
 632 {
 633     my( $me, $pos, $base )= @_;
 634     my @obj= @$me;
 635     my $you= bless \@obj, ref($me);
 636     $you->Reset( $pos )   if  defined $pos;
 637     $you->Base( $base );
 638     return $you;
 639 }
 640
 641 sub Next {
 642     my( $me, $steps )= @_;
 643     $steps= 1   if  ! defined $steps;
 644     if( $steps ) {
 645         my $pos= $me->[_Pos];
 646         my $new= $pos + $steps;
 647         $new= 0   if  $pos  &&  $new < 0;
 648         $me->Reset( $new )
 649     }
 650     return $me->[_Pos];
 651 }
 652
 653 sub Prev {
 654     my( $me, $steps )= @_;
 655     $steps= 1   if  ! defined $steps;
 656     my $pos= $me->Next(-$steps);
 657     $pos -= $me->[_End]   if  $pos;
 658     return $pos;
 659 }
 660
 661 sub Diff {
 662     my( $me )= @_;
 663     $me->_ChkPos();
 664     return 0   if  $me->[_Same] == ( 1 & $me->[_Pos] );
 665     my $ret= 0;
 666     my $off= $me->[_Off];
 667     for my $seq ( 1, 2 ) {
 668         $ret |= $seq
 669             if  $me->[_Idx][ $off + $seq + _Min ]
 670             <   $me->[_Idx][ $off + $seq ];
 671     }
 672     return $ret;
 673 }
 674
 675 sub Min {
 676     my( $me, $seq, $base )= @_;
 677     $me->_ChkPos();
 678     my $off= $me->_ChkSeq($seq);
 679     $base= $me->[_Base] if !defined $base;
 680     return $base + $me->[_Idx][ $off + _Min ];
 681 }
 682
 683 sub Max {
 684     my( $me, $seq, $base )= @_;
 685     $me->_ChkPos();
 686     my $off= $me->_ChkSeq($seq);
 687     $base= $me->[_Base] if !defined $base;
 688     return $base + $me->[_Idx][ $off ] -1;
 689 }
 690
 691 sub Range {
 692     my( $me, $seq, $base )= @_;
 693     $me->_ChkPos();
 694     my $off = $me->_ChkSeq($seq);
 695     if( !wantarray ) {
 696         return  $me->[_Idx][ $off ]
 697             -   $me->[_Idx][ $off + _Min ];
 698     }
 699     $base= $me->[_Base] if !defined $base;
 700     return  ( $base + $me->[_Idx][ $off + _Min ] )
 701         ..  ( $base + $me->[_Idx][ $off ] - 1 );
 702 }
 703
 704 sub Items {
 705     my( $me, $seq )= @_;
 706     $me->_ChkPos();
 707     my $off = $me->_ChkSeq($seq);
 708     if( !wantarray ) {
 709         return  $me->[_Idx][ $off ]
 710             -   $me->[_Idx][ $off + _Min ];
 711     }
 712     return
 713         @{$me->[$seq]}[
 714                 $me->[_Idx][ $off + _Min ]
 715             ..  ( $me->[_Idx][ $off ] - 1 )
 716         ];
 717 }
 718
 719 sub Same {
 720     my( $me )= @_;
 721     $me->_ChkPos();
 722     return wantarray ? () : 0
 723         if  $me->[_Same] != ( 1 & $me->[_Pos] );
 724     return $me->Items(1);
 725 }
 726
 727 my %getName;
 728 BEGIN {
 729     %getName= (
 730         same => \&Same,
 731         diff => \&Diff,
 732         base => \&Base,
 733         min  => \&Min,
 734         max  => \&Max,
 735         range=> \&Range,
 736         items=> \&Items, # same thing
 737     );
 738 }
 739
 740 sub Get
 741 {
 742     my $me= shift @_;
 743     $me->_ChkPos();
 744     my @value;
 745     for my $arg (  @_  ) {
 746         for my $word (  split ' ', $arg  ) {
 747             my $meth;
 748             if(     $word !~ /^(-?\d+)?([a-zA-Z]+)([12])?$/
 749                 ||  not  $meth= $getName{ lc $2 }
 750             ) {
 751                 Die( $Root, ", Get: Invalid request ($word)" );
 752             }
 753             my( $base, $name, $seq )= ( $1, $2, $3 );
 754             push @value, scalar(
 755                 4 == length($name)
 756                     ? $meth->( $me )
 757                     : $meth->( $me, $seq, $base )
 758             );
 759         }
 760     }
 761     if(  wantarray  ) {
 762         return @value;
 763     } elsif(  1 == @value  ) {
 764         return $value[0];
 765     }
 766     Die( 0+@value, " values requested from ",
 767         $Root, "'s Get in scalar context" );
 768 }
 769
 770
 771 my $Obj= getObjPkg($Root);
 772 no strict 'refs';
 773
 774 for my $meth (  qw( new getObjPkg )  ) {
 775     *{$Root."::".$meth} = \&{$meth};
 776     *{$Obj ."::".$meth} = \&{$meth};
 777 }
 778 for my $meth (  qw(
 779     Next Prev Reset Copy Base Diff
 780     Same Items Range Min Max Get
 781     _ChkPos _ChkSeq
 782 )  ) {
 783     *{$Obj."::".$meth} = \&{$meth};
 784 }
 785
 786 1;
 787 __END__
 788
 789 =head1 NAME
 790
 791 Algorithm::Diff - Compute `intelligent' differences between two files / lists
 792
 793 =head1 SYNOPSIS
 794
 795     require Algorithm::Diff;
 796
 797     # This example produces traditional 'diff' output:
 798
 799     my $diff = Algorithm::Diff->new( \@seq1, \@seq2 );
 800
 801     $diff->Base( 1 );   # Return line numbers, not indices
 802     while(  $diff->Next()  ) {
 803         next   if  $diff->Same();
 804         my $sep = '';
 805         if(  ! $diff->Items(2)  ) {
 806             sprintf "%d,%dd%d\n",
 807                 $diff->Get(qw( Min1 Max1 Max2 ));
 808         } elsif(  ! $diff->Items(1)  ) {
 809             sprint "%da%d,%d\n",
 810                 $diff->Get(qw( Max1 Min2 Max2 ));
 811         } else {
 812             $sep = "---\n";
 813             sprintf "%d,%dc%d,%d\n",
 814                 $diff->Get(qw( Min1 Max1 Min2 Max2 ));
 815         }
 816         print "< $_"   for  $diff->Items(1);
 817         print $sep;
 818         print "> $_"   for  $diff->Items(2);
 819     }
 820
 821
 822     # Alternate interfaces:
 823
 824     use Algorithm::Diff qw(
 825         LCS LCS_length LCSidx
 826         diff sdiff compact_diff
 827         traverse_sequences traverse_balanced );
 828
 829     @lcs    = LCS( \@seq1, \@seq2 );
 830     $lcsref = LCS( \@seq1, \@seq2 );
 831     $count  = LCS_length( \@seq1, \@seq2 );
 832
 833     ( $seq1idxref, $seq2idxref ) = LCSidx( \@seq1, \@seq2 );
 834
 835
 836     # Complicated interfaces:
 837
 838     @diffs  = diff( \@seq1, \@seq2 );
 839
 840     @sdiffs = sdiff( \@seq1, \@seq2 );
 841
 842     @cdiffs = compact_diff( \@seq1, \@seq2 );
 843
 844     traverse_sequences(
 845         \@seq1,
 846         \@seq2,
 847         {   MATCH     => \&callback1,
 848             DISCARD_A => \&callback2,
 849             DISCARD_B => \&callback3,
 850         },
 851         \&key_generator,
 852         @extra_args,
 853     );
 854
 855     traverse_balanced(
 856         \@seq1,
 857         \@seq2,
 858         {   MATCH     => \&callback1,
 859             DISCARD_A => \&callback2,
 860             DISCARD_B => \&callback3,
 861             CHANGE    => \&callback4,
 862         },
 863         \&key_generator,
 864         @extra_args,
 865     );
 866
 867
 868 =head1 INTRODUCTION
 869
 870 (by Mark-Jason Dominus)
 871
 872 I once read an article written by the authors of C<diff>; they said
 873 that they worked very hard on the algorithm until they found the
 874 right one.
 875
 876 I think what they ended up using (and I hope someone will correct me,
 877 because I am not very confident about this) was the `longest common
 878 subsequence' method.  In the LCS problem, you have two sequences of
 879 items:
 880
 881     a b c d f g h j q z
 882
 883     a b c d e f g i j k r x y z
 884
 885 and you want to find the longest sequence of items that is present in
 886 both original sequences in the same order.  That is, you want to find
 887 a new sequence I<S> which can be obtained from the first sequence by
 888 deleting some items, and from the secend sequence by deleting other
 889 items.  You also want I<S> to be as long as possible.  In this case I<S>
 890 is
 891
 892     a b c d f g j z
 893
 894 From there it's only a small step to get diff-like output:
 895
 896     e   h i   k   q r x y
 897     +   - +   +   - + + +
 898
 899 This module solves the LCS problem.  It also includes a canned function
 900 to generate C<diff>-like output.
 901
 902 It might seem from the example above that the LCS of two sequences is
 903 always pretty obvious, but that's not always the case, especially when
 904 the two sequences have many repeated elements.  For example, consider
 905
 906     a x b y c z p d q
 907     a b c a x b y c z
 908
 909 A naive approach might start by matching up the C<a> and C<b> that
 910 appear at the beginning of each sequence, like this:
 911
 912     a x b y c         z p d q
 913     a   b   c a b y c z
 914
 915 This finds the common subsequence C<a b c z>.  But actually, the LCS
 916 is C<a x b y c z>:
 917
 918           a x b y c z p d q
 919     a b c a x b y c z
 920
 921 or
 922
 923     a       x b y c z p d q
 924     a b c a x b y c z
 925
 926 =head1 USAGE
 927
 928 (See also the README file and several example
 929 scripts include with this module.)
 930
 931 This module now provides an object-oriented interface that uses less
 932 memory and is easier to use than most of the previous procedural
 933 interfaces.  It also still provides several exportable functions.  We'll
 934 deal with these in ascending order of difficulty:  C<LCS>,
 935 C<LCS_length>, C<LCSidx>, OO interface, C<prepare>, C<diff>, C<sdiff>,
 936 C<traverse_sequences>, and C<traverse_balanced>.
 937
 938 =head2 C<LCS>
 939
 940 Given references to two lists of items, LCS returns an array containing
 941 their longest common subsequence.  In scalar context, it returns a
 942 reference to such a list.
 943
 944     @lcs    = LCS( \@seq1, \@seq2 );
 945     $lcsref = LCS( \@seq1, \@seq2 );
 946
 947 C<LCS> may be passed an optional third parameter; this is a CODE
 948 reference to a key generation function.  See L</KEY GENERATION
 949 FUNCTIONS>.
 950
 951     @lcs    = LCS( \@seq1, \@seq2, \&keyGen, @args );
 952     $lcsref = LCS( \@seq1, \@seq2, \&keyGen, @args );
 953
 954 Additional parameters, if any, will be passed to the key generation
 955 routine.
 956
 957 =head2 C<LCS_length>
 958
 959 This is just like C<LCS> except it only returns the length of the
 960 longest common subsequence.  This provides a performance gain of about
 961 9% compared to C<LCS>.
 962
 963 =head2 C<LCSidx>
 964
 965 Like C<LCS> except it returns references to two arrays.  The first array
 966 contains the indices into @seq1 where the LCS items are located.  The
 967 second array contains the indices into @seq2 where the LCS items are located.
 968
 969 Therefore, the following three lists will contain the same values:
 970
 971     my( $idx1, $idx2 ) = LCSidx( \@seq1, \@seq2 );
 972     my @list1 = @seq1[ @$idx1 ];
 973     my @list2 = @seq2[ @$idx2 ];
 974     my @list3 = LCS( \@seq1, \@seq2 );
 975
 976 =head2 C<new>
 977
 978     $diff = Algorithm::Diffs->new( \@seq1, \@seq2 );
 979     $diff = Algorithm::Diffs->new( \@seq1, \@seq2, \%opts );
 980
 981 C<new> computes the smallest set of additions and deletions necessary
 982 to turn the first sequence into the second and compactly records them
 983 in the object.
 984
 985 You use the object to iterate over I<hunks>, where each hunk represents
 986 a contiguous section of items which should be added, deleted, replaced,
 987 or left unchanged.
 988
 989 =over 4
 990
 991 The following summary of all of the methods looks a lot like Perl code
 992 but some of the symbols have different meanings:
 993
 994     [ ]     Encloses optional arguments
 995     :       Is followed by the default value for an optional argument
 996     |       Separates alternate return results
 997
 998 Method summary:
 999
1000     $obj        = Algorithm::Diff->new( \@seq1, \@seq2, [ \%opts ] );
1001     $pos        = $obj->Next(  [ $count : 1 ] );
1002     $revPos     = $obj->Prev(  [ $count : 1 ] );
1003     $obj        = $obj->Reset( [ $pos : 0 ] );
1004     $copy       = $obj->Copy(  [ $pos, [ $newBase ] ] );
1005     $oldBase    = $obj->Base(  [ $newBase ] );
1006
1007 Note that all of the following methods C<die> if used on an object that
1008 is "reset" (not currently pointing at any hunk).
1009
1010     $bits       = $obj->Diff(  );
1011     @items|$cnt = $obj->Same(  );
1012     @items|$cnt = $obj->Items( $seqNum );
1013     @idxs |$cnt = $obj->Range( $seqNum, [ $base ] );
1014     $minIdx     = $obj->Min(   $seqNum, [ $base ] );
1015     $maxIdx     = $obj->Max(   $seqNum, [ $base ] );
1016     @values     = $obj->Get(   @names );
1017
1018 Passing in C<undef> for an optional argument is always treated the same
1019 as if no argument were passed in.
1020
1021 =item C<Next>
1022
1023     $pos = $diff->Next();    # Move forward 1 hunk
1024     $pos = $diff->Next( 2 ); # Move forward 2 hunks
1025     $pos = $diff->Next(-5);  # Move backward 5 hunks
1026
1027 C<Next> moves the object to point at the next hunk.  The object starts
1028 out "reset", which means it isn't pointing at any hunk.  If the object
1029 is reset, then C<Next()> moves to the first hunk.
1030
1031 C<Next> returns a true value iff the move didn't go past the last hunk.
1032 So C<Next(0)> will return true iff the object is not reset.
1033
1034 Actually, C<Next> returns the object's new position, which is a number
1035 between 1 and the number of hunks (inclusive), or returns a false value.
1036
1037 =item C<Prev>
1038
1039 C<Prev($N)> is almost identical to C<Next(-$N)>; it moves to the $Nth
1040 previous hunk.  On a 'reset' object, C<Prev()> [and C<Next(-1)>] move
1041 to the last hunk.
1042
1043 The position returned by C<Prev> is relative to the I<end> of the
1044 hunks; -1 for the last hunk, -2 for the second-to-last, etc.
1045
1046 =item C<Reset>
1047
1048     $diff->Reset();     # Reset the object's position
1049     $diff->Reset($pos); # Move to the specified hunk
1050     $diff->Reset(1);    # Move to the first hunk
1051     $diff->Reset(-1);   # Move to the last hunk
1052
1053 C<Reset> returns the object, so, for example, you could use
1054 C<< $diff->Reset()->Next(-1) >> to get the number of hunks.
1055
1056 =item C<Copy>
1057
1058     $copy = $diff->Copy( $newPos, $newBase );
1059
1060 C<Copy> returns a copy of the object.  The copy and the orignal object
1061 share most of their data, so making copies takes very little memory.
1062 The copy maintains its own position (separate from the original), which
1063 is the main purpose of copies.  It also maintains its own base.
1064
1065 By default, the copy's position starts out the same as the original
1066 object's position.  But C<Copy> takes an optional first argument to set the
1067 new position, so the following three snippets are equivalent:
1068
1069     $copy = $diff->Copy($pos);
1070
1071     $copy = $diff->Copy();
1072     $copy->Reset($pos);
1073
1074     $copy = $diff->Copy()->Reset($pos);
1075
1076 C<Copy> takes an optional second argument to set the base for
1077 the copy.  If you wish to change the base of the copy but leave
1078 the position the same as in the original, here are two
1079 equivalent ways:
1080
1081     $copy = $diff->Copy();
1082     $copy->Base( 0 );
1083
1084     $copy = $diff->Copy(undef,0);
1085
1086 Here are two equivalent way to get a "reset" copy:
1087
1088     $copy = $diff->Copy(0);
1089
1090     $copy = $diff->Copy()->Reset();
1091
1092 =item C<Diff>
1093
1094     $bits = $obj->Diff();
1095
1096 C<Diff> returns a true value iff the current hunk contains items that are
1097 different between the two sequences.  It actually returns one of the
1098 follow 4 values:
1099
1100 =over 4
1101
1102 =item 3
1103
1104 C<3==(1|2)>.  This hunk contains items from @seq1 and the items
1105 from @seq2 that should replace them.  Both sequence 1 and 2
1106 contain changed items so both the 1 and 2 bits are set.
1107
1108 =item 2
1109
1110 This hunk only contains items from @seq2 that should be inserted (not
1111 items from @seq1).  Only sequence 2 contains changed items so only the 2
1112 bit is set.
1113
1114 =item 1
1115
1116 This hunk only contains items from @seq1 that should be deleted (not
1117 items from @seq2).  Only sequence 1 contains changed items so only the 1
1118 bit is set.
1119
1120 =item 0
1121
1122 This means that the items in this hunk are the same in both sequences.
1123 Neither sequence 1 nor 2 contain changed items so neither the 1 nor the
1124 2 bits are set.
1125
1126 =back
1127
1128 =item C<Same>
1129
1130 C<Same> returns a true value iff the current hunk contains items that
1131 are the same in both sequences.  It actually returns the list of items
1132 if they are the same or an emty list if they aren't.  In a scalar
1133 context, it returns the size of the list.
1134
1135 =item C<Items>
1136
1137     $count = $diff->Items(2);
1138     @items = $diff->Items($seqNum);
1139
1140 C<Items> returns the (number of) items from the specified sequence that
1141 are part of the current hunk.
1142
1143 If the current hunk contains only insertions, then
1144 C<< $diff->Items(1) >> will return an empty list (0 in a scalar conext).
1145 If the current hunk contains only deletions, then C<< $diff->Items(2) >>
1146 will return an empty list (0 in a scalar conext).
1147
1148 If the hunk contains replacements, then both C<< $diff->Items(1) >> and
1149 C<< $diff->Items(2) >> will return different, non-empty lists.
1150
1151 Otherwise, the hunk contains identical items and all of the following
1152 will return the same lists:
1153
1154     @items = $diff->Items(1);
1155     @items = $diff->Items(2);
1156     @items = $diff->Same();
1157
1158 =item C<Range>
1159
1160     $count = $diff->Range( $seqNum );
1161     @indices = $diff->Range( $seqNum );
1162     @indices = $diff->Range( $seqNum, $base );
1163
1164 C<Range> is like C<Items> except that it returns a list of I<indices> to
1165 the items rather than the items themselves.  By default, the index of
1166 the first item (in each sequence) is 0 but this can be changed by
1167 calling the C<Base> method.  So, by default, the following two snippets
1168 return the same lists:
1169
1170     @list = $diff->Items(2);
1171     @list = @seq2[ $diff->Range(2) ];
1172
1173 You can also specify the base to use as the second argument.  So the
1174 following two snippets I<always> return the same lists:
1175
1176     @list = $diff->Items(1);
1177     @list = @seq1[ $diff->Range(1,0) ];
1178
1179 =item C<Base>
1180
1181     $curBase = $diff->Base();
1182     $oldBase = $diff->Base($newBase);
1183
1184 C<Base> sets and/or returns the current base (usually 0 or 1) that is
1185 used when you request range information.  The base defaults to 0 so
1186 that range information is returned as array indices.  You can set the
1187 base to 1 if you want to report traditional line numbers instead.
1188
1189 =item C<Min>
1190
1191     $min1 = $diff->Min(1);
1192     $min = $diff->Min( $seqNum, $base );
1193
1194 C<Min> returns the first value that C<Range> would return (given the
1195 same arguments) or returns C<undef> if C<Range> would return an empty
1196 list.
1197
1198 =item C<Max>
1199
1200 C<Max> returns the last value that C<Range> would return or C<undef>.
1201
1202 =item C<Get>
1203
1204     ( $n, $x, $r ) = $diff->Get(qw( min1 max1 range1 ));
1205     @values = $diff->Get(qw( 0min2 1max2 range2 same base ));
1206
1207 C<Get> returns one or more scalar values.  You pass in a list of the
1208 names of the values you want returned.  Each name must match one of the
1209 following regexes:
1210
1211     /^(-?\d+)?(min|max)[12]$/i
1212     /^(range[12]|same|diff|base)$/i
1213
1214 The 1 or 2 after a name says which sequence you want the information
1215 for (and where allowed, it is required).  The optional number before
1216 "min" or "max" is the base to use.  So the following equalities hold:
1217
1218     $diff->Get('min1') == $diff->Min(1)
1219     $diff->Get('0min2') == $diff->Min(2,0)
1220
1221 Using C<Get> in a scalar context when you've passed in more than one
1222 name is a fatal error (C<die> is called).
1223
1224 =back
1225
1226 =head2 C<prepare>
1227
1228 Given a reference to a list of items, C<prepare> returns a reference
1229 to a hash which can be used when comparing this sequence to other
1230 sequences with C<LCS> or C<LCS_length>.
1231
1232     $prep = prepare( \@seq1 );
1233     for $i ( 0 .. 10_000 )
1234     {
1235         @lcs = LCS( $prep, $seq[$i] );
1236         # do something useful with @lcs
1237     }
1238
1239 C<prepare> may be passed an optional third parameter; this is a CODE
1240 reference to a key generation function.  See L</KEY GENERATION
1241 FUNCTIONS>.
1242
1243     $prep = prepare( \@seq1, \&keyGen );
1244     for $i ( 0 .. 10_000 )
1245     {
1246         @lcs = LCS( $seq[$i], $prep, \&keyGen );
1247         # do something useful with @lcs
1248     }
1249
1250 Using C<prepare> provides a performance gain of about 50% when calling LCS
1251 many times compared with not preparing.
1252
1253 =head2 C<diff>
1254
1255     @diffs     = diff( \@seq1, \@seq2 );
1256     $diffs_ref = diff( \@seq1, \@seq2 );
1257
1258 C<diff> computes the smallest set of additions and deletions necessary
1259 to turn the first sequence into the second, and returns a description
1260 of these changes.  The description is a list of I<hunks>; each hunk
1261 represents a contiguous section of items which should be added,
1262 deleted, or replaced.  (Hunks containing unchanged items are not
1263 included.)
1264
1265 The return value of C<diff> is a list of hunks, or, in scalar context, a
1266 reference to such a list.  If there are no differences, the list will be
1267 empty.
1268
1269 Here is an example.  Calling C<diff> for the following two sequences:
1270
1271     a b c e h j l m n p
1272     b c d e f j k l m r s t
1273
1274 would produce the following list:
1275
1276     (
1277       [ [ '-', 0, 'a' ] ],
1278
1279       [ [ '+', 2, 'd' ] ],
1280
1281       [ [ '-', 4, 'h' ],
1282         [ '+', 4, 'f' ] ],
1283
1284       [ [ '+', 6, 'k' ] ],
1285
1286       [ [ '-',  8, 'n' ],
1287         [ '-',  9, 'p' ],
1288         [ '+',  9, 'r' ],
1289         [ '+', 10, 's' ],
1290         [ '+', 11, 't' ] ],
1291     )
1292
1293 There are five hunks here.  The first hunk says that the C<a> at
1294 position 0 of the first sequence should be deleted (C<->).  The second
1295 hunk says that the C<d> at position 2 of the second sequence should
1296 be inserted (C<+>).  The third hunk says that the C<h> at position 4
1297 of the first sequence should be removed and replaced with the C<f>
1298 from position 4 of the second sequence.  And so on.
1299
1300 C<diff> may be passed an optional third parameter; this is a CODE
1301 reference to a key generation function.  See L</KEY GENERATION
1302 FUNCTIONS>.
1303
1304 Additional parameters, if any, will be passed to the key generation
1305 routine.
1306
1307 =head2 C<sdiff>
1308
1309     @sdiffs     = sdiff( \@seq1, \@seq2 );
1310     $sdiffs_ref = sdiff( \@seq1, \@seq2 );
1311
1312 C<sdiff> computes all necessary components to show two sequences
1313 and their minimized differences side by side, just like the
1314 Unix-utility I<sdiff> does:
1315
1316     same             same
1317     before     |     after
1318     old        <     -
1319     -          >     new
1320
1321 It returns a list of array refs, each pointing to an array of
1322 display instructions. In scalar context it returns a reference
1323 to such a list. If there are no differences, the list will have one
1324 entry per item, each indicating that the item was unchanged.
1325
1326 Display instructions consist of three elements: A modifier indicator
1327 (C<+>: Element added, C<->: Element removed, C<u>: Element unmodified,
1328 C<c>: Element changed) and the value of the old and new elements, to
1329 be displayed side-by-side.
1330
1331 An C<sdiff> of the following two sequences:
1332
1333     a b c e h j l m n p
1334     b c d e f j k l m r s t
1335
1336 results in
1337
1338     ( [ '-', 'a', ''  ],
1339       [ 'u', 'b', 'b' ],
1340       [ 'u', 'c', 'c' ],
1341       [ '+', '',  'd' ],
1342       [ 'u', 'e', 'e' ],
1343       [ 'c', 'h', 'f' ],
1344       [ 'u', 'j', 'j' ],
1345       [ '+', '',  'k' ],
1346       [ 'u', 'l', 'l' ],
1347       [ 'u', 'm', 'm' ],
1348       [ 'c', 'n', 'r' ],
1349       [ 'c', 'p', 's' ],
1350       [ '+', '',  't' ],
1351     )
1352
1353 C<sdiff> may be passed an optional third parameter; this is a CODE
1354 reference to a key generation function.  See L</KEY GENERATION
1355 FUNCTIONS>.
1356
1357 Additional parameters, if any, will be passed to the key generation
1358 routine.
1359
1360 =head2 C<compact_diff>
1361
1362 C<compact_diff> is much like C<sdiff> except it returns a much more
1363 compact description consisting of just one flat list of indices.  An
1364 example helps explain the format:
1365
1366     my @a = qw( a b c   e  h j   l m n p      );
1367     my @b = qw(   b c d e f  j k l m    r s t );
1368     @cdiff = compact_diff( \@a, \@b );
1369     # Returns:
1370     #   @a      @b       @a       @b
1371     #  start   start   values   values
1372     (    0,      0,   #       =
1373          0,      0,   #    a  !
1374          1,      0,   #  b c  =  b c
1375          3,      2,   #       !  d
1376          3,      3,   #    e  =  e
1377          4,      4,   #    f  !  h
1378          5,      5,   #    j  =  j
1379          6,      6,   #       !  k
1380          6,      7,   #  l m  =  l m
1381          8,      9,   #  n p  !  r s t
1382         10,     12,   #
1383     );
1384
1385 The 0th, 2nd, 4th, etc. entries are all indices into @seq1 (@a in the
1386 above example) indicating where a hunk begins.  The 1st, 3rd, 5th, etc.
1387 entries are all indices into @seq2 (@b in the above example) indicating
1388 where the same hunk begins.
1389
1390 So each pair of indices (except the last pair) describes where a hunk
1391 begins (in each sequence).  Since each hunk must end at the item just
1392 before the item that starts the next hunk, the next pair of indices can
1393 be used to determine where the hunk ends.
1394
1395 So, the first 4 entries (0..3) describe the first hunk.  Entries 0 and 1
1396 describe where the first hunk begins (and so are always both 0).
1397 Entries 2 and 3 describe where the next hunk begins, so subtracting 1
1398 from each tells us where the first hunk ends.  That is, the first hunk
1399 contains items C<$diff[0]> through C<$diff[2] - 1> of the first sequence
1400 and contains items C<$diff[1]> through C<$diff[3] - 1> of the second
1401 sequence.
1402
1403 In other words, the first hunk consists of the following two lists of items:
1404
1405                #  1st pair     2nd pair
1406                # of indices   of indices
1407     @list1 = @a[ $cdiff[0] .. $cdiff[2]-1 ];
1408     @list2 = @b[ $cdiff[1] .. $cdiff[3]-1 ];
1409                # Hunk start   Hunk end
1410
1411 Note that the hunks will always alternate between those that are part of
1412 the LCS (those that contain unchanged items) and those that contain
1413 changes.  This means that all we need to be told is whether the first
1414 hunk is a 'same' or 'diff' hunk and we can determine which of the other
1415 hunks contain 'same' items or 'diff' items.
1416
1417 By convention, we always make the first hunk contain unchanged items.
1418 So the 1st, 3rd, 5th, etc. hunks (all odd-numbered hunks if you start
1419 counting from 1) all contain unchanged items.  And the 2nd, 4th, 6th,
1420 etc. hunks (all even-numbered hunks if you start counting from 1) all
1421 contain changed items.
1422
1423 Since @a and @b don't begin with the same value, the first hunk in our
1424 example is empty (otherwise we'd violate the above convention).  Note
1425 that the first 4 index values in our example are all zero.  Plug these
1426 values into our previous code block and we get:
1427
1428     @hunk1a = @a[ 0 .. 0-1 ];
1429     @hunk1b = @b[ 0 .. 0-1 ];
1430
1431 And C<0..-1> returns the empty list.
1432
1433 Move down one pair of indices (2..5) and we get the offset ranges for
1434 the second hunk, which contains changed items.
1435
1436 Since C<@diff[2..5]> contains (0,0,1,0) in our example, the second hunk
1437 consists of these two lists of items:
1438
1439         @hunk2a = @a[ $cdiff[2] .. $cdiff[4]-1 ];
1440         @hunk2b = @b[ $cdiff[3] .. $cdiff[5]-1 ];
1441     # or
1442         @hunk2a = @a[ 0 .. 1-1 ];
1443         @hunk2b = @b[ 0 .. 0-1 ];
1444     # or
1445         @hunk2a = @a[ 0 .. 0 ];
1446         @hunk2b = @b[ 0 .. -1 ];
1447     # or
1448         @hunk2a = ( 'a' );
1449         @hunk2b = ( );
1450
1451 That is, we would delete item 0 ('a') from @a.
1452
1453 Since C<@diff[4..7]> contains (1,0,3,2) in our example, the third hunk
1454 consists of these two lists of items:
1455
1456         @hunk3a = @a[ $cdiff[4] .. $cdiff[6]-1 ];
1457         @hunk3a = @b[ $cdiff[5] .. $cdiff[7]-1 ];
1458     # or
1459         @hunk3a = @a[ 1 .. 3-1 ];
1460         @hunk3a = @b[ 0 .. 2-1 ];
1461     # or
1462         @hunk3a = @a[ 1 .. 2 ];
1463         @hunk3a = @b[ 0 .. 1 ];
1464     # or
1465         @hunk3a = qw( b c );
1466         @hunk3a = qw( b c );
1467
1468 Note that this third hunk contains unchanged items as our convention demands.
1469
1470 You can continue this process until you reach the last two indices,
1471 which will always be the number of items in each sequence.  This is
1472 required so that subtracting one from each will give you the indices to
1473 the last items in each sequence.
1474
1475 =head2 C<traverse_sequences>
1476
1477 C<traverse_sequences> used to be the most general facility provided by
1478 this module (the new OO interface is more powerful and much easier to
1479 use).
1480
1481 Imagine that there are two arrows.  Arrow A points to an element of
1482 sequence A, and arrow B points to an element of the sequence B.
1483 Initially, the arrows point to the first elements of the respective
1484 sequences.  C<traverse_sequences> will advance the arrows through the
1485 sequences one element at a time, calling an appropriate user-specified
1486 callback function before each advance.  It willadvance the arrows in
1487 such a way that if there are equal elements C<$A[$i]> and C<$B[$j]>
1488 which are equal and which are part of the LCS, there will be some moment
1489 during the execution of C<traverse_sequences> when arrow A is pointing
1490 to C<$A[$i]> and arrow B is pointing to C<$B[$j]>.  When this happens,
1491 C<traverse_sequences> will call the C<MATCH> callback function and then
1492 it will advance both arrows.
1493
1494 Otherwise, one of the arrows is pointing to an element of its sequence
1495 that is not part of the LCS.  C<traverse_sequences> will advance that
1496 arrow and will call the C<DISCARD_A> or the C<DISCARD_B> callback,
1497 depending on which arrow it advanced.  If both arrows point to elements
1498 that are not part of the LCS, then C<traverse_sequences> will advance
1499 one of them and call the appropriate callback, but it is not specified
1500 which it will call.
1501
1502 The arguments to C<traverse_sequences> are the two sequences to
1503 traverse, and a hash which specifies the callback functions, like this:
1504
1505     traverse_sequences(
1506         \@seq1, \@seq2,
1507         {   MATCH => $callback_1,
1508             DISCARD_A => $callback_2,
1509             DISCARD_B => $callback_3,
1510         }
1511     );
1512
1513 Callbacks for MATCH, DISCARD_A, and DISCARD_B are invoked with at least
1514 the indices of the two arrows as their arguments.  They are not expected
1515 to return any values.  If a callback is omitted from the table, it is
1516 not called.
1517
1518 Callbacks for A_FINISHED and B_FINISHED are invoked with at least the
1519 corresponding index in A or B.
1520
1521 If arrow A reaches the end of its sequence, before arrow B does,
1522 C<traverse_sequences> will call the C<A_FINISHED> callback when it
1523 advances arrow B, if there is such a function; if not it will call
1524 C<DISCARD_B> instead.  Similarly if arrow B finishes first.
1525 C<traverse_sequences> returns when both arrows are at the ends of their
1526 respective sequences.  It returns true on success and false on failure.
1527 At present there is no way to fail.
1528
1529 C<traverse_sequences> may be passed an optional fourth parameter; this
1530 is a CODE reference to a key generation function.  See L</KEY GENERATION
1531 FUNCTIONS>.
1532
1533 Additional parameters, if any, will be passed to the key generation function.
1534
1535 If you want to pass additional parameters to your callbacks, but don't
1536 need a custom key generation function, you can get the default by
1537 passing undef:
1538
1539     traverse_sequences(
1540         \@seq1, \@seq2,
1541         {   MATCH => $callback_1,
1542             DISCARD_A => $callback_2,
1543             DISCARD_B => $callback_3,
1544         },
1545         undef,     # default key-gen
1546         $myArgument1,
1547         $myArgument2,
1548         $myArgument3,
1549     );
1550
1551 C<traverse_sequences> does not have a useful return value; you are
1552 expected to plug in the appropriate behavior with the callback
1553 functions.
1554
1555 =head2 C<traverse_balanced>
1556
1557 C<traverse_balanced> is an alternative to C<traverse_sequences>. It
1558 uses a different algorithm to iterate through the entries in the
1559 computed LCS. Instead of sticking to one side and showing element changes
1560 as insertions and deletions only, it will jump back and forth between
1561 the two sequences and report I<changes> occurring as deletions on one
1562 side followed immediatly by an insertion on the other side.
1563
1564 In addition to the C<DISCARD_A>, C<DISCARD_B>, and C<MATCH> callbacks
1565 supported by C<traverse_sequences>, C<traverse_balanced> supports
1566 a C<CHANGE> callback indicating that one element got C<replaced> by another:
1567
1568     traverse_balanced(
1569         \@seq1, \@seq2,
1570         {   MATCH => $callback_1,
1571             DISCARD_A => $callback_2,
1572             DISCARD_B => $callback_3,
1573             CHANGE    => $callback_4,
1574         }
1575     );
1576
1577 If no C<CHANGE> callback is specified, C<traverse_balanced>
1578 will map C<CHANGE> events to C<DISCARD_A> and C<DISCARD_B> actions,
1579 therefore resulting in a similar behaviour as C<traverse_sequences>
1580 with different order of events.
1581
1582 C<traverse_balanced> might be a bit slower than C<traverse_sequences>,
1583 noticable only while processing huge amounts of data.
1584
1585 The C<sdiff> function of this module
1586 is implemented as call to C<traverse_balanced>.
1587
1588 C<traverse_balanced> does not have a useful return value; you are expected to
1589 plug in the appropriate behavior with the callback functions.
1590
1591 =head1 KEY GENERATION FUNCTIONS
1592
1593 Most of the functions accept an optional extra parameter.  This is a
1594 CODE reference to a key generating (hashing) function that should return
1595 a string that uniquely identifies a given element.  It should be the
1596 case that if two elements are to be considered equal, their keys should
1597 be the same (and the other way around).  If no key generation function
1598 is provided, the key will be the element as a string.
1599
1600 By default, comparisons will use "eq" and elements will be turned into keys
1601 using the default stringizing operator '""'.
1602
1603 Where this is important is when you're comparing something other than
1604 strings.  If it is the case that you have multiple different objects
1605 that should be considered to be equal, you should supply a key
1606 generation function. Otherwise, you have to make sure that your arrays
1607 contain unique references.
1608
1609 For instance, consider this example:
1610
1611     package Person;
1612
1613     sub new
1614     {
1615         my $package = shift;
1616         return bless { name => '', ssn => '', @_ }, $package;
1617     }
1618
1619     sub clone
1620     {
1621         my $old = shift;
1622         my $new = bless { %$old }, ref($old);
1623     }
1624
1625     sub hash
1626     {
1627         return shift()->{'ssn'};
1628     }
1629
1630     my $person1 = Person->new( name => 'Joe', ssn => '123-45-6789' );
1631     my $person2 = Person->new( name => 'Mary', ssn => '123-47-0000' );
1632     my $person3 = Person->new( name => 'Pete', ssn => '999-45-2222' );
1633     my $person4 = Person->new( name => 'Peggy', ssn => '123-45-9999' );
1634     my $person5 = Person->new( name => 'Frank', ssn => '000-45-9999' );
1635
1636 If you did this:
1637
1638     my $array1 = [ $person1, $person2, $person4 ];
1639     my $array2 = [ $person1, $person3, $person4, $person5 ];
1640     Algorithm::Diff::diff( $array1, $array2 );
1641
1642 everything would work out OK (each of the objects would be converted
1643 into a string like "Person=HASH(0x82425b0)" for comparison).
1644
1645 But if you did this:
1646
1647     my $array1 = [ $person1, $person2, $person4 ];
1648     my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
1649     Algorithm::Diff::diff( $array1, $array2 );
1650
1651 $person4 and $person4->clone() (which have the same name and SSN)
1652 would be seen as different objects. If you wanted them to be considered
1653 equivalent, you would have to pass in a key generation function:
1654
1655     my $array1 = [ $person1, $person2, $person4 ];
1656     my $array2 = [ $person1, $person3, $person4->clone(), $person5 ];
1657     Algorithm::Diff::diff( $array1, $array2, \&Person::hash );
1658
1659 This would use the 'ssn' field in each Person as a comparison key, and
1660 so would consider $person4 and $person4->clone() as equal.
1661
1662 You may also pass additional parameters to the key generation function
1663 if you wish.
1664
1665 =head1 ERROR CHECKING
1666
1667 If you pass these routines a non-reference and they expect a reference,
1668 they will die with a message.
1669
1670 =head1 AUTHOR
1671
1672 This version released by Tye McQueen (http://perlmonks.org/?node=tye).
1673
1674 =head1 LICENSE
1675
1676 Parts Copyright (c) 2000-2004 Ned Konz.  All rights reserved.
1677 Parts by Tye McQueen.
1678
1679 This program is free software; you can redistribute it and/or modify it
1680 under the same terms as Perl.
1681
1682 =head1 MAILING LIST
1683
1684 Mark-Jason still maintains a mailing list.  To join a low-volume mailing
1685 list for announcements related to diff and Algorithm::Diff, send an
1686 empty mail message to mjd-perl-diff-request@plover.com.
1687
1688 =head1 CREDITS
1689
1690 Versions through 0.59 (and much of this documentation) were written by:
1691
1692 Mark-Jason Dominus, mjd-perl-diff@plover.com
1693
1694 This version borrows some documentation and routine names from
1695 Mark-Jason's, but Diff.pm's code was completely replaced.
1696
1697 This code was adapted from the Smalltalk code of Mario Wolczko
1698 <mario@wolczko.com>, which is available at
1699 ftp://st.cs.uiuc.edu/pub/Smalltalk/MANCHESTER/manchester/4.0/diff.st
1700
1701 C<sdiff> and C<traverse_balanced> were written by Mike Schilli
1702 <m@perlmeister.com>.
1703
1704 The algorithm is that described in
1705 I<A Fast Algorithm for Computing Longest Common Subsequences>,
1706 CACM, vol.20, no.5, pp.350-353, May 1977, with a few
1707 minor improvements to improve the speed.
1708
1709 Much work was done by Ned Konz (perl@bike-nomad.com).
1710
1711 The OO interface and some other changes are by Tye McQueen.
1712
1713 =cut