#! /usr/bin/perl
# Copyright (C) 2001-2002 by <Christian.Queinnec@lip6.fr>
# $Id: plagiat.pl,v 1.14 2007/01/28 18:00:02 queinnec Exp $
# Licensed under the same terms as Perl itself.

# Look for plagiarism.

# {{{ Documentation
# This documentation may be seen as text with pod2text plagiat.pl

=head1 NAME

plagiat - A perl script to detect plagiarism between files.

 plagiat.pl [-v -v ... ] --path='*/*/*/file' [other options]

=for html Download the package from
<a href="http://www-spi.lip6.fr/~queinnec/Miscellaneous/plagiat.txt">
http://www-spi.lip6.fr/~queinnec/Miscellaneous/plagiat.pl </a>

=for text Download the package from
http://www-spi.lip6.fr/~queinnec/Miscellaneous/plagiat.txt

=head1 DESCRIPTION

Plagiat is a perl script that tries to detect plagiarims between
files.  When given a shell expression yielding many files, it compares
them two by two and signal those that are very close.

The comparison algorithm is very simple. Every file is compressed,
then every pair of files is concatenated then compressed. If the
compressed pair has a size that is far less than the sum of the size
of the two individually compressed files then there are common
fragments between the two files and potential plagiarism.

Attention, this is a quadratic algorithm, it may take a looooong time!
By the way, the files you want to compare should not be too small.
Fifty lines at least seem to be the least size for valuable
comparisons. It might be useful to pretty-print and obfuscate the
files to compare before (but I have yet to test that method).

This script will generate compressed files (near the original file
that is, in the same directory with the usual extension meaning
compressed (.gz here)). It will also generate a .NEAR file that
contains the name of the files that are near this one. Attention, if
f1 and f2 are close then either f1.NEAR will mention f2, either
f2.NEAR will mention f1.

=head1 THE .NEAR FILES

Here is an example of a real .NEAR file. A .NEAR file is made of lines
all having the structure of the next line:

 1796 1796 1861 /tmp/testplagiat/1/912423947/words.c     [correlation=0.91%]

 ^ size of the compressed file
      ^ size of the other compressed file
           ^ size of the two files compressed together
                ^ name of the other file
                                                          ^ correlation

The first number is the size of the compressed file words.c.  The
second number is the size of the compressed file to which it is
compared (here this is /tmp/testplagiat/1/912423947/words.c. The size
of the compressed concatenated files is the third number. The name of
the other file is then mentioned. The final number is the correlation
between the two files. These two files are probably very close.

=head1 OPTIONS

Here are the options of the plagiat script.

=over 4

=item * --path

The mandatory --path option specifies a shell regexp leading
to the files to be compared. Pay attention to quote the shell regexp
for the shell. For example, consider the following command:

 plagiat.pl --path='*/f.c' 

It will compare all the files named f.c in the subdirectories of the
current directory. 

=item * -v

The optional -v option increases the verbosity of the process.
The generated lines are produced on stderr.

=item * --threshold

The optional --threshold specifies a number of bytes below
which files are not compared. Use the -h option to see the current
value of this threshold.

=item * --help

The -h or --help option outputs a short usage notice.

=item * --factor

The --factor option specifies the correlation threshold between 0 and
1 against which plagiarim is detected. Use the -h option to see the
current value of this threshold. A factor of 1 means you are
interested by quasi-exact correspondance while 0.5 means that you want
to be noticed of loose resemblance at all.

=item * --clean

The --clean option removes all the .NEAR and compressed files
before the plagiarism detection. It corresponds to the union of the
options --clean-gz and --clean-near.

=item * --clean-gz

The --clean-gz option removes all the compressed files before
the plagiarism detection.

=item * --clean-near

The --clean-near option removes all the previous .NEAR files
before the plagiarism detection.

=item * -k

The -k option specifies that no empty .NEAR file should be
produced.

=back

=head1 BUGS

Should use the opt package for options!
Allow an incremental usage (that requires to record the compressed pairs!)

=head1 AUTHOR

Christian Queinnec <Christian.Queinnec@lip6.fr>

=cut

# }}}

use diagnostics;
diagnostics::enable();

my $version = 
    ' $Id: plagiat.pl,v 1.14 2007/01/28 18:00:02 queinnec Exp $ ';
my $verbose = 0;
my $shpath;
my $threshold = 28;
my $gzip = 'gzip -n -6 -c';
my $factor = 0.25;
my $clean_gz = 0;
my $clean_near = 0;
my $remove_empty_near_files = 0;

my $usage = "$0 --path='*/*/file' [-v|--clean|--factor=0.3|--threshold=50]
Copyright (C) 2001 by <Christian.Queinnec\@lip6.fr> " . ' $Revision: 1.14 $ ' . "
Licensed under the same terms as Perl itself.

Compare all the files specified by the path option. These files are first
compressed into */*/file.gz then compared by pairs. Results ie near misses
are produced into */*/file.NEAR files. A line in a */*/file.NEAR file
looks like: 

1796 1796 1861 /tmp/testplagiat/1/912423947/words.c     [correlation=0.91%]
^ size of the compressed file
     ^ size of the other compressed file
          ^ size of the two files compressed together
               ^ name of the other file
                                                         ^ correlation
a number between 0. and 1. where 1. means totally similar and 0. means
totally different. Attention, this is a quadratic algorithm!

NOTE: if a file f1 is close to a file f2 then this will appear in
exactly one of the file f1.NEAR or f2.NEAR.

Among options are:
--path        shell regexp specifying which files to compare          Mandatory
-v            be even more verbose (may be used multiply)             Optional
--threshold   don't consider compressed files under that size         Optional
              (by default: $threshold (the size of the compressed
               empty file))
--factor      correlation factor (between 0. and 1.)                  Optional
              (by default: $factor)
-k            don't produce empty .NEAR files                         Optional
--clean       remove all .NEAR and all .gz files before               Optional
--clean-gz    remove all .gz files before                             Optional
--clean-near  remove all .NEAR files before                           Optional
";

sub clean_all_NEAR {
    my(@files) = glob($shpath);
    foreach my $f ( @files ) {
        if ( -f "$f.NEAR" ) {
            print STDERR "Removing $f.NEAR ...\n" if $verbose > 1;
            unlink "$f.NEAR";
        };
    }
}

sub clean_all_gz {
    my(@files) = glob($shpath);
    foreach my $f ( @files ) {
        if ( -f "$f.gz" ) {
            print STDERR "Removing $f.gz ...\n" if $verbose > 1;
            unlink "$f.gz";
        };
    }
}

# For every file to be analyzed, compress it into a .gz file.

sub compress_all_files {
    my(@files) = glob($shpath);
    foreach my $f ( @files ) {
        if ( ! -f "$f.gz" ) {
            print STDERR "Compressing $f ...\n" if $verbose > 1;
            my $cmd = "$gzip '$f' > '$f.gz'";
            system($cmd) == 0
                || die "Cannot run $cmd: $?";
        };
    }
}

# Compare files by pairs and collect near misses.

sub look_for_clones {
    my $count = 0;
    my $comparison = 0;
    my $near_misses = 0;
    my(@files) = glob($shpath);
    foreach my $f1 ( @files ) {
        $count++;
        print STDERR "Comparing $f1 ...\n" if $verbose > 2;
        my $size1 = -s "$f1.gz";
        # No plagiarism if the file is nearly empty.
        next if ( $size1 <= $threshold );
        open(OUT, ">> $f1.NEAR")
            || die "Cannot open $f1.NEAR: $?";
        my $near_is_empty = 1;
        my $skip = 1;
        foreach my $f2 ( @files ) {
            print STDERR "     with $f2 ...\n" if $verbose > 2;
            # Skip all files until finding f1:
            if ( $f1 eq $f2 ) {
                $skip = 0; # stop skipping files!
                next;
            };
            next if $skip;
            my $size2 = -s "$f2.gz";
            # No plagiarism if the file is nearly empty.
            next if ( $size2 <= $threshold );
            # Now compare f1 and f2 that are non empty:
            my $cmd = "cat $f1 $f2 | $gzip - > /tmp/t.gz";
            system($cmd) == 0 
                || die "Cannot run $cmd: $?";
            $comparison++;
            my $c1size = -s "$f1.gz";
            my $c2size = -s "$f2.gz";
            my $ccsize = -s "/tmp/t.gz";
            unlink '/tmp/t.gz';
            my $min = ( $c1size > $c2size) ? $c2size : $c1size;
            #my $correlation = ($ccsize - $m) / (($c1size + $c2size) - $m);
            my $correlation = 1.0 - (($c1size + $c2size - $ccsize) / $min);
            my $c = 100 * abs($correlation);
            my $comment = "$c1size $c2size $ccsize $f2\t[correlation=$c%]\n";
            print STDERR $comment if $verbose > 2;
            if ( $correlation >= $factor ) {   # See note
                $near_misses++;
                print OUT $comment;
                $near_is_empty = 0;
                print STDERR "NEAR! $c1size $c2size $ccsize [correlation=$c%]
\t$f1\n\t$f2\n"
                    if $verbose;
            };
        }
        close(OUT)
            || die "Cannot close $f1.NEAR: $?";
        # Remove .NEAR files if empty:
        if ( $remove_empty_near_files && $near_is_empty ) {
            (1 == unlink "$f1.NEAR")
                || die "Cannot remove $f1.NEAR: $?";
        };
    }
    print STDERR "SUMMARY: $count files, $comparison comparisons performed",
                 ", $near_misses potential near misses found.\n"
        if $verbose;
}

# NOTE: $ccsize is always less than the sum of $c1size and $c2size.
# $ccsize is always greater than the maximum of $c1size and $c2size.
# (if the two files are equal then $ccsize is around $c1size + 10.)
# If $ccsize is far less than the sum then chances are that the two
# original files have a lot in common. The correlation is computed as
# let m = min($c1size, $c2size) in 
#        1 - (($c1size + $c2size - $ccsize) / m)
# This is a number between 0 and 1. The greater it is, the more the files 
# are similar! A correlation of 100% is a sure sign of plagiarism.

# Analyze options and start computations.

while ( @ARGV > 0 ) {
    local $_ = shift;
    print STDERR "Analyzing option $_\n" if $verbose > 1;
    if ( /^--clean-gz$/ ) {
        $clean_gz = 1;
    } elsif ( /^--clean-near$/ ) {
        $clean_near = 1;
    } elsif ( /^--clean$/ ) {
        $clean_gz = 1;
        $clean_near = 1;
    } elsif ( /^-v$/ ) {
        $verbose++;
    } elsif ( /^-k$/ ) {
        $remove_empty_near_files = 1;
    } elsif ( /^--path=(.*)$/ ) {
        $shpath = $1;
    } elsif ( /^--factor=(\d*(\.\d*)?)$/ ) {
        $factor = "0" . $1;
    } elsif ( /^--threshold=(\d+)$/ ) {
        $threshold = $1;
    } elsif ( /^-(h|-help)$/ ) {
        print $usage;
        exit(0);
    } else {
        print STDERR "Unknown option: $_\n";
        die $usage;
    }
}

if ( ! defined($shpath) ) {
    print STDERR "Please, specify the --path option.\n";
    die $usage;
};

clean_all_gz() if $clean_gz;
clean_all_NEAR() if $clean_near;
compress_all_files();
look_for_clones();

1;

# end of plagiat.pl