#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Teri Solow <tms45@cornell.edu>';

#local packages to use
use runtime;
use fasta_parser;

if ($ARGV[0] && $ARGV[0] eq "help") {
	print <<EOF;

  Program to perform a sequence of screens of paired sequence ends against 
  a full length protein database (usually arabidopsis ATH1_pep). If the two 
  ends sequences from a given pair match different proteins, then the sequence
  or clone from which the end pair is derived is considerd chimeric. If the
  pair matches ostensibly the same protein or related proteins, then it is
  considered conclusive evidence that the sequence is not chimeric.

  "related proteins" are loosely defined as proteins with a blastp evalue
  better than 0.001 -- ie, 0.1% of the time the blastp-detected relationship 
  between the peptide sequences is spurious. Thus, we might falsely conclude 
  that a sequence is not chimeric due to a spurious match like this, at most 
  one-tenth one percent of the time.

  Method: Several end lengths are scanned (depends on inputs to this script - 
		  see notes below). If a chimeric result is obtained, further results
		  for that end pair is ignored. This is because we do not, and can not
		  in general precisely know where chimeric sequences are joined. Thus,
		  chimeric results at shorter end lenghts could be superceded by 
		  non-chimeric results at longer lengths as the end size grows past
	  the joining point. Thus, we proceed from short ends to long, 
		  locking in the chimeric conclusion if it is obtained.

  Usage: -p <project> 
		 -l <library>
		 -b <protein blastp result file> (-m 8 output)
		 -d <target database> (e.g. /data/shared/blast/databases/current/ath1/ATH1_pep)
		 -e <evalue threshold>

  Output: For each observed pair identifier in the file, outputs the following
		  <identifier> <result> <length of observation> <leading score> <leading evalue> <leading match> <trailing score> <trailing evalue> <trailing match>

  In the case of inconclusive results, the NULL maybe outputed for either
  of the match & score fields.

  Result codes:
	0 - Not chimeric (conclusive)
	1 - Possibly Chimeric
	2 - Inconclusive (neither end matched)
	3 - Inconclusive (only one end had a match)


EOF
	exit -1;
}

@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){
	$_ or next;
	my ($flag, $val)=split /\s+/;
	$args{$flag}=$val;
}

my $lib=$args{'l'};
my $project=$args{'p'};
my $target_database=$args{'d'};
$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";
$lib or die "No library specified, try '-l library_name (-l all to process all libraries)'\n";
my $input_basename="/data/shared/pgn_data_processing/unigene_builds/$project/chimera_screen/$lib-chimera_prescreen";
my $blastp_outfile=$args{'b'};
my $evalue_threshold=$args{'e'};
my $output_filename="/data/shared/pgn_data_processing/unigene_builds/$project/chimera_screen/$lib-chimera_screen_results.txt";

if (!$target_database) {
	print STDERR "Warning: target database not specified. Using default.\n";
	$target_database = "/data/shared/blast/databases/current/ath1/ATH1_pep";
}

if (!$evalue_threshold) {
	print STDERR "Warning: evalue threshold not specified. Using 0.001.\n";
	$evalue_threshold = 0.001;
}

if (-f $output_filename) {
	print STDERR "Output file \"$output_filename\" already exists. This program".
	" will not overwrite it.\n";
	exit -1;
}


# Do this before we do much of anything else. If this fails for some reason
# (like permission denied or something) then we don't want to bother BLASTing
# away for hours before choking on opening the output file
open OF, ">$output_filename"
	or die "Failed to open output file \"$output_filename\" ($!)";


# Couple of notes here: Found that not all sequences matched themselves in
# the self BLAST. Why? Not sure, perhaps low-complexity filter problems.
# Added a string comparison below so that we do not rely on %hit_eq for this
# (trivial) case. Also found that matches are not always symmetric. Again,
# low complexity filter seems a likely explanation, since it is only applied
# to the query sequence.
my %hit_eq;
if (!$blastp_outfile) {
	print STDERR "Warning: no \"self-blast\" output file given for target " .
	"database (use -b)\n";
	print STDERR "Ends will have to match exactly the same sequence in the " .
	"the target database!\n";
} else {
	print STDERR "Loading blastp file...";
	keys %hit_eq = 0x1<<16;
	open PF, "<$blastp_outfile"
		or die "\n\nFailed to open blastp_outfile \"$blastp_outfile\" ($!)";

	my ($s, $t);
	while(<PF>) {
		chomp;

		($s, $t) = split/\t/;
		$hit_eq{$s}->{$t} = 1; # if .... (use threshold?)
		$hit_eq{$t}->{$s} = 1;
	}

	close PF;
	print STDERR "Done.\n";
}

my $path = substr $input_basename, 0, rindex($input_basename,"/")+1;

my $basename;
if (!$path) {
	$path = "./";
	$basename = $input_basename;
} else {
	$basename = substr $input_basename, rindex($input_basename,"/")+1;
}

print "PATH=$path\tBASNAME=$basename\n";

opendir D, $path
	or die "Can't open directory \"$path\" ($!)";

my %files = ();
while($_ = readdir D) {
	if (m/^$basename-([0-9]+)\.seq$/o) {
		$files{$1} = $path . $_;
	}
}

closedir D;

if (keys(%files) == 0) {
	print STDERR "No prepared input files matching \"$input_basename\" were " .
	"found.\n";
	exit -1;
}

# Check the first file to find all the identifiers and set up a structure
# to track what is known about each sequence pair.
my ($first_filename) = (sort { $a <=> $b } keys %files);
$first_filename = $files{$first_filename};

my $fh = fasta_parser->load_file($first_filename);
my %state = ();
foreach ( $fh->get_seqids() ) {
	my ($id) = split/:/;
	$state{$id} = [ $id, 2, "NULL", "NULL", "NULL", "NULL", "NULL", "NULL", "NULL" ] if !defined($state{$id});
}
my $n_ids = keys %state;
print STDERR "Found $n_ids distinct sequence pairs in $first_filename\n";
$fh->close();
undef $fh;

# Run BLASTs, of varying end lengths, as specified by the input file set
# discovered above.
my %out_files = ();
foreach my $end_length ( sort { $a <=> $b } keys %files ) {
	my $out_filename = "chimera-$$-$end_length-blastx-out";
	$out_files{$end_length} = $out_filename;

	if (system("parallel_blast.pl $files{$end_length} $out_filename $target_database blastx $evalue_threshold 8 auto \"-v 1 -b 1\"")) {
		print STDERR "parallel_blast.pl returned non-zero exit code ",($?/256),"\n";
		exit -1;
	}

	open OUT_FILE, "<$out_filename"
		or die "Can't open result from BLAST for end length $end_length ($!)";

	my %fp_match = ();
	my %tp_match = ();
	keys %fp_match = 0x1<<18;
	keys %tp_match = 0x1<<18;

	while(<OUT_FILE>) {
		chomp;

		@_ = split/\t/;
		my ($s, $t, $evalue, $score) = ($_[0], $_[1], $_[10], $_[11]);
		my ($id,$end) = split/:/,$s;

		# Check threshold? We set the evalue threshold requested on the BLAST
		# command line... but if re-filtering or dynamic thresholds were to be
		# used, support needs to be added here.

		# Note: BLAST can return several alignments for the same query/target
		#       pair, even when we have -v 1 -b 1 turned on.
		if ($end==5) {
			if (defined($fp_match{$id}) && $fp_match{$id}->[0] ne $t) {
				print STDERR "Five prime match already defined for $s " .
				"($fp_match{$id}), skipping $t\n";
			} else {
				$fp_match{$id} = [ $t, $score, $evalue ];
			}
		} elsif ($end==3) {
			if (defined($tp_match{$id}) && $tp_match{$id}->[0] ne $t) {
				print STDERR "Three prime match already defined for $s " .
				"($tp_match{$id}), skipping $t\n";
			} else {
				$tp_match{$id} = [ $t, $score, $evalue ];
			}
		} else {
			print STDERR "Unknown end code $end in $s\n";
		}
	}

	close OUT_FILE;

	my $id;
	foreach $id ( keys %fp_match ) {
		if ($tp_match{$id}) {
			if (($tp_match{$id}->[0] eq $fp_match{$id}->[0]) ||
				$hit_eq{$fp_match{$id}->[0]}->{$tp_match{$id}->[0]}) {
				# Conclusively not chimeric... but only if previous iterations hadn't
				# established it as chimeric already (see notes above)
				if ($state{$id}->[1] != 1) {
					$state{$id}->[1] = 0;
					$state{$id}->[2] = $end_length;
					@{$state{$id}}[3..5] = @{$fp_match{$id}};
					@{$state{$id}}[6..8] = @{$tp_match{$id}};
				}
			} else {
				# Chimeric -- reported matches are not similiar enough according to
				# self blast of target database
				$state{$id}->[1] = 1;
				$state{$id}->[2] = $end_length;
				@{$state{$id}}[3..5] = @{$fp_match{$id}};
				@{$state{$id}}[6..8] = @{$tp_match{$id}};
			}
		} else {
			# One end has a match, record that fact
			if ($state{$id}->[1]!=0 && $state{$id}->[1]!=1) {
				$state{$id}->[1] = 3;
				$state{$id}->[2] = $end_length;
				@{$state{$id}}[3,4,5] = @{$fp_match{$id}};
				@{$state{$id}}[6,7,8] = ( "NULL", "NULL", "NULL" );
			}
		}
	}

	# Catch tp only matches
	foreach $id ( keys %tp_match ) {
		if (!$fp_match{$id}) {
			if ($state{$id}->[1]!=0 && $state{$id}->[1]!=1) {
				$state{$id}->[1] = 3;
				$state{$id}->[2] = $end_length;
				@{$state{$id}}[6,7,8] = @{$tp_match{$id}};
				@{$state{$id}}[3,4,5] = ( "NULL", "NULL", "NULL" );
			}
		}
	}
}

print STDERR "Blast jobs are finished. Writing output...";
foreach my $id ( sort { $a cmp $b } keys %state ) {
	print OF join("\t",@{$state{$id}}),"\n";
}

close OF;
print STDERR "Done.\n";


