#!/usr/bin/perl -w
use strict;

# script maintainer
###################
# contact for whoever currently looks after the scripts of the SGN project
my $script_maintainer = 'Teri Solow <tms45@cornell.edu>';

# includes
##########
use runtime;
use db_link;
use projects;
use DBI;
use IO::File;

# parse command-line arguments
##############################
# Expected flags: -l <library> -p <project>.
# The arguments are joined with spaces and split on '-', so neither a flag
# nor its value can itself contain a '-' or whitespace.
@ARGV or print "No input parameters, proceeding with default.\n";
my @arg_pairs = split (/\-/, (join ' ', @ARGV));
my %args=();
foreach my $pair (@arg_pairs){
	# whatever precedes the first '-' (normally an empty string) has no flag
	$pair or next;
	my ($flag, $val)=split /\s+/, $pair;
	# a fragment made only of whitespace yields no flag at all
	(defined $flag and length $flag) or next;
	$args{$flag}=$val;
}
my $lib=$args{'l'};
my $project=$args{'p'};
# we really care that this script gets valid input because later on it deletes
# files based on the input given here... it is a huge security/stupidity risk.
# Both patterns are anchored on purpose: the whole value has to match, not just
# some substring of it, so nothing extra can ride along into a path or command.
(defined $lib and $lib =~ /\A[A-Za-z]{3,}\z/) or die "No library specified, try '-l library_name (no digits, this script runs on combined libraries)'\n";
(defined $project and $project =~ /\A[cfp]gn\z/) or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";

# predefined variables
######################
# directory holding the chimera screen results of the selected project
my $chimera_screen_dir = "/data/shared/pgn_data_processing/unigene_builds/$project/chimera_screen";
# per-library result file written by the previous script in the pipeline
my $input_file = $chimera_screen_dir . "/$lib-chimera_screen_results.txt";
# summary file that results are appended to (leave unset to skip that step)
my $output_file = $chimera_screen_dir . "/global-chimera_results.txt";
#my $output_file;

# keep track of everything
##########################
# Every counter has to start at 0.  Assigning a single 0 to the whole list
# would only set $sequences and leave the others undef, which makes the
# summary warn and print blanks for any category that never occurs.
my ($sequences, $not_chimeric, $possibly_chimeric, $inconclusive0, $inconclusive1) = (0) x 5;
# parsed records, keyed by sequence identifier
my %results = ();

# parse input/results file
##########################
# Each line holds one tab-separated record, in this column order:
#   identifier, result, length_observation,
#   leading_score, leading_evalue, leading_match,
#   trailing_score, trailing_evalue, trailing_match
# The result column is tallied as 0 = not chimeric, 1 = possibly chimeric,
# 2 = inconclusive (neither end matched), 3 = inconclusive (only one end matched).
open(my $results_fh, '<', $input_file) or die "Couldn't open ${input_file}: $!\n";
while (my $line = <$results_fh>) {
	# drop the line ending so it does not stick to the last column
	chomp $line;
	# a blank line is not a record and must not be counted as a sequence
	$line =~ /\S/ or next;
	my ($identifier, $result, $length_observation, $leading_score, $leading_evalue, $leading_match, $trailing_score, $trailing_evalue, $trailing_match) = split(/\t/, $line);
	# verify that the same sequence hasn't already been parsed
	# that should never happen
	if (!exists $results{$identifier}) {
		$results{$identifier} = {
			'identifier'         => $identifier,
			'result'             => $result,
			'length_observation' => $length_observation,
			'leading_score'      => $leading_score,
			'leading_evalue'     => $leading_evalue,
			'leading_match'      => $leading_match,
			'trailing_score'     => $trailing_score,
			'trailing_evalue'    => $trailing_evalue,
			'trailing_match'     => $trailing_match,
		};
		$sequences++;
		($result == 0) && $not_chimeric++;
		($result == 1) && $possibly_chimeric++;
		($result == 2) && $inconclusive0++;
		($result == 3) && $inconclusive1++;
	}
	# if the impossible should happen (which it sometimes does),
	# print a warning about it and continue
	else {
		print "$identifier exists in file multiple times!  Only the first instance is recorded.\n"
	}
}
close $results_fh;

# let the user know where we're at
##################################
print "Finished parsing ${project}::${lib} combined result file containing $sequences total sequences:\n\t$not_chimeric non-chimeric sequences\n\t$possibly_chimeric possibly chimeric sequences\n\t$inconclusive0 inconclusive (neither end matched) sequences\n\t$inconclusive1 inconclusive (only one end matched) sequences\n";
# append one tab-separated summary line to the output file, if specified:
#   project::library, total, non-chimeric, possibly chimeric,
#   inconclusive (neither end), inconclusive (one end)
if ($output_file) {
	# three-argument open: the file name can never be read as part of the mode
	open(my $out_fh, '>>', $output_file) or die "Couldn't open ${output_file}: $!\n";
	print {$out_fh} "${project}::${lib}\t$sequences\t$not_chimeric\t$possibly_chimeric\t$inconclusive0\t$inconclusive1\n"
		or die "Couldn't write to ${output_file}: $!\n";
	# buffered write errors only show up when the handle is closed
	close($out_fh) or die "Couldn't close ${output_file}: $!\n";
}

# clean up files generated by other scripts
###########################################
# The wildcards are expanded inside Perl and the matches are handed to rm as
# an argument list, so $project and $lib never pass through a shell.
# bsd_glob is used instead of the built-in glob because it does not split its
# pattern on whitespace.
require File::Glob;
my $build_dir = "/data/shared/pgn_data_processing/unigene_builds/$project";
# each job: the rm command to run, and the pattern of what it removes
my @cleanup_jobs = (
	[ ['rm', '-r'], "$build_dir/tmp-*" ],
	[ ['rm'],       "$build_dir/$lib-chimera_prescreen-*" ],
	[ ['rm'],       "$build_dir/chimera-*" ],
);
foreach my $job (@cleanup_jobs) {
	my ($command, $pattern) = @$job;
	my @targets = File::Glob::bsd_glob($pattern);
	# nothing matched, so there is nothing to delete
	@targets or next;
	# cleanup is best effort: report a failure but carry on with the rest
	system(@$command, @targets) == 0
		or warn "Cleanup of $pattern failed (rm status $?)\n";
}

# connect to the database
#########################
#my ($db, $usr) = @{projects::get_db_info($project)};
#$db or die "No known database for project $project";
#my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";


