#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Dan Ilut <dci1@cornell.edu>';


#local folders with custom packages
use lib '/soldb/local_scripts/custom_packages';

#local packages to use
use runtime;

@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){

    $_ or next;
    my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}

my $in_file=$args{'i'};
my $out_dir=$args{'o'};

my $start_time=time;

$in_file or die "no input file specified, try -i phrap_out_file\n";;
$out_dir||="/tmp/";


my $lib=$in_file;
$lib=~s/^.+\/([^\/]+)_phrap.out/$1/;
my $align_file=$out_dir.$lib.".alignment";

print "$align_file\n";



my %lib_align=();
my ($cluster, $contig, $est);


open FILEIN, "$in_file" or die "Couldn't open file $in_file\n";
my ($readline, $vector_read);
my %vector=();

foreach (<FILEIN>){

#skip empty lines
    /^\s*$/ 
	and next;	
    
#turn off seqtrim_read after reading the EST trimming info
    /^Near duplicate reads/ 
	and $vector_read=0
	    and next;
    
#start reading EST possible vector info
    /^Probable unremoved sequencing vector/ 
	and $vector_read=1
	    and next;
    
#parse EST possible vector info
    $vector_read
	and /^[a-zA-Z0-9\.-]+-([0-9]+)\s+([0-9]+)-([0-9]+)\s+([a-zA-Z]+)/
	    and $vector{$1}=[$2, $3, $4]
		and next;
    
#turn off readline after finishing reading the components
    (/^Contig quality/ or /^Overall/)
	and $readline=0
	    and next;
    
#read the cluster summary and turn on readline
    /^Contig ([0-9]+)\.\s+([0-9]+) read[s]{0,1}; ([0-9]+) bp.+,\s+([0-9]+) / 
	and $contig=$1
	    and $lib_align{$lib}{$contig}{summary}=[$2, $3, $4]
		and $readline=1
		    and next;
    
#skip comment lines
    /^\s*\*+\s+/
	and next;
    
#read the member ESTs, start and stop positions
#NOTE:  It is safe to assume that all the trimming info
#       has been read before reaching the contig info
    
    if ($readline){
	tr/\)\(//d;
	/^\s*C/ or substr($_, 0, 0)='N';
	my ($reverse, $start, $end, $est, $score, $othermatch, $pct1, $pct2, $pct3, $front_trim, $front_trim_match, $back_trim, $back_trim_match)=split;
	my $direction='u';
	$est=~/^.+\.([bg])/
	    and $direction=$1;
	$est=~/^(.+)-([0-9]+)$/
	    and my $cid=$1
		and my $seqid=$2;
	$cid=~s/\.[bg]$//;
	my ($vector_start, $vector_end, $vector_seq)=(0,0, '');
	$vector{$seqid} 
	and $vector_start=$vector{$seqid}[0] 
	    and $vector_end=$vector{$seqid}[1]
		and $vector_seq=$vector{$seqid}[2];
	$lib_align{$lib}{$contig}{$seqid}=[$cid, $reverse, $direction, $start, $end, $front_trim, $back_trim, $vector_start, $vector_end, $vector_seq];
	next;
    }
    
}

close FILEIN;





open FILEOUT, ">$align_file";

print FILEOUT "
#Parsed contig info from phrap standard output
#Format:
#
#lib name
#*tab*contig nr, reads nr, contig length (untrimmed), contig length (trimmed)
#*tab**tab*est nr, est cornell id, reversed sequence (C=yes, N=no),
#direction (b=5\', g=3\'), start location within contig, end location
#within contig, seq trimmed for assemby at start, seq trimmed at end, 
#possible vector start, possible vector end, possible vector sequence\n
";

foreach $lib (keys %lib_align){
    $lib or (print "Empty lib\n" and next);
    print FILEOUT "Library $lib\n";
    foreach $contig (keys %{$lib_align{$lib}}){
	$contig or (print "Empty contig id in $lib\n" and next);
	print FILEOUT "\tContig$contig, ", join (', ', @{$lib_align{$lib}{$contig}{summary}}), "\n";
	foreach $est (keys %{$lib_align{$lib}{$contig}}){
	    $est or (print "Empty est id in $contig of $lib\n" and next);
	    $est ne 'summary'
		and print FILEOUT "\t\t$est, ", join (', ', @{$lib_align{$lib}{$contig}{$est}}), "\n";
	}
    }
}

close FILEOUT;

runtime::runtime_print($start_time, 'Contig membership alignment extraction');
