#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Teri Solow <tms45@cornell.edu>';

use db_link;
use projects;
use CXGN::BioTools::FastaParser;
use Data::Dumper;

# 0 for normal output
my $debug = 0;

@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));
my %args=();
foreach (@arg_pairs){
	$_ or next;
	my ($flag, $val)=split /\s+/;
	$args{$flag}=$val;
}
my $project=$args{'p'};
my $lib_name=$args{'l'};
my $seq_in_file=$args{'i'};
my $output_directory=$args{'d'};
my $output_file=$args{'o'};
$seq_in_file ||= "/data/shared/pgn_data_processing/unigene_builds/$project/$lib_name/$lib_name.fasta";
$output_directory ||= "/data/shared/pgn_data_processing/unigene_builds/$project/$lib_name";
$output_file ||= "$output_directory/$lib_name.fasta.cap.align";

$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";
$lib_name or die "No library specified, try '-l library_name (-l all to process all libraries)'\n";

# connect to database
my ($db, $usr) = @{projects::get_db_info($project)};
$db or die "No known database for project $project";

#######################################################################
#######################################################################
#main body of script
#######################################################################
#######################################################################

open OUT, ">$output_file" or die "Couldn't open ${output_file}: $!\n";

#######################################################################
# process contigs
#######################################################################

open FINDPIPE, "find $output_directory -name \"*.fasta.cap.ace\" |" or die "Failed to open pipe for find command ($!)";

# iterate through each .fasta.cap.ace file in the input directory
foreach ( <FINDPIPE> ) {
	my ($contig_name, $contig_no);
	my $cluster_nr = 0;
	my $first = "yes"; # keep track of what we are doing while parsing the acefile
	my %components = ();
	my $readname = "not set";
	my $padded_length = "0";
	
	open ACEFILE, "<$_" or die "Can't open ace file \"$_\" ($!)\n";

	# read first line of acefile, ensure it is ACTUALLY an ace file,
	# get number of contigs described in file
	my $line = <ACEFILE>;
	if ($line !~ m/^AS/) {
		die "Not ACE format $_\n";
	}
	my (undef, $n_contigs) = split /\s/, $line;

	# if there are no contigs, there is no point continuing with this file
	if ($n_contigs == 0) {
		close ACEFILE;
		next;
	}

	# iterate through further lines in acefile
	while(<ACEFILE>) {
		my $line = $_;

		my ($ort, $start_pos, $first_qbase, $last_qbase);

		# line defines a contig
		# find out whether this defines a contig, and if so, which one
		if (m/^CO/) {
			# if this isn't the first time we've seen a line like this, 'commit' previous contig's data
			($first eq "no") and dump_output($cluster_nr, $contig_no, \%components);
			# start learning about the current new contig
			$first = "no";
			%components = ();
			(undef, $contig_name) = split /\s/, $line;
			($contig_no) = $contig_name =~ m/Contig([0-9]+)/;
			($debug > 1) and warn "**CO: ($cluster_nr) $contig_no\n";
		}

		# line defines a contig member
		# get the realname, orientation, and starting position of this contig member
		if (m/^AF/) {
			(undef, $readname, $ort, $start_pos) = split /\s/, $line;

			# make sure this member doesn't appear multiple times
			if (defined($components{$readname})) {
				warn "Warning: component $readname already read from file\n";
				print OUT "Warning: component $readname already read from file\n";
			}
			
			# change the orientation notation to something useful
			if ($ort eq "C") {
				$ort = "-";
			} else {
				$ort = "+";
			}

			# set this data for this contig member
			$components{$readname} = { ort => $ort, start_pos => $start_pos };
			($debug > 1) and warn "**AF: readname: $readname, ort: $ort, start: $start_pos\n";
		}

		# line defines a contig member
		# get the realname and length of this contig member
		if (m/^RD/) {
			(undef, $readname, $padded_length) = split /\s/, $line;

			# make sure this member has already been defined
			if (! defined($components{$readname})) {
				warn "Found RD line for component $readname, but there was no AF line\n";
				print OUT "Found RD line for component $readname, but there was no AF line\n";
			}

			# set this data for this contig member
			$components{$readname}->{padded_length} = $padded_length;
			($debug > 1) and warn "**RD: readname: $readname, padded_length: $padded_length\n";
		}

		# line defines a contig member
		# get the start trim and start position for this contig member
		if (m/^QA/) {
			(undef, undef, undef, $first_qbase, $last_qbase) = split /\s/, $line;

			# set this data for this contig member
			$components{$readname}->{start_trim} = $first_qbase - 1;
			$components{$readname}->{start_pos} += $first_qbase - 1;
			$components{$readname}->{end_trim} = $padded_length - $last_qbase;
			$components{$readname}->{end_pos} = $last_qbase - 1;
			($debug > 1) and warn "**QA: (X)readname: $readname, (X)padded_length: $padded_length, first_qbase: $first_qbase, last_qbase: $last_qbase\n";
		}

	}

	# End of File condition, process last contig
	close ACEFILE;
	dump_output($cluster_nr, $contig_no, \%components);
}

close FINDPIPE;


#######################################################################
# process singlets
#######################################################################

my ($seq_id);
my $singlet_file=$lib_name . ".fasta.cap.singlets";
open FINDPIPE, "find $output_directory -name \"*.cap.singlets\" |"  or die "Failed to open pipe for find command ($!)";

foreach ( <FINDPIPE> ) {
	chomp;
	my ($contig_name, $contig_no);
	if (! m/\.fasta\.cap\.singlets$/) {
		die "File \"$_\" unrecognized\n";
	}
	my $ufh = CXGN::BioTools::FastaParser->load_file("$output_directory/$singlet_file");
	foreach $seq_id ( $ufh->get_seqids() ) {
		if ($seq_id =~ m/^([X0-9]+)-S-(.+)/) {
			my $seq_length = length($ufh->get_sequence($seq_id));
			if ($1 eq "X") {
				print OUT "Singlet:\t",join("\t",-1,-1,1,$seq_length,$seq_length),"\n";
				print OUT join("\t",$2,"+",1,$seq_length,0,0),"\n";
			} else {
				print OUT "Singlet:\t",join("\t",$1,-1,1,$seq_length,$seq_length),"\n";
				print OUT join("\t",$2,"+",1,$seq_length,0,0),"\n";
			}
		}
	}
}

close FINDPIPE;


close OUT;

#######################################################################
#######################################################################
# subroutines
#######################################################################
#######################################################################

sub dump_output {
	my ($cluster_nr, $contig_no, $component_ref) = @_;
	if ($debug > 0) {
		warn "dump_output recieved:\n";
		warn "\tcluster_nr: $cluster_nr\n";
		warn "\tcontig_no: $contig_no\n";
		warn "\tcomponent:\n";
		warn Dumper %$component_ref;
	}
	
	my @reads = sort { $component_ref->{$a}->{start_pos} <=> $component_ref->{$b}->{start_pos} } keys %{$component_ref};

	my ($max_e, $min_s, $max_l) = (0,0,0);
	foreach ( @reads ) {
		my ($s, $e, $l);
		$s = $component_ref->{$_}->{start_pos}-$component_ref->{$_}->{start_trim};
		$e = $component_ref->{$_}->{end_pos}+$component_ref->{$_}->{end_trim};
		$l = $component_ref->{$_}->{end_pos};

		$min_s = $s if ($s < $min_s);
		$max_e = $e if ($e > $max_e);
		$max_l = $l if ($l > $max_l);
	}

	my $n_components = @reads;

	# untrimmed length is untrimmed end - untrimmed start 
	# Format: cluster# contig# n_components untrimmed_length trimmed_length
	print OUT "Contig:\t",join("\t",$cluster_nr,$contig_no,$n_components,$max_e-$min_s,$max_l),"\n";
	foreach ( @reads ) {
		print OUT join("\t",$_,$component_ref->{$_}->{ort},$component_ref->{$_}->{start_pos},$component_ref->{$_}->{end_pos}, $component_ref->{$_}->{start_trim},$component_ref->{$_}->{end_trim}),"\n";
	}
}


# QA padded start padded end
# Length used = padded end - padded start
# CO Aligned END = ALign Start + Length used
# Trim end = padded_length - padded_end