#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Teri Solow <tms45@cornell.edu>';

#local packages to use
use runtime;
use db_link;
use projects;


@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/(^|\W)\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){
	$_ or next;
	my ($flag, $val)=split /\s+/;
	$flag or next;
	$args{$flag}=$val;
}
my $contig_file=$args{'c'};
my $contig_qual_file=$args{'q'};
my $alignment_file=$args{'a'};
my $project=$args{'p'};
my $build_id=$args{'b'};
my $lib=$args{'l'};

$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";
$build_id or die "No build id specified, please specify one using the -b flag\n";
$lib or die "No library specified, try '-l library_name (-l all to process all libraries)";

$contig_file ||= "/data/shared/pgn_data_processing/unigene_builds/$project/$lib/$lib.fasta.cap.contigs";
$contig_qual_file ||= "/data/shared/pgn_data_processing/unigene_builds/$project/$lib/$lib.fasta.cap.contigs.qual";
$alignment_file ||= "/data/shared/pgn_data_processing/unigene_builds/$project/$lib/$lib.fasta.cap.align";

# get db info
my ($db, $usr) = @{projects::get_db_info($project)};
$db or die "No known database for project $project";

########DEBUG###########
#print "$contig_file\n$contig_qual_file\n$alignment_file\n"; 

my $script_file=__FILE__;

#main body of script
#####################

my $start_time=time;

# find out which libraries wthis affects
########################################

# try to open the database
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";
my ($stm, $sth, $rv, $rc);

my @libraries=();
if ($lib eq 'all') {
	my $ql = $dbh->prepare("select library_name from est_library where library_name!='all';");
	$ql->execute();
	while(my ($library_name) = $ql->fetchrow_array()) {
		push @libraries, $library_name;
	}
}
else {
	my $ql = $dbh->prepare("select library_name from est_library where library_name like '$lib%';");
	$ql->execute();
	while(my ($library_name) = $ql->fetchrow_array()) {
		push @libraries, $library_name;
	}
}

#parse in the contig info
#########################
my %contig_seq;
my %contig_qual;
my %contig_members;
my %contig_summary;
my %singlet_contigs;

my ($contig_nr, $dataline, $est_id);

#get the sequence
open FILEIN, "$contig_file" or die "Couldn't open $contig_file for read.\n";
while (<FILEIN>){
	chomp;
	if (/^>/){
		if (/^>Contig([0-9]+)\s*$/){ 
			if ($contig_nr){
				$contig_seq{$contig_nr}=$dataline;
			}
			$contig_nr = "0_" . $1;

#########DEBUG#############
#			warn "$contig_nr";

			$dataline='';
			next;
		}
		else {
			last;
		}
	}
	$dataline.=$_;
}
# print "contig_nr: $contig_nr\n";
$contig_seq{$contig_nr}=$dataline;

close FILEIN;

#get the quality
open FILEIN, "$contig_qual_file" or die "Couldn't open $contig_qual_file for read.\n";
while (<FILEIN>){
	chomp;
	if (/^>/){
		if (/^>Contig([0-9]+)\s*$/){ 
			if ($contig_nr){
				$contig_qual{$contig_nr}=$dataline;
			}
			$contig_nr= "0_" . $1;
			$dataline='';
			next;
		}
		else{
			last;
		}
	}
	$dataline.=$_;
}
$contig_qual{$contig_nr}=$dataline;

close FILEIN;


#get the memebership and alignments

# Parse the alignment info into a data structure from the alignment file.

my ($contig_data, $singlet_data)=(0,0);

open FILEIN, $alignment_file;
while (<FILEIN>){

#skip empty and comment lines
	(/^\s*$/ or /^\#/) and next;

#check for the proper project
#	if(/^Project ([^\s]+)/){
#		$project eq $1 or die "Wrong project: expected $project, found $1";
#		next;
#	}

#read the contigs
	if(/^\s*Contig:\s+([0-9]+)\t([0-9]+)\t(.+)$/){
		$contig_nr= $1 . "_" . $2;

#########DEBUG#############
#		print "$contig_nr\n";

		@{$contig_summary{$contig_nr}}=split /\t/, $3;
		$contig_data=1;
		$singlet_data=0;
		next;       
	}

#read and skip singlet data
	if (/^\s*Singlet:/){
		$contig_data=0;
		$singlet_data=1;
		next;       
	}
	$singlet_data and next;


#read the member ESTs, and their info
	if(/^\s*[\w-]+-([0-9]+)\t(.+)$/){
		$est_id=$1;
		@{$contig_members{$contig_nr}{$est_id}}=split /\t/, $2;

######DEBUG#########
#		print join ('|', @{$contig_members{$contig_nr}{$est_id}}) . "\n";

		next;
	}
}
close FILEIN;

print "Got " . int (keys %contig_summary) . " contigs\n";
print "Got " . int (keys %contig_seq) . " contig seq\n";
print "Got " . int (keys %contig_qual) . " contig seq quality\n";

my ($membership_contigs, $member_count);
foreach (keys %contig_members){
	my $nr_members = int (keys %{$contig_members{$_}});
	$nr_members>1 or next;
	$membership_contigs++;
	$member_count+=$nr_members;
}

print "Got $membership_contigs contigs with more than one element\n";
print "Got $member_count members in those contigs\n";





# We need a source of random numbers to make temp file names.
# If we cannot get /dev/urandom, just give up.
open DEVRANDOM, "</dev/urandom"
	or die "sys error: Cannot open /dev/urandom\n";

#start loading the assembly info

#if library specific, get the library ID or abort
###################################################
my $library='';
my @lib_ids=();
foreach $library (@libraries){
	$stm= "select est_library_id from est_library where library_name='$library'";
	$sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
	my $lib_id;
	$rc = $sth->bind_columns(\$lib_id);
	unless ($sth->fetch){
		db_link::disconnect_db($dbh);
		die "Couldn't find library id\n";
	}
	push @lib_ids, $lib_id;
}



#load the new data
###################

my ($contig_count, $contig_member_count, $singlet_count)=(0,0,0);

my %assembly_id=();

#load the contig summary
foreach (keys %contig_summary){

#skip those with just one member
	($contig_summary{$_}[0] > 1) or next;

# Create temp load id
	my ($random, $random_string);
	read( DEVRANDOM, $random, 16);
	$random_string = unpack("h*",$random);

	$stm = "insert into unigene_assembly (contig, nr_components, untrimmed_len, trimmed_len, load_id) values ('$_', '$contig_summary{$_}[0]', '$contig_summary{$_}[1]', '$contig_summary{$_}[2]', '$random_string')";
	$sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

#    print "$stm\n";

#get the contig identifier for the inserted contig
	my $unigene_assembly_id;
	$stm="select unigene_assembly_id from unigene_assembly where load_id='$random_string'";
	$sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
	$rc = $sth->bind_columns(\$unigene_assembly_id);
	if($sth->fetch){
		$assembly_id{$_}=$unigene_assembly_id;
	}


#    print "$stm\n";

#insert into the unigene table
	$stm = "insert into unigenes (unigene_build_id, unigene_element_id, unigene_element_type_id) values ('$build_id', '$unigene_assembly_id', '1')";
	$sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

#    print "$stm\n";

	$contig_count++;
}



#insert the sequence and quality
foreach (keys %contig_seq){

	$assembly_id{$_} or next;

#insert the sequence
	$stm="insert into unigene_assembly_sequence (unigene_assembly_id, sequence_data) values ('$assembly_id{$_}', '$contig_seq{$_}')";
	$sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

#get the sequence id
	my $contig_seq_id;
	$stm="select unigene_assembly_seq_id from unigene_assembly_sequence where unigene_assembly_id='$assembly_id{$_}'";
	$sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
	$rc = $sth->bind_columns(\$contig_seq_id);
	$sth->fetch;


#insert the quality values
	$stm="insert into unigene_assembly_sequence_quality (unigene_assembly_seq_id, unigene_assembly_id, quality_values) values ('$contig_seq_id', '$assembly_id{$_}', '$contig_qual{$_}')";
	$sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
}

#insert the assembly component info
my $contig;
my %contig_member_id=();
foreach $contig (keys %contig_members){

	$assembly_id{$contig} or next;

	foreach (keys %{$contig_members{$contig}}){

#get the seq_id for this trimmed_seq_id
		my $seq_id;
		$stm="select seq_id from trimmed_sequence where trimmed_seq_id='$_'";
		$sth = $dbh->prepare($stm) 
		|| die "Can't prepare statement: $DBI::errstr";
		$rv = $sth->execute
		|| die "Can't execute statement: $DBI::errstr";
		$rc = $sth->bind_columns(\$seq_id);
		$sth->fetch or die ("Couldn't find seq id for trimmed_seq_id=$_\n");


#set variables
		my $trimmed_seq_id = $_;
		my $unigene_assembly_id=$assembly_id{$contig};
		my $assembly_orientation=$contig_members{$contig}{$_}[0];
		if ($assembly_orientation eq '+'){
			$assembly_orientation = 'N';
		}
		if ($assembly_orientation eq '-'){
			$assembly_orientation = 'C';
		}
		my $read_direction = 'u';
		my $start_location = $contig_members{$contig}{$_}[1] + $contig_members{$contig}{$_}[3] - 1;
		my $end_location = $contig_members{$contig}{$_}[2] - $contig_members{$contig}{$_}[4];
		my $start_trim = $contig_members{$contig}{$_}[3];
		my $end_trim = $contig_members{$contig}{$_}[4];

		$contig_member_id{$_}=$trimmed_seq_id;

#insert into contig_components
		$stm="insert into unigene_assembly_component (seq_id, trimmed_seq_id, unigene_assembly_id, assembly_orientation, read_direction, start_location, end_location, start_trim, end_trim) values ('$seq_id', '$trimmed_seq_id', '$unigene_assembly_id', '$assembly_orientation', '$read_direction', '$start_location', '$end_location', '$start_trim', '$end_trim')";
		$sth = $dbh->prepare($stm) 
		|| die "Can't prepare statement: $DBI::errstr";
		$rv = $sth->execute
		|| die "Can't execute statement: $DBI::errstr";

		$contig_member_count++
	}
}



#load the singlets

#get a list of all the sequences assembly started with

my %singlet_seq_id=();
my ($trimmed_seq_id, $seq_group_id);

foreach (@lib_ids){
	$stm="select t.trimmed_seq_id, t.seq_id from trimmed_sequence as t left join quality_evaluation as q using (trimmed_seq_id)";
	unless ($libraries[0] eq 'all'){
		$stm .= " left join est_info as e using (seq_id)";
	}

	$stm .= " where q.qual_criteria_id='6'";

	unless ($libraries[0] eq 'all'){
		$stm .= " and e.est_library_id='$_'";
	}

	$sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
	$rc = $sth->bind_columns(\$trimmed_seq_id, \$seq_group_id);
	while($sth->fetch){
		$singlet_seq_id{$trimmed_seq_id}=$seq_group_id;
	}
}



#remove all the sequences that were assembled in contigs
foreach (keys %singlet_seq_id){
	if($contig_member_id{$_}){
		delete $singlet_seq_id{$_};
	}
}

#load the singlets
foreach (keys %singlet_seq_id){
	$stm="insert into unigene_singleton (seq_id, trimmed_seq_id, unigene_build_id) values ('$singlet_seq_id{$_}', '$_', '$build_id')";
	$sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

#get the singlet id
	my $singlet_id;
	$stm="select unigene_singleton_id from unigene_singleton where trimmed_seq_id='$_'";
	$sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
	$rc = $sth->bind_columns(\$singlet_id);
	$sth->fetch;

#insert into the unigene table
	$stm = "insert into unigenes (unigene_build_id, unigene_element_id, unigene_element_type_id) values ('$build_id', '$singlet_id', '2')";
	$sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
	$singlet_count++;
}


db_link::disconnect_db($dbh);


print "Loaded $contig_count contigs, $contig_member_count contig components, and $singlet_count singlets\n";
runtime::runtime_print($start_time, "Assembly upload");


