#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Dan Ilut <dci1@cornell.edu>';

use lib '/data/shared/pgn_data_processing/scripts/perllib';

#local packages to use
use runtime;
use db_link;
use projects;


@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){

    $_ or next;
    my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}

my $lib=$args{'l'};
my $project=$args{'p'};
my $contig_file=$args{'i'};
my $contig_qual_file=$args{'q'};
my $alignment_file=$args{'a'};

$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";
$lib or die "Please specify a library to load using the -l flag\n";

my ($db, $usr) = @{projects::get_db_info($project)};
$db or die "No known database for project $project";

#set defaults for io, script filename, etc
$contig_file ||= "/tmp/${lib}.fasta.contigs";
$contig_qual_file ||= $contig_file.".qual";
$alignment_file ||= "/tmp/${lib}.alignment";




#main body of script
#####################

my $start_time=time;


#parse in the contig info
my %contig_seq=();
my %contig_qual=();
my %contig_members=();
my %contig_summary=();

my ($contig_nr, $dataline, $est_id);


#get the sequence
open FILEIN, "$contig_file";
while (<FILEIN>){
    chomp;
    if (/^>.+Contig([0-9]+)/){ 
        if ($contig_nr){
            $contig_seq{$contig_nr}=$dataline;
        }
        $contig_nr=$1;
        $dataline='';
        next;
    }
    $dataline.=$_;
}

$contig_seq{$contig_nr}=$dataline;

close FILEIN;

#get the quality
open FILEIN, "$contig_qual_file";
while (<FILEIN>){
    chomp;
    if (/^>.+Contig([0-9]+)/){ 
        if ($contig_nr){
            $contig_qual{$contig_nr}=$dataline;
        }
        $contig_nr=$1;
        $dataline='';
        next;
    }
    $dataline.=$_;
}

$contig_qual{$contig_nr}=$dataline;

close FILEIN;


#get the memebership and alignments

# Parse the alignment info into a data structure from the alignment file.
# Expected format for alignment file is:
#
# Library name
# *tab* contig nr, nr of reads, contig length (untrimmed), contig length (trimmed)
# *tab* *tab* est nr, est cornell id, reversed sequence (C=yes, N=no),
# direction (b=5', g=3'), start location within contig, end location
# within contig, seq trimmed for assemby at start, seq trimmed at end, 
# possible vector start, possible vector end, possible vector sequence

open FILEIN, $alignment_file;
while (<FILEIN>){

#skip empty and comment lines
    (/^\s*$/ or /^\#/) and next;

#check for the proper library
    if(/^Library ([^\s]+)/){
	$lib eq $1 or die "Wrong library: expected $lib, found $1";
	next;
    }

#read the contigs
    if(/^\tContig([0-9]+),\s+(.+)$/){
        $contig_nr=$1;
	@{$contig_summary{$contig_nr}}=split /, /, $2;
        next;       
    }

#read the member ESTs, and their info
    if(/^\t\t([0-9]+),\s+(.+)$/){
	$est_id=$1;
	@{$contig_members{$contig_nr}{$est_id}}=split /, /, $2;
	next;
    }
}
close FILEIN;

print "Got " . int (keys %contig_summary) . " contigs\n";
print "Got " . int (keys %contig_seq) . " contig seq\n";
print "Got " . int (keys %contig_qual) . " contig seq quality\n";

my ($membership_contigs, $member_count);
foreach (keys %contig_members){
    my $nr_members = int (keys %{$contig_members{$_}});
    $nr_members>1 or next;
    $membership_contigs++;
    $member_count+=$nr_members;
}

print "Got $membership_contigs contigs with more than one element\n";
print "Got $member_count members in those contigs\n";


# We need a source of random numbers to make temp file names.
# If we cannot get /dev/urandom, just give up.
open DEVRANDOM, "</dev/urandom"
  or die "sys error: Cannot open /dev/urandom\n";

#start loading the assembly info
# try to open the database
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";
my ($stm, $sth, $rv, $rc);


#get the library id
my $lib_id;
$stm = "select est_library_id from est_library where library_name='$lib'";
$sth = $dbh->prepare($stm) 
    || die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute
    || die "Can't execute statement: $DBI::errstr";
$rc = $sth->bind_columns(\$lib_id);
$sth->fetch;


##remove the previous entries for the assemblies
#################################################

my @assembly_id_to_remove;
my $assembly_id_topull;
$stm = "select lib_assembly_id from lib_assembly where est_library_id='$lib_id'";
$sth = $dbh->prepare($stm) 
    || die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute
    || die "Can't execute statement: $DBI::errstr";
$rc = $sth->bind_columns(\$assembly_id_topull);
while($sth->fetch){
    push @assembly_id_to_remove, $assembly_id_topull;
}

foreach (@assembly_id_to_remove){

    $stm = "delete from lib_assembly where lib_assembly_id='$_'";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

    $stm = "delete from lib_assembly_sequence where lib_assembly_id='$_'";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

    $stm = "delete from lib_assembly_sequence_quality where lib_assembly_id='$_'";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

    $stm = "delete from lib_assembly_component where lib_assembly_id='$_'";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

}

$stm = "delete from lib_assembly_singleton where est_library_id='$lib_id'";
$sth = $dbh->prepare($stm) 
    || die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute
    || die "Can't execute statement: $DBI::errstr";




#load the new data
###################

my ($contig_count, $contig_member_count, $singlet_count)=(0,0,0);

my %assembly_id=();

#load the contig summary
foreach (keys %contig_summary){

#skip those with just one member
    ($contig_summary{$_}[0] > 1) or next;

# Create temp load id
    my ($random, $random_string);
    read( DEVRANDOM, $random, 16);
    $random_string = unpack("h*",$random);

    $stm = "insert into lib_assembly (est_library_id, contig, nr_components, untrimmed_len, trimmed_len, load_id) values ('$lib_id', '$_', '$contig_summary{$_}[0]', '$contig_summary{$_}[1]', '$contig_summary{$_}[2]', '$random_string')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

#get the contig identifier for the inserted contig
    my $lib_assembly_id;
    $stm="select lib_assembly_id from lib_assembly where load_id='$random_string'";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
    $rc = $sth->bind_columns(\$lib_assembly_id);
    if($sth->fetch){
	$assembly_id{$_}=$lib_assembly_id;
    }

    $contig_count++;
}


#insert the sequence and quality
foreach (keys %contig_seq){

    $assembly_id{$_} or next;

#insert the sequence
    $stm="insert into lib_assembly_sequence (lib_assembly_id, sequence_data) values ('$assembly_id{$_}', '$contig_seq{$_}')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

#get the sequence id
    my $contig_seq_id;
    $stm="select lib_assembly_sequence_id from lib_assembly_sequence where lib_assembly_id='$assembly_id{$_}'";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
    $rc = $sth->bind_columns(\$contig_seq_id);
    $sth->fetch;


#insert the quality values
    $stm="insert into lib_assembly_sequence_quality (lib_assembly_sequence_id, lib_assembly_id, quality_values) values ('$contig_seq_id', '$assembly_id{$_}', '$contig_qual{$_}')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
}

#insert the assembly component info
my $contig;
my %contig_member_id=();
foreach $contig (keys %contig_members){

    $assembly_id{$contig} or next;

    foreach (keys %{$contig_members{$contig}}){
 
#get the seq_id for this trimmed_id
	my $seq_id;
	$stm="select seq_id from trimmed_sequence where trimmed_seq_id='$_'";
	$sth = $dbh->prepare($stm) 
	    || die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	    || die "Can't execute statement: $DBI::errstr";
	$rc = $sth->bind_columns(\$seq_id);
	if($sth->fetch){
	    $contig_member_id{$_}=$seq_id;
	}
	
#set no vector to empty string
	$contig_members{$contig}{$_}[9] ||= '';
	
#insert into contig_components
	$stm="insert into lib_assembly_component (seq_id, trimmed_seq_id, lib_assembly_id, assembly_orientation, read_direction, start_location, end_location, start_trim, end_trim, possible_vector_start, possible_vector_end, possible_vector_seq) values ('$seq_id', '$_', '$assembly_id{$contig}', '$contig_members{$contig}{$_}[1]', '$contig_members{$contig}{$_}[2]', '$contig_members{$contig}{$_}[3]', '$contig_members{$contig}{$_}[4]', '$contig_members{$contig}{$_}[5]', '$contig_members{$contig}{$_}[6]', '$contig_members{$contig}{$_}[7]', '$contig_members{$contig}{$_}[8]', '$contig_members{$contig}{$_}[9]')";
	$sth = $dbh->prepare($stm) 
	    || die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	    || die "Can't execute statement: $DBI::errstr";
	
	$contig_member_count++
    }
}

#load the singlets

#get a list of all the sequences assembly started with
#for the combined libraries, select all qualifying sequences

my %singlet_seq_id=();
my ($trimmed_seq_id, $seq_group_id);

$stm="select t.trimmed_seq_id, t.seq_id from trimmed_sequence as t, quality_evaluation as q";
($lib eq 'all') or $stm.=", est_info as e";
$stm.=" where t.trimmed_seq_id=q.trimmed_seq_id and q.qual_criteria_id='6'";
($lib eq 'all') or $stm.=" and t.seq_id=e.seq_id and e.est_library_id='$lib_id'";

$sth = $dbh->prepare($stm) 
    || die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute
    || die "Can't execute statement: $DBI::errstr";
$rc = $sth->bind_columns(\$trimmed_seq_id, \$seq_group_id);
while($sth->fetch){
    $singlet_seq_id{$trimmed_seq_id}=$seq_group_id;
}


#remove all the sequences that were assembled in contigs
foreach (keys %singlet_seq_id){
    if($contig_member_id{$_}){
	delete $singlet_seq_id{$_};
    }
}

#load the singlets
foreach (keys %singlet_seq_id){
    $stm="insert into lib_assembly_singleton (seq_id, trimmed_seq_id, est_library_id) values ('$singlet_seq_id{$_}', '$_', '$lib_id')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

    $singlet_count++;
}


db_link::disconnect_db($dbh);


print "Loaded $contig_count contigs, $contig_member_count contig components, and $singlet_count singlets\n";
 runtime::runtime_print($start_time, "Assembly upload");


