#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Teri Solow <tms45@cornell.edu>';

use lib '/soldb/www/perllib';

#local packages to use
use runtime;
use db_link;
use projects;


@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){

    $_ or next;
    my ($flag, $val)=split /\s+/;
    $args{$flag}=$val;
}

my $in_dir=$args{'i'};
my $project=$args{'p'};
my $build_id=$args{'b'};

$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";
$build_id or die "No build id specified, please specify one using the -b flag\n";


my ($db, $usr) = @{projects::get_db_info($project)};
$db or die "No known database for project $project";

#set defaults for io, script filename, etc
$in_dir ||= "/tmp/";

#uniform trailing slashes
$in_dir=~/\/$/ or $in_dir.='/';


my $contig_file=$in_dir.$project."_all_passed_seq.fasta.contigs";
my $contig_qual_file=$contig_file.".qual";
my $alignment_file=$in_dir.$project."_unigene.alignment";

########DEBUG###########
#print "$contig_file\n$contig_qual_file\n$alignment_file\n"; 


my $script_file=__FILE__;


#main body of script
#####################

my $start_time=time;


#parse in the contig info
my %contig_seq=();
my %contig_qual=();
my %contig_members=();
my %contig_summary=();

my ($contig_nr, $dataline, $est_id);


#get the sequence
open FILEIN, "$contig_file";
while (<FILEIN>){
    chomp;
    if (/^>.+Contig([0-9]+)/){ 
        if ($contig_nr){
            $contig_seq{$contig_nr}=$dataline;
        }
        $contig_nr=$1;
        $dataline='';
        next;
    }
    $dataline.=$_;
}

$contig_seq{$contig_nr}=$dataline;

close FILEIN;

#get the quality
open FILEIN, "$contig_qual_file";
while (<FILEIN>){
    chomp;
    if (/^>.+Contig([0-9]+)/){ 
        if ($contig_nr){
            $contig_qual{$contig_nr}=$dataline;
        }
        $contig_nr=$1;
        $dataline='';
        next;
    }
    $dataline.=$_;
}

$contig_qual{$contig_nr}=$dataline;

close FILEIN;


#get the memebership and alignments

# Parse the alignment info into a data structure from the alignment file.
# Expected format for alignment file is:
#
# Library name
# *tab* contig nr, nr of reads, contig length (untrimmed), contig length (trimmed)
# *tab* *tab* est nr, est cornell id, reversed sequence (C=yes, N=no),
# direction (b=5', g=3'), start location within contig, end location
# within contig, seq trimmed for assemby at start, seq trimmed at end, 
# possible vector start, possible vector end, possible vector sequence

open FILEIN, $alignment_file;
while (<FILEIN>){

#skip empty and comment lines
    (/^\s*$/ or /^\#/) and next;

#check for the proper project
    if(/^Project ([^\s]+)/){
	$project eq $1 or die "Wrong project: expected $project, found $1";
	next;
    }

#read the contigs
    if(/^\tContig([0-9]+),\s+(.+)$/){
        $contig_nr=$1;
	@{$contig_summary{$contig_nr}}=split /, /, $2;
        next;       
    }

#read the member ESTs, and their info
    if(/^\t\t([0-9]+),\s+(.+)$/){
	$est_id=$1;
	@{$contig_members{$contig_nr}{$est_id}}=split /, /, $2;
	next;
    }
}
close FILEIN;

print "Got " . int (keys %contig_summary) . " contigs\n";
print "Got " . int (keys %contig_seq) . " contig seq\n";
print "Got " . int (keys %contig_qual) . " contig seq quality\n";

my ($membership_contigs, $member_count);
foreach (keys %contig_members){
    my $nr_members = int (keys %{$contig_members{$_}});
    $nr_members>1 or next;
    $membership_contigs++;
    $member_count+=$nr_members;
}

print "Got $membership_contigs contigs with more than one element\n";
print "Got $member_count members in those contigs\n";


# We need a source of random numbers to make temp file names.
# If we cannot get /dev/urandom, just give up.
open DEVRANDOM, "</dev/urandom"
  or die "sys error: Cannot open /dev/urandom\n";

#start loading the assembly info
# try to open the database
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";
my ($stm, $sth, $rv, $rc);




#load the new data
###################

my ($contig_count, $contig_member_count, $singlet_count)=(0,0,0);

my %assembly_id=();

#load the contig summary
foreach (keys %contig_summary){

#skip those with just one member
    ($contig_summary{$_}[0] > 1) or next;

# Create temp load id
    my ($random, $random_string);
    read( DEVRANDOM, $random, 16);
    $random_string = unpack("h*",$random);

    $stm = "insert into unigene_assembly (contig, nr_components, untrimmed_len, trimmed_len, load_id) values ('$_', '$contig_summary{$_}[0]', '$contig_summary{$_}[1]', '$contig_summary{$_}[2]', '$random_string')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

#get the contig identifier for the inserted contig
    my $unigene_assembly_id;
    $stm="select unigene_assembly_id from unigene_assembly where load_id='$random_string'";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
    $rc = $sth->bind_columns(\$unigene_assembly_id);
    if($sth->fetch){
	$assembly_id{$_}=$unigene_assembly_id;
    }


#insert into the unigene table
    $stm = "insert into unigenes (unigene_build_id, unigene_source_id, unigene_source_table) values ('$build_id', '$unigene_assembly_id', 'unigene_assembly')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

    $contig_count++;
}


#insert the sequence and quality
foreach (keys %contig_seq){

    $assembly_id{$_} or next;

#insert the sequence
    $stm="insert into unigene_assembly_sequence (unigene_assembly_id, sequence_data) values ('$assembly_id{$_}', '$contig_seq{$_}')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

#get the sequence id
    my $contig_seq_id;
    $stm="select unigene_assembly_seq_id from unigene_assembly_sequence where unigene_assembly_id='$assembly_id{$_}'";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
    $rc = $sth->bind_columns(\$contig_seq_id);
    $sth->fetch;


#insert the quality values
    $stm="insert into unigene_assembly_sequence_quality (unigene_assembly_seq_id, unigene_assembly_id, quality_values) values ('$contig_seq_id', '$assembly_id{$_}', '$contig_qual{$_}')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
}

#insert the assembly component info
my $contig;
my %contig_member_id=();
foreach $contig (keys %contig_members){

    $assembly_id{$contig} or next;

    foreach (keys %{$contig_members{$contig}}){
 
#get the seq_id for this trimmed_id
	my $seq_id;
	$stm="select seq_id from trimmed_sequence where trimmed_seq_id='$_'";
	$sth = $dbh->prepare($stm) 
	    || die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	    || die "Can't execute statement: $DBI::errstr";
	$rc = $sth->bind_columns(\$seq_id);
	if($sth->fetch){
	    $contig_member_id{$_}=$seq_id;
	}
	
#set no vector to empty string
	$contig_members{$contig}{$_}[9] ||= '';
	
#insert into contig_components
	$stm="insert into unigene_assembly_component (seq_id, trimmed_seq_id, unigene_assembly_id, assembly_orientation, read_direction, start_location, end_location, start_trim, end_trim, possible_vector_start, possible_vector_end, possible_vector_seq) values ('$seq_id', '$_', '$assembly_id{$contig}', '$contig_members{$contig}{$_}[1]', '$contig_members{$contig}{$_}[2]', '$contig_members{$contig}{$_}[3]', '$contig_members{$contig}{$_}[4]', '$contig_members{$contig}{$_}[5]', '$contig_members{$contig}{$_}[6]', '$contig_members{$contig}{$_}[7]', '$contig_members{$contig}{$_}[8]', '$contig_members{$contig}{$_}[9]')";
	$sth = $dbh->prepare($stm) 
	    || die "Can't prepare statement: $DBI::errstr";
	$rv = $sth->execute
	    || die "Can't execute statement: $DBI::errstr";
	
	$contig_member_count++
    }
}

#load the singlets

#get a list of all the sequences assembly started with

my %singlet_seq_id=();
my ($trimmed_seq_id, $seq_group_id);

$stm="select t.trimmed_seq_id, t.seq_id from trimmed_sequence as t, quality_evaluation as q";
$stm.=" where t.trimmed_seq_id=q.trimmed_seq_id and q.qual_criteria_id='6'";
$sth = $dbh->prepare($stm) 
    || die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute
    || die "Can't execute statement: $DBI::errstr";
$rc = $sth->bind_columns(\$trimmed_seq_id, \$seq_group_id);
while($sth->fetch){
    $singlet_seq_id{$trimmed_seq_id}=$seq_group_id;
}


#remove all the sequences that were assembled in contigs
foreach (keys %singlet_seq_id){
    if($contig_member_id{$_}){
	delete $singlet_seq_id{$_};
    }
}

#load the singlets
foreach (keys %singlet_seq_id){
    $stm="insert into unigene_singleton (seq_id, trimmed_seq_id, unigene_build_id) values ('$singlet_seq_id{$_}', '$_', '$build_id')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";

#get the singlet id
    my $singlet_id;
    $stm="select unigene_singleton_id from unigene_singleton where seq_id='$singlet_seq_id{$_}'";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
    $rc = $sth->bind_columns(\$singlet_id);
    $sth->fetch;

#insert into the unigene table
    $stm = "insert into unigenes (unigene_build_id, unigene_source_id, unigene_source_table) values ('$build_id', '$singlet_id', 'unigene_singleton')";
    $sth = $dbh->prepare($stm) 
	|| die "Can't prepare statement: $DBI::errstr";
    $rv = $sth->execute
	|| die "Can't execute statement: $DBI::errstr";
    $singlet_count++;
}


db_link::disconnect_db($dbh);


print "Loaded $contig_count contigs, $contig_member_count contig components, and $singlet_count singlets\n";
 runtime::runtime_print($start_time, "Assembly upload");


