#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Dan Ilut <dci1@cornell.edu>';

use lib '/data/shared/pgn_data_processing/scripts/perllib';

#local packages to use
use runtime;
use db_link;
use projects;


#Turn a list of command-line tokens into a flag => value hash.
#
#  arguments: the tokens to parse (normally @ARGV)
#  returns:   a list of flag/value pairs suitable for assigning to a hash;
#             the leading dash(es) are stripped from the flag name and a
#             flag that is not followed by a value maps to undef
#
#Tokens are walked in order rather than joined and re-split on '-', so a
#value may contain hyphens (/tmp/run-12/seqs) or spaces ("clone name").
#A token that does not start with '-' and does not directly follow a flag
#is ignored.
sub parse_flag_args {
    my @tokens = @_;
    my %parsed = ();

    my $index = 0;
    while ($index < @tokens){
        my $token = $tokens[$index++];
        defined $token or next;

        #stray value without a flag in front of it: nothing to attach it to
        $token =~ /\A-+(.+)\z/s or next;
        my $flag = $1;

        #the next token is this flag's value unless it is itself a flag
        my $val;
        if ($index < @tokens
            and defined $tokens[$index]
            and $tokens[$index] !~ /\A-/){
            $val = $tokens[$index++];
        }
        $parsed{$flag} = $val;
    }

    return %parsed;
}

@ARGV or print "No input parameters, proceeding with default.\n";

my %args = parse_flag_args(@ARGV);

#Script parameters, one per command-line flag.  Anything left unset (or set
#to a false value) is replaced by the defaults further down.
my $seq_in_file=$args{'i'};       # -i  sequence file to load
my $qual_in_file=$args{'q'};      # -q  quality value file to load
my $project=$args{'p'};           # -p  project name (required)
my $seq_type=$args{'t'};          # -t  description looked up in sequence_type
my $seq_format=$args{'f'};        # -f  description looked up in sequence_format
my $sequencing_info=$args{'s'};   # -s  value stored as est_info.sequencing_info_id
my $read_direction=$args{'r'};    # -r  value stored as est_info.read_direction
my $other_id_type=$args{'o'};     # -o  description looked up in identifier_type
my $source_info_dir=$args{'a'};   # -a  directory holding the <loc>_files.txt lists


$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";

#Only dereference the lookup result when there is one, so that an unknown
#project reaches the explicit error below instead of failing on the
#dereference itself.
my $db_info = projects::get_db_info($project);
my ($db, $usr) = $db_info ? @{$db_info} : ();
$db or die "No known database for project $project";


#set defaults for io, script filename, etc
$seq_in_file ||="/tmp/basecall_seqs_fasta";
$qual_in_file ||="/tmp/basecall_seqs_fasta.qual";
$seq_type ||= 'nucleotide';
$seq_format ||= 'FASTA';
$sequencing_info ||= '1';
$read_direction ||= '5';
$other_id_type ||= 'clone name';
$source_info_dir ||= '/data/shared/pgn_data_processing/incoming_files/'.$project.'/file_source/';


#parameter that decides if we try to read sequencing info from disk
#or assign sequencing facility info in script
# set to 0 on 2003-03-03 by dan

my $use_seqinfo_on_disk = 0;


#main body of script
#####################

my $start_time=time;

# try to open the database
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";

#Read a FASTA-style file into a hash keyed by record id.
#
#  arguments: $path - file to read
#             $what - short label for the file, used in the error message
#  returns:   reference to a hash of record id => concatenated data lines
#  dies:      if the file cannot be opened
#
#The record id is the first run of non-whitespace characters after '>'.
#Lines that appear before the first '>' line are discarded.  When an id
#occurs more than once the last record wins.
#
#NOTE(review): data lines are joined without any separator.  For a quality
#file this fuses the last value of one line with the first value of the
#next unless every line ends in whitespace - confirm against the format of
#the incoming .qual files.
sub read_fasta_hash {
    my ($path, $what) = @_;

    open(my $fh, '<', $path)
	or die "Couldn't read $what file $path: $!\n";

    my %data_for = ();
    my $current_id;

    while (my $line = <$fh>){
	chomp $line;
	if ($line =~ /^>(\S+)/){
	    #start of a new record; the description after the id is dropped
	    $current_id = $1;
	    $data_for{$current_id} = '';
	    next;
	}
	defined $current_id or next;
	$data_for{$current_id} .= $line;
    }

    close $fh;

    return \%data_for;
}

#read in and hash the sequence and quality files
#each file is parsed with its own state, so nothing read from the sequence
#file can end up in the quality hash
my %sequences = %{ read_fasta_hash($seq_in_file, 'sequence') };
my %quality   = %{ read_fasta_hash($qual_in_file, 'quality') };

#file-scoped because the loading loop uses $defline as its loop variable
my $defline = '';

my $nr_seq  = scalar keys %sequences;
my $nr_qual = scalar keys %quality;

print "Read $nr_seq sequences and $nr_qual sets of quality values.\n";

unless ($nr_seq == $nr_qual){
    die "Seq - qual missmatch: $nr_seq sequences, $nr_qual quality value entries\n";
}

#equal counts are not enough: every sequence needs its own quality entry
my @without_quality = grep { !exists $quality{$_} } sort keys %sequences;
if (@without_quality){
    die "Seq - qual mismatch: no quality values for "
	. scalar(@without_quality)
	. " sequence(s), first one is $without_quality[0]\n";
}




#figure out sequencing facility information
#currently only FGN has multiple sequencing facilities

#%lib_source:     folder name  => sequencing location code, read from disk
#%lib_sequencing: library name => sequencing location code, set in script
#Both stay empty for every project other than fgn.
my %lib_source=();
my %lib_sequencing=();


#####################################
# START FGN BRANCH                  #
#####################################
if ($project eq 'fgn'){
    if ($use_seqinfo_on_disk){

#FGP: get sequence source for fgp files
#each <location>_files.txt lists one folder name per line
	foreach my $loc ('uf', 'ps'){
	    my $file=$source_info_dir.$loc."_files.txt";
	    open(my $list_fh, '<', $file)
		or die "Couldn't read file $file: $!\n";
	    while (my $entry = <$list_fh>){
		#strip the newline first, otherwise a blank line is never empty
		chomp $entry;
		length $entry or next;
		$lib_source{$entry}=$loc;
	    }
	    close $list_fh;
	}
    }
    else{
#create lib_based location hash - temp solution until we can look up the info in the LIMS system
#dan - 26/03/03

	%lib_sequencing = (
	    'Eca01' => 'ps',
	    'Eca03' => 'ps',
	    'Nad01' => 'uf',
	    'Nad03' => 'ps',
	    'Pam01' => 'uf',
	    'Wmi01' => 'ps',
	    'Atr01' => 'uf',
	    'Ltu01' => 'ps',
	    'Wmi02' => 'ps',
	    'Atr02' => 'uf',
	    'Zfi01' => 'uf',
	    'Csa01' => 'ps',
	    'Csa02' => 'ps',
	    'Aam01' => 'uf',
	    'She01' => 'ps',
	);
    }
}
#####################################
# END FGN BRANCH                    #
#####################################

#Small DBI helpers used by the loading code below
###################################################

#prepare $sql on $handle and return the statement handle; dies with the
#DBI error text if the statement cannot be prepared
sub _prepare_or_die {
    my ($handle, $sql) = @_;
    my $statement = $handle->prepare($sql)
	|| die "Can't prepare statement: $DBI::errstr";
    return $statement;
}

#execute a prepared statement with the given bind values; dies with the
#DBI error text if execution fails
sub _execute_or_die {
    my ($statement, @bind_values) = @_;
    $statement->execute(@bind_values)
	|| die "Can't execute statement: $DBI::errstr";
    return;
}

#execute a prepared select and return its first row as a list
#(an empty list when the query matched nothing)
sub _fetch_first_row {
    my ($statement, @bind_values) = @_;
    _execute_or_die($statement, @bind_values);
    my @row = $statement->fetchrow_array;
    #done with this result set; lets the handle be executed again cleanly
    $statement->finish;
    return @row;
}


#Every statement is prepared once, with placeholders.  Values are bound at
#execute time, so quotes or backslashes in a defline, a file name or a
#command-line flag can no longer break (or inject into) the SQL.
my $find_trace_sth = _prepare_or_die($dbh,
    "select seq_id from tracefile_location where trace_name=? and folder_name=?");
my $find_library_sth = _prepare_or_die($dbh,
    "select est_library_id from est_library where library_name=?");
my $add_group_sth = _prepare_or_die($dbh,
    "insert into sequence_group (tmp_loading_id) values (?)");
my $find_group_sth = _prepare_or_die($dbh,
    "select seq_id from sequence_group where tmp_loading_id=?");
my $add_trace_sth = _prepare_or_die($dbh,
    "insert into tracefile_location (seq_id, trace_name, folder_name) values (?, ?, ?)");
my $add_sequence_sth = _prepare_or_die($dbh,
    "insert into raw_sequence (seq_id, sequence_data, data_type, data_format) values (?, ?, ?, ?)");
my $find_raw_seq_sth = _prepare_or_die($dbh,
    "select raw_seq_id from raw_sequence where seq_id=?");
my $add_quality_sth = _prepare_or_die($dbh,
    "insert into raw_sequence_quality (seq_id, raw_seq_id, quality_values) values (?, ?, ?)");
my $add_est_sth = _prepare_or_die($dbh,
    "insert into est_info (seq_id, read_direction, est_library_id, sequencing_info_id) values (?, ?, ?, ?)");
my $add_identifier_sth = _prepare_or_die($dbh,
    "insert into other_identifier (local_db_id, external_id, external_id_type) values (?, ?, ?)");


#retrieve seq_type id, seq_format id and identifier type id
#These do not depend on the record, so they are looked up once, and a
#missing entry stops the script before anything has been inserted.
my ($seq_type_id) = _fetch_first_row(
    _prepare_or_die($dbh, "select seq_type_id from sequence_type where description=?"),
    $seq_type);
defined $seq_type_id
    or die "Couldn't find sequence type id matching $seq_type\n";

my ($seq_format_id) = _fetch_first_row(
    _prepare_or_die($dbh, "select seq_format_id from sequence_format where description=?"),
    $seq_format);
defined $seq_format_id
    or die "Couldn't find sequence format id matching $seq_format\n";

my ($other_type_id) = _fetch_first_row(
    _prepare_or_die($dbh, "select identifier_type_id from identifier_type where identifier_description=?"),
    $other_id_type);
defined $other_type_id
    or die "Couldn't find identifier type id matching $other_id_type\n";

#library name => est_library_id, filled in as libraries are first seen
my %library_id_for = ();


#Load the records one by one.  Keys are sorted only so that the loading
#order (and therefore the log output) is the same from run to run.
foreach $defline (sort keys %sequences){

#a defline has the form <folder name>##<trace name>
    my ($folder_name, $trace_name)=split (/\#\#/, $defline);
    unless (defined $folder_name and defined $trace_name){
	print "Couldn't find folder and trace name in $defline, skipping\n";
	next;
    }

#check to see if tracefile exists, skip if so
    my @existing_trace = _fetch_first_row($find_trace_sth, $trace_name, $folder_name);
    if (@existing_trace){
	print "Trace file already exists for $trace_name from $folder_name, skipping\n";
	next;
    }


#Find the library name
#######################
#NOTE(review): only the cgn and fgn branches set $lib_name.  For any other
#project it stays empty and the library lookup further down fails unless a
#library with an empty name exists.
    my $lib_name='';

#sequencing info for this record; starts from the -s value (or its default)
#and is overridden per project below
    my $record_sequencing_info = $sequencing_info;

#####################################
# START CGN BRANCH                  #
#####################################
    if($project eq 'cgn'){
#find out the plate number, cccp3 and cccp4 are MWG, all else BRC
#The override is stored in the per-record value that the est_info insert
#uses; previously it went into a variable private to this branch and was
#never used.
	$trace_name=~/^cccp[34]/
	    and $record_sequencing_info=2;

#get the library name
	unless ($trace_name=~/([^_]+[^0-9]+)[0-9]+[A-Za-z]+[0-9]+\./){
	    print "Couldn't find library name from $trace_name, skipping\n";
	    next;
	}

	$lib_name=$1;
    }
#####################################
# END CGN BRANCH                    #
#####################################


#####################################
# START FGN BRANCH                  #
#####################################
    if ($project eq 'fgn'){

#get the library name
	unless ($trace_name=~/^(([^-]+)-[^-]+)-[^\.]+\./){
	    print "Couldn't find library name from $trace_name, skipping\n";
	    next;
	}
	$lib_name = $2;

#the per-folder list wins; otherwise fall back to the per-library table,
#keyed by the part of the folder name before the first dash
	my $source = $lib_source{$folder_name};
	unless($source){
	    #only use the capture when the match succeeded
	    if ($folder_name =~ /^([^\-]+)-/){
		$source = $lib_sequencing{$1};
	    }
	}

	unless($source){
	    print "Couldn't find source for $trace_name, skipping\n";
	    next;
	}

	($source eq 'ps') and $record_sequencing_info=1;
	($source eq 'uf') and $record_sequencing_info=2;
    }
#####################################
# END FGN BRANCH                    #
#####################################


#find the library id
#Done before the first insert so that an unknown library stops the script
#without leaving a half-loaded record behind.
    unless (exists $library_id_for{$lib_name}){
	my @library_row = _fetch_first_row($find_library_sth, $lib_name);
	@library_row
	    or die "Couldn't find library id matching $lib_name\n";
	$library_id_for{$lib_name} = $library_row[0];
    }
    my $lib_id = $library_id_for{$lib_name};


# Do all the SQL needed to load the data
# Note that there is no robust rollback if one of the steps fail
# If you need to roll back a loading batch, use the plate removal script
# Plate removal script is /data/shared/pgn_data_processing/scripts/data_manipulation/pull_plate.pl (as of 2003-10-14)

#NB. Some of this could be done much more efficiently using mysql specific syntax


#create sequence group
    _execute_or_die($add_group_sth, $defline);

#retrieve seq_id
    my ($seq_id) = _fetch_first_row($find_group_sth, $defline);
    defined $seq_id
	or die "Couldn't retrieve seq_id for $defline\n";

#insert trace names
    _execute_or_die($add_trace_sth, $seq_id, $trace_name, $folder_name);

#insert sequences
    _execute_or_die($add_sequence_sth,
		    $seq_id, $sequences{$defline}, $seq_type_id, $seq_format_id);

#retrieve raw_seq_id
    my ($raw_seq_id) = _fetch_first_row($find_raw_seq_sth, $seq_id);
    defined $raw_seq_id
	or die "Couldn't retrieve raw_seq_id for $defline\n";

#insert quality values
#a missing entry is stored as an empty string, which is what the
#interpolated SQL used to produce
    my $quality_values = defined $quality{$defline} ? $quality{$defline} : '';
    _execute_or_die($add_quality_sth, $seq_id, $raw_seq_id, $quality_values);

#insert est info
    _execute_or_die($add_est_sth,
		    $seq_id, $read_direction, $lib_id, $record_sequencing_info);

#insert other identifier (clone name)
#the clone name is the trace name up to its first dot
    my $clone_name=$trace_name;
    $clone_name=~s/^([^\.]+)\..*$/$1/;

    _execute_or_die($add_identifier_sth, $seq_id, $clone_name, $other_type_id);

}


db_link::disconnect_db($dbh);

runtime::runtime_print($start_time, "Database upload");


