#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Teri Solow <tms45@cornell.edu>';

#local packages to use
use db_link;
use projects;


@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){
	$_ or next;
	my ($flag, $val)=split /\s+/;
	$args{$flag}=$val;
}

my $project=$args{'p'};
my $lib=$args{'l'};
my $seq_out_file=$args{'o'};
my $qual_out_file=$args{'q'};

$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";
$lib or die "No library specified, try '-l library_name (-l all to process all libraries)'\n";

my ($db, $usr) = @{projects::get_db_info($project)};
$db or die "No known database for project $project";


#set defaults for io, script filename, etc
my $seq_out_dir = "/data/shared/pgn_data_processing/unigene_builds/$project/$lib";
$seq_out_file ||= "$seq_out_dir/$lib.fasta";
$qual_out_file ||= "/data/shared/pgn_data_processing/unigene_builds/$project/$lib/$lib.fasta.qual";

#main body of script
#####################

# create the output folder, if it doesn't exist
if (!-e $seq_out_dir) {
	system("mkdir $seq_out_dir");
}

# try to open the database
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";
my ($stm, $sth, $rv, $rc);

#pull all sequences and relevant information for this library
my ($seq_id, $seq, $quality, $read_direction, $chemistry, $dye, $clone);
my %sequences=();

$stm = "select ts.trimmed_seq_id, ts.sequence_data, tq.quality_values, e.read_direction, s.chemistry, s.dye, o.external_id from trimmed_sequence as ts, trimmed_sequence_quality as tq, est_info as e, sequencing_information as s, other_identifier as o, quality_evaluation as q";

unless ($lib eq 'all'){
	$stm .=", est_library as l";
}

$stm.=" where ts.trimmed_seq_id=tq.trimmed_seq_id and ts.seq_id=e.seq_id and e.sequencing_info_id=s.sequencing_info_id and ts.seq_id=o.local_db_id and ts.trimmed_seq_id=q.trimmed_seq_id and q.qual_criteria_id='6'";

unless ($lib eq 'all'){
	$stm.=" and e.est_library_id=l.est_library_id and e.chimeric!=1 and l.library_name like '$lib%'";
}

$sth = $dbh->prepare($stm) or die "Can't prepare statement: $DBI::errstr";
$rv = $sth->execute or die "Can't execute statement: $DBI::errstr";
$rc = $sth->bind_columns(\$seq_id, \$seq, \$quality, \$read_direction, \$chemistry, \$dye, \$clone);

while ($sth->fetch){
	push @{$sequences{$seq_id}}, ($seq, ,$quality, $read_direction, $chemistry, $dye, $clone); 
}

#close the database link
db_link::disconnect_db($dbh);


#write out the files for phrap
open (SEQ_OUT, ">$seq_out_file");
open (QUAL_OUT, ">$qual_out_file");

#make the lookup tables for the expected nomenclature, see phrap.doc for details
my %direction_table=();
$direction_table{'5'}='fwd';
$direction_table{'3'}='rev';

my %chem_table=();
$chem_table{'terminator'}='term';
$chem_table{'primer'}='prim';
$chem_table{'unknown'}='unknown';

my %dye_table=();
$dye_table{'rhodamine'}='rhod';
$dye_table{'d-rhodamine'}='d-rhod';
$dye_table{'big-dye'}='big';
$dye_table{'energy-transfer'}='ET';
$dye_table{'bodipy'}='bodipy';
$dye_table{'unknown'}='unknown';

print "Found ".int(keys %sequences)." sequences.\n";

foreach (keys %sequences){
	
	my $defline=">$sequences{$_}[5]-$_ DIRECTION: $direction_table{$sequences{$_}[2]}  CHEM: $chem_table{$sequences{$_}[3]}  DYE: $dye_table{$sequences{$_}[4]}  TEMPLATE: $sequences{$_}[5]\n"; 

	print SEQ_OUT $defline . $sequences{$_}[0] . "\n";
	print QUAL_OUT $defline . $sequences{$_}[1] . "\n";
}

close SEQ_OUT;
close QUAL_OUT;


