#!/usr/bin/perl -w
use strict;

#person in charge of this script
#this should be the person currently in charge of scripts for the SGN project
my $script_maintainer='Teri Solow <tms45@cornell.edu>';

#local packages to use
use runtime;
use db_link;
use projects;

use DBI;
use IO::File;


if (!$ARGV[0] || $ARGV[0] eq "help") {
	print <<EOF;

  Program to extract sequences from the "est" table of the database, for
  the purpose of homology-based chimera screening against the arabidopsis
  proteome.

  This program works as follows:

  ESTs are traced to their clones. If matching 3' (or 5') reads are 
  found, the paired ends created for chimera screening is created by
  selecting the leading portions of each matching read.

  If the EST does not have a matching 3' (or 5') read, the longest read 
  available is used, end portions selected but forced *not* to overlap.

  FASTA files are output with end lengths of 150, 200, 250, and 300bp.
  Header lines have identifiers such as SGN-Ennnnn:[53]

  Usage: -p <project> -l <library>

  Note: Output files will be named <output basename>-150.seq, 
		<output basename>-200.seq, etc.

EOF

	exit -1;
}

@ARGV or print "No input parameters, proceeding with default.\n";

my @arg_pairs = split (/\-/, (join ' ', @ARGV));

my %args=();

foreach (@arg_pairs){
	$_ or next;
	my ($flag, $val)=split /\s+/;
	$args{$flag}=$val;
}

my $lib=$args{'l'};
my $project=$args{'p'};
my $output_basename="/data/shared/pgn_data_processing/unigene_builds/$project/chimera_screen/$lib-chimera_prescreen";

$lib or die "No library specified, try '-l library_name (-l all to process all libraries)'\n";
$project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n";

my ($db, $usr) = @{projects::get_db_info($project)};
$db or die "No known database for project $project";
my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n";

# query the database for the requisite information
my $estq = $dbh->prepare("select e.seq_id, e.read_direction, t.sequence_data from est_info as e left join est_library as l on l.est_library_id=e.est_library_id left join trimmed_sequence as t on e.seq_id=t.seq_id where l.library_name like '$lib%' and t.sequence_data is not null");

my ($est, $dir, $seq);
my %est = ();
my %est_dir = ();
my %est_seq = ();

$estq->execute();
while(($est, $dir, $seq) = $estq->fetchrow_array()) {
	$est{$est} = $est;
	$est_dir{$est} = $dir;
	$est_seq{$est} = $seq;
}

# Open output files
my ($i, $s);
my @fh;
for($i=0,$s=150;$i<4;$i++,$s+=50) {
	$fh[$i] = new IO::File ">$output_basename-$s.seq"
		or die "Failed to open output file \"$output_basename-$s.seq\" ($!)";
}

foreach ( %est ) {
	my $max_cut = length($est_seq{$_})/2;
	for($i=0,$s=150;$i<4;$i++,$s+=50) {
		# After thought: some reads (nearly 10,000) are shorter than 300bp. We 
		# only discard if they are less than 150bp. Thus, to ensure they get into
		# at least one screening, on the first file we just break the sequence in
		# half.
		last if $s > $max_cut && $i>0;
		$s=$max_cut if ($s > $max_cut);
		# look at read direction
		if ($est_dir{$_} == "5" ) {
			$fh[$i]->print(">$_:5\n",substr($est_seq{$_},0,$s),"\n");
			$fh[$i]->print(">$_:3\n",substr($est_seq{$_},-1*$s),"\n");
		}
		elsif ($est_dir{$_} == "3" ) {
			$fh[$i]->print(">$_:3\n",substr($est_seq{$_},0,$s),"\n");
			$fh[$i]->print(">$_:5\n",substr($est_seq{$_},-1*$s),"\n");
		}
	}
}

# close output files
for($i=0;$i<4;$i++) {
	$fh[$i]->close();
}
