#!/usr/bin/perl -w

=head1 NAME

intron_finder_command_line.pl

=head1 DESCRIPTION

Determines if there are possible intron sites in transcript sequences by aligning sequences to Arabidopsis using BLAST and then parsing the introns from the known intron positions in Arabidopsis. 

parameters: 

=over 5


=item -i

input file (fasta) [required]

=item -o

output file (? format) [required]

=item -H 

dbhost [default: db.cornell.edu]

=item -D

database name [default: cxgn]

=item -w 

web output (goes to STDOUT instead of file)

=item -f

optional gene feature file (format: ....)

=item -p

protein database for blasting. This file should be Arabidopsis and the gene feature file needs to correspond to the sequences in this file. (Download both from TAIR).

=item -e

maximum e-value for reporting a match

=item -s

simple output format (ID tab direction tab intron_pos tab intron_pos ... newline)

=back

Structure of feature file. This file can be downloaded from the TAIR website at:
ftp://ftp.arabidopsis.org/ ... somewhere.

chromosome      gene name       TAIR accession  feature start   stop    length  orientation
1       AT1G05850.1     gene:2198687    GENE    1766502 1768662 2161    reverse
1       AT1G05850.1     gene:2198687    exon    1766502 1767249 748     reverse
1       AT1G05850.1     gene:2198687    ORF     1766832 1768116 1285    reverse
1       AT1G05850.1     gene:2198687    coding_region   1766832 1767249 418     reverse
1       AT1G05850.1     gene:2198687    exon    1767351 1767510 160     reverse
1       AT1G05850.1     gene:2198687    coding_region   1767351 1767510 160     reverse
1       AT1G05850.1     gene:2198687    exon    1767729 1768150 422     reverse
1       AT1G05850.1     gene:2198687    coding_region   1767729 1768116 388     reverse
1       AT1G05850.1     gene:2198687    exon    1768547 1768662 116     reverse
1       AT1G05770.1     gene:2198692    GENE    1725546 1726249 704     reverse


=head1 AUTHOR

Emil Keyder


=cut

#use lib '/home/erk23/perl_modules';
use Bio::SearchIO;
use Bio::SeqIO;
use CXGN::DB::InsertDBH;
use strict;
use POSIX qw(ceil);
use Getopt::Std;
use File::Spec;
use CXGN::VHost;
my $vhost_conf=CXGN::VHost->new();
#
# the default protein database to compare against can be changed here
#
my $DEFAULT_PROTEIN_DATABASE_STRING = $vhost_conf->get_conf('intron_finder_database')."/ATH1.pep.04172003";

#my $DEFAULT_PROTEIN_DATABASE_STRING = "/data/shared/intron_finder_database/ATH1.pep.04172003";

#
# the default gene feature file can be changed here
#
my $DEFAULT_GENE_FEATURE_FILE = File::Spec->catfile($vhost_conf->get_conf('intron_finder_database'), "SV_gene_feature.data");

#my $DEFAULT_GENE_FEATURE_FILE = "/data/shared/intron_finder_database/SV_gene_feature.data";

#
# where blast is on the system
#
my $blast_location = $vhost_conf->get_conf('blast_path')."/blastall";


my $rnd = rand(1e15);

# input file, output file, e value, progress messages
use vars qw($opt_i $opt_o $opt_e $opt_p $opt_f $opt_w $opt_D $opt_H $opt_s); 

getopts('i:o:e:p:f:wH:D:s');

my $arabidopsis_url = "http://www.arabidopsis.org/servlets/Search?type=general&name=GENEID&action=detail&method=4&sub_type=gene";

if (!$opt_H) { 
    $opt_H = "db.sgn.cornell.edu";
}
if (!$opt_D) { 
    $opt_D = "cxgn"; 
}

#my ($infile) = @ARGV;

if(!$opt_i || ((!$opt_o) && (!$opt_w)))
{

    print <<TEXT;

Required parameter missing.

Required parameters:
    -i     input file
    -o     output file
Optional parameters
    -e     e value for blast (default is 1e-50)
    -p     protein database to use for intron detection (default is
           $DEFAULT_PROTEIN_DATABASE_STRING which must be in the current directory
           if another database is not specified)
    -f     gene feature file to use (default is $DEFAULT_GENE_FEATURE_FILE)
           Please ensure that you have a gene feature file in the TAIR format
           to go with any alternate databases you use.
           The arabidopsis features file can be downloaded from tair.org

TEXT

#    -w is secret switch for web version
#    print ("\t-w\t -w flag - output to stdout rather than a file. hidden, only for use by web version.");

    exit;
}

my $protein_db;
my $gene_feature_file;

if($opt_p) { $protein_db = $opt_p; }
else { $protein_db = $DEFAULT_PROTEIN_DATABASE_STRING;}

if($opt_f) { $gene_feature_file = $opt_f; }
else { $gene_feature_file = $DEFAULT_GENE_FEATURE_FILE;}

my $infile = $opt_i;
my $outfile = $opt_o;


open (ALRES, ">".$outfile);

# print ALRES "ALRES is $outfile\n";
# print ALRES "infile is $infile\n";

my $e_value;
if ($opt_e) 
{    $e_value = $opt_e; }
else
{    $e_value = '1e-50'; }

#
# Set temporary path
#
#

my $TEMP_FILE_DIR;

if($opt_w)
{
    $TEMP_FILE_DIR = $vhost_conf->get_conf('basepath').$vhost_conf->get_conf('tempfiles_subdir')."/intron_detection";
    print ALRES "set temp to ", $TEMP_FILE_DIR, "\n";
}
else
{
    $TEMP_FILE_DIR = ".";
#    print ALRES "set temp to ", $TEMP_FILE_DIR;
}

warn "THE TEMP_FILE_DIR is $TEMP_FILE_DIR\n";

my @lines;
my $clone_name = "";
my $stripped;

#my $db = DBI -> connect ("dbi:mysql:database=sgn;host=localhost", 'web_usr', 'tomato', { RaiseError => 1 });
my $dbh = CXGN::DB::InsertDBH->new( {dbname => $opt_D,
				    dbhost => $opt_H,
				    dbuser => "postgres"
				    });
if(!$dbh)
{
    &printWebOrFile ("Error connecting to database, please try again later\n");
    &programExit("Couldn't connect to database, please try again later.");
}    

my $clone_query = $dbh->prepare("SELECT unigene_consensi.seq, unigene.unigene_id, qc_report.hqi_start, qc_report.hqi_length, est.seq, unigene_member.start FROM clone LEFT JOIN seqread ON clone.clone_id = seqread.clone_id LEFT JOIN est ON seqread.read_id = est.read_id LEFT JOIN unigene_member ON unigene_member.est_id = est.est_id LEFT JOIN unigene ON unigene.unigene_id = unigene_member.unigene_id LEFT JOIN unigene_consensi ON unigene_consensi.consensi_id = unigene.consensi_id LEFT JOIN qc_report ON qc_report.est_id = est.est_id WHERE clone.clone_name = ? AND est.status = 0");

my $unigene_query = $dbh->prepare("SELECT unigene_consensi.seq, unigene.unigene_id, qc_report.hqi_start, qc_report.hqi_length, est.seq, unigene_member.start FROM unigene LEFT JOIN unigene_consensi ON unigene.consensi_id = unigene_consensi.consensi_id LEFT JOIN unigene_member ON unigene.unigene_id = unigene_member.unigene_id LEFT JOIN est ON unigene_member.est_id = est.est_id LEFT JOIN qc_report ON qc_report.est_id = est.est_id WHERE unigene.unigene_id = ? AND est.status = 0");

if(!(open(FILEIN, $infile)))
{
    if ($opt_w)
    {
	&programExit ("Unexpected error: Couldn't open EST file $infile. Please try again later.");
    }
    else 
    {
	die "Couldn't find file $infile, please enter a valid filename.";
    }
}
@lines = <FILEIN>;

# %unigenes maps the original identifier (EST) to an array (sequence of unigene, unigene id, EST start coord, EST length, EST sequence, start on unigene).  

my %unigenes;

for(my $i = 0; $i < @lines; $i++){
   
    chomp $lines[$i];
    next if ($lines[$i] =~ /^\n/);
   # next if ($lines[$i] =~ /^(\s)*$/;
    
    if($lines[$i] =~ s/^>(.*)\s*$/$1/)
    {
	# print ALRES "matched ", $lines[$i], " at line $i\n";
	
	#look in the cheat file
	my $unigeneid = "";
	
	$stripped = $lines[$i];
	
	# print ALRES "checking whether exists...\n";
	if(!($unigenes{$stripped})) # skip multiple copies of the same EST in a single input file
	{
	    if ($lines[$i + 1] && ($lines[$i + 1] !~ /^>/))
	    {
		for(; ($lines[$i + 1] && ($lines[$i + 1] !~ /^>/)); $i++) #hopefully, there's a sequence in the input file
		{
		    chomp $lines[$i + 1];
		    ${$unigenes{$stripped}}[4] .= $lines[$i + 1];
		}
	    }
	    else
	    {
		&programExit ("Please specify a sequence for identifier ". $lines[$i]);
	    }
	}
    
	else
	{ #duplicate id, indicate error and die
	    &programExit("Duplicate identifiers. Please ensure that each identifier appears only once in the input.");
	}
    }
    else
    {  # no '>' or other parse error
	#print ALRES "didn't match ", $lines[$i], " at line $i\n";
	&programExit ("Input not in FASTA format.");
    }
}

    
close FILEIN;

#$clone_query->finish;
#$unigene_query->finish;
#$dbh->disconnect;

my $key;

# print ALRES $TEMP_FILE_DIR;

my $temp_seq_file = File::Spec->catfile($TEMP_FILE_DIR, &getFileName($infile)."_temp_seq_file");

my $blast_result_file = File::Spec->catfile($TEMP_FILE_DIR, &getFileName($infile)."_temp_blast_results_file");

if (!open(SEQFILE, ">$temp_seq_file"))
{
    if ($opt_w)
    {
#	print ALRES "\ntrying to open $temp_seq_file";
	&programExit("Unexpected error occurred. Please try again later.");
    }
    else
    {
	&programExit("Couldn't create temporary sequence files for BLAST. Please check your permissions on the directory.");

    }
}

foreach $key (keys(%unigenes)){
    print SEQFILE ">", $key;
    if ($unigenes{$key}[1]){ #if we have a unigene for the EST, we use it for blasting. 
	print SEQFILE "\n", $unigenes{$key}[0], "\n\n";
    }
    else{#if we don't, we use the EST dna sequence provided in the input
	print SEQFILE "\n", $unigenes{$key}[4], "\n\n";
    }
}

close SEQFILE;

my $blastcmdstr = "$blast_location -d $protein_db -p blastx -e $e_value -i $temp_seq_file -o $blast_result_file";

if (! -e $blast_result_file) { 

    print STDERR "BLASTING (this may take a while...)\n";
    system($blastcmdstr);
}
#system("rm -f $temp_seq_file");
else { 
    print STDERR "Skipping BLAST because blast file already present.\n";
    print STDERR "(Remove file and start over if the BLAST was not ok...)\n";
}
my $blastparser = new Bio::SearchIO(-format => "blast", 
				   -file => "$blast_result_file");
my $coords;

# %unigenes maps the original identifier (EST) to an array (sequence of unigene, unigene id, EST_hqi start coord (on EST), EST length, EST sequence, EST start on unigene).  	 
while (my $result = $blastparser->next_result){
    my $i = 0;
    my $query = $result->query_name;
    if (!$opt_s) { &printWebOrFile ("Results for $query:\n\n"); }
    while (my $hit = $result->next_hit){
	while (my $hsp = $hit->next_hsp){

            #check annotation of tair database for exon positions
	    #these are intron positions on protein, offset by where our hit starts.  
	    #i.e. if the protein had an intron excised after the third amino acid, but our hit starts from the second amino acid, aa_intron_positions will contain 2.  
	    #may contain fractions (introns that divide codons)
	    my @aa_intron_positions = &checkIntrons($hit->name, $hsp->start('hit'), $hsp->end('hit'));

#	    print ("aa intron positions: \n");
#	    printArray(@aa_intron_positions);

	    #index in hit_string (aa) of real positions
	    my @aa_visual_positions = map {&convertRealToVisual($hsp->hit_string, $_);} @aa_intron_positions;

#	    printArray(@aa_visual_positions);
	    #index in query_string(aa) of real positions

	    my @query_aa_real_positions = map {&convertVisualToReal($hsp->query_string, $_);} @aa_visual_positions;
	    #my  @query_aa_real_positions = @aa_visual_positions;
#	    printArray(@query_aa_real_positions);
	    
    	    my @cdna_intron_relative_positions = map {$_ * 3} @query_aa_real_positions; 

#	    printArray(@cdna_intron_relative_positions);

	    my $reverse_orientation = 0;

	    my $blast_frame = ($hsp->query->frame + 1) * $hsp->query->strand;

	    if($blast_frame < 0)
	    {
		$reverse_orientation = 1;
	    }
	    
	    my @cdna_intron_absolute_positions;
	    if ($reverse_orientation)
	    {
		@cdna_intron_absolute_positions =  map {$hsp->end('query') - $_ - 1;} @cdna_intron_relative_positions;	    
	    }
	    else
	    {
		@cdna_intron_absolute_positions = map {$_ + $hsp->start('query') - 1;} @cdna_intron_relative_positions;
	    }
	    
	    # make sure we don't get intron positions like 45.99999999999
	    @cdna_intron_absolute_positions = map {&roundToInt($_);} @cdna_intron_absolute_positions;

#    	    printArray(@cdna_intron_absolute_positions);

#	    print "\n";

    	    #query_intron_positions is either positions on the sequence from the input file or positions on the unigene sequence
	    if($unigenes{$query}[1]) #if we have it in the database
	    {
		@aa_visual_positions = map {&removeOutofboundsIntrons($_, 0, (length ($hsp->query_string)) + 1);} @aa_visual_positions;
		@cdna_intron_absolute_positions = map {&removeOutofboundsIntrons($_, $unigenes{$query}[5], $unigenes{$query}[5] + $unigenes{$query}[3]);} @cdna_intron_absolute_positions;
	    }
	    if(@aa_visual_positions) #make sure we have at least one intron in the alignment to print
	    {

		if (!$opt_s) { 
		    
		    $i++; # results with introns count
		    if ($reverse_orientation)
		    {
			&printWebOrFile("Frame for this query is negative.\n");
		    }
		    
		    print ALRES (@aa_visual_positions." introns found in region.\n");
		    
		    &displayAlignments($query, $hsp, $hit, $reverse_orientation);	    
		    
		    &displayIntrons(\@cdna_intron_absolute_positions, \@aa_visual_positions, $hsp->query_string, $hsp->hit_string);
		    &printWebOrFile ("\n");
		}
		
		else { 
		    $i++;
		    my $dir = "+"; 
		    if ($reverse_orientation) { $dir = "-"; }
		    &printWebOrFile(join "\t", ($query, $dir,  @cdna_intron_absolute_positions));
		    &printWebOrFile("\n");
		}
	    }
#	print "\n";
	}
    }
    if (!$opt_s) { &printWebOrFile("$i results returned for query $query.\n\n");}
}

#system("rm -f blast.results");
close ALRES;


#check if there is an intron in the given gene between start and end. start and end are amino acid #s, with first amino acid in protein = 1
#values will be offset from start, i.e. will return the number of aa.s expected between the value of start and a given intron, rather
#than absolute positions on the gene.
sub checkIntrons($$$)
{
    my ($geneid, $start, $end) = @_;

    #map {print $_, " ";} @_;
    #print "\n";
    #print "gene feature file is ", $gene_feature_file, "\n";
    open (DATAFILE, $gene_feature_file) || die "couldn't open data file $gene_feature_file";
    my $found;
    my @coords;
    my $orientation;

    my @relevant_introns;
   
    while (<DATAFILE>)
    {         
	if (m/^\d+\s+$geneid\s+gene:\d+\s+(\w+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\w+)$/i)
	{
	    #$1=type $2=start $3=stop $4=length  $5=orientation
	    $found = 1;
	    $orientation = $5;
	    if($1 eq "coding_region")
	    {
		#don't want to use lengths, because we need to get rid of contiguous coding regions
		push @coords, $2; #start
		push @coords, $3; #end
	    }
	}
	else {$found && last;} #don't look until end of file if we already found it
    }
    close DATAFILE;

    @coords = sort { $a <=> $b } @coords;

    #get rid of contiguous coding regions that don't have introns between them 

    @coords = &deleteContiguousExons(@coords);

    #find # of amino acids coded by each coding region
    #to do this, we have to play with the coordinates if the protein is reverse oriented
    
    if ($orientation eq "reverse"){
	my $start_on_chrom = $coords[$#coords];
	
	@coords = map {$_ - $start_on_chrom} @coords;    
	@coords = map {$_ * -1} @coords;
	@coords = sort { $a <=> $b } @coords;
    }
    my @aa_per_coding_region;

    for (my $j = 0; $j < @coords; $j += 2)
    {
	
	my $numaa = (($coords[$j + 1] - $coords[$j]) + 1) / 3;
	push @aa_per_coding_region, $numaa;
    }

    for (my $j = 0, my $curTotal = 0; $j < @aa_per_coding_region; $j++)
    {
	$curTotal += $aa_per_coding_region[$j];
	if ($curTotal >= $end)
	{
	    last;
	}
	else
	{ 
	    if ($curTotal > $start)
	    {
		push @relevant_introns, (($curTotal - $start) + 1);
#		print("total is $curTotal, start is $start\n");
	    }
	}
    }
    return @relevant_introns;
}

#remove cases where we have coding region coordinates that do not actually have an intron between them, such as (0, 15, 15, 16) is reduced to (0, 16)
sub deleteContiguousExons(@)
{
    my (@coords) = @_;
    my @tempcoords;
    
    for (my $i = 0; ($i < @coords); $i++)
    {
	if ( not($coords[$i + 1]) || ($coords[$i] != $coords[$i + 1]))
	{
	    push @tempcoords, $coords[$i];
	}
	else 
	{
	    $i++;
	}
    }
    return @tempcoords;
}

#check that the introns are within the est, rather than elsewhere on the unigene.  
sub removeOutofboundsIntrons($$$){
    my ($pos, $start, $end) = @_;
    if ($pos > $start && $pos < $end)
    {
	return $pos;
    }
    else
    {
	return ();
    }
}

sub displayAlignments($$$$$){
    
    my ($query, $hsp, $hit, $reverse_orientation) = @_;
    my $eststart;

    my $wslen; #whitespace for formatting
    my $i;

    my $hit_name = $opt_w ? &makeArabidopsisURL($hit->name) : ($hit->name);

    my $query_start_pos;
    if ($reverse_orientation)
    {
	$query_start_pos = $hsp->end('query');
    }
    else
    {
	$query_start_pos = $hsp->start('query');
    }

    if ($unigenes{$query}[1]){ #we have info about this sequence from our database
	if ((($hsp->start('query') >= $unigenes{$query}[5]) && ($hsp->start('query') <= ($unigenes{$query}[5] + $unigenes{$query}[3])))
	    ||  (($hsp->end('query') >= $unigenes{$query}[5]) && ($hsp->end('query') <= ($unigenes{$query}[5] + $unigenes{$query}[3]))))
	{#the HSP has at least one end within our EST
	  	     
	     my $estspaces = undef;
	     if (($hsp->start('query')) > ($unigenes{$query}[5])) #if hsp starts in unigene after est starts in unigene
	     {
		 $eststart = $hsp->start('query') - $unigenes{$query}[5];
		 #open (TEMPDNA, ">tempdnafilex__yz.txt");
		 #print TEMPDNA ">\n", substr($unigenes{$query}[4], $eststart), "\n";
		 #close TEMPDNA;
	     } 
	     
	     else
	     {
		 $estspaces = ceil(($unigenes{$query}[5] - $hsp->start('query')) / 3);
		 $eststart = 3 - (($unigenes{$query}[5] - $hsp->start('query')) % 3);
		 #open (TEMPDNA, ">tempdnafilex__yz.txt");
		 #print TEMPDNA ">\n", substr($unigenes{$query}[4], $eststart), "\n";
		 #close TEMPDNA;
	     }

	     #first the unigene
	     $wslen = 20 - (length($unigenes{$query}[1]) + length($query_start_pos)); 
	     &printWebOrFile ($unigenes{$query}[1].":");

	     for (my $i = 0; $i < $wslen; $i++) 
	     {
		 &printWebOrFile (" ");
	     }
	     
	     &printWebOrFile ($query_start_pos."|".$hsp->query_string."\n");

	     #Convert the EST sequence to amino acid using the Bioperl module. 
	     my $converter = Bio::SeqIO->new(-file => "tempdnafilex__yz.txt", '-format' => 'Fasta');
	     my $dnaseq = $converter->next_seq;

	     my $pobj = $dnaseq->translate(undef, undef, 0);

	     $wslen = 20 - length($query); 
	     &printWebOrFile ($query.":");

	     for ($i = 0; $i < ($wslen - length $eststart); $i++) 
	     {
		 &printWebOrFile (" ");     
	     }

	     &printWebOrFile ($eststart."|");
	     if ($estspaces) { for ($i = 0; $i < $wslen; $i++) { print " ";} }

	     my $estend = length ($pobj->seq());

	     $estend = &convertRealToVisual($hsp->query_string, $estend);

	     #changed to accomodate gaps in unigene alignment
     	     &printWebOrFile (substr($hsp->query_string, 0, $estend + 1)."\n");
#	     print $out_file_handle $pobj->seq(), "\n";

	     #then the HSP
	     $wslen = 20 - (length($hit->name) + length($hsp->start('hit'))); 

	     &printWebOrFile($hit_name.":");
	     
	     for ($i = 0; $i < $wslen; $i++) 
	     {
		 &printWebOrFile (" ");
	     }
	     
	     &printWebOrFile ($hsp->start('hit')."|");
	     &printWebOrFile ($hsp->hit_string."\n");
	 }
    }

#we have no info in the database, just the sequence from the input file. since we blasted with the 
#EST sequence anyway, no need for arithmetic, just print hsp and query.  
    else 
    {
	# &printWebOrFile ("No unigene entry found for this query.\n");
	
	#EST sequence
	
	my $wslen = 20 - length($query); 
	
	&printWebOrFile ($query.":");
	
	for ($i = 0; $i < ($wslen - length ($query_start_pos)); $i++)
	{
	    &printWebOrFile (" ");
	}
	
	&printWebOrFile ($query_start_pos."|");
	&printWebOrFile ($hsp->query_string."\n");
	
        #$eststart = $hsp->start('query');

	#blast results
	$wslen = 20 - (length($hit->name) + length($hsp->start('hit'))); 
	&printWebOrFile($hit_name.":");
	for ($i = 0; $i < $wslen; $i++) {&printWebOrFile (" ");}
	&printWebOrFile ($hsp->start('hit')."|");
	&printWebOrFile ($hsp->hit_string."\n");
    }
    return $reverse_orientation;
}


sub displayIntrons($$$$)
{
    my ($intron_absolute_positions, $intron_visual_positions, $query_string, $hit_string) = @_;
    
    my @intron_visual_positions = @$intron_visual_positions;
    my @intron_absolute_positions = @$intron_absolute_positions;

#    map {print $_, " "} @intron_visual_positions;
    my $intronstring;
    my $positionstring;

    $intronstring .= "possible introns:    |";
    $positionstring .=  "positions:           |";

    my $current_visual_intron = shift @intron_visual_positions;
    my $current_absolute_intron = shift @intron_absolute_positions;
    

    for (my $i = 1; $current_visual_intron && $current_absolute_intron; $i++)
    {
	if ($i >= $current_visual_intron)
	{
	    $intronstring .= "^";
	    $positionstring .= $current_absolute_intron;
	    
#	    print "printing intron at $i\n";
#	    print "cvi is $current_visual_intron\n";

	    for (my $j = 1; $j < length $current_absolute_intron; $j++)
	    {
		$intronstring .= " ";
		$i++;
	    }
	    $current_visual_intron = shift @intron_visual_positions;
	    $current_absolute_intron = shift @intron_absolute_positions;
	}
	else 
	{
	    $intronstring .= " ";
	    $positionstring .= " ";
	}
    }
    
    &printWebOrFile($intronstring."\n".$positionstring."\n");
}

# number of gaps in the string between 0 and i.  
sub getNumGapsBeforePosition($$)
{
    
    my ($query_string, $i) = @_;
    $i = ceil($i);
    my $current_string;
    my $gapcount = 0;
    my $totalgapcount = 0;
    my $start = 0;
 
    do 
    {
	$gapcount = 0;
#	print "query_string length is ", length $query_string, "\n";
#	print "start is $start, i is $i\n";
	$current_string = substr($query_string, $start, $i);
	while ($current_string =~ /-/g)
	{ 
	    $gapcount++; 
	}
	
	$totalgapcount += $gapcount;
	$start += $i;
#	print "start is $start after increment\n";
	$i = $gapcount;

    } while($gapcount != 0);
 
    return $totalgapcount;
}


sub convertRealToVisual($$)
{
    my ($var1, $var2) = @_;
    return convertRealVisual($var1, $var2, 1);
}

sub convertVisualToReal($$)
{
    my ($var1, $var2) = @_;
    return convertRealVisual($var1, $var2, 0);
}

# what position on the string does the dna/aa index correspond to, or vice versa, given gaps in the string.
# probably easier to do with regexps but i cant find the solution right now.  
sub convertRealVisual($$$) 
{
    my ($str, $pos, $r2v) = @_; #r2v is true if realtovisual, false if visualtoreal
    my $visualpos = 0;
    my $elem;
    
        
    my @strarray = split(//, $str);
        
    while($visualpos < $pos)
    {
	$elem = $strarray[$visualpos++];
	if ($elem eq '-')
	{
	    if ($r2v)   {$pos += 1;}
	    else        {$pos -= 1;}
	}
    }
    return ($pos);
}

sub printArray(@){
    my (@array ) = @_;
    map {print $_, " ";} @array;
    print "\n";
}

sub printWebOrFile($)
{
    my($stuff_to_print) = @_;

    #this turned out not to matter for now, since the web version also prints to the file, then prints the file to the page.
    if ($opt_w) #running on the web 
    {
	print ALRES $stuff_to_print;
    }
    else
    {
	print ALRES $stuff_to_print;
    }
}

sub programExit($)
{
    my ($exit_msg) = @_;
    if ($opt_w)
    {
	print ALRES "<p>", $exit_msg, "</p>";
	#print ALRES "<a href = \"find_introns.pl\">Return to search page</a>";
    }
    else
    {
	print ALRES $exit_msg;
    }
    close ALRES;
    die $exit_msg;
}

sub makeArabidopsisURL($)
{
    my ($hit) = @_;
    my $url = $arabidopsis_url;
    $url =~ s/GENEID/$hit/;
    $url = "<a href=$url>$hit</a>";
}

#
# get file name from path
#

sub getFileName($)
{
    my ($path) = @_;
    my $filename;
    ($_, $_, $filename) =  File::Spec->splitpath($path);
    return $filename;
}

sub roundToInt($)
{
    #print ALRES "\n\n ROUNDING \n\n";
    
    my ($num) = @_;
    my $res = int($num + .5 * ($num <=> 0));
    #print ALRES "\n\n $num ---> $res \n\n";
    return $res;
    
}