#!/usr/bin/env perl
use strict;
use warnings;
use English;
use Carp;
use FindBin;
use Getopt::Std;

use XML::Twig;

#use Data::Dumper;

sub usage {
  my $message = shift || '';
  $message = "Error: $message\n" if $message;
  die <<EOU;
$message
Usage:
  $FindBin::Script [ options ]  xml_file.pl

  Script to extract either a single annotation or a distinct subregion
  from a GAME XML file.

  Options:

  -n name
     Extract the region containing the annotation with the given name.
     If the name is not unique in the file, takes the first annotation
     element (in file order) with that name.  Conflicts with -r.

  -r start,end
     Extract the given region, in bases.  Conflicts with -n.

  -o file
     output new GAME XML to this file

  -p num_bases (default 200)
     Pad the extracted region by <int> bases on each side.

  -A
     don't extract any <annotation> elements.  Ones specified with -n
     are exempt.

  -C
     don't extract any <computational_analysis> elements

EOU
}

our %opt;
getopts('n:r:o:p:AC',\%opt) or usage();
@ARGV == 1 or usage();
$opt{n} && $opt{r}
  and die "cannot specify both -n and -r\n";
$opt{p} ||= 200;

my $input_xml_file = shift @ARGV;
-r $input_xml_file or die "Cannot read from file '$input_xml_file'\n";


#figure out what filehandle we'll use for our output
my $out_fh = do {
  unless ($opt{o}) {
    \*STDOUT
  } else {
    -d $opt{o} and die "'$opt{o}' is a directory\n";
    open my $f, ">$opt{o}"
      or die "Cannot write to file '$opt{o}': $!\n";
    $f
  }
};

#this is the <annotation> name we're looking for
my $desired_annot_name = $opt{n};

#figure out what region we'll be cutting out,
#defaults to the whole sequence
my ($region_start,$region_end) =
  $opt{r} ? validate_region($opt{r},$input_xml_file) :
  $opt{n} ? find_annotation_region($desired_annot_name,$input_xml_file) :
            (1, (seq_len($input_xml_file) or die "could not parse length from xml file\n"));

#warn "region is ($region_start,$region_end)\n";

#add padding to our region
my ($padded_start,$padded_end) = ($region_start,$region_end);
#warn "padded region is ($padded_start,$padded_end), pad is $opt{p}\n";
$padded_start -= $opt{p};
$padded_start = 1 if $padded_start < 1;
$padded_end += $opt{p};
$padded_end = seq_len($input_xml_file) if seq_len($input_xml_file) < $padded_end;

#warn "padded region is ($padded_start,$padded_end)\n";


#now go through and transform the document, printing to our output filehandle
my $twig = XML::Twig->new
  (
   twig_roots =>
   {
    'seq' => sub {
      my ($t,$elem) = @_;

      if($elem->att('focus') && $elem->att('focus') eq 'true') {

	my $len = $padded_end - $padded_start + 1;
	$elem->set_att( length => $len );
	my $res = $elem->first_child('residues')
	  or $t->xpcroak("parse error, no residues for focused sequence\n");
	my $sequence = $res->text;
	$sequence =~ s/\s+//g;
	$sequence = substr($sequence,$padded_start-1,$len);
	#      warn "sequence is now ".length($sequence)." ($len)\n";
	$res->set_text( $sequence );
      }

      $elem->print($out_fh);
    },
    annotation => sub {
      my ($t,$elem) = @_;
      if($desired_annot_name) {
	my $id = $elem->att('id');
#	warn "getting name for id '$id'...\n";
	my $name = $elem->first_child('name');
	unless($name) {
	  die "<annotation> with id '$id' has no <name> child, but has children: ".join(',',map {$_->tag} $elem->children);
	}
	$name = $name->text;
#	warn "got name $name\n";
	if( $name eq $desired_annot_name
	    && in_region($t,$elem,$region_start,$region_end)
	  ) {
	  offset_regions($t,$elem,$padded_start-1);
	  $elem->print($out_fh);
	} else {
#	  $elem->purge;
	}
      } else {
	if( !$opt{A}
	    && in_region($t,$elem,$region_start,$region_end) ) {
	  offset_regions($t,$elem,$padded_start-1);
	  $elem->print($out_fh);
	} else {
#	  $elem->purge;
	}
      }
    },
    computational_analysis => sub {
      my ($t,$elem) = @_;
      my $got_sets = 0;
      if( !$opt{C}
	  && !$desired_annot_name ) {
	foreach my $set ($elem->children('result_set'),$elem->children('result_span')) {
	  if(in_region($t,$set,$region_start,$region_end)) {
	    $got_sets = 1;
	    offset_regions($t,$set,$padded_start-1);
	  } else {
	    $set->delete;
	  }
	}
	
      }
      if($got_sets) {
	$elem->print($out_fh);
      } else {
#	$elem->purge;
      }
    },

    #adjust map_position to the new region
    map_position         => sub {
      my ($t,$elem) = @_;
      my $span = $elem->first_child('span');
      my $start = $span->first_child('start');
      my $end = $span->first_child('end');
      $start->set_text(1);
      $end->set_text($padded_end-$padded_start+1);
      $elem->print($out_fh);
    },
#     transaction        => sub { $_[1]->print($out_fh) },
#     deleted_transcript => sub { $_[1]->print($out_fh) },
#     changed_gene       => sub { $_[1]->print($out_fh) },
   },
   twig_print_outside_roots => $out_fh,
   pretty_print => 'indented',
  );
$twig->parsefile($input_xml_file);
#$twig->print($out_fh);


#given a twig element and a region start and a region end,
#return 1 if this element lies in the specified region
sub in_region {
  my ($twig,$elem,$s,$e) = @_;

  my ($bs,$be) = find_element_bounds($twig,$elem);

  return 0 unless $bs && $be;
  return ($bs >= $s && $be <= $e);
}

# take a region string like '212,2342' and the input file name, and
# make sure it's a valid region string, especially given how long the
# sequence in the file is.  if it's valid, returns list of two ints
# like (212,2342).  dies informatively if the region is not valid
sub validate_region {
  my ($region_string,$infile) = @_;
  my ($start,$end) = split /,/,$region_string;
  $start+=0; $end+=0;
  ($start,$end) = ($end,$start) if $start > $end;

  $start >= 1 or die "invalid region start '$start'\n";


  if( my $length = seq_len($infile) ) {
    return ($start,$end) if $length >= $end;
    die "input file's sequence is only $length bases long, but you specified a region ending at $end\n";
  } else {
    die "cannot parse input file, does not contain a <seq> element with a 'length' attribute\n"
  }
}

#look up the length of the focus sequence in the given input file
#cache
sub seq_len {
  my ($infile) = @_;
  our %seqlen_cache;
  return $seqlen_cache{$infile} ||= do {
    #find the length of the focus sequence in the input file
    open my $f, $infile or die "Could not open '$infile': $!";
    my $length;
    while (<$f>) {
      if (/<seq [^>]*focus="true"/) {
	if(my ($l) = /<seq [^>]*length="(\d+)"/) {
	  $length = $l;
#	  warn "got length $length\n";
	  last;
	}
      }
    }
    $length
  };
}

#given an <annotation> name, find it in the input file and get the
#extent of its regions, returning them as a two-element list, or die
#if not found
sub find_annotation_region {
  my ($annot_name,$infile) = @_;
  $annot_name =~ s/"/\\"/g; #escape any quotes

  #look through all the annotation regions and find the extents of the
  #one we desire
  my ($start,$end);
  my $twig = XML::Twig->new
    ( twig_handlers =>
      {
       qq|annotation[string(name)="$annot_name"]| => sub {
	 my ($t,$annot_elem) = @_;
	 ($start,$end) = find_element_bounds($t,$annot_elem);
#	 $annot_elem->purge;
       },
      },
    );
  $twig->parsefile($infile);
#  $twig->purge;
#  print "got ($start,$end)\n";
  if($start && $end) {
    return ($start,$end);
  } else {
    die qq|no annotation found with name '$annot_name'\n|;
  }
}


#take a Twig element, search for its <seq_relationship> descendants,
#and return the bounds of the element based on the bounds of its
#children in a 2-element list (start,end), or an empty list if no
#bounds found.  dies on error.
sub find_element_bounds {
  my ($twig,$element) = @_;

  my ($start,$end);
  my $tag = $element->tag;
  my $id = $element->att('id') || '';
#  warn "starting annot parse of <$tag id=\"$id\">\n";
  foreach my $seqrel ($element->descendants('seq_relationship')) {
#    warn "got seqrel\n";
    next if $seqrel->att('type') && $seqrel->att('type') eq 'subject';
    my ($s,$e) = map {$seqrel->first_child('span')->first_child($_)->text} qw/start end/;
    $s and $e or die "parse error ('$s','$e')";
    ($s,$e) = ($e,$s) if $s > $e;
    $start = $s if !$start || $s < $start;
    $end = $e if !$end || $end < $e;
  }
  if ($start && $end) {
    return ($start,$end);
  } else {
    return;
#    $twig->xpcroak(qq|parse error: '<$tag>' element contains no <seq_relationship type="query"> descendants\n|.join(',',map {$_->tag} $element->children));
  }
}

#find all the seq_relationship elements and fix their coordinates to
#adjust to the new subsequence
sub offset_regions {
  my ($t,$element,$offset_amount) = @_;

  foreach my $seqrel ($element->descendants('seq_relationship')) {
    #don't offset subject seq_relationships
    next if $seqrel->att('type') && $seqrel->att('type') eq 'subject';
    foreach (qw/start end/) {
      my $e = $seqrel->first_child('span')->first_child($_);
      $e->set_text($e->text - $offset_amount);
    }
  }

}
