#!/usr/bin/perl -w use strict; #person in charge of this script #this should be the person currently in charge of scripts for the SGN project my $script_maintainer='Teri Solow '; #local packages to use use runtime; use db_link; use projects; use DBI; use IO::File; if (!$ARGV[0] || $ARGV[0] eq "help") { print < -l Note: Output files will be named -150.seq, -200.seq, etc. EOF exit -1; } @ARGV or print "No input parameters, proceeding with default.\n"; my @arg_pairs = split (/\-/, (join ' ', @ARGV)); my %args=(); foreach (@arg_pairs){ $_ or next; my ($flag, $val)=split /\s+/; $args{$flag}=$val; } my $lib=$args{'l'}; my $project=$args{'p'}; my $output_basename="/data/shared/pgn_data_processing/unigene_builds/$project/chimera_screen/$lib-chimera_prescreen"; $lib or die "No library specified, try '-l library_name (-l all to process all libraries)'\n"; $project or die "Please specify a project (cgn, fgn, pgn) using the -p flag\n"; my ($db, $usr) = @{projects::get_db_info($project)}; $db or die "No known database for project $project"; my $dbh = db_link::connect_db($db, $usr) or die "couldn't open database link\n"; # query the database for the requisite information my $estq = $dbh->prepare("select e.seq_id, e.read_direction, t.sequence_data from est_info as e left join est_library as l on l.est_library_id=e.est_library_id left join trimmed_sequence as t on e.seq_id=t.seq_id where l.library_name like '$lib%' and t.sequence_data is not null"); my ($est, $dir, $seq); my %est = (); my %est_dir = (); my %est_seq = (); $estq->execute(); while(($est, $dir, $seq) = $estq->fetchrow_array()) { $est{$est} = $est; $est_dir{$est} = $dir; $est_seq{$est} = $seq; } # Open output files my ($i, $s); my @fh; for($i=0,$s=150;$i<4;$i++,$s+=50) { $fh[$i] = new IO::File ">$output_basename-$s.seq" or die "Failed to open output file \"$output_basename-$s.seq\" ($!)"; } foreach ( %est ) { my $max_cut = length($est_seq{$_})/2; for($i=0,$s=150;$i<4;$i++,$s+=50) { # After thought: some reads (nearly 10,000) are shorter than 300bp. We # only discard if they are less than 150bp. Thus, to ensure they get into # at least one screening, on the first file we just break the sequence in # half. last if $s > $max_cut && $i>0; $s=$max_cut if ($s > $max_cut); # look at read direction if ($est_dir{$_} == "5" ) { $fh[$i]->print(">$_:5\n",substr($est_seq{$_},0,$s),"\n"); $fh[$i]->print(">$_:3\n",substr($est_seq{$_},-1*$s),"\n"); } elsif ($est_dir{$_} == "3" ) { $fh[$i]->print(">$_:3\n",substr($est_seq{$_},0,$s),"\n"); $fh[$i]->print(">$_:5\n",substr($est_seq{$_},-1*$s),"\n"); } } } # close output files for($i=0;$i<4;$i++) { $fh[$i]->close(); }