#!/usr/bin/perl -w my $usage="Usage: $0 [-hcp] [accessionFile]\n" . " -c: print cds\n" . " -p: print translated proteins\n" . " STDIN is used as the input if no fastaFile is given\n" . "accessionFile should be tab delimited. 1st column (accesion No) " . "and 2nd column (seqName to be used) are used. " . "The other columns are ignored. Lines with # are ignored\n"; my $accessionSep = "\t"; my $sep = "\t"; use Bio::DB::GenBank; use Bio::SeqIO; #use Bio::SeqFeature::Gene::GeneStructure; #use Bio::SeqFeature::Gene::NC_Feature; use Getopt::Std; # handle optional flags my %opts = (); getopts('cpho:', \%opts) || die "$usage\n"; die "$usage\n" if ($opts{h}); my $outFormat = $opts{o} || 'fasta'; my $outFH = Bio::SeqIO->newFh ( -fh => \*STDOUT, -format => $outFormat ); # get the accession number file @ARGV = ('-') unless @ARGV; # take STDIN when no arg my $accessionFile = shift; my @accession = ReadInAccessionFile ($accessionFile); my @accNums = Column (1, @accession); my @seqNames = Column (2, @accession); my @seqs = DownloadSeqs(@accNums); if (@seqs != @seqNames) { die "number of downloaded sequences are: " . scalar(@seqs) . " smaller than specified: " . scalar(@seqNames); } # print out for my $i (0..$#seqs) { my @cds = ($opts{c} || $opts{p}) ? extractCDS($seqs[$i]) : ($seqs[$i]); my $numCDS = @cds; if ($numCDS == 0) { warn $seqs[$i]->accession_number ."(". $seqNames[$i].")"." has no CDS"; } elsif ($numCDS > 1) { warn $seqs[$i]->accession_number ."(". $seqNames[$i].")". " has " . $numCDS . " CDS.\n"; } for my $c (@cds) { print ">", $seqNames[$i]; print "|" . $c->desc() if ($numCDS > 1); #, $c->seq()); my $seqString = ($opts{p}) ? $c->translate->seq : $c->seq; if ($opts{p}) { $seqString =~ s/\*\s*$//; # remove the termination codon at the end } print "\n$seqString\n"; } } exit(0); #Coderet($seqs[0], "cds"); # readin a file, and return an array. # Each element contains accessionNo . $sep . seqName sub ReadInAccessionFile { my $infile = shift; open (INFILE, "<$infile") || die "Can't open $infile\n"; my @result = (); while () { chomp; s/#.+$//; # remove comments; s/^\s+//; s/\s+$//; next if /^$/; my @line = split /$accessionSep/; if (@line < 2) { warn "WARN: line $. contains < 2 columns. " . "1st column is accessionNo, 2nd column is seqName to be used.". " Skipping this line\n"; next; } push @result, $line[0] . $sep . $line[1]; # accession seqName } return @result; } sub DownloadSeqs { my $batch = 1; # 1 for batch downloading (quicker) my $batchSize = 20; # 20 seems to be the max of Entrez batch download my @accNumArray = @_; my $gb = new Bio::DB::GenBank(); my @seqArray = (); if ($batch) { while (my @tmpAccNums = splice (@accNumArray, 0, $batchSize)) { my $seqio = $gb->get_Stream_by_batch(\@tmpAccNums); while (my $seq = $seqio->next_seq()) { push @seqArray, $seq; } } } else { for my $acc (@accNumArray) { $seq = $gb->get_Seq_by_acc($acc); push @seqArray, $seq; } } return @seqArray; } # this is supposed to drive EMBOSS coderet, but it doesn't work. # sub Coderet { # my $seq = shift; # my $type = shift; # should be cds|mrna|translation # use Bio::Factory::EMBOSS; # my $f = Bio::Factory::EMBOSS -> new(); # my $coderet = $f->program('coderet'); # my $tmpOutfile = '/tmp/coderet.out'; # $coderet->run( { '-seqall' => $seq, # '-seqout' => $tmpOutfile # } ); # #-mrna -translation # # my $seqIO = Bio::SeqIO->new( -file => $tmpOutfile); # } # returns an array of seq objects, each of them are CDS. sub extractCDS { # get seq object my $entry = shift; my $entryAcc = $entry->accession_number(); #$entry->id(); my @result =(); foreach my $feat ($entry->top_SeqFeatures()) { next if ($feat->primary_tag() ne 'CDS'); my $id = $feat->has_tag('gene') ? join(' ', $feat->each_tag_value('gene')) : 'no_gene'; my $acc = $feat->has_tag('protein_id') ? join(' ', $feat->each_tag_value('protein_id')) : 'no_pid'; my $desc = $feat->has_tag('product') ? join(' ', $feat->each_tag_value('product')) : 'no_product'; $desc="$id($acc)|$desc|from $entryAcc"; # my $featseq; # eval { $featseq = $feat->seq(); }; # if ( $@ ) { # print STDERR # "** can't get the correct sequence of CDS $id|$acc **\n"; # print STDERR "** skip it **\n"; # } my $loc = $feat->location(); my $splicedSeq = ""; if($loc->isa('Bio::Location::SplitLocationI')) { foreach my $subloc ($loc->sub_Location()) { my $subSeqFrag = $entry->trunc($subloc->start,$subloc->end); $subSeqFrag = $subSeqFrag->revcom if ($subloc->strand == -1); # $subloc->seq_id() $splicedSeq .= $subSeqFrag->seq(); } } else { $splicedSeq = $entry->subseq($loc->start,$loc->end); } # adjust the reading frame my $codonStart = 1; if ($feat->has_tag('codon_start')) { my @tmpCodonStarts = $feat->each_tag_value('codon_start'); warn "Weird, more than 1 codon_start\n" if (@tmpCodonStarts > 1); $codonStart = $tmpCodonStarts[0]; } $splicedSeq = substr($splicedSeq, $codonStart - 1); my $seqclass; if($entry->can_call_new()) { $seqclass = ref($entry); } else { $seqclass = 'Bio::PrimarySeq'; $entry->_attempt_to_load_Seq(); } my $thisCDS = $seqclass->new('-seq' => $splicedSeq, '-display_id' =>$entry->display_id() . "|$id($acc)", '-accession_number' => $entryAcc, '-alphabet' => $entry->alphabet, '-desc' => $desc, ); push @result, (($loc->strand == -1) ? $thisCDS->revcom : $thisCDS); } return @result; } sub Column { my $col = shift; my @array = @_; my @result = (); for my $i (@array) { my @line = split /$sep/, $i; push @result, ((@line >= $col) ? $line[$col-1] : "NA") ; } return @result; } sub simpleExamples { # this returns a Seq object : my $seqobj = $gb->get_Seq_by_id('MUSIGHBA1'); # this returns a Seq object : my $seqobj2 = $gb->get_Seq_by_acc('AF303112'); print $seqobj->seq(), "\n"; # other methods $seqobj->display_id(); # the human read-able id of the sequence $seqobj->seq(); # string of sequence $seqobj->subseq(5,10); # part of the sequence as a string $seqobj->accession_number(); # when there, the accession number $seqobj->alphabet(); # one of 'dna','rna','protein' $seqobj->primary_id(); # a unique id for this sequence irregardless # of its display_id or accession number $seqobj->desc(); # a description of the sequence # this returns a SeqIO object : my $seqio = $gb->get_Stream_by_batch([ qw(J00522 AF303112 2981014)]); my $out = Bio::SeqIO->new(-file => ">/tmp/junk" , '-format' => 'FASTA'); while (my $seq = $seqio->next_seq()) { print "Sequence ",$seq->id," first 10 bases ",$seq->subseq(1,10),"\n"; $out->write_seq($seq); } } sub extCDSOld { my $entry = shift; my $entryid = $entry->id(); my @result =(); foreach my $feat ($entry->top_SeqFeatures()) { if ($feat->primary_tag() eq 'CDS') { my $id = $feat->has_tag('gene') ? join(' ', $feat->each_tag_value('gene')) : 'no_gene'; my $acc = $feat->has_tag('protein_id') ? join(' ', $feat->each_tag_value('protein_id')) : 'no_pid'; my $desc = $feat->has_tag('product') ? join(' ', $feat->each_tag_value('product')) : 'no_product'; my $featseq; eval { $featseq = $feat->seq(); }; if ( $@ ) { print STDERR "** can't get the correct sequence of CDS $id|$acc **\n"; print STDERR "** skip it **\n"; next; } $featseq->id("$id|$acc"); $featseq->desc("$desc |from $entryid"); push @result, $featseq; } } return @result; } sub PrintFeature { my $seqfile = shift; my $format = "genbank"; my $seqio = Bio::SeqIO->new (-format => $format , -file => $seqfile); my $seqobj = $seqio->next_seq(); foreach my $feat ( $seqobj->all_SeqFeatures() ) { if ($feat->has_tag('translation')) { if ($feat->has_tag('protein_id')) { my @protein_id = $feat->each_tag_value('protein_id'); my $protein_id = join " ", @protein_id; } elsif ((! $show_gene) && $feat->has_tag('gene')) { @gene = $feat->each_tag_value('gene'); $protein_id = join " ", @gene; } else { $protein_id = ""; } if ($show_gene && $feat->has_tag('gene')) { @gene = $feat->each_tag_value('gene'); $gene = join " ", @gene; } else { $gene = ""; } if ($feat->has_tag('product')) { @product = $feat->each_tag_value('product'); $product = join " ", @product; } else { $product = ""; } $desc = "$protein_id $gene $product"; $desc .= " (" . $feat->start . "," . $feat->end . ")"; $desc =~ s/\s+/ /g; @translation = $feat->each_tag_value('translation'); print ">$desc\n", $translation[0],"\n"; } } } sub printFT { my $dummy = shift; my @feat = $dummy->all_SeqFeatures(); #$dummy->transcripts(); #$dummy->add_transcript_as_features(@feat); for $i (@feat) { print $i->primary_tag, "\n"; if ($i->primary_tag eq 'CDS') { $loc = $i->location(); @sublocs = $loc->sub_Location(); foreach my $j (@sublocs) { print $j->start, "++", $j->end, "\n"; } # $out = Bio::SeqIO->new(-file => ">/tmp/junk" , '-format' => 'FASTA'); # $out->write_seq($spSeq); } } }