#!/usr/bin/perl -w my $usage= "Usage: $0 [-hr] [-d genomicDNA_SeqName] [-m mRNA_SeqName] [fastaFile]\n". " -h: help\n". " -d seqName: sequence name of genomic DNA\n". " -m seqName: sequence name of mRNA\n". " -r: returned range is using the site positions of genomicDNA \n". " instead of aligned absolute position" . "Compare the genomicDNA and its mRNA, and find out the range specifier\n". "of exons. The output can be fed to -s option of selectSites.pl\n". "If name of input file (fastaFile) is not given, STDIN is used.\n" . "If the fastaFile contains only two sequences, the longer sequence after\n". "removing all gaps are considered as the genomic DNA\n". "If the fastaFile contains more than two sequences, -d and -m are mandatory.\n". "If mRNA doesn't exactly match with genomicDNA, results may not be reliable."; my $sep = "\t"; my $lineWidth = 70; # used to break the long sequences into lines. our($opt_h, $opt_r, $opt_m, $opt_d); use Getopt::Std; getopts('hm:d:r') || die "$usage\n"; die "$usage\n" if (defined($opt_h)); @ARGV = ('-') unless @ARGV; # take STDIN when no arg. my $seqFile = shift @ARGV; my @dat = ReadInFASTA($seqFile); my $numSeq = @dat; my @nam = GetSeqName(@dat); if (defined($opt_m) && defined($opt_d)) { @selectedSeqs = ($opt_d, $opt_m); @dat = SelectSeqs(\@dat, \@selectedSeqs); if (@dat != 2) { die "ERROR: Please make sure that sequence names: $opt_m and $opt_d exist in". "input file\n"; } } else { if ($numSeq == 2) { my @tmpDat = GetSeqDat(@dat); $tmpDat[0] =~ s/-//g; $tmpDat[1] =~ s/-//g; my $len1 = length($tmpDat[0]); my $len2 = length($tmpDat[1]); if ($len2 > $len1) { warn ("INFO: 1st sequence is considered as mRNA\n"); @dat = ($dat[1], $dat[0]); # flip the order } else { warn ("INFO: 2nd sequence is considered as mRNA\n"); } } else { die "ERROR: there are more than two sequences in the input file,\n". "Please specify mRNA with -m and genomic DNA with -g\n\n$usage\n "; } } # @dat contains (genomic, mRNA) my @seqDat = GetSeqDat(@dat); my @genomicDNA = split //, $seqDat[0]; my @mRNA = split //, $seqDat[1]; my @exonAbs = (); my @exonRel = (); my @intronAbs = (); my @intronRel = (); my @weird = (); # sites where genomic is - but mRNA is not - my @mismatch = (); my $state = 'p'; # p = before genomicDNA starts, i = intron, e = exon # Find the index to the last base of genomicDNA my $last = $#genomicDNA; while ($genomicDNA[$last] eq '-') { $last--; } my $genomicCntr = 0; foreach my $i (0..$last) { if ($state eq 'p') { # processing before genomic starts next if ($genomicDNA[$i] eq '-'); $state = 'e'; } my $ggg = $genomicDNA[$i]; my $mmm = $mRNA[$i]; if ($ggg eq '-') { if ($state eq 'i') { push @intronAbs, $i+1; } else { push @exonAbs, $i+1; } next if ($mmm eq '-'); push @weird, $i; next; } # genomic is not '-' $genomicCntr++; if ($ggg eq $mmm) { # exon push @exonAbs, $i+1; push @exonRel, $genomicCntr; $state = 'e'; } elsif ($mmm eq '-') { # intron push @intronAbs, $i+1; push @intronRel, $genomicCntr; $state = 'i'; } else { # mismatch push @mismatch, $i+1; push @exonAbs, $i+1; push @exonRel, $genomicCntr; $state = 'e'; } } #print "@exonAbs\n"; # handle weird sites if (@weird > 0) { warn "INFO: following sites had gaps in genomic DNA, and non-gaps in mRNA, " . " non-gap chars in mRNA were considered as gaps: " . join(",", @weird) . "\n"; } if (@mismatch > 0) { warn "INFO: following sites had mismatch between genomic DNA vs mRNA, ". "sites were considered as exon: " . join(",", @mismatch) . "\n"; } ## final output if (defined($opt_r)) { print CondenseIndex(@exonRel), "\n"; } else { print CondenseIndex(@exonAbs), "\n"; } exit(0); # Takes an array of integer, and create a range string. # E.g. (2,3,4,6,9,11,12,13) => "2-4,6,9,11-13" # assumes an array is already sorted sub CondenseIndex { my @arr = @_; my $prev = shift @arr; my $result = $prev; my $begin = $prev; foreach my $num (@arr) { if ($num == $prev + 1) { $prev = $num; next; } else { # not continuous if ($prev eq $begin) { $result .= ",$num"; } else { $result .= "-$prev,$num" } $begin = $num; $prev = $num; } } # deal with the last element if ($arr[$#arr] == $arr[$#arr-1]+1) { $result .= "-$arr[$#arr]"; } return $result; } # takes an arg; name of a file from which data are read Then read in # the data and make an array. Each element of this array corresponds # to a sequence, name tab data. sub ReadInFASTA { my $infile = shift; my @line; my $i = -1; my @result = (); my @seqName = (); my @seqDat = (); open (INFILE, "<$infile") || die "Can't open $infile\n"; while () { chomp; if (/^>/) { # name line in fasta format $i++; s/^>\s*//; s/^\s+//; s/\s+$//; $seqName[$i] = $_; $seqDat[$i] = ""; } else { s/^\s+//; s/\s+$//; s/\s+//g; # get rid of any spaces next if (/^$/); # skip empty line s/[uU]/T/g; # change U to T $seqDat[$i] = $seqDat[$i] . uc($_); } # checking no occurence of internal separator $sep. die ("ERROR: \"$sep\" is an internal separator. Line $. of " . "the input FASTA file contains this charcter. Make sure this " . "separator character is not used in your data file or modify " . "variable \$sep in this script to some other character.\n") if (/$sep/); } close(INFILE); foreach my $i (0..$#seqName) { $result[$i] = $seqName[$i] . $sep . $seqDat[$i]; } return (@result); } sub GetSeqDat { my @data = @_; my @line; my @result = (); foreach my $i (@data) { @line = split (/$sep/, $i); push @result, $line[1]; } return (@result) } sub GetSeqName { my @data = @_; my @line; my @result = (); foreach my $i (@data) { @line = split (/$sep/, $i); push @result, $line[0]; } return (@result) } sub SelectSeqs { my ($seqARef, $nameARef) = @_; my @seqName = GetSeqName(@$seqARef); my @seqDat = GetSeqDat(@$seqARef); # make a hash table my %seqHash = (); for my $i (0..$#seqName) { if (exists($seqHash{$seqName[$i]})) { die "ERROR: In fasta file, there are more than 1 entry " . "which has the name $seqName[$i]\n"; } else { $seqHash{$seqName[$i]} = $seqDat[$i]; } } # select the specified seqs foreach my $name (@$nameARef) { if (exists($seqHash{$name})) { my $tmp = $name . $sep . $seqHash{$name}; push @result, $tmp; } else { warn "WARN: $name didn't occur in the input file\n"; } } return @result; }