#!/usr/bin/perl my $usage="Usage: $0 [-t geneticCodeTbl] [-s speciesList] dnaSeq"; # Amino acid seqs hasn't been implemented yet. # This script takes an aligned DNA or amino acid seq file in fasta # format (as an argument or STDIN), and calculate the number of # singleton observed in the data. When DNA seq is given, it assumes # the first nucleotide of each sequence in the file corresponds to the # 1st position of a codon. Then number of singletons for each of the # three codon positions are calculated. Obviously, the three # positions are meaningles when amino acid sequences are given (just # use the total number for AA). # At first, it will print out the locations of singletons: # e.g.: # seq1 3:3:C # seq2 8:2:G 16:1:C* # seq3 5:2:C 16:1:G* # The first column is the name of the sequence, and 2nd to the last # columns indicate the location of singletons in this sequence. The # location is formatted as x:y:z, where x is the location of the site # in the sequence, y is the position in the codon (1, 2, or 3), and z # is the nucleotide which corrsponds to the singleton. When "*" is # seen after a location info, one should be careful with the info. # For example, consider a site of 6 sequences. If the nucleotide of 6 # seqs at a site is {-, A, G, C, -, -} (1st, 5th, 6th are gaps), # number of singletons in this site is 2 (=3-1), NOT 3. But we do not # know which of two are the singletons. So the script arbitrary drop # one of them (say A), and G, and C are considered as the singletons. # In this case, "*" are attached to C and G to indicate this kind of # situation. A base insertion seen only on one sequence is considered # as uninformative and ignored. Also a site where all sequences have a gap, # is ignored. # If the names of sequences are delimitted by "|" in the name line (which # is indicated by ">" in the beginnning of the line), the name will be # trancated to include only the first part. # At the end, the script prints out the final counts. use Getopt::Std; getopts('ht:s:') || die "$usage\n"; if (defined($opt_h)) { die "$usage\n"; } my %aminoAcid; my @dnaSeq, @seqNameArray; my $maxSeqLen; die "$usage\n" if (@ARGV > 1); @ARGV = ('-') unless @ARGV; # take STDIN when no arg. my $dnaFile = shift @ARGV; # initialize the hash %aminoAcid if (defined($opt_t)) { # -t genCodeTbl was given open (CODE_TBL, "<$opt_t") || die "ERROR: Can't open $opt_t\n"; InitGenCode(*CODE_TBL); close (CODE_TBL); } else { # use the default standard table supplied at the end InitGenCode(*DATA); } # for debugging #while (($k, $v) = each %aminoAcid) { print "**$k** => --$v--\n";}; # initialize the @dnaSeq, and @seqNameArray my $numSeq = ReadInDNA($dnaFile); # for debugging # while (($k, $v) = each %aaSeq) { print "**$k** => ++$v++\n"; }; ProcessDna($numSeq, $maxSeqLen); exit (0); # Initialize the hashTbl, %aminoAcid{codon}, # by reading in the FH given as the argument sub InitGenCode { # take a typeglob of FILEHANDLE as an argument, e.g. *INFILE local *FH = shift; my $type; my @aa, @b1, @b2, @b3; my $i; my $codon; while () { chomp; s/\s+$//; if (/^\s*(.*)\s*=/) { # extract the type of the line $type = $1; } else { next; } s/^\s*(.*)=\s*//; # get rid of characters before "=" if ($type =~ /AAs/) { @aa = split (//); } elsif ($type =~ /Base1/) { @b1 = split (//); } elsif ($type =~ /Base2/) { @b2 = split (//); } elsif ($type =~ /Base3/) { @b3 = split (//); } else { next; } } if (@aa + @b1 + @b2 + @b3 != 64 * 4) { # checking the length of arrays die "ERROR, Please check the genetic code table is well formatted\n"; } # making a hash table, %aminoAcid, Note all upper cases are used for ($i = 0; $i < 64; $i++) { $codon = uc ($b1[$i] . $b2[$i] . $b3[$i] ); $aminoAcid{$codon} = uc $aa[$i]; } } # this function read in dna data and store it in @dnaSeq and @seqNameArray # It also initialize the $maxSeqLen. # It returns number of sequences. sub ReadInDNA { # takes an arg; name of a file from which data are read my $infile = shift; my @line; my $seqCntr = 0; my $seqName; open (INFILE, $infile) || die "Can't open $infile\n"; $maxSeqLen = 0; while () { chomp; if (/^>/) { # name line in fasta format s/^>\s*//; @line = split (/\|/); # note it takes only the name before | $seqNameArray[$seqCntr] = $line[0]; $seqNameArray[$seqCntr] =~ s/\s+$//; $maxSeqLen = larger($maxSeqLen, charLen($dnaSeq[$seqCntr-1])) if ($seqCntr > 0); $seqCntr++; } else { s/^\s+//; s/\s+$//; next if (/^$/); # skip empty line $dnaSeq[$seqCntr-1] = $dnaSeq[$seqCntr-1] . uc( $_ ); } } # need this for the last sequence $maxSeqLen = larger($maxSeqLen, charLen($dnaSeq[$seqCntr-1])); close(INFILE); return ($seqCntr); } # count the number of characters in a string sub charLen { my $string = shift; my @charString = split (//, $string); return scalar(@charString); } # this function take two scalars and return the larger value sub larger { my ($a, $b) = @_; return (($a > $b) ? $a : $b); } # count the number of singletons and print it out sub ProcessDna { my ($numSeq, $seqLen) = @_; my $site; my $base; my @baseCntr; my @singletonSeqNum; @singletonCntr = (0,0,0,0); for ($site = 0; $site < $seqLen; $site++) { my $warnFlag = 0; @baseCntr = (0,0,0,0,0); @singletonSeqNum = (-1,-1,-1,-1,-1); for ($i = 0; $i < $numSeq; $i++) { $base = substr($dnaSeq[$i], $site, 1); $base = '-' if ($base eq ''); # end of seq reached. $base =~ tr/-AGCT/-/c; # change unrecognized character to '-' $base =~ tr/-AGCT/01234/; # code the base to number $baseCntr[$base] ++; $singletonSeqNum[$base] = $i; } # ignore when single insertion, or all seq has a gap. next if ($numSeq - $baseCntr[0] < 2); # When the site contains 0 or 1 counts for all of the nucleotide # (i.e., A, G, C, and T), one of them shouldn't be considered as # singleton. # E.g.) When counts are A:G:C:T = 0:1:1:1, # number of singletons should be 3 - 1 = 2. So I'm setting the # one of the count (in this case count for G) to be 0. if ($baseCntr[1] < 2 && $baseCntr[2] < 2 && $baseCntr[3] < 2 && $baseCntr[4] < 2) { $warnFlag = 1; for ($i = 1; $i < 5; $i++) { $baseCntr[$i] = 0 if ($baseCntr[$i] == 1); last; } } $singletonCntr[3]++; # this element counts the num of informative sites for ($i = 1; $i < 5; $i++) { next if ($baseCntr[$i] != 1); # found a singleton when reached here $singletonCntr[$site % 3]++; my $tmpBase = $i; $tmpBase =~ tr/1234/AGCT/; $singletonLoc = $singletonLoc . ", " . $singletonSeqNum[$i] . ":" . $site . ":" . $tmpBase; if ($warnFlag == 1) { $singletonLoc = $singletonLoc . "*"; } } } $singletonLoc =~ s/^, //; PrintSingletonLoc($singletonLoc); print "\n\nNumber of singletons in $numSeq sequences, $seqLen bases\n"; print "Number of informative sites " . "(insertion in single seq or gap is excluded)" . ": $singletonCntr[3]\n\n"; my $tot = $singletonCntr[0] + $singletonCntr[1] + $singletonCntr[2]; print "1stCodon: $singletonCntr[0], 2ndCodon: $singletonCntr[1], " . "3rdCodon: $singletonCntr[2], total: $tot\n\n"; } sub PrintSingletonLoc { my $locString = shift; my @locArray = split (/, /, $locString); my @eachRecord; my $oldSeqNum = -1; @locArray = sortByColumn (0, "a", @locArray); print "Location of all singletons:\n"; print "NameOfSeq\tsiteNumber:codonPosition:nucleotide ...\n"; foreach my $item (@locArray) { @eachRecord = split (/:/, $item); $eachRecord[1]++; # increment the site number because $eachRecord[1] # is originally 0-offset. my $codonPos = ($eachRecord[1] - 1) % 3 + 1; if ($eachRecord[0] == $oldSeqNum) { print "\t$eachRecord[1]:$codonPos:$eachRecord[2]"; } else { print "\n$seqNameArray[$eachRecord[0]]\t" . "$eachRecord[1]:$codonPos:$eachRecord[2]"; $oldSeqNum = $eachRecord[0]; } } print "\n"; } sub sortByColumn { # numerical sort by a column, return an array # sortbyColumn ($col_num, $order, @record) # @record is an array with each element representing a space delimited record # example # ("473 p1 S 0:06 -bash", "541 p2 SW 0:00 ps-a", ....) # $col_num -- the column by which the record is sorted by (left-most col is 0) # $order can be "a" (ascending) or "d" (descending), # sort column can be hyphnated numbers (e.g. 10-4-150) local $col_num = shift(@_); local $order = shift(@_); local @record = @_ ; local ($sortCol); ## test if the sort column is hyphnated or plain number local $sortMethod = "number"; foreach $sortCol (@record) { if ( (split(/\s+/,$sortCol))[$col_num] =~ /\d+-\d+/ ) { $sortMethod = "hyphnated"; last ; } } return sort $sortMethod @record; ## two sub-sub-routines sub number { # $col_num, $order are the given arguments # the left-most column is 0 local $first = (split(/\s+/, $a))[$col_num]; local $second = (split(/\s+/, $b))[$col_num]; # argument errors not well trapped here ($first,$second) = ($second, $first) if ($order eq "d"); return ($first <=> $second); } #probably I don't need the "sub number" sub hyphnated { # $col_num, $order are the given arguments local ($each_number, $cmp_result, @temp_swap); ## separte the hyphnated numbers and put them in the following arrays local @first = split(/-/, ((split(/\s+/, $a))[$col_num])); local @second = split(/-/, ((split(/\s+/, $b))[$col_num])); ## ascending (default) or descending order if ($order eq "d") { @temp_swap = @first; @first = @second; @second = @temp_swap; } ## comparison of array elements for ($each_number = 0; $each_number <= (($#first < $#second) ? $#first : $#second) ; $each_number++) { $cmp_result = ($first[$each_number] <=> $second[$each_number]); last if ($cmp_result); } ## if the size of two arrays differ if ( ($cmp_result == 0) && ($#first != $#second) ) { return (($#first < $#second) ? -1 : 1); } else { return $cmp_result; } } } # The Standard Code (transl_table=1) from NCBI # http://www3.ncbi.nlm.nih.gov/htbin-post/Taxonomy/wprintgc?mode=c#SG1 __DATA__ AAs = FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG Starts = ---M---------------M---------------M---------------------------- Base1 = TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG Base2 = TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG Base3 = TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG