#!/usr/bin/perl # Gives FASTA seq as an argument or STDIN. # Calculate the pairwise differences my $usage="Usage: $0 [-ch] [alignedDNASeq]\n" . " -c: eliminate codons which include one or more gaps\n" . " This option can be combined with -e\n" . " -e: eliminate base sites where gaps are observed in at least one seq\n". " Gap sites are eliminated pair-wise without this option\n" . " following options: not implemented\n" . " -s: use a file which list names of sequences to be used in the analysis\n". " -n: do not print seq names\n" . " -f: output format, currently only phylip, PAML style if not specfied"; my $sep = "\t"; use Getopt::Std; getopts('f:hces:n') || die "$usage\n"; if (defined($opt_h)) { die "$usage\n"; } die "$usage\n" if (@ARGV > 1); @ARGV = ('-') unless @ARGV; my $dnaFile = shift; # $mode is set "nucleotide" or "protein" # read in the data and store them in @seqName and @seqDat my @dat = ReadInFASTA($dnaFile); if (defined($opt_c)) { @dat = CleanSeqs(@dat); } my @seqDat = GetSeqDat(@dat); #print join "\n", @seqDat, "\n"; my %composi1 =(); my %composi2 =(); my %composi3 =(); for my $s (@seqDat) { my @entry = split //, $s; for my $i (0..$#entry) { $posi = $i % 3; $residue = $entry[$i]; if ($posi == 0) { $composi1{$residue}++; } elsif ($posi == 1) { $composi2{$residue}++; } else { $composi3{$residue}++; } } } print "Position 1:\t"; my $total = TotalCnt(%composi1); for $key (sort(keys(%composi1))) { print "$key: $composi1{$key}\t"; } print "total: $total\nPosition 2:\t"; my $total = TotalCnt(%composi2); for $key (sort(keys(%composi2))) { print "$key: $composi2{$key}\t"; } print "total: $total\nPosition 3:\t"; my $total = TotalCnt(%composi3); for $key (sort(keys(%composi3))) { print "$key: $composi3{$key}\t"; } print "total: $total\n"; # for debugging # while (($k, $v) = each %aaSeq) { print "**$k** => ++$v++\n"; }; # foreach $k (keys %synoDiff) # {print "**$k** => $synoDiff{$k} $nonSynoDiff{$k}\n";}; exit (0); sub TotalCnt { my %hash = @_; $sum = 0; for my $key (keys(%hash)) { $sum += $hash{$key}; } return $sum; } # takes an arg; name of a file from which data are read Then read in # the data and make an array. Each element of this array corresponds # to a sequence, name tab data. sub ReadInFASTA { my $infile = shift; my @line; my $i = -1; my @result = (); my @seqName = (); my @seqDat = (); open (INFILE, "<$infile") || die "Can't open $infile\n"; while () { chomp; if (/^>/) { # name line in fasta format $i++; s/^>\s*//; s/^\s+//; s/\s+$//; $seqName[$i] = $_; $seqDat[$i] = ""; } else { s/^\s+//; s/\s+$//; s/\s+//g; # get rid of any spaces next if (/^$/); # skip empty line s/[uU]/T/g; # change U to T $seqDat[$i] = $seqDat[$i] . uc($_); } # checking no occurence of internal separator $sep. die ("ERROR: \"$sep\" is an internal separator. Line $. of " . "the input FASTA file contains this charcter. Make sure this " . "separator character is not used in your data file or modify " . "variable \$sep in this script to some other character.\n") if (/$sep/); } close(INFILE); foreach my $i (0..$#seqName) { $result[$i] = $seqName[$i] . $sep . $seqDat[$i]; } return (@result); } # takes multiple string; each string represent a sequence (name tab seq). # Get rid of the sites where at least one seq has a gap (or ambiguious base) # It returns an array of sequences. sub CleanSeqs { my @dat = @_; my @seqDat = GetSeqDat(@dat); my @seqName = GetSeqName(@dat); my $minLength = CharLen($seqDat[0]); my ($i, $seqNum); my @noGapSites = (); my @result = (); my @codonMat = (); # get rid of codons with gaps foreach $i (@seqDat) { my @tmpArray = (defined($opt_c)) ? MkTripletArray($i) : split(//, $i); push @codonMat, [ @tmpArray ]; $minLength = @tmpArray if (@tmpArray < $minLength); # find min length } # identify the sites without any gaps. for $i (0 .. $minLength - 1) { my $gap = 0; if (defined($opt_c)) { for $seqNum (0 .. $#seqDat) { if ($codonMat[$seqNum][$i] !~ /^[ACGT]{3}$/) { $gap = 1; last; } } } elsif ($mode eq "nucleotide") { for $seqNum (0 .. $#seqDat) { if ($codonMat[$seqNum][$i] !~ /^[ACGT]{1}$/) { $gap = 1; last; } } } else { # protein for $seqNum (0 .. $#seqDat) { if ($codonMat[$seqNum][$i] =~ /^[\-\?]{1}$/) { $gap = 1; last; } } } push (@noGapSites, $i) if ($gap == 0); } # select the sites without any gaps for $seq (0 .. $#seqDat) { my @oneSeq = (); foreach $i (@noGapSites) { push @oneSeq, $codonMat[$seq][$i]; } my $newSeq = join "", @oneSeq; push @result, $seqName[$seq] . $sep . $newSeq; } return @result; } sub GetSeqDat { my @data = @_; my @line; my @result = (); foreach my $i (@data) { @line = split (/$sep/, $i); push @result, $line[1]; } return (@result) } sub GetSeqName { my @data = @_; my @line; my @result = (); foreach my $i (@data) { @line = split (/$sep/, $i); push @result, $line[0]; } return (@result) } # returns an array with the 1st argument repeated n times (2nd arg) sub Repeat { my ($val, $n) = @_; my @result = (); foreach my $i (0..($n-1)) { push @result, $val; } return @result; } # this function take two scalars and return the larger value sub Smaller { my ($a, $b) = @_; return (($a < $b) ? $a : $b); } sub Min { my $min = shift; foreach my $i (@_) { $min = ($i < $min) ? $i : $min; } return $min; } sub Sum { my $sum = 0; foreach my $i (@_) { $sum += $i; } return $sum; } # count the number of characters in a string sub CharLen { my $string = shift; my @charString = split (//, $string); return scalar(@charString); } # for a given string, it separates into triplets, and return # the resulting array. # if the last element is less than a triplet, it will be removed sub MkTripletArray { my $seq = shift; $seq =~ s/\s+//g; $seq =~ s/(.{3})/$1 /g; $seq =~ s/\s+$//; my @result = split(/ /, $seq); pop @result unless ($result[$#result] =~ /.{3}/); return @result; } # When one of the codons corresponds to termination, (-1, -1) is returned. # If two codons are identical, (0,0) is returned even if the codon # correspond to termination. # take a list as the argument and extract the unique elements. # The order of elements will not be preserved. sub ExtractUnique { my %seen=(); my @unique = (); foreach my $item (@_) { push (@unique, $item) unless $seen{$item}++; } return @unique; }