#!/usr/bin/perl -w my $usage = "\nUsage: $0 [-hg] [-w windowSize,incr] [sitesInputFile]\n\n". " -g: By default, sites with any gaps are ignored, but this option will\n". " let SITES to include these sites with gaps in the analysis\n". "Runs SITES polymorphism analysis, and it prints outs the output in\n". "tab delimited format.\n\n". "- Sites with any gaps are removed from analysis, and sites with\n". " multiple hits (more than 2 bases) are removed, i.e. -Sxz is specified.\n". "- Run sites with five different types of site choice:\n". " All site types (a), silent base changes (is), noncoding base changes (i)\n". " synonymous coding sites (s), non-synonymous/replacement coding sites (r).\n". "- Comparison within groups (-Og) is specified\n" . "- Note that piPerBp and thetaPerBp is different from per site measure\n". " pi and theta per gene is divided by the number of all kinds of sites\n". " used in the analysis. Hence, for site types other than all (a), you need to\n". " divide pi or theta by appropriate number of sistes (e.g. synSites, \n". " nonSynSites, nonCodingSites column)\n\n". "- Each analysis of windowing analysis contains total base of windowSize.\n". " The sites with any gaps are removed and not counted in the windowSize.\n". "- Even if you specify windowSize of 200, actual number of bp analyzed\n". " may be slightly smaller if there are sites with multiple hits.\n". "- With analysis other than site type (a), windowSize refer to the\n". " total number of site, not the number of site of given type \n". " (e.g. synonymous)\n". "- increments (incr) is after removing the gap sites. Say that there\n". " is a gap between 200-300, and incr of 5 is specified.\n". " If it calculate the polymorphism centered around 198, then the next\n". " calculation is centered around 304 ". " (because 198, 199, gap --- gap, 301, 302, 303, 304)\n"; use POSIX qw (tmpnam); use IO::File; my $debug = 0; our ($opt_h, $opt_w, $opt_g, $opt_d); use Getopt::Std; getopts('hgdw:') || die "$usage\n"; if (defined($opt_h)) { die "$usage\n"; } die "$usage\n" if (@ARGV > 1); @ARGV = ('-') unless @ARGV; # take STDIN when no arg. my $sitesInFile = shift @ARGV; my $tmpOut = '/tmp/ooojjjj'; my $fh; do {$tmpOut = tmpnam()} until $fh = IO::File->new("$tmpOut.SIT", O_RDWR|O_CREAT|O_EXCL); if (defined($opt_d)) { # for debugging warn "INFO: Temporary SITES output is $tmpOut.SIT, for debugging\n"; } $fh -> close; system("touch $tmpOut.SIT"); # run polymorphism analysis (-Ap) # -Og for comparisons within groups my %sitesDat = ReadInSites($sitesInFile); if ($debug) { my @key = qw(comment numSeq alignedLen frame numNonCodingStretches numGroups); foreach my $k (@key) { print "$k => $sitesDat{$k}\n"; } print "range = "; foreach my $k (0..($sitesDat{numNonCodingStretches}-1)) { #print join ("-", @{${$sitesDat{'ncRange'}}[$k]}), "\n"; print join ("-", @{$sitesDat{'ncRange'}->[$k]}), "\n"; print $sitesDat{'ncRange'}->[$k]->[0], "-", $sitesDat{'ncRange'}->[$k]->[1],"\n"; } print "groups = ", join (":", @{$sitesDat{'groupNames'}}), "\n"; print "grpSizes = ", join (":", @{$sitesDat{'groupSizes'}}), "\n"; print "allGapSites = ", join (":", @{$sitesDat{'allGapSites'}}), "\n"; print "anyGapSites = ", join (":", @{$sitesDat{'anyGapSites'}}), "\n"; my @nnn = @{$sitesDat{'seqNames'}}; my @seqDatMat = @{$sitesDat{'seqMat'}}; foreach my $i (0..$#nnn) { my $len = $sitesDat{'alignedLen'}; print "$nnn[$i] = ", join ("", @{$seqDatMat[$i]}[0..19], "...", @{$seqDatMat[$i]}[($len-10)..($len-1)]), "\n"; } } if (defined($opt_w)) { my ($winSize, $incr); if ($opt_w =~ /^\s*(\d+)\s*,\s*(\d+)\s*/) { $winSize = $1; $incr = $2; } else { die "$usage\n\nSpecify the window size and increments for -w\n"; } my @windowRes = RunWindowAnalysis($sitesInFile, $winSize, $incr); print join("\n", @windowRes), "\n"; exit; } PrintSummary($sitesInFile); if (!defined($opt_d)) { unlink ("$tmpOut.SIT") || die "Can't unlink $tmpOut.SIT\n"; } exit; sub PrintSummary { my $sitesInFile = shift; my $sOpt = (defined($opt_g)) ? "" : "x"; # all site -Sa system("sites -I$sitesInFile -R$tmpOut -M \"Analysis\" -Sza$sOpt -Aa -Ogs > /dev/null"); my %result = ReadSitesOut("$tmpOut.SIT"); CheckSitesInfo(%result); #PrintSitesInfoVert("header",\%result); my $outputString = PrintSitesInfo("header",\%result); $outputString = "mutType\t" . $outputString; $outputString =~ s/\n/\nall\t/; # silent base changes -Sis system("sites -I$sitesInFile -R$tmpOut -M \"Analysis\" -Szis$sOpt -Aa -Ogs > /dev/null"); %result = ReadSitesOut("$tmpOut.SIT"); CheckSitesInfo(%result); $outputString .= "\nsilent\t" . PrintSitesInfo("noHeader",\%result); # noncoding base changes -Si system("sites -I$sitesInFile -R$tmpOut -M \"Analysis\" -Szi$sOpt -Aa -Ogs > /dev/null"); %result = ReadSitesOut("$tmpOut.SIT"); CheckSitesInfo(%result); $outputString .= "\nnoncoding\t" . PrintSitesInfo("noHeader",\%result); # synonymous base changes -Ss system("sites -I$sitesInFile -R$tmpOut -M \"Analysis\" -Szs$sOpt -Aa -Ogs > /dev/null"); %result = ReadSitesOut("$tmpOut.SIT"); CheckSitesInfo(%result); $outputString .= "\nsyn\t" . PrintSitesInfo("noHeader",\%result); # non-synonymous base changes -Sr system("sites -I$sitesInFile -R$tmpOut -M \"Analysis\" -Szr$sOpt -Aa -Ogs > /dev/null"); %result = ReadSitesOut("$tmpOut.SIT"); CheckSitesInfo(%result); $outputString .= "\nnonsyn\t" . PrintSitesInfo("noHeader",\%result); print "$outputString\n"; } sub RunWindowAnalysis { my ($sitesInFile, $window, $incr) = @_; my %sitesIn = ReadInSites($sitesInFile); my $alnLen = $sitesIn{'alignedLen'}; my @gapSites = @{$sitesIn{'anyGapSites'}}; my @allSites = 1..$alnLen; my @sitesUsed = InANotInB(\@allSites, \@gapSites); my $frontOffset = MyRound(($window - 1) /2); my $backOffset = $window - 1 - $frontOffset; my @siteSelection = ('a', 'is', 'i', 's', 'r'); # correspond to all, silent, non-coding, syn, non-syn my @statType = qw(pi thetaW tajimaD fuliDnoOG fuliD); my $index; my @allResults = (); my $sOpt = (defined($opt_g)) ? "" : "x"; for ($index = $frontOffset; $index < @sitesUsed-1-$backOffset; $index += $incr) { my $begin = $sitesUsed[$index-$frontOffset]; my $end = $sitesUsed[$index+$backOffset]; my $middle = $sitesUsed[$index]; $thisResult = "$middle\t$begin\t$end"; foreach my $siteSel (@siteSelection) { system("sites -I$sitesInFile -R$tmpOut -M \"Analysis\" -Sz$sOpt$siteSel -Aa -Ogsm -Y${begin}-${end} > /dev/null"); my %result = ReadSitesOut("$tmpOut.SIT"); CheckSitesInfo(%result); # get the number of sites in each category once if ($siteSel eq $siteSelection[0]) { my $countSilentSites = $result{'synSites'} + $result{'nonCodingSites'}; $thisResult .= "\t" . join("\t",($result{'analyzedSeqLen'}, $countSilentSites, $result{'nonCodingSites'}, $result{'synSites'}, $result{'nonSynSites'})); } foreach my $ss (@statType) { $thisResult .= "\t$result{$ss}"; } } push @allResults, $thisResult; } my $header = "siteMedian\tsiteBegin\tsiteEnd"; $header .= "\tallSites\tsilentSites\tnonCodingSites\tsynSites\tnonSynSites"; foreach my $ttt (@siteSelection) { foreach my $sss (@statType) { $header .= "\t$sss.$ttt"; } } unshift @allResults, $header; return @allResults; } sub CheckSitesInfo { my %info = @_; my @segSites = split /\t/, $info{'segSites'}; my @totalMu = split /\t/, $info{'totalMu'}; my $diff = 0; for my $i (0..$#totalMu) { if ($segSites[$i] ne $totalMu[$i] && $segSites[$i] ne 'NA') { $diff = 1; last; } } if ($diff == 1) { warn "CHECK $info{'segSites'} END\n"; warn "WARN: SegSites ($info{'segSites'}) and totalMu ($info{'totalMu'}) differ\n"; } } # Returns a information hash with the following keys: # numExcludedIndels number of sites excluded due to indels # numExcludedMultiBases number of sites excluded because there are > 2 bases # numSeq total number of sequences in the analysis (ignoring grouping). # alignedLen aligned sequence length in the entire data set # synSites average number of synonymous sites used in the analysis # nonSynSites average number of non-synonymous sites used in the analysis # nonCodingSites analyzedSeqLen - synSites - nonSynSites # numGroups number of subgroups # names names of groups # divergence average pairwise differences between groups # netDivergence net average pairwise differences between groups # fixedDiff fixed difference between groups # Fst # Nm population migration rates assuming diploidy # tajimaD # numSeqs number of sequences in each subgroup # fuliD # fuliDnoOG # totalMu number of segregating sites # derivedSingleton number of derived singletons (only with outgroup) # singleton number of singletons # analyzedSeqLen sequence lengths used for analysis of each group # segSites number of segregating sites, should be same as totalMu # thetaW watterson's theta estimate per gene # thetaWPerBp watterson's theta estimate per base pair (not per site) # pi average pairwise differences per gene # piPerBp average pairwise differences per base pair (not per site) sub ReadSitesOut { my $sitesFileName = shift; open SITESOUT, "<$sitesFileName" || die "Can't open $sitesFileName\n" ; my $state = 0; my %info=(); my $cntr = 0; if (defined ($opt_g)) { # INDELS section doesn't exist $state = 2; $info{'numExcludedIndels'} = 0; } # note theta is not estimated for the group with only 1 sequence. my @singleSeqPerGroupIndex = (); { my $index = 0; foreach my $grpSize (@{$sitesDat{'groupSizes'}}) { if ($grpSize <2) { push @singleSeqPerGroupIndex, $index; } $index++; } } while (){ chomp; if ($state == 0 && /^\*\*\*POSITIONS WITHIN INDELS WERE EXCLUDED/) { $state++; next; } if ($state == 1) { if (/NUMBER OF POSITIONS EXCLUDED\s*:\s*(\d+)\s*/) { $info{'numExcludedIndels'} = $1; $state++; } next; } if ($state == 2 && /^\*\*\*POSITIONS WITH MORE THAN TWO BASE VALUES EXCLUDED/) { $state++; next; } if ($state == 3) { if (/NUMBER OF POSITIONS EXCLUDED\s*:\s*(\d+)\s*/) { $info{'numExcludedMultiBases'} = $1; $state++; } next; } if ($state == 4 && /^NUMBER OF SEQUENCES\s*:\s*(\d+)\s+LENGTH OF SEQUENCES\s*:\s*(\d+)/) { $info{'numSeq'} = $1; $info{'alignedLen'} = $2; $state++; next; } if ($state == 5) { if (/^\s*AVERAGE NUMBER OF SYN. SITES\s*:\s*([\d\.]+)\s+REPLACEMENT SITES\s*:\s*([\d\.]+)\s*/) { $info{'synSites'} = $1; $info{'nonSynSites'} = $2; $state++; } next; } if ($state == 6 && /^GROUP DIFFERENCES PER BASE PAIR/) { $state++; next; } if ($state == 7) { next if (/^\s+- above and on the diagonal/); next if (/^\s+- below the diagonal/); if (/^(\s+\d+)+\s*$/) { s/^\s+//; s/\s+$//; my @groups = split /\s+/; if ($groups[$#groups] != @groups) { warn "WARN: number of groups ($groups[$#groups]) may not be estimated correctly\n"; } $info{'numGroups'} = $groups[$#groups]; $state++; $cntr = 0; # set the counter to 0 } next; } # extracting the divergence and net-divergence if ($state == 8) { if ($info{'numGroups'} < 2) { $info{'divergence'} = 'NA'; $info{'netDivergence'} = 'NA'; $info{'names'} = 'NA'; $state++; next; } next if (/^\s*-+\s*$/); s/\|//g; s/^\s*\d+:\s*//; s/\s+$//; my @line = split /\s+/; my $grName = shift @line; if (defined ($info{'names'})) { $info{'names'} .= "\t$grName"; } else { $info{'names'} = $grName; } # extract above diagonal: ave pw diff per base pair foreach my $colIndex (($cntr+1)..($info{'numGroups'}-1)) { if (defined($info{'divergence'})) { # : separated list $info{'divergence'} .= "\t$line[$colIndex]"; } else { $info{'divergence'} = "$line[$colIndex]"; } } #extract below diagnonal: net pw diff per base pair if ($cntr>0) { foreach my $colIndex (0..($cntr-1)) { if (defined($info{'netDivergence'})) { # tab separated list $info{'netDivergence'} .= "\t$line[$colIndex]"; } else { $info{'netDivergence'} = "$line[$colIndex]"; } } } #print "$info{'names'} &&& $info{'divergence'} &&& $info{'netDivergence'}\n"; #debug if ($cntr < $info{'numGroups'}-1) { $cntr++; } else { $state++; # get out of this extraction loop } next; } if ($state == 9) { if ($info{'numGroups'} < 2) { $info{'fixedDiff'} = 'NA'; $state += 2; #FIXED DIFFERENCES section doesn't exist next; } if (/^FIXED DIFFERENCES/) { $state++; $cntr = 0; } next; } # Fixed differences if($state == 10) { next if (/^[\s-]+$/); next if (/^(\s*\d+)+\s*$/); s/^\s*\d+:\s*//; s/\s+$//; my @line = split /\s+/; my $grName = shift @line; # extract above diagonal: ave pw diff per base pair foreach my $colIndex (($cntr+1)..($info{'numGroups'}-1)) { if (defined($info{'fixedDiff'})) { # : separated list $info{'fixedDiff'} .= "\t$line[$colIndex]"; } else { $info{'fixedDiff'} = "$line[$colIndex]"; } } if ($cntr < $info{'numGroups'} -1) { $cntr++; } else { $state++; # get out of this extraction loop } next; } if ($state == 11) { if ($info{'numGroups'} < 2) { $info{'Fst'} = 'NA'; $info{'Nm'} = 'NA'; $state += 2; # skip the next section next; } if(/Fst AND POPULATION MIGRATION RATES/) { $state++; $cntr = 0; next; } } # Fst and nm if ($state == 12) { next if (/^\s*- above the diagonal/); next if (/^\s*- below the diagonal/); next if (/^(\s*\d+)+\s*$/); next if (/^[\s-]+$/); s/\|//g; s/^\s*\d+:\s*//; s/\s+$//; my @line = split /\s+/; my $grName = shift @line; # extract above diagonal: ave pw diff per base pair foreach my $colIndex (($cntr)..($info{'numGroups'}-2)) { if (defined($info{'Nm'})) { # : separated list $info{'Nm'} .= "\t$line[$colIndex]"; } else { $info{'Nm'} = "$line[$colIndex]"; } } #extract below diagnonal: net pw diff per base pair if ($cntr > 0) { foreach my $colIndex (0..$cntr-1) { if (defined($info{'Fst'})) { # tab separated list $info{'Fst'} .= "\t$line[$colIndex]"; } else { $info{'Fst'} = "$line[$colIndex]"; } } } if ($cntr < $info{'numGroups'}-1) { $cntr++; } else { $state++; # get out of this extraction loop } next; } ### extract D statistics if ($state == 13 && /^D STATISTICS/) { $state++; next; } if ($state == 14 && /^[-\s]+$/) { $state++; $cntr=0; next; } if ($state == 15) { if ($cntr < $info{'numGroups'}) { s/no OutGp/ NA /g; s/undef/ NA /g; s/low ss/ lowSS /g; my @line = split /\s+/; if (@line != 8) { die "Extracting D statistics, and there are " . scalar(@line) . " columns, but it should be 8\n"; } if (defined($info{'tajimaD'})) { $info{'numSeqs'} .= "\t$line[1]"; $info{'tajimaD'} .= "\t$line[2]"; $info{'fuliD'} .= "\t$line[3]"; $info{'fuliDnoOG'} .= "\t$line[4]"; $info{'totalMu'} .= "\t$line[5]"; $info{'derivedSingleton'} .= "\t$line[6]"; $info{'singleton'} .= "\t$line[7]"; } else { $info{'numSeqs'} = $line[1]; $info{'tajimaD'} = $line[2]; $info{'fuliD'} = $line[3]; $info{'fuliDnoOG'} = $line[4]; $info{'totalMu'} = $line[5]; $info{'derivedSingleton'} = $line[6]; $info{'singleton'} = $line[7]; } $cntr++; } else { $state++; } } ### extract theta if ($state == 16 && /^THETA \(4Nu\) ESTIMATES/) { $state++; next; } if ($state == 17 && /^[-\s]+$/) { $state++; $cntr=0; next; } if ($state == 18) { # If there is only 1 seq per group, sites do not print out the # theta line corresponding to this group if ($cntr < $info{'numGroups'} - @singleSeqPerGroupIndex) { my @line = split /\s+/; if (@line != 8) { die "Extracting theta statistics, and there are " . scalar(@line) . " columns, but it should be 8\n"; } if (defined($info{'thetaW'})) { $info{'analyzedSeqLen'} .= "\t$line[2]"; $info{'segSites'} .= "\t$line[3]"; $info{'thetaW'} .= "\t$line[4]"; $info{'thetaWPerBp'} .= "\t$line[5]"; $info{'pi'} .= "\t$line[6]"; $info{'piPerBp'} .= "\t$line[7]"; } else { $info{'analyzedSeqLen'} .= "$line[2]"; $info{'segSites'} .= "$line[3]"; $info{'thetaW'} .= "$line[4]"; $info{'thetaWPerBp'} .= "$line[5]"; $info{'pi'} .= "$line[6]"; $info{'piPerBp'} = "$line[7]"; } $cntr++; } else { $state++; } } if ($state == 19) { if (@singleSeqPerGroupIndex > 0) { # Fixing the missing theta by inserting NA foreach my $kwd (qw(analyzedSeqLen segSites thetaW thetaWPerBp pi piPerBp)) { my @values = split /\t/, $info{$kwd}; foreach my $insIndex (@singleSeqPerGroupIndex) { splice @values, $insIndex, 0, 'NA'; } $info{$kwd} = join "\t", @values; } } last; } } close(SITESOUT); if ($state != 19) { die "ERROR: Reading in the sites output, but couldn't find all info\n". "The last state was $state. Check the output of sites: $sitesFileName\n"; } # calc dN and dS my @lenVect = split /\t/, $info{'analyzedSeqLen'}; my @ncSitesVect = (); foreach my $lll (@lenVect) { # note that when there is a single seq per group, theta # section doesn't get printed for the group. So # analyzedSeqLen is not known. if ($lll eq 'NA') { push @ncSitesVect, 'NA'; } else { push @ncSitesVect, $lll - $info{'synSites'} - $info{'nonSynSites'}; } } $info{'nonCodingSites'} = join "\t", @ncSitesVect; # change the order of information extracted from below diagonal matrix. if ($info{'numGroups'} > 2) { foreach my $k (('Fst','netDivergence')) { my @vals = split /\t/, $info{$k}; my @rr=TransposeLowerToUpperTriangleFlat($info{'numGroups'},\@vals); $info{$k} = join "\t", @rr; } } return %info; } sub PrintSitesInfo { my ($header, $infoHashRef) =@_; my %info = %$infoHashRef; my @keyList = qw(names numSeqs piPerBp pi thetaWPerBp thetaW segSites singleton derivedSingleton tajimaD fuliDnoOG fuliD divergence netDivergence fixedDiff Fst Nm analyzedSeqLen synSites nonSynSites nonCodingSites alignedLen numExcludedIndels numExcludedMultiBases); #my @pairwiseKeyList = qw(divergence netDivergence fixedDiff Fst Nm); my $result = ""; if ($header ne 'noHeader') { $result .= join("\t",@keyList) . "\n"; } foreach my $kkk (0..$#keyList) { if (!defined($info{$keyList[$kkk]})) { warn "WARN: $keyList[$kkk] not found in information hash\n"; } my $out = $info{$keyList[$kkk]}; $out =~ s/\t/,/g; if ($kkk == 0) { $result .= "$out"; } else { $result .= "\t$out"; } } return $result; } sub PrintSitesInfoVert { my ($header, $infoHashRef) =@_; my %info = %$infoHashRef; my @keyList = qw(names numSeqs piPerBp pi thetaWPerBp thetaW segSites singleton derivedSingleton tajimaD fuliDnoOG fuliD divergence netDivergence fixedDiff Fst Nm analyzedSeqLen synSites nonSynSites nonCodingSites alignedLen numExcludedIndels numExcludedMultiBases); foreach my $kkk (0..$#keyList) { my $out = $info{$keyList[$kkk]}; $out =~ s/\t/,/g; if($header ne 'noHeader') { $out = "$keyList[$kkk]\t$out\n"; } print "$out"; } } # @array is a flat array containing (2.1, 3.1, 3.2, 4.1, 4.2, ,...) # from a square matrix # 1.1 1.2 1.3 1.4 1.5 1.6 # 2.1 2.2 2.3 2.4 2.5 2.6 # .... # This function reorder it to (2.1, 3.1, 4.1, 5.1, 3,2,...) sub TransposeLowerToUpperTriangleFlat { my ($numRow, $arrRef) = @_; my @vals = @$arrRef; my @mat = (); foreach my $row (1..($numRow-1)) { my @temp = splice @vals,0,$row; push @mat, \@temp; } my @result = (); foreach my $col (0..($numRow-2)) { foreach my $row ($col..($numRow-2)) { push @result, $mat[$row][$col]; } } return @result; } # take a file name of SITES inputfile, read the file and return # hash with all info sub ReadInSites { my $file = shift; open IN, "<$file" || die "Can't open $file\n"; my %dat = (); $dat{'comment'} = ; # 1st line chomp $dat{'comment'}; # 2nd line my $line = ; ($dat{'numSeq'}, $dat{'alignedLen'}) = Get2Num ($line); # 3rd line $line = ; # frame is the position in the coding frame of the first coding base ($dat{'frame'}, $dat{'numNonCodingStretches'}) = Get2Num ($line); my @range = (); foreach my $i (1..$dat{'numNonCodingStretches'}) { $line = ; my @tmp = Get2Num($line); push @range, \@tmp; } $dat{'ncRange'} = \@range; $line = ; if ($line =~ /^\s*(\d+)\s*/) { $dat{'numGroups'} = $1; } else { die "ERROR: line $. of file $file should contain a single integer: $line\n"; } if ($dat{'numGroups'} > 1) { my @groupNames = (); my @groupSizes = (); foreach my $ggg (1..$dat{'numGroups'}) { $line = ; $line =~ s/^\s+//; $line =~ s/\s+$//; my @grpLine = split /\s+/, $line; if (@grpLine != 2) { die "ERROR: Reading in group sizes, this line doesn't have 2 columns: $line\n"; } push @groupNames, $grpLine[0]; push @groupSizes, $grpLine[1]; } $dat{'groupNames'} = \@groupNames; $dat{'groupSizes'} = \@groupSizes; } # readin sequence data my @rawdatArr = ; chomp(@rawdatArr); my $rawDat = join "", @rawdatArr; @rawdatArr = split //, $rawDat; my @seqName = (); my @seqData = (); # matrix for my $seqNum (0..($dat{'numSeq'}-1)) { my @bufferArr = splice @rawdatArr, 0, 10; # 10 char; my $thisName = join("", @bufferArr); $thisName =~ s/\s+//g; push @seqName, $thisName; my @seqArr = (); my $base; while (@seqArr < $dat{'alignedLen'} && ($base = shift @rawdatArr)) { push (@seqArr, $base) if ($base !~ /^\s$/); } if (@seqArr != $dat{'alignedLen'}) { die "ERROR: For $thisName, the number of bases (" . scalar(@seqArr) . ") is shorter than expected (" . $dat{'alignedLen'} . ")\n"; } push @seqData, \@seqArr; } my $residue = join "", @rawdatArr; if ($residue !~ /^\s*$/) { die "ERROR: after reading $dat{'numSeq'} sequences ($dat{'alignedLen'} bp each), there are some left-over:\n$residue\n"; } $dat{'seqNames'} = \@seqName; $dat{'seqMat'} = \@seqData; close (IN); my @allGapSites = GapSitesMat(\@seqData,"all"); my @anyGapSites = GapSitesMat(\@seqData,"any"); $dat{'allGapSites'} = \@allGapSites; $dat{'anyGapSites'} = \@anyGapSites; return %dat; } # take a string with two integers separated by space, and return an array of # 2 integer values. sub Get2Num { my $string = shift; my @result = (); if ($string =~ /^\s*(\d+)\s+(\d+)\s*$/) { @result = ($1, $2); } else { die "ERROR: this line should contain 2 integers: $string\n"; } return @result; } # take a reference to matrix (2-dim array) # 2nd arugment is "all" or "any" sub GapSitesMat { my ($matRef, $option) =@_; if ($option !~ /(all|any)/i) { die "ERROR: 2nd argument to GapSitesMat should be all or any, ". "not $option\n"; } my $numRow = @$matRef; return () if ($numRow == 0); my $numCol = @{$matRef->[0]}; return () if ($numCol == 0); my @result = (); foreach my $col (0..($numCol-1)) { my $anyGap = 0; my $allGap = 1; foreach my $row (0..($numRow - 1)) { my $b = $matRef->[$row]->[$col]; if ($b =~ /^-$/) { if ($option eq 'any') { $anyGap = 1; last; } } else { $allGap = 0; } } if ($option eq 'any') { if ($anyGap) { push @result, $col+1; } } else { if ($allGap) { push @result, $col+1; } } } return @result; } sub InANotInB { my ($aRef, $bRef) =@_; my %seen = (); my @aonly =(); foreach my $item (@$bRef) { $seen{$item} = 1}; foreach my $item (@$aRef) { push (@aonly, $item) unless $seen{$item}; } return (@aonly); } sub MyRound { my $num = shift; return int($num + 0.5 * ($num <=> 0)); }