#!/usr/bin/perl -w my $usage = "\nUsage: $0 [-phma] [-w windowSize] [-i interval] -l alignLen -n sampleSize [-r numberOfSimulatedSampling] [-f seqNameFile] fastaFiles\n" . " -h: help\n". " -m: memory conservation. By default, all sequence data get read into\n". " memory. But it is not possible with a lot of sequence data. With\n". " -m, the program accesses files when required without reading in.\n". " This slows down the process, but enables analysis with large set\n". " (Not implemented yet)\n". " -a: Nordborg Arabidopsis data contains multiple samples per population.\n". " This option specify to sample at most 1 sample per population.\n". " -f filename: read in a file, listing names of sequences to be used. 1 name per line\n". " -w: Size (in bp) of each window\n" . " -p: Slidling Window Pi analysis instead of Tajima's D\n". " -i: Increment (in bp) of sliding windows\n". " -l: Length of aligned sequences\n". " -n: number of sequences (samples) in data\n". " -r: How many re-sampling to be done?\n" ; use Bio::AlignIO; use Bio::Align::DNAStatistics; use File::Find; use File::Basename; use Getopt::Std; my %IUPAC = ('A'=>'A','C'=>'C','G'=>'G','T'=>'T','U'=>'T','-'=>'-','?'=>'?', 'M'=>'A C','R'=>'A G','W'=>'A T','S'=>'C G','Y'=>'C T','K'=>'G T', 'V'=>'A C G','H'=>'A C T','D'=>'A G T','B'=>'C G T', 'N'=>'A C G T'); our ($opt_h, $opt_a, $opt_f, $opt_m, $opt_l, $opt_w, $opt_i, $opt_r, $opt_p); getopts('ahpml:w:r:i:n:f:') || die $usage; die $usage if (defined($opt_h)); die "$usage\nERROR: Please specify at least one source directory\n" unless @ARGV; # should take align len, window-size, window step-size if (!defined ($opt_l)) { die "Please specify -l the alignment lengths after removing sites with gaps\n"; } if (!defined($opt_n)) { die "Please specify number of samples with -n\n"; } if (!defined ($opt_w)) { warn "WARN: -w was not used, using default size of 100bp window size"; $opt_w = 100; } if (!defined ($opt_i)) { warn "WARN: -i (intervals between neighboring windows) not specified, using default of 25\n"; $opt_i = 25; } if (!defined($opt_r)) { $opt_r = 5000; warn "WARN: number of simulated resampling (-r) not specified, using default of $opt_r)\n"; } if ($opt_l < $opt_w) { die "Window size (-w) has to be greater than or equal to alignment length (-l)\n"; } my $possibleStartSites = $opt_l - $opt_w + 1; my $numWindows = int(($possibleStartSites - 1) / $opt_i) + 1; # This is number of windows per gene #$numWindows = 5; # debug my $windowSize = $opt_w; my $numSamples = $opt_n; my $increment = $opt_i; my $alignLen = $opt_l; my $verbose = 0; my $outAln = Bio::AlignIO->new(-format => 'fasta'); # used for debug print STDERR "INFO: reading in files, "; # recursively goes down to directory and find the fasta files my @fileName = FilenamesRecursive(@ARGV); # print join("\n", @fileName), "\n"; my @fileInfo = ProcessFastaFile(@fileName); print STDERR "DONE, ", scalar(@fileInfo), " files read in.\n"; if ($verbose ) { foreach my $fff (@fileInfo) { print STDERR "file: ", $$fff{filename}, "\tn: ", $$fff{n}, "\tlen: ", $fff->{alignedLen}, "\n"; # print join(":", @{ $$fff{names} }), "\n"; } } if (defined($opt_p)) { # This prints out as it goes WindowPi(\@fileInfo, $windowSize, $numWindows, $numSamples, $opt_r, $alignLen, $increment); } else { my ($maxArrRef, $minArrRef) = WindowMaxMinTajimaD(\@fileInfo, $windowSize, $numWindows, $numSamples, $opt_r, $alignLen, $increment); foreach my $i (0..(@$maxArrRef - 1)) { print join("\t", ($$minArrRef[$i], $$maxArrRef[$i])), "\n"; } } exit ; sub FilenamesRecursive { my @fileName =(); my @sourceDirName; foreach my $dir (@_) { $dir =~ s@/$@@; # Strip any trailing slash if (-d $dir) { push @sourceDirName, $dir; } elsif (-f $dir) { push @fileName, $dir; } else { warn "Don't know how to handle argument '$dir'\n"; next; } } # extract the plain files and symlinks push @fileName, ListRegFilesRecursive(@sourceDirName); # print join "\n", @fileName, "\n"; # for debug return @fileName; } # take a list of directories and returns the names of plain files and sym links sub ListRegFilesRecursive { my @names = (); find sub {push @names, $File::Find::name if (-f $_ || -l $_) }, @_; return @names; } # Make an obj # filename => fileName # inMemory => 'Y' or 'N' # n => number of sequences # alignedLen => aligned length # names => reference to array of sequence names # align => alignmentObject if inMemory = 'y' otherwise ''. sub ProcessFastaFile { my @result = (); my $href; my $inMem = (defined($opt_m)) ? 'N': 'Y'; $inMem = 'Y'; # All sequence get read into memory, this can be changed in the future. my @namesFromFile = (defined($opt_f)) ? (ReadSeqNameFile($opt_f)) : (); foreach my $fileName (@_) { my $inAlnIOObj = Bio::AlignIO->new(-file => $fileName, -format => 'fasta'); my ($alnLen, $numSeq) = (0,0); my @seqNames = (); my ($aln); # Is there a case with multiple align per file? # I'm assuming this loop goes through only once. while ($aln = $inAlnIOObj->next_aln()) { # select names if (defined($opt_f)) { $aln = SubsetOfAlignByID($aln, \@namesFromFile); } SetAlphabetInAlnObj($aln, "dna"); # sequence clean up $aln->uppercase(); $aln->map_chars('U', 'T'); ## To speed up, converting anything other than ATGC to gap (ignored) #$aln->map_chars('[^ATGC]', '-'); # causes warning about alphabet when a seq is all -, so manual clean foreach my $sss ($aln->each_seq) { my $thisSeq = $sss->seq(); $thisSeq =~ s/[^ATGC]/-/g; if ($thisSeq =~ /^-+$/) { # empty seq $aln->remove_seq($sss); next; } $sss->seq($thisSeq, 'dna'); } $aln = RmEmptySamplesFromAlignObj($aln); $aln = RmInsertionFromAlignObj($aln); $numSeq = $aln->no_sequences; $alnLen = $aln->length; @seqNames = SeqNamesInAlnObj($aln); # making sure sequence names are unique if (scalar @seqNames != scalar(ExtractUnique(@seqNames))) { die "Error: Sequence names in fasta file $fileName contain duplicates: " . join(",", sort(@seqNames)) . "\n"; } if ($numSeq > 0 && $alnLen > 0) { my $alnObj = ($inMem eq 'Y') ? $aln : ''; $href = { 'filename' => $fileName, 'inMemory' => $inMem, 'n' => $numSeq, 'alignedLen' => $alnLen, 'names' => \@seqNames, 'align' => $alnObj}; push @result, $href; if ($verbose) { print STDERR "fn: $fileName, n: $numSeq, len: $alnLen\n"; # debug } #print join(":", @seqNames), "\n"; # debug } } } return @result; } sub ReadSeqNameFile { my $file = shift; my @result = (); open(INFILE, "<$file") || die "Can't open $file\n"; while () { chomp; s/#.*$//; # remove comments (#) s/^\s+//; s/\s+$//; next if (/^$/); push @result, $_; } close(INFILE); return @result; } # Sequence names of Nordborg data contains population(ecotype) before the first '-'. sub NordborgRandPop { my $numSamples = shift; my @names = @_; # Check there are enough pops my @pops = @names; @pops = map {my @tmp = split /-/, $_; $tmp[0]} @pops; @pops = ExtractUnique(@pops); if ($numSamples > @pops) { warn "WARN: in Nordborg data set, only ", scalar(@pops), " pops. Can't choose $numSamples samples\n"; return (); } # start to sample; my @result = (); my %seen = (); while (@result < $numSamples) { my $i = int (rand(scalar(@names))); my @nameParts = split /-/, $names[$i]; my $pop = shift @nameParts; next if (defined($seen{$pop})); $seen{$pop} = 1; push @result, $names[$i]; } # print STDERR "SEL ", join(" ", sort(@result))," NUM ", scalar(@result), "\n"; #debug return @result; } sub WindowMaxMinTajimaD { my ($fileInfoArrRef, $windowSize, $numWindows, $numSamples, $numRep, $totalLen, $increment) = @_; my @fileInfo = @$fileInfoArrRef; my @lenVect = map {$_->{alignedLen}} @fileInfo; # print "@lenVect\n"; my $maxLen = Max(@lenVect); my $rep = 0; my @maxTajDarr = (); my @minTajDarr = (); print STDERR "INFO: WindowAnalysis of Tajima's D started: \n" . "windowSize=$windowSize, numWindows=$numWindows, $numRep repetitions\n". "Analysis Progress:\n"; my $printIntvl = int($numRep / 200); $printIntvl = 10 if ($printIntvl < 10); # $printIntvl = 1; while ($rep < $numRep) { # select the names, give up after 100 tries my @names = (); my $nameTryCntr = 0; while (@names != $numSamples) { my $fn = int (rand(scalar(@fileInfo))); my @allNames = @{ $fileInfo[$fn]->{names} }; #print "$rep HERE1\n"; #profile if (defined($opt_a)) { @names = NordborgRandPop($numSamples, @allNames); # print "rep: $rep, names ", join(" ", sort(@names)),"\n"; # debug } else { my @index = RandIntArrayUniq($numSamples, scalar(@names) - 1); @index = sort {$a <=> $b} @index; @names = @names[@index]; # print "rep $rep: ",$fileInfo[$fn]->{filename}," @index @names\n"; # debug } $nameTryCntr++; if ($nameTryCntr > 100) { die "Error: Tried to get $numSamples samples, but there isn't ". "enough samples in the data\n"; } } #print "$rep HERE2\n"; #profile #my @windows = SampleWindows(\@fileInfo, $maxLen, $windowSize, $numWindows, # \@names); #my @windows = SampleWindowsSequential(\@fileInfo, $maxLen, $windowSize, # $numWindows, \@names, $increment); my @windows = SampleWindowsSlideWConcat(\@fileInfo, $maxLen, $windowSize, $numWindows, \@names, $totalLen, $increment); #print "$rep HERE3\n"; #profile next if (@windows != $numWindows); # giving up this name set # Do analysis my $minTajD = TajimaD($windows[0]); my $maxTajD = $minTajD; foreach my $thisAln (@windows[1..$#windows]) { my $tajD = TajimaD($thisAln); $maxTajD = $tajD if ($tajD > $maxTajD); $minTajD = $tajD if ($tajD < $minTajD); #$outAln->write_aln($thisAln); #print "$tajD max $maxTajD min $minTajD\n"; # debug } push @maxTajDarr, $maxTajD; push @minTajDarr, $minTajD; #print "$rep HERE4\n"; #profile $rep++; print STDERR " $rep Repetitions done\n" if ($rep % $printIntvl == 0); } return (\@maxTajDarr, \@minTajDarr); } sub WindowPi { my ($fileInfoArrRef, $windowSize, $numWindows, $numSamples, $numRep, $totalLen, $increment) = @_; my @fileInfo = @$fileInfoArrRef; my @lenVect = map {$_->{alignedLen}} @fileInfo; # print "@lenVect\n"; my $maxLen = Max(@lenVect); my $rep = 0; my @maxTajDarr = (); my @minTajDarr = (); print STDERR "INFO: WindowAnalysis of Pi started: \n" . "windowSize=$windowSize, numWindows=$numWindows, $numRep repetitions\n". "Analysis Progress:\n"; my $printIntvl = int($numRep / 200); $printIntvl = 10 if ($printIntvl < 10); # $printIntvl = 1; while ($rep < $numRep) { # select the names, give up after 100 tries my @names = (); my $nameTryCntr = 0; while (@names != $numSamples) { my $fn = int (rand(scalar(@fileInfo))); my @allNames = @{ $fileInfo[$fn]->{names} }; if (defined($opt_a)) { @names = NordborgRandPop($numSamples, @allNames); } else { my @index = RandIntArrayUniq($numSamples, scalar(@names) - 1); @index = sort {$a <=> $b} @index; @names = @names[@index]; } $nameTryCntr++; if ($nameTryCntr > 100) { die "Error: Tried to get $numSamples samples, but there isn't ". "enough samples in the data\n"; } } my @windows = SampleWindowsSlideWConcat(\@fileInfo, $maxLen, $windowSize, $numWindows, \@names, $totalLen, $increment); next if (@windows != $numWindows); # giving up this name set # Do analysis my @piArr = (); foreach my $thisAln (@windows) { my $piPerGene = AvePWDiff($thisAln); push @piArr, $piPerGene/($thisAln->length); } # print all pi print "\t"; print join("\t", @piArr), "\n"; $rep++; print STDERR " $rep Repetitions done\n" if ($rep % $printIntvl == 0); } return (\@maxTajDarr, \@minTajDarr); } # It will return an array of Align Object. # - Extract the sequences specified by @$seqNameArrRef. # - Then remove any character other than ATGC and calculate the cleaned length. # - Genes in @$fileInfoArrRef are sampled proportional to the cleaned length. # - Genes are sampled WITH replacement # - For each gene, one window of $windowSize is extracted. # - This process get repaeated until total number of samples become # $numWindows. sub SampleWindows { my ($fileInfoArrRef, $maxLen, $windowSize, $numWindows, $seqNameArrRef) = @_; my @fileInfo = @$fileInfoArrRef; my %avoidHash = (); my %lenHash = (); my %subsetHash = (); # my $trial = 0; my @resultAlignArr = (); # my @cntr = (0,0,0,0); while (@resultAlignArr < $numWindows) { if ($trial > $numWindows * 100) { warn "INFO: too many trials to sample $numWindows windows from name set of " . join(" ", @$seqNameArrRef) . " giving up this set, and creating another name set\n"; return (); } $trial++; # Pick one candidate gene my $fn = int (rand(scalar(@fileInfo))); next if (defined($avoidHash{$fn})); if (! defined($lenHash{$fn})) { my $subAln = SubsetOfAlignByID($fileInfo[$fn]->{align}, $seqNameArrRef); if ($subAln eq "" || $subAln->no_sequences != scalar(@$seqNameArrRef)) { $avoidHash{$fn} = 1; # $cntr[0]++; next; } $subAln = RmAnyGapAmbigSites($subAln); # This is SimpleAlign object if ($subAln eq "") { # means all sites contains gaps/ambig chars $avoidHash{$fn} = 1; # $cntr[1]++; next; } my $thisLen = $subAln->length; # print "file; $fn, AFTER RmGap, LENGTH: $thisLen, max: $maxLen\nA"; # debug # $outAln->write_aln($subAln); #debug if ($thisLen < $windowSize){ $avoidHash{$fn} = 1; # $cntr[2]++; next; } $lenHash{$fn} = $thisLen; $subsetHash{$fn} = $subAln; } # choose proportional to the length of each gene if (rand($maxLen) > $lenHash{$fn} ) { # $cntr[3]++; next; } # make a slice with rand # rand returns [0, max) my $begin = int(rand($lenHash{$fn} - $windowSize + 1)) + 1; my $end = $begin + $windowSize - 1; my $subAln = $subsetHash{$fn}; $subAln = $subAln->slice($begin,$end); # print "BEGIN: $begin - $end\n"; # debug # $outAln->write_aln($subAln); # debug push @resultAlignArr, $subAln; #debug } # print STDERR "FAIL = ", $trial - $numWindows, " = ", join(":",@cntr)," # excluded genes: ",scalar(keys(%avoidHash)), " MAXLEN = $maxLen\n"; return @resultAlignArr; } # It will return an array of Align Object. # - Extract the sequences specified by @$seqNameArrRef. # - Then remove any character other than ATGC and calculate the cleaned length. # - Genes in @$fileInfoArrRef are sampled proportional to the cleaned length. # - Genes are sampled WITHOUT replacement # - For each gene, sequnences are extracted with $windowSize and $increment # - It will keep sampling genes until total number of samples become # $numWindows sub SampleWindowsSequential { my ($fileInfoArrRef, $maxLen, $windowSize, $numWindows, $seqNameArrRef, $increment) = @_; my @fileInfo = @$fileInfoArrRef; my %avoidHash = (); my %lenHash = (); my %subsetHash = (); my %usedHash = (); my $trial = 0; my @resultAlignArr = (); # my @cntr = (0,0,0,0); while (@resultAlignArr < $numWindows) { if ($trial > $numWindows * 100) { warn "INFO: too many trials to sample $numWindows windows from name set of " . join(" ", @$seqNameArrRef) . " giving up this set, and creating another name set\n"; return (); } $trial++; # Pick one candidate gene my $fn = int (rand(scalar(@fileInfo))); next if (defined($avoidHash{$fn})); next if (defined($usedHash{$fn})); if (! defined($lenHash{$fn})) { my $subAln = SubsetOfAlignByID($fileInfo[$fn]->{align}, $seqNameArrRef); if ($subAln eq "" || $subAln->no_sequences != scalar(@$seqNameArrRef)) { $avoidHash{$fn} = 1; # $cntr[0]++; next; } $subAln = RmAnyGapAmbigSites($subAln); # This is SimpleAlign object if ($subAln eq "") { # means all sites contains gaps/ambig chars $avoidHash{$fn} = 1; # $cntr[1]++; next; } my $thisLen = $subAln->length; # print "file; $fn, AFTER RmGap, LENGTH: $thisLen, max: $maxLen\nA"; # debug # $outAln->write_aln($subAln); #debug if ($thisLen < $windowSize){ $avoidHash{$fn} = 1; # $cntr[2]++; next; } $lenHash{$fn} = $thisLen; $subsetHash{$fn} = $subAln; } # choose proportional to the length of each gene if (rand($maxLen) > $lenHash{$fn} ) { # $cntr[3]++; next; } # Make sequential slice my $subAln = $subsetHash{$fn}; for(my $begin = 1; (@resultAlignArr < $numWindows) && ($begin <= $lenHash{$fn} - $windowSize + 1); $begin += $increment) { my $end = $begin + $windowSize -1; my $subAlnSlice = $subAln->slice($begin,$end); push @resultAlignArr, $subAlnSlice; #debug } $usedHash{$fn} = 1; } # print STDERR "FAIL = ", $trial - $numWindows, " = ", join(":",@cntr)," # excluded genes: ",scalar(keys(%avoidHash)), " MAXLEN = $maxLen\n"; return @resultAlignArr; } # It will return an array of Align Object. # - Extract the sequences specified by @$seqNameArrRef. # - Then remove any character other than ATGC and calculate the cleaned length. # - Genes are concatenated until the length of concatenated sequences become # at least $totalLen bp # - Genes are sampled WITHOUT replacement # - From this concatenated sequence, the beginning of the sequence is randomly # chosen so the selected sequence has exactly $totalLen bp # - Then sliding windows are extracted with $windowSize and $increment sub SampleWindowsSlideWConcat { my ($fileInfoArrRef, $maxLen, $windowSize, $numWindows, $seqNameArrRef, $totalLen, $increment) = @_; my @fileInfo = @$fileInfoArrRef; my %avoidHash = (); my %lenHash = (); my %subsetHash = (); my %usedHash = (); my $trial = 0; my @resultAlignArr = (); my @tempGeneArray = (); my $cumuLen = 0; # my @cntr = (0,0,0,0); while ($cumuLen < $totalLen) { if ($trial > $numWindows * 100) { warn "INFO: too many trials to sample $numWindows windows from name set of " . join(" ", @$seqNameArrRef) . " giving up this set, and creating another name set\n"; return (); } $trial++; # Pick one candidate gene my $fn = int (rand(scalar(@fileInfo))); next if (defined($avoidHash{$fn})); next if (defined($usedHash{$fn})); if (! defined($lenHash{$fn})) { my $subAln = SubsetOfAlignByID($fileInfo[$fn]->{align}, $seqNameArrRef); if ($subAln eq "" || $subAln->no_sequences != scalar(@$seqNameArrRef)) { $avoidHash{$fn} = 1; # $cntr[0]++; next; } $subAln = RmAnyGapAmbigSites($subAln); # This is SimpleAlign object if ($subAln eq "") { # means all sites contains gaps/ambig chars $avoidHash{$fn} = 1; # $cntr[1]++; next; } $lenHash{$fn} = $subAln->length; $subsetHash{$fn} = $subAln; } # choose proportional to the length of each gene if (rand($maxLen) > $lenHash{$fn} ) { # $cntr[3]++; next; } $cumuLen += $lenHash{$fn}; push @tempGeneArray, $subsetHash{$fn}; $usedHash{$fn} = 1; #print "File:", $fileInfo[$fn]->{filename}, ", len $lenHash{$fn}, cumu $cumuLen\n"; #debug } my $concatAln = ConcatenateAln(@tempGeneArray); # pick the random begin, uniform [1, ($concatAln->length) - $totalLen +1] my $beginGene = int (rand(($concatAln->length) - $totalLen +1)) + 1; my $endGene = $beginGene + $totalLen - 1; #print "CumuLen: ", $concatAln->length, " beg: $beginGene, end: $endGene\n"; #debug #$outAln->write_aln($concatAln); # debug # Make sequential slice for(my $begin = $beginGene; $begin+$windowSize-1<= $endGene; $begin += $increment) { my $end = $begin + $windowSize -1; my $subAlnSlice = $concatAln->slice($begin,$end); push @resultAlignArr, $subAlnSlice; } if (@resultAlignArr != $numWindows) { die "ERROR: Created ". scalar(@resultAlignArr). " windows, but there should be $numWindows windows\n"; } # print STDERR "FAIL = ", $trial - $numWindows, " = ", join(":",@cntr)," # excluded genes: ",scalar(keys(%avoidHash)), " MAXLEN = $maxLen\n"; ## printing pi per site for the concatenaed aln if(defined($opt_p)) { print AvePWDiff($concatAln) / ($concatAln->length); } return @resultAlignArr; } ######### Align Object related functions # From a Bio::Align object, extract all of the sequence names # and return them in an array sub SeqNamesInAlnObj { my $alnObj = shift; my @result = (); foreach my $seq ($alnObj->each_seq) { my $name = $seq->display_id; push @result, $name; } return @result; } # Takes an array of align object and make a new align object # with all concatenated. # The order of sequences is same as the first object. If # other align objects have sequences with names not included in the first object, # Those sequences are ignored. If sequences are missing in the other # Align object, it will return "". If multiple sequences have same name, # "" get returned. sub ConcatenateAln { my $thisAln = shift @_; my $resultAln = $thisAln->slice(1,$thisAln->length); foreach my $alnObj (@_) { foreach my $cumuSeq ($resultAln->each_seq) { my $nnn = $cumuSeq->display_id; my @newSeq = $alnObj->each_seq_with_id($nnn); if (@newSeq != 0) { if (@newSeq == 0) { warn "WARN: in ConcatenateAln(), $nnn not found\n"; return ""; } elsif (@newSeq > 1) { warn "WARN: in ConcatenateAln(), $nnn found " . scalar(@newSeq). " times. Not processing\n";; return ""; } } my $connected = ($cumuSeq->seq()) . ($newSeq[0]->seq()); $cumuSeq->seq($connected); } } return ($resultAln); } ## side effect: alphabet of all sequences in aln get set. sub SetAlphabetInAlnObj { my ($alnObj, $alphabet) = @_; foreach my $sss ($alnObj->each_seq) { $sss->alphabet($alphabet); } } # consenusu_iupac returns lower case for sites which contains '-', '?', or '.' sub TajimaD { my $alnObj = shift; my $n = $alnObj->no_sequences; my $an = WattersonCoef1($n); my $bn = WattersonCoef2($n); my $e1 = ($n+1) / (3 * $an * ($n-1)) - 1/($an ** 2); my $e2 = (2 * ($n ** 2 + $n + 3)/ (9 * $n * ($n - 1)) - ($n + 2)/ ($n * $an) + $bn / ($an ** 2)) / ($an ** 2 + $bn); my $pi = AvePWDiff($alnObj); my $s = NumSegSites($alnObj, 1); # "1" means "use Eta" my $wattTheta = $s / $an; my $denom = sqrt ($e1 * $s + $e2 * $s * ($s - 1) ); if ($denom == 0) { $denom = 1; # is this valid? } my $d = ($pi - $wattTheta) / $denom; return $d; } sub WattersonCoef1 { my $n = shift; my @arr = map {1/$_} (reverse 1..($n-1)); return Sum(@arr); } sub WattersonCoef2 { my $n = shift; my @arr = map {1/($_ ** 2)} (reverse 1..($n-1)); return Sum(@arr); } # Note this doesn't handle degenerate code correctly. # This is slow sub PiePerSite { my ($alnObj) = @_; my $statObj = Bio::Align::DNAStatistics->new(); my $distMat = $statObj->distance(-align => $alnObj, -method =>'uncorrected'); my @names = $distMat->column_names; # my @row_names = $distMat->row_names; # print "COL: @names\nROW: @row_names\n"; # print $distMat->print_matrix; return 0 if (@names < 2); my $result = 0; foreach my $rrr (0..($#names-1)) { foreach my $ccc (($rrr+1)..$#names) { $result += $distMat->get_entry($names[$rrr], $names[$ccc]); } } $result /= scalar(@names) * (scalar(@names) - 1) / 2; # average return $result; } # This is pi per gene sub AvePWDiff { my $alnObj = shift; my @seqMat = MatrixFromAlignObj($alnObj, [ 1..($alnObj->length) ] ); my $numSites = @{ $seqMat[0] } ; my $numSamples = @seqMat; return 0 if ($numSites < 1 || $numSamples < 2); my $sum = 0; foreach my $ind1 (0..($numSamples - 2)) { foreach my $ind2 (($ind1+1)..($numSamples-1)) { my $comparableBases = 0; my $thisDiff = 0; foreach my $site (0..($numSites - 1)) { my $b1 = uc($seqMat[$ind1]->[$site]); my $b2 = uc($seqMat[$ind2]->[$site]); next if (($b1 . $b2) !~ /^[ACGT]{2}$/); # Skip all ambiguous characters and gap $comparableBases ++; $thisDiff++ if ($b1 ne $b2); } $sum += $thisDiff; } } return $sum / ($numSamples * ($numSamples-1)/2); } # if $etaQ > 0, it returns Eta (total number of mutations) instead of # Number of segregating sites. sub NumSegSites { my ($alnObj, $etaQ) = @_; my @seqMat = MatrixFromAlignObj($alnObj, [ 1..($alnObj->length) ] ); my $numCol = @{ $seqMat[0] } ; my @result = (); foreach my $col (0..($numCol-1)) { my @thisCol = (); foreach my $row (0..$#seqMat) { push @thisCol, $seqMat[$row]->[$col]; } @thisCol = ExtractUnique(@thisCol); my $type = PolymorphismTypeNoHet(@thisCol); push @result, $type; } if ($etaQ) { # similar to num seg sites: add +1 for tri-allelic, +2 for quad-allelic sites @result = map { ($_ > 1) ? ($_ - 1) % 4 : 0 } @result; return Sum(@result); } else { # num seg sites: type 2-4,6-8 are the sites with polymorphism @result = grep {/^[234678]$/} @result; return scalar(@result) } } # number of mutations # Takes an array of bases, and categorize the 'polymorphism' in the array # Ambiguity code are considered as ambiguous, not heterozygote. # Returns: # 0 : all gap (-) # 1 : monomorphic (include all N or ?) # 2 : di-allelic # 3 : tri-allelic # 4 : quad-allelic # 5 : monomorphic + gap (include cases: N N N N - N) # 6 : di-allelic + gap # 7 : tri-allelic + gap # 8 : quad-allelic + gap # <0 : non IUPAC code encountered <- not used # WARN: For simplicity, 3 or 4 base ambiguity codes are considered as a # missing char (? N V H D B), and they are ignored. # This could cause problem. For example, with (G, G, G, G, H, H), this site # contains at least two types of bases (H=A|C|T). However, this function # consider this site to be monomorphic sub PolymorphismTypeNoHet { my @bases = @_; # check gaps, and get rid of them my $withGap = 0; # a flag to indicate existence of gap my @gaps = grep {/^-$/} @bases; if (@gaps > 0) { $withGap = 4; # raise the flag @bases = grep {! /^-$/} @bases; # get rid of gaps return 0 if (@bases == 0); } @bases = map {uc $_} @bases; # convert to all upper case @bases = map {s/U/T/; $_} @bases; # U->T @bases = map {s/\?/N/; $_} @bases; # ? -> N my $degenCode = MinimumBaseSet(@bases); my $type; if ($degenCode =~ /^[ATGC]$/) { $type = 1; } elsif ($degenCode =~ /^[WSMKRY]$/) { $type = 2; } elsif ($degenCode =~ /^[BDHV]$/) { $type = 3; } elsif ($degenCode =~ /^N$/) { $type = 4; } elsif ($degenCode =~ /^-$/) { $type = 0; } else { die "ERROR: MinimumBaseSet returned $degenCode\n"; } return ($type + $withGap) ; } # Takes an array: each element is single base. Then find a set of # fewest bases which will match all elements. # (A, A, G, G) -> R (R=A/G) # (A, A, A, R) -> A # (A, A, A, Y) -> W (Y = C/T, so M=A/C or W=A/T is OK, but W get returned) # (R, R, R, R) -> A # (A, C, G, T) -> N # (N, N, N, N) -> A # If all elements in the array are something other than IUPAC code, returns gap '-' sub MinimumBaseSet { my @baseArr = map {uc $_} @_; @baseArr = map {s/U/T/; $_} @baseArr; # U->T @baseArr = map {s/\?/N/; $_} @baseArr; # ? -> N @baseArr = grep { /[ACGTWSMKRYBDHVN\-]/ } @baseArr; # ignore non IUPAC return '-' if (@baseArr == 0); my @degen = ('A', 'C', 'G', 'T', 'W', 'S', 'M', 'K', 'R', 'Y', 'B', 'D', 'H', 'V'); foreach my $candidate (@degen) { my @candidateArr = split /\s+/, $IUPAC{$candidate}; my $match = 1; foreach my $bbb (@baseArr) { my @this = split /\s+/, $IUPAC{$bbb}; # check sharing my @isect = Intersection(\@candidateArr, \@this); if (@isect == 0) { $match = 0; last; } } if ($match) { # all @baseArr can be explained by the candidate return $candidate; } } my @base = grep {/^[ACGTWSMKRYBDHVN]$/} @baseArr; if (@base > 0) { return 'N'; } else { return '-'; # in case, all elements are '-' } } # Takes a Bio::Align object, and a reference to array of names # returns an Bio::Align object with the names specified. # Note that if the sequence names are not unique, only the first sample # will be selected. # The order of sequences in returned alignment are not the order in the array # of names, but the order in the Align object # If same name is included in the name array multiple times, only 1 sequence # will be returned. # If the name array contains names which are not in Align object, it will be # ignored. # If seq names (display_id) are not uniq in Align Object, later one get used. sub SubsetOfAlignByID { my ($alnObj, $nameArrRef) = @_; my @names = SeqNamesInAlnObj($alnObj); my @index = 1 .. scalar(@names); my %posiHash = map { $names[$_ - 1] => $_ } @index; # my @matchedIndex = map {$posiHash{$_}} @$nameArrRef; # @matchedIndex = grep { /^\d+$/ } @matchedIndex; my @matchedIndex = (); foreach my $nnn (@$nameArrRef) { if (defined($posiHash{$nnn})) { push @matchedIndex, $posiHash{$nnn}; } else { return ""; # trying to speed up } } my $subset = $alnObj->select_noncont(@matchedIndex); return $subset; } sub SubsetOfAlignByIDold { my ($alnObj, $nameArrRef) = @_; my @names = SeqNamesInAlnObj($alnObj); my @matchedIndex = (); foreach my $target (@$nameArrRef) { foreach my $index (0..$#names) { if ($names[$index] eq $target) { push @matchedIndex, $index + 1; # unit-offset last; } } } my $subset = $alnObj->select_noncont(@matchedIndex); return $subset; } ### Get rid of samples whose sequences are empty, or all N, X, ?, or - sub RmEmptySamplesFromAlignObj { my $origAlnObj = shift; # make duplicate my $alnObj = $origAlnObj->slice(1,$origAlnObj->length); foreach my $seq ($alnObj->each_seq) { my $dna = $seq->seq(); if ($dna =~ /^[NnXx\?\-]*$/) { my $name = $seq->display_id; if ($verbose) { warn "INFO $name is removed because it is all gaps or missing chars\n"; } $alnObj->remove_seq($seq); } } return $alnObj; } # Returns a 2-dim array, not used sub MatrixFromAlignObj { my ($alnObj, $colNumArr) = @_; @result = (); foreach my $seq ($alnObj->each_seq) { my $dna = $seq->seq(); my @dnaArr = split //, $dna; my @index = map {$_ - 1} @$colNumArr; # convert to 0-offset index my @select = @dnaArr[@index]; push @result, \@select; } return @result; } # From align object, site which contain, any degenerate char or any gap will be # removed. sub RmAnyGapAmbigSites { my $alnObj = shift; # convert anything other than ATGCU get converted to gap # $alnObj->map_chars('[^ATGCUatgcu]', '-'); # ## WARN ## : uncomment above to use this function in general, # In this script, this is done in ProcessFastaFile to speed up my $gline = $alnObj->gap_line; my $gchar = $alnObj->gap_char; my $result; # Creating an align object: each seq has 0 length. # Without this, remove_gaps appears to return undef. if ($gline =~ /^$gchar+$/) { # all sites with gap char # Making an align object with no sequence data # Make a duplicate, select() doesn't make duplicates # It will through a couple of harmless warnings: # MSG: Got a sequence with no letters in it cannot guess alphabet [] # MSG: Slice [1-16] of sequence [seq5/1-6] contains no residues. Sequence excluded from the new alignment. return ""; $result = $alnObj->slice(1,$alnObj->length); foreach my $sss ($result->each_seq) { $sss->seq(''); } } else { $result =$alnObj->remove_gaps(); # a site with at least one gap get removed } return $result; } # Return a new Align object after removing the sites where all except one seq # has non-gap character. Also removes sites with all gaps. sub RmInsertionFromAlignObj { my $alnObj = shift; my @seqMat = MatrixFromAlignObj($alnObj, [ 1..($alnObj->length) ] ); my $numSites = @{ $seqMat[0] } ; my $numSamples = @seqMat; my @insertionSites = (); # 0 offset foreach my $col (0..($numSites-1)) { my $numNonGap = 0; foreach my $row (0..($numSamples-1)) { if ($seqMat[$row]->[$col] ne '-') { $numNonGap ++; last if ($numNonGap > 1); } } push (@insertionSites, $col) if ($numNonGap < 2); } my @range = SiteIndexToRange(@insertionSites); my @rmMat = (); foreach my $rrr (@range) { my ($begin, $end) = split /-/, $rrr; push @rmMat, [ $begin, $end ]; } return ($alnObj->remove_columns(@rmMat)); } ######### End of Align Object related functions ####### general functions # rand integers between 0 and $max (both ends inclusive) sub RandIntArrayUniq { my ($size, $max) = @_; my @result = (); my %uniq = (); while (@result < $size) { my $candidate = int(rand ($max + 1)); # rand returns [0, $max + 1) next if (defined($uniq{$candidate})); push @result, $candidate; $uniq{$candidate} = 1; } return (@result); } sub ExtractUnique { my %seen=(); my @unique = (); foreach my $item (@_) { push (@unique, $item) unless $seen{$item}++; } return @unique; } sub Intersection { my ($arrRefA, $arrRefB) = @_; my %union = (); my %isect = (); foreach my $e (@$arrRefA) { $union{$e} = 1; } foreach my $e (@$arrRefB) { if ($union{$e}) { $isect{$e} = 1; } #$union{$e} = 1; } return (keys %isect); } sub Max { my $max = shift; foreach $item (@_) { if ($item > $max) { $max = $item; } } return $max; } sub Sum { $result = 0; foreach my $i (@_) { $result += $i; } return $result; } sub SiteIndexToRange { my @index = @_; return () if (@index == 0); @index = sort {$a <=> $b} (@index); @index = ExtractUnique (@index); $index[$#index+1] = $index[$#index] + 2; # this force the final index # to be printed out. my $begin = shift @index; my $prev = $begin; my @results = (); for (my $i = 0; $i < @index; $i++) { if ($index[$i] == $prev + 1) { $prev = $index[$i]; next; } # now put the range into the results, and reset push @results, "$begin-$prev"; $begin = $prev = $index[$i]; } return (@results); }