#!/usr/bin/perl -w my $usage="Usage: $0 [-h] [fastaFile]\n" . " -h: help (print this message)\n" . "\n". "Calculate the aligned length after removing sites where all samples\n". "contain gaps, the aligned length after removing sites where at least\n". "one sample has a gap, and the aligned length after removing sites where\n". "at least one sample has a gap or ambiguous character (char other than\n". "ATGC), length of each samples excluding gaps (include ambiguous char), \n". "and average lengths\n"; my $sep = "\t"; # if you use tab in the sequence name, change this to # other characters such as "," my $replaceChar = "-"; # for -g, this character is used to replace my $lineWidth = 70; # used to break the long sequences into lines for FASTA out my $debug = 0; our($opt_h, $opt_g); use Getopt::Std; getopts('hg') || die "$usage\n"; if (defined($opt_h)) { die "$usage\n"; } die "$usage\n" if (@ARGV > 1); @ARGV = ('-') unless @ARGV; # take STDIN when no arg. my $dnaFile = shift @ARGV; # initialize the @seqArray, @seqNameArray, and $maxSeqLen my @dat = ReadInFASTA($dnaFile); my $numSeq = @dat; if (! CheckEqualSeqLength(@dat)) { print "*** WARN *** The input file contained sequences with different ". "lengths.\n Interprete the results carefully since '?' was attached". " to the ends of\n the shorter sequences.\n" ; @dat = AdjustSeqLength(@dat); } @dat = RemoveGapOnlySites(\@dat, 1); # clean out the data. my $maxLen = MaxSeqLen(@dat); # Find the length after removing sites with any gaps, and after removing # sites with any gaps or any ambiguous char my $lenAfterGapRm = 0; my $lenAfterAmbigRm = 0; { my @temp = RemoveSitesWithAnyGap(\@dat, "noRmAmbiguous"); # print join("\n", @temp, "\n"); #debug $lenAfterGapRm = MaxSeqLen(@temp); @temp = RemoveSitesWithAnyGap(\@dat, "rmAmbiguous"); # print join("\n", @temp, "\n"); #debug $lenAfterAmbigRm = MaxSeqLen(@temp); } # find seq lengths for each my @seqLens = SeqLensRmGaps(@dat); # length vector after removing gaps (- or ?) print "## Sequence length of each samples, excluding gaps ($numSeq samples)\n"; my @names = GetSeqName(@dat); for my $i (0..$#names) { print "$names[$i]\t$seqLens[$i]\n" } print "\n## Summary\n"; print "$maxLen\tAligned Len (# of aligned sites after removing gap-only sites)\n"; print "$lenAfterGapRm\tAligned Len after removing sites with any gaps\n"; print "$lenAfterAmbigRm\tAligned Len after removing sites with any gaps or ambiguous bases\n"; my $meanSeqLens = Mean(@seqLens); print "$meanSeqLens\tAve. lengths of sequences\n"; exit(0); #### functions sub Mean { my $sum = 0; foreach my $i (@_) { $sum += $i; } return $sum / scalar(@_); } sub MkSelIndex { my ($max, $siteList) = @_; $siteList =~ s/^\s+//; $siteList =~ s/\s+$//; my @sites = split(/\s*,\s*/, $siteList); my @result = (); foreach my $item (@sites) { if ($item =~ /^(\d+)-(\d+)$/) { die "ERROR: 1st number is larger than 2nd in $item\n" if ($1 > $2); $beginPos = $1 - 1; $endPos = $2 - 1; } elsif ($item =~ /^-(\d+)$/) { $beginPos = 0; $endPos = $1 - 1; } elsif ($item =~ /^(\d+)-$/) { $beginPos = $1 - 1; $endPos = $max-1; } elsif ($item =~ /^(\d+)$/) { $beginPos = $1 - 1; $endPos = $1 - 1; } else { die "$siteList given as the list of sites. " . "Make sure it is comma delimitted, and each element is " . " one of the forms: 23-26, 29, -10, 40-\n"; } push (@result, $beginPos..$endPos); } return(@result); } # takes an arg; name of a file from which data are read Then read in # the data and make an array. Each element of this array corresponds # to a sequence, name tab data. sub ReadInFASTA { my $infile = shift; my @line; my $i = -1; my @result = (); my @seqName = (); my @seqDat = (); open (INFILE, "<$infile") || die "Can't open $infile\n"; while () { chomp; if (/^>/) { # name line in fasta format $i++; s/^>\s*//; s/^\s+//; s/\s+$//; $seqName[$i] = $_; $seqDat[$i] = ""; } else { s/^\s+//; s/\s+$//; s/\s+//g; # get rid of any spaces next if (/^$/); # skip empty line s/[uU]/T/g; # change U to T $seqDat[$i] = $seqDat[$i] . uc($_); } # checking no occurence of internal separator $sep. die ("ERROR: \"$sep\" is an internal separator. Line $. of " . "the input FASTA file contains this charcter. Make sure this " . "separator character is not used in your data file or modify " . "variable \$sep in this script to some other character.\n") if (/$sep/); } close(INFILE); foreach my $i (0..$#seqName) { $result[$i] = $seqName[$i] . $sep . $seqDat[$i]; } return (@result); } sub GetSeqDat { my @data = @_; my @line; my @result = (); foreach my $i (@data) { @line = split (/$sep/, $i); push @result, $line[1]; } return (@result) } sub GetSeqName { my @data = @_; my @line; my @result = (); foreach my $i (@data) { @line = split (/$sep/, $i); push @result, $line[0]; } return (@result) } sub SelectSites { my ($arrayRef, $indexRef) = @_; unless (@_ == 2 && ref($arrayRef) eq 'ARRAY' && ref($indexRef) eq 'ARRAY'){ die "args to SelectSites() should be ARRAY REF, ARRAY REF\n"; } my $maxIndex = @$arrayRef -1; my @result = (); foreach my $posi (@$indexRef) { if ($maxIndex < $posi) { push @result, "?"; } else { push @result, $$arrayRef[$posi]; } } return @result; } sub ReplaceOtherSitesWGaps { my ($arrayRef, $indexRef) = @_; unless (@_ == 2 && ref($arrayRef) eq 'ARRAY' && ref($indexRef) eq 'ARRAY'){ die "args to SelectSites() should be ARRAY REF, ARRAY REF\n"; } my $maxIndex = @$arrayRef -1; my @allSites = 0..($maxIndex) ; my @index = InANotInB (\@allSites, $indexRef); # making the complement set warn "WARN: some selected sites don't exists\n" if (Max(@$indexRef) > $maxIndex); my @result = @$arrayRef; foreach my $posi (@index) { $result[$posi] = $replaceChar; } if ($debug) { print join "", "debug: ", @$arrayRef, "\n"; print join "", "debug: ", @result, "\n\n"; } return @result; } sub Sites { my ($datRef, $indexRef) = @_; my @seqDat = GetSeqDat(@$datRef); my @seqName = GetSeqName(@$datRef); my @result = (); # make 2 dimensional matrix foreach $seqNumber (0..$#seqDat) { my @tmpArray = split(//, $seqDat[$seqNumber]); my @thisSeq = (defined($opt_g)) ? ReplaceOtherSitesWGaps(\@tmpArray, $indexRef) : SelectSites(\@tmpArray, $indexRef); my $thisLine = $seqName[$seqNumber] . "\t" . (join("", @thisSeq)); push @result, $thisLine; } return (@result); } sub PrintFASTA { my @seqName = GetSeqName(@_); my @seqDat = GetSeqDat(@_); for my $i (0..$#seqDat) { # print ">$seqName[$i]\n$seqDat[$i]\n"; print ">$seqName[$i]\n"; my $seq = $seqDat[$i]; for (my $pos=0 ; $pos < length ($seq) ; $pos += $lineWidth) { print substr($seq, $pos, $lineWidth), "\n"; } } } sub MaxSeqLen { my @data = GetSeqDat(@_); my $maxLen = 0; foreach $i (@data) { my $len = CharLen($i); $maxLen = $len if ($len > $maxLen); } return ($maxLen); } # returns an array of seq lengths after removing - and ? for each seq. sub SeqLensRmGaps { my @data = GetSeqDat(@_); my @result = (); foreach my $i (@data) { $i =~ s/[-\?]//g; # remove gaps my $len = CharLen($i); push @result, $len; } return (@result); } # take std seq data (name\tseq), and attach "?" for the shorter sequences sub AdjustSeqLength { my @data = @_; my @seqDat = GetSeqDat(@_); my @seqName = GetSeqName(@_); my $maxLen = MaxSeqLen(@_); foreach my $i (0 .. $#seqDat) { my $thisLen = CharLen ($seqDat[$i]); if ($thisLen == $maxLen) { ; # do nothing } elsif ($thisLen < $maxLen) { my $diff = $maxLen - $thisLen; warn "WARN: $seqName[$i] shorter. " . "$diff '?' (missing character) were added at the end\n"; for ($j=0; $j < $diff; $j++) { $data[$i] = $data[$i] . "?"; } } else { die "ERROR: the length of sequence $seqName[$i] is $thisLen, " . "longer than \$maxLen = $maxLen. Weird!!"; } } return (@data); } # check if the length of all sequences are same sub CheckEqualSeqLength { my @data = @_; my @seqDat = GetSeqDat(@_); my @seqName = GetSeqName(@_); my $maxLen = MaxSeqLen(@_); my $equalLen = 1; foreach my $i (0 .. $#seqDat) { my $thisLen = CharLen ($seqDat[$i]); if ($thisLen != $maxLen) { $equalLen = 0; last; } } return $equalLen; } sub RemoveGapOnlySites { my ($seqDatARef, $multipleOf) = @_; my @seqDat = GetSeqDat(@$seqDatARef); my @seqName = GetSeqName(@$seqDatARef); my $maxLen = MaxSeqLen(@$seqDatARef); my @gapSites = (); my @notGapSites = (); my ($posi, $seqNumber); my @seqMat = (); # make 2 dimensional matrix foreach $seqNumber (0..$#seqDat) { my @tmpArray = split(//, $seqDat[$seqNumber]); # Check the length if (@tmpArray != $maxLen) { die "ERROR: the sequence $seqName[$i] is not same length " . "as \$maxLen = $maxLen. Weird!!"; } push @seqMat, [ @tmpArray ]; } # now identify the all gap sites for $posi (0 .. ($maxLen-1)) { my $gap = 1; for $seqNumber (0 .. $#seqMat){ if ($seqMat[$seqNumber][$posi] !~ /^[-\?]$/) { $gap = 0; last; } } if ($gap == 1) { # all sequences have a gap at these sites push (@gapSites, $posi+1); # now unit-offset } else { # there are some non-gap character at these sites push (@notGapSites, $posi+1); } } my @rmSites = (); # removing multiples of $multipleOf for(my $i = 0; $i < @gapSites - $multipleOf + 1; $i++) { my $rmFlag = 1; for(my $j = 1; $j < $multipleOf; $j++) { if ($gapSites[$i] + $j != $gapSites[$i+$j]) { $rmFlag = 0; # we don't want to remove this $i $j=$multipleOf; # get out of inner loop } } if ($rmFlag == 1) { push @rmSites, @gapSites[$i..($i+$multipleOf-1)]; $i += $multipleOf - 1; } } my @allSites = 1..($maxLen) ; my @selIndex = InANotInB (\@allSites, \@rmSites); @selIndex = To0Offset(@selIndex); # convert to 0-ffset # select sites and make results my @result = (); for $seqNumber (0 .. $#seqMat) { my @thisSeq = SelectSites($seqMat[$seqNumber], \@selIndex); my $line = $seqName[$seqNumber] . $sep . (join("", @thisSeq)); push (@result, $line); } # if (@rmSites > 0) { # warn ("Following sites consist of all gaps, removed from analysis\n"); # print STDERR join(" ", @rmSites); # print STDERR "\n"; # } return (@result); } # if the 2nd argument is "rmAmbiguous", ambiguous characters are removed (anything other than ATGC) sub RemoveSitesWithAnyGap { my ($seqDatARef, $ambiguousRm) = @_; my @seqDat = GetSeqDat(@$seqDatARef); my @seqName = GetSeqName(@$seqDatARef); my $maxLen = MaxSeqLen(@$seqDatARef); my @gapSites = (); my @notGapSites = (); my ($posi, $seqNumber); my @seqMat = (); # make 2 dimensional matrix foreach $seqNumber (0..$#seqDat) { if ($ambiguousRm eq "rmAmbiguous") { # replace anything other than ATGC with - $seqDat[$seqNumber] =~ s/[^ATGCatgc]/-/g; } my @tmpArray = split(//, $seqDat[$seqNumber]); # Check the length if (@tmpArray != $maxLen) { die "ERROR: the sequence $seqName[$i] is not same length " . "as \$maxLen = $maxLen. Weird!!"; } push @seqMat, [ @tmpArray ]; } # now identify the sites with any gap for $posi (0 .. ($maxLen-1)) { my $gap = 0; for $seqNumber (0 .. $#seqMat){ if ($seqMat[$seqNumber][$posi] =~ /^[-\?]$/) { $gap = 1; last; } } if ($gap == 1) { # there is a gap at this site push (@gapSites, $posi+1); # now unit-offset } else { # No seq has a gap at this site. push (@notGapSites, $posi+1); } } # print join(":", "gap=", @gapSites, "\n"); # debug # print join(":", "nogap=", @notGapSites, "\n"); #debug # my @rmSites = @gapSites; # my @allSites = 1..($maxLen) ; # my @selIndex = InANotInB (\@allSites, \@rmSites); my @selIndex = @notGapSites; @selIndex = To0Offset(@selIndex); # convert to 0-ffset # select sites and make results my @result = (); for $seqNumber (0 .. $#seqMat) { my @thisSeq = SelectSites($seqMat[$seqNumber], \@selIndex); my $line = $seqName[$seqNumber] . $sep . (join("", @thisSeq)); push (@result, $line); } # if (@rmSites > 0) { # warn ("Following sites contain gaps\n"); # print STDERR join(" ", @rmSites); # print STDERR "\n"; # } return (@result); } # convert 1-offset index array to 0-offset array sub To0Offset { my @result = map {$_ - 1} @_; return @result; } # count the number of characters in a string sub CharLen { my $string = shift; my @charString = split (//, $string); return scalar(@charString); } # this function take two scalars and return the larger value sub Larger { my ($a, $b) = @_; return (($a > $b) ? $a : $b); } sub InANotInB { my ($aRef, $bRef) =@_; my %seen = (); my @aonly =(); foreach my $item (@$bRef) { $seen{$item} = 1}; foreach my $item (@$aRef) { push (@aonly, $item) unless $seen{$item}; } return (@aonly); } sub ExtractUnique { my %seen=(); my @unique = (); foreach my $item (@_) { push (@unique, $item) unless $seen{$item}++; } return @unique; } sub Max { my $max = shift; foreach $item (@_) { if ($item > $max) { $max = $item; } } return $max; } sub MemberQ { my ($x, $arrRef) = @_; foreach my $item (@$arrRef) { if ($x eq $item) { return 1; } } return 0; }