#!/usr/bin/perl # This script reads in codemlsites output, and extract the positively # selected sites for each models. You can give multiple inputfiles. my $usage="Usage: $0 [-d alignedDNASeq [-f]] [-n] [codemlsites.out]\n" . " -d: PAML aligned DNA Sequence file\n" . " -f: read FASTA instead of PAML\n" . " -n: include filenames in the output"; my $convTblBin = "~/marcylab/seq/pamlConvTbl.pl"; use Getopt::Std; getopts('hd:fn') || die "$usage\n"; if (defined($opt_h)) { die "$usage\n"; } my $printOutStage = 0; my $model = -1; if (defined ($opt_d)) { %convTbl=MkSiteConvTbl($opt_d); print "model\tsiteNum.paml\taa\tp\tsignificance\tsiteNum\n"; } else { print "model\tsiteNum.paml\taa\tp\tsignificance\n"; } @ARGV = ('-') unless @ARGV; while ($ARGV=shift) { open (ARGV, $ARGV) or warn "Can't open $ARGV: $!\n"; while () { if (/^Model\s+(\d+):/) { $model = $1; next; } if (/^Positively selected sites Prob/) { $printOutStage=1; next; } if ($printOutStage == 1) { chomp; s/^\s+//; s/\s+$//; next if (/^$/); if (/^Time used:/) { $printOutStage = 0; next; } unless (/^\d+/) { warn "In model $model: $_\n"; next; } my $sigVal = 0; if (/(\*+)$/) { my @stars = split (//, $1); $sigVal = @stars; s/(\*+)$//; } s/\s+/\t/g; if(defined($opt_n)) { print "$ARGV:$model\t"; } else { print "$model\t"; } print; print "\t$sigVal"; if (defined($opt_d)) { # print the actual position print "\t$convTbl{$1}\n" if (/^\s*(\d+)/); } else { print "\n"; } } } } sub MkSiteConvTbl { my $file = shift; my $convTblOpt = "-f" if (defined($opt_f)); open (INPIPE, "$convTblBin $convTblOpt -c $file|") || die "Can't run $convTblBin\n"; my $header = ; warn "The first line of $convTblBin output is \"$header\", " . "but expecting \"PAML\tOrig\"\n" if ($header !~ /^PAML\tOrig/); my %resHash = (); while () { my @line = split; if (exists ($resHash{$line[0]})) { die "There are two lines with same site #\n"; } $resHash{$line[0]} = $line[1]; } close (INPIPE); return (%resHash); }