#!/usr/bin/perl -w # Takes sites output and create lines appropriate for mlhka my $usage = "Help to create mlhka input file\n". "Usage: $0 [-hc] [SITES_inputFile]\n\n". " -c: print the SITES command which this script use\n" . " -h: help\n\n". "Takes SITES input file with at least two groups defined.\n". "If no filename is given, it take STDIN as the input\n\n" . "Runs SITES and extract the information for the inputfile of mlhka\n". "It runs sites with -Szxis: remove position with >2 bases, remove\n". "sites with ANY gaps (data-set wide), and use silent (non-coding + \n". "synonymous) mutation.\n". "SITES print out the total length of sequences, not the number of silent\n". "sites. This script correct the problem, and the seq length printed by\n". "this script is (total analyzed length) - (average replacement sites)\n". "Output:\n". "If there are two groups in the input file, it will print four mlhka lines.\n". "1. sp2 is considered as outgroup, use all sp2 and divergence is ave. distance\n". "2. 1 sequence of sp2 is considered as outgroup (single line)\n". "3. sp1 is considered as outgroup\n". "4. 1 sequence of sp.1 is considered as outgroup\n". "You need to pick 1 appropriate line and copy it to 'infile.txt' for mlhka.\n". "The name of mlhka input file has to be 'infile.txt'.\n". "The 1st column should be the gene name, edit it appropriately.\n". "\n6-th column of the output is initial theta (PER SITE) value.\n" . "The script use Watterson's theta estimate for this.\n" . "The last column may need to be modified if your gene is not autosomal.\n"; use IO::File; use POSIX qw(tmpnam); our ($opt_h, $opt_c, $opt_d); use Getopt::Std; getopts('hcd') || die "$usage\n"; die "$usage\n" if (defined($opt_h)); die "$usage\n" if (@ARGV > 1); @ARGV = ('-') unless @ARGV; # take STDIN when no arg. my $file = shift; my $outFile="TEMP_OUTPUT"; if (defined($opt_c)) { $file = "INPUT_FILE"; print "sites -I$file -R$outFile -M'' -Szxis -Ap -Ohgs > /dev/null\n"; exit; } my $fh; do {$outFile = tmpnam()} until $fh = IO::File->new("$outFile.SIT", O_RDWR|O_CREAT|O_EXCL); if (defined($opt_d)) { # for debugging warn "INFO: Temporary SITES output is $outFile.SIT, for debugging\n"; } system("sites -I$file -R$outFile -M'' -Szxis -Apc -Ohgs > /dev/null"); open SITESOUT, "<$outFile.SIT" || die "cannot open $outFile.SIT\n"; my $state = 0; my $cntr = 0; my @extracted = (); my %info = (); while () { if ($state == 0) { if (/^\s*AVERAGE NUMBER OF SYN. SITES\s*:\s*([\d\.]+)\s+REPLACEMENT SITES\s*:\s*([\d\.]+)\s*/) { $info{'synSites'} = $1; $info{'nonSynSites'} = $2; $state++; } next; } if ($state == 1) { $state ++ if (/^\s*Formatted Values for Input to HKA Program\s*$/); next; } if ($state == 2){ if (/^\s*Fu & Li D for group 2\s*$/) { $state ++; } next; } if ($state == 3) { if (/^_+\s*$/) { $state++; last; } s/^\s+//; s/\s+$//; push @extracted, $_; next; } last if ($state ==4); } if ($state != 4) { die "ERROR: it didn't find complete info, last state is $state\n"; } if (@extracted < 2) { die "ERROR: couldn't find at least two lines of formated text"; } if (@extracted %2 != 0) { die "ERROR: HKA formated text isn't multpile of 2:\n" . join("\n", @extracted), "\n"; } my @names = (); my @dat = (); foreach my $i (0..$#extracted) { if ($i % 2 == 0) { # name line $extracted[$i] =~ s/\s+/ /g; $extracted[$i] .= " (2nd one = outgroup)"; push @names, $extracted[$i]; } else { $extracted[$i] =~ s/low ss/ lowSS /g; my @line = split /\s+/, $extracted[$i]; my $segSites = $line[7]; my $seqLen = $line[2] - $info{'nonSynSites'} ; # note $line[2] is the analyzed seq length, not num silent sites # So by substracting nonSynSites, this become length of intron + syn. # This value is not exactly same as DNAsp, which probably uses # slightly different method of calculating number of syn sites my $sampleSize = $line[5]; my $theta = ($sampleSize > 1) ? $segSites/WattersonCoef($sampleSize)/$seqLen : "NA"; if ($sampleSize > 1 && $segSites < 1) { warn "WARN: watterson's theta estimate is 0 (no segregating sites),". "using 0.000001 as the initial theta to avoid problems ". "with mlhka (mlhka doesn't like initial theta of 0)\n"; $theta = "0.000001"; # avoiding printing out in '1e-06' by "" } $seqLen = MyRound($seqLen); # rounding to integer; push @dat, "$line[0]\t$seqLen\t$segSites\t$sampleSize\t$line[9]\t$theta\t$line[1]"; # id, seqLen SegSites sampleSize div starttheta scalar } } print "1geneID\t2seqLen\t3SegSites\t4sampleSize\t5divergence\t6starttheta\t7scalar\n"; for my $i (0..$#names) { print "$names[$i]\n$dat[$i]\n"; } unlink ("$outFile.SIT") || die "Can't unlink $outFile.SIT\n"; exit; sub WattersonCoef { my $sampleSize = shift; my $result = 0; for (my $i = $sampleSize - 1; $i > 0; $i--) { $result += 1/$i; } return $result; } sub MyRound { my $num = shift; return int($num + 0.5 * ($num <=> 0)); } # Values for Input to HKA Program # --------- ------ --- ----- -- --- ------- # Lines are generated for each ordered species pair, and also with each 2nd species as single sequence # Value Order : # locus identifier (input data filename upto 10 characters) # inheritance scalar (1.0 for autosomal is default - this must be edited: 0.75 X -linked; 0.25 mitochondrial or Y-linked) # mean sequence length group1 # mean sequence length group2 # mean sequence length between group # # of sequences group1 # # of sequences group2 # # of polymorphic sites group1 # # of polymorphic sites group2 # mean pairwise divergence between group # Tajima D for group 1 # Tajima D for group 2 # Fu & Li D for group 1 # Fu & Li D for group 2 # lyrpet thal # data/genic 1.0 1886 1886 1886 28 19 46 10 94 -0.5753 -1.2559 -0.5611 -1.6868 # lyrpet thal (single line) # data/genic 1.0 1886 1886 1886 28 1 46 0 94 -0.5753 -1.2559 -0.5611 -1.6868 # thal lyrpet # data/genic 1.0 1886 1886 1886 19 28 10 46 94 -1.2559 -0.5753 -1.6868 -0.5611 # thal lyrpet (single line) # data/genic 1.0 1886 1886 1886 19 1 10 0 94 -1.2559 -0.5753 -1.6868 -0.5611 # _____________________________________________________________________________