#!/usr/bin/perl my $usage = "Usage: $0 [-h] consedDiscrepancyListFile\n\n" . "This program summarizes the high quality descripancies of a contig in consed.\n". " 1. From consed, open a contig by double clicking.\n" . " 2. Choose \"Navigate\" -> \n" . " \"High Quality (>=40) Discrepancies, > 5bp from unaligned region\"\n". " 3. Push \"Save\" at the bottom of the pop-up window.\n". " 4. You can choose whatever file name, but I save it in a temporary place:\n". " \"/tmp/list.txt\"\n". " 5. $0 /tmp/list.txt\n". " This will show two ways of summarizing:\n". " - 1st section contains a mutation shared by a multiple clones,\n". " followed by the position of the mutation.\n". " This could indicate that the SNP may be real.\n" . " - 2nd section contains the position of discrepancies for each read.\n". " A clone with a lot of SNPS could be a different allele.\n"; my $sep = "\t"; my $debug = 0; use Getopt::Std; getopts('h') || die "$usage\n"; die "$usage\n" if (defined($opt_h)); @ARGV = ('-') unless @ARGV; # take STDIN when no arg. ### read in the data my @data = (); while (<>) { my @line = split /\s+/; my $name = $line[1]; my $posi = $line[2]; my $indID; my $cloneInfo; # "+?" is stingy match if ($name =~ /^(.+?)\.(.*)\.ab1$/) { $indID = $1; $cloneInfo = $2; } else { warn "WARN: $name doesn't match expectation\n"; next; } @line = split(/-/, $cloneInfo); $line[4] =~ s/Al//; # remove Al from "Al24.1.2" $indID = $indID . "_" . $line[4]; my $entry = $indID . $sep . $posi; push @data, $entry ; } # attaching a dummy line at the end, so the last data is analyzed my $dummyLine = "dummy" . $sep . "dummy"; push @data, $dummyLine; my $curPosi = ""; my @cache = (); my %polySites = (); my %singleton = (); foreach my $i (0..$#data) { my ($name, $posi) = split /$sep/, $data[$i]; # first line if ($i == 0) { $curPosi = $posi; @cache = ($name); next; } print "$name $posi $curPosi\n" if ($debug); if ($posi eq $curPosi) { # same site print " cached\n" if ($debug); push @cache, $name; next; } # When it is reached here, moving to a new site, so process data @cache = sort(ExtractUnique(@cache)); # Summarize and attach the data in appropriate places if (@cache == 1) { # singleton if (defined($singleton{$cache[0]})) { $singleton{$cache[0]} = $singleton{$cache[0]} . $sep . $curPosi; } else { $singleton{$cache[0]} = $curPosi; } print " singleton: $cache[0] => $singleton{$cache[0]}\n" if ($debug); } else { # polymorphic site my $discrepancyClones = join($sep, @cache); if(defined($polySites{$discrepancyClones})) { $polySites{$discrepancyClones} = $polySites{$discrepancyClones} . $ sep . $curPosi; } else { $polySites{$discrepancyClones} = $curPosi; } print " poly: $discrepancyClones => $polySites{$discrepancyClones}\n" if ($debug); } # reset @cache =($name); $curPosi = $posi; } #if (scalar(keys(%polySites)) > 0) { foreach my $key (sort(keys(%polySites))) { print $key . $sep . $polySites{$key} . "\n"; } print "\n\n"; #} #if (scalar(keys(%singleton)) > 0) { foreach my $key (sort(keys(%singleton))) { print $key . $sep . $singleton{$key} . "\n"; } #} exit (0); # take a list as the argument and extract the unique elements. # The order of elements will not be preserved. sub ExtractUnique { my %seen=(); my @unique = (); foreach my $item (@_) { push (@unique, $item) unless $seen{$item}++; } return @unique; }