#!/usr/bin/perl my $usage = "Usage: $0 [-h] inputfile\n". " -h: help\n" . " -a: inputfile is amino acid seq, instead of DNA\n". "\n" . "This program read in the sequence file, which may contain sequences " . "with opposite orientation. It identifies identical alleles by going " . "through all pairwise comparisons. When the shorter sequence of the two ". "is identical to the substring of the longer, they are considered as an " . "allele. Gaps '-' will be removed before the comparison. The longest " . "sequences of each allele will be printed in STDOUT. These unique ". "alleles are formated in FASTA. Information about " . "which alleles are identical and the difference in the lengths are ". "printed in STDERR. When the sequences with the opposite direction are" . "included, it makes the complement of the sequences with revseq of " . "EMBOSS, and the comparison is made.\n"; use Bio::AlignIO; use Bio::SeqIO; use Bio::Factory::EMBOSS; use IO::File; use POSIX qw(tmpnam); use Getopt::Std; getopts('ah') || die "$usage\n"; die "$usage\n" if (defined($opt_h)); my $infile = shift; my $outFH = Bio::SeqIO->newFh(-format => 'fasta'); # read in the data $in = Bio::SeqIO->new(-file => $infile, '-format' => 'fasta'); my @seqArray =(); while(my $seq = $in->next_seq()) { my $tmpSeq = $seq->seq(); # $tmpSeq =~ s/-//g; # remove gaps $seq->seq($tmpSeq); push @seqArray, $seq; } # compare seq strings my @uniq = (); my ($i, $j); for ($i=0; $i < @seqArray; $i++) { my $keep = $seqArray[$i] ; my @identical = ($seqArray[$i]->display_id()); for ($j=$i+1; $j < @seqArray; $j++) { my $comp = QsubSeq($seqArray[$i], $seqArray[$j]); if ($comp eq "different" && (!defined($opt_a))) { # check the complement $comp = QsubSeq($seqArray[$i], Complement($seqArray[$j])); } next if ($comp eq "different"); # They matches. push @identical, $seqArray[$j]->display_id() . " (" . -$comp . ")"; if ($comp < 0) { $keep = $seqArray[$j]; # keeping the longer seq } splice(@seqArray, $j, 1); # remove this $j = $j - 1; } if (@identical > 1) { print stderr join " = ", @identical; print stderr "\n"; } push @uniq, $keep; } # output foreach my $i (@uniq) { print $outFH $i; } exit(0); # takes two sequences, and returns an signed integer when one seq is # a substring of the other. The returned value is the difference # in the lengths of the seq sub QsubSeq { my ($seq1, $seq2) = @_; my $s1 = lc($seq1->seq()); # setting them to lower case my $s2 = lc($seq2->seq()); # remove gaps '-' $s1 =~ s/-//g; $s2 =~ s/-//g; my $len1 = CharLen($s1); my $len2 = CharLen($s2); if ($len1 > $len2) { unless ($s1 =~ /$s2/) { return ("different"); } } elsif ($len2 > $len1) { unless ($s2 =~ /$s1/) { return ("different"); } } else { unless ($s1 eq $s2) { return ("different"); } } # when reached here, 1 seq is substring of the other, so return # the length diff of the two seq (seq1 - seq2) return ($len1 - $len2) } sub Complement { my $seq = shift; my $tempOutfile; my $fh; do {$tempOutfile = tmpnam()} until $fh = IO::File->new($tempOutfile, O_RDWR|O_CREAT|O_EXCL); $fh->close; my $factory = new Bio::Factory::EMBOSS; my $revseq = $factory->program('revseq'); my %input = (-sequence => $seq, -outseq => $tempOutfile ); $revseq->run(\%input); my $seqio = Bio::SeqIO->new (-file => $tempOutfile); unlink $tempOutfile || die "ERROR: Unable to unlink $tempOutfile\n"; return($seqio->next_seq()) } sub CharLen { my $string = shift; my @charString = split (//, $string); return scalar(@charString); }