#!/usr/bin/perl -w # # This program is used to batch-process (base call and assemble) ABI trace. # It extract the information about the source of DNA from the filename, # and assemble contigs for each source. # # Copyright 2006, Naoki Takebayashi # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA # 02110-1301 USA my $usage = "\nUsage: $0 [-hr] sourceDir1 [sourceDir2 ...]\n" . " -h: help\n" . " -r: run phredPhrap in the updated directory\n\n" . "sourceDir contains *.ab1 files (output from ABI sequencer). This " . "program assumes that the *.ab1 files follows this naming convention: " . "sampleID.templateType.ab1. The portion before the 1st dot corresponds " . "the name of sample. You can put whatever info in the templateType " . "section. St. Louis naming convention follows this pattern. The program recursively go down the sourceDirs to find *.ab1 files, and creates symlinks in ./sampleID/chromat_dir/, pointing to the *.ab1 files inside of the sourceDir. The script make sure that there is no file with duplicated names. Additionally, if correct symlinks already exists, no new symlink will be created."; use File::Find; use File::Basename; use Getopt::Std; getopts('hr') || die $usage; die $usage if (defined($opt_h)); die "$usage\nERROR: Please specify at least one source directory\n" unless @ARGV; my @fileName =(); my @sourceDirName; foreach my $dir (@ARGV) { $dir =~ s@/$@@; # Strip any trailing slash if (-d $dir) { push @sourceDirName, $dir; } elsif (-f $dir) { push @fileName, $dir; } else { warn "Don't know how to handle argument '$dir'\n"; next; } } # extract the plain files and symlinks push @fileName, ListRegFilesRecursive(@sourceDirName); # print join "\n", @fileName, "\n"; # for debug @fileName = SelectAB1Files (@fileName); my @duped = CheckDuplicatedBasename (@fileName); if (@duped > 0) { warn "ERROR: the filenames should be unique\n"; warn (join "\n", @duped); warn ("\n"); die; } # if some files are already in the destination, they are removed from the list @fileName = CheckDestination(@fileName); # Now it should be safe to make the symlinks. MakeSymlinks (@fileName); if (defined ($opt_r)) { RunPhredPhrap (@fileName); } exit (0); # take a list of directories and returns the names of plain files and sym links sub ListRegFilesRecursive { my @names =(); find sub {push @names, $File::Find::name if (-f $_ || -l $_) }, @_; return @names; } # only select ab1 files sub SelectAB1Files { my @result = (); foreach $file (@_) { my $id = GetIDFromFilename($file); if ($id eq "") { warn "INFO: $file does not follow the naming convention: " . "templateID.type.ab1. Ignoring ..."; next; } push @result, $file; } return @result; } sub CheckDuplicatedBasename { my %seen = (); my @dupPairs = (); foreach my $name (@_) { $base=basename($name); if ($seen{$base}) { push @dupPairs, "$seen{$base} = $name"; } else { $seen{$base} = $name; } } return (@dupPairs); } sub GetIDFromFilename { my $name = shift; my $bn = basename $name; # "+?" is stingy match if ($bn =~ /^(.+?)\..*\.ab1$/) { return $1; } else { return ""; } } sub CheckDestination { my @result = (); foreach my $file (@_) { my $id = GetIDFromFilename($file); if ($id eq "") { warn "INFO: $file does not follow the naming convention: " . "templateID.type.ab1. Ignoring ..."; next; } if ( -e $id) { if (-d $id) { unless ( -d "$id/chromat_dir" && -d "$id/edit_dir" && -d "$id/phd_dir" ) { die "ERROR: destination directory, $id, doesn't contain " . "consed dir structure: chromat_dir, edit_dir, and " . "phd_dir\n"; } } else { die "PROBLEM with destination: non-directory with name $id " . "exists\n"; } } else { push @result, $file; next; } # the destination directory exists, so check duplication my $base = basename($file); unless (-e "$id/chromat_dir/$base") { push @result, $file; next; } # there is already a file with same name my ($destDev, $destIno) = stat ("$id/chromat_dir/$base"); my ($newDev, $newIno) = stat $file; # the file exists. if ($destDev == $newDev && $destIno == $newIno) { warn "INFO: ignoring $file (= $id/chromat_dir/$base)\n"; # Ignore this file next; } ### hmm, warn "ERROR: $id/chromat_dir/$base exists, symlink to $file " . "can't be made\n"; die; } return @result; } sub MakeSymlinks { foreach my $file (@_) { my $id = GetIDFromFilename($file); unless (-e $id) { MakeConsedDirStructure($id); } my $bn = basename $file; print "$id/chromat_dir/$bn -> $file\n"; if (! symlink "../../$file", "$id/chromat_dir/$bn") { warn "WARN: could not make symlink: $id/chromat_dir/$bn -> $file\n"; } } } sub MakeConsedDirStructure { my $name = shift; system ("mkdir -p $name/chromat_dir"); system ("mkdir -p $name/edit_dir"); system ("mkdir -p $name/phd_dir"); } sub RunPhredPhrap { my @updatedSamples = (); foreach $file (@_) { push @updatedSamples, GetIDFromFilename($file); } @updatedSamples = Unique(@updatedSamples); foreach $dir (@updatedSamples) { system("cd $dir/edit_dir/;phredPhrap"); } } sub Unique { my %seen = (); my @uniq = grep { ! $seen{$_} ++ } @_; return @uniq; }