Tuesday, November 4, 2008

Simple assembly

Last week my good friend M asks me to create a script to draw some sequences, the main problem is to visually see differences in transcript orientation (sense and antisense), some time ago I created similar tools for mapping short sequences like 454 pyro or siRNAs. Now the big problem is to have a good assembly, many regions can extend a "contig" to the left or the right, so internal coordinates change every time you add a new element, with short sequences you have a target well defined and the extension is relative to it.

After thinking a little, I decide not extend my code to the assembly (yes, I'm lazy) and use Phrap to perform the job. A parser read the output and extract the alignment information.

Second problem was the fasta naming convention, the original fasta file is a mixed names of other sequences, so I decided to create and tag an unique ID, "seq_##".

Extending the problem, M asks me to be possible to add more sequences, based in the sequence homology, or blast output, then I added a small option to blast against another fasta file and add the matches to the assembly sequences.

Lot of work, require GD for graphics (with 1 base = 1 pixel) and executables of phrap/blast, the output are the assemblies with reports in GFF format and pretty images in PNG.

An example:



This is the code:


#!/usr/bin/perl -w
use strict;
use Getopt::Long;

=head1 NAME

clusterSeq.pl FASTA

=head1 DESCRIPTION

Script to simply assembly, the FASTA input is re-named for unique IDs,
prepared for blast and search blastn against itself, the blast output
is parsed and "contigs" are assembled and a GFF3 file is created.

=cut

my ($help); # Help flag
my ($f); # principal Fasta file
my ($a); # secondary fasta file to add
my ($plot); # Plot flag
my ($gff); # Print GFF files flag
my ($phrapdir) = '.'; # where is Phrap ?
my ($blastdir) = '.'; # where is Blast ?
my ($blastprog) = 'megablast'; # What program (megablast | blastall -p blastn)
my ($blastopt) = '-e 0.000001 -D 3 -a 2'; # some Blast options

die "Cannot find Phrap exec !!!\n" unless(-e "$phrapdir/phrap");

usage() if(!GetOptions(
'help|h'=>\$help,
'file|f=s'=>\$f,
'add|a:s' =>\$a,
'plot|p'=>\$plot,
'gff|g'=>\$gff));
usage() if(defined $help);
usage() unless(defined $f and -e $f);

if (defined $a) {
print "Preparinf files in $a to be blasted\n";
die "Cannot find Blast binary !!!\n" unless (-e "$blastdir/$blastprog");
die "Cannot find formatdb binary !!!\n" unless (-e "$blastdir/formatdb");
system ("$blastdir/formatdb -i $a -p F -o F");

print "Blasting $f in $a db\n";
system ("$blastdir/$blastprog -i $f -d $a $blastopt -o blast.out");

print "Adding new sequences to $f";
my %addme = ();
open B, "blast.out" or die "cannot open blast.out\n";
while (<B>) {
next if(/#/);
chomp;
my ($s, $h, @rest) = split (/\t/, $_);
$h =~ s/lcl\|//;
$addme{$h}++;
}
close B;
local $/ = '>';
open A, "$a" or die "cannot open $a\n";
open F, ">>$f" or die "cannot open $f\n";
while (<A>) {
s/>//g;
foreach my $m (keys %addme) {
if ($_ =~ /$m/) {
print F ">$_";
undef $addme{$m};
last;
}
}
}
close A;
close F;
}

open F, "$f" or die "cannot open $f\n";
open T, "> $f.tmp" or die "cannot open $f.tmp\n";
my %ids = ();
my ($id) = 0;
while (<F>) {
chomp;
if (/>/) {
$id++;
s/>/>seq_$id | /;
$ids{"seq_$id"}{'name'} = $_;
}
else {
$ids{"seq_$id"}{'len'} += length $_;
}
print T "$_\n";
}
close F;
close T;

system ("$phrapdir/phrap $f.tmp > $f.phrap_out");

open P, "$f.phrap_out" or die "cannot open $f.phrap_out\n";
my ($active) = 0;
my ($contig) = '';
my %contigs = ();
my %contigs_len = ();
while (<P>) {
if (/^Contig\s+(\d+)\.\s+(\d+)\s+reads;\s+(\d+)/) {
if ($2 > 1) { # More than one read by contig
$contig = "contig_$1";
$active++;
$contigs_len{$contig} = $3;
}

}
elsif ($active >= 1) {
if (/^\s+(\d+)\s+(\d+)\s+(seq_\d+)/) { # Capture forward sequeces
$contigs{$contig}{$3}{'ini'} = $1;
$contigs{$contig}{$3}{'end'} = $2;
$contigs{$contig}{$3}{'dir'} = '+';
#print "$contig $1 $2 $3 +\n";
}
elsif (/^C\s+(\d+)\s+(\d+)\s+(seq_\d+)/) { # Capture reverse sequences
$contigs{$contig}{$3}{'ini'} = $1;
$contigs{$contig}{$3}{'end'} = $2;
$contigs{$contig}{$3}{'dir'} = '-';
#print "$contig $1 $2 $3 -\n";
}
else {
$active = 0;
}
}
}
close P;

createGFF() if(defined $gff);
createPNG() if(defined $plot);

=head1 SUBROUTINES

=head2 createGFF

Read the sequence info and print a GFF v3.0 file for each contig

=cut

sub createGFF {
print "Creating GFF files ...\n";
foreach my $c (keys %contigs) {
open G, ">$c.gff" or die "cannot write $c.gff\n";
my $l = $contigs_len{$c};
print G "##gff-version 3\n##sequence-region $c 1 $l\n";
#print "contig=$c len=$l\n";
foreach my $s (keys %{ $contigs{$c} }) {
my $i = $contigs{$c}{$s}{'ini'};
my $e = $contigs{$c}{$s}{'end'};
my $d = $contigs{$c}{$s}{'dir'};
my $n = $ids{$s}{'name'};
#print "contig=$c seq=$s ini=$i end=$e dir=$d name=$n\n";
print G "$c\tassembly\tblock\t$i\t$e\t.\t$d\t.\t$n\n";
}
close G;
}
}

=head2 createPNG

Read the sequence info and print a PNG map file for each contig

=cut

sub createPNG {
use GD;
print "Creating PNG images ...\n";

# define color schemes
my $col_f = 'silver'; # String +
my $col_r = 'gold'; # String -
my $col_t = 'black'; # Text color
my $col_l = 'navy'; # Lines
my $col_b = 'red'; # Bold

foreach my $c (keys %contigs) {
open G, ">$c.png" or die "cannot write $c.png\n";
my $l = $contigs_len{$c};
# Number of sequences to plot
my $nseq = 2; # Base sequence (contig) and the ruler
foreach my $s (keys %{ $contigs{$c} }) { $nseq++; }

# Create image
my $im = new GD::Image($l + 20, $nseq * 20);
my $x = 10;
my $y = 5;

# Allocate some colors
my %colors= ();
$colors{'white' } = $im->colorAllocate(255,255,255);
$colors{'black' } = $im->colorAllocate( 0, 0, 0);
$colors{'red' } = $im->colorAllocate(255, 0, 0);
$colors{'blue' } = $im->colorAllocate( 0, 0,255);
$colors{'grey' } = $im->colorAllocate(190,190,190);
$colors{'silver' } = $im->colorAllocate(191,191,191);
$colors{'maroon' } = $im->colorAllocate(128, 0, 0);
$colors{'purple' } = $im->colorAllocate(128, 0,128);
$colors{'green' } = $im->colorAllocate( 0,128, 0);
$colors{'lime' } = $im->colorAllocate( 0,255, 0);
$colors{'olive' } = $im->colorAllocate(107,142, 35);
$colors{'yellow' } = $im->colorAllocate(255,255, 0);
$colors{'gold' } = $im->colorAllocate(255,215, 0);
$colors{'orange' } = $im->colorAllocate(255,127, 0);
$colors{'navy' } = $im->colorAllocate( 0, 0,128);
$colors{'teal' } = $im->colorAllocate( 0,128,128);
$colors{'aqua' } = $im->colorAllocate( 0,255,255);

# Print the ruler
$im->line($x, $y, $x + $l, $y, $colors{$col_l});
for (my $i = $x; $i <= $x + $l ; $i += 100) { $im->line($i, $y - 3, $i, $y + 3, $colors{$col_l});
$im->string(gdTinyFont, $i + 1, $y, $i - 10, $colors{$col_b});
}
$y += 20;

# Print the contig
$im->filledRectangle($x, $y, $x + $l, $y + 12, $colors{$col_f});
$im->string(gdTinyFont, $x+5, $y, "$c($l)", $colors{$col_t});
my $arrow = new GD::Polygon;
$arrow->addPt($x + $l, $y - 3);
$arrow->addPt($x + $l + 5, $y + 6);
$arrow->addPt($x + $l, $y + 15);
$arrow->addPt($x + $l, $y - 3);
$im->filledPolygon($arrow, $colors{$col_f});

# Add each sequence
foreach my $s (keys %{ $contigs{$c} }) {
$y += 20;
my $i = $contigs{$c}{$s}{'ini'};
my $e = $contigs{$c}{$s}{'end'};
my $d = $contigs{$c}{$s}{'dir'};
my $n = $ids{$s}{'name'}; $n =~ s/>//g;

# define the color
my $col = $col_f;
$col = $col_r if ($d eq '-');

# Main block
$im->filledRectangle($i, $y, $e, $y + 12, $colors{$col});

# Create the arrow tip
my $arw = new GD::Polygon;
if ($d eq '+') { # to the left
$arw->addPt($e, $y - 3);
$arw->addPt($e + 5, $y + 6);
$arw->addPt($e, $y + 15);
$arw->addPt($e, $y - 3);
}
else { # to the right
$arw->addPt($i, $y - 3);
$arw->addPt($i - 5, $y + 6);
$arw->addPt($i, $y + 15);
$arw->addPt($i, $y - 3);
}
$im->filledPolygon($arw, $colors{$col});

# add the name
my $flen = $ids{$s}{'len'};
my $rmat = $e - $i + 1;
$im->string(gdTinyFont, $i+5, $y, "$n($rmat/$flen)", $colors{$col_t});
}

# make sure we are writing to a binary stream
binmode G;
# Convert the image to PNG and print it
print G $im->png;
close G;
}
}

=head2 usage

Print help if no values or aks for help

=cut

sub usage {
print <<__HELP__
Usage: clusterSeq.pl -f FASTA [-plot] [-gff] [-add FASTA]

Options:
-help | -h Print this screen
-file | -f Fasta file to use
-add | -a Add the fasta file with blast hits
-plot | -p Create PNG images for each contig
-gff | -g Create GFF3 files for each contig
__HELP__
;
exit 1;
}



=head1 AUTHOR

Juan Caballero

=head1 CONTACT

linxe __a__ glib.org.mx

=head1 LICENSE

Perl Artistic License v2.0
http://www.perlfoundation.org/artistic_license_2_0

=cut

No comments:

Post a Comment