Skip to content

Commit

Permalink
more updates to handle FASTA 3.6 multi-HSP hits
Browse files Browse the repository at this point in the history
  • Loading branch information
ajm6q committed May 17, 2011
1 parent 14850cf commit 0e54c5b
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 47 deletions.
177 changes: 133 additions & 44 deletions Bio/SearchIO/fasta.pm
Expand Up @@ -65,7 +65,7 @@ Report bugs to the Bioperl bug tracking system to help us keep track
of the bugs and their resolution. Bug reports can be submitted via the
web:
https://redmine.open-bio.org/projects/bioperl/
http://bugzilla.open-bio.org/
=head1 AUTHOR - Jason Stajich, Aaron Mackey
Expand Down Expand Up @@ -214,6 +214,7 @@ sub next_result {
my $data = '';
my $seentop = 0;
my $current_hsp;
my $m9HSP = 0;
$self->start_document();
my @hit_signifs;
while ( defined( $_ = $self->_readline ) ) {
Expand All @@ -222,10 +223,17 @@ sub next_result {
if (
m/(\S+)\s+searches\s+a\s+(protein\s+or\s+DNA\s+)?sequence/oxi
|| /(\S+)\s+compares\s+a/
|| ( m/^\#\s+/
|| /(\S+)\s+performs\s+a/
|| /(\S+)\s+produces\s/
|| /(\S+)\s+finds\s+/ # for lalign, but does not work because no "The best scores are:"
|| ( m/^\#\s+/ # has a command log line
&& ( $_ = $self->_readline )
&& /(\S+)\s+searches\s+a\s+(protein\s+or\s+DNA\s+)?sequence/oxi
|| /(\S+)\s+compares\s+a/ )
|| /(\S+)\s+compares\s+a/
|| /(\S+)\s+performs\s+a/
|| /(\S+)\s+produces\s/
|| /(\S+)\s+finds\s+/ # for lalign, but does not work because no "The best scores are:"
)
)
{
if ($seentop) {
Expand Down Expand Up @@ -405,20 +413,54 @@ sub next_result {
}
);
}
elsif (/^\s*(Smith-Waterman).+(\S+)\s*matrix [^\]]*?(xS)?\]/) {
elsif (/^\s*(Smith-Waterman)/) {

$self->{'_reporttype'} = $1;

m/\[\s*(\S+)\s+matrix \([^\)]+\)(xS)?\],/;

$self->element(
{
'Name' => 'Parameters_matrix',
'Data' => $2
'Data' => $1
}
);
$self->element(
{
'Name' => 'Parameters_filter',
'Data' => defined $3 ? 1 : 0,
'Data' => defined $2 ? 1 : 0,
}
);
$self->{'_reporttype'} = $1;
if (/\s*gap\-penalty:\s*(\-?\d+)\/(\-?\d+)/) {
$self->element(
{
'Name' => 'Parameters_gap-open',
'Data' => $1,
}
);

$self->element(
{
'Name' => 'Parameters_gap-ext',
'Data' => $2,
}
);
}
elsif (/\s*open\/ext:\s*(\-?\d+)\/(\-?\d+)/) {
$self->element(
{
'Name' => 'Parameters_gap-open',
'Data' => $1,
}
);

$self->element(
{
'Name' => 'Parameters_gap-ext',
'Data' => $2,
}
);
}

$self->element(
{
Expand Down Expand Up @@ -462,6 +504,7 @@ sub next_result {
}

if ($line[0] eq "+-") {
$m9HSP = 1;
# parse HSP, add to last parsed Hit
my %hspData;

Expand All @@ -474,6 +517,9 @@ sub next_result {

next;
}
elsif ($line[0] eq '>>><<<') {
last;
}

my (%data, %hspData);
@data{@labels} = @hspData{@labels} = splice( @line, @line - @labels );
Expand Down Expand Up @@ -541,20 +587,42 @@ sub next_result {
$self->{'_reporttype'} = $1
if ( $self->{'_reporttype'} !~ /FAST[PN]/i );

#
# get gap-pen line for FASTA33, which is not on the matrix line
#
# FASTA (3.36 June 2000) function [optimized, BL50 matrix (15:-5)] ktup: 2
# join: 36, opt: 24, gap-pen: -12/ -2, width: 16
#
$_ = $self->_readline();
if (/(?:gap\-pen|open\/ext):\s+([\-\+]?\d+)\s*\/\s*([\-\+]?\d+)/) {
$self->element(
{
'Name' => 'Parameters_gap-open',
'Data' => $1
}
);
$self->element(
{
'Name' => 'Parameters_gap-ext',
'Data' => $2
}
);
}

$self->element(
{
'Name' => 'FastaOutput_program',
'Data' => $self->{'_reporttype'}
}
);

}
elsif (/^Algorithm:\s+(\S+)\s+\(([^)]+)\)\s+(\S+)/) {
elsif (/^Algorithm:\s+(\S+)\s+.*\s*\(([^)]+)\)\s+(\S+)/) {
$self->{'_reporttype'} = $1
if ( $self->{'_reporttype'} !~ /FAST[PN]/i );
}
elsif (
/^Parameters:\s+(\S+)\s*matrix\s*(?:\(([^(]+?)\))?\s*ktup:\s*(\d+)/)
{ # FASTA 35.04
elsif ( /^Parameters:/ ) { # FASTA 35.04/FASTA 36
m/Parameters:\s+(\S+)\s+matrix\s+\([^\)]+\)(xS)?,?\s/;
$self->element(
{
'Name' => 'Parameters_matrix',
Expand All @@ -567,12 +635,32 @@ sub next_result {
'Data' => defined $2 ? $2 : 0,
}
);
$self->element(
{
'Name' => 'Parameters_ktup',
'Data' => $3
}
);
if (/ktup:\s(\d+)/) {
$self->element(
{
'Name' => 'Parameters_ktup',
'Data' => $1
}
);
if (/ktup:\s\d+$/) {
$_ = $self->_readline();
}

}
if (/(?:gap\-pen|open\/ext):\s+([\-\+]?\d+)\s*\/\s*([\-\+]?\d+)/) {
$self->element(
{
'Name' => 'Parameters_gap-open',
'Data' => $1
}
);
$self->element(
{
'Name' => 'Parameters_gap-ext',
'Data' => $2
}
);
}
$self->element(
{
'Name' => 'FastaOutput_program',
Expand All @@ -581,30 +669,17 @@ sub next_result {
);
}
elsif (
/(?:gap\-pen|open\/ext):\s+([\-\+]?\d+)\s*\/\s*([\-\+]?\d+).+width:\s+(\d+)/
/^\s+ktup:\s*(\d+),/
)
{
$self->element(
{
'Name' => 'Parameters_gap-open',
'Name' => 'Parameters_ktup',
'Data' => $1
}
);
$self->element(
{
'Name' => 'Parameters_gap-ext',
'Data' => $2
}
);
$self->element(
{
'Name' => 'Parameters_word-size',
'Data' => $3
}
);
}
elsif (/^(>--)$/ || /^>>(?!>)(.+?)\s+(?:\((\d+)\s*(aa|nt)\))?$/) {

if ( $self->in_element('hsp') ) {
$self->end_element( { 'Name' => 'Hsp' } );
}
Expand Down Expand Up @@ -664,23 +739,30 @@ sub next_result {
}
);
}
else {
# push @{$hit_signifs[0]->{HSPs}}, $current_hsp;
}


$_ = $self->_readline();
my $strand = 1;
if( /rev-comp/ ) {
$strand = -1;
}
my ( $score, $bits, $e, $e2 ) = /Z-score: \s* (\S+) \s*
(?: bits: \s* (\S+) \s+ )?
(?: E|expect ) \s* \((?:\d+)?\) :? \s*(\S+)
(?: \s* E2 \s* \(\) :? \s*(\S+) )?
/ox;
$bits = $score unless defined $bits;
my $v = shift @{$hit_signifs[0]->{HSPs}}
if (@hit_signifs && @{$hit_signifs[0]->{HSPs}});
my ($v);
if ($firstHSP && !$m9HSP) {
$v = shift @{$hit_signifs[0]->{HSPs}}
if (@hit_signifs && @{$hit_signifs[0]->{HSPs}});
$current_hsp = $v;
}
else {
$v = $current_hsp;
}
if ( defined $v ) {
@{$v}{qw(evalue evalue2 bits z-sc)} = ( $e, $e2, $bits, $score );
}
Expand Down Expand Up @@ -729,6 +811,14 @@ sub next_result {
);
$_ = $self->_readline();
if (s/global\/.* score:\s*(\d+)\;?//) {
$self->element(
{
'Name' => 'Hsp_sw-score',
'Data' => $1
}
);
}
if (s/Smith-Waterman score:\s*(\d+)\;?//) {
$self->element(
{
Expand Down Expand Up @@ -769,10 +859,7 @@ sub next_result {
'Data' => $len
}
);
if( $strand < 0 ) {
# flip-flop start/end when query is on opposite strand
($querystart,$queryend) = ($queryend,$querystart);
}

$self->element(
{
'Name' => 'Hsp_query-from',
Expand Down Expand Up @@ -1537,6 +1624,8 @@ sub _processHits {

$self->element({'Name' => 'Hsp_bit-score', 'Data' => $hsp->{bits} } )
if exists $hsp->{bits};
$self->element({'Name' => 'Hsp_sw-score', 'Data' => $hsp->{'n-w'} } )
if exists $hsp->{'n-w'};
$self->element({'Name' => 'Hsp_sw-score', 'Data' => $hsp->{sw} } )
if exists $hsp->{sw};
$self->element({'Name' => 'Hsp_gaps', 'Data' => $hsp->{'%_gid'} } )
Expand Down
11 changes: 8 additions & 3 deletions t/SearchIO/fasta.t
Expand Up @@ -7,7 +7,7 @@ BEGIN {
use lib '.';
use Bio::Root::Test;

test_begin(-tests => 301);
test_begin(-tests => 299);

use_ok('Bio::SearchIO');
}
Expand Down Expand Up @@ -364,8 +364,13 @@ is($result->get_parameter('gapopen'), -10);
is($result->get_parameter('gapext'), -2);
is($result->get_parameter('ktup'), 2);
is($result->get_parameter('matrix'), 'BL50');
is($result->get_parameter('wordsize'), 16);
is($result->get_parameter('filter'), '15:-5');

# wordsize is the same as ktup, not opt width, as we used to parse
# is($result->get_parameter('wordsize'), 16);

# this is the range of the scoring matrix, not a filter (which is meant
# to capture whether xS seg filtering used)
# is($result->get_parameter('filter'), '15:-5');

is($result->get_statistic('lambda'), 0.122629);
is($result->get_statistic('dbletters'), 10449259);
Expand Down

0 comments on commit 0e54c5b

Please sign in to comment.