Skip to content

Commit

Permalink
Cleaned up summary generation. Can generate summary without doing a l…
Browse files Browse the repository at this point in the history
…oad.
  • Loading branch information
Lincoln Stein committed May 24, 2010
1 parent 5539ba3 commit 8095dc7
Show file tree
Hide file tree
Showing 3 changed files with 140 additions and 123 deletions.
115 changes: 0 additions & 115 deletions Bio/DB/SeqFeature/Store/DBI/SQLite.pm
Expand Up @@ -489,121 +489,6 @@ sub _fetch_indexed_features_sql {
END
}

sub feature_summary {
my $self = shift;
my ($seq_name,$start,$end,$types,$bins,$iterator) =
rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],
['TYPES','TYPE','PRIMARY_TAG'],
'BINS',
'ITERATOR',
],@_);
my ($coverage,$tag) = $self->coverage_array(-seqid=> $seq_name,
-start=> $start,
-end => $end,
-type => $types,
-bins => $bins);
my $score = 0;
for (@$coverage) { $score += $_ }
$score /= @$coverage;

my $feature = Bio::SeqFeature::Lite->new(-seq_id => $seq_name,
-start => $start,
-end => $end,
-type => $tag,
-score => $score,
-attributes =>
{ coverage =>
join(',',map {sprintf('%.2f',$_)} @$coverage)
});
return $iterator
? Bio::DB::SeqFeature::Store::DBI::FeatureIterator->new($feature)
: $feature;
}

sub coverage_array {
my $self = shift;
my ($seq_name,$start,$end,$types,$bins) =
rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],
['TYPES','TYPE','PRIMARY_TAG'],'BINS'],@_);

$bins ||= 1000;
$start ||= 1;
unless ($end) {
my $segment = $self->segment($seq_name) or $self->throw("unknown seq_id $seq_name");
$end = $segment->end;
}

my $binsize = ($end-$start+1)/$bins;
my $seqid = $self->_locationid_nocreate($seq_name) || 0;

return [] unless $seqid;

# where each bin starts
my @his_bin_array = map {$start + $binsize * $_} (0..$bins);
my @sum_bin_array = map {int(($_-1)/SUMMARY_BIN_SIZE)} @his_bin_array;

my $interval_stats_table = $self->_qualify('interval_stats');

# pick up the type ids
my ($from,$where,$group,@a) = $self->_types_sql($types,'b');
$where =~ s/.+AND//s;
my $sth = $self->_prepare(<<END);
SELECT id,tag FROM $from
WHERE $where
END
;
my (@t,$report_tag);
$sth->execute(@a);
while (my ($t,$tag) = $sth->fetchrow_array) {
$report_tag ||= $tag;
push @t,$t;
}

my %bins;
for my $typeid (@t) {

my ($from,$where,$group,@a) = $self->_types_sql($types,'b');

my $sql = <<END;
SELECT bin,cum_count
FROM $interval_stats_table
WHERE typeid=?
AND seqid=? AND bin >= ?
LIMIT 1
END
;
my $sth = $self->_prepare($sql);
for (my $i=0;$i<@sum_bin_array;$i++) {

my @args = ($typeid,$seqid,$sum_bin_array[$i]);
$self->_print_query($sql,@args) if $self->debug;

$sth->execute(@args) or $self->throw($sth->errstr);
my ($bin,$cum_count) = $sth->fetchrow_array;
push @{$bins{$typeid}},[$bin,$cum_count];
}
}

my @merged_bins;
my $firstbin = int(($start-1)/$binsize);
for my $type (keys %bins) {
my $arry = $bins{$type};
my $last_count = $arry->[0][1]-1;
my $last_bin = -1;
my $i = 0;
my $delta;
for my $b (@$arry) {
my ($bin,$count) = @$b;
$delta = $count - $last_count if $bin > $last_bin;
$merged_bins[$i++] = $delta;
$last_count = $count;
$last_bin = $bin;
}
}

return wantarray ? (\@merged_bins,"$report_tag:bins") : \@merged_bins;
}

###
# get primary sequence between start and end
#
Expand Down
123 changes: 123 additions & 0 deletions Bio/DB/SeqFeature/Store/DBI/mysql.pm
Expand Up @@ -1973,6 +1973,121 @@ sub _dump_update_attribute_index {
}
}

sub feature_summary {
my $self = shift;
my ($seq_name,$start,$end,$types,$bins,$iterator) =
rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],
['TYPES','TYPE','PRIMARY_TAG'],
'BINS',
'ITERATOR',
],@_);
my ($coverage,$tag) = $self->coverage_array(-seqid=> $seq_name,
-start=> $start,
-end => $end,
-type => $types,
-bins => $bins);
my $score = 0;
for (@$coverage) { $score += $_ }
$score /= @$coverage;

my $feature = Bio::SeqFeature::Lite->new(-seq_id => $seq_name,
-start => $start,
-end => $end,
-type => $tag,
-score => $score,
-attributes =>
{ coverage =>
join(',',map {sprintf('%.2f',$_)} @$coverage)
});
return $iterator
? Bio::DB::SeqFeature::Store::DBI::FeatureIterator->new($feature)
: $feature;
}

sub coverage_array {
my $self = shift;
my ($seq_name,$start,$end,$types,$bins) =
rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],
['TYPES','TYPE','PRIMARY_TAG'],'BINS'],@_);

$bins ||= 1000;
$start ||= 1;
unless ($end) {
my $segment = $self->segment($seq_name) or $self->throw("unknown seq_id $seq_name");
$end = $segment->end;
}

my $binsize = ($end-$start+1)/$bins;
my $seqid = $self->_locationid_nocreate($seq_name) || 0;

return [] unless $seqid;

# where each bin starts
my @his_bin_array = map {$start + $binsize * $_} (0..$bins);
my @sum_bin_array = map {int(($_-1)/SUMMARY_BIN_SIZE)} @his_bin_array;

my $interval_stats_table = $self->_qualify('interval_stats');

# pick up the type ids
my ($from,$where,$group,@a) = $self->_types_sql($types,'b');
$where =~ s/.+AND//s;
my $sth = $self->_prepare(<<END);
SELECT id,tag FROM $from
WHERE $where
END
;
my (@t,$report_tag);
$sth->execute(@a);
while (my ($t,$tag) = $sth->fetchrow_array) {
$report_tag ||= $tag;
push @t,$t;
}

my %bins;
for my $typeid (@t) {

my ($from,$where,$group,@a) = $self->_types_sql($types,'b');

my $sql = <<END;
SELECT bin,cum_count
FROM $interval_stats_table
WHERE typeid=?
AND seqid=? AND bin >= ?
LIMIT 1
END
;
my $sth = $self->_prepare($sql);
for (my $i=0;$i<@sum_bin_array;$i++) {

my @args = ($typeid,$seqid,$sum_bin_array[$i]);
$self->_print_query($sql,@args) if $self->debug;

$sth->execute(@args) or $self->throw($sth->errstr);
my ($bin,$cum_count) = $sth->fetchrow_array;
push @{$bins{$typeid}},[$bin,$cum_count];
}
}

my @merged_bins;
my $firstbin = int(($start-1)/$binsize);
for my $type (keys %bins) {
my $arry = $bins{$type};
my $last_count = $arry->[0][1]-1;
my $last_bin = -1;
my $i = 0;
my $delta;
for my $b (@$arry) {
my ($bin,$count) = @$b;
$delta = $count - $last_count if $bin > $last_bin;
$merged_bins[$i++] = $delta;
$last_count = $count;
$last_bin = $bin;
}
}

return wantarray ? (\@merged_bins,"$report_tag:bins") : \@merged_bins;
}

sub build_summary_statistics {
my $self = shift;
my $interval_stats_table = $self->_qualify('interval_stats');
Expand All @@ -1982,6 +2097,7 @@ sub build_summary_statistics {
my $sbs = SUMMARY_BIN_SIZE;

my $result = eval {
$self->_add_interval_stats_table;
$self->_disable_keys($dbh,$interval_stats_table);
$dbh->do("DELETE FROM $interval_stats_table");

Expand Down Expand Up @@ -2057,6 +2173,13 @@ sub _load_bins {
}
}

sub _add_interval_stats_table {
my $self = shift;
my $tables = $self->table_definitions;
my $interval_table = $self->_qualify('interval_stats');
$self->dbh->do("CREATE TABLE IF NOT EXISTS $interval_table $tables->{interval_stats}");
}

sub _fetch_indexed_features_sql {
my $self = shift;
my $feature_table = $self->_qualify('feature');
Expand Down
25 changes: 17 additions & 8 deletions Bio/DB/SeqFeature/Store/Loader.pm
Expand Up @@ -148,15 +148,18 @@ END
# try to bring in highres time() function
eval "require Time::HiRes";

$tmpdir ||= File::Spec->tmpdir();
$tmpdir ||= File::Spec->tmpdir();

# remember the temporary directory in order to delete it on exit
my $temp_load = tempdir(
'BioDBSeqFeature_XXXXXXX',
DIR=>$tmpdir,
CLEANUP=>1
);

my $tmp_store = Bio::DB::SeqFeature::Store->new(-adaptor => 'berkeleydb',
-temporary=> 1,
-dsn => tempdir(
'BioDBSeqFeature_XXXXXXX',
DIR=>$tmpdir,
CLEANUP=>1
),
-dsn => $temp_load,
-cache => 1,
-write => 1)
unless $normalized;
Expand All @@ -172,6 +175,7 @@ END
verbose => $verbose,
load_data => {},
tmpdir => $tmpdir,
temp_load => $temp_load,
subfeatures_normalized => $normalized,
subfeatures_in_table => $in_table,
coordinate_mapper => $coordinate_mapper,
Expand Down Expand Up @@ -225,7 +229,6 @@ will happen, but it will probably not be what you expect.
=cut

sub load {
warn "LOAD:";
my $self = shift;
my $start = $self->time();
my $count = 0;
Expand All @@ -238,7 +241,6 @@ sub load {
}

if ($self->summary_stats) {
warn "SUMMARY STATS";
$self->msg("Building summary statistics for coverage graphs...");
my $start = $self->time();
$self->build_summary;
Expand Down Expand Up @@ -712,6 +714,13 @@ sub unescape {
return $todecode;
}

sub DESTROY {
my $self = shift;
if (my $ld = $self->{temp_load}) {
unlink $ld;
}
}

1;
__END__
Expand Down

0 comments on commit 8095dc7

Please sign in to comment.