#---------------------------------------------------------------------
# DeFCoM: A supervised learning genomic footprinter
# Copyright (C) 2016 Bryan Quach
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#----------------------------------------------------------------------
from os import path
import pysam
[docs]class Alignments(object):
"""Access and process sequence alignment data.
The Alignment class acts as a wrapper class for pysam to provide additional
footprinting specific data processing functionality. Information about
pysam can be found at http://pysam.readthedocs.org/.
Attributes:
_alignment: A pysam AlignmentFile object.
Args:
aln_file: A string containing the path and filename for a sorted
BAM file. The file must have an index file with the same name
but with the '.bai' extension appended. For example, the index
file for '/path/file.bam' should be '/path/file.bam.bai'.
Raises:
IOError: BAM file could not be found.
AssertionError: File is not a BAM file.
IOError: BAM file index could not be found.
"""
def __init__(self, aln_file):
"""Inits the Alignment class."""
self._alignment = self._load_alignment(aln_file)
def __del__(self):
"""Deconstructor.
Closes the AlignmentFile object in '_alignment'.
"""
try:
self._alignment.close()
except:
pass
def _load_alignment(self, filename):
"""Load the BAM data into an AlignmentFile object.
Args:
filename: A string for the BAM file to load. The path should be
prepended if the file is located outside the working directory.
Returns:
A pysam AlignmentFile object.
Raises:
IOError: BAM file could not be found.
AssertionError: File is not a BAM file.
IOError: BAM file index could not be found.
"""
if not path.isfile(filename):
error_msg = "%s could not be found." % filename
raise IOError(error_msg)
if path.splitext(filename)[1] != ".bam":
error_msg = "%s is not a BAM file." % path.basename(filename)
raise AssertionError(error_msg)
if not path.isfile(filename + ".bai"):
error_msg = "Index file %s not found." % (filename + ".bai")
raise IOError(error_msg)
print "Loading BAM file %s..." % path.basename(filename)
return pysam.AlignmentFile(filename, "rb")
[docs] def get_mapped_read_count(self):
"""Get the number of mapped reads in the AlignmentFile object.
Returns: A long int corresponding to the number of mapped reads in the
'_alignment' object.
"""
return self._alignment.mapped
[docs] def get_unmapped_read_count(self):
"""Get the number of unmapped reads in the AlignmentFile object.
Returns: A long int corresponding to the number of unmapped reads in
the '_alignment' object.
"""
return self._alignment.unmapped
[docs] def get_total_read_count(self):
"""Get the total number of reads in the Samfile object.
Returns: A long int corresponding to the number of total reads in the
'_alignment' object.
"""
return self._alignment.mapped + self._alignment.unmapped
[docs] def get_reads(self, chrom, start, end, multi_iter=False):
"""Get reads overlapping a genomic region.
Retrieves the set of reads overlapping a specified genomic region using
BED format chromosome coordinates i.e., 0-based indexing [start, end).
Args:
chrom: A string for the name of the chromosome for the desired
genomic region. Names are derived from the BAM header.
start: An int for the 0-based start position of the genomic region.
end: An int for the 0-based end position of the genomic region. The
position 'end' denotes one position past the region of interest.
multi_iter: A boolean indicating whether to enable multipe
iterators. This should be 'True' if multiple calls are made to
this method for the same Alignments object and the iterators
are used concurrently.
Returns:
An iterator of AlignedSegment objects.
Raises:
ValueError - The genomic coordinates are out of range, are invalid,
or the file does not permit random access.
"""
return self._alignment.fetch(chrom, start, end,
multiple_iterators=multi_iter)
[docs] def get_read_density(self, chrom, start, end, strand=None,
use_weights=False, multi_iter=False):
"""Compute the total number of reads overlapping a genomic region.
Calculates the read density within a genomic region specified using BED
format chromosome coordinates i.e., 0-based indexing [start, end).
Strand-specific read density is computed if specified. It is
important to note that a read is counted merely if it overlaps the
specified region. The read does not have to be fully contained in the
region.
Args:
chrom: A string for the name of the chromosome for the desired
genomic region. Names are derived from the BAM header.
start: An int for the 0-based start position of the genomic region.
end: An int for the 0-based end position of the genomic region. The
position 'end' denotes one position past the region of interest.
strand: The strand from which to obtain the read density. Must be
one of '+', '-', or 'None' (default). If 'None' is specified
then reads from both strands are included.
multi_iter: A boolean indicating whether to enable multipe
iterators. This should be 'True' if multiple calls are made to
this method concurrently for the same Alignments object.
Returns:
A float value corresponding to the total read count observed in
the genomic interval.
Raises:
KeyError - The tag field 'XW' cannot be found for a read.
ValueError - The genomic coordinates are out of range, are invalid,
or the file does not permit random access.
ValueError - 'strand' is not specified correctly.
"""
reads = self._alignment.fetch(chrom, start, end,
multiple_iterators=multi_iter)
read_data = []
if use_weights:
for read in reads:
read_data.append((read.is_reverse, read.is_unmapped,
read.get_tag('XW')))
if strand == "+":
read_count = 0.0
for i in read_data:
if not i[0] and not i[1]:
read_count += i[2]
elif strand == "-":
read_count = 0.0
for i in read_data:
if i[0] and not i[1]:
read_count += i[2]
elif strand == None:
read_count = sum([i[2] for i in read_data if not i[1]])
else:
raise ValueError("strand must be '+', '-', or None")
else:
read_data = [(read.is_reverse, read.is_unmapped)
for read in reads]
if strand == "+":
read_count = 0.0
for i in read_data:
if not i[0] and not i[1]:
read_count += 1.0
elif strand == "-":
read_count = 0.0
for i in read_data:
if i[0] and not i[1]:
read_count += 1.0
elif strand == None:
read_count = sum([1.0 for i in read_data if not i[1]])
else:
raise ValueError("strand must be '+', '-', or None")
return read_count
[docs] def get_weights(self, chrom, start, end, strand=None, f_offset=0,
r_offset=0, multi_iter=False):
"""Retrieve bias correction weights for a given region.
Retrieves the per position bias correction weights within the
specified genomic interval. The per read weights for a position are
totaled to get the per position weight. The BAM file must contain
bias correction information or else this method will raise an error.
Args:
chrom: A string for the name of the chromosome for the desired
genomic region. Names are derived from the BAM header.
start: An int for the 0-based start position of the genomic region.
end: An int for the 0-based end position of the genomic region. The
position 'end' denotes one position past the region of interest.
strand: The strand from which to obtain the weights. Must be
one of '+', '-', or 'None' (default). If 'None' is specified
then reads from both strands are included.
f_offset: An int denoting the offset downstream from the 5' end of
a forward (+) strand read. Equivalent to read shifting.
r_offset: An int denoting the offset upstream from the 5' end of a
reverse (-) strand read. Equivalent to read shifting.
multi_iter: A boolean indicating whether to enable multipe
iterators. This should be 'True' if multiple calls are made to
this method concurrently for the same Alignments object.
Returns:
A 2-D list of bias correction weights where the first dimension
represents a genomic position. Index 0 corresponds to 'start'. The
second dimension is a list of all the read weights at a position.
Raises:
KeyError - The tag field 'XW' cannot be found for a read.
ValueError - The genomic coordinates are out of range, are invalid,
or the file does not permit random access.
ValueError - 'strand' is not specified correctly.
"""
reads = self._alignment.fetch(chrom, start, end,
multiple_iterators=multi_iter)
pos_weights = {}
neg_weights = {}
for read in reads:
if read.is_unmapped: continue
weight = read.get_tag('XW')
if read.is_reverse:
cut_site = read.reference_end - r_offset - 1
if cut_site >= start and cut_site < end:
current_weight = neg_weights.get(cut_site, 0.0)
neg_weights[cut_site] = current_weight + weight
else:
cut_site = read.reference_start + f_offset
if cut_site >= start and cut_site < end:
current_weight = pos_weights.get(cut_site, 0.0)
pos_weights[cut_site] = current_weight + weight
if strand is None:
pos_weights = [pos_weights.get(x, 0.0) for x in range(start, end)]
neg_weights = [neg_weights.get(x, 0.0) for x in range(start, end)]
combined_weights = []
for x, y in zip(pos_weights, neg_weights):
combined_weights.append(x + y)
return combined_weights
elif strand == '+':
return [pos_weights.get(x, 0.0) for x in range(start, end)]
elif strand == '-':
return [neg_weights.get(x, 0.0) for x in range(start, end)]
else:
raise ValueError("strand must be '+', '-', or None")
[docs] def get_cut_sites(self, chrom, start, end, strand=None, f_offset=0,
r_offset=0, use_weights=False, multi_iter=False):
"""Retrieve DNaseI digestion sites for a genomic region.
Makes a list of counts relative to the start coordinate of the
genomic region specified using BED format chromosome coordinates i.e.,
0-based indexing [start, end). Cuts are assumed to be 5' read ends
unless offsets are specified.
Args:
chrom: A string for the name of the chromosome for the desired
genomic region. Names are derived from the BAM header.
start: An int for the 0-based start position of the genomic region.
end: An int for the 0-based end position of the genomic region. The
position 'end' denotes one position past the region of interest.
strand: The strand from which to obtain the cut counts. Must be
one of '+', '-', 'combined' or 'None' (default). If 'combined'
is specified then reads from both strands are aggregated. If
'None' is specified the read vectors for each strand are
concatenated with the '+' strand vector being first.
f_offset: An int denoting the offset downstream from the 5' end of
a forward (+) strand read. Equivalent to read shifting.
r_offset: An int denoting the offset upstream from the 5' end of a
reverse (-) strand read. Equivalent to read shifting.
use_weights: A boolean indicating whether bias correction weights
should be used for each read. The weights are expected to be
stored as an optional tag 'XW' for each read.
multi_iter: A boolean indicating whether to enable multipe
iterators. This should be 'True' if multiple calls are made to
this method concurrently for the same Alignments object.
Returns:
A list of counts as floats with positions corresponding to the
specified genomic interval.
Raises:
KeyError - The tag field 'XW' cannot be found for a read.
ValueError - The genomic coordinates are out of range, are invalid,
or the file does not permit random access.
ValueError - 'strand' is not specified correctly.
"""
if use_weights:
return self.get_weights(chrom, start, end, strand, f_offset,
r_offset, multi_iter)
reads = self._alignment.fetch(chrom, start, end,
multiple_iterators=multi_iter)
pos_cuts = {}
neg_cuts = {}
for read in reads:
if read.is_unmapped: continue
if read.is_reverse:
cut_site = read.reference_end - r_offset - 1
if cut_site >= start and cut_site < end:
neg_cuts[cut_site] = neg_cuts.get(cut_site, 0.0) + 1.0
else:
cut_site = read.reference_start + f_offset
if cut_site >= start and cut_site < end:
pos_cuts[cut_site] = pos_cuts.get(cut_site, 0.0) + 1.0
if strand is None:
pos_cuts = [pos_cuts.get(x, 0.0) for x in range(start, end)]
neg_cuts = [neg_cuts.get(x, 0.0) for x in range(start, end)]
combined_cuts = ([x + y for x, y in zip(pos_cuts, neg_cuts)])
return combined_cuts
elif strand == "combined":
pos_cuts = [pos_cuts.get(x, 0.0) for x in range(start, end)]
neg_cuts = [neg_cuts.get(x, 0.0) for x in range(start, end)]
concat_cuts = pos_cuts + neg_cuts
return concat_cuts
elif strand == '+':
return [pos_cuts.get(x, 0.0) for x in range(start, end)]
elif strand == '-':
return [neg_cuts.get(x, 0.0) for x in range(start, end)]
else:
err_msg = "strand must be '+', '-', 'combined', or None (default)"
raise ValueError(err_msg)