Source code for defcom.alignment

#---------------------------------------------------------------------
# DeFCoM: A supervised learning genomic footprinter
# Copyright (C) 2016  Bryan Quach
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#----------------------------------------------------------------------

from os import path
import pysam

[docs]class Alignments(object):

    """Access and process sequence alignment data.

    The Alignment class acts as a wrapper class for pysam to provide additional
    footprinting specific data processing functionality. Information about 
    pysam can be found at http://pysam.readthedocs.org/.

    Attributes:
        _alignment: A pysam AlignmentFile object.

    Args:
        aln_file: A string containing the path and filename for a sorted 
            BAM file. The file must have an index file with the same name 
            but with the '.bai' extension appended. For example, the index 
            file for '/path/file.bam' should be '/path/file.bam.bai'.

    Raises:
        IOError: BAM file could not be found.
        AssertionError: File is not a BAM file.
        IOError: BAM file index could not be found.
    """

    def __init__(self, aln_file):
        """Inits the Alignment class."""
        self._alignment = self._load_alignment(aln_file)

    def __del__(self):
        """Deconstructor.

        Closes the AlignmentFile object in '_alignment'.
        """
        try:
            self._alignment.close()
        except:
            pass

    def _load_alignment(self, filename):
        """Load the BAM data into an AlignmentFile object.

        Args:
            filename: A string for the BAM file to load. The path should be 
                prepended if the file is located outside the working directory.

        Returns:
            A pysam AlignmentFile object.
    
        Raises:
            IOError: BAM file could not be found.
            AssertionError: File is not a BAM file.
            IOError: BAM file index could not be found.
        """
        if not path.isfile(filename): 
            error_msg = "%s could not be found." % filename
            raise IOError(error_msg)
        if path.splitext(filename)[1] != ".bam":
            error_msg = "%s is not a BAM file." % path.basename(filename)
            raise AssertionError(error_msg)
        if not path.isfile(filename + ".bai"):
            error_msg = "Index file %s not found." % (filename + ".bai")
            raise IOError(error_msg)
        print "Loading BAM file %s..." % path.basename(filename)
        return pysam.AlignmentFile(filename, "rb")

[docs]    def get_mapped_read_count(self):
        """Get the number of mapped reads in the AlignmentFile object.

        Returns: A long int corresponding to the number of mapped reads in the
            '_alignment' object.
        """
        return self._alignment.mapped

[docs]    def get_unmapped_read_count(self):
        """Get the number of unmapped reads in the AlignmentFile object.

        Returns: A long int corresponding to the number of unmapped reads in 
            the '_alignment' object.
        """
        return self._alignment.unmapped
    
[docs]    def get_total_read_count(self):
        """Get the total number of reads in the Samfile object.

        Returns: A long int corresponding to the number of total reads in the
            '_alignment' object.
        """
        return self._alignment.mapped + self._alignment.unmapped

[docs]    def get_reads(self, chrom, start, end, multi_iter=False):
        """Get reads overlapping a genomic region.

        Retrieves the set of reads overlapping a specified genomic region using
        BED format chromosome coordinates i.e., 0-based indexing [start, end).

        Args:
            chrom: A string for the name of the chromosome for the desired 
                genomic region. Names are derived from the BAM header. 

            start: An int for the 0-based start position of the genomic region.

            end: An int for the 0-based end position of the genomic region. The
               position 'end' denotes one position past the region of interest.

            multi_iter: A boolean indicating whether to enable multipe 
                iterators. This should be 'True' if multiple calls are made to
                this method for the same Alignments object and the iterators 
                are used concurrently.

        Returns:
            An iterator of AlignedSegment objects.

        Raises:
            ValueError - The genomic coordinates are out of range, are invalid,
                or the file does not permit random access.
        """
        return self._alignment.fetch(chrom, start, end, 
            multiple_iterators=multi_iter)

[docs]    def get_read_density(self, chrom, start, end, strand=None, 
        use_weights=False, multi_iter=False):
        """Compute the total number of reads overlapping a genomic region.

        Calculates the read density within a genomic region specified using BED
        format chromosome coordinates i.e., 0-based indexing [start, end). 
        Strand-specific read density is computed if specified. It is 
        important to note that a read is counted merely if it overlaps the
        specified region. The read does not have to be fully contained in the
        region.

        Args:
            chrom: A string for the name of the chromosome for the desired 
                genomic region. Names are derived from the BAM header. 

            start: An int for the 0-based start position of the genomic region.

            end: An int for the 0-based end position of the genomic region. The
               position 'end' denotes one position past the region of interest.

            strand: The strand from which to obtain the read density. Must be 
               one of '+', '-', or 'None' (default). If 'None' is specified 
               then reads from both strands are included.

            multi_iter: A boolean indicating whether to enable multipe 
                iterators. This should be 'True' if multiple calls are made to
                this method concurrently for the same Alignments object.

        Returns:
            A float value corresponding to the total read count observed in
            the genomic interval.
        
        Raises:
            KeyError - The tag field 'XW' cannot be found for a read.

            ValueError - The genomic coordinates are out of range, are invalid,
                or the file does not permit random access.

            ValueError - 'strand' is not specified correctly.
        """
        reads = self._alignment.fetch(chrom, start, end, 
            multiple_iterators=multi_iter)
        read_data = []
        if use_weights:
            for read in reads:
                read_data.append((read.is_reverse, read.is_unmapped, 
                    read.get_tag('XW')))
            if strand == "+":
                read_count = 0.0
                for i in read_data:
                    if not i[0] and not i[1]:
                        read_count += i[2]
            elif strand == "-":
                read_count = 0.0
                for i in read_data:
                    if i[0] and not i[1]:
                        read_count += i[2]
            elif strand == None:
                read_count = sum([i[2] for i in read_data if not i[1]])
            else:
                raise ValueError("strand must be '+', '-', or None")
        else:
            read_data = [(read.is_reverse, read.is_unmapped) 
                for read in reads]
            if strand == "+":
                read_count = 0.0
                for i in read_data:
                    if not i[0] and not i[1]:
                        read_count += 1.0
            elif strand == "-":
                read_count = 0.0
                for i in read_data:
                    if i[0] and not i[1]:
                        read_count += 1.0
            elif strand == None:
                read_count = sum([1.0 for i in read_data if not i[1]])
            else:
                raise ValueError("strand must be '+', '-', or None")
        return read_count

[docs]    def get_weights(self, chrom, start, end, strand=None, f_offset=0,
        r_offset=0, multi_iter=False):
        """Retrieve bias correction weights for a given region.

        Retrieves the per position bias correction weights within the 
        specified genomic interval. The per read weights for a position are
        totaled to get the per position weight. The BAM file must contain
        bias correction information or else this method will raise an error.

        Args:
            chrom: A string for the name of the chromosome for the desired 
                genomic region. Names are derived from the BAM header. 

            start: An int for the 0-based start position of the genomic region.

            end: An int for the 0-based end position of the genomic region. The
               position 'end' denotes one position past the region of interest.

            strand: The strand from which to obtain the weights. Must be 
               one of '+', '-', or 'None' (default). If 'None' is specified 
               then reads from both strands are included.

            f_offset: An int denoting the offset downstream from the 5' end of
               a forward (+) strand read. Equivalent to read shifting.

            r_offset: An int denoting the offset upstream from the 5' end of a
               reverse (-) strand read. Equivalent to read shifting.

            multi_iter: A boolean indicating whether to enable multipe 
                iterators. This should be 'True' if multiple calls are made to
                this method concurrently for the same Alignments object.

        Returns:
            A 2-D list of bias correction weights where the first dimension
            represents a genomic position. Index 0 corresponds to 'start'. The
            second dimension is a list of all the read weights at a position.

        Raises:
            KeyError - The tag field 'XW' cannot be found for a read.

            ValueError - The genomic coordinates are out of range, are invalid,
                or the file does not permit random access.

            ValueError - 'strand' is not specified correctly.
        """
        reads = self._alignment.fetch(chrom, start, end, 
            multiple_iterators=multi_iter)
        pos_weights = {}
        neg_weights = {}
        for read in reads:
            if read.is_unmapped: continue
            weight = read.get_tag('XW')
            if read.is_reverse:
                cut_site = read.reference_end - r_offset - 1
                if cut_site >= start and cut_site < end:
                    current_weight = neg_weights.get(cut_site, 0.0)
                    neg_weights[cut_site] = current_weight + weight
            else:
                cut_site = read.reference_start + f_offset
                if cut_site >= start and cut_site < end:
                    current_weight = pos_weights.get(cut_site, 0.0)
                    pos_weights[cut_site] = current_weight + weight
        if strand is None:
            pos_weights = [pos_weights.get(x, 0.0) for x in range(start, end)]
            neg_weights = [neg_weights.get(x, 0.0) for x in range(start, end)]
            combined_weights = []
            for x, y in zip(pos_weights, neg_weights):
                combined_weights.append(x + y)
            return combined_weights
        elif strand == '+':
            return [pos_weights.get(x, 0.0) for x in range(start, end)]
        elif strand == '-':
            return [neg_weights.get(x, 0.0) for x in range(start, end)]
        else:
            raise ValueError("strand must be '+', '-', or None")

[docs]    def get_cut_sites(self, chrom, start, end, strand=None, f_offset=0,
        r_offset=0, use_weights=False, multi_iter=False):
        """Retrieve DNaseI digestion sites for a genomic region.

        Makes a list of counts relative to the start coordinate of the
        genomic region specified using BED format chromosome coordinates i.e., 
        0-based indexing [start, end). Cuts are assumed to be 5' read ends
        unless offsets are specified.

        Args:
            chrom: A string for the name of the chromosome for the desired 
                genomic region. Names are derived from the BAM header.
 
            start: An int for the 0-based start position of the genomic region.

            end: An int for the 0-based end position of the genomic region. The
               position 'end' denotes one position past the region of interest.

            strand: The strand from which to obtain the cut counts. Must be 
               one of '+', '-', 'combined' or 'None' (default). If 'combined' 
               is specified then reads from both strands are aggregated. If 
               'None' is specified the read vectors for each strand are 
               concatenated with the '+' strand vector being first.

            f_offset: An int denoting the offset downstream from the 5' end of
               a forward (+) strand read. Equivalent to read shifting.

            r_offset: An int denoting the offset upstream from the 5' end of a
               reverse (-) strand read. Equivalent to read shifting.

            use_weights: A boolean indicating whether bias correction weights 
                should be used for each read. The weights are expected to be
                stored as an optional tag 'XW' for each read.

            multi_iter: A boolean indicating whether to enable multipe 
                iterators. This should be 'True' if multiple calls are made to
                this method concurrently for the same Alignments object.

        Returns:
            A list of counts as floats with positions corresponding to the 
            specified genomic interval.

        Raises:
            KeyError - The tag field 'XW' cannot be found for a read.

            ValueError - The genomic coordinates are out of range, are invalid,
                or the file does not permit random access.

            ValueError - 'strand' is not specified correctly.
        """
        if use_weights:
            return self.get_weights(chrom, start, end, strand, f_offset, 
                r_offset, multi_iter)
        reads = self._alignment.fetch(chrom, start, end, 
            multiple_iterators=multi_iter)
        pos_cuts = {}
        neg_cuts = {}
        for read in reads:
            if read.is_unmapped: continue
            if read.is_reverse:
                cut_site = read.reference_end - r_offset - 1
                if cut_site >= start and cut_site < end:
                    neg_cuts[cut_site] = neg_cuts.get(cut_site, 0.0) + 1.0
            else:
                cut_site = read.reference_start + f_offset
                if cut_site >= start and cut_site < end:
                    pos_cuts[cut_site] = pos_cuts.get(cut_site, 0.0) + 1.0
        if strand is None:
            pos_cuts = [pos_cuts.get(x, 0.0) for x in range(start, end)]
            neg_cuts = [neg_cuts.get(x, 0.0) for x in range(start, end)]
            combined_cuts = ([x + y for x, y in zip(pos_cuts, neg_cuts)])
            return combined_cuts
        elif strand == "combined":
            pos_cuts = [pos_cuts.get(x, 0.0) for x in range(start, end)]
            neg_cuts = [neg_cuts.get(x, 0.0) for x in range(start, end)]
            concat_cuts = pos_cuts + neg_cuts
            return concat_cuts
        elif strand == '+':
            return [pos_cuts.get(x, 0.0) for x in range(start, end)]
        elif strand == '-':
            return [neg_cuts.get(x, 0.0) for x in range(start, end)]
        else:
            err_msg = "strand must be '+', '-', 'combined', or None (default)"
            raise ValueError(err_msg)