Source code for defcom.genome

#---------------------------------------------------------------------
# DeFCoM: A supervised learning genomic footprinter
# Copyright (C) 2016  Bryan Quach
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#----------------------------------------------------------------------

from os import path
from pysam import FastaFile

[docs]class Genome(object):

    """Access and process genomic sequence information.
    
    Provides functionality to retrieve and process genomic sequence data. The
    class acts as a wrapper around the pysam FastaFile class.
    
    Attributes:
        _genome: A pysam Fastafile object.

    Args:
        fasta_file: A string containing the path and filename for a genome 
            FASTA file. The file must have an index file with the same name but
            with the '.fai' extension appended. For example, the index file for
            '/path/file.fa' should be '/path/file.fa.fai'.

    Raises:
        IOError: FASTA file could not be found or opened.

        IOError: FASTA file index could not be found.
    """

    def __init__(self, fasta_file):
        """Initialize Genome class."""
        self._genome = self._load_fasta(fasta_file)

    def __del__(self):
        """Deconstructor for the Genome class."""
        try:
            self._genome.close()
        except:
            pass

    def _load_fasta(self, filename):
        """Load the FASTA file data into a FastaFile object.

        Args:
            filename: A string for the FASTA file to load. The path should be
                prepended if the file is located outside the working directory.

        Returns:
          A pysam FastaFile object.

        Raises:
            IOError: FASTA file could not be found or opened.

            IOError: FASTA file index could not be found.
        """
        print "Loading FASTA file %s..." % path.basename(filename)
        if not path.isfile(filename):
            error_msg = "%s could not be found." % filename
            raise IOError(error_msg)
        if not path.isfile(filename + ".fai"):
            error_msg = "Index file %s not found." % (filename + ".fai")
            raise IOError(error_msg)
        return FastaFile(filename)

[docs]    def get_genome_filename(self):
        """Get the associated FASTA file name.

        Returns:
            A string for the FASTA file name.
        """
        return path.basename(self._genome.filename)

[docs]    def get_chr_names(self):
        """Return the chromosome names associated with the genome.

        Returns:
            A tuple containing chromosome names as strings.
        """
        return self._genome.references

[docs]    def get_chr_length(self, chrom):
        """Get the length of specified chromosome.
        
        Args:
            chrom: A string specifying the chromosome of interest. Must follow
                the naming convention specified by the FASTA file.

        Returns:
            An int value representing the length of the chromosome.

        Raises:
            KeyError: Chromosome name could not be found.
        """
        try:
            return self._genome.get_reference_length(chrom)
        except KeyError:
            print ("Genome has the following chromosome/reference names:\n")
            for i in self._genome.references: print i
            error_msg = "\n%s not found in reference list!" % chrom
            raise KeyError(error_msg)

[docs]    def get_sequence(self, chrom, start, end, strand="+"):
        """Get nucleotide sequence from genomic coordinates.

        Retrieves the nucleotide sequence for a specified genomic region using
        BED format chromosome coordinates i.e., 0-based indexing [start, end).

        Args:
            chrom: A string specifying the chromosome of interest. Must follow
                the naming convention specified by the FASTA file.

            start: An int for the 0-based start position of the genomic region.

            end: An int for the 0-based end position of the genomic region. The
               position 'end' denotes one position past the region of interest.

            strand: The strand from which to obtain the cut counts. Must be 
               either '+' or '-'.

        Returns:
            A string of uppercase nucleotides corresponding to specified
            strand of the genomic region supplied.

        Raises:
            KeyError - The chromosome name is specified incorrectly.

            ValueError - The genomic coordinates are out of range, are invalid,
                or the file does not permit random access.

            ValueError - 'strand' is not specified correctly.
        """
        if strand == "+":
            return self._genome.fetch(chrom, start, end).upper()
        elif strand == "-":
            dna = self._genome.fetch(chrom, start+1, end+1).upper()
            return self.reverse_complement(dna)
        else:
            raise ValueError("strand must be '+' or '-'")

    @classmethod
[docs]    def complement(self, sequence):
        """Get the complement of the given DNA sequence.
           
        Args:
            sequence: A string from alphabet {A,C,G,T,N,a,c,g,t,n}.
    
        Returns:
            A string with the complementary DNA sequence of the sequence 
            provided.

        Raises:
            KeyError: The string provided contains characters other than 
                'A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', and 'n'.
        """
        seq_dict = {'A':'T', 'T':'A', 'G':'C', 'C':'G', 'N':'N',
            'a':'T', 't':'A', 'g':'C', 'c':'G', 'n':'N',}
        try:        
            return "".join([seq_dict[i] for i in sequence.upper()])
        except KeyError:
            error_msg = "DNA sequence must be from {A,C,G,T,N,a,c,g,t,n}"
            raise KeyError(error_msg)
    
    @classmethod
[docs]    def reverse_complement(self, sequence):
        """Reverse complement the given DNA sequence.
           
        Args:
            sequence: A string from alphabet {A,C,G,T,N,a,c,g,t,n}.
    
        Returns:
            A string with the reverse complement DNA sequence of the sequence 
            provided.

        Raises:
            KeyError: The string provided contains characters other than 
                'A', 'C', 'G', 'T', and 'N'.
        """
        return self.complement(sequence)[::-1]