#---------------------------------------------------------------------
# DeFCoM: A supervised learning genomic footprinter
# Copyright (C) 2016 Bryan Quach
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#----------------------------------------------------------------------
from os import path
from pysam import FastaFile
[docs]class Genome(object):
"""Access and process genomic sequence information.
Provides functionality to retrieve and process genomic sequence data. The
class acts as a wrapper around the pysam FastaFile class.
Attributes:
_genome: A pysam Fastafile object.
Args:
fasta_file: A string containing the path and filename for a genome
FASTA file. The file must have an index file with the same name but
with the '.fai' extension appended. For example, the index file for
'/path/file.fa' should be '/path/file.fa.fai'.
Raises:
IOError: FASTA file could not be found or opened.
IOError: FASTA file index could not be found.
"""
def __init__(self, fasta_file):
"""Initialize Genome class."""
self._genome = self._load_fasta(fasta_file)
def __del__(self):
"""Deconstructor for the Genome class."""
try:
self._genome.close()
except:
pass
def _load_fasta(self, filename):
"""Load the FASTA file data into a FastaFile object.
Args:
filename: A string for the FASTA file to load. The path should be
prepended if the file is located outside the working directory.
Returns:
A pysam FastaFile object.
Raises:
IOError: FASTA file could not be found or opened.
IOError: FASTA file index could not be found.
"""
print "Loading FASTA file %s..." % path.basename(filename)
if not path.isfile(filename):
error_msg = "%s could not be found." % filename
raise IOError(error_msg)
if not path.isfile(filename + ".fai"):
error_msg = "Index file %s not found." % (filename + ".fai")
raise IOError(error_msg)
return FastaFile(filename)
[docs] def get_genome_filename(self):
"""Get the associated FASTA file name.
Returns:
A string for the FASTA file name.
"""
return path.basename(self._genome.filename)
[docs] def get_chr_names(self):
"""Return the chromosome names associated with the genome.
Returns:
A tuple containing chromosome names as strings.
"""
return self._genome.references
[docs] def get_chr_length(self, chrom):
"""Get the length of specified chromosome.
Args:
chrom: A string specifying the chromosome of interest. Must follow
the naming convention specified by the FASTA file.
Returns:
An int value representing the length of the chromosome.
Raises:
KeyError: Chromosome name could not be found.
"""
try:
return self._genome.get_reference_length(chrom)
except KeyError:
print ("Genome has the following chromosome/reference names:\n")
for i in self._genome.references: print i
error_msg = "\n%s not found in reference list!" % chrom
raise KeyError(error_msg)
[docs] def get_sequence(self, chrom, start, end, strand="+"):
"""Get nucleotide sequence from genomic coordinates.
Retrieves the nucleotide sequence for a specified genomic region using
BED format chromosome coordinates i.e., 0-based indexing [start, end).
Args:
chrom: A string specifying the chromosome of interest. Must follow
the naming convention specified by the FASTA file.
start: An int for the 0-based start position of the genomic region.
end: An int for the 0-based end position of the genomic region. The
position 'end' denotes one position past the region of interest.
strand: The strand from which to obtain the cut counts. Must be
either '+' or '-'.
Returns:
A string of uppercase nucleotides corresponding to specified
strand of the genomic region supplied.
Raises:
KeyError - The chromosome name is specified incorrectly.
ValueError - The genomic coordinates are out of range, are invalid,
or the file does not permit random access.
ValueError - 'strand' is not specified correctly.
"""
if strand == "+":
return self._genome.fetch(chrom, start, end).upper()
elif strand == "-":
dna = self._genome.fetch(chrom, start+1, end+1).upper()
return self.reverse_complement(dna)
else:
raise ValueError("strand must be '+' or '-'")
@classmethod
[docs] def complement(self, sequence):
"""Get the complement of the given DNA sequence.
Args:
sequence: A string from alphabet {A,C,G,T,N,a,c,g,t,n}.
Returns:
A string with the complementary DNA sequence of the sequence
provided.
Raises:
KeyError: The string provided contains characters other than
'A', 'C', 'G', 'T', 'N', 'a', 'c', 'g', 't', and 'n'.
"""
seq_dict = {'A':'T', 'T':'A', 'G':'C', 'C':'G', 'N':'N',
'a':'T', 't':'A', 'g':'C', 'c':'G', 'n':'N',}
try:
return "".join([seq_dict[i] for i in sequence.upper()])
except KeyError:
error_msg = "DNA sequence must be from {A,C,G,T,N,a,c,g,t,n}"
raise KeyError(error_msg)
@classmethod
[docs] def reverse_complement(self, sequence):
"""Reverse complement the given DNA sequence.
Args:
sequence: A string from alphabet {A,C,G,T,N,a,c,g,t,n}.
Returns:
A string with the reverse complement DNA sequence of the sequence
provided.
Raises:
KeyError: The string provided contains characters other than
'A', 'C', 'G', 'T', and 'N'.
"""
return self.complement(sequence)[::-1]