# Source code for mrparse.mr_sequence

"""
Created on 17 Nov 2018

@author: jmht & hlasimpk
"""

import copy
import os

from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqUtils import molecular_weight
from Bio.Alphabet import generic_protein
from Bio.SeqRecord import SeqRecord

# Maps a lower-cased file suffix (leading dot removed) to the sequence
# format name that is handed to Biopython's SeqIO functions.
SUFFIX_TO_TYPE = {
    'fasta': 'fasta',
    'pir': 'pir',
    'seq': 'fasta',
    'fastq': 'fastq',
}


class MultipleSequenceException(Exception):
    """Raised when a sequence file cannot be read as a single record.

    ``Sequence`` raises this when Biopython's ``SeqIO.read`` rejects the
    input file with a ``ValueError`` (for example because the file holds
    more than one sequence).
    """


class Sequence(object):
    """Class for handling sequence data.

    The sequence is taken either from a file (``seq_file``) or from a raw
    string (``sequence``); ``seq_file`` takes precedence when both are given.

    Attributes
    ----------
    sequence_file : str or None
        The file the sequence was read from, if any.
    nresidues : int
        Number of residues in the sequence.
    sequence : str
        The sequence as a plain string.
    """

    def __init__(self, seq_file=None, sequence=None, sequence_type=None):
        """Load a sequence from a file or from a string.

        Parameters
        ----------
        seq_file : str
           The filename of the sequence file to read
        sequence : str
           The raw sequence, used only when seq_file is not given
        sequence_type : str
           The type of the sequence file (recognisable to Biopython e.g.
           'fasta'); derived from the suffix of seq_file when omitted

        Raises
        ------
        TypeError
           If neither seq_file nor sequence is provided
        RuntimeError
           If the sequence type cannot be determined from seq_file
        MultipleSequenceException
           If seq_file cannot be read as a single record
        """
        self.sequence_file = seq_file
        self._molecular_weight = None
        self._bio_seq = None
        self._bio_seq_record = None
        if seq_file:
            self._read_sequence_file(seq_file, sequence_type)
        elif sequence:
            # The alphabet belongs on the Seq itself: the second positional
            # argument of SeqRecord is the record id, not an alphabet.
            self._bio_seq = Seq(sequence, generic_protein)
            self._bio_seq_record = SeqRecord(self._bio_seq)
        else:
            # Previously this fell through to len(None); keep the exception
            # type but say what is actually wrong.
            raise TypeError("Sequence requires either seq_file or sequence to be provided")
        self.nresidues = len(self._bio_seq)
        self.sequence = str(self._bio_seq)

    def __len__(self):
        """Return the number of residues, or 0 if it is not set to an int."""
        if isinstance(self.nresidues, int):
            return self.nresidues
        return 0

    def _read_sequence_file(self, seq_file, sequence_type):
        """Read the single record in seq_file into this object.

        Raises RuntimeError when sequence_type is None and cannot be derived
        from the file suffix, and MultipleSequenceException when SeqIO.read
        raises ValueError (per the Biopython documentation this happens when
        the file does not hold exactly one record).
        """
        if sequence_type is None:
            sequence_type = self.sequence_type_from_filename(seq_file)
            if not sequence_type:
                raise RuntimeError("Cannot determine sequence type from file: {}".format(seq_file))

        # Only the read itself is guarded so unrelated errors are not
        # re-labelled as a multiple-sequence problem.
        try:
            record = SeqIO.read(seq_file, sequence_type, alphabet=generic_protein)
        except ValueError as err:
            # Carry Biopython's message along so the caller can see why the
            # file was rejected.
            raise MultipleSequenceException(str(err))
        self._bio_seq_record = record
        self._bio_seq = record.seq

    @staticmethod
    def sequence_type_from_filename(seq_file):
        """Return the sequence type implied by the suffix of seq_file.

        The suffix is compared case-insensitively against SUFFIX_TO_TYPE;
        None is returned when the suffix is not listed there.
        """
        _, suffix = os.path.splitext(seq_file)
        suffix = suffix.lstrip('.').lower()
        return SUFFIX_TO_TYPE.get(suffix)

    @property
    def molecular_weight(self):
        """Molecular weight of the sequence, calculated on first access."""
        if self._molecular_weight is None:
            self._calculate_molecular_weight()
        return self._molecular_weight

    def _calculate_molecular_weight(self):
        """Calculate the protein molecular weight with Biopython and cache it."""
        self._molecular_weight = molecular_weight(self._bio_seq, 'protein')

    def write(self, seq_file, sequence_type=None, description=None):
        """Write sequence out to file seq_file of type sequence_type.

        Parameters
        ----------
        seq_file : str
           The filename of the file
        sequence_type : str
           The type of the sequence (recognisable to Biopython e.g. 'fasta');
           derived from the suffix of seq_file when omitted
        description : str
           The text to put on the first line of the file if different from that already in the record

        Raises
        ------
        RuntimeError
           If sequence_type is None and cannot be determined from seq_file
        """
        if sequence_type is None:
            sequence_type = self.sequence_type_from_filename(seq_file)
            if not sequence_type:
                raise RuntimeError("Cannot determine sequence type from file: {}".format(seq_file))
        if description:
            # Work on a copy so the stored record keeps its own id and
            # description.
            seq_record = copy.copy(self._bio_seq_record)
            seq_record.id = description
            seq_record.description = description
        else:
            seq_record = self._bio_seq_record
        SeqIO.write(seq_record, seq_file, sequence_type)


def merge_multiple_sequences(seq_file):
    """
    Function to merge multiple sequences from a fasta file

    The distinct sequences in seq_file are concatenated in file order
    (repeated sequences are only included once) and written as a single
    record to '<name>_merged.fasta' in the current working directory. The
    record id is the ids of the merged sequences joined with '||'.

    Parameters
    ----------
    seq_file : str
        The filename of the file

    Returns
    -------
    seq : :obj:
        mrparse.mr_sequence.Sequence object

    Raises
    ------
    RuntimeError
        If the sequence type cannot be determined from the file name or the
        file contains no sequences
    """

    sequence_type = Sequence.sequence_type_from_filename(seq_file)
    if not sequence_type:
        raise RuntimeError("Cannot determine sequence type from file: {}".format(seq_file))

    identifiers = []
    unique_sequences = []
    # Compare plain strings held in a set: membership tests are O(1) and do
    # not depend on how Seq objects implement equality.
    seen = set()
    for record in SeqIO.parse(seq_file, sequence_type, alphabet=generic_protein):
        residues = str(record.seq)
        if residues in seen:
            continue
        seen.add(residues)
        unique_sequences.append(residues)
        identifiers.append(record.id)

    if not unique_sequences:
        # Without this check an empty record would be written and read back.
        raise RuntimeError("No sequences found in file: {}".format(seq_file))

    merged_sequence = Seq("".join(unique_sequences), generic_protein)
    bio_seq_record = SeqRecord(merged_sequence)
    bio_seq_record.id = "||".join(identifiers)

    # NOTE(review): the name is cut at the first '.', so 'a.b.fasta' is written
    # as 'a_merged.fasta' - confirm this is intended before changing it.
    file_name = os.path.basename(seq_file).split('.')[0]
    merged_seq_file = os.path.join(os.getcwd(), '{}_merged.fasta'.format(file_name))
    SeqIO.write(bio_seq_record, merged_seq_file, "fasta")

    return Sequence(merged_seq_file)