# Source code for mrparse.mr_sequence

"""
Created on 17 Nov 2018

@author: jmht & hlasimpk
"""

import copy
import os

from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqUtils import molecular_weight
from Bio.Alphabet import generic_protein
from Bio.SeqRecord import SeqRecord

# Maps a lower-cased filename suffix (without the leading dot) to the format
# name handed to Biopython's SeqIO when reading or writing that file.
SUFFIX_TO_TYPE = {
    'fasta': 'fasta',
    'pir': 'pir',
    'seq': 'fasta',
    'fastq': 'fastq',
}


class MultipleSequenceException(Exception):
    """Raised by Sequence when SeqIO.read rejects a sequence file.

    Sequence._read_sequence_file converts the ValueError raised by
    Biopython's SeqIO.read (which expects exactly one record in the file)
    into this exception.
    """
class Sequence(object):
    """Class for handling sequence data.

    The sequence is taken either from a file (read with Biopython's
    ``SeqIO.read``) or from a plain string.

    Parameters
    ----------
    seq_file : str, optional
        Path of the sequence file. Takes precedence over ``sequence`` when
        both are supplied.
    sequence : str, optional
        The sequence itself; only used when ``seq_file`` is not supplied.
    sequence_type : str, optional
        Format name recognisable to Biopython (e.g. 'fasta'). When omitted
        it is derived from the suffix of ``seq_file``.

    Attributes
    ----------
    sequence_file : str or None
        The ``seq_file`` argument exactly as given.
    nresidues : int
        Length of the sequence.
    sequence : str
        The sequence as a plain string.

    Raises
    ------
    TypeError
        If neither ``seq_file`` nor ``sequence`` is supplied (or both are
        empty).
    RuntimeError
        If the file format cannot be determined from the filename.
    MultipleSequenceException
        If ``SeqIO.read`` raises ValueError for the file.
    """

    def __init__(self, seq_file=None, sequence=None, sequence_type=None):
        self.sequence_file = seq_file
        self._molecular_weight = None
        self._bio_seq = None
        self._bio_seq_record = None
        if seq_file:
            self._read_sequence_file(seq_file, sequence_type)
        elif sequence:
            # The alphabet belongs to the Seq; the second positional
            # argument of SeqRecord is the record id, not an alphabet.
            self._bio_seq = Seq(sequence, generic_protein)
            self._bio_seq_record = SeqRecord(self._bio_seq)
        else:
            # Previously this surfaced as an opaque TypeError from len(None);
            # the exception type is kept, only the message is clearer.
            raise TypeError("Sequence requires either a seq_file or a sequence")
        self.nresidues = len(self._bio_seq)
        self.sequence = str(self._bio_seq)

    def __len__(self):
        """Return the number of residues, or 0 if it is not an int."""
        if isinstance(self.nresidues, int):
            return self.nresidues
        return 0

    def _read_sequence_file(self, seq_file, sequence_type):
        """Read the single record in seq_file into this object.

        Raises
        ------
        RuntimeError
            If sequence_type is None and cannot be derived from the filename.
        MultipleSequenceException
            If SeqIO.read raises ValueError; the original message is kept.
        """
        if sequence_type is None:
            sequence_type = self.sequence_type_from_filename(seq_file)
            if not sequence_type:
                raise RuntimeError("Cannot determine sequence type from file: {}".format(seq_file))
        try:
            self._bio_seq_record = SeqIO.read(seq_file, sequence_type, alphabet=generic_protein)
        except ValueError as err:
            # Carry Biopython's explanation along instead of discarding it.
            raise MultipleSequenceException(str(err))
        self._bio_seq = self._bio_seq_record.seq

    @staticmethod
    def sequence_type_from_filename(seq_file):
        """Return the format name for seq_file based on its suffix.

        The suffix is compared case-insensitively against SUFFIX_TO_TYPE.
        Returns None when the suffix is not in that table.
        """
        _, suffix = os.path.splitext(seq_file)
        suffix = suffix.lstrip('.').lower()
        return SUFFIX_TO_TYPE.get(suffix)

    @property
    def molecular_weight(self):
        """Molecular weight of the sequence, computed on first access."""
        if self._molecular_weight is None:
            self._calculate_molecular_weight()
        return self._molecular_weight

    def _calculate_molecular_weight(self):
        """Compute and cache the weight with Bio.SeqUtils.molecular_weight."""
        self._molecular_weight = molecular_weight(self._bio_seq, 'protein')

    def write(self, seq_file, sequence_type=None, description=None):
        """Write sequence out to file seq_file of type sequence_type.

        Parameters
        ----------
        seq_file : str
           The filename of the file
        sequence_type : str
           The type of the sequence (recognisable to Biopython e.g. 'fasta').
           Derived from the suffix of seq_file when omitted.
        description : str
           The text to put on the first line of the file if different from
           that already in the record

        Raises
        ------
        RuntimeError
            If sequence_type is None and cannot be derived from the filename.
        """
        if sequence_type is None:
            sequence_type = self.sequence_type_from_filename(seq_file)
            if not sequence_type:
                raise RuntimeError("Cannot determine sequence type from file: {}".format(seq_file))
        if description:
            # Work on a copy so the stored record keeps its own id/description.
            seq_record = copy.copy(self._bio_seq_record)
            seq_record.id = description
            seq_record.description = description
        else:
            seq_record = self._bio_seq_record
        SeqIO.write(seq_record, seq_file, sequence_type)
def merge_multiple_sequences(seq_file):
    """Merge the distinct sequences of a multi-record file into one sequence.

    Records are read in file order; a record whose sequence has already been
    seen is skipped. The remaining sequences are concatenated, their ids are
    joined with '||', and the result is written as fasta to
    '<name>_merged.fasta' in the current working directory, where <name> is
    the part of the input basename before its first '.'.

    Parameters
    ----------
    seq_file : str
       The filename of the file

    Returns
    -------
    seq : :obj: mrparse.mr_sequence.Sequence object
       A Sequence loaded from the merged file.

    Raises
    ------
    RuntimeError
       If the file format cannot be determined from the filename.
    """
    sequence_type = Sequence.sequence_type_from_filename(seq_file)
    if not sequence_type:
        raise RuntimeError("Cannot determine sequence type from file: {}".format(seq_file))

    identifiers = []
    unique_seqs = []
    # Plain strings in a set give constant-time duplicate checks.
    seen = set()
    for record in SeqIO.parse(seq_file, sequence_type, alphabet=generic_protein):
        residues = str(record.seq)
        if residues in seen:
            continue
        seen.add(residues)
        unique_seqs.append(residues)
        identifiers.append(record.id)

    # Build the merged sequence once, as a protein Seq, so the record holds a
    # Seq even when no records were found.
    merged_seq = Seq("".join(unique_seqs), generic_protein)
    bio_seq_record = SeqRecord(merged_seq)
    bio_seq_record.id = "||".join(identifiers)

    file_name = os.path.basename(seq_file).split('.')[0]
    merged_seq_file = os.path.join(os.getcwd(), '{}_merged.fasta'.format(file_name))
    SeqIO.write(bio_seq_record, merged_seq_file, "fasta")
    return Sequence(merged_seq_file)