Source code for mrparse.mr_homolog

"""
Created on 18 Oct 2018

@author: jmht
"""
from collections import OrderedDict
import copy
import glob
import logging
import os
from simbad.util.pdb_util import PdbStructure


[docs]class PdbModelException(Exception): pass
PDB_BASE_URL = 'https://www.rcsb.org/structure/' PDB_DIR = 'pdb_files' HOMOLOGS_DIR = 'homologs' logger = logging.getLogger(__name__)
[docs]class HomologData(object): OBJECT_ATTRIBUTES = ['hit', 'region'] def __init__(self): self.ellg = None self.frac_scat = None self.molecular_weight = None self.ncopies = None self.pdb_url = None self.pdb_file = None self.resolution = None self.rmsd = None self.total_frac_scat = None self.total_frac_scat_known = None # OBJECT_ATTRIBUTES - need to remove from static_dict self.hit = None # mr_hit.SequenceHit self.region = None @property def chain_id(self): return self._get_child_attr('hit', 'chain_id') @property def length(self): return self._get_child_attr('hit', 'length') @property def name(self): return self._get_child_attr('hit', 'name') @property def pdb_id(self): return self._get_child_attr('hit', 'pdb_id') @property def range(self): return (self.query_start, self.query_stop) @property def region_id(self): return self._get_child_attr('hit', 'region_id') @property def region_index(self): return self._get_child_attr('hit', 'region_index') @property def score(self): return self._get_child_attr('hit', 'score') @property def seq_ident(self): seq_id = self._get_child_attr('hit', 'local_sequence_identity') if seq_id: return seq_id / 100.0 return None @property def query_start(self): return self._get_child_attr('hit', 'query_start') @property def query_stop(self): return self._get_child_attr('hit', 'query_stop') # miscellaneous properties @property def static_dict(self): """Return a self representation with all properties resolved, suitable for JSON""" d = copy.copy(self.__dict__) for k in self.__dict__.keys(): if k in self.OBJECT_ATTRIBUTES: d.pop(k) # Get all properties for name in dir(self.__class__): obj = getattr(self.__class__, name) if name == 'static_dict': # Skip this property to avoid infinite recursion continue if isinstance(obj, property): val = obj.__get__(self, self.__class__) d[name] = val # Need to add in properties as these aren't included # FIX ONCE UPDATED JS TO HANDLE TWO INTS d['range'] = "{}-{}".format(*self.range) return d def _get_child_attr(self, child, attr): if hasattr(self, child): child = getattr(self, child) if hasattr(child, attr): return getattr(child, attr) return None def __str__(self): attrs = [k for k in self.__dict__.keys() if not k.startswith('_')] INDENT = " " out_str = "Class: {}\nData:\n".format(self.__class__) for a in sorted(attrs): out_str += INDENT + "{} : {}\n".format(a, self.__dict__[a]) return out_str
[docs]def homologs_from_hits(hits, pdb_dir=None): if not os.path.isdir(HOMOLOGS_DIR): os.mkdir(HOMOLOGS_DIR) homologs = OrderedDict() for hit in hits.values(): hlog = HomologData() hlog.hit = hit hit._homolog = hlog hlog.pdb_url = PDB_BASE_URL + hit.pdb_id try: hlog.pdb_file, hlog.molecular_weight, hlog.resolution = prepare_pdb(hit, pdb_dir) except PdbModelException as e: logger.critical("Error processing hit pdb %s", e.message) homologs[hlog.name] = hlog return homologs
[docs]def prepare_pdb(hit, pdb_dir): """ Download pdb or take file from cache trucate to required residues calculate the MW """ if pdb_dir != "None": prefix, ext = os.path.basename(glob.glob(os.path.join(pdb_dir, "*", "*"))[0]).split(".", 1) prefix = prefix[:-4] pdb_file = os.path.join(pdb_dir, "{0}", "{1}{2}.{3}").format(hit.pdb_id[1:3].lower(), prefix, hit.pdb_id.lower(), ext) else: if not os.path.isdir(PDB_DIR): os.mkdir(PDB_DIR) pdb_file = os.path.join(PDB_DIR, "{0}.pdb").format(hit.pdb_id.lower()) pdb_struct = PdbStructure() if os.path.isfile(pdb_file): pdb_struct = pdb_struct.from_file(pdb_file) else: try: pdb_struct = pdb_struct.from_pdb_code(hit.pdb_id) except RuntimeError: # SIMBAD currently raises an empty RuntimeError for download problems. raise PdbModelException("Error downloading PDB file for: {}".format(hit.pdb_id)) pdb_struct.save(pdb_file) resolution = pdb_struct.structure.resolution pdb_struct.standardize() pdb_struct.select_chain_by_id(hit.chain_id) seqid_range = range(hit.hit_start, hit.hit_stop + 1) pdb_struct.select_residues(to_keep_idx=seqid_range) truncated_pdb_name = "{}_{}_{}-{}.pdb".format(hit.pdb_id, hit.chain_id, hit.hit_start, hit.hit_stop) truncated_pdb_path = os.path.join(HOMOLOGS_DIR, truncated_pdb_name) pdb_struct.save(truncated_pdb_path, remarks=["PHASER ENSEMBLE MODEL 1 ID {}".format(hit.local_sequence_identity)]) return truncated_pdb_path, int(round(pdb_struct.molecular_weight)), resolution
[docs]def calculate_ellg(homologs, hkl_info): """Run PHASER to calculate the eLLG values and update the homolog data Sourced from: ccp4-src-2016-02-10/checkout/cctbx-phaser-dials-2015-12-22/phaser/phaser/CalcCCFromMRsolutions.py""" if not (hkl_info.molecular_weight and hkl_info.predicted_ncopies): raise RuntimeError("Cannot calculate eLLGs without molecular_weight and predicted ncopies") import phaser mrinput = phaser.InputMR_DAT() mrinput.setHKLI(hkl_info.hklin) mrinput.setMUTE(True) if hkl_info.labels.i and hkl_info.labels.sigi: mrinput.setLABI_I_SIGI(hkl_info.labels.i, hkl_info.labels.sigi) elif hkl_info.labels.f and hkl_info.labels.sigf: mrinput.setLABI_F_SIGF(hkl_info.labels.f, hkl_info.labels.sigf) else: msg = "No flags for intensities or amplitudes have been provided" raise RuntimeError(msg) datrun = phaser.runMR_DAT(mrinput) if not datrun.Success(): raise RuntimeError("Failed to initialise PHASER input.") ellginput = phaser.InputMR_ELLG() ellginput.setSPAC_HALL(datrun.getSpaceGroupHall()) ellginput.setCELL6(datrun.getUnitCell()) ellginput.setREFL_DATA(datrun.getDATA()) ellginput.setMUTE(True) # Should calculate MW without the search model so that the total MW will be correct when we add the search model ellginput.addCOMP_PROT_MW_NUM(hkl_info.molecular_weight, hkl_info.predicted_ncopies) search_models = [] for hname, d in homologs.items(): if d.pdb_file and d.seq_ident: ellginput.addENSE_PDB_ID(hname, d.pdb_file, d.seq_ident) search_models.append(hname) else: d.ellg = -1 # Set to -1 so that sorting works properly logger.warn("Cannot calculate eLLG for homolog {} due to missing data.".format(hname)) ellginput.addSEAR_ENSE_OR_ENSE_NUM(search_models, 1) runellg = phaser.runMR_ELLG(ellginput) phaser_log = 'phaser1.log' with open(phaser_log, 'w') as w: w.write(runellg.summary()) ellg_data_from_phaser_log(phaser_log, homologs)
[docs]def ellg_data_from_phaser_log(fpath, homologs): with open(fpath) as fh: line = fh.readline() while line: # Get base homolog data if line.strip().startswith('eLLG: eLLG of ensemble alone'): fh.readline() while True: line = fh.readline() if not line.strip(): break eLLG, rmsd, frac_scat, name = line.strip().split() h = homologs[name] h.ellg = float(eLLG) h.rmsd = float(rmsd) h.frac_scat = float(frac_scat) # Get ncopies if line.strip().startswith('Number of copies for eLLG target'): fh.readline() fh.readline() while True: line = fh.readline() if not line.strip(): break _, _, total_frac_scat_known, total_frac_scat, ncopies, homolog = line.strip().split() h = homologs[homolog] h.total_frac_scat_known = float(total_frac_scat_known) h.total_frac_scat = float(total_frac_scat) h.ncopies = int(ncopies) line = fh.readline() return homologs