# Source code for mrparse.mr_jpred

"""
Created on 14 Nov 2018

@author: jmht
"""
import logging
import os
import re
import shutil
import tarfile

from mrparse.mr_annotation import AnnotationSymbol, SequenceAnnotation
from mrparse.mr_util import now, run_cmd

# Contact address for JPRED submissions (the matching 'email=' argument is
# currently commented out in JPred.submit_job).
JPRED_SUBMISSION_EMAIL = 'jens.thomas@liverpool.ac.uk'

# Module-level logger; attaching a NullHandler is left disabled.
logger = logging.getLogger(__name__)


def _build_symbol(symbol, stype, name):
    """Return an AnnotationSymbol with its symbol, stype and name attributes set."""
    entry = AnnotationSymbol()
    entry.symbol = symbol
    entry.stype = stype
    entry.name = name
    return entry


# Symbols registered on the annotation built by JPred.create_annotation.
HELIX = _build_symbol('H', 'Alpha Helix', 'helix')
SHEET = _build_symbol('E', 'Strand', 'strand')


class OutOfTimeException(Exception):
    """Exception type for running out of time.

    Nothing in this module's visible code raises it; presumably it is
    intended for a JPRED job that does not finish in time -- TODO confirm
    against callers.
    """


class JPred(object):
    """Secondary-structure prediction via the bundled ``jpredapi`` script.

    A job is submitted with ``jpredapi submit``, polled and downloaded with
    ``jpredapi status``, the downloaded archive is unpacked and the
    ``.concise`` results file is parsed into a ``SequenceAnnotation``.

    Attributes
    ----------
    seq_info : object
        Sequence description; ``get_prediction`` reads its ``sequence_file``
        attribute.
    jpred_script : str
        Path of the ``jpredapi`` script inside the ``scripts`` directory that
        sits next to this module.
    prediction : SequenceAnnotation or None
        Result of the last successful ``get_prediction`` call.
    exception : Exception, str or None
        Error recorded by ``get_prediction`` when the sequence file is missing
        (a message string) or when running JPRED failed (the exception).
    """

    def __init__(self, seq_info=None):
        self.seq_info = seq_info
        script_dir = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'scripts')
        self.jpred_script = os.path.join(script_dir, 'jpredapi')
        self.prediction = None
        self.exception = None

    def find_results_file(self, jpred_rundir):
        """Return the path of the ``.concise`` results file in a directory.

        Parameters
        ----------
        jpred_rundir : str
           Directory holding the unpacked JPRED results

        Returns
        -------
        str
           ``jpred_rundir`` joined with the first (alphabetically) file name
           ending in ``.concise``

        Raises
        ------
        RuntimeError
           If the directory does not exist or holds no ``.concise`` file
        """
        if not os.path.isdir(jpred_rundir):
            raise RuntimeError("Cannot find directory:%s" % jpred_rundir)
        # Sorted so the choice is deterministic if several files match.
        concise_files = sorted(f for f in os.listdir(jpred_rundir) if f.endswith('.concise'))
        if not concise_files:
            raise RuntimeError("Cannot find a .concise results file in directory:%s" % jpred_rundir)
        return os.path.join(jpred_rundir, concise_files[0])

    @staticmethod
    def parse_jpred_output(out_concise):
        """Extract the ``jnetpred`` and ``Lupas_28`` records from a concise file.

        Each record is a line of the form ``<prefix>v1,v2,...``; the
        comma-separated values are concatenated into a single string. If a
        record occurs more than once the last occurrence wins.

        Parameters
        ----------
        out_concise : str
           Path to the JPRED ``.concise`` output file

        Returns
        -------
        tuple
           ``(ss_pred, cc_28)`` - the joined ``jnetpred`` and ``Lupas_28``
           strings

        Raises
        ------
        RuntimeError
           If either record is missing or empty
        """
        ss_pred = None
        cc_28 = None
        logger.debug('Parsing JPRED concise output: %s', out_concise)
        with open(out_concise) as f:
            for line in f:
                for prefix in ('Lupas_28:', 'jnetpred:'):
                    if not line.startswith(prefix):
                        continue
                    joined = "".join(line.strip()[len(prefix):].split(","))
                    if prefix == 'Lupas_28:':
                        cc_28 = joined
                    else:
                        ss_pred = joined
                    break
        # Explicit check rather than ``assert`` so it survives ``python -O``.
        if not (ss_pred and cc_28):
            raise RuntimeError("Cannot parse jnetpred and Lupas_28 records from: {}".format(out_concise))
        return ss_pred, cc_28

    @staticmethod
    def parse_results_output(output):
        """Parse the path of the downloaded archive from ``jpredapi status`` output.

        Example of the output being parsed::

            Your job status will be checked with the following parameters:
            JobId: jp_H_5vG49
            getResults: yes
            checkEvery: 10 [sec]
            Thu Nov 29 15:02:01 2018    --->    Job jp_H_5vG49 finished. Results available at the following URL:
            http://www.compbio.dundee.ac.uk/jpred4/results/jp_H_5vG49/jp_H_5vG49.results.html


            Will attempt to download results now (using 'wget') from:
            http://www.compbio.dundee.ac.uk/jpred4/results/jp_H_5vG49/jp_H_5vG49.tar.gz

            Job results archive is now available at: jp_H_5vG49/jp_H_5vG49.tar.gz

        Parameters
        ----------
        output : str
           Text printed by the ``jpredapi status`` command

        Returns
        -------
        str
           The ``<dir>/<name>.tar.gz`` path reported on the final line

        Raises
        ------
        RuntimeError
           If the archive line cannot be found
        """
        mregx = r'Job results archive is now available at: ?(\S+/\S+\.tar\.gz)'
        match = re.search(mregx, output)
        if not match:
            raise RuntimeError("Cannot parse directory path from output: {}".format(output))
        return match.group(1)

    @staticmethod
    def parse_status_url(output):
        """Parse the job id and status URL from ``jpredapi submit`` output.

        Example of the output being parsed::

            Your job will be submitted with the following parameters:
            file: ../data/Q13586.fasta
            format: seq
            skipPDB: on
            email: jens.thomas@liverpool.ac.uk
            name: jens_test_job


            Created JPred job with jobid: jp_H_5vG49
            You can check the status of the job using the following URL: http://www.compbio.dundee.ac.uk/jpred4/cgi-bin/chklog?jp_H_5vG49
            ...or using 'perl jpredapi status jobid=jp_H_5vG49 getResults=yes checkEvery=60 silent' command
            (Check documentation for more details.)

        Parameters
        ----------
        output : str
           Text printed by the ``jpredapi submit`` command

        Returns
        -------
        tuple
           ``(jobid, status_url)``

        Raises
        ------
        RuntimeError
           If the job id / URL lines cannot be found
        """
        # ``https?`` accepts both http and https status URLs.
        mregx = (r'Created JPred job with jobid: (\S+)\s+'
                 r'You can check the status of the job using the following URL: ?(https?://\S+)')
        match = re.search(mregx, output)
        if not match:
            raise RuntimeError("Cannot parse jobid and status_url from output: {}".format(output))
        jobid = match.group(1)
        status_url = match.group(2)
        return jobid, status_url

    @staticmethod
    def create_annotation(annotation):
        """Wrap a prediction string in a ``SequenceAnnotation``.

        The annotation gets source ``'Jpred server'``, the HELIX and SHEET
        symbols registered on it, and a score of 1.0 per annotated position.

        Parameters
        ----------
        annotation : str
           Per-residue prediction string (as returned by ``parse_jpred_output``)

        Returns
        -------
        SequenceAnnotation
        """
        ann = SequenceAnnotation()
        ann.source = 'Jpred server'
        ann.annotation = annotation
        ann.library_add_annotation(HELIX)
        ann.library_add_annotation(SHEET)
        ann.scores = [1.0] * len(annotation)
        return ann

    def get_prediction(self, download_tgz=None, jpred_output=None, cleanup=False):
        """Calculate SS using the online JPRED server

        Parameters
        ----------
        download_tgz : str
           A results tar.gz file from the JPRED server - FOR RUNNING  UNIT TESTS
        jpred_output : str
           A results file from the JPRED server - FOR RUNNING  UNIT TESTS
        cleanup : bool
           Delete the downloaded/unpacked results. Note that this removes the
           whole directory containing the downloaded archive. Nothing is
           removed when ``jpred_output`` is supplied, as nothing is unpacked.

        Returns
        -------
        SequenceAnnotation
           The prediction, which is also stored in ``self.prediction``

        Raises
        ------
        RuntimeError
           If the sequence file is missing or the results cannot be parsed
        Exception
           Any error raised while running JPRED is logged, stored in
           ``self.exception`` and re-raised
        """
        # Only set when an archive is unpacked; must exist for the cleanup step.
        results_directory = None
        if not (download_tgz or jpred_output):  # for testing
            if not os.path.isfile(self.seq_info.sequence_file):
                msg = "Cannot find JPRED sequence file: %s" % self.seq_info.sequence_file
                self.exception = msg
                logger.critical(msg)
                raise RuntimeError(msg)
            try:
                download_tgz = self.run_jpred(self.seq_info.sequence_file)
            except Exception as e:
                logger.critical(e)
                self.exception = e
                raise
        if not jpred_output:
            results_directory = self.unpack_results(download_tgz)
            jpred_output = self.find_results_file(results_directory)
        ss_pred, _ = self.parse_jpred_output(jpred_output)
        if cleanup:
            self.cleanup(results_directory)
        self.prediction = self.create_annotation(ss_pred)
        logger.debug("JPred finished prediction at: %s", now())
        return self.prediction

    def run_jpred(self, seqin):
        """Submit ``seqin`` to JPRED, wait for the job and return the archive path.

        Parameters
        ----------
        seqin : str
           Path to the sequence file to submit

        Returns
        -------
        str
           Absolute path of the downloaded results tar.gz
        """
        logger.debug("JPred starting prediction at: %s", now())
        jobid = self.submit_job(seqin)
        download_tgz = self.get_results(jobid)
        return download_tgz

    def submit_job(self, seqin):
        """Submit a sequence file with ``jpredapi submit`` and return the job id.

        Parameters
        ----------
        seqin : str
           Path to the sequence file to submit

        Returns
        -------
        str
           The job id parsed from the script output

        Raises
        ------
        RuntimeError
           If the script output cannot be parsed
        """
        # The 'email={}'.format(JPRED_SUBMISSION_EMAIL) argument is deliberately
        # not passed at present.
        cmd = [self.jpred_script,
               'submit',
               'file=%s' % seqin,
               'mode=single',
               'format=fasta',
               'name=ccp4_mrparse_submission',
               'skipPDB=on']
        out = run_cmd(cmd)
        jobid, status_url = self.parse_status_url(out)
        logger.info("*** Submitted JPRED job with id %s - check its progress here: %s", jobid, status_url)
        return jobid

    def get_results(self, jobid):
        """Check results and download from the server

        Runs ``jpredapi status`` with ``getResults=yes`` and ``checkEvery=10``
        and parses the location of the downloaded archive from its output.

        Parameters
        ----------
        jobid : str
           Job id returned by ``submit_job``

        Returns
        -------
        str
           Absolute path of the downloaded results tar.gz

        Raises
        ------
        RuntimeError
           If the script output cannot be parsed
        """
        cmd = [self.jpred_script,
               'status',
               'jobid=%s' % jobid,
               'getResults=yes',
               'checkEvery=10']
        out = run_cmd(cmd)
        download_tgz = self.parse_results_output(out)
        # The script reports a relative path; make it absolute.
        download_tgz = os.path.abspath(download_tgz)
        logger.debug("JPred results downloaded to: %s", download_tgz)
        return download_tgz

    def unpack_results(self, download_tgz):
        """Extract a results archive into the directory that contains it.

        Parameters
        ----------
        download_tgz : str
           Path to the results tar archive

        Returns
        -------
        str
           The directory the archive was extracted into

        Raises
        ------
        RuntimeError
           If the archive is empty or holds a member that would be written
           outside the extraction directory
        """
        job_directory = os.path.dirname(download_tgz)
        extract_root = os.path.realpath(job_directory)
        allowed_prefix = os.path.join(extract_root, '')
        with tarfile.open(download_tgz, 'r:*') as tf:
            members = tf.getmembers()
            if not members:
                raise RuntimeError('Empty archive: %s' % download_tgz)
            # The archive is downloaded from a remote server: refuse member
            # names (absolute paths, '..') that escape the job directory.
            for member in members:
                target = os.path.realpath(os.path.join(extract_root, member.name))
                if target != extract_root and not target.startswith(allowed_prefix):
                    raise RuntimeError('Unsafe path in archive %s: %s' % (download_tgz, member.name))
            tf.extractall(path=job_directory)
        logger.debug('Extracted jpred files to: %s', job_directory)
        return job_directory

    def cleanup(self, results_directory):
        """Recursively delete ``results_directory`` if it is an existing directory.

        Parameters
        ----------
        results_directory : str or None
           Directory to remove; ``None``, empty or missing paths are ignored
        """
        if results_directory and os.path.isdir(results_directory):
            logger.debug('Removing jpred results directory: %s', results_directory)
            shutil.rmtree(results_directory)