#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# completemodel : loop modelling using FREAD and MODELLER
#
# Author: Sebastian Kelm
# Created: 01/12/2011
#
#



import sys, os.path

if __name__ == "__main__":
  # Make sure we can import stuff
  sys.path.append(os.path.abspath(os.path.dirname(sys.argv[0])+"/../lib-python"))

from prosci.common import *
from prosci.util.ali import Ali
from prosci.util.pdb import Pdb
from prosci.util.residue import ResidueList
from prosci.util.pdb3d import dist
from prosci.util.gaps import count_nongaps, deGappify, isGap
from prosci.medeller.fread import getFreadPath, modelAllGaps, mapCore2TargetSeq
from prosci.loops.fread import FREAD_RANKING_BY_ESSS



if __name__ == "__main__":
    
    
    FREAD_IS_INSTALLED = bool(getFreadPath())
    
    
    import prosci.shell
    
    ################################
    # Command line option handling #
    ################################
    
    # 'history', 
    params = prosci.shell.Params(
       allowed=['prefix', 'atm', 'verbose', 'refine_sidechains', 'dbtype', 'scwrl', 'debug'],
       required=['seq', 'struc'],
       withargument=['prefix', 'seq', 'struc', 'dbtype', 'extension_max'],
       helpoption='help',
       usage="""
USAGE:
  
    $0 [OPTIONS] --seq <amino acid sequence> --struc <PDB file of model>
      \n""",
       
       help="""
  REQUIRED INPUT:

    --seq SEQUENCE  
              SEQUENCE can be the literal complete amino acid sequence of the complete target protein,
              or the path to a file containing it (either literally or in FASTA format).
              e.g. 'AAFSASKHEKSS'
              e.g. 'complete_sequence.txt'
    
    --struc PDBFILE
              PDBFILE is the filename of the PDB file containing the incomplete model's co-ordinates
  
  OTHER OPTIONS:
    
    --help
        Print this message and exit
    
    --prefix PREFIX
        Sets the prefix for output files to PREFIX
        (if PREFIX does not end with a dot, an additional dot is inserted between PREFIX and the
        default file name)
    
    --atm
        Output files with the .atm extension, instead of .pdb
    
    --verbose
        Print status messages
    
    --debug
        Enable debug mode.
        Don't regenerate HiAcc and HiCov models if they are already present.
  
  
  ALGORITHM BEHAVIOUR:
    
    --dbtype TYPE
        Sets the database type to use during FREAD loop modelling phases.
        TYPE can be one of the following values:
          mem           Membrane protein database
          sol           Soluble protein database
          ab            Immunoglobulin-like (antibody-like) database
    
    --extension_max INT
        Set maximum FREAD loop extension to INT. (Default: -1, i.e. unlimited)
        Set this to 0 to switch off extension altogether.
    
    --scwrl
        Model missing sidechains using SCWRL4. The executable 'Scwrl4' must be
        in your PATH environment variable.
    
    --refine_sidechains
        Refine sidechains using Modeller. This makes the final modelling phase
        (phase 6) much slower, but will reduce sidechain clashes. Use this if
        you want decent sidechains but don't have SCWRL installed. This only
        has an effect if you are creating a complete model and are NOT using the
        --scwrl option.
  
  
  DEFAULT OUTPUT FILES:
  
    Output filenames will have the format PREFIX.SUFFIX
    For details on PREFIX, see --prefix
    SUFFIX is one of the following:
      
    
      SUFFIX                  FILE CONTENT
      
      core.ali                Core model sequence, after parsing
      core.pdb                Core model structure, after parsing
      
      hiacc.ali               High accuracy model sequence
      hiacc.pdb               High accuracy model structure
                              (core + high accuracy database loops)
      
      hicov.ali               High coverage model sequence
      hicov.pdb               High coverage model structure
                              (hiacc + low accuracy db loops)
      
      complete.ali            Complete model sequence
      complete.pdb            Complete model structure
                              (hicov + ab initio loops)
      
      core.log                Log of core building method
      loops.database.log      Log of database  loop prediction method
      loops.abinitio.log      Log of ab initio loop prediction method
      \n""")
    
    if params.isOpt('verbose'):
      print "Parsing input files"
    
    if os.path.isfile(params.getOpt("seq")):
      UNGAPPED_TARGET_SEQ = read_file(params.getOpt("seq"))
    else:
      UNGAPPED_TARGET_SEQ = params.getOpt("seq")
    if ">" in UNGAPPED_TARGET_SEQ:
      UNGAPPED_TARGET_SEQ = Ali(UNGAPPED_TARGET_SEQ, fasta_mode=True)[0].master.seq
    UNGAPPED_TARGET_SEQ = deGappify(filter_string(UNGAPPED_TARGET_SEQ, allow=["printable"], deny=["whitespace"]))
    
    TARGET_LENGTH = len(UNGAPPED_TARGET_SEQ)
    
    if params.isOpt('verbose'):
      print "Modelling protein with sequence:"
      print UNGAPPED_TARGET_SEQ
    
    database_type = params.getOpt("dbtype", "mem")
    
    if params.isOpt("scwrl"):
      from prosci.medeller.scwrl import modelSidechains
    
    extension_max = None
    if params.isOpt("extension_max"):
      extension_max = int(params.getOpt("extension_max"))
      if extension_max < 0:
        extension_max = None
    
    outsuffix = "pdb"
    if params.isOpt("atm"):
      outsuffix = "atm"
    
    # Output filenames
    outprefix = params.getOpt("prefix", "")
    if outprefix and not outprefix.endswith('.'):
      outprefix += "."
    
    core_struc_filename        = "%score.%s"            %(outprefix, outsuffix)
    core_seq_filename          = "%score.ali"           %(outprefix)
    hiacc_struc_filename       = "%shiacc.%s"           %(outprefix, outsuffix)
    hiacc_seq_filename         = "%shiacc.ali"          %(outprefix)
    hicov_struc_filename       = "%shicov.%s"           %(outprefix, outsuffix)
    hicov_seq_filename         = "%shicov.ali"          %(outprefix)
    complete_struc_filename    = "%scomplete.%s"        %(outprefix, outsuffix)
    complete_seq_filename      = "%scomplete.ali"       %(outprefix)
    
    loop_log_database_filename = "%sloops.database.log" %(outprefix)
    loop_log_abinitio_filename = "%sloops.abinitio.log" %(outprefix)
    #loop_indeces_filename     = "%sloop.indeces.table" %(outprefix)
    
    
    
#     def writeFreadSeqFile(fname, seqid, seq):
#       header = ">%s\nstructure:%s:   1 :@:%4d ::unknown:unknown:-1:-1"%(seqid, seqid, len(seq))
#       f = open(fname, 'w')
#       f.write(header)
#       for j in xrange(0, len(seq), 60):
#         f.write("\n")
#         f.write(seq[j:j+60])
#       f.write("*\n")
#       f.close()
    
    
    
    ##########################
    # Read in the core model #
    ##########################
    
    coreModel = ResidueList(params.getOpt("struc"))
    
    if params.isOpt("scwrl"):
      if params.isOpt('verbose'):
        print "Modelling sidechains (core)"
      coreModelWithSidechains = modelSidechains(coreModel)
      if len(coreModelWithSidechains) < len(coreModel):
        sys.stderr.write("Warning: Something went wrong during sidechain modelling for Core model\n")
      else:
        coreModel = coreModelWithSidechains
    
    # Write high accuracy structure file
    #
    f = open(core_struc_filename, 'w')
    f.write(str(coreModel))
    f.close()
    
    
    template_searchdirs = [os.path.abspath(os.path.dirname(outprefix)), os.path.abspath(os.path.dirname(params.getOpt("struc")))]
    
    
    target_residue_indeces_core, alignment = mapCore2TargetSeq(UNGAPPED_TARGET_SEQ, coreModel, renumber=True)
    
    
    # Write core structure file
    #
    
#     if not params.isOpt("readonly_loops"):
#       # Write sequence files
#       # 
#       #
#       writeFreadSeqFile(complete_seq_filename, target_id, UNGAPPED_TARGET_SEQ)
#       
#       #writeFreadSeqFile(core_seq_filename, target_id, coreModel.getSeq(gapped=True, firstres=1, lastres=TARGET_LENGTH))
    
    
    
    
    ##########################################
    # Fill gaps in the core model with FREAD #
    ##########################################

    
    if params.isOpt('verbose'):
      if not FREAD_IS_INSTALLED and not (params.isOpt('4') and params.isOpt('5')):
        print "NOTICE: FREAD is not installed. Cannot produce high-accuracy and high-coverage models."
    
    
    if not params.isOpt('4') and FREAD_IS_INSTALLED:
      if params.isOpt('verbose'):
        print "Modelling loops (high-accuracy)"
      
      if not (params.isOpt('debug') and os.path.exists(hiacc_struc_filename)):
        # Model loops with high accuracy
        #
        modelAllGaps(core_struc_filename, UNGAPPED_TARGET_SEQ, target_residue_indeces_core, hiacc_struc_filename, "%shiacc.decoys"%(outprefix), loop_log_database_filename, score_cutoff=25, max_extension=extension_max, dbtype=database_type, minb=4)
      
      hiaccModel = ResidueList(hiacc_struc_filename)
      
      if params.isOpt("scwrl"):
        if params.isOpt('verbose'):
          print "Modelling sidechains (high-accuracy)"
        hiaccModelWithSidechains = modelSidechains(hiaccModel)
        if len(hiaccModelWithSidechains) < len(hiaccModel):
          sys.stderr.write("Warning: Something went wrong during sidechain modelling for HiAcc model\n")
        else:
          hiaccModel = coreModelWithSidechains
          f = open(hiacc_struc_filename, 'w')
          f.write(str(hiaccModel))
          f.close()
      
      # Write list of all gaps, as modelled (or not modelled) by FREAD
      # If FREAD found a hit through gap extension, this gap's "length" will differ from that found in the core model's gap list!
      #
      #f = open(loop_indeces_filename, 'w')
      #for g in gaplist:
        #f.write(join(", ", g))
        #f.write("\n")
      #f.close()
      
      
      # Write high accuracy structure file
      #
      f = open(hiacc_struc_filename, 'w')
      f.write(str(hiaccModel))
      f.close()
      
      
      # Write high accuracy sequence file
      #
      #writeFreadSeqFile(hiacc_seq_filename, template_id, hiaccModel.getSeq(gapped=True, firstres=1, lastres=TARGET_LENGTH))
      #writeModellerAlignmentFile(modeller_alignment_filename, "model", hiacc_struc_filename, hiaccModel.getSeq(gapped=True, firstres=1, lastres=TARGET_LENGTH))
    
    
    
    ###############################################
    # Fill more gaps in the core model with FREAD #
    ###############################################
    
    
    if not params.isOpt('5') and FREAD_IS_INSTALLED:
      if params.isOpt('verbose'):
        print "Modelling loops (high-coverage)"
      
      target_residue_indeces_hiacc, alignment_hiacc = mapCore2TargetSeq(UNGAPPED_TARGET_SEQ, hiaccModel)
      
      if not (params.isOpt('debug') and os.path.exists(hiacc_struc_filename)):
        # Model loops
        #
        modelAllGaps(hiacc_struc_filename, UNGAPPED_TARGET_SEQ, target_residue_indeces_hiacc, hicov_struc_filename, "%shicov.decoys"%(outprefix), loop_log_database_filename, score_cutoff=1, max_extension=extension_max, core_struc=hiaccModel, dbtype=database_type, ranking=FREAD_RANKING_BY_ESSS, minb=4)
      
      hicovModel = ResidueList(hicov_struc_filename)
      
      if params.isOpt("scwrl"):
        if params.isOpt('verbose'):
          print "Modelling sidechains (high-coverage)"
        hicovModelWithSidechains = modelSidechains(hicovModel)
        if len(hicovModelWithSidechains) < len(hicovModel):
          sys.stderr.write("Warning: Something went wrong during sidechain modelling for HiCov model\n")
        else:
          hicovModel = hicovModelWithSidechains
          f = open(hicov_struc_filename, 'w')
          f.write(str(hicovModel))
          f.close()
      
      # Write low accuracy structure file
      #
      f = open(hicov_struc_filename, 'w')
      f.write(str(hicovModel))
      f.close()
    
    
    
    if not params.isOpt('6'):
      try:
        from prosci.medeller.modellerif import completeModel as completeAllAtom
      except ImportError:
        if params.isOpt('verbose'):
          print "NOTICE: Modeller is not installed. Cannot produce complete model."
      else:
        if params.isOpt('verbose'):
          print "Completing model"
          if not FREAD_IS_INSTALLED:
            print "NOTICE: FREAD is not installed. All loops are modelled using Modeller."
        
        target_residue_indeces_hicov, alignment_hicov = mapCore2TargetSeq(UNGAPPED_TARGET_SEQ, hicovModel)
        
        # Complete the model using MODELLER
        completeModel = completeAllAtom(hicovModel, alignment_hicov, "target", template_ids=[hicovModel.code], template_directory=template_searchdirs, outFileName=complete_struc_filename, bfactor=15, refine_sidechains=params.isOpt("refine_sidechains"))
        
        if params.isOpt("scwrl"):
          if params.isOpt('verbose'):
            print "Modelling sidechains (complete)"
          completeModelWithSidechains = modelSidechains(completeModel)
          if len(completeModelWithSidechains) < len(completeModel):
            sys.stderr.write("Warning: Something went wrong during sidechain modelling for Complete model\n")
          else:
            completeModel = completeModelWithSidechains
            f = open(complete_struc_filename, 'w')
            f.write(str(completeModel))
            f.close()
