#!/usr/bin/env python

import sys
import os
import subprocess
import re
import traceback
import time
import argparse
from glob import glob

# Put the script's own directory, plus the "lib-python" directories one and two
# levels above it, at the front of the module search path.
_script_dir = os.path.abspath(os.path.dirname(sys.argv[0]))
sys.path = [
    _script_dir,
    os.path.join(_script_dir, "..", "lib-python"),
    os.path.join(_script_dir, "..", "..", "lib-python"),
] + sys.path

from prosci.common import splitpath
from prosci.util.splitchain import splitchains as pdbsplitchains
from prosci.imembrane.builder import Classifier
from cgdb_splitchain import splitchains as cgdbsplitchains


# Command-line interface: one required positional argument (the local database
# directory) and an opt-in --remove switch.
parser = argparse.ArgumentParser(description="Synchronise a local directory with the CGDB database online")
parser.add_argument('--remove', dest='remove', action='store_true', default=False,
                   help='remove entries that disappeared from the online database')
parser.add_argument('databasedir', type=str,
                   help='local directory to store the CGDB data in')
# Parsed from sys.argv as soon as the script starts; the rest of the file reads
# args.remove and args.databasedir.
args = parser.parse_args()


# Directory part of the path this script was invoked with.
SCRIPTDIR = os.path.dirname(sys.argv[0])
# Absolute path of the local database root given on the command line.
DBDIR = os.path.abspath(args.databasedir)


# Fixed layout beneath the database root.
dir_analysis = DBDIR + "/analysis"
dir_struc = DBDIR + "/chains"
dir_status = DBDIR + "/status"
file_pdblist = DBDIR + "/pdb.list"


def cgdbid2imemid(cgdbid):
  """Insert a '.' after the first four characters, e.g. '1MM40' -> '1MM4.0'."""
  head, tail = cgdbid[:4], cgdbid[4:]
  return ".".join((head, tail))

def imemid2cgdbid(imemid):
  """Drop the first '.' from an id, e.g. '1MM4.0' -> '1MM40'.

  Only the first dot is removed.  An id without any dot makes the format
  operation fail with TypeError (one value for two placeholders).
  """
  pieces = tuple(imemid.split(".", 1))
  return "%s%s" % pieces


def filter_non_aa(fin, fout):
  """Copy PDB-format lines from fin to fout, dropping unrecognised residues.

  - ATOM lines are kept only when their residue name (characters 17-19 of the
    line, upper-cased) is a key of prosci.util.pdb.residues.
  - CONECT lines are kept only when the atom serial in characters 6-10 belongs
    to an ATOM line that was kept earlier in the stream.
  - Every other line is copied through unchanged.

  fin  -- iterable yielding lines
  fout -- object with a write() method
  """
  from prosci.util.pdb import residues

  kept_serials = set()

  for line in fin:
    if line.startswith("ATOM"):
      res = line[17:20].upper()
      # `in` replaces the deprecated dict.has_key().
      if res in residues:
        fout.write(line)
        kept_serials.add(int(line[6:11].strip()))
    elif line.startswith("CONECT"):
      if int(line[6:11].strip()) in kept_serials:
        fout.write(line)
    else:
      fout.write(line)


def mkdirs(path):
  """Create directory `path` and any missing parents; no-op if it exists.

  Raises OSError when the directory cannot be created, for instance because a
  non-directory already occupies the path.
  """
  target = os.path.abspath(path)
  if os.path.isdir(target):
    return
  try:
    os.makedirs(target)
  except OSError:
    # Another process may have created the directory between the check and
    # the call; only propagate genuine failures.
    if not os.path.isdir(target):
      raise


def diff_lines(f1, f2):
    """Compare two iterables of lines as sets of stripped, non-empty strings.

    Returns a tuple of three sorted lists: entries only in f1 (deleted),
    entries only in f2 (added), and entries present in both (unchanged).
    """
    before = {entry.strip() for entry in f1}
    before.discard("")

    after = {entry.strip() for entry in f2}
    after.discard("")

    return sorted(before - after), sorted(after - before), sorted(before & after)



"""
<tr  class=color3>
  <td>200 ns</td>
  <td>DPPC self assembly</td>

  <td><a href=javascript:NewWindow('fs_popup.php?code=1MM40','Final_Snapshot','700','650','yes')><img border='1' src='./analysis/thumb-1MM40-final.png'width='91' height='91'></a> </td>
  <td><a href=javascript:NewWindow('sp_popup.php?code=1MM40','Structural_Properties','1050','530','yes')><img border='1' src='./analysis/thumb-1MM40-rmsd.png'width='91' height='91'></a> </td>
  <td><A HREF='./bilayer.php?pdb=1MM4'><img border='1' src='./analysis/thumb-1MM40-bilayer.png'width='91' height='91'></a> </td>
  <td><a href=javascript:NewWindow('an_popup.php?code=1MM40','Analysis','700','650','yes')><img border='1' src='./analysis/thumb-1MM40-coloured.png'width='91' height='91'></a> </td>
  <td></td>
 </tr>
"""


# Patterns applied to the simulation-table HTML (one "<tr"-delimited chunk at a
# time) and to local file names.  Raw strings keep the regex escapes (\s, \W,
# \.) from being read as string escapes; the pattern values are unchanged.
micelle      = re.compile(r"micel", re.I)
selfassembly = re.compile(r"self[^a-zA-Z]*assemb", re.I)
# Captures the text of the second <td> cell of a row chunk.
entrytype    = re.compile(r"^[^<>]*>[^<>]*<td[^<>]*>[^<>]*</td>[^<>]*<td[^<>]*>\s*([^<>]*?)\s*</td>", re.I)
# Captures the entry code from an "an_popup.php?code=..." link.
tablelink    = re.compile(r"\Wan_popup\.php\?code=(\w{4,})\W")
# Splits on '-' or '.'.
dashdot      = re.compile(r"-|\.")


def download_cgdb_files(pdb):
  """Download the CGDB self-assembly entries for one PDB code and split them.

  Steps, in order:
    1. chdir into dir_analysis and list the entry codes already present there.
    2. Fetch the online simulation table for `pdb` and collect the codes of
       rows that mention self-assembly and do not mention micelles.
    3. With --remove, delete local entries that are no longer online.
    4. For each online code, download "<code>-coloured.table" and
       "<code>-final.pdb" with wget, link the table under the imembrane id and
       write a filtered "<imemid>.atm" next to it.
    5. chdir into dir_struc and run cgdbsplitchains on every "<pdb>*.atm" in
       dir_analysis; entries that fail are reported on stderr and deleted.

  pdb -- PDB code; upper-cased before use.  It is interpolated into shell
         commands, so callers must pass plain identifiers only.

  Raises subprocess.CalledProcessError if the table download command fails.
  Side effects: changes the working directory and sleeps between requests.
  """
  os.chdir(dir_analysis)

  pdb = pdb.upper()

  onlineentries = set()
  localentries = set()

  # Get list of local entries for this PDB.  Only files named "<code>-..."
  # (the downloads) identify an entry.  Files named after the imembrane id
  # ("1MM4.0.table", "1MM4.0.atm") would reduce to the bare PDB code, and
  # delete_entry(<bare code>) would then wipe every variant of this PDB.
  for filename in glob(pdb + '*'):
    entry, dash, _rest = filename.partition('-')
    if dash and '.' not in entry:
      localentries.add(entry)

  # Get list of online entries for this PDB
  document = subprocess.check_output("wget -nv --output-document=- http://sbcb.bioch.ox.ac.uk/cgdb/simtable.php?pdb="+pdb, shell=True)
  time.sleep(1)
  if document:
    # One chunk per table row.
    for entry in document.split("<tr"):
      match = tablelink.search(entry)
      # A row counts only if it has both an analysis link and a type cell.
      if match and entrytype.search(entry):
        if selfassembly.search(entry) and not micelle.search(entry):
          onlineentries.add(match.group(1))

  if args.remove:
    # Delete removed entries for this PDB
    for entry in localentries - onlineentries:
      delete_entry(entry)

  # Download the analysis table and structure file for the new entries
  for code in sorted(onlineentries):
    status = subprocess.call("wget -N -nv http://sbcb.bioch.ox.ac.uk/cgdb/analysis/%s-coloured.table" % (code), shell=True)
    time.sleep(5)
    if status == 0:
      # Append an empty line to the table, then expose it under the imembrane id.
      subprocess.call("echo >> %s-coloured.table" % (code), shell=True)
      subprocess.call("ln -sf %s-coloured.table %s.table" % (code, cgdbid2imemid(code)), shell=True)
    status = subprocess.call("wget -N -nv http://sbcb.bioch.ox.ac.uk/cgdb/analysis/%s-final.pdb" % (code), shell=True)
    time.sleep(5)
    if status == 0:
      # Context managers close both handles even if filtering fails.
      with open("%s-final.pdb" % (code)) as fin:
        with open("%s.atm" % (cgdbid2imemid(code)), 'w') as fout:
          filter_non_aa(fin, fout)

  # Split structure into chains and put a new .atm file for each chain into dir_struc
  os.chdir(dir_struc)
  for pdb_file in glob("%s/%s*.atm" % (dir_analysis, pdb)):
    try:
      cgdbsplitchains(pdb_file)
    except Exception:
      # Best effort: report, drop the partial entry, move on.  KeyboardInterrupt
      # and SystemExit are deliberately not caught.
      e_id = splitpath(pdb_file)[1].split('-')[0]
      sys.stderr.write("WARNING: Could not load entry %s. Skipping.\n" % (e_id))
      delete_entry(e_id) # Delete partial entry

  


def delete_entry(e_id):
  """Remove every file whose name starts with an entry id.

  Matches "<e_id>*" in dir_analysis and "<cgdbid2imemid(e_id)>*" in both
  dir_analysis and dir_struc.  Paths that are already gone are skipped.
  """
  matches = (glob("%s/%s*" % (dir_analysis, e_id))
             + glob("%s/%s*" % (dir_analysis, cgdbid2imemid(e_id)))
             + glob("%s/%s*" % (dir_struc, cgdbid2imemid(e_id))))
  for m in matches:
    # lexists (not exists): a symlink whose target was removed earlier in this
    # loop is dangling; exists() reports it as missing and would leave it.
    if os.path.lexists(m):
      os.remove(m)



def write_file(fname, txt):
  """Create or truncate `fname` and write `txt` into it.

  A falsy `txt` (None or an empty string) leaves the file empty.  The handle
  is closed even if the write fails.
  """
  with open(fname, "w") as f:
    if txt:
      f.write(txt)



# Ensure the database root and its sub-directories exist before use.
for _required_dir in (DBDIR, dir_analysis, dir_struc, dir_status):
  mkdirs(_required_dir)


# PDB codes already processed locally: the first four characters of each
# "*.done" marker file name in the status directory.
oldpdbs = set(
  os.path.basename(marker)[:4]
  for marker in glob("%s/*.done" % (dir_status))
)

# Scrape the PDB codes currently listed on the CGDB start page (upper-cased by
# awk), one per output line.
_pdblist_output = subprocess.check_output('wget -nv --output-document=- http://sbcb.bioch.ox.ac.uk/cgdb/dbstart.php | egrep -o "structureId=(\w{4,})" | awk -F "=" \'{print toupper($2)}\'', shell=True)

# Drop blank lines.  "".split("\n") yields [''], which is truthy, so without
# this filter an empty download would slip past the check below and every
# local PDB would be classified as deleted.
newpdbs = [code for code in _pdblist_output.strip().split("\n") if code.strip()]

if not newpdbs:
  raise RuntimeError("Could not get list of new PDBs in CGDB database online")

pdb_deleted, pdb_added, pdb_unchanged = diff_lines(oldpdbs, newpdbs)

print "deleted\n", pdb_deleted
print "added  \n", pdb_added
print "same   \n", pdb_unchanged

if args.remove:
  # Remove old entries (analysis and structure)
  for pdb in pdb_deleted:
    delete_entry(pdb)


# Add new entries to our local CGDB
# Parse new and existing entries,
# Split them into chains and archive their sequences
#
os.chdir(dir_analysis)
# Only newly added PDBs are processed; unchanged ones are not revisited.
#pdb_newset = pdb_added + pdb_unchanged
pdb_newset = pdb_added
pdb_newset.sort()

for pdb in pdb_newset:
  download_cgdb_files(pdb)

  # Check all variants of each entry (e.g. 7AHL0, 7AHL1, 7AHL2, ...)
  #
  entry_variants = glob("%s/%s*.table" % (dir_analysis, pdb))
  for entry_file in entry_variants:
    # The "-coloured.table" files are the raw downloads; each variant is
    # handled once, via its "<imemid>.table" link.
    if "-coloured.table" in entry_file:
      continue

    e_id = splitpath(entry_file)[1] # basename
    e_id = e_id.split('-')[0]       # get the entry ID

    assert e_id, "ERROR while parsing CGDB entry file name"

    try:
      e = Classifier(DBDIR, e_id)
    except Exception:
      # Best effort per entry: report, drop the partial entry, continue.
      # KeyboardInterrupt / SystemExit are deliberately not caught.
      sys.stderr.write("WARNING: Could not load entry '%s'. Skipping.\n" % (e_id))
      delete_entry(e_id) # Delete partial entry
      traceback.print_exc()
      continue

    e_seq = e.pdb_data.get_seq()

    # Write the alignment file once; an existing file is left untouched.
    ali_path = "%s/%s.ali" % (dir_analysis, e_id)
    if not glob(ali_path):
      e_ali = ">%s\n%s\n%s\n" % (e_id, e.pdb_data.get_structure_lign(), e_seq)
      with open(ali_path, 'w') as e_alifile:
        e_alifile.write(e_ali)

    # Check each chain in the current entry variant (e.g. 7AHL0A, 7AHL0B, 7AHL0C, ...)
    #
    entry_variant_chains = glob("%s/%s*.atm" % (dir_struc, cgdbid2imemid(e_id)))

    for chain_file in entry_variant_chains:
      echain_id = splitpath(chain_file)[1] # basename without extension

      assert echain_id, "ERROR while parsing CGDB entry chain file name"

      try:
        # Constructed only as a load check; the instance itself is not used.
        Classifier(DBDIR, echain_id)
      except Exception:
        sys.stderr.write("WARNING: Could not load entry chain %s. Skipping.\n" % (echain_id))
        delete_entry(echain_id) # Delete partial entry
        continue

    # NOTE(review): the list still contains chain files that failed the check
    # above; whether pdbsplitchains tolerates that is not visible here.
    pdbsplitchains(entry_variant_chains, "afc")

  # Mark this PDB as processed so later runs treat it as already present.
  write_file("%s/%s.done" % (dir_status, pdb), "done\n")
