#!/bin/bash
#
# MEDELLER pipeline SANITY CHECKS ONLY - THIS SCRIPT DOES NOT PRODUCE ANY OUTPUT
# Coded by Sebastian Kelm
#          JP (Slight modifications)
# Created on 19 September 2012
#
# Copyright 2008-2012 Sebastian Kelm <kelm@stats.ox.ac.uk>
#
#



################################################################################
# General use functions
################################################################################

function abspath {
	if [[ -d "$1" ]]; then
		pushd "$1" >/dev/null
		pwd
		popd >/dev/null
	elif [[ -e "$1" ]]; then
		pushd $(dirname "$1") >/dev/null
		echo $(pwd)/$(basename "$1")
		popd >/dev/null
	else
		echo "Error: '$1' does not exist!" 1>&2
		return 127
	fi
}



################################################################################
# Set up environment variables
################################################################################

MAINDIR=`abspath "$0"`
MAINDIR=`dirname "$MAINDIR"`
MAINDIR=`dirname "$MAINDIR"`

BINPATH="$MAINDIR/bin"
LIBPATH="$MAINDIR/lib-python"
SUBSTTABLEPATH="$MAINDIR/substtables"
BLASTDATAPATH="$MAINDIR/blastdata"
MPTPATH=`abspath "$MAINDIR/../MPT/bin"`

export PYTHONPATH="$LIBPATH:$PYTHONPATH"
export PATH="$BINPATH:$MPTPATH:/data/greenheron/apps/muscle:/data/greenheron/apps/usearch:/data/greenheron/apps/ncbi-blast-2.2.27+/bin:/data/greenheron/apps/joy-5.10:/data/greenheron/apps/joy-5.10/psa:/data/greenheron/apps/joy_related-1.06/hbond:/data/greenheron/apps/joy_related-1.06/sstruc:/data/greenheron/apps/mp-t/bin:/data/greenheron/apps/tmalign:/usr/local/bin:/usr/bin:/bin:/usr/local/games:/usr/gamesa"



################################################################################
# Parse command line options
################################################################################

if ! [ "$5" ]; then
  myname=`basename "$0"`
  echo "
  USAGE:
    $myname <target_filename> <template_filename> <template_chain> <blastdb> <outdir> [threads [medeller_flags [imembrane_flags]]]
    " 1>&2
  exit 1
fi


targetfile=`abspath "$1"`
templatefile=`abspath "$2"`
templatechain="$3"
blastdb=`abspath "$4"` # /data/pegasus/hill/BLAST/ncbi-blast-2.2.25+/bin/db/uniref90.fasta
outdir="$5"
threads="$6"
medeller_opts="$7"
imembrane_opts="$8"

templateid="template"


if [ ${#templatechain} != 1 ] && [ ${#templatechain} != 0 ]; then
  echo "Error: template_chain must be a single character, or '' in the case of a single unnamed chain. You specified: '$templatechain'" 1>&2
  exit 1
fi


if ! [ "$threads" ]; then
  threads=1
fi


if ! [ -f "$outdir/ignoreexisting" ]; then

  if [ -f "$outdir/input/$templateid$templatechain.pdb" ]; then
    echo "Error: Pre-processed template structure exists. Delete any output files or specify a different output directory." 1>&2
    exit 99
  fi



  if [ -s "$outdir/imembrane/tophit/query_annotation.tem" ]; then
    echo "Error: iMembrane annotation of template exists. Delete any output files or specify a different output directory." 1>&2
    exit 99
  fi



  if [ -s "$outdir/MPT/MPT_output.fasta" ]; then
    echo "Error: target-template alignment exists. Delete any output files or specify a different output directory." 1>&2
    exit 99
  fi



  if [ -s "$outdir/annotated_alignment.tem" ]; then
    echo "Error: annotated alignment exists. Delete any output files or specify a different output directory." 1>&2
    exit 99
  fi



  if [ -s "$outdir/medeller/annotated_core/query_annotation.tem" ]; then
    echo "Error: Annotated MEDELLER models exist. Delete any output files or specify a different output directory." 1>&2
    exit 99
  fi



  if [ -s "$outdir/medeller/model.core.pdb" ]; then
    echo "Error: MEDELLER models exist. Delete any output files or specify a different output directory." 1>&2
    exit 99
  fi

fi



################################################################################
# Perform basic sanity checks on the target and template files:
#     - missing or empty files?
#     - more than one sequence present in target file?
#     - target contains non-standard amino acids?
#     - extract target sequence's FASTA title
################################################################################

if ! [ -s "$targetfile" ]; then
  echo "Error: Target file empty or not found: $targetfile" 1>&2
  exit 1
fi

if ! [ -s "$templatefile" ]; then
  echo "Error: Template file empty or not found: $templatefile" 1>&2
  exit 1
fi

numtargetseqs=`grep -c '^>' "$targetfile"`
if [ "$numtargetseqs" != 1 ]; then
  echo "Error: Target file contains $numtargetseqs sequences. Must contain 1 sequence: $targetfile" 1>&2
  exit 2
fi

specialcharacters=$(grep -v '^>' "$targetfile" | grep -Po '[^ACDEFGHIKLMNPQRSTVWY\s]' | xargs echo | tr -d ' ')
if [ "$specialcharacters" ]; then
  echo "Error: Target file contains ${#specialcharacters} non-standard amino acids: '$specialcharacters'. Please substitute them with standard amino acids or cut down your target sequence. Path to target file: $targetfile" 1>&2
  exit 2
fi


mkdir -p "$outdir"
outdir=`abspath "$outdir"`

# Process the name in the target fasta file with some "clever" sed
sed 's/^>.*$/>target/g' "$targetfile" >"$outdir/target.fasta"
targetfile="$outdir/target.fasta"

targetid=`grep -m1 '^>' "$targetfile" | tr -d ' \t\n\r'`
targetid=${targetid:1}
if ! [ "$targetid" ]; then
  echo "Error: Target file does not have a FASTA-style heading: $targetfile" 1>&2
  exit 2
fi
#echo "Extracted targetid from FASTA file: '$targetid'"



################################################################################
# Perform basic sanity checks on target and template sequences:
#     - 100% identical?
#     - length1 / length2 < 2/3 or > 3/2 ?
#     - template contains non-standard amino acids?
################################################################################

success=0
mkdir -p "$outdir/input"
prefix="$outdir/input/$templateid"

#if [ -f "$prefix$templatechain.pdb" ]; then
#  echo "Error: Pre-processed template structure exists. Delete any output files or specify a different output directory." 1>&2
#  exit 99
#fi

cp -f "$templatefile" "$prefix.pdb"
filelist=`splitchain -pfF --prefix "$prefix" --seqtitle "$templateid" "$prefix.pdb"`
if ! [ -f "$prefix$templatechain.fasta" ]; then
  echo "Error: Failed to extract sequence of chain '$templatechain' from template PDB file. Check that it is a valid PDB file and contains the specified chain: $templatefile" 1>&2
  exit 2
fi
if ! [ -f "$prefix$templatechain.pdb" ]; then
  echo "Error: Failed to extract structure of chain '$templatechain' from template PDB file. Check that it is a valid PDB file and contains the spefified chain: $templatefile" 1>&2
  exit 2
fi
templateid="$templateid$templatechain"
templatefile="$prefix$templatechain.pdb"


specialcharacters=$(grep -v '^>' "$prefix$templatechain.fasta" | grep -Po '[^ACDEFGHIKLMNPQRSTVWY\s]' | xargs echo | tr -d ' ')
if [ "$specialcharacters" ]; then
  echo "Error: Template file contains ${#specialcharacters} non-standard amino acids: '$specialcharacters'. Please substitute them with standard amino acids or cut down your target structure. Path to template file: $templatefile" 1>&2
  exit 2
fi


seq_comparison=(`fasta2ali "$targetfile" "$prefix$templatechain.fasta" | pid -b -`)
identity=${seq_comparison[0]}
length1=${seq_comparison[3]}
length2=${seq_comparison[4]}

if (( $length1 < 40 )); then
  echo "Error: Target sequence must be at least 40 residues long" 1>&2
  exit 2
fi
if (( $length2 < 40 )); then
  echo "Error: Template structure must be at least 40 residues long" 1>&2
  exit 2
fi

if ! [ "$length2" ]; then
  echo "Error: Something went wrong when trying to assess target-template similarity" 1>&2
  exit 2
fi

echo "Extracted chain '$templatechain' from input file: $templateid"
echo "Estimated target-template percent identity: $identity"
echo "Sequence lengths: target=$length1, template=$length2"
echo

if [ "$identity" == "100.00" ]; then
  mkdir -p "$outdir/medeller"
  cp -f "$templatefile" "$outdir/medeller/model.core.pdb"
  cp -f "$outdir/medeller/model.core.pdb" "$outdir/medeller/model.hiacc.pdb"
  cp -f "$outdir/medeller/model.core.pdb" "$outdir/medeller/model.hicov.pdb"
  cp -f "$outdir/medeller/model.core.pdb" "$outdir/medeller/model.complete.pdb"
  
  echo "Target and template are identical. Copying template structure in its entirety and skipping modelling process."
  echo
  echo "Models can be found in $outdir/medeller:"
  pushd "$outdir/medeller" >/dev/null
  ls model.*.pdb
  popd >/dev/null
  
  echo "--- The End ---"
  exit
fi


length_difference=`perl -e "if ($length1 > $length2) {print $length1 / $length2} else {print $length2 / $length1};"`
success=`perl -e "if ($length_difference >= 1.5) {print 1;} else {print 0;}"`

if [ $success -ne 0 ]; then
  echo "Error: Lengths of target ($length1) and template ($length2) sequences differ by more than 50% of the shorter sequence (length of longer seq is $length_difference * length of shorter seq). Please consider choosing a different template or trimming the longer protein." 1>&2
  exit 3
fi


echo "--- The End ---"
