#!/bin/bash
VERSION=22-282 #__SETVERSION__

# Copyright 2022 Stijn van Dongen

# This program is free software; you can redistribute it and/or modify it
# under the terms of version 3 of the GNU General Public License as published
# by the Free Software Foundation. It should be shipped with MCL in the top
# level directory as the file COPYING.

#    _ _  __|_ _. __|_ _  _|   _ _  _ _|_. _  _  _  _    |. _ |  _  _  _
#   | (/__\ | | |(_ | (/_(_|  (_(_)| | | |(_|(/_| |(_\/  ||| ||<(_|(_|(/_
#                                           |        /               |
# RCL - Fast multi-resolution consensus clustering
# Author: Stijn van Dongen
#
# This RCL implementation uses programs/tools that are shipped with mcl.  It
# can be run on any set of clusterings from any method or program, but the
# network and clusterings have to be supplied in mcl matrix format.
#
# See github.com/micans/mcl#rcl and this script with no arguments.
# qc plots can be made with rcl-qc .

set -euo pipefail

themode=              # first argument, mode 'setup' 'tree' 'select', 'mcl'
projectdir=           # second argument, for all modes.
network=              # -n FNAME
tabfile=              # -t FNAME
cpu=1                 # -p NUM
RESOLUTION=           # -r, e.g. -r "100 200 400 800 1600 3200"
ANNOTATION=           # -a FNAME annotation file
CLUSTERING=           # -c FNAME clustering file
do_force=false        # -F (for mode 'tree')
INFLATION=            # -I, e.g. -I "1.3 1.35 1.4 1.45 1.5 1.55 1.6 1.65 1.7 1.8 1.9 2"
SELF=                 # -D
test_ucl=false        # Not an option currently, ucl-simple.sh is not shipped (-U)
run_dot=false         # -S

function usage() {
  local e=$1
  cat <<EOH
An rcl workflow requires three steps:

1) rcl setup   TAG -n NETWORKFILENAME -t TABFILENAME  LIST-OF-CLUSTERING-FILES
2) rcl tree    TAG [-F] [-p NCPU]
3) rcl select  TAG -r "RESOLUTIONLIST"

TAG will be used as a project directory name in the current directory.
NETWORKFILENAME and TABFILENAME are usually created by mcxload.
Hard links to these files will be made in TAG, symbolic if this is not possible.
All rcl commands are issued from outside and directly above directory TAG.
LIST-OF-CLUSTERING-FILES is stored in a file and retrieved when needed.
-F forces a run if a previous output exists.
For RESOLUTIONLIST a doubling is suggested, e.g. -r "100 200 400 800 1600 3200"
You may want to re-run with a modified list if the largest cluster size is either
too small or too large for your liking.
The history of your commands will be tracked in TAG/rcl.cline.
If 'dot' is available, a plot of results is left in TAG/rcl.hi.RESOLUTION.pdf
A table of all clusters of size above the smallest resolution is left in TAG/rcl.hi.RESOLUTION.txt

To make mcl clusterings to give to rcl:
rcl mcl TAG [-p NCPU] -n NETWORKFILENAME -I "INFLATIONLIST"
This may take a while for large graphs.
In step 1) you can then use
   rcl setup TAG -n NETWORKFILENAME -t TABFILENAME
INFLATIONLIST:
- for single cell use e.g. -I "1.3 1.35 1.4 1.45 1.5 1.55 1.6 1.65 1.7 1.8 1.9 2"
- for protein families you will probably want somewhat larger values
EOH
  exit $e
}

MODES="setup tree select mcl version"
n_req=0

function require_mode() {
  local mode=$1
  [[ -z $mode ]] && echo "Please provide a mode from '$MODES'" && usage 0
  if ! grep -qFw -- $mode <<< "$MODES"; then
    echo "Need a mode, one of { $MODES }"; usage 1
  fi
  themode=$mode
}
function require_tag() {
  local mode=$1 tag=$2
  if [[ -z $tag ]]; then
    echo "Mode $mode requires a project directory tag"
    false
  elif [[ $tag =~ ^- ]]; then
    echo "Project directory not allowed to start with a hyphen"
    false
  fi
  projectdir=$tag
}
function require_file() {
  local fname=$1 response=$2
  if [[ ! -f $fname ]]; then
    echo "Expected file $fname not found; $response"
    false
  fi
}
function require_imx() {
  local fname=$1 response=$2
  require_file "$fname" "$response"
  if ! mcx query -imx $fname --dim > /dev/null; then
    echo "Input is not in mcl network format; check file $fname"
    false
  fi
}
function test_absence () {
  local fname=$1
  local action=${2:?Programmer error, need second argument}  # return or exit
  if [[ -f $fname ]]; then
    if $do_force; then
      echo "Recreating existing file $fname"
    else
      echo "File $fname exists, use -F to force renewal"
      if [[ $action == 'return' ]]; then
        return 1
      elif [[ $action == 'exit' ]]; then
        exit 1
      fi
    fi
  fi
}

require_mode "${1-}"          # themode now set
shift 1
if grep -qFw $themode <<< "setup tree select mcl"; then
  require_tag $themode "${1-}"   # projectdir now set
  shift 1
fi

if [[ $themode == 'version' ]]; then
   list=$(for p in clm; do echo -n " | $($p --version | head -n 1)"; done)
   echo "Versions of programs used: $list |"
   rcl-select.pl --version
   rcldo.pl version
   echo "RCL version: $VERSION"
   exit 0
fi

while getopts :n:p:r:t:I:FDS opt
do
    case "$opt" in
    n) network=$OPTARG ;;
    p) cpu=$OPTARG ;;
    r) RESOLUTION=$OPTARG ;;
    t) tabfile=$OPTARG ;;
    I) INFLATION=$OPTARG ;;
    F) do_force=true ;;
    D) SELF="--self" ;;
    U) test_ucl=true ;;
    S) run_dot=true ;;
    :) echo "Flag $OPTARG needs argument" exit 1 ;;
    ?) echo "Flag $OPTARG unknown" exit 1 ;;
   esac
done

pfx=
if [[ -n $projectdir ]]; then
   mkdir -p $projectdir
   pfx=$projectdir/rcl
   if [[ -n $RESOLUTION || -n $INFLATION ]]; then
     echo -- $themode $projectdir $(printf "'%s' " "$@") >> $pfx.cline
   else
     echo -- "$themode $projectdir $@" >> $pfx.cline
   fi
fi

shift $((OPTIND-1))

function require_opt() {
  local mode=$1 option=$2 value=$3 description=$4
  if [[ -z $value ]]; then
    echo "Mode $mode requires $description for option $option"
    (( ++n_req ))
  fi
}


  ##
  ##  M C L

if [[ $themode == 'mcl' ]]; then
  require_opt mcl -n "$network" "a network in mcl format"
  require_opt mcl -I "$INFLATION" "a set of inflation values between quotes"
  if (( n_req > 0 )); then exit 1; fi
  mkdir -p $projectdir
  echo "-- Running mcl for inflation values in ($INFLATION)"
  echo ">> $cpu cpus will be used (-p option)"
  for I in $(tr -s ' ' '\n' <<< "$INFLATION" | sort -rn); do
    echo -n "-- inflation $I start .."
    mcl $network -I $I -t $cpu --i3 -odir $projectdir ${RCL_MCL_OPTIONS:-}
    echo " done"
  done 2> $projectdir/log.mcl
  echo $projectdir/out.*.I* | tr ' ' '\n' > $projectdir/rcl.lsocls
  exit 0


  ##
  ##  S E T U P

elif [[ $themode == 'setup' ]]; then
  require_opt setup -n "$network" "a network in mcl format"
  require_opt setup -t "$tabfile" "a tab file mapping indexes to labels"
  if (( n_req )); then exit 1; fi
  require_imx "$network" "Is $network an mcl matrix file?"

  ndim=$(grep -o "[0-9]\+$" <<< $(mcx query --dim -imx "$network"))
  ntab=$(wc -l < "$tabfile")

  if [[ $ntab != $ndim ]]; then
    echo "Dimension mismatch between network ($ndim) and tab file ($ntab)"
    false
  fi

  if ! test_absence $pfx.lsocls return; then
    true # echo "-- using existing file $pfx.lsocls"
  else
    (( $# < 2 )) && echo "Please supply a few clusterings" && false
    ls "$@" > $pfx.lsocls
    for f in $(cat $pfx.lsocls); do
      require_imx "$f" "Is clustering $f an mcl matrix file?"
    done
    echo "-- Supplied clusterings are in mcl format"
  fi

  if ! ln -f $tabfile $pfx.tab 2> /dev/null; then
    cp $tabfile $pfx.tab
  fi
  if ! ln -f $network $pfx.input 2> /dev/null; then
    ln -sf $network $pfx.input
  fi
  wc -l < $pfx.tab > $pfx.nitems
  rcl version >> $pfx.cline
  echo "Project directory $projectdir is ready ($pfx.input)"


  ##
  ##  T R E E

elif [[ $themode == 'tree' ]]; then
  require_imx "$pfx.input" "did you run rcl setup $projectdir?"
  require_file "$pfx.lsocls" "cluster file $pfx.lsocls is missing, weirdly"

  rclfile=$pfx.rcl
  test_absence  "$rclfile" exit

  mapfile -t cls < $pfx.lsocls
  echo "-- Computing RCL graph on ${cls[@]}"

  if $test_ucl; then
    ucl-simple.sh "$pfx.input" "${cls[@]}"
    mv -f out.ucl $rclfile
    echo "Ran UCL succesfully, output $rclfile was made accordingly"
  elif (( cpu == 1 )); then
    clm vol --progress $SELF -imx $pfx.input -write-rcl $rclfile -o $pfx.vol "${cls[@]}"
  else
    maxp=$((cpu-1))
    list=$(eval "echo {0..$maxp}")
    echo "-- All $cpu processes are chasing .,=+<>()-"
    for id in $list; do
      clm vol --progress $SELF -imx $pfx.input -gi $id/$cpu -write-rcl $pfx.R$id -o pfx.V$id "${cls[@]}" &
    done
    wait
    clxdo mxsum $(eval echo "$pfx.R{0..$maxp}") > $rclfile
    echo "-- Components summed in $rclfile"
  fi
  echo "-- Computing single linkage join order for network $rclfile"
  clm close --sl -sl-rcl-cutoff ${RCL_CUTOFF-0} -imx $rclfile -tab $pfx.tab -o $pfx.join-order -write-sl-list $pfx.node-values
  echo "RCL network and linkage both ready, you can run rcl select $projectdir"


  ##
  ##  S E L E C T

elif [[ $themode == 'select' ]]; then
  require_opt select -r "$RESOLUTION" "a list of resolution values between quotes"
  (( n_req )) && false
  minres=$(tr -s ' ' '\n' <<< "$RESOLUTION" | sort -n | head -n 1)
  require_imx "$pfx.rcl" "did you run rcl tree $projectdir?"
  echo "-- computing clusterings with resolution parameters $RESOLUTION"
  export MCLXIOVERBOSITY=2
  # export RCL_RES_PLOT_LIMIT=${RCL_RES_PLOT_LIMIT-500}

  rcl-select.pl $pfx $RESOLUTION < $pfx.join-order
  # rcl-select.22-178.pl $pfx $RESOLUTION < $pfx.join-order

  echo "-- saving resolution cluster files and displaying size of the 30 largest clusters"
  echo "-- in parentheses N identical clusters with previous level among 30 largest clusters"
                                      # To help space the granularity output.
  export CLXDO_WIDTH=$((${#pfx}+14))  # .res .cls length 8, leave 6 for resolution

  res_prev=""
  file_prev=""

  for r in $RESOLUTION; do
    rfile=$pfx.res$r.info
    if [[ ! -f $rfile ]]; then
      echo "Expected file $rfile not found"
      false
    fi
    prefix="$pfx.res$r"
    tail -n +2 $rfile | cut -f 5 | mcxload -235-ai - -o $prefix.cls
    mcxdump -icl $prefix.cls -tabr $pfx.tab | grep -v __dummy__ > $prefix.labels
    mcxdump -imx $prefix.cls -tabr $pfx.tab --no-values --transpose | grep -v __dummy__ > $prefix.txt
    nshared="--"
    if [[ -n $res_prev ]]; then
      nshared=$(grep -Fcwf <(head -n 30 $rfile | cut -f 1) <(head -n 30 $file_prev | cut -f  1) || true)
    fi
    export CLXDO_GRABIG_TAG="($(printf "%2s" $nshared)) "
    clxdo grabig 30 $prefix.cls
    res_prev=$r
    file_prev=$rfile
  done
  commalist=$(tr -s $'\t ' ',' <<< $RESOLUTION)
  hyphenlist=$(tr -s $'\t ' '-' <<< $RESOLUTION)
  resmapfile=$pfx.hi.$hyphenlist.resdot

  if $run_dot; then
    if [[ -f $resmapfile ]]; then
      rlist=($RESOLUTION)
      for theres in ${rlist[@]}; do
        resdotfile=$pfx.hi.$theres.dot
        respdffile=$pfx.hi.$theres.pdf
        rcl-dot-resmap.pl ${RCL_DOT_RESMAP_OPTIONS-} --minres=$theres --label=size < $resmapfile > $resdotfile
        if ! dot -Tpdf -Gsize=10,10\! < $resdotfile > $respdffile; then
          echo "-- dot did not run, pdf not produced"
        else
          echo "-- map of output produced in $respdffile"
        fi
      done
    else
      echo "-- Expected file $resmapfile not present"
    fi
  else
    echo "-- Size graph plots not produced (use -S to do so)"
  fi

cat <<EOM
The following outputs were made - individual resolution based files:
  Master files with tree information:       $(eval echo $pfx.res{$commalist}.info)
  One cluster-per line files with labels:   $(eval echo $pfx.res{$commalist}.labels)
  LABEL<TAB>CLUSID files:                   $(eval echo $pfx.res{$commalist}.txt)
  clustering files in mcx format:           $(eval echo $pfx.res{$commalist}.cls)

Summary table $pfx.sy.$hyphenlist.txt
  - All different levels integrated
  - Residual clusters (names ending in _A) collating small clusters below size $minres
  - Suitable for running quickmarkers and generating heatmap
EOM

fi


