#! /usr/bin/env bash

# $Header: /u/gcmpack/MITgcm/tools/example_scripts/ACESgrid/aces_test_all,v 1.35 2015/04/23 15:14:00 jmc Exp $
# $Name:  $

#  submit list of jobs or get list of submitted jobs
sub_list_jobs()
{
   # sub_list_jobs
   # For each test name in JOB_LIST:
   #  - rename a previous output/tst_{name}.stdout to *.stdout.old
   #  - select the submission script $SUB_DIR/aces_test_{sfx}
   #  - action=2 : only report whether a job "tst_{name}" is in the queue
   #  - otherwise: submit the script, unless such a job is already queued
   # input : JOB_LIST
   #         (+ globals: action, SUB_DIR, TST_DIR, LOG_FIL, QSTAT, QSUB, USER)
   # output: NB_SUB_JOBS (+ status of jobs: M_{name}=submitted or skipped,
   #                        where {name} is the entry of JOB_LIST)
   # note  : QSTAT and QSUB are expanded unquoted on purpose so that they
   #         can hold a command followed by options.

   local i sfx script JOB job_exist doUp

   NB_SUB_JOBS=0
   for i in $JOB_LIST
   do
     #- only attempt the rename if we really are inside "output"
     ( cd output && test -e "tst_${i}.stdout" \
                 && mv -f "tst_${i}.stdout" "tst_${i}.stdout.old" )
     case $i in
       'mth') sfx="ifc_${i}"  ;;
       'mp2') sfx="ifc_${i}"  ;;
       'tuv') sfx="op64_${i}" ;;
       'tlm') sfx="op64_${i}" ;;
       'opa') sfx='op64_adm'  ;;
       'g77') sfx="${i}_adm"  ;;
           *) sfx="${i}_mpi"  ;;
     esac
     script="$SUB_DIR/aces_test_$sfx"
     if test -f "$script" ; then
       JOB="tst_$i"
       #- number of queue entries matching both the user and the job name
       job_exist=$($QSTAT -a | grep "$USER" | grep -c "$JOB")
       if [ "$action" -eq 2 ] ; then
       #-- to get outp back:
         if [ "$job_exist" = 0 ] ; then
           echo "did not find any job: $JOB" | tee -a "$LOG_FIL"
           printf -v "M_$i" '%s' 'skipped'
         else
           echo -n "found a job: $JOB" | tee -a "$LOG_FIL"
           $QSTAT -a | grep "$USER" | grep "$JOB" | tee -a "$LOG_FIL"
           printf -v "M_$i" '%s' 'submitted'
           NB_SUB_JOBS=$(( NB_SUB_JOBS + 1 ))
         fi
       else
       #-- to submit job
         if [ "$job_exist" = 0 ] ; then
           #-- update code if not done within submitted script
           #   (i.e. the script contains a line setting "checkOut=0")
           doUp=$(grep -c '^ *checkOut=0' "$script")
           if test ! -e "$TST_DIR/MITgcm_$i" ; then doUp=0; fi
           if [ "${doUp:-0}" -ge 1 ] ; then
             echo "" >> "$LOG_FIL"
             echo " update $TST_DIR/MITgcm_$i :" | tee -a "$LOG_FIL"
             #- never run the update from the wrong directory
             ( cd "$TST_DIR/MITgcm_$i" &&
               cvs -q -d :pserver:cvsanon@mitgcm.org:/u/gcmpack update -P -d
             ) >> "$LOG_FIL" 2>&1
           fi
           echo -n " $JOB : " | tee -a "$LOG_FIL"
           $QSUB "$script" | tee -a "$LOG_FIL"
           printf -v "M_$i" '%s' 'submitted'
           NB_SUB_JOBS=$(( NB_SUB_JOBS + 1 ))
         else
           echo "$JOB" | tee -a "$LOG_FIL"
           $QSTAT -a | grep "$USER" | grep "$JOB" | tee -a "$LOG_FIL"
           echo 'job already exist => skip this test' | tee -a "$LOG_FIL"
           printf -v "M_$i" '%s' 'skipped'
         fi
       fi
     else
       echo 'no file:' "aces_test_$sfx" 'to submit' | tee -a "$LOG_FIL"
       printf -v "M_$i" '%s' 'skipped'
     fi
   done
   echo " info-sub-list: NB_SUB_JOBS='$NB_SUB_JOBS'" >> "$LOG_FIL"
}

#  retrieve output when job is finished
get_outp_back()
{
   # get_outp_back number_of_jobs [check_freq_mn [max_wait_mn]]
   # Poll the queue until every "submitted" job of JOB_LIST has left it.
   # For each job that left the queue, take the most recent results (tr_*)
   # and restart (rs_*) entry of its verification dir, check that its name
   # carries the date of today, and (if action <= 4) tar it and mail it
   # when mpack is available.
   # args  : $1 = number of jobs to wait for       (default: 0)
   #         $2 = polling interval, in minutes      (default: 10)
   #         $3 = time limit, in minutes            (default: 2160)
   # input : JOB_LIST (+ status of jobs: M_{name}=submitted )
   #         (+ globals: action, QSTAT, USER, TMP_FIL, LOG_FIL, TST_DIR,
   #                     outPfix, today, HAVE_MPACK, MPACK, NB_SUB_JOBS)
   # output: REJECTED (= list of fast-to-fail jobs)
   #        (+ change status of jobs to: M_{name}=done )
   # note  : exit with status 1 if some jobs are still pending once the
   #         time limit is exceeded.

   local nbJobs=${1:-0}
   local freq=${2:-10} maxWait=${3:-2160}
   local minutes=0 fsec i statVar comm JOB RETVAL ready_to_send
   local run_dir tdir chk curday tarball hrs

   REJECTED=
   fsec=$(( freq * 60 ))
   echo "Check every $freq mn for $nbJobs test(s) to finish" | tee -a "$LOG_FIL"
   echo "- start at :" $(date) | tee -a "$LOG_FIL"
   while [ "$nbJobs" != 0 ] ; do

     sleep "$fsec"
     minutes=$(( minutes + freq ))

     for i in $JOB_LIST ; do

       #- read the status variable M_{name} of this job
       statVar="M_$i"
       comm=${!statVar}
       if [ "$comm" = 'submitted' ] ; then
         JOB="tst_$i"
         $QSTAT -a > "$TMP_FIL"
         RETVAL=$?
         ready_to_send=$(grep "$USER" "$TMP_FIL" | grep -c "$JOB")
         rm -f "$TMP_FIL"
         if [ "$RETVAL" != 0 ] ; then
           #- queue status unknown: try again at the next polling
           echo " $QSTAT returned with error code: $RETVAL" | tee -a "$LOG_FIL"
           continue
         fi

         #- job no longer listed in the queue
         if [ "$ready_to_send" = 0 ] ; then
           run_dir="$TST_DIR/MITgcm_$i/verification"
#-      results output:
           tdir=$(ls -1 -t "$run_dir" | grep -v tr_out | grep "^tr_$outPfix" | head -1)
           if [ -n "$tdir" ] ; then
             #- check this is the right output (dated from the day the
             #  script started, or else from the current day)
             chk=$(echo "$tdir" | grep -c "$today")
             if [ "$chk" = 0 ] ; then
               curday=$(date +%Y%m%d)
               chk=$(echo "$tdir" | grep -c "$curday")
             fi
             if [ "$chk" = 0 ] ; then
               echo "tdir='$tdir'" | tee -a "$LOG_FIL"
               echo "Output do not match, no email sent for $i" | tee -a "$LOG_FIL"
               if [ "$minutes" -eq "$freq" ] ; then
#-         add to rejected list if it fails in less than "freq" minutes
                 REJECTED="$REJECTED $i"
               fi
             elif [ "$action" -le 4 ] ; then
               tarball="/tmp/tr_$outPfix-$i.tar.gz"
               rm -f "$tarball"
               ( cd "$run_dir" && tar -czf "$tarball" "./$tdir" )
               if [ "x$HAVE_MPACK" = xt ] ; then
                 "$MPACK" -s MITgcm-test -m 3555000 "$tarball" jmc@mitgcm.org
                 echo "Email sent for $i at:     " $(date) | tee -a "$LOG_FIL"
               else
                 echo " no email sent for $i (no mpack)" | tee -a "$LOG_FIL"
               fi
             fi
           else
               echo " no output found for $i" | tee -a "$LOG_FIL"
               if [ "$minutes" -eq "$freq" ] ; then
#-         add to rejected list if it fails in less than "freq" minutes
                 REJECTED="$REJECTED $i"
               fi
           fi
#-      restart output:
           tdir=$(ls -1 -t "$run_dir" | grep -v tr_out | grep "^rs_$outPfix" | head -1)
           if [ -n "$tdir" ] ; then
             #- check this is the right output
             chk=$(echo "$tdir" | grep -c "$today")
             if [ "$chk" = 0 ] ; then
               curday=$(date +%Y%m%d)
               chk=$(echo "$tdir" | grep -c "$curday")
             fi
             if [ "$chk" = 0 ] ; then
               echo "tdir='$tdir'" | tee -a "$LOG_FIL"
               echo "Restart output do not match, no email sent for $i" | tee -a "$LOG_FIL"
             elif [ "$action" -le 4 ] ; then
               tarball="/tmp/rs_$outPfix-$i.tar.gz"
               rm -f "$tarball"
               ( cd "$run_dir" && tar -czf "$tarball" "./$tdir" )
               if [ "x$HAVE_MPACK" = xt ] ; then
                 "$MPACK" -s MITgcm-test -m 3555000 "$tarball" jmc@mitgcm.org
                 echo "Email sent for $i restart:" $(date) | tee -a "$LOG_FIL"
               else
                 echo " no email sent for $i restart (no mpack)" | tee -a "$LOG_FIL"
               fi
             fi
           else
              echo " no restart output for $i" | tee -a "$LOG_FIL"
           fi
#-      record that this job has been processed
           printf -v "M_$i" '%s' 'done'
           nbJobs=$(( nbJobs - 1 ))
           chmod 644 output/tst_"$i".std*
         fi
       fi
     done

     #  "long" queue is 24hrs = 24*60min = 1440min ; the default time
     #  limit is 2160min = 36hrs.
     #  Only give up if some jobs are really still pending: when the last
     #  one has just been processed above, this is a normal completion.
     if [ "$nbJobs" != 0 ] && [ "$minutes" -gt "$maxWait" ] ; then
        hrs=$(( minutes / 60 ))
        echo "Time expired after $minutes minutes ($hrs hours)" | tee -a "$LOG_FIL"
        echo ' ' "$nbJobs" '/' "$NB_SUB_JOBS" 'tests not yet finished' | tee -a "$LOG_FIL"
        exit 1
     fi
     #- action=5 : a single check is done, no waiting for remaining jobs
     if [ "$action" -eq 5 ] ; then nbJobs=0 ; fi

   done

   echo "Retrieving $NB_SUB_JOBS tests finish :" $(date) | tee -a "$LOG_FIL"
   echo " info-get-outp: REJECTED='$REJECTED'" >> "$LOG_FIL"

}

#---+----1----+----2----+----3----+----4----+----5----+----6----+----7-|--+----|
#-- Sequential part of the script starts here:
#---------------------------------------------

# The optional leading argument selects what to do (stored in "action"):
#   -subOnly : action=1 , submit test jobs
#   -getOnly : action=2 , get jobs output
#   -sub+get : action=3 , do both
#   -double  : action=4 , same as 3 but re-submit fast failed jobs
#   (other)  : action=5 , same as 1 but re-submit fast failed jobs
# A recognized option is removed from the argument list; anything else is
# left in place and treated as part of the list of tests.
action=5
case "$1" in
  -subOnly) action=1 ; shift ;;
  -getOnly) action=2 ; shift ;;
  -sub+get) action=3 ; shift ;;
  -double)  action=4 ; shift ;;
esac
#echo "action= $action , Arg= $# "

today=$(date +%Y%m%d)
#- day of the month modulo 3 (base 10 forced: "08" and "09" are not octal)
dAlt=$(( 10#$(date +%d) % 3 ))

#- remaining arguments = list of tests; if none, use the default list,
#  in which "pgi" is only kept when dAlt is zero
if [ $# -gt 0 ] ; then
  TEST_LIST=$*
else
  TEST_LIST='opa g77 adm tlm ifc mp2 mth pgi tuv gnu'
  if [ $dAlt -ne 0 ] ; then
    TEST_LIST=${TEST_LIST/ pgi / }
  fi
fi

#-- settings: queue commands, file-name prefixes, directories and files.
#- alternative set of values (not active):
# QSUB="/usr/bin/qsub"
# QSTAT="/usr/bin/qstat"
# logPfix='tst_submit'
# outPfix='acesgrid'
# HERE="$HOME/test_$outPfix"
# #TST_DIR="/data/jm_c/test_$outPfix"
# TST_DIR="/scratch/jm_c/test_$outPfix"
#- prefix of the log files and of the output names
logPfix='tst_all'
outPfix='aces'
#- queue commands
QSUB='qsub'
QSTAT='qstat'
#- top directory, also used as the directory holding the MITgcm_{name} dirs
HERE="${HOME}/test_${outPfix}"
TST_DIR="${HERE}"

#- paths below are relative (no leading "/")
SUB_DIR='MITgcm_tools/example_scripts/ACESgrid'
MPACK='MITgcm_tools/mpack-1.6/mpack'
#- temporary file (tagged with the process ID) and log file (tagged
#  with month + day)
TMP_FIL="${HERE}/output/TTT.$$"
LOG_FIL="${HERE}/output/${logPfix}.$(date +%m%d).log"
#SUB_DIR="temp"

#-- clean up old log files and start a new one:
#   (stop here if the output dir is not reachable: the file removals below
#    use relative patterns and must not run from another directory)
cd "$HERE/output" || { echo "cannot cd to '$HERE/output'" >&2 ; exit 1 ; }

#- drop previous backups; an existing log of the same name becomes the backup
rm -f "$logPfix".*.log_bak
if test -f "$LOG_FIL" ; then mv -f "$LOG_FIL" "${LOG_FIL}_bak" ; fi
echo -n '-- Starting: ' | tee -a "$LOG_FIL"
date | tee -a "$LOG_FIL"

#- keep the 10 most recent log files: n = number of files in excess,
#  removed starting from the oldest (end of the time-sorted list)
n=$(( $(ls "$logPfix".*.log | wc -l) - 10 ))
if test "$n" -gt 0 ; then
  echo ' remove old log files:' | tee -a "$LOG_FIL"
    ls -lt "$logPfix".*.log | tail -n "$n" | tee -a "$LOG_FIL"
    ls -t  "$logPfix".*.log | tail -n "$n" | xargs rm -f
fi

#- set up the "module" command and record the list of loaded modules
. /etc/profile.d/modules.sh
module list >> "$LOG_FIL" 2>&1

#-- now really do something:
#   (all relative paths used from here on are relative to $HERE)
cd "$HERE" || { echo "cannot cd to '$HERE'" >&2 ; exit 1 ; }

  JOB_LIST=$TEST_LIST
  sub_list_jobs
 #echo " info-main:     NB_SUB_JOBS='$NB_SUB_JOBS'" >> $LOG_FIL

#- set NB_JOBS2GET (number of jobs to wait for) and, when output is going
#  to be mailed (action = 2, 3 or 4), get the mpack executable ready.
if test "$action" = 1 ; then
  NB_JOBS2GET=0
elif test "$action" = 5 ; then
  NB_JOBS2GET=$NB_SUB_JOBS
else
#date_str=`date +%Y%m%d`"_0"

  echo "" | tee -a "$LOG_FIL"
#- build the mpack utility (from build_mpack in testreport):
  MPACKDIR=$(dirname "$MPACK")
  ( cd "$MPACKDIR" && ./configure && make ) > TTT.build_mpack.$$ 2>&1
  RETVAL=$?
  if test "x$RETVAL" != x0 ; then
    #- build failed: report where, then fall back on an executable left
    #  there earlier, if any (the build output file is kept)
    echo "Error building the mpack tools at: $MPACKDIR" | tee -a "$LOG_FIL"
    if test -x "$MPACK" ; then
       HAVE_MPACK=t
       echo "  use (old ?) executable:" | tee -a "$LOG_FIL"
       ls -l "$MPACK" | tee -a "$LOG_FIL"
    else
       HAVE_MPACK=f
    fi
  else
    if test -x "$MPACK" ; then
       rm -f TTT.build_mpack.$$
       HAVE_MPACK=t
       echo "Building mpack: OK" | tee -a "$LOG_FIL"
    else
       echo " $MPACK not executable" | tee -a "$LOG_FIL"
       HAVE_MPACK=f
    fi
  fi
  echo "" >> "$LOG_FIL"

  NB_JOBS2GET=$NB_SUB_JOBS
fi

#- give the queue a minute, then wait for the jobs and collect their output
sleep 60
get_outp_back $NB_JOBS2GET
#echo " info-main:     REJECTED='$REJECTED'" >> $LOG_FIL

#- second round, only for jobs reported as fast-failed:
#  action=4 or 5 : submit them again ; action=4 : also wait for their output
if [ -n "$REJECTED" ] && [ $action -ge 4 ] ; then

  echo "" >> $LOG_FIL
  echo "Try 2nd round for fast-failed jobs: '$REJECTED'" | tee -a $LOG_FIL
  JOB_LIST=$REJECTED
  sub_list_jobs
  #echo " info-main:     NB_SUB_JOBS='$NB_SUB_JOBS'" >> $LOG_FIL

  if [ $action -eq 4 ] ; then
    echo "" >> $LOG_FIL
    get_outp_back $NB_SUB_JOBS
    #echo " info-main:     REJECTED='$REJECTED'" >> $LOG_FIL
  fi

fi

#------------------------
exit 0
