#!/bin/bash
#
# @@@ START COPYRIGHT @@@
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
# @@@ END COPYRIGHT @@@
#

#SQ Startup Driver Script. Use sqgen prior to this to generate various files required for startup.

# Append a timestamped message to the startup log.
# Globals:   TRAF_HOME (read) - the log file is $TRAF_HOME/logs/startup.log
# Arguments: $1 - message text
function LogIt {
    # The path is quoted so that a TRAF_HOME containing whitespace does not
    # turn the redirection into an "ambiguous redirect" error.
    printf '%s: %s\n' "$(date)" "$1" >> "$TRAF_HOME/logs/startup.log"
}

# Print the command-line synopsis on stdout and terminate the script.
# Globals: script_name (read)
# Exits:   always, with status 1
Usage() {
    printf '%s\n' \
        "" \
        "Usage: $script_name { -c } -x" \
        "" \
        "-c Perform a Cold Start" \
        "-x Perform Extra Checks (chk_dbperms) before allowing startup" \
        ""
    exit 1
}


# Print a duration as " H hour(s) M minute(s) S second(s)" (note the leading
# space, which callers rely on when appending to a prefix).
# Arguments: $1 - duration in seconds (optional)
# Globals:   elapsed_seconds (read) - used when $1 is not supplied, which
#            keeps older argument-less callers working
# Outputs:   the formatted duration on stdout
function hms
{
  # Previously the argument was ignored and only the global was read.
  local total=${1:-${elapsed_seconds:-0}}
  local hours=$(( total / 3600 ))
  local minutes=$(( (total % 3600) / 60 ))
  local seconds=$(( total % 60 ))
  echo " $hours hour(s) $minutes minute(s) $seconds second(s)"
}


# Parse the command-line flags passed as this function's arguments.
# Globals written: sq_start ("c" when -c is given),
#                  run_chk_dbperms (1 when -x is given)
# Any other flag, including -h, prints the usage text via Usage (which exits).
GetOpts() {
    local opt
    while getopts "chx" opt; do
        case "$opt" in
            c) sq_start=c ;;
            x) run_chk_dbperms=1 ;;
            *) Usage ;;
        esac
    done
}

# Abort the script when a step reported a non-zero status.
# Arguments: $1 - name of the step (used in the message)
#            $2 - the step's exit status
# Globals:   script_name, sq_start (read, for the log entry)
# Exits:     with status $2 when $2 is anything other than the string "0"
chkReturnCode() {
    # Success: nothing to report.
    [[ $2 == 0 ]] && return 0

    LogIt "End $script_name $sq_start, exit code: $2"
    echo "$1 returned error $2, Exiting..."
    exit $2
}

# Print a message on stdout and append it to the SQ monitor log.
# Arguments: $1 - message text (optional; omitted or empty prints a blank line)
# Globals:   SQMON_LOG (read) - log file to append to. While it is still
#            unset or empty the message goes to stdout only.
function echoLog {

    # "$1" is quoted so the message is emitted verbatim. Unquoted, text such
    # as "mpijob*" was glob-expanded against the current directory and runs
    # of whitespace or newlines were collapsed into single spaces.
    # ${SQMON_LOG:+...} expands to no word at all when the variable is empty,
    # so tee is never handed an empty file name.
    printf '%s\n' "$1" | tee -a ${SQMON_LOG:+"$SQMON_LOG"}
}

# Prepare a cold start: delete this host's monitor registry files and select
# the cold-start script.
# Globals: MPI_TMPDIR, COLD_STARTFILE (read); SQSCRIPT_FILE (written)
gomonColdStart() {
    local registry_glob
    # Kept as an unexpanded pattern here; the shell expands it only where it
    # is used unquoted in the rm command below.
    registry_glob=$MPI_TMPDIR/monitor.registry.*.$(uname -n)

    echoLog "Forcing a Cold Start: rm $registry_glob"
    rm -f $registry_glob 2>&1

    echoLog "Performing a Cold Start"
    SQSCRIPT_FILE=$COLD_STARTFILE
}

# Select the startup script for a start without -c. This uses the same
# script file as gomonColdStart but removes no monitor registry files.
# Globals: COLD_STARTFILE (read); SQSCRIPT_FILE (written)
gomonColdRecoveryStart() {
    SQSCRIPT_FILE="$COLD_STARTFILE"
}

# Verify that 'monitor' exists in the current directory; abort otherwise.
# Distinguishes a dangling symbolic link from a missing entry in the message.
# Globals: TRAF_HOME, script_name, sq_start (read, for messages only)
# Exits:   with status 1 when 'monitor' does not resolve to an existing file
CheckMonitorLink() {

    # Present and resolvable: nothing to do.
    [ -e monitor ] && return 0

    if [ -h monitor ]; then
        # A symlink exists but its target does not.
        echoLog "The monitor link $TRAF_HOME/sql/scripts/monitor points to a non-existent target:"
        ls -al monitor
    else
        echoLog "A soft link to 'monitor' is missing in $TRAF_HOME/sql/scripts"
    fi

    echoLog "Aborting startup."
    LogIt "End $script_name $sq_start, exit code: 1"
    exit 1
}

# Look up a background job of this shell and record its process id.
# Arguments: $1 - job spec as accepted by the 'jobs' builtin
# Globals:   gv_job_pid (written) - second word of the 'jobs -l' output for
#            the job, or the empty string when nothing was reported
# Exits:     with status 1 when called without an argument
getJobInfo() {

    [ -n "$1" ] || exit 1

    gv_job_pid=""

    local job_listing
    job_listing=$(jobs -l $1)

    # 'jobs -l' output is split into words; the second word is taken as the
    # pid. With fewer than two words gv_job_pid stays empty.
    local -a job_fields=($job_listing)
    if [ ${#job_fields[@]} -gt 1 ]; then
        gv_job_pid=${job_fields[1]}
    fi

}

# Run the selected startup script and record the outcome for the caller.
# Globals read:  SQSCRIPT_FILE (script to run), SQMON_LOG (log file),
#                SQSTART_EXIT_STATUS (path of the status file)
# Outputs:       writes "0" (success) or "1" (failure) to the status file
SQStartProcesses() {
    local script_rc

    echoLog "Starting the SQ Environment (Executing $SQSCRIPT_FILE)"
    echoLog
    # NOTE(review): '>' truncates $SQMON_LOG, discarding whatever echoLog
    # appended to it before this point - confirm that this is intended.
    $SQSCRIPT_FILE > $SQMON_LOG 2>&1
    script_rc=$?
    echo

    if [[ $script_rc == 0 ]]; then
        echoLog "SQ Startup script ($SQSCRIPT_FILE) ran successfully. Performing further checks..."
        echo "0" > $SQSTART_EXIT_STATUS
        return
    fi

    # Failure path: show the process state, then point the user at the log.
    echoLog "Error while executing the startup script!!!"
    if sqcheck -f; then
        echoLog "Even though all the SQ processes started up, the startup did not succeed!"
    fi
    echoLog
    echoLog "Please check the SQ shell log file : $SQMON_LOG"
    echoLog
    echoLog "SQ Startup (from $PWD) Failed"
    echoLog
    echo "1" > $SQSTART_EXIT_STATUS

}

# Refuse to start while processes from a previous SQ instance are still
# running.
# 'cstat -h' is polled; more than 2 output lines is treated as "orphans
# exist" (presumably the first 2 lines are headers - confirm against cstat).
# While orphans are reported the check is repeated every 5 seconds, up to
# lv_max_checks + 1 polls in total, to give them time to go away.
# Globals: script_name (read, for the log entry)
# Exits:   with status 1 when orphans are still present after the last poll
function SQCheckOrphanProcesses {

    local lv_orphans_exist=0
    local lv_orphan_process_count=0
    local lv_actual_orphan_count=0
    local lv_num_checks=0
    local lv_max_checks=10
    local lv_msg1
    local lv_msg2

    echo -n "Checking orphan processes"
    while true; do
        echo -n "."
        lv_orphan_process_count=$(cstat -h | wc -l)

        if (( lv_orphan_process_count <= 2 )); then
            lv_orphans_exist=0
            break
        fi

        lv_orphans_exist=1
        lv_actual_orphan_count=$(( lv_orphan_process_count - 2 ))
        echo -ne "\rChecking orphan processes: $lv_actual_orphan_count"

        (( ++lv_num_checks ))
        if (( lv_num_checks > lv_max_checks )); then
            break
        fi
        sleep 5
    done

    echo
    if (( lv_orphans_exist == 1 )); then
        lv_msg1="There are orphan processes from a previous SQ instance."
        echo "$lv_msg1"
        cstat
        echo
        lv_msg2="SQ startup has not been initiated. Exiting..."
        # This used to echo the unrelated variable lv_msg, so the second
        # line of the explanation was never shown to the user.
        echo "$lv_msg2"
        LogIt "End $script_name, exit code: 1, $lv_msg1 $lv_msg2"
        exit 1
    fi

}

# Report whether Kerberos is enabled in Hadoop without a usable ticket.
# Reads /etc/hadoop/conf/core-site.xml: the line after the one mentioning
# "hadoop.security.authentication" is inspected for the word "kerberos".
# Returns: 0 when Kerberos is not enabled, or it is enabled and
#          'krb5service status' does not report a missing ticket;
#          1 when Kerberos is enabled and the ticket is reported missing.
checkKerberos() {
    local line value_line krb_enabled ticket_missing

    # The loop runs in a pipeline subshell, so 'exit' only ends that subshell
    # and its status becomes the status of the pipeline.
    cat /etc/hadoop/conf/core-site.xml | while read line; do
        if echo $line | grep -q "hadoop.security.authentication"; then
            read value_line
            if echo $value_line | grep -q kerberos; then
                exit 1
            fi
            exit 0
        fi
    done
    krb_enabled=$?

    [[ $krb_enabled -eq 1 ]] || return 0

    # Check to see if TGT exists (ticket granting ticket)
    ticket_missing=$(krb5service status | grep -c "Ticket information not found")
    return $ticket_missing
}

# Report system limits that are known to cause startup problems.
# Checks, in order, and stops after the first finding:
#   1. /proc/sys/kernel/pid_max greater than 65535        -> ERROR
#   2. 'ulimit -l' (max locked memory, KB) below 65536    -> ERROR
#   3. 'ulimit -l' below 327680                           -> WARNING
# A locked-memory limit of "unlimited" passes checks 2 and 3.
# Findings are only reported through echoLog; the script is not aborted.
function checkUlimit {
    local userProc
    local maxLockMem

    # check pid_max
    userProc=$(cat /proc/sys/kernel/pid_max)
    if [[ $userProc -gt 65535 ]]; then
        echoLog ""
        echoLog "ERROR: issue detected during startup. Please make sure your kernel parameter (sysctl) is set up correctly."
        # The sysctl is named kernel.pid_max; the message used to say max_pid.
        echoLog "ERROR: kernel.pid_max should not be greater than 65535"
        echoLog ""
        return
    fi

    # check ulimit -l
    maxLockMem=$(ulimit -l)

    # Compare as a string. The previous arithmetic test (-ne "unlimited")
    # evaluated "unlimited" as 0, so a real limit of 0 was treated as
    # unlimited and never reported.
    if [[ "$maxLockMem" == "unlimited" ]]; then
        return
    fi

    if [[ $maxLockMem -lt 65536 ]]; then
        echoLog ""
        echoLog "ERROR: issue detected during startup. Please make sure your ulimit -l is set up correctly."
        echoLog "ERROR: max locked memory is smaller than 64M"
        echoLog ""
        return
    fi

    if [[ $maxLockMem -lt 327680 ]]; then
        echoLog ""
        echoLog "WARNING: issue detected during startup, please make sure your ulimit -l is set up correctly."
        echoLog "WARNING: max locked memory is smaller than 320M "
        echoLog ""
        return
    fi

}

#########################################################
# MAIN portion of sqstart begins here
#########################################################

# Name of this script without its directory, used in messages and log entries.
script_name=${0##*/}
# Default start mode comes from the environment; -c overrides it in GetOpts.
sq_start=$SQ_STARTUP;
run_chk_dbperms=0

# Start of the wall-clock measurement reported at the end of the script.
STARTtime=$(date +%s)


# Hand the complete argument list to the option parser. $BASH_ARGV expands to
# a single element of that array only, so with more than one flag (for
# example "-c -x") some options were silently dropped.
GetOpts "$@"

# All later steps use paths relative to the scripts directory.
if [[ -n "$TRAF_HOME" ]]; then
    if ! cd "$TRAF_HOME/sql/scripts"; then
        echoLog
        echoLog "Unable to change directory to $TRAF_HOME/sql/scripts."
        echoLog
        exit 1;
    fi
else
    echoLog
    echoLog "The TRAF_HOME environment variable does not exist."
    echoLog "Please ensure sqenv.sh has been sourced, and re-run $script_name."
    echoLog
    exit 1;
fi

# Report problematic kernel / ulimit settings (informational only).
checkUlimit

# Check SeaMonster kernel module if SeaMonster is enabled
if [[ $SQ_SEAMONSTER == "1" ]]; then
    echoLog
    echoLog "SeaMonster is enabled, checking SeaMonster Kernel Module installed version..."
    echoLog
    sminfo_out=$("$TRAF_HOME/export/bin${SQ_MBTYPE}/sminfo" 2>&1)
    if [ $? -gt 0 ]; then
       echoLog "$sminfo_out"
       lv_msg="Correct SeaMonster Kernel Module is not installed; please resolve this issue and re-run $script_name!"
       echo "$lv_msg"
       LogIt "End $script_name $sq_start, exit code: 1, $lv_msg"
       exit 1
    fi
    echoLog "$sminfo_out"
    LogIt "$sminfo_out"
    echoLog
fi

# The Kerberos ticket check is only performed when sw_env.sh is absent from
# the scripts directory.
if [[ ! -e $TRAF_HOME/sql/scripts/sw_env.sh ]]; then
   checkKerberos
   if [[ $? -ne 0 ]]; then
      echo
      echo "Kerberos is enabled on the system but cannot find a valid TGT (ticket granting ticket) for the Trafodion ID"
      echo "Please run trafodion_secure_install to enable Kerberos for Trafodion"

      echo
      # The shell reports this to the parent as status 255.
      exit -1
   fi
fi

# Reject a TRAF_HOME path longer than lv_max_mysqroot_len characters.
declare -i lv_len_mysqroot
declare -i lv_max_mysqroot_len
lv_max_mysqroot_len=78
lv_len_mysqroot=${#TRAF_HOME}
# Numeric comparison. The former [ a '>' b ] compared the two numbers as
# strings, so lengths such as 8, 9 or 80 were rejected while 100 and above
# were accepted.
if (( lv_len_mysqroot > lv_max_mysqroot_len )); then
    lv_msg="Length of the string \$TRAF_HOME ($TRAF_HOME) should be less than $lv_max_mysqroot_len. Exiting...";
    echoLog "$lv_msg"
    LogIt "End $script_name $sq_start, exit code: 1, $lv_msg"
    exit 1;
fi

LogIt "Begin $script_name $sq_start"

# Do not start on top of an instance that sqcheck reports as fully (0) or
# partially (1) up.
sqcheck -i 1 -d 1 > /dev/null 2>&1
sq_stat=$?
case $sq_stat in
    0 | 1)
        if [[ $sq_stat == 0 ]]; then
            lv_msg="SQ environment is already up."
        else
            lv_msg="SQ environment is partially up."
        fi
        echoLog "$lv_msg"
        LogIt "End $script_name $sq_start, exit code: 1, $lv_msg"
        exit 1
        ;;
esac

# Leftover processes from an earlier instance also block the startup.
SQCheckOrphanProcesses

# Optional ownership/permission verification, requested with -x. Skipped
# without a message when the chk_dbperms tool is not executable.
if [[ $run_chk_dbperms == 1 ]]; then
    chk_dbperms=$TRAF_HOME/sql/scripts/chk_dbperms
    if [ -x $chk_dbperms ]; then
        printf '\n%s\n%s' \
            "Checking ownership and permissions." \
            "This will take a few minutes..."
        $chk_dbperms --busy
        chk_res=$?
        echo
        if [ $chk_res -gt 0 ]; then
            lv_msg="Ownership and/or Permissions issues found.. aborting."
            echoLog "$lv_msg"
            LogIt "End $script_name $sq_start, exit code: $chk_res, $lv_msg"
            exit 1
        fi
        echo
    fi
fi



# Locations used by the rest of the startup.
SQLOG_DIR=$TRAF_HOME/logs
SQMON_LOG=$SQLOG_DIR/sqmon.log
# Relative to the current directory; written by SQStartProcesses.
SQSTART_EXIT_STATUS=./sqstart.exit_status

if [[ -z $SQSCRIPTS_DIR ]]; then
    SQSCRIPTS_DIR=$TRAF_HOME/sql/scripts
fi

mkdir -p "$SQLOG_DIR"
if [[ -n "$MPI_TMPDIR" ]]; then
    mkdir -p "$MPI_TMPDIR"
fi

# MPI_TMPDIR is quoted in the tests below: unquoted, an unset or empty value
# reduced the test to "[ ! -d ]", which is false, so the problem went
# unnoticed and the later cleanup ran against "/mpijob*".
if [ ! -d "$MPI_TMPDIR" ]; then
    lv_msg="Error: The MPI_TMPDIR: $MPI_TMPDIR does not exist. Exiting."
    echoLog "$lv_msg"
    # The start mode variable is sq_start; the log entries used the
    # undefined name sqstart.
    LogIt "End $script_name $sq_start, exit code: 1, $lv_msg"
    exit 1
fi

if [ ! -w "$MPI_TMPDIR" ]; then
    lv_msg="Error: The MPI_TMPDIR: $MPI_TMPDIR is not writable by $USER. Exiting."
    echoLog "$lv_msg"
    LogIt "End $script_name $sq_start, exit code: 1, $lv_msg"
    exit 1
fi


COLD_STARTFILE=$SQSCRIPTS_DIR/gomon.cold

# Choose the start mode: -c (or SQ_STARTUP=c) forces a cold start.
case $sq_start in
    c) gomonColdStart ;;
    *) gomonColdRecoveryStart ;;
esac

# Remove files left behind in MPI_TMPDIR by an earlier run.
echoLog "Removing old mpijob* files from $MPI_TMPDIR"
echoLog  ""
rm -f $MPI_TMPDIR/mpijob*
echoLog "Removing old monitor.port* files from $MPI_TMPDIR"
echoLog  ""
rm -f $MPI_TMPDIR/monitor.port.*


setup_sqpdsh

# Clear unique strings. The command is single-quoted on purpose so that
# $TRAF_HOME is not expanded by this shell.
$SQPDSHA 'cd $TRAF_HOME/sql/scripts; utilConfigDb -u'

echoLog "Executing sqipcrm (output to sqipcrm.out)"
sqipcrm > sqipcrm.out

echoLog "Executing cleanZKNodes (output to cleanZKNodes.out)"
cleanZKNodes > cleanZKNodes.out 2>&1

# HBase has to be reachable before the SQ processes are started.
echoLog "Checking whether HBase is up"
if ! hbcheck 4 30; then
    lv_msg="It looks like HBase is not up, hbcheck utility failed."
    echoLog "$lv_msg"
    LogIt "End $script_name $sq_start, exit code: 1, $lv_msg"
    exit 1
fi

# Remove any stale status file, then run the startup script in the
# background; SQStartProcesses recreates the file when it finishes.
rm -f "$SQSTART_EXIT_STATUS"
SQStartProcesses &
getJobInfo SQStartProcesses

if [ ! -z "$gv_job_pid" ]; then
    lv_sqstartup_job_pid=$gv_job_pid
    echoLog "Background SQ Startup job (pid: $lv_sqstartup_job_pid)"
else
    lv_msg="Some problem with the initiation of the job to startup SQ processes"
    echoLog "$lv_msg"
    # The start mode variable is sq_start; this log entry used the undefined
    # name sqstart.
    LogIt "End $script_name $sq_start, exit code: 3, $lv_msg"
    exit 3
fi

# Pause before polling (presumably to give the background startup job time
# to begin its work).
sleep 5

# State for the polling loops. All of these are declared as integers, so a
# plain assignment to them is evaluated as an arithmetic expression.
declare -i lv_recovery_tx_cnt
declare -i lv_startup_status
declare -i lv_num_checks
declare -i lv_max_checks
declare -i lv_done
declare -i lv_process_count_curr
declare -i lv_process_count_last
declare -i lv_TM0_started
declare -i lv_max_iterations
declare -i lv_num_iterations
declare -i lv_sleep_per_iteration

# lv_startup_status becomes the script's exit code:
#   value read from $SQSTART_EXIT_STATUS - the background job finished
#   2 - gave up waiting, background job still running
#   3 - background job disappeared without writing the status file
# It starts at 1 and is replaced by one of the above before the loop ends.
let lv_recovery_tx_cnt=-1
let lv_startup_status=1
let lv_num_checks=0
#Long recoveries on a cluster could delay TX services
let lv_max_checks=600
# Check if resource is a work station
# (detected by a SQ_VIRTUAL_NODES=<value> entry in ms.env)
if [[ -e ${TRAF_HOME}/etc/ms.env ]] ; then
   VIRT_NODES=`awk '/SQ_VIRTUAL_NODES=/ { fields=split($0,virt,"=");  if ( fields == 2 ) { virtnodes=virt[2];}} END {print  virtnodes}' < $TRAF_HOME/etc/ms.env`
   if [[ -n "$VIRT_NODES" ]] ; then
      #Some utilities on workstations will timeout if we wait too long
      lv_max_checks=360
   fi
fi
let lv_done=0
let lv_process_count_curr=0
let lv_process_count_last=0
# lv_TM0_started, lv_tm_svc_ready, tx_srvc_ready_printed and
# lv_recovery_tx_cnt are referenced only by the commented-out transaction
# recovery reporting inside the loop below.
let lv_TM0_started=0
let lv_tm_svc_ready=0
# lv_max_iterations, lv_num_iterations and lv_sleep_per_iteration are not
# used in this loop; they are set up here for the sqcheck polling that
# follows it.
let lv_max_iterations=5
let lv_num_iterations=0
let lv_sleep_per_iteration=5
let tx_srvc_ready_printed=0

# Wait for the background startup job. Each pass either picks up the status
# file, detects that the job died, or counts one more check (10 s apart).
# The check counter restarts from 0 whenever the number of SQ processes has
# grown, so the time limit applies only to periods without progress.
while [ $lv_done '==' 0 ];
do

  if [ -e $SQSTART_EXIT_STATUS ]; then
      # The background job has finished and recorded its result.
      lv_startup_status=`cat $SQSTART_EXIT_STATUS`
      let lv_done=1
  else

      # Count the cstat output lines, excluding its header/separator lines.
      lv_process_count_curr=`cstat 2>/dev/null | grep -v "\-\-\-   \-\-\-\-" | grep -v 'pid   ppid' | wc -l`
      let lv_cnt_check=(lv_process_count_curr '>' lv_process_count_last)

      if [ $lv_cnt_check '==' 1 ]; then
         let lv_num_checks=0
      fi
      # Arithmetic assignment (the variable is declared -i): copies the
      # value of lv_process_count_curr, not the literal name.
      lv_process_count_last=lv_process_count_curr

#      lv_TM0_started=`cstat 2>/dev/null | grep -v "\-\-\-   \-\-\-\-" | grep -v 'pid   ppid' | grep TM | wc -l`

#      if [ $lv_TM0_started '!=' 0 ]; then
#         let lv_num_checks=0
#         lv_recovery_tx_cnt=`sqshell -c show | grep -v 'Configuration Change Notice' | grep DTM_RECOVERING_TX_COUNT | cut -d= -f2 | sed -e 's/^[[:space:]]*//'`
#         if [ "$lv_recovery_tx_cnt" -gt "0" ]; then
#            echo -ne "\r# of Transactions being recovered: $lv_recovery_tx_cnt "
#         else
#            if [ "$lv_tm_svc_ready" -eq 0 ]; then
#               echo -ne "\r# of Transactions being recovered: $lv_recovery_tx_cnt "
#               sqregck -r SQ_TXNSVC_READY -d 5 -i 5
#               lv_tm_svc_ready=$?
#            else
#               if [ $tx_srvc_ready_printed -eq 0 ]; then
#                 echo -ne "\rTransaction Services Ready          "
#                 tx_srvc_ready_printed=1
#               fi
#               let lv_tm_svc_ready=1
#            fi
#         fi
#      else
         # Progress display: rewrite the process count after progress was
         # made, otherwise add one dot per idle check.
         if [ $lv_num_checks '==' 0 ]; then
            echo -ne "\r# of SQ processes: $lv_process_count_last "
         else
            echo -n "."
         fi
#      fi

      # If the job is gone and left no status file, it ended abnormally.
      getJobInfo SQStartProcesses
      if [ -z "$gv_job_pid" ]; then
          if [ ! -e $SQSTART_EXIT_STATUS ]; then
              echoLog "The background SQ job (pid: $lv_sqstartup_job_pid) that was starting up the SQ processes exitted abnormally."
              echoLog "There are still $lv_process_count_curr SQ processes that exist in the environment."
              echoLog "Exiting."
              let lv_done=1
              lv_startup_status=3
              break
          fi
      fi

      let ++lv_num_checks
      # Every 100th consecutive idle check, print the time and a full sqcheck.
      if [[ $((lv_num_checks % 100)) -eq 0 ]]; then
          date; sqcheck -f
      fi
      # Give up once more than lv_max_checks idle checks have accumulated.
      let lv_done=(lv_num_checks '>' lv_max_checks)
      if [ $lv_done '==' 1 ]; then
          lv_startup_status=2
          echoLog ""
          echoLog "SQ Startup is taking longer than usual and seems to be stalled at $lv_process_count_curr processes. "
          echoLog "The background SQ job (pid: $lv_sqstartup_job_pid) that is starting up the SQ processes is still running."
          echoLog "Exiting."
      fi
      sleep 10
  fi

done

if [[ $lv_startup_status == 0 ]]; then
    # The startup script succeeded. Poll sqcheck at short intervals so the
    # wait ends as soon as the state settles, but cap the number of polls so
    # we never wait forever.
    lv_done=0
    while (( lv_done == 0 && lv_num_iterations < lv_max_iterations )); do
        sleep $lv_sleep_per_iteration
        # sqcheck will return:
        #    -1 - Not up ($?=255)
        #     0 - Fully up and operational
        #     1 - Partially up and operational
        #     2 - Partially up and NOT operational
        # Only a return of 1 keeps us looping, to let it come fully up.
        sqcheck > /dev/null 2>&1
        sq_stat=$?
        if [[ $sq_stat != 1 ]]; then
            lv_done=1
            continue
        fi

        if (( lv_num_iterations == 0 )); then
            echo "The SQ environment is partially up! Continuing checks."
            echo -n "Checking"
        fi
        echo -n "."
        (( ++lv_num_iterations ))
    done

    # Finish the line of progress dots when every poll was used up.
    if (( lv_num_iterations == lv_max_iterations )); then
        echo
    fi

    # Official sqcheck that will be displayed
    sqcheck | tee -a $SQMON_LOG
    echoLog "$(date)"
fi

# NOTE(review): the services below are started whatever the value of
# lv_startup_status, i.e. also after a failed or stalled SQ startup -
# confirm that this is intended.

# Start the DCS Service
dcsstart
sleep 10
sqcheck | tee -a $SQMON_LOG

# The LOB Service (lobstart) is not started since it's unused right now.

# Start the REST Service
reststart

echo
dcscheck | tee -a $SQMON_LOG
printf '\n%s\n\n' "You can monitor the SQ shell log file : $SQMON_LOG"

# Run the limit checks a second time so the findings also reach the log.
checkUlimit

# Report the total wall-clock time of the startup.
echo
STOPtime=$(date +%s)
elapsed_seconds=$(( STOPtime - STARTtime ))
echo -n "Startup time "
hms "$elapsed_seconds"

if [[ $lv_startup_status != 0 ]]; then
    LogIt "Please check the log files in $SQLOG_DIR for errors."
fi

# The startup status doubles as this script's exit code.
LogIt "End $script_name $sq_start, exit code: ${lv_startup_status}"
exit ${lv_startup_status}
