#!/bin/sh
#
# S.M.A.R.T. attribute check
# written by Gerd Koenig <geko@deb.ymc.ch>
# and Daniel Beyer <dabe@deb.ymc.ch>
#
# Description:
#
# This script scans all available devices for S.M.A.R.T. support and
# enables it if needed.
# On each S.M.A.R.T.-enabled device every attribute is checked against its
# threshold (retrieved from the smart device database).
# Additionally the S.M.A.R.T. RAW values are checked and logged; an alert is
# raised if RAW values have increased since the last extended offline test
# was run.
#
# Output:
#
# Returncode  String     Description
#     0       Ok         everything's fine
#     1       <warning>  message <warning> contains info about warning state
#     2       <error>    message <error> contains info about error state
#     3       <unknown>  message <unknown> contains info about some unknown state
#
#

# Helper library; presumably provides fliwi_get_non_block_device_disks, which
# is called further down but not defined in this file.
fliwi_drives_lib="/usr/lib/lib-fliwi/fliwi-drives.sh"

# A POSIX shell aborts a non-interactive script when "." cannot read its file,
# which would leave the monitoring system with an arbitrary exit status.
# Report the problem as "unknown" (3) instead, like the other sanity checks.
if [ ! -r "$fliwi_drives_lib" ]; then
  echo "$fliwi_drives_lib not found or not readable"
  exit 3
fi

. "$fliwi_drives_lib"

smartCtl="/usr/sbin/smartctl"

# Directory holding the per-disk state files written between runs.
error_trace_dir="/var/lib/fliwi-check-smart"
# Maximum age (in power-on hours) of the last successful self-tests before a
# warning is raised.
MAX_SHORT_TEST_AGE_IN_HOURS=50
MAX_EXTENDED_TEST_AGE_IN_HOURS=350
# Pattern (used with grep -E) identifying a successful self-test log entry.
DEFAULT_SUCCESSFUL_SELFTEST_GREP_STRING=' Completed without error '

foundSmartDisk=0
ret=0
okString="OK"

if [ ! -f "$smartCtl" ]; then
  echo "$smartCtl not found"
  exit 3
fi

# Without this check a non-executable smartctl yields an empty disk list and
# the script would report "no disk with S.M.A.R.T. functionality" with exit 0.
if [ ! -x "$smartCtl" ]; then
  echo "$smartCtl not executable"
  exit 3
fi

if [ ! -d "$error_trace_dir" ] || \
   [ ! -w "$error_trace_dir" ]; then
  echo "$error_trace_dir not found or not writable"
  exit 3
fi

# Names of the SMART attributes whose RAW values are tracked between runs.
# The variable is always expanded unquoted so that it splits into one name
# per word.
raw_values_to_analyse="Reallocated_Sector_Ct \
                       Spin_Retry_Count \
                       Reported_Uncorrect \
                       Command_Timeout \
                       Reallocated_Event_Count \
                       Current_Pending_Sector \
                       Offline_Uncorrectable"

# Persist the current RAW value of one SMART attribute of one disk and keep
# track of the power-on-hours (PoH) count at which that value last changed.
#
# Arguments:
#   $1 - disk name used in the state file name
#   $2 - disk identification used in the state file name
#   $3 - current power-on hours of the disk
#   $4 - current numeric RAW value of the attribute
#   $5 - attribute name (also used as state file extension)
# Globals:
#   error_trace_dir (read) - directory holding the state files.
#   Sourcing an existing state file sets the variables stored in it (DISK,
#   NUMERIC_VALUE, ...) in the calling shell; the tmp_* variables and
#   store_to_file are left set as well (plain sh has no local variables).
# Output:
#   Rewrites "$error_trace_dir/$1-$2.$5" as a list of shell assignments.
fliwichecksmart_store_raw_value()
{
  tmp_disk=$1
  tmp_disk_identification=$2
  tmp_power_on_hours=$3
  tmp_numeric_value=$4
  tmp_raw_value_name=$5

  # A RAW value of zero resets the "last increment" marker to zero to prevent
  # false positives (e.g. new drives or newly monitored values).
  # stderr is discarded so a non-numeric value just counts as "not zero".
  if [ -n "$tmp_numeric_value" ] && \
     [ "$tmp_numeric_value" -eq 0 ] 2>/dev/null; then
    tmp_last_incremented_power_on_hours=0
  else
    tmp_last_incremented_power_on_hours=$tmp_power_on_hours
  fi

  # Clear the values a previously sourced state file may have left behind.
  LAST_NUMERIC_INCREMENT_DETECTED_ON_POWER_ON_HOURS=''
  NUMERIC_VALUE=''

  store_to_file="$error_trace_dir/$tmp_disk-$tmp_disk_identification.$tmp_raw_value_name"

  if [ -f "$store_to_file" ]; then
    . "$store_to_file"

    # Value unchanged since the previous run: keep the PoH recorded back then.
    if [ -n "$NUMERIC_VALUE" ] && \
       [ -n "$LAST_NUMERIC_INCREMENT_DETECTED_ON_POWER_ON_HOURS" ] && \
       [ -n "$tmp_numeric_value" ] && \
       [ "$NUMERIC_VALUE" -eq "$tmp_numeric_value" ] 2>/dev/null; then
      tmp_last_incremented_power_on_hours=$LAST_NUMERIC_INCREMENT_DETECTED_ON_POWER_ON_HOURS
    fi
  fi

  # Write the whole state file through a single redirection.
  {
    printf '%s\n' "### information gathered on $(LC_ALL=C date)"
    printf '%s\n' "DISK='$tmp_disk'"
    printf '%s\n' "DISK_IDENTIFICATION='$tmp_disk_identification'"
    printf '%s\n' "POWER_ON_HOURS='$tmp_power_on_hours'"
    printf '%s\n' "LAST_NUMERIC_INCREMENT_DETECTED_ON_POWER_ON_HOURS='$tmp_last_incremented_power_on_hours'"
    printf '%s\n' "VALUE_TYPE='$tmp_raw_value_name'"
    printf '%s\n' "NUMERIC_VALUE='$tmp_numeric_value'"
    printf '%s\n' "$tmp_raw_value_name='$tmp_numeric_value'"
  } > "$store_to_file"
}

# Remove the temporary files used by this script.
#
# Globals (read):
#   smart_temp_info_file, smart_temp_attributes_file, smart_temp_selftest_file
# Names that are unset/empty or do not refer to an existing regular file are
# skipped, so the function may be called at any point of the script.
fliwichecksmart_do_cleanup()
{
  for tmp_cleanup_file in "$smart_temp_info_file" \
                          "$smart_temp_attributes_file" \
                          "$smart_temp_selftest_file"
  do
    if [ -n "$tmp_cleanup_file" ] && \
       [ -f "$tmp_cleanup_file" ]; then
      # Quoted and protected by "--": the path is passed to rm as exactly one
      # operand, whatever characters it contains.
      rm -f -- "$tmp_cleanup_file"
    fi
  done
}

# Create an empty temporary file and print its name on stdout.
#   $1 - purpose tag that becomes part of the file name
# tempfile(1) comes from Debian's debianutils and is not installed everywhere,
# so mktemp(1) is used when it is missing. The mktemp template ends in the
# X placeholders because not every implementation accepts a suffix after them.
fliwichecksmart_create_tempfile()
{
  if command -v tempfile > /dev/null 2>&1; then
    tempfile --prefix 'fliwi_' --suffix "_$1"
  else
    mktemp "${TMPDIR:-/tmp}/fliwi_$1_XXXXXX"
  fi
}

# Scratch file for the output of "smartctl -i".
smart_temp_info_file=$(fliwichecksmart_create_tempfile 'check_smart_info')
if [ ! -f "$smart_temp_info_file" ]; then
  echo "Problem creating a tempfile for smart-info..."
  fliwichecksmart_do_cleanup
  exit 3
fi

# Scratch file for the output of "smartctl -A".
smart_temp_attributes_file=$(fliwichecksmart_create_tempfile 'check_smart_attributes')
if [ ! -f "$smart_temp_attributes_file" ]; then
  echo "Problem creating a tempfile for smart-attributes..."
  fliwichecksmart_do_cleanup
  exit 3
fi

# Scratch file for the output of "smartctl -l selftest".
smart_temp_selftest_file=$(fliwichecksmart_create_tempfile 'check_smart_selftest')
if [ ! -f "$smart_temp_selftest_file" ]; then
  echo "Problem creating a tempfile for smart-selftests..."
  fliwichecksmart_do_cleanup
  exit 3
fi


# Main loop: inspect every disk reported by "smartctl --scan" plus the ones
# returned by fliwi_get_non_block_device_disks.
# Results are collected in:
#   ret                          - exit status so far (only ever raised)
#   problem_string               - " - "-separated list of problems found
#   disk_file_prefixes_to_check  - "<disk>-<identification>" of checked disks
problem_string=""
disk_file_prefixes_to_check=""
number_of_disks_found=0
disks="$("$smartCtl" --scan | awk '{print $1}') $(fliwi_get_non_block_device_disks)"
# $disks is expanded unquoted on purpose: one device per word.
for disk in $disks
do
  SUCCESSFUL_SELFTEST_GREP_STRING="$DEFAULT_SUCCESSFUL_SELFTEST_GREP_STRING"
  LC_ALL=C "$smartCtl" -i "$disk" | sed 's/^[[:space:]]*//' > "$smart_temp_info_file"
  if [ "$(grep -c -F 'SMART support is: Available' "$smart_temp_info_file")" -eq 1 ]; then
    foundSmartDisk=1
    if [ "$(grep -c -F 'SMART support is: Enabled' "$smart_temp_info_file")" -ne 1 ]; then
      # SMART is available but switched off: try to enable it.
      if ! "$smartCtl" --smart=on "$disk" > /dev/null; then
        echo "Problem while enabling smart on $disk"
        ret=2
        continue
      fi
    fi

    # Special case for one drive model/firmware combination: self-test log
    # entries (other than entry number 1) reported as "Aborted by host" with
    # 00% remaining are accepted as successful, too.
    if [ "$(sed -r 's/[[:space:]]+/ /g' "$smart_temp_info_file" | grep -c -F 'Device Model: SAMSUNG HD753LJ')" -eq 1 ] && \
       [ "$(sed -r 's/[[:space:]]+/ /g' "$smart_temp_info_file" | grep -c -F 'Firmware Version: 1AA01113')" -eq 1 ]; then
      SUCCESSFUL_SELFTEST_GREP_STRING='( Completed without error )|(^# [^1] .* Aborted by host[[:space:]]+00%)'
    fi

    # Overall health assessment; anything but exactly one "PASSED" line is
    # treated as an error and ends the checks for this disk.
    if [ "$("$smartCtl" -H "$disk" | grep -c -F 'PASSED')" -ne 1 ]; then
      problem_string="$problem_string - Disk $disk is about to fail"
      ret=2
      continue
    fi

    disk_last_successful_extended_offine_test_poh=0
    disk_last_successful_short_offine_test_poh=0

    # Self-test log with every run of whitespace collapsed to a single space.
    LC_ALL=C "$smartCtl" -l selftest "$disk" | sed -r 's/[[:space:]]+/ /g' > "$smart_temp_selftest_file"

    for selftest_type in "Extended offline" "Short offline"
    do
      # First log line of this test type that is not "in progress".
      tmp_latest_selftest_string=$(grep -F " $selftest_type " "$smart_temp_selftest_file" | grep -v -F ' in progress' | head -n 1)
      if [ -z "$tmp_latest_selftest_string" ]; then
        problem_string="$problem_string - No '$selftest_type'-check has been run on disk $disk"
        if [ "$ret" -eq 0 ]; then
          ret=1
        fi
      elif [ "$(printf '%s\n' "$tmp_latest_selftest_string" | grep -c -E -e "$SUCCESSFUL_SELFTEST_GREP_STRING")" -ne 1 ]; then
        problem_string="$problem_string - Latest '$selftest_type'-check on disk $disk failed"
        ret=2
      else
        # The number following the "NN%" column is taken as the power-on
        # hours at which the test ran.
        temp_check_age=$(printf '%s\n' "$tmp_latest_selftest_string" | grep -E -e "$SUCCESSFUL_SELFTEST_GREP_STRING" | sed -r 's/.*% ([0-9]+) .*/\1/' | head -n 1 | sed -r 's/[^0-9]//g')
        if [ -n "$temp_check_age" ] && \
           [ "$temp_check_age" -gt 0 ]; then
          if [ "$selftest_type" = "Extended offline" ]; then
            disk_last_successful_extended_offine_test_poh=$temp_check_age
          elif [ "$selftest_type" = "Short offline" ]; then
            disk_last_successful_short_offine_test_poh=$temp_check_age
          fi
        fi
      fi
    done

    thresh_counter=0
    # Disk name without "/dev/" and reduced to alphanumeric characters.
    disk_without_dev=$(printf '%s\n' "$disk" | sed -r 's|/dev/||g' | sed -r 's/[^0-9a-zA-Z]//g')
    # Hash over model family, device model and serial number lines; used as
    # part of the state file names.
    disk_identification=$(grep -E -e '^(Model Family|Device Model|Serial Number):' "$smart_temp_info_file" | md5sum - | sed -r 's/^([0-9a-zA-Z]+).*/\1/g')
    disk_file_prefixes_to_check="$disk_file_prefixes_to_check $disk_without_dev-$disk_identification"
    number_of_disks_found=$((number_of_disks_found + 1))

    echo "$disk_last_successful_extended_offine_test_poh" > "$error_trace_dir/last_successful_extended_offline_check_poh.$disk_without_dev-$disk_identification"

    # Attribute table: only lines starting with a digit, columns joined by
    # "##" so that every attribute is one whitespace-free record.
    "$smartCtl" -A "$disk" | sed 's/^[[:space:]]*//' | grep -e "^[0-9].*" | sed 's/ \+/##/g' > "$smart_temp_attributes_file"

    # RAW value of Power_On_Hours. Only the leading digits are used: some
    # drives report it as e.g. "1234h+56m+07.890s", and removing every
    # non-digit from that would yield a meaningless huge number.
    power_on_hours=$(grep Power_On_Hours "$smart_temp_attributes_file" | head -n 1 | awk -F "##" '{print $10}' | sed -r 's/^([0-9]*).*$/\1/')
    if [ -z "$power_on_hours" ]; then
      power_on_hours=0
    fi

    # Read the table line by line via fd 3, so stdin of the commands inside
    # the loop is left untouched and no word-splitting/globbing is applied.
    while IFS= read -r smartline <&3
    do
      if [ -z "$smartline" ]; then
        continue
      fi

      ### Check if thresholds are reached //start
      # Column 4 is compared against column 6 (VALUE and THRESH in the
      # smartctl attribute table).
      thresh_info_tmp=$(printf '%s\n' "$smartline" | awk -F "##" '$4<$6 {print $2"__"$4"__"$6}')
      if [ -n "$thresh_info_tmp" ]; then
        thresh_counter=$((thresh_counter + 1))
        ret=2
      fi
      ### Check if thresholds are reached //end

      current_raw_value_to_analyse=$(printf '%s\n' "$smartline" | awk -F "##" '{print $2}')
      for raw_value_to_analyse in $raw_values_to_analyse
      do
        if [ "$current_raw_value_to_analyse" = "$raw_value_to_analyse" ]; then
          # Column 10 is the first token of the RAW value; anything that is
          # not greater than zero is stored as 0.
          numeric_raw_value=$(printf '%s\n' "$smartline" | awk -F "##" '$10>0 {print $10}' | sed -r 's/[^0-9]//g')
          if [ -z "$numeric_raw_value" ]; then
            numeric_raw_value=0
          fi

          fliwichecksmart_store_raw_value "$disk_without_dev" "$disk_identification" "$power_on_hours" "$numeric_raw_value" "$current_raw_value_to_analyse"
        fi
      done
    done 3< "$smart_temp_attributes_file"

    if [ "$thresh_counter" -eq 1 ]; then
      problem_string="$problem_string - Disk $disk_without_dev has one SMART-value reached its threshold"
    elif [ "$thresh_counter" -gt 1 ]; then
      problem_string="$problem_string - Disk $disk_without_dev has $thresh_counter SMART-values reached their threshold"
    fi

    # The sums below use expr rather than $(( )): the operands are parsed
    # from smartctl output and expr always reads them as decimal numbers.
    if [ "$disk_last_successful_short_offine_test_poh" -gt 0 ]; then
      ### Check if we need a short offline test
      if [ "$(expr "$disk_last_successful_short_offine_test_poh" + "$MAX_SHORT_TEST_AGE_IN_HOURS")" -lt "$power_on_hours" ]; then
        problem_string="$problem_string - No short offline test run on $disk_without_dev for $MAX_SHORT_TEST_AGE_IN_HOURS power-on hours"
        if [ "$ret" -eq 0 ]; then
          ret=1
        fi
      fi
    fi

    if [ "$disk_last_successful_extended_offine_test_poh" -gt 0 ]; then
      ### Check if we need a extended offline test
      if [ "$(expr "$disk_last_successful_extended_offine_test_poh" + "$MAX_EXTENDED_TEST_AGE_IN_HOURS")" -lt "$power_on_hours" ]; then
        problem_string="$problem_string - No extended offline test run on $disk_without_dev for $MAX_EXTENDED_TEST_AGE_IN_HOURS power-on hours"
        if [ "$ret" -eq 0 ]; then
          ret=1
        fi
      fi
    fi
  fi
done

# No disk reported "SMART support is: Available": nothing to check.
if [ "$foundSmartDisk" -eq 0 ]; then
  echo "no disk with S.M.A.R.T. functionality available"
  fliwichecksmart_do_cleanup
  exit 0
fi

# The RAW-value evaluation only runs when nothing else has been reported.
if [ "$ret" -eq 0 ]; then
  ### Check for unchecked RAW-errors

  global_error_counter=0
  # Expanded unquoted on purpose: one "<disk>-<identification>" per word.
  for disk_file_prefix_to_check in $disk_file_prefixes_to_check
  do
    ERRORED_RAW_VALUES=''
    disk_error_counter=0
    disk_without_dev=$(echo "$disk_file_prefix_to_check" | cut -d '-' -f 1)
    disk_last_successful_extended_offine_test_poh=$(cat "$error_trace_dir/last_successful_extended_offline_check_poh.$disk_file_prefix_to_check")
    if [ $? -ne 0 ] || \
       [ -z "$disk_last_successful_extended_offine_test_poh" ]; then
      echo "Can not determine poh-value of last successful extended offline check for '$disk_file_prefix_to_check'"
      fliwichecksmart_do_cleanup
      exit 2
    fi

    # Iterate over the state files of this disk with a plain glob; if nothing
    # matches, the pattern stays literal and is skipped by the -f test.
    for disk_file_to_check in "$error_trace_dir/$disk_file_prefix_to_check".*
    do
      if [ ! -f "$disk_file_to_check" ]; then
        continue
      fi

      # Reset everything read from a state file so that values of the
      # previously sourced file cannot leak into this iteration.
      DISK=''
      VALUE_TYPE=''
      NUMERIC_VALUE=''
      LAST_NUMERIC_INCREMENT_DETECTED_ON_POWER_ON_HOURS=''

      . "$disk_file_to_check"
      if [ $? -ne 0 ]; then
        echo "Can not read gathered information for '$disk_file_prefix_to_check'"
        fliwichecksmart_do_cleanup
        exit 2
      fi

      # Only attributes that are still listed in raw_values_to_analyse are
      # evaluated (exact name match).
      value_type_is_monitored=0
      for monitored_raw_value in $raw_values_to_analyse
      do
        if [ "$monitored_raw_value" = "$VALUE_TYPE" ]; then
          value_type_is_monitored=1
          break
        fi
      done

      if [ "$value_type_is_monitored" -eq 0 ]; then
        echo "INFO: Ignoring no longer used RAW value '$VALUE_TYPE' for drive '$DISK'" 1>&2
        continue
      fi

      # A non-zero RAW value that changed after the last successful extended
      # offline test counts as "unchecked".
      if [ -n "$NUMERIC_VALUE" ] && \
         [ "$NUMERIC_VALUE" -gt 0 ] && \
         [ -n "$LAST_NUMERIC_INCREMENT_DETECTED_ON_POWER_ON_HOURS" ] && \
         [ "$LAST_NUMERIC_INCREMENT_DETECTED_ON_POWER_ON_HOURS" -gt 0 ] && \
         [ "$LAST_NUMERIC_INCREMENT_DETECTED_ON_POWER_ON_HOURS" -gt "$disk_last_successful_extended_offine_test_poh" ]; then
        global_error_counter=$((global_error_counter + 1))
        disk_error_counter=$((disk_error_counter + 1))
        # Space-separated list without a leading blank.
        ERRORED_RAW_VALUES="${ERRORED_RAW_VALUES:+$ERRORED_RAW_VALUES }$VALUE_TYPE"
      fi
    done

    if [ "$disk_error_counter" -eq 1 ]; then
      problem_string="$problem_string - Disk $disk_without_dev has one unchecked and increased SMART-raw-value ($ERRORED_RAW_VALUES)"
    elif [ "$disk_error_counter" -gt 1 ]; then
      problem_string="$problem_string - Disk $disk_without_dev has $disk_error_counter unchecked and increased SMART-raw-values ($ERRORED_RAW_VALUES)"
    fi
  done

  if [ "$global_error_counter" -gt 0 ]; then
    ret=1
  fi
fi

fliwichecksmart_do_cleanup

if [ "$ret" -eq 0 ]; then
  echo "$okString"
else
  # Strip the leading separator and every "/dev/", and squeeze whitespace.
  echo "$problem_string" | sed -r 's/^ - //' | sed -r 's|/dev/||g' | sed -r 's/[[:space:]]+/ /g'
fi

exit "$ret"
