#!/usr/bin/env bash
# Redis monitor: checks PING, persistence status and recent MISCONF / disk
# write errors in the container log; when needed it triggers a safe cleanup of
# the root partition and attempts a BGSAVE to self-heal.
set -Eeuo pipefail

# Directory holding this script, and its parent (used as the project root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

# Settings: each one keeps a non-empty value inherited from the environment
# and otherwise falls back to the default shown here.
: "${REDIS_CONTAINER:=redis-base}"                      # docker container name
: "${REDIS_CONF_PATH:=/home/admin/docker-projects/redis-base/redis.conf}"
: "${REDIS_MIN_ROOT_FREE_MB:=2048}"                     # free-space threshold for / in MB
: "${REDIS_AUTO_CLEAN:=1}"                              # 1 = run the root cleanup when / is low
: "${REDIS_AUTO_BGSAVE:=1}"                             # 1 = allow the BGSAVE self-heal attempt
: "${REDIS_LOG_TAIL:=300}"                              # container log lines scanned for errors
: "${REDIS_FORCE_ALERT:=0}"                             # 1 = force the alert path
: "${REDIS_MIN_BGSAVE_INTERVAL_SEC:=60}"                # minimum age of the last save before BGSAVE
: "${REDIS_MAX_USED_MEMORY_MB_FOR_BGSAVE:=1024}"        # skip BGSAVE above this used memory
: "${LOG_DIR:=$ROOT_DIR/runtime/logs}"                  # where daily log files are written
: "${BARK_GROUP:=monchk-redis}"                         # Bark notification group
# Line number recorded by the ERR trap; "?" until an error has been trapped.
FAILED_LINE="?"

# Write a timestamped message to stderr and append it to today's log file
# ($LOG_DIR/check-redis-bark_YYYYMMDD.log).
# Arguments: $* - message text.
# Every step is best-effort, so the function always returns 0.
log() {
  local stamp day entry
  stamp="$(date '+%Y-%m-%d %H:%M:%S')" || true
  entry="[$stamp] $*"
  printf '%s\n' "$entry" >&2 || true
  mkdir -p "$LOG_DIR" 2>/dev/null || true
  day="$(date +%Y%m%d)" || true
  printf '%s\n' "$entry" >> "$LOG_DIR/check-redis-bark_${day}.log" 2>/dev/null || true
}
# Log a fatal message (prefixed with "fatal ") and terminate with status 1.
# Arguments: $* - description of the fatal condition.
fail() {
  local message="fatal $*"
  log "$message"
  exit 1
}
# Abort through fail when a required command cannot be resolved.
# Arguments: $1 - command name to look up with `command -v`.
require() {
  if ! command -v "$1" >/dev/null 2>&1; then
    fail "缺少命令: $1"
  fi
}
# Percent-encode a string for use inside a URL; no character is treated as
# safe, so "/" is encoded as well.
# Arguments: $1 - text to encode.
# Outputs:   the encoded text followed by a newline.
url_encode() {
  local raw="$1"
  python3 -c '
import sys
from urllib.parse import quote

print(quote(sys.argv[1], safe=""))
' "$raw"
}

# Push a notification through Bark, trying the primary channel first and the
# secondary channel (when BARK_SECONDARY_KEY is set) afterwards. For each
# channel a direct request is attempted, then one through the channel's proxy
# if a proxy is configured and the direct request failed.
# Arguments: $1 - title, $2 - body.
# Globals:   BARK_SERVER, BARK_KEY, BARK_PROXY, BARK_SECONDARY_SERVER,
#            BARK_SECONDARY_KEY, BARK_SECONDARY_PROXY, BARK_GROUP (all read).
# Returns:   0 once any channel succeeded, 1 if there is no key or all failed.
send_bark() {
  local title="$1" body="$2"
  require curl
  require python3

  local server="${BARK_SERVER:-https://api.day.app}"
  local key="${BARK_KEY:-}"
  local proxy="${BARK_PROXY:-}"
  local server2="${BARK_SECONDARY_SERVER:-$server}"
  local key2="${BARK_SECONDARY_KEY:-}"
  local proxy2="${BARK_SECONDARY_PROXY:-}"

  if [ -z "$key" ]; then
    log "skip bark reason=no_key"
    return 1
  fi

  local enc_title enc_body enc_group
  enc_title="$(url_encode "$title")"
  enc_body="$(url_encode "$body")"
  enc_group="$(url_encode "$BARK_GROUP")"

  # Index 0 is the primary channel, index 1 the secondary one.
  local -a labels=(primary secondary)
  local -a servers=("$server" "$server2")
  local -a keys=("$key" "$key2")
  local -a proxies=("$proxy" "$proxy2")
  local -a codes=(0 0)
  local -a curl_opts=(-fsS --connect-timeout 10 --max-time 30)
  local i url

  for i in 0 1; do
    # A channel without a key is skipped; its recorded code stays 0.
    [ -n "${keys[$i]}" ] || continue
    url="${servers[$i]}/${keys[$i]}/$enc_title/$enc_body?group=$enc_group"
    # errexit is suspended so a failing curl can be inspected and retried.
    set +e
    curl "${curl_opts[@]}" "$url" >/dev/null
    codes[$i]=$?
    if [ "${codes[$i]}" -ne 0 ] && [ -n "${proxies[$i]}" ]; then
      curl "${curl_opts[@]}" --proxy "${proxies[$i]}" "$url" >/dev/null
      codes[$i]=$?
    fi
    set -e
    if [ "${codes[$i]}" -eq 0 ]; then
      log "bark success channel=${labels[$i]}"
      return 0
    fi
  done

  log "bark failed primary_rc=${codes[0]} secondary_rc=${codes[1]}"
  return 1
}

# Print the Redis password, taken from the first available source:
#   1. $REDIS_PASSWORD
#   2. the first line of the readable file $REDIS_PASSWORD_FILE
#   3. the first `requirepass` directive in $REDIS_CONF_PATH
# A trailing carriage return (CRLF files) is removed, and a redis.conf value
# wrapped in matching single or double quotes is returned without the quotes,
# so `requirepass "secret"` yields `secret`.
# NOTE(review): backslash escapes inside a quoted redis.conf value are not
# decoded here.
# Outputs: the password on stdout; the chosen source is reported through log.
# Returns: 0 when a source was used (output may be empty if the conf file has
#          no requirepass line), 1 when no source is available.
redis_password(){
  if [ -n "${REDIS_PASSWORD:-}" ]; then
    log "redis_password_source=REDIS_PASSWORD"
    printf '%s' "$REDIS_PASSWORD"
    return 0
  fi
  if [ -n "${REDIS_PASSWORD_FILE:-}" ] && [ -r "$REDIS_PASSWORD_FILE" ]; then
    log "redis_password_source=REDIS_PASSWORD_FILE"
    awk 'NR == 1 { sub(/\r$/, ""); print; exit }' "$REDIS_PASSWORD_FILE"
    return 0
  fi
  if [ -r "$REDIS_CONF_PATH" ]; then
    log "redis_password_source=redis_conf"
    awk '
      $1 == "requirepass" {
        val = $0
        sub(/^[ \t]*requirepass[ \t]*/, "", val)
        sub(/[ \t\r]+$/, "", val)
        q = substr(val, 1, 1)
        # "\047" is a single quote; a value is unquoted only when it starts
        # and ends with the same quote character.
        if (length(val) >= 2 && (q == "\"" || q == "\047") && substr(val, length(val), 1) == q) {
          val = substr(val, 2, length(val) - 2)
        } else {
          split(val, parts, /[ \t]+/)
          val = parts[1]
        }
        print val
        exit
      }' "$REDIS_CONF_PATH"
    return 0
  fi
  return 1
}

# Run redis-cli inside the Redis container with password authentication.
# Arguments: $1 - password; remaining arguments - the redis-cli command.
# NOTE(review): the password is passed with -a, i.e. on the command line.
redis_cli() {
  local secret="$1"
  shift
  local -a base_cmd=(docker exec "$REDIS_CONTAINER" redis-cli -a "$secret" --no-auth-warning)
  "${base_cmd[@]}" "$@"
}

# Read "key:value" lines from stdin and print the value of the first line
# whose key equals $1, with carriage returns removed. Only the text between
# the first and second colon is returned.
# Arguments: $1 - key to look up.
info_get() {
  local wanted="$1"
  awk -v key="$wanted" '
    BEGIN { FS = ":" }
    $1 != key { next }
    {
      value = $2
      gsub(/\r/, "", value)
      print value
      exit
    }
  '
}
# Print the fourth column of the second line of `df -Pm /`, i.e. the space
# available on the root filesystem in MB.
root_avail_mb() {
  df -Pm / \
    | awk 'NR != 2 { next } { print $4 }'
}
# Print the host's MemAvailable value converted from kB to whole MB.
# Always prints a non-negative integer: 0 is printed when the file cannot be
# read or contains no MemAvailable line, so callers can safely use the result
# in numeric tests.
# Arguments: $1 - optional meminfo file to parse (default: /proc/meminfo).
host_mem_available_mb() {
  local meminfo="${1:-/proc/meminfo}" mb
  mb="$(awk '$1 == "MemAvailable:" { printf "%d", $2 / 1024; exit }' "$meminfo" 2>/dev/null)" || mb=""
  # Anything that is not a plain run of digits (including empty) becomes 0.
  case "$mb" in
    ''|*[!0-9]*) mb=0 ;;
  esac
  printf '%s\n' "$mb"
}
# Print the content of /proc/sys/vm/overcommit_memory, or "unknown" when it
# cannot be read.
host_overcommit_memory() {
  local setting_file=/proc/sys/vm/overcommit_memory
  if ! cat "$setting_file" 2>/dev/null; then
    echo unknown
  fi
}
# Print the RestartCount reported by `docker inspect` for the Redis
# container, or "unknown" when the inspection fails.
container_restart_count() {
  if ! docker inspect "$REDIS_CONTAINER" --format '{{.RestartCount}}' 2>/dev/null; then
    echo unknown
  fi
}
# Describe /data inside the Redis container: "owner:group mode path" from
# stat, falling back to `ls -ld /data` inside the container, or "unknown"
# when the docker exec call itself fails.
data_stat() {
  local probe='stat -c "%U:%G %a %n" /data 2>/dev/null || ls -ld /data'
  if ! docker exec "$REDIS_CONTAINER" sh -c "$probe" 2>/dev/null; then
    echo unknown
  fi
}
# Convert a byte count to whole megabytes (fraction truncated).
# Arguments: $1 - number of bytes; treated as 0 when missing or empty.
# Outputs:   the integer MB value without a trailing newline.
used_memory_mb_from_bytes() {
  local bytes="${1:-0}"
  awk -v b="$bytes" 'BEGIN { printf "%d", b / 1024 / 1024 }'
}

# Print the lines among the last $REDIS_LOG_TAIL container log lines that
# match a known persistence / disk-write error. Prints nothing and still
# returns 0 when no line matches or the log cannot be read.
recent_redis_errors() {
  local error_pattern="MISCONF|No space left|Background saving terminated with error|Can't save|Failed opening|Write error|Read-only file system"
  if ! docker logs --tail "$REDIS_LOG_TAIL" "$REDIS_CONTAINER" 2>&1 | grep -E "$error_pattern"; then
    true
  fi
}

# EXIT handler: log whether the script is ending with a zero or non-zero
# status. For a non-zero status the line recorded by the ERR trap is included
# (FAILED_LINE keeps its initial value when no ERR trap fired, e.g. after an
# explicit `exit`).
cleanup(){
  # Must be the first statement so $? is still the script's exit status.
  local rc=$?
  # A real if/else: the success message can never be logged for a failing
  # run, even if the first log call itself returned non-zero.
  if [ "$rc" -ne 0 ]; then
    log "cleanup aborted rc=$rc line=$FAILED_LINE"
  else
    log "cleanup success"
  fi
}
# Remember the line number of the most recent command that triggered ERR.
trap 'FAILED_LINE=$LINENO' ERR
trap cleanup EXIT

# Run one Redis health check, alert on problems and attempt self-healing.
#
# Flow:
#   1. Collect PING, INFO persistence / memory, LASTSAVE, host memory, root
#      free space, container restart count, /data ownership and recent log
#      errors.
#   2. Healthy: print an "OK ..." line and return 0.
#   3. Unhealthy: send a Bark alert. With REDIS_FORCE_ALERT=1 stop right
#      after the alert (exit 2).
#   4. If enabled and / is below REDIS_MIN_ROOT_FREE_MB, run
#      "$ROOT_DIR/bin/run diagnose-root-free"; then, if enabled and the
#      guards allow it, trigger BGSAVE and re-read the persistence status.
#   5. Print "RECOVERED ..." and return 0 only when none of the re-checkable
#      alert conditions (ping, RDB status, AOF status, loading, /data
#      ownership) still holds; otherwise print "ALERT ..." and exit 2.
#      Errors found in the container log are historical and are not
#      re-checked.
#
# Outputs: exactly one status line on stdout; diagnostics go through log.
# Exit status: 0 = OK or RECOVERED, 1 = fatal setup problem (via fail),
#              2 = ALERT.
main(){
  require docker; require awk; require df; require grep
  local host pass ping persistence memory
  local rdb_status aof_status aof_enabled loading bgsave_in_progress changes lastsave
  local used_memory_human used_memory_bytes used_memory_mb maxmemory
  local root_avail host_mem_avail overcommit restart_count data_perm errors
  local status="OK" reasons="" before after clean_rc=0 bgsave_rc=0 ping_rc=0 now lastsave_age
  local alert_body recovered_body unresolved=""
  host="$(hostname -s 2>/dev/null || hostname || echo localhost)"
  docker inspect "$REDIS_CONTAINER" >/dev/null 2>&1 || fail "Redis 容器不存在: $REDIS_CONTAINER"
  pass="$(redis_password || true)"
  [ -n "$pass" ] || fail "未找到 Redis 密码；请设置 REDIS_PASSWORD/REDIS_PASSWORD_FILE 或检查 $REDIS_CONF_PATH"

  # errexit is suspended so a failing PING is recorded instead of aborting.
  set +e
  ping="$(redis_cli "$pass" PING 2>&1)"; ping_rc=$?
  set -e
  if [ "$ping_rc" -ne 0 ] || [ "$ping" != "PONG" ]; then status="ALERT"; reasons="${reasons} ping_failed=$ping;"; fi

  # Collect facts; every probe is best-effort and may leave its value empty.
  persistence="$(redis_cli "$pass" INFO persistence 2>&1 || true)"
  memory="$(redis_cli "$pass" INFO memory 2>&1 || true)"
  rdb_status="$(printf '%s\n' "$persistence" | info_get rdb_last_bgsave_status || true)"
  aof_status="$(printf '%s\n' "$persistence" | info_get aof_last_write_status || true)"
  aof_enabled="$(printf '%s\n' "$persistence" | info_get aof_enabled || true)"
  loading="$(printf '%s\n' "$persistence" | info_get loading || true)"
  bgsave_in_progress="$(printf '%s\n' "$persistence" | info_get rdb_bgsave_in_progress || true)"
  changes="$(printf '%s\n' "$persistence" | info_get rdb_changes_since_last_save || true)"
  used_memory_human="$(printf '%s\n' "$memory" | info_get used_memory_human || true)"
  used_memory_bytes="$(printf '%s\n' "$memory" | info_get used_memory || true)"
  used_memory_mb="$(used_memory_mb_from_bytes "${used_memory_bytes:-0}")"
  maxmemory="$(printf '%s\n' "$memory" | info_get maxmemory_human || true)"
  lastsave="$(redis_cli "$pass" LASTSAVE 2>/dev/null || true)"
  root_avail="$(root_avail_mb)"
  host_mem_avail="$(host_mem_available_mb)"
  overcommit="$(host_overcommit_memory)"
  restart_count="$(container_restart_count)"
  data_perm="$(data_stat)"
  errors="$(recent_redis_errors)"

  # Evaluate the alert conditions.
  [ "${rdb_status:-}" = "ok" ] || { status="ALERT"; reasons="${reasons} rdb_last_bgsave_status=${rdb_status:-missing};"; }
  if [ "${aof_enabled:-0}" = "1" ] && [ "${aof_status:-}" != "ok" ]; then status="ALERT"; reasons="${reasons} aof_last_write_status=${aof_status:-missing};"; fi
  [ "${loading:-0}" = "0" ] || { status="ALERT"; reasons="${reasons} loading=$loading;"; }
  [ -z "$errors" ] || { status="ALERT"; reasons="${reasons} recent_redis_log_error=yes;"; }
  case "$data_perm" in redis:*|unknown) ;; *) status="ALERT"; reasons="${reasons} data_perm_unexpected=$data_perm;";; esac
  if [ "$REDIS_FORCE_ALERT" = "1" ]; then status="ALERT"; reasons="${reasons} forced_alert=1;"; fi

  log "check host=$host redis=$REDIS_CONTAINER ping=$ping rdb_last_bgsave_status=${rdb_status:-missing} aof_enabled=${aof_enabled:-missing} aof_last_write_status=${aof_status:-missing} loading=${loading:-missing} bgsave_in_progress=${bgsave_in_progress:-missing} changes=${changes:-missing} used_memory_mb=$used_memory_mb host_mem_avail_mb=$host_mem_avail overcommit_memory=$overcommit data_perm=$data_perm restart_count=$restart_count root_avail_mb=$root_avail status=$status"

  if [ "$status" = "OK" ]; then
    # data_perm is expanded inside the quoted string so it cannot be
    # word-split or glob-expanded.
    printf '%s\n' "OK host=$host redis=$REDIS_CONTAINER ping=$ping rdb_last_bgsave_status=$rdb_status aof_enabled=${aof_enabled:-0} aof_last_write_status=${aof_status:-na} loading=${loading:-0} bgsave_in_progress=${bgsave_in_progress:-0} changes=${changes:-0} lastsave=${lastsave:-unknown} used_memory=${used_memory_human:-unknown} used_memory_mb=$used_memory_mb maxmemory=${maxmemory:-unknown} host_mem_avail_mb=$host_mem_avail overcommit_memory=$overcommit data_perm=$data_perm restart_count=$restart_count root_avail_mb=$root_avail"
    return 0
  fi

  # One field per line, joined with real newlines: "\n" inside double quotes
  # is not an escape in bash and would be sent as a literal backslash-n.
  alert_body="$(printf '%s\n' \
    "host=$host" \
    "redis=$REDIS_CONTAINER" \
    "ping=$ping" \
    "rdb_last_bgsave_status=${rdb_status:-missing}" \
    "aof_enabled=${aof_enabled:-missing}" \
    "aof_last_write_status=${aof_status:-missing}" \
    "loading=${loading:-missing}" \
    "bgsave_in_progress=${bgsave_in_progress:-missing}" \
    "changes=${changes:-missing}" \
    "used_memory_mb=$used_memory_mb" \
    "host_mem_avail_mb=$host_mem_avail" \
    "overcommit_memory=$overcommit" \
    "data_perm=$data_perm" \
    "restart_count=$restart_count" \
    "root_avail_mb=$root_avail" \
    "reasons=$reasons" \
    "time=$(date '+%Y-%m-%d %H:%M:%S')")"
  send_bark "[Redis 异常] $host $REDIS_CONTAINER $reasons" "$alert_body" || true

  if [ "$REDIS_FORCE_ALERT" = "1" ]; then
    printf '%s\n' "ALERT host=$host redis=$REDIS_CONTAINER reasons=$reasons root_avail_mb=$root_avail forced=1"
    exit 2
  fi

  # Self-heal step 1: free space on / when it is below the threshold.
  before="$root_avail"
  if [ "$REDIS_AUTO_CLEAN" = "1" ] && [ "$root_avail" -lt "$REDIS_MIN_ROOT_FREE_MB" ]; then
    set +e
    ROOT_FREE_MIN_MB="$REDIS_MIN_ROOT_FREE_MB" ROOT_FREE_AUTO_CLEAN=1 "$ROOT_DIR/bin/run" diagnose-root-free
    clean_rc=$?
    set -e
    after="$(root_avail_mb)"
    log "auto_clean done before_mb=$before after_mb=$after clean_rc=$clean_rc"
  else
    after="$before"
  fi

  # Self-heal step 2: guarded BGSAVE. When LASTSAVE is not a number the age
  # stays at a large value so the "too soon" guard does not block the save.
  now="$(date +%s)"
  lastsave_age=999999
  if printf '%s' "${lastsave:-}" | grep -Eq '^[0-9]+$'; then lastsave_age=$((now-lastsave)); fi
  if [ "$REDIS_AUTO_BGSAVE" = "1" ] && [ "$after" -ge "$REDIS_MIN_ROOT_FREE_MB" ] && { [ "${rdb_status:-}" != "ok" ] || [ -n "$errors" ]; }; then
    if [ "${loading:-0}" != "0" ]; then
      log "auto_bgsave skipped reason=loading loading=$loading"
    elif [ "${bgsave_in_progress:-0}" != "0" ]; then
      log "auto_bgsave skipped reason=bgsave_in_progress value=$bgsave_in_progress"
    elif [ "$lastsave_age" -lt "$REDIS_MIN_BGSAVE_INTERVAL_SEC" ]; then
      log "auto_bgsave skipped reason=too_soon lastsave_age_sec=$lastsave_age min_interval_sec=$REDIS_MIN_BGSAVE_INTERVAL_SEC"
    elif [ "$used_memory_mb" -gt "$REDIS_MAX_USED_MEMORY_MB_FOR_BGSAVE" ]; then
      log "auto_bgsave skipped reason=used_memory_too_high used_memory_mb=$used_memory_mb max=$REDIS_MAX_USED_MEMORY_MB_FOR_BGSAVE"
    elif [ "$host_mem_avail" -gt 0 ] && [ "$host_mem_avail" -lt $((used_memory_mb*2+128)) ]; then
      log "auto_bgsave skipped reason=host_mem_low host_mem_avail_mb=$host_mem_avail used_memory_mb=$used_memory_mb"
    else
      set +e
      redis_cli "$pass" BGSAVE >/dev/null 2>&1
      bgsave_rc=$?
      set -e
      # NOTE(review): a fixed 2 s wait; a save that takes longer is still
      # running when the status is re-read below.
      sleep 2
      persistence="$(redis_cli "$pass" INFO persistence 2>&1 || true)"
      rdb_status="$(printf '%s\n' "$persistence" | info_get rdb_last_bgsave_status || true)"
      # Refresh the AOF view from the same snapshot for the recovery check.
      aof_status="$(printf '%s\n' "$persistence" | info_get aof_last_write_status || true)"
      aof_enabled="$(printf '%s\n' "$persistence" | info_get aof_enabled || true)"
      log "auto_bgsave done rc=$bgsave_rc rdb_last_bgsave_status=${rdb_status:-missing}"
    fi
  fi

  # Recovery is declared only when none of the re-checkable alert conditions
  # still holds, so an AOF / loading / ownership alert is never followed by a
  # false "recovered" notification.
  if [ "$ping_rc" -ne 0 ] || [ "$ping" != "PONG" ]; then unresolved="${unresolved} ping;"; fi
  [ "${rdb_status:-}" = "ok" ] || unresolved="${unresolved} rdb_last_bgsave_status;"
  if [ "${aof_enabled:-0}" = "1" ] && [ "${aof_status:-}" != "ok" ]; then unresolved="${unresolved} aof_last_write_status;"; fi
  [ "${loading:-0}" = "0" ] || unresolved="${unresolved} loading;"
  case "$data_perm" in redis:*|unknown) ;; *) unresolved="${unresolved} data_perm;";; esac

  if [ -z "$unresolved" ]; then
    recovered_body="$(printf '%s\n' \
      "host=$host" \
      "redis=$REDIS_CONTAINER" \
      "before_root_avail_mb=$before" \
      "after_root_avail_mb=$after" \
      "clean_rc=$clean_rc" \
      "bgsave_rc=$bgsave_rc" \
      "rdb_last_bgsave_status=$rdb_status" \
      "time=$(date '+%Y-%m-%d %H:%M:%S')")"
    send_bark "[Redis 自愈后恢复] $host $REDIS_CONTAINER" "$recovered_body" || true
    printf '%s\n' "RECOVERED host=$host redis=$REDIS_CONTAINER reasons=$reasons before_root_avail_mb=$before after_root_avail_mb=$after clean_rc=$clean_rc bgsave_rc=$bgsave_rc rdb_last_bgsave_status=$rdb_status"
    return 0
  fi

  log "auto_heal not_recovered unresolved=$unresolved"
  printf '%s\n' "ALERT host=$host redis=$REDIS_CONTAINER reasons=$reasons root_avail_mb=$after clean_rc=$clean_rc bgsave_rc=$bgsave_rc rdb_last_bgsave_status=${rdb_status:-missing}"
  exit 2
}
# Script entry point: run main with all command-line arguments.
main "$@"
