bin/ 230 CLI tools (ask-*, br-*, agent-*, roadid, carpool) scripts/ 99 automation scripts fleet/ Node configs and deployment workers/ Cloudflare Worker sources (roadpay, road-search, squad webhooks) roadc/ RoadC programming language roadnet/ Mesh network (5 APs, WireGuard) operator/ Memory system scripts config/ System configs dotfiles/ Shell configs docs/ Documentation BlackRoad OS — Pave Tomorrow. RoadChain-SHA2048: d1a24f55318d338b RoadChain-Identity: alexa@sovereign RoadChain-Full: d1a24f55318d338b24b60bad7be39286379c76ae5470817482100cb0ddbbcb97e147d07ac7243da0a9f0363e4e5c833d612b9c0df3a3cd20802465420278ef74875a5b77f55af6fe42a931b8b635b3d0d0b6bde9abf33dc42eea52bc03c951406d8cbe49f1a3d29b26a94dade05e9477f34a7d4d4c6ec4005c3c2ac54e73a68440c512c8e83fd9b1fe234750b898ef8f4032c23db173961fe225e67a0432b5293a9714f76c5c57ed5fdf35b9fb40fd73c03ebf88b7253c6a0575f5afb6a6b49b3bda310602fb1ef676859962dad2aebbb2875814b30eee0a8ba195e482d4cbc91d8819e7f38f6db53e8063401649c77bb994371473cabfb917fb53e8cbe73d60
191 lines
6.2 KiB
Bash
Executable File
191 lines
6.2 KiB
Bash
Executable File
#!/usr/bin/env bash
# BlackRoad Pi Fleet Health Monitor
# Pings every Pi listed in PI_DEFS, checks key services, disk and memory,
# and writes logs and alerts.
# Usage: ./blackroad-health-monitor.sh
#
# Environment:
#   BR_SSH_OPTS  Optional replacement for the default ssh option string.
#
# Requires bash 4+ (associative arrays).

set -euo pipefail

# BlackRoad Brand Colors. The escape sequences are stored literally and are
# only interpreted when the logging helpers print them.
PINK='\033[38;5;205m'
AMBER='\033[38;5;214m'
BLUE='\033[38;5;69m'
VIOLET='\033[38;5;135m'
GREEN='\033[38;5;82m'
RED='\033[38;5;196m'
RESET='\033[0m'

# Directories
LOG_DIR="$HOME/.blackroad/logs"
ALERT_DIR="$HOME/.blackroad/alerts"
mkdir -p "$LOG_DIR" "$ALERT_DIR"

# Date/time: one log file per day, one alert file per run.
DATE=$(date +%Y-%m-%d)
TIMESTAMP=$(date +%Y%m%d-%H%M%S)
LOG_FILE="$LOG_DIR/health-${DATE}.log"

# Source centralized config for IPs (optional).
NODES_CONFIG="$HOME/.blackroad/config/nodes.sh"
# shellcheck disable=SC1090 — path is only known at runtime
[[ -f "$NODES_CONFIG" ]] && source "$NODES_CONFIG"

# The lookups below use string keys. If the config is missing (or defines
# only one of the two arrays), bash would treat e.g. NODE_IP[alice] as an
# indexed-array reference, evaluate `alice` arithmetically and abort with
# "unbound variable" under `set -u`, so the fallbacks after `:-` would never
# be reached. Guarantee both names are associative arrays.
declare -p NODE_IP &>/dev/null || declare -A NODE_IP=()
declare -p NODE_USER &>/dev/null || declare -A NODE_USER=()

# Pi definitions: name|ip|ssh_user|services(port:label,...)
declare -a PI_DEFS=(
  "Alice|${NODE_IP[alice]:-192.168.4.49}|${NODE_USER[alice]:-pi}|53:Pi-hole,5432:PostgreSQL,6333:Qdrant,6379:Redis"
  "Cecilia|${NODE_IP[cecilia]:-192.168.4.96}|${NODE_USER[cecilia]:-blackroad}|11434:Ollama,9000:MinIO,5432:PostgreSQL"
  "Octavia|${NODE_IP[octavia]:-192.168.4.100}|${NODE_USER[octavia]:-pi}|3100:Gitea,4222:NATS,11434:Ollama"
  "Aria|${NODE_IP[aria]:-192.168.4.98}|${NODE_USER[aria]:-blackroad}|9000:Headscale,9443:Portainer"
  "Lucidia|${NODE_IP[lucidia]:-192.168.4.38}|${NODE_USER[lucidia]:-octavia}|11434:Ollama,8000:LucidiaAPI"
)

# Flat option string; run_ssh expands it unquoted so it word-splits into
# separate ssh arguments.
# NOTE(review): StrictHostKeyChecking=no disables host-key verification —
# confirm this is acceptable for the network the monitor runs on.
SSH_OPTS="${BR_SSH_OPTS:--o ConnectTimeout=30 -o ServerAliveInterval=10 -o ServerAliveCountMax=3 -o StrictHostKeyChecking=no -o BatchMode=yes -o LogLevel=ERROR}"

# Alert messages collected during the run (appended to by add_alert).
ALERTS=()

# NATS endpoint used by nats_pub (NATS is listed on Octavia, port 4222).
NATS_SERVER="${NODE_IP[octavia]:-192.168.4.100}:4222"
|
|
|
|
#######################################
# Publish a message to NATS on a best-effort basis.
# The call is synchronous; any failure of the `nats` CLI (including the
# CLI being absent) is ignored and its stderr is discarded, so a broken
# message bus never stops the health run.
# Globals:   NATS_SERVER (read)
# Arguments: $1 - subject, $2 - payload
# Returns:   always 0
#######################################
nats_pub() {
  local topic=$1
  local body=$2

  if ! nats pub "$topic" "$body" --server="$NATS_SERVER" 2>/dev/null; then
    return 0
  fi
}
|
|
|
|
#######################################
# Write a timestamped message to the log file (plain) and to stdout
# (wrapped in a color).
# Only the color and RESET sequences have their backslash escapes
# interpreted; the message itself is printed verbatim, so remote command
# output containing backslashes is not mangled.
# Globals:   LOG_FILE (appended to), RESET (read)
# Arguments: $1 - message text
#            $2 - color escape sequence (optional; no color when omitted)
# Outputs:   colored line on stdout, plain line appended to LOG_FILE
#######################################
log() {
  local text=${1:-}
  local color=${2:-}
  local msg

  # Assigned separately from `local` so a failing `date` is not masked.
  msg="[$(date '+%H:%M:%S')] ${text}"

  printf '%s\n' "$msg" >> "$LOG_FILE"
  printf '%b%s%b\n' "$color" "$msg" "${RESET:-}"
}
|
|
|
|
#######################################
# Emit a 60-character '=' rule: plain into the log file, violet on stdout.
# Globals:   LOG_FILE (appended to), VIOLET, RESET (read)
# Arguments: none
#######################################
log_separator() {
  local rule

  # Pad an empty string to 60 columns, then turn every blank into '='.
  printf -v rule '%60s' ''
  rule=${rule// /=}

  echo "$rule" >> "$LOG_FILE"
  echo -e "${VIOLET}${rule}${RESET}"
}
|
|
|
|
#######################################
# Record an alert for the end-of-run report and log it in red.
# Globals:   ALERTS (appended to), RED (read)
# Arguments: $1 - alert message
#######################################
add_alert() {
  local message=$1

  ALERTS+=("$message")
  log "ALERT: ${message}" "$RED"
}
|
|
|
|
#######################################
# Run one command on a remote host over ssh.
# Connection options come from SSH_OPTS; no external `timeout` wrapper is
# used. ssh's stderr is discarded.
# Globals:   SSH_OPTS (read)
# Arguments: $1 - ssh user, $2 - host, $3 - remote command line
# Outputs:   remote stdout
# Returns:   exit status of ssh
#######################################
run_ssh() {
  local login="${1}@${2}"
  local remote_cmd=$3

  # shellcheck disable=SC2086 — SSH_OPTS is a flat string and must word-split
  ssh $SSH_OPTS "$login" "$remote_cmd" 2>/dev/null
}
|
|
|
|
#######################################
# Report whether something is listening on a TCP port of a remote host.
# The check runs on the host itself (via run_ssh) by grepping `ss -tlnp`
# output for ":<port> ".
# Arguments: $1 - host, $2 - port, $3 - ssh user
# Outputs:   UP or DOWN as printed by the remote command, or SSH_FAIL
#            when run_ssh itself returns non-zero
# Returns:   always 0
#######################################
check_port() {
  local host=$1
  local port=$2
  local user=$3
  local probe="ss -tlnp 2>/dev/null | grep -q ':${port} ' && echo UP || echo DOWN"

  if ! run_ssh "$user" "$host" "$probe"; then
    echo "SSH_FAIL"
  fi
}
|
|
|
|
# Header
log_separator
log "BlackRoad Health Monitor - $TIMESTAMP" "$PINK"
log_separator

# Walk every node definition. A node is pinged first; only reachable nodes
# get the SSH-based checks (service ports, disk, memory, node extras).
for pi_def in "${PI_DEFS[@]}"; do
  # Each definition is "name|ip|ssh_user|port:label,port:label,..."
  IFS='|' read -r name ip user services <<< "$pi_def"
  echo "" >> "$LOG_FILE"
  log "--- $name ($ip) ---" "$BLUE"

  # Ping check
  if ping -c 1 -W 3 "$ip" &>/dev/null; then
    log "$name: REACHABLE" "$GREEN"
    nats_pub "blackroad.health" "{\"node\":\"$name\",\"status\":\"up\",\"ts\":$(date +%s)}"
  else
    add_alert "$name ($ip) is UNREACHABLE"
    log "$name: UNREACHABLE - skipping service checks" "$RED"
    nats_pub "blackroad.alerts" "{\"node\":\"$name\",\"status\":\"down\",\"ts\":$(date +%s)}"
    continue
  fi

  # Service port checks
  IFS=',' read -ra svc_list <<< "$services"
  for svc in "${svc_list[@]}"; do
    IFS=':' read -r port label <<< "$svc"
    status=$(check_port "$ip" "$port" "$user")
    if [[ "$status" == "UP" ]]; then
      log " $label (port $port): UP" "$GREEN"
      nats_pub "blackroad.health" "{\"service\":\"$label\",\"node\":\"$name\",\"port\":$port,\"status\":\"up\",\"ts\":$(date +%s)}"
    else
      # $status is DOWN or SSH_FAIL here; both are published as "down".
      add_alert "$name: $label (port $port) is $status"
      log " $label (port $port): $status" "$RED"
      nats_pub "blackroad.alerts" "{\"service\":\"$label\",\"node\":\"$name\",\"port\":$port,\"status\":\"down\",\"ts\":$(date +%s)}"
    fi
  done

  # Disk usage
  # -P forces the POSIX one-line-per-filesystem layout, so a long device
  # name cannot wrap the row and shift Use% out of field 5.
  disk=$(run_ssh "$user" "$ip" "df -Ph / | tail -1" || echo "FAILED")
  if [[ "$disk" != "FAILED" ]]; then
    disk_pct=$(echo "$disk" | awk '{print $5}' | tr -d '%')
    # Validate before comparing: under `set -u` a non-numeric word inside
    # [[ -ge ]] is evaluated as a variable name and would abort the run
    # before the alert file is written.
    if [[ ! "$disk_pct" =~ ^[0-9]+$ ]]; then
      log " Disk: unexpected df output: $disk" "$RED"
    elif [[ "$disk_pct" -ge 90 ]]; then
      add_alert "$name: Disk usage at ${disk_pct}%"
      log " Disk: $disk" "$RED"
    elif [[ "$disk_pct" -ge 80 ]]; then
      log " Disk: $disk" "$AMBER"
    else
      log " Disk: $disk" "$GREEN"
    fi
  else
    log " Disk: check failed" "$RED"
  fi

  # Memory usage
  mem=$(run_ssh "$user" "$ip" "free -m | grep Mem" || echo "FAILED")
  if [[ "$mem" != "FAILED" ]]; then
    # Fields of the matched line: label, total, used, ...
    read -r _ mem_total mem_used _ <<< "$mem"
    if [[ "$mem_total" =~ ^[0-9]+$ && "$mem_used" =~ ^[0-9]+$ && "$mem_total" -gt 0 ]]; then
      mem_pct=$(( mem_used * 100 / mem_total ))
      if [[ "$mem_pct" -ge 90 ]]; then
        add_alert "$name: Memory usage at ${mem_pct}%"
        log " Memory: ${mem_used}M/${mem_total}M (${mem_pct}%)" "$RED"
      elif [[ "$mem_pct" -ge 80 ]]; then
        log " Memory: ${mem_used}M/${mem_total}M (${mem_pct}%)" "$AMBER"
      else
        log " Memory: ${mem_used}M/${mem_total}M (${mem_pct}%)" "$GREEN"
      fi
    else
      # Previously this case produced no log line at all.
      log " Memory: unexpected free output: $mem" "$RED"
    fi
  else
    log " Memory: check failed" "$RED"
  fi

  # Pi-specific checks
  case "$name" in
    Cecilia)
      # Count installed Ollama models (skips the header row of `ollama list`).
      ollama_count=$(run_ssh "$user" "$ip" "ollama list 2>/dev/null | tail -n +2 | wc -l" || echo "FAILED")
      if [[ "$ollama_count" != "FAILED" ]]; then
        log " Ollama models: $ollama_count" "$BLUE"
      else
        log " Ollama model count: check failed" "$RED"
      fi
      ;;
    Octavia)
      # Ask the Gitea API first (the query is capped at limit=50); if that
      # fails, fall back to counting *.git directories on disk. The URL is
      # quoted so the remote shell cannot treat '?' as a glob character.
      gitea_repos=$(run_ssh "$user" "$ip" \
        "curl -sf 'http://localhost:3100/api/v1/repos/search?limit=50' 2>/dev/null | python3 -c 'import sys,json; print(len(json.load(sys.stdin).get(\"data\",[])))' 2>/dev/null || \
        find /var/lib/gitea/repositories -maxdepth 2 -name '*.git' 2>/dev/null | wc -l || echo FAILED" || echo "FAILED")
      if [[ "$gitea_repos" != "FAILED" ]]; then
        log " Gitea repos: $gitea_repos" "$BLUE"
      else
        log " Gitea repo count: check failed" "$RED"
      fi
      ;;
    *)
      # No extra checks for the other nodes.
      ;;
  esac
done
|
|
|
|
# Final report: with no alerts just note a healthy run; otherwise dump the
# collected alerts into a per-run file under ALERT_DIR and log where it went.
if (( ${#ALERTS[@]} == 0 )); then
  echo "" >> "$LOG_FILE"
  log "All systems healthy - no alerts" "$GREEN"
else
  ALERT_FILE="$ALERT_DIR/alert-${TIMESTAMP}.txt"
  {
    printf '%s\n' "BlackRoad Health Alert - $(date)"
    printf '%s\n' "==============================="
    # printf repeats the format once per array element: one bullet per alert.
    printf ' - %s\n' "${ALERTS[@]}"
  } > "$ALERT_FILE"
  echo "" >> "$LOG_FILE"
  log "ALERTS WRITTEN: $ALERT_FILE (${#ALERTS[@]} issues)" "$RED"
fi

log_separator
|