# BlackRoad Fleet Automation
# Auto-probes fleet, collects KPIs, runs security scans, auto-heals
# Deploy to: .github/workflows/fleet-automation.yml in BlackRoad-Operating-System
# Copyright (c) 2025-2026 BlackRoad OS, Inc.

name: Fleet Automation

on:
  schedule:
    - cron: '0 */6 * * *'
  workflow_dispatch:
    inputs:
      mode:
        description: 'Operation mode'
        type: choice
        default: 'full-scan'
        options:
          - full-scan
          - quick-check
          - security-audit
          - auto-heal

permissions:
  contents: write
  issues: write

env:
  FLEET_ISSUE_TITLE: "Fleet Status Report"

jobs:
  # Probes every node over SSH, builds a Markdown status table, posts it to
  # the "fleet-status" issue and stores a dated copy in the workspace.
  fleet-probe:
    name: Fleet Probe
    runs-on: self-hosted
    outputs:
      alice_status: ${{ steps.probe.outputs.alice }}
      cecilia_status: ${{ steps.probe.outputs.cecilia }}
      octavia_status: ${{ steps.probe.outputs.octavia }}
      aria_status: ${{ steps.probe.outputs.aria }}
      lucidia_status: ${{ steps.probe.outputs.lucidia }}
      report: ${{ steps.probe.outputs.report }}
    steps:
      - uses: actions/checkout@v4

      - name: Probe All Nodes
        id: probe
        run: |
          # REPORT accumulates Markdown with literal "\n" escapes; they are
          # expanded once, by printf '%b', when the report is written out.
          TIMESTAMP=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
          REPORT="## Fleet Status Report\n**Timestamp:** ${TIMESTAMP}\n\n"
          REPORT+="| Node | Status | CPU | Memory | Disk | Temp | Uptime |\n"
          REPORT+="|------|--------|-----|--------|------|------|--------|\n"

          # probe_node NAME IP USER
          #   Prints one Markdown table row for the node on stdout and records
          #   "<lowercased name>=online|offline" as a step output.
          probe_node() {
            local name=$1 ip=$2 user=$3
            # Step output keys must match the lowercase names the job-level
            # outputs mapping reads (alice, cecilia, ...).
            local key=${name,,}
            local data cpu mem disk temp uptime_str

            # The remote script prints exactly one value per line:
            #   1 UP marker, 2 load average, 3 memory, 4 disk, 5 temp, 6 uptime.
            # The two awk printf calls emit no newline of their own, so each is
            # followed by "echo" to terminate the line (this also keeps the
            # line numbering stable when the thermal sensor is missing).
            # NOTE(review): StrictHostKeyChecking=no accepts any host key.
            data=$(ssh -o ConnectTimeout=3 -o BatchMode=yes -o StrictHostKeyChecking=no "${user}@${ip}" \
              'echo UP
               cat /proc/loadavg | cut -d" " -f1
               free -m | awk "/Mem:/{printf \"%d/%dMB\", \$3, \$2}"; echo
               df / | awk "NR==2{print \$5}"
               cat /sys/class/thermal/thermal_zone0/temp 2>/dev/null | awk "{printf \"%.0f\", \$1/1000}"; echo
               uptime -p 2>/dev/null | sed "s/up //"' 2>/dev/null) || data="DOWN"

            if printf '%s\n' "$data" | head -n 1 | grep -qx "UP"; then
              cpu=$(printf '%s\n' "$data" | sed -n '2p')
              mem=$(printf '%s\n' "$data" | sed -n '3p')
              disk=$(printf '%s\n' "$data" | sed -n '4p')
              temp=$(printf '%s\n' "$data" | sed -n '5p')
              uptime_str=$(printf '%s\n' "$data" | sed -n '6p')
              # Show a dash instead of a bare unit when no temperature was read.
              if [ -n "$temp" ]; then temp="${temp}°C"; else temp="—"; fi
              echo "| $name | ✅ Online | $cpu | $mem | $disk | $temp | $uptime_str |"
              echo "${key}=online" >> "$GITHUB_OUTPUT"
            else
              echo "| $name | ❌ Offline | — | — | — | — | unreachable |"
              echo "${key}=offline" >> "$GITHUB_OUTPUT"
            fi
          }

          REPORT+=$(probe_node "Alice" "192.168.4.49" "pi")
          REPORT+="\n"
          REPORT+=$(probe_node "Cecilia" "192.168.4.96" "blackroad")
          REPORT+="\n"
          REPORT+=$(probe_node "Octavia" "192.168.4.100" "pi")
          REPORT+="\n"
          REPORT+=$(probe_node "Aria" "192.168.4.98" "blackroad")
          REPORT+="\n"
          REPORT+=$(probe_node "Lucidia" "192.168.4.38" "octavia")
          REPORT+="\n"

          # Check services on online nodes
          REPORT+="\n### Services\n"
          for entry in "Alice:192.168.4.49:pi" "Cecilia:192.168.4.96:blackroad" "Lucidia:192.168.4.38:octavia"; do
            IFS=: read -r name ip user <<< "$entry"
            services=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
              'systemctl list-units --state=running --no-pager --no-legend |
               grep -E "cloudflared|ollama|docker|gitea|pihole|nginx|postgres|nats" |
               awk "{print \$1}" | tr "\n" ", "' 2>/dev/null) || services="unreachable"
            REPORT+="**${name}:** ${services}\n"
          done

          # Multi-line step output: "name<<DELIM", the value, then DELIM alone.
          # A per-run delimiter keeps node-supplied text from ending it early.
          delim="FLEET_REPORT_${RANDOM}${RANDOM}"
          {
            echo "report<<${delim}"
            printf '%b\n' "$REPORT"
            echo "${delim}"
          } >> "$GITHUB_OUTPUT"

      - name: Update Fleet Status Issue
        uses: actions/github-script@v7
        env:
          # Passed through the environment rather than spliced into the script
          # source, so backticks, quotes or dollar-brace sequences in the
          # report cannot break or inject into the JavaScript.
          REPORT: ${{ steps.probe.outputs.report }}
        with:
          script: |
            const report = process.env.REPORT || '';
            const title = process.env.FLEET_ISSUE_TITLE;

            if (!report.trim()) {
              core.warning('Probe produced an empty report; nothing to post.');
              return;
            }

            // Find existing fleet status issue
            const issues = await github.rest.issues.listForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,
              labels: 'fleet-status',
              state: 'open'
            });

            if (issues.data.length > 0) {
              // Append the new report as a comment on the existing issue
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: issues.data[0].number,
                body: report
              });
            } else {
              // No open issue yet: create one (it is labelled, not pinned)
              await github.rest.issues.create({
                owner: context.repo.owner,
                repo: context.repo.repo,
                title: title,
                body: report,
                labels: ['fleet-status', 'automated']
              });
            }

      - name: Save Probe Results
        env:
          # Environment variable instead of inline expansion: a single quote in
          # the report would otherwise terminate the shell string.
          REPORT: ${{ steps.probe.outputs.report }}
        run: |
          # NOTE(review): the file is written to the workspace but never
          # committed or uploaded by this workflow — confirm that is intended.
          mkdir -p data/fleet-probes
          printf '%s\n' "$REPORT" > "data/fleet-probes/$(date -u +%Y-%m-%d).md"

  # Runs the KPI collectors from the blackroad-os-kpis repository and commits
  # whatever they produced. Skipped only for the 'security-audit' mode.
  kpi-collect:
    name: KPI Collection
    needs: fleet-probe
    runs-on: self-hosted
    if: github.event.inputs.mode != 'security-audit'
    steps:
      - uses: actions/checkout@v4
        with:
          repository: blackboxprogramming/blackroad-os-kpis
          path: kpis

      - name: Run Collectors
        working-directory: kpis
        run: |
          # Best effort: a failing collector must not fail the job.
          bash collectors/collect-all.sh 2>&1 || echo "Some collectors failed"

      - name: Commit Results
        working-directory: kpis
        run: |
          git config user.name "BlackRoad Automation"
          git config user.email "automation@blackroad.io"
          git add -A
          # Commit only when something is staged.
          git diff --cached --quiet || git commit -m "KPI collection $(date -u +%Y-%m-%d)"
          git push || echo "Push failed — may need token"

  # Collects a few security indicators per node and posts them to the open
  # issue labelled security+automated. Skipped only for the 'quick-check' mode.
  security-scan:
    name: Security Scan
    needs: fleet-probe
    runs-on: self-hosted
    if: github.event.inputs.mode != 'quick-check'
    steps:
      - name: Scan Fleet
        run: |
          REPORT="## Security Scan\n**$(date -u '+%Y-%m-%dT%H:%M:%SZ')**\n\n"

          for entry in "Alice:192.168.4.49:pi" "Cecilia:192.168.4.96:blackroad" "Lucidia:192.168.4.38:octavia"; do
            IFS=: read -r name ip user <<< "$entry"
            REPORT+="### ${name}\n"

            # SSH key count
            keys=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
              'wc -l < ~/.ssh/authorized_keys 2>/dev/null || echo 0' 2>/dev/null) || keys="?"
            REPORT+="- SSH keys: ${keys}\n"

            # Check for plaintext secrets in crontabs.
            # grep -c already prints 0 when nothing matches (and exits 1), so
            # the fallback only has to neutralise the exit status.
            secrets=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
              'crontab -l 2>/dev/null | grep -ciE "password|token|secret|api_key" || true' 2>/dev/null) || secrets="?"
            REPORT+="- Crontab secrets: ${secrets}\n"

            # Firewall status. The pipeline's exit status is head's, so a
            # missing ufw shows up as empty output and is defaulted below.
            fw=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
              'sudo -n ufw status 2>/dev/null | head -1' 2>/dev/null) || fw="?"
            REPORT+="- Firewall: ${fw:-no ufw}\n"

            # Listening ports (tail drops the ss header row from the count)
            ports=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
              'ss -tlnp 2>/dev/null | tail -n +2 | wc -l' 2>/dev/null) || ports="?"
            REPORT+="- Listening TCP ports: ${ports}\n\n"
          done

          echo -e "$REPORT"
          echo -e "$REPORT" > /tmp/security-report.md

      - name: Post Security Report
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const report = fs.readFileSync('/tmp/security-report.md', 'utf8');

            const issues = await github.rest.issues.listForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,
              labels: 'security,automated',
              state: 'open'
            });

            if (issues.data.length > 0) {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: issues.data[0].number,
                body: report
              });
            } else {
              // Make the skipped post visible instead of dropping it silently.
              core.warning("No open issue labelled 'security' and 'automated'; security report was not posted.");
            }

  # Frees disk space and restarts key services on nodes that need it.
  # NOTE(review): on scheduled runs the mode input is empty, so this job only
  # runs for manual dispatches — confirm that is intended given the header
  # describes auto-healing as part of the automation.
  auto-heal:
    name: Auto-Heal
    needs: fleet-probe
    runs-on: self-hosted
    if: github.event.inputs.mode == 'auto-heal' || github.event.inputs.mode == 'full-scan'
    steps:
      - name: Heal Nodes
        run: |
          ACTIONS=""

          for entry in "Alice:192.168.4.49:pi" "Cecilia:192.168.4.96:blackroad" "Lucidia:192.168.4.38:octavia"; do
            IFS=: read -r name ip user <<< "$entry"

            # Check disk > 90% (an unreachable node is skipped entirely)
            disk=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
              'df / | awk "NR==2{print \$5}" | tr -d "%"' 2>/dev/null) || continue
            # Treat anything that is not a plain integer as 0 so the numeric
            # comparison below cannot error out.
            case "$disk" in
              ''|*[!0-9]*) disk=0 ;;
            esac

            if [ "$disk" -ge 90 ]; then
              echo "🔧 ${name}: disk at ${disk}% — cleaning..."
              ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
                'sudo -n apt-get clean 2>/dev/null; sudo -n journalctl --vacuum-time=3d 2>/dev/null; docker system prune -f 2>/dev/null' || true
              ACTIONS+="- ${name}: cleaned disk (was ${disk}%)\n"
            fi

            # Check key services
            for svc in cloudflared ollama; do
              # "systemctl is-active" exits non-zero for inactive/failed units,
              # which would make ssh fail and hide the real state. The remote
              # command therefore always exits 0, so the local fallback fires
              # only when the SSH connection itself fails. Units that are not
              # installed report "absent" and are left alone.
              status=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
                "if systemctl cat ${svc} >/dev/null 2>&1; then systemctl is-active ${svc} 2>/dev/null || true; else echo absent; fi" 2>/dev/null) || status="unknown"

              if [ "$status" = "inactive" ] || [ "$status" = "failed" ]; then
                echo "🔧 ${name}: restarting ${svc}..."
                # Record the outcome that actually happened.
                if ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
                  "sudo -n systemctl restart ${svc}" 2>/dev/null; then
                  ACTIONS+="- ${name}: restarted ${svc}\n"
                else
                  ACTIONS+="- ${name}: failed to restart ${svc}\n"
                fi
              fi
            done
          done

          if [ -n "$ACTIONS" ]; then
            echo "Auto-heal actions taken:"
            echo -e "$ACTIONS"
          else
            echo "All services healthy — no action needed"
          fi