# Source listing: blackroad-operating-system/.github/workflows/fleet-automation.yml
# (YAML, 278 lines, 10 KiB as shown by the file viewer)

# BlackRoad Fleet Automation
# Auto-probes fleet, collects KPIs, runs security scans, auto-heals
# Deploy to: .github/workflows/fleet-automation.yml in BlackRoad-Operating-System
# Copyright (c) 2025-2026 BlackRoad OS, Inc.
name: Fleet Automation

# Triggers: a cron schedule plus a manual dispatch that lets the operator pick
# an operation mode. The jobs' `if:` conditions read github.event.inputs.mode.
on:
  schedule:
    # Minute 0 of every sixth hour.
    - cron: "0 */6 * * *"
  workflow_dispatch:
    inputs:
      mode:
        type: choice
        description: Operation mode
        default: full-scan
        options:
          - full-scan
          - quick-check
          - security-audit
          - auto-heal

# Scopes requested for the workflow token.
permissions:
  contents: write
  issues: write

env:
  # Title given to the fleet status issue when a new one has to be opened.
  FLEET_ISSUE_TITLE: Fleet Status Report

jobs:
# ---- Job: fleet-probe -------------------------------------------------------
# Probes every node over SSH, publishes a Markdown status report as a job
# output, posts it to the fleet-status issue and writes a dated copy into the
# workspace.
  fleet-probe:
    name: Fleet Probe
    runs-on: self-hosted
    outputs:
      alice_status: ${{ steps.probe.outputs.alice }}
      cecilia_status: ${{ steps.probe.outputs.cecilia }}
      octavia_status: ${{ steps.probe.outputs.octavia }}
      aria_status: ${{ steps.probe.outputs.aria }}
      lucidia_status: ${{ steps.probe.outputs.lucidia }}
      report: ${{ steps.probe.outputs.report }}
    steps:
      - uses: actions/checkout@v4

      - name: Probe All Nodes
        id: probe
        run: |
          report_file="${RUNNER_TEMP:-/tmp}/fleet-report.md"
          timestamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ')

          # probe_node NAME IP USER
          # Prints one Markdown table row for the node on stdout and records
          # "<lower-cased name>=online|offline" as a step output.
          probe_node() {
            local name=$1 ip=$2 user=$3
            local key data cpu mem disk temp uptime_str

            # Lower-case the key so it matches the names read by the job-level
            # outputs map (alice, cecilia, ...) exactly.
            key=$(printf '%s' "$name" | tr '[:upper:]' '[:lower:]')

            # The remote script prints exactly one line per metric: "emit"
            # always terminates the line and substitutes n/a for a metric that
            # is unavailable, so the line numbers parsed below never shift.
            # NOTE(review): StrictHostKeyChecking=no accepts any host key;
            # consider accept-new once known_hosts is provisioned.
            data=$(ssh -o ConnectTimeout=3 -o BatchMode=yes -o StrictHostKeyChecking=no "${user}@${ip}" \
              'emit() { echo "${1:-n/a}"; }
               echo UP
               emit "$(cut -d" " -f1 /proc/loadavg 2>/dev/null)"
               emit "$(free -m 2>/dev/null | awk "/Mem:/{printf \"%d/%dMB\", \$3, \$2}")"
               emit "$(df / 2>/dev/null | awk "NR==2{print \$5}")"
               emit "$(awk "{printf \"%.0f\", \$1/1000; exit}" /sys/class/thermal/thermal_zone0/temp 2>/dev/null)"
               emit "$(uptime -p 2>/dev/null | sed "s/up //")"' 2>/dev/null) || data="DOWN"

            if [ "$(sed -n '1p' <<< "$data")" = "UP" ]; then
              cpu=$(sed -n '2p' <<< "$data")
              mem=$(sed -n '3p' <<< "$data")
              disk=$(sed -n '4p' <<< "$data")
              temp=$(sed -n '5p' <<< "$data")
              uptime_str=$(sed -n '6p' <<< "$data")
              # Only decorate real readings with a unit; leave "n/a" untouched.
              if [[ "$temp" =~ ^-?[0-9]+$ ]]; then
                temp="${temp}°C"
              fi
              echo "| $name | ✅ Online | $cpu | $mem | $disk | $temp | $uptime_str |"
              echo "${key}=online" >> "$GITHUB_OUTPUT"
            else
              echo "| $name | ❌ Offline | — | — | — | — | unreachable |"
              echo "${key}=offline" >> "$GITHUB_OUTPUT"
            fi
          }

          # Build the report with real newlines in a file, so that text coming
          # back from the nodes is never re-interpreted for backslash escapes.
          {
            echo "## Fleet Status Report"
            echo "**Timestamp:** ${timestamp}"
            echo
            echo "| Node | Status | CPU | Memory | Disk | Temp | Uptime |"
            echo "|------|--------|-----|--------|------|------|--------|"
            for entry in \
              "Alice:192.168.4.49:pi" \
              "Cecilia:192.168.4.96:blackroad" \
              "Octavia:192.168.4.100:pi" \
              "Aria:192.168.4.98:blackroad" \
              "Lucidia:192.168.4.38:octavia"; do
              IFS=: read -r name ip user <<< "$entry"
              probe_node "$name" "$ip" "$user"
            done

            # Running services of interest on a subset of the nodes.
            echo
            echo "### Services"
            for entry in "Alice:192.168.4.49:pi" "Cecilia:192.168.4.96:blackroad" "Lucidia:192.168.4.38:octavia"; do
              IFS=: read -r name ip user <<< "$entry"
              services=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
                'systemctl list-units --state=running --no-pager --no-legend | grep -E "cloudflared|ollama|docker|gitea|pihole|nginx|postgres|nats" | awk "{print \$1}" | paste -sd "," - | sed "s/,/, /g"' 2>/dev/null) || services="unreachable"
              echo "**${name}:** ${services:-none}"
            done
          } > "$report_file"

          # Multi-line step output. The delimiter is randomised so a report
          # line can never terminate the value early.
          delimiter="FLEET_REPORT_EOF_${RANDOM}${RANDOM}"
          {
            echo "report<<${delimiter}"
            cat "$report_file"
            echo "${delimiter}"
          } >> "$GITHUB_OUTPUT"

      - name: Update Fleet Status Issue
        uses: actions/github-script@v7
        env:
          # Passed through the environment rather than spliced into the script
          # text, so backticks or quotes in node output cannot alter the code.
          REPORT: ${{ steps.probe.outputs.report }}
        with:
          script: |
            const report = (process.env.REPORT || '').trim();
            const title = process.env.FLEET_ISSUE_TITLE;
            const { owner, repo } = context.repo;

            if (!report) {
              core.setFailed('Probe step produced an empty report; nothing to post.');
              return;
            }

            // Find the open fleet status issue. The issues endpoint also
            // returns pull requests, so those are skipped.
            const issues = await github.rest.issues.listForRepo({
              owner,
              repo,
              labels: 'fleet-status',
              state: 'open'
            });
            const tracker = issues.data.find((item) => !item.pull_request);

            if (tracker) {
              // Append the report to the existing issue as a comment.
              await github.rest.issues.createComment({
                owner,
                repo,
                issue_number: tracker.number,
                body: report
              });
            } else {
              // No tracker yet: open one (it is not pinned by this script).
              await github.rest.issues.create({
                owner,
                repo,
                title: title,
                body: report,
                labels: ['fleet-status', 'automated']
              });
            }

      # NOTE(review): the file is written into the workspace only; no step
      # visible in this job commits or uploads it. Confirm that is intended.
      - name: Save Probe Results
        env:
          REPORT: ${{ steps.probe.outputs.report }}
        run: |
          mkdir -p data/fleet-probes
          printf '%s\n' "$REPORT" > "data/fleet-probes/$(date -u +%Y-%m-%d).md"
# ---- Job: kpi-collect -------------------------------------------------------
# Checks out the KPI repository, runs its collectors and commits whatever they
# changed. Collector and push failures are best-effort: they are reported as
# warning annotations instead of failing the job.
  kpi-collect:
    name: KPI Collection
    needs: fleet-probe
    runs-on: self-hosted
    # Runs for every mode except a manually selected security-audit.
    if: github.event.inputs.mode != 'security-audit'
    steps:
      - uses: actions/checkout@v4
        with:
          repository: blackboxprogramming/blackroad-os-kpis
          path: kpis

      - name: Run Collectors
        working-directory: kpis
        run: |
          # Keep going when a collector fails, but make the failure visible
          # on the run summary rather than only in the raw log.
          if ! bash collectors/collect-all.sh 2>&1; then
            echo "::warning title=KPI collection::Some collectors failed"
          fi

      - name: Commit Results
        working-directory: kpis
        run: |
          git config user.name "BlackRoad Automation"
          git config user.email "automation@blackroad.io"
          git add -A

          # Nothing staged means nothing to commit or push.
          if git diff --cached --quiet; then
            echo "No KPI changes to commit"
            exit 0
          fi

          git commit -m "KPI collection $(date -u +%Y-%m-%d)"

          # NOTE(review): the checkout above uses the default token; pushing to
          # a different repository presumably needs a dedicated token.
          if ! git push; then
            echo "::warning title=KPI push::Push failed — may need token"
          fi
# ---- Job: security-scan -----------------------------------------------------
# Collects a few security indicators from each node over SSH, writes them to a
# Markdown report and posts that report to the open security tracking issue.
  security-scan:
    name: Security Scan
    needs: fleet-probe
    runs-on: self-hosted
    # Runs for every mode except a manually selected quick-check.
    if: github.event.inputs.mode != 'quick-check'
    steps:
      - name: Scan Fleet
        id: scan
        run: |
          # Per-run temp directory instead of a fixed /tmp path, so runs on a
          # shared self-hosted runner cannot clobber each other's report.
          report_file="${RUNNER_TEMP:-/tmp}/security-report.md"
          ssh_opts=(-o ConnectTimeout=3 -o BatchMode=yes)

          # remote TARGET COMMAND: run COMMAND on TARGET and print its stdout.
          remote() {
            ssh "${ssh_opts[@]}" "$1" "$2" 2>/dev/null
          }

          {
            echo "## Security Scan"
            echo "**$(date -u '+%Y-%m-%dT%H:%M:%SZ')**"
            echo
            for entry in "Alice:192.168.4.49:pi" "Cecilia:192.168.4.96:blackroad" "Lucidia:192.168.4.38:octavia"; do
              IFS=: read -r name ip user <<< "$entry"
              target="${user}@${ip}"
              echo "### ${name}"

              # Number of lines in authorized_keys (0 when the file is absent).
              keys=$(remote "$target" \
                'wc -l < ~/.ssh/authorized_keys 2>/dev/null || echo 0') || keys="?"
              echo "- SSH keys: ${keys}"

              # Crontab lines that look like they embed a secret. grep -c
              # already prints 0 and exits 1 when nothing matches, so only the
              # exit status is neutralised; echoing a second 0 would duplicate
              # the count.
              secrets=$(remote "$target" \
                'crontab -l 2>/dev/null | grep -ciE "password|token|secret|api_key" || true') || secrets="?"
              echo "- Crontab secrets: ${secrets:-?}"

              # First line of "ufw status". The pipeline's status is head's,
              # so an empty result is what signals that ufw or sudo failed.
              fw=$(remote "$target" \
                'sudo -n ufw status 2>/dev/null | head -n 1') || fw="?"
              echo "- Firewall: ${fw:-no ufw or sudo denied}"

              # Listening TCP sockets, excluding the header line that ss prints.
              ports=$(remote "$target" \
                'command -v ss >/dev/null 2>&1 && ss -tln 2>/dev/null | tail -n +2 | wc -l || echo "?"') || ports="?"
              echo "- Listening TCP ports: ${ports}"
              echo
            done
          } > "$report_file"

          cat "$report_file"
          echo "report_path=${report_file}" >> "$GITHUB_OUTPUT"

      - name: Post Security Report
        uses: actions/github-script@v7
        env:
          REPORT_PATH: ${{ steps.scan.outputs.report_path }}
        with:
          script: |
            const fs = require('fs');
            const report = fs.readFileSync(process.env.REPORT_PATH, 'utf8');
            const { owner, repo } = context.repo;

            // The issues endpoint also returns pull requests; skip those.
            const issues = await github.rest.issues.listForRepo({
              owner,
              repo,
              labels: 'security,automated',
              state: 'open'
            });
            const tracker = issues.data.find((item) => !item.pull_request);

            if (!tracker) {
              // Keep the original behaviour (no issue is created here), but
              // say so instead of dropping the report silently.
              core.warning('No open issue labelled "security" and "automated"; report was not posted.');
              return;
            }

            await github.rest.issues.createComment({
              owner,
              repo,
              issue_number: tracker.number,
              body: report
            });
# ---- Job: auto-heal ---------------------------------------------------------
# Frees disk space on nodes whose root filesystem is at least 90% full and
# restarts key services that report inactive or failed.
  auto-heal:
    name: Auto-Heal
    needs: fleet-probe
    runs-on: self-hosted
    # NOTE(review): on schedule-triggered runs there are no dispatch inputs, so
    # this condition is false and scheduled runs never auto-heal. Confirm that
    # is intended, given that the dispatch default (full-scan) does auto-heal.
    if: github.event.inputs.mode == 'auto-heal' || github.event.inputs.mode == 'full-scan'
    steps:
      - name: Heal Nodes
        run: |
          actions=()
          ssh_opts=(-o ConnectTimeout=3 -o BatchMode=yes)

          for entry in "Alice:192.168.4.49:pi" "Cecilia:192.168.4.96:blackroad" "Lucidia:192.168.4.38:octavia"; do
            IFS=: read -r name ip user <<< "$entry"
            target="${user}@${ip}"

            # Root filesystem usage in percent; an unreachable node is skipped.
            disk=$(ssh "${ssh_opts[@]}" "$target" \
              'df / | awk "NR==2{print \$5}" | tr -d "%"' 2>/dev/null) || continue

            # Compare only when the value really is a number.
            if [[ "$disk" =~ ^[0-9]+$ ]] && [ "$disk" -ge 90 ]; then
              echo "🔧 ${name}: disk at ${disk}% — cleaning..."
              ssh "${ssh_opts[@]}" "$target" \
                'sudo apt-get clean 2>/dev/null; sudo journalctl --vacuum-time=3d 2>/dev/null; docker system prune -f 2>/dev/null' || true
              actions+=("- ${name}: cleaned disk (was ${disk}%)")
            fi

            # Check key services
            for svc in cloudflared ollama; do
              # "systemctl is-active" prints the state but exits non-zero for
              # inactive/failed units, so the exit status must not replace the
              # captured text. Units that are not installed are filtered out
              # first ("systemctl cat" fails), which leaves status empty.
              status=$(ssh "${ssh_opts[@]}" "$target" \
                "systemctl cat ${svc} >/dev/null 2>&1 && systemctl is-active ${svc}" 2>/dev/null) || true
              status=${status:-unknown}

              if [ "$status" = "inactive" ] || [ "$status" = "failed" ]; then
                echo "🔧 ${name}: restarting ${svc}..."
                # Record the restart only when it actually succeeded.
                if ssh "${ssh_opts[@]}" "$target" "sudo -n systemctl restart ${svc}" 2>/dev/null; then
                  actions+=("- ${name}: restarted ${svc}")
                else
                  echo "::warning title=Auto-heal::${name}: failed to restart ${svc}"
                  actions+=("- ${name}: restart of ${svc} failed")
                fi
              fi
            done
          done

          if [ "${#actions[@]}" -gt 0 ]; then
            echo "Auto-heal actions taken:"
            printf '%s\n' "${actions[@]}"
          else
            echo "All services healthy — no action needed"
          fi