# Mirror of https://github.com/blackboxprogramming/BlackRoad-Operating-System.git
# Synced 2026-03-17 00:57:12 -05:00 (278 lines, 10 KiB, YAML)
# BlackRoad Fleet Automation
# Auto-probes fleet, collects KPIs, runs security scans, auto-heals
# Deploy to: .github/workflows/fleet-automation.yml in BlackRoad-Operating-System
# Copyright (c) 2025-2026 BlackRoad OS, Inc.

name: Fleet Automation

on:
  # Cron trigger: minute 0 of every 6th hour. Scheduled runs carry no
  # `inputs`, so `github.event.inputs.mode` is empty for them.
  schedule:
    - cron: '0 */6 * * *'
  # Manual trigger. The jobs in this workflow gate themselves on
  # `github.event.inputs.mode`.
  workflow_dispatch:
    inputs:
      mode:
        description: 'Operation mode'
        type: choice
        default: 'full-scan'
        options:
          - full-scan
          - quick-check
          - security-audit
          - auto-heal

# Token scopes granted to every job. `issues: write` is needed by the
# github-script steps that create issues and comments.
permissions:
  contents: write
  issues: write

env:
  # Title used when the fleet-probe job has to create the status issue.
  FLEET_ISSUE_TITLE: "Fleet Status Report"

jobs:
  # Probes every node over SSH, publishes a Markdown status table as a step
  # output, posts it to the tracking issue and writes a dated copy to the
  # workspace.
  fleet-probe:
    name: Fleet Probe
    runs-on: self-hosted
    outputs:
      alice_status: ${{ steps.probe.outputs.alice }}
      cecilia_status: ${{ steps.probe.outputs.cecilia }}
      octavia_status: ${{ steps.probe.outputs.octavia }}
      aria_status: ${{ steps.probe.outputs.aria }}
      lucidia_status: ${{ steps.probe.outputs.lucidia }}
      report: ${{ steps.probe.outputs.report }}
    steps:
      - uses: actions/checkout@v4

      - name: Probe All Nodes
        id: probe
        run: |
          # The report is assembled in a temp file with real newlines, so node
          # data is never re-interpreted by `echo -e`.
          report_file=$(mktemp)
          timestamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ')

          {
            echo "## Fleet Status Report"
            echo "**Timestamp:** ${timestamp}"
            echo
            echo "| Node | Status | CPU | Memory | Disk | Temp | Uptime |"
            echo "|------|--------|-----|--------|------|------|--------|"
          } > "$report_file"

          # probe_node NAME IP USER
          # Prints one Markdown table row for the node on stdout and records
          # "<lowercased name>=online|offline" as a step output.
          probe_node() {
            local name=$1 ip=$2 user=$3
            local key data cpu mem disk temp uptime_str

            # Output keys are lowercased to match the names the job-level
            # `outputs:` map reads (alice, cecilia, ...).
            key=$(printf '%s' "$name" | tr '[:upper:]' '[:lower:]')

            # Fields are picked out by line number below, so every remote
            # command must print exactly one newline-terminated line and fall
            # back to "n/a" instead of printing nothing.
            # NOTE(review): StrictHostKeyChecking=no disables host key
            # verification for this call — confirm that is intended.
            data=$(ssh -o ConnectTimeout=3 -o BatchMode=yes -o StrictHostKeyChecking=no "${user}@${ip}" \
              'echo UP
               cut -d" " -f1 /proc/loadavg 2>/dev/null || echo "n/a"
               free -m 2>/dev/null | awk "/Mem:/{printf \"%d/%dMB\\n\", \$3, \$2; ok=1} END{if(!ok) print \"n/a\"}"
               df -P / 2>/dev/null | awk "NR==2{print \$5; ok=1} END{if(!ok) print \"n/a\"}"
               if [ -r /sys/class/thermal/thermal_zone0/temp ]; then awk "{printf \"%.0f\\n\", \$1/1000}" /sys/class/thermal/thermal_zone0/temp; else echo "n/a"; fi
               uptime -p 2>/dev/null | sed "s/up //"' 2>/dev/null) || data="DOWN"

            if echo "$data" | head -n 1 | grep -q "UP"; then
              cpu=$(echo "$data" | sed -n '2p')
              mem=$(echo "$data" | sed -n '3p')
              disk=$(echo "$data" | sed -n '4p')
              temp=$(echo "$data" | sed -n '5p')
              if [ "$temp" != "n/a" ]; then
                temp="${temp}°C"
              fi
              uptime_str=$(echo "$data" | sed -n '6p')
              echo "| $name | ✅ Online | $cpu | $mem | $disk | $temp | $uptime_str |"
              echo "${key}=online" >> "$GITHUB_OUTPUT"
            else
              echo "| $name | ❌ Offline | — | — | — | — | unreachable |"
              echo "${key}=offline" >> "$GITHUB_OUTPUT"
            fi
          }

          {
            probe_node "Alice" "192.168.4.49" "pi"
            probe_node "Cecilia" "192.168.4.96" "blackroad"
            probe_node "Octavia" "192.168.4.100" "pi"
            probe_node "Aria" "192.168.4.98" "blackroad"
            probe_node "Lucidia" "192.168.4.38" "octavia"
          } >> "$report_file"

          # Check services on online nodes
          {
            echo
            echo "### Services"
          } >> "$report_file"
          for entry in "Alice:192.168.4.49:pi" "Cecilia:192.168.4.96:blackroad" "Lucidia:192.168.4.38:octavia"; do
            IFS=: read -r name ip user <<< "$entry"
            # `paste -sd,` joins the unit names without a trailing comma; the
            # `||` fallback only fires when ssh itself fails.
            services=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
              'systemctl list-units --state=running --no-pager --no-legend | grep -E "cloudflared|ollama|docker|gitea|pihole|nginx|postgres|nats" | awk "{print \$1}" | paste -sd, - | sed "s/,/, /g"' 2>/dev/null) || services="unreachable"
            echo "**${name}:** ${services:-none}" >> "$report_file"
          done

          # Multiline step output; a per-run delimiter cannot collide with a
          # line of the report itself.
          delimiter="REPORT_$(date +%s)_$$"
          {
            echo "report<<${delimiter}"
            cat "$report_file"
            echo "${delimiter}"
          } >> "$GITHUB_OUTPUT"
          rm -f "$report_file"

      - name: Update Fleet Status Issue
        uses: actions/github-script@v7
        env:
          # Passed through the environment rather than interpolated into the
          # script, so backticks or `${` in node data cannot alter the code.
          REPORT: ${{ steps.probe.outputs.report }}
        with:
          script: |
            const report = process.env.REPORT;
            const title = process.env.FLEET_ISSUE_TITLE;

            // Find existing fleet status issue
            const issues = await github.rest.issues.listForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,
              labels: 'fleet-status',
              state: 'open'
            });

            if (issues.data.length > 0) {
              // Append the report as a comment on the first open match
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: issues.data[0].number,
                body: report
              });
            } else {
              // Create the tracking issue (this script does not pin it)
              await github.rest.issues.create({
                owner: context.repo.owner,
                repo: context.repo.repo,
                title: title,
                body: report,
                labels: ['fleet-status', 'automated']
              });
            }

      # Writes the report into the checked-out workspace only; no step in
      # this job commits or pushes the file.
      - name: Save Probe Results
        env:
          REPORT: ${{ steps.probe.outputs.report }}
        run: |
          mkdir -p data/fleet-probes
          printf '%s\n' "$REPORT" > "data/fleet-probes/$(date -u +%Y-%m-%d).md"

# ---- job: kpi-collect (entry of the top-level `jobs:` map) ----
  # Checks out the KPI repository, runs its collectors and commits whatever
  # they changed.
  kpi-collect:
    name: KPI Collection
    needs: fleet-probe
    runs-on: self-hosted
    # Skipped only when a manual run selects 'security-audit'. Scheduled runs
    # have no inputs, so the comparison is true and the job runs.
    if: github.event.inputs.mode != 'security-audit'
    steps:
      - uses: actions/checkout@v4
        with:
          repository: blackboxprogramming/blackroad-os-kpis
          path: kpis

      - name: Run Collectors
        working-directory: kpis
        run: |
          # Collector failures stay non-fatal but are raised as a warning
          # annotation instead of a plain log line.
          if ! bash collectors/collect-all.sh 2>&1; then
            echo "::warning title=KPI collectors::Some collectors failed"
          fi

      - name: Commit Results
        working-directory: kpis
        run: |
          git config user.name "BlackRoad Automation"
          git config user.email "automation@blackroad.io"
          git add -A

          # Nothing staged means nothing to commit or push.
          if git diff --cached --quiet; then
            echo "No KPI changes to commit"
            exit 0
          fi

          git commit -m "KPI collection $(date -u +%Y-%m-%d)"

          # Push stays best-effort, but a failure is made visible.
          # NOTE(review): the checkout above passes no `token:`; confirm the
          # default credentials can push to this repository.
          if ! git push; then
            echo "::warning title=KPI push::Push failed — may need token"
          fi

# ---- job: security-scan (entry of the top-level `jobs:` map) ----
  # Collects a few security indicators from each node over SSH and posts them
  # as a comment on the open issue labelled `security` and `automated`.
  security-scan:
    name: Security Scan
    needs: fleet-probe
    runs-on: self-hosted
    # Skipped only when a manual run selects 'quick-check'.
    if: github.event.inputs.mode != 'quick-check'
    steps:
      - name: Scan Fleet
        run: |
          # RUNNER_TEMP is per-job scratch space, so a stale report from an
          # earlier run on this self-hosted machine cannot be picked up.
          report_file="${RUNNER_TEMP}/security-report.md"

          {
            echo "## Security Scan"
            echo "**$(date -u '+%Y-%m-%dT%H:%M:%SZ')**"
            echo

            for entry in "Alice:192.168.4.49:pi" "Cecilia:192.168.4.96:blackroad" "Lucidia:192.168.4.38:octavia"; do
              IFS=: read -r name ip user <<< "$entry"

              echo "### ${name}"

              # SSH key count ("?" when the node cannot be reached)
              keys=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
                'wc -l < ~/.ssh/authorized_keys 2>/dev/null || echo 0' 2>/dev/null) || keys="?"
              echo "- SSH keys: ${keys}"

              # Check for plaintext secrets in crontabs.
              # `grep -c` already prints 0 when nothing matches (and exits 1),
              # so the fallback must not print a second 0.
              secrets=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
                'crontab -l 2>/dev/null | grep -ciE "password|token|secret|api_key" || true' 2>/dev/null) || secrets="?"
              echo "- Crontab secrets: ${secrets}"

              # Firewall status.
              # A pipeline's status is that of `head`, so the "no ufw"
              # fallback is chosen by testing for empty output instead.
              fw=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
                'fw=$(sudo ufw status 2>/dev/null | head -n 1); echo "${fw:-no ufw}"' 2>/dev/null) || fw="?"
              echo "- Firewall: ${fw}"

              # Listening ports; `tail -n +2` drops the header row of `ss` so
              # it is not counted as a socket.
              ports=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
                'ss -tln 2>/dev/null | tail -n +2 | wc -l' 2>/dev/null) || ports="?"
              echo "- Listening TCP ports: ${ports}"
              echo
            done
          } | tee "$report_file"

      - name: Post Security Report
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const path = require('path');
            const reportPath = path.join(process.env.RUNNER_TEMP, 'security-report.md');
            const report = fs.readFileSync(reportPath, 'utf8');

            const issues = await github.rest.issues.listForRepo({
              owner: context.repo.owner,
              repo: context.repo.repo,
              labels: 'security,automated',
              state: 'open'
            });

            if (issues.data.length > 0) {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: issues.data[0].number,
                body: report
              });
            } else {
              // No tracking issue: keep the original "do not create one"
              // behaviour, but say so instead of dropping the report silently.
              core.warning('No open issue labelled "security" and "automated"; security report was not posted.');
            }

# ---- job: auto-heal (entry of the top-level `jobs:` map) ----
  # Frees disk space on nodes whose root filesystem is at least 90% full and
  # restarts key services that systemd reports as inactive or failed.
  auto-heal:
    name: Auto-Heal
    needs: fleet-probe
    runs-on: self-hosted
    # NOTE(review): scheduled runs carry no inputs, so this condition is false
    # on the cron trigger and the job only runs on manual dispatch. Add
    # `github.event_name == 'schedule' ||` if unattended healing is intended.
    if: github.event.inputs.mode == 'auto-heal' || github.event.inputs.mode == 'full-scan'
    steps:
      - name: Heal Nodes
        run: |
          actions=()

          for entry in "Alice:192.168.4.49:pi" "Cecilia:192.168.4.96:blackroad" "Lucidia:192.168.4.38:octavia"; do
            IFS=: read -r name ip user <<< "$entry"

            # Check disk > 90%; an unreachable node is skipped entirely.
            disk=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
              'df -P / | awk "NR==2{print \$5}" | tr -d "%"' 2>/dev/null) || continue

            # Treat empty or non-numeric output as 0 so the integer test
            # below cannot error out.
            case "$disk" in
              ''|*[!0-9]*) disk=0 ;;
            esac

            if [ "$disk" -ge 90 ]; then
              echo "🔧 ${name}: disk at ${disk}% — cleaning..."
              ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
                'sudo apt-get clean 2>/dev/null; sudo journalctl --vacuum-time=3d 2>/dev/null; docker system prune -f 2>/dev/null' || true
              actions+=("- ${name}: cleaned disk (was ${disk}%)")
            fi

            # Check key services
            for svc in cloudflared ollama; do
              # `systemctl is-active` exits non-zero for inactive/failed units.
              # Without the remote `|| true`, ssh would propagate that status
              # and the local `||` would overwrite the captured state with
              # "unknown", so the restart below could never run.
              # Units that do not exist on the node are reported as "absent"
              # and left alone.
              status=$(ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
                "systemctl cat ${svc} >/dev/null 2>&1 || { echo absent; exit 0; }; systemctl is-active ${svc} 2>/dev/null || true" 2>/dev/null) || status="unknown"

              if [ "$status" = "inactive" ] || [ "$status" = "failed" ]; then
                echo "🔧 ${name}: restarting ${svc}..."
                # Record what actually happened, not just what was attempted.
                if ssh -o ConnectTimeout=3 -o BatchMode=yes "${user}@${ip}" \
                  "sudo systemctl restart ${svc}" 2>/dev/null; then
                  actions+=("- ${name}: restarted ${svc}")
                else
                  actions+=("- ${name}: restart of ${svc} failed")
                fi
              fi
            done
          done

          if [ "${#actions[@]}" -gt 0 ]; then
            echo "Auto-heal actions taken:"
            printf '%s\n' "${actions[@]}"
          else
            echo "All services healthy — no action needed"
          fi