sync: 2026-03-15 23:00 — 32 files from Alexandria
Some checks failed
Lint & Format / detect (push) Failing after 32s
Monorepo Lint / lint-shell (push) Failing after 31s
Monorepo Lint / lint-js (push) Failing after 30s
Lint & Format / js-lint (push) Has been skipped
Lint & Format / py-lint (push) Has been skipped
Lint & Format / sh-lint (push) Has been skipped
Lint & Format / go-lint (push) Has been skipped

RoadChain-SHA2048: 692327ce2e990f37
RoadChain-Identity: alexa@sovereign
RoadChain-Full: 692327ce2e990f37649b83e948241ac858c0d07146c6b42043e4770d638c44d5bada5639ad82c7aa8911d7042912c1d75b6bbce9a453637621b3903dc912a3a9537696cedf7a0870e3bf962ca44677793082aaae5c5433615885ad20fab1e80417202d11e93284483551ba9558f06809d2f3fa53c00a657277d7c183abe3ba187c1af6856a455071771757cca67ff2b74c5f855f23dd8cc8f5b3596c966b2344361fcbb74843e9d9d9ad66c5321ef64ce787f9d255d11e0d4e0ee571af4e09697964e22f6f629a11279b315c9a4563860b169ad93fa500b485297516ef2ba2039f76348c0d547cfa182e9b0bccee73f5e8b7db7e33d61e8199bb4464c2c30d03
This commit is contained in:
2026-03-15 23:00:06 -05:00
parent eb1d7952f7
commit ac7b9b5958
32 changed files with 1093 additions and 229 deletions

View File

@@ -31,3 +31,4 @@
0 8 1 * * bash /Users/alexa/blackroad-operator/scripts/corporate-autopilot.sh >> /Users/alexa/blackroad-operator/docs/corporate/autopilot.log 2>&1
5 6 * * * cd /Users/alexa/blackroad-os-kpis && bash reports/slack-notify.sh >> /Users/alexa/blackroad-os-kpis/data/cron.log 2>&1
7,37 * * * * /Users/alexa/blackroad-git-sync.sh >> /Users/alexa/.blackroad/logs/git-sync.log 2>&1
*/5 * * * * /usr/bin/flock -n /tmp/fleet-coord.lock /Users/alexa/blackroad-fleet-coordinator.sh >> /Users/alexa/.blackroad/logs/coordinator.log 2>&1

View File

@@ -1,50 +1,50 @@
[2026-03-15 22:03:01] [BEAT] [alice] load=0.62 mem=3381/3794MB temp=32.6C disk=80%
[2026-03-15 22:04:01] [BEAT] [alice] load=0.76 mem=3382/3794MB temp=32.1C disk=80%
[2026-03-15 22:05:01] [BEAT] [alice] load=0.68 mem=3369/3794MB temp=34.1C disk=80%
[2026-03-15 22:06:01] [BEAT] [alice] load=0.59 mem=3378/3794MB temp=33.1C disk=80%
[2026-03-15 22:06:16] [BEAT] [alice] load=0.46 mem=3374/3794MB temp=33.6C disk=80%
[2026-03-15 22:06:16] [BEAT] [alice] load=0.46 mem=3372/3794MB temp=33.6C disk=80%
[2026-03-15 22:07:01] [BEAT] [alice] load=0.41 mem=3381/3794MB temp=32.6C disk=80%
[2026-03-15 22:08:01] [BEAT] [alice] load=0.76 mem=3378/3794MB temp=32.6C disk=80%
[2026-03-15 22:09:01] [BEAT] [alice] load=0.49 mem=3380/3794MB temp=33.6C disk=80%
[2026-03-15 22:10:02] [FLEET] [alice] Starting cross-node health check
[2026-03-15 22:10:02] [BEAT] [alice] load=0.55 mem=3359/3794MB temp=36.0C disk=80%
[2026-03-15 22:10:04] [FLEET] [alice] octavia: DOWN (no ping response)
[2026-03-15 22:10:04] [FLEET] [alice] cecilia: UP temp=43C mem=4779MB disk=19%
[2026-03-15 22:10:05] [FLEET] [alice] gematria: UP temp=C mem=4213MB disk=67%
[2026-03-15 22:10:06] [FLEET] [alice] lucidia: UP temp=58C mem=3614MB disk=33%
[2026-03-15 22:10:08] [FLEET] [alice] aria: DOWN (no ping response)
[2026-03-15 22:10:09] [FLEET] [alice] anastasia: UP temp=C mem=201MB disk=69%
[2026-03-15 22:11:01] [BEAT] [alice] load=0.45 mem=3380/3794MB temp=33.1C disk=80%
[2026-03-15 22:11:25] [BEAT] [alice] load=0.83 mem=3364/3794MB temp=36.0C disk=80%
[2026-03-15 22:11:25] [BEAT] [alice] load=0.83 mem=3362/3794MB temp=37.0C disk=80%
[2026-03-15 22:12:01] [BEAT] [alice] load=0.80 mem=3381/3794MB temp=34.1C disk=80%
[2026-03-15 22:13:01] [BEAT] [alice] load=1.23 mem=3379/3794MB temp=36.0C disk=80%
[2026-03-15 22:14:01] [BEAT] [alice] load=0.78 mem=3372/3794MB temp=34.1C disk=80%
[2026-03-15 22:15:02] [BEAT] [alice] load=0.73 mem=3336/3794MB temp=37.5C disk=80%
[2026-03-15 22:15:48] [DIAL] [alice] Switchboard unreachable
[2026-03-15 22:16:01] [BEAT] [alice] load=0.81 mem=3371/3794MB temp=34.6C disk=80%
[2026-03-15 22:16:08] [DIAL] [alice] Switchboard unreachable
[2026-03-15 22:16:34] [BEAT] [alice] load=1.11 mem=3357/3794MB temp=35.0C disk=80%
[2026-03-15 22:16:34] [BEAT] [alice] load=1.11 mem=3371/3794MB temp=35.0C disk=80%
[2026-03-15 22:17:01] [BEAT] [alice] load=1.13 mem=3372/3794MB temp=36.0C disk=80%
[2026-03-15 22:18:01] [BEAT] [alice] load=0.84 mem=3378/3794MB temp=34.1C disk=80%
[2026-03-15 22:19:01] [BEAT] [alice] load=1.27 mem=3379/3794MB temp=34.6C disk=80%
[2026-03-15 22:20:01] [FLEET] [alice] Starting cross-node health check
[2026-03-15 22:20:01] [BEAT] [alice] load=0.69 mem=3360/3794MB temp=36.0C disk=80%
[2026-03-15 22:20:03] [FLEET] [alice] octavia: DOWN (no ping response)
[2026-03-15 22:20:04] [FLEET] [alice] cecilia: UP temp=42C mem=4816MB disk=19%
[2026-03-15 22:20:05] [FLEET] [alice] gematria: UP temp=C mem=4209MB disk=67%
[2026-03-15 22:20:05] [FLEET] [alice] lucidia: UP temp=66C mem=3520MB disk=33%
[2026-03-15 22:20:05] [FLEET] [alice] aria: DOWN (no ping response)
[2026-03-15 22:20:06] [FLEET] [alice] anastasia: UP temp=C mem=277MB disk=69%
[2026-03-15 22:21:01] [BEAT] [alice] load=0.65 mem=3379/3794MB temp=36.0C disk=80%
[2026-03-15 22:21:43] [BEAT] [alice] load=0.63 mem=3370/3794MB temp=35.5C disk=80%
[2026-03-15 22:21:43] [BEAT] [alice] load=0.98 mem=3368/3794MB temp=35.5C disk=80%
[2026-03-15 22:22:02] [BEAT] [alice] load=0.83 mem=3375/3794MB temp=33.6C disk=80%
[2026-03-15 22:23:04] [BEAT] [alice] load=1.69 mem=3379/3794MB temp=34.1C disk=80%
[2026-03-15 22:24:01] [BEAT] [alice] load=1.14 mem=3372/3794MB temp=33.6C disk=80%
[2026-03-15 22:25:01] [BEAT] [alice] load=0.95 mem=3370/3794MB temp=35.5C disk=80%
[2026-03-15 22:26:01] [BEAT] [alice] load=0.97 mem=3375/3794MB temp=36.5C disk=80%
[2026-03-15 22:26:52] [BEAT] [alice] load=0.59 mem=3373/3794MB temp=34.6C disk=80%
[2026-03-15 22:26:52] [BEAT] [alice] load=0.59 mem=3370/3794MB temp=35.5C disk=80%
[2026-03-15 22:40:02] [FLEET] [alice] octavia: UP temp=36C mem=6762MB disk=67%
[2026-03-15 22:40:02] [HEAL] [alice] Healed 1 services
[2026-03-15 22:40:03] [FLEET] [alice] cecilia: UP temp=57C mem=5488MB disk=19%
[2026-03-15 22:40:04] [FLEET] [alice] gematria: UP temp=C mem=4192MB disk=67%
[2026-03-15 22:40:05] [FLEET] [alice] lucidia: UP temp=62C mem=3515MB disk=33%
[2026-03-15 22:40:07] [FLEET] [alice] aria: DOWN (no ping response)
[2026-03-15 22:40:08] [FLEET] [alice] anastasia: UP temp=C mem=275MB disk=69%
[2026-03-15 22:41:01] [BEAT] [alice] load=0.87 mem=3370/3794MB temp=35.5C disk=80%
[2026-03-15 22:42:01] [BEAT] [alice] load=0.90 mem=3369/3794MB temp=34.6C disk=80%
[2026-03-15 22:42:19] [BEAT] [alice] load=0.70 mem=3370/3794MB temp=33.6C disk=80%
[2026-03-15 22:42:19] [BEAT] [alice] load=0.70 mem=3370/3794MB temp=34.1C disk=80%
[2026-03-15 22:43:01] [BEAT] [alice] load=1.07 mem=3367/3794MB temp=36.5C disk=80%
[2026-03-15 22:44:01] [BEAT] [alice] load=1.07 mem=3369/3794MB temp=35.0C disk=80%
[2026-03-15 22:45:02] [HEAL] [alice] Service blackroad-agent is DOWN — restarting
[2026-03-15 22:45:02] [BEAT] [alice] load=1.46 mem=3353/3794MB temp=37.5C disk=80%
[2026-03-15 22:45:02] [HEAL] [alice] Service blackroad-agent restarted successfully
[2026-03-15 22:45:03] [HEAL] [alice] Healed 1 services
[2026-03-15 22:45:48] [DIAL] [alice] Switchboard unreachable
[2026-03-15 22:46:01] [BEAT] [alice] load=4.01 mem=3367/3794MB temp=33.6C disk=80%
[2026-03-15 22:46:28] [DIAL] [alice] Switchboard unreachable
[2026-03-15 22:47:01] [BEAT] [alice] load=1.90 mem=3367/3794MB temp=34.6C disk=80%
[2026-03-15 22:47:29] [BEAT] [alice] load=1.62 mem=3364/3794MB temp=36.5C disk=80%
[2026-03-15 22:47:30] [BEAT] [alice] load=1.62 mem=3364/3794MB temp=36.0C disk=80%
[2026-03-15 22:48:01] [BEAT] [alice] load=1.23 mem=3369/3794MB temp=33.1C disk=80%
[2026-03-15 22:49:01] [BEAT] [alice] load=0.84 mem=3357/3794MB temp=35.0C disk=80%
[2026-03-15 22:50:01] [FLEET] [alice] Starting cross-node health check
[2026-03-15 22:50:01] [HEAL] [alice] Service blackroad-agent is DOWN — restarting
[2026-03-15 22:50:01] [BEAT] [alice] load=0.87 mem=3336/3794MB temp=36.0C disk=80%
[2026-03-15 22:50:01] [HEAL] [alice] Service blackroad-agent restarted successfully
[2026-03-15 22:50:02] [FLEET] [alice] octavia: UP temp=37C mem=6738MB disk=67%
[2026-03-15 22:50:02] [HEAL] [alice] Healed 1 services
[2026-03-15 22:50:02] [FLEET] [alice] cecilia: UP temp=41C mem=5051MB disk=19%
[2026-03-15 22:50:04] [FLEET] [alice] gematria: UP temp=C mem=4188MB disk=67%
[2026-03-15 22:50:04] [FLEET] [alice] lucidia: UP temp=55C mem=3528MB disk=33%
[2026-03-15 22:50:06] [FLEET] [alice] aria: DOWN (no ping response)
[2026-03-15 22:50:07] [FLEET] [alice] anastasia: UP temp=C mem=275MB disk=69%
[2026-03-15 22:51:01] [BEAT] [alice] load=2.30 mem=3355/3794MB temp=34.1C disk=80%
[2026-03-15 22:52:01] [BEAT] [alice] load=1.24 mem=3360/3794MB temp=33.1C disk=80%
[2026-03-15 22:52:39] [BEAT] [alice] load=0.96 mem=3355/3794MB temp=33.1C disk=80%
[2026-03-15 22:52:39] [BEAT] [alice] load=0.96 mem=3355/3794MB temp=34.6C disk=80%
[2026-03-15 22:53:01] [BEAT] [alice] load=0.92 mem=3355/3794MB temp=34.6C disk=80%
[2026-03-15 22:54:02] [BEAT] [alice] load=1.25 mem=3358/3794MB temp=34.1C disk=80%
[2026-03-15 22:55:01] [HEAL] [alice] Service blackroad-agent is DOWN — restarting
[2026-03-15 22:55:01] [BEAT] [alice] load=0.78 mem=3352/3794MB temp=36.0C disk=80%
[2026-03-15 22:55:01] [HEAL] [alice] Service blackroad-agent restarted successfully
[2026-03-15 22:55:01] [HEAL] [alice] Healed 1 services
[2026-03-15 22:56:02] [BEAT] [alice] load=0.90 mem=3356/3794MB temp=33.1C disk=80%
[2026-03-15 22:57:01] [BEAT] [alice] load=1.10 mem=3354/3794MB temp=34.1C disk=80%
[2026-03-15 22:57:49] [BEAT] [alice] load=0.74 mem=3354/3794MB temp=31.6C disk=80%
[2026-03-15 22:57:49] [BEAT] [alice] load=0.74 mem=3354/3794MB temp=32.1C disk=80%

View File

@@ -1 +1 @@
{"node":"alice","ts":"2026-03-16T03:26:52Z","load":0.59,"mem_free_mb":3370,"mem_total_mb":3794,"temp_c":35.5,"disk_pct":80,"throttle":"0x0"}
{"node":"alice","ts":"2026-03-16T03:57:49Z","load":0.74,"mem_free_mb":3354,"mem_total_mb":3794,"temp_c":32.1,"disk_pct":80,"throttle":"0x0"}

View File

@@ -22,7 +22,7 @@ MY_IP=$(hostname -I 2>/dev/null | awk '{print $1}' || echo "unknown")
case "$MY_IP" in
192.168.4.49*) NODE_NAME="alice" ;;
192.168.4.96*) NODE_NAME="cecilia" ;;
192.168.4.100*) NODE_NAME="octavia" ;;
192.168.4.101*) NODE_NAME="octavia" ;;
192.168.4.98*) NODE_NAME="aria" ;;
192.168.4.38*) NODE_NAME="lucidia" ;;
esac
@@ -41,7 +41,7 @@ mkdir -p "$AUTONOMY_DIR"/{state,fleet,health,tasks/pending,tasks/completed,logs}
declare -A FLEET_IPS
FLEET_IPS[alice]="192.168.4.49"
FLEET_IPS[cecilia]="192.168.4.96"
FLEET_IPS[octavia]="192.168.4.100"
FLEET_IPS[octavia]="192.168.4.101"
FLEET_IPS[aria]="192.168.4.98"
FLEET_IPS[lucidia]="192.168.4.38"
FLEET_IPS[anastasia]="174.138.44.45"

View File

@@ -1,8 +1,8 @@
LISTEN 0 5 0.0.0.0:8184 0.0.0.0:*
LISTEN 0 5 0.0.0.0:8095 0.0.0.0:* users:(("python3",pid=608,fd=3))
LISTEN 0 5 0.0.0.0:8095 0.0.0.0:* users:(("python3",pid=18662,fd=3))
LISTEN 0 511 0.0.0.0:8080 0.0.0.0:*
LISTEN 0 511 0.0.0.0:8083 0.0.0.0:* users:(("node /usr/lib/n",pid=3890,fd=20))
LISTEN 0 511 127.0.0.1:6379 0.0.0.0:*
LISTEN 0 511 0.0.0.0:8083 0.0.0.0:* users:(("node /usr/lib/n",pid=3890,fd=20))
LISTEN 0 5 0.0.0.0:4010 0.0.0.0:* users:(("python3",pid=610,fd=3))
LISTEN 0 5 0.0.0.0:8011 0.0.0.0:* users:(("python3",pid=828,fd=3))
LISTEN 0 5 0.0.0.0:8010 0.0.0.0:*
@@ -10,7 +10,7 @@ LISTEN 0 5 0.0.0.0:8013 0.0.0.0:* users:(("python3",pid
LISTEN 0 5 0.0.0.0:8012 0.0.0.0:*
LISTEN 0 5 0.0.0.0:8014 0.0.0.0:*
LISTEN 0 2048 0.0.0.0:8001 0.0.0.0:* users:(("python3",pid=617,fd=6))
LISTEN 0 5 0.0.0.0:7890 0.0.0.0:* users:(("python3",pid=21795,fd=5))
LISTEN 0 5 0.0.0.0:7890 0.0.0.0:* users:(("python3",pid=31841,fd=5))
LISTEN 0 200 0.0.0.0:443 0.0.0.0:*
LISTEN 0 1024 0.0.0.0:6333 0.0.0.0:*
LISTEN 0 128 0.0.0.0:6334 0.0.0.0:*

View File

@@ -1,9 +1,9 @@
avahi-daemon.service
blackroad-agent.service
blackroad-agents-proxy.service
blackroad-agents.service
blackroad-nats-agent.service
blackroad-operator.service
blackroad-salesforce-agent.service
blackroad-status.service
blackroad-task-queue-v2.service
blackroad-task-worker.service
@@ -26,6 +26,7 @@ prism-agent.service
qdrant.service
redis-server.service
rng-tools-debian.service
road-phone.service
roadnet-failover.service
rsyslog.service
rtkit-daemon.service

View File

@@ -1,19 +1,19 @@
{
"hostname": "alice",
"ts": "2026-03-16T03:26:55Z",
"uptime_seconds": 2553,
"ts": "2026-03-16T03:57:50Z",
"uptime_seconds": 4408,
"kernel": "6.1.21-v8+",
"temp_c": 35.0,
"temp_c": 32.1,
"memory_mb": {
"total": 3794,
"used": 332,
"free": 3373
"used": 339,
"free": 3355
},
"disk": "11G/15G (80%)",
"load": [
0.63,
0.89,
0.91
0.74,
1.03,
1.13
],
"ollama_models": [
"qwen2.5:3b",

View File

@@ -11,7 +11,7 @@ LISTEN 0 5 0.0.0.0:8787 0.0.0.0:* users:(("python3",pid
LISTEN 0 511 0.0.0.0:80 0.0.0.0:* users:(("nginx",pid=3461172,fd=8),("nginx",pid=3461171,fd=8))
LISTEN 0 4096 0.0.0.0:111 0.0.0.0:* users:(("rpcbind",pid=589,fd=4),("systemd",pid=1,fd=127))
LISTEN 0 4096 *:8080 *:* users:(("headscale",pid=2341808,fd=12))
LISTEN 0 511 *:3000 *:* users:(("node /srv/hello",pid=1765254,fd=19))
LISTEN 0 511 *:3000 *:* users:(("node /srv/hello",pid=1771278,fd=19))
LISTEN 0 511 *:3001 *:* users:(("node",pid=757,fd=21))
LISTEN 0 128 [::]:22 [::]:* users:(("sshd",pid=991,fd=8))
LISTEN 0 511 [::]:80 [::]:* users:(("nginx",pid=3461172,fd=9),("nginx",pid=3461171,fd=9))

View File

@@ -4,7 +4,6 @@ chronyd.service
cloudflared.service
containerd.service
crond.service
dbus-:1.1-org.fedoraproject.SetroubleshootPrivileged@73368.service
dbus-broker.service
docker.service
droplet-agent.service
@@ -21,7 +20,6 @@ polkit.service
rpcbind.service
rsyslog.service
serial-getty@ttyS0.service
setroubleshootd.service
sshd.service
systemd-journald.service
systemd-logind.service

View File

@@ -1,22 +1,22 @@
{
"hostname": "anastasia",
"ts": "2026-03-16T03:26:53Z",
"uptime_seconds": 6773506,
"ts": "2026-03-16T03:57:50Z",
"uptime_seconds": 6775363,
"kernel": "5.14.0-651.el9.x86_64",
"temp_c": 0,
"memory_mb": {
"total": 765,
"used": 566,
"free": 198
"used": 482,
"free": 282
},
"disk": "18G/25G (69%)",
"load": [
0.48,
0.18,
0.09
0.01,
0.03,
0.02
],
"ollama_models": [],
"throttle": "N/A",
"voltage": "N/A",
"services_running": 30
"services_running": 28
}

View File

@@ -1 +1 @@
{"node":"aria","status":"down","ts":"2026-03-16T03:26:52Z"}
{"node":"aria","status":"down","ts":"2026-03-16T03:57:49Z"}

View File

@@ -1,50 +1,50 @@
[2026-03-15 22:04:02] [BEAT] [cecilia] load=0.37 mem=6325/8062MB temp=37.0C disk=19%
[2026-03-15 22:05:01] [BEAT] [cecilia] load=0.37 mem=6286/8062MB temp=38.6C disk=19%
[2026-03-15 22:05:03] [DIAL] [cecilia] Switchboard unreachable
[2026-03-15 22:06:01] [BEAT] [cecilia] load=0.26 mem=6291/8062MB temp=37.5C disk=19%
[2026-03-15 22:06:16] [BEAT] [cecilia] load=0.27 mem=6287/8062MB temp=37.5C disk=19%
[2026-03-15 22:06:16] [BEAT] [cecilia] load=0.27 mem=6287/8062MB temp=38.0C disk=19%
[2026-03-15 22:07:01] [BEAT] [cecilia] load=0.33 mem=6297/8062MB temp=38.6C disk=19%
[2026-03-15 22:08:01] [BEAT] [cecilia] load=0.19 mem=6295/8062MB temp=39.1C disk=19%
[2026-03-15 22:09:01] [BEAT] [cecilia] load=0.54 mem=4704/8062MB temp=44.6C disk=19%
[2026-03-15 22:10:01] [FLEET] [cecilia] Starting cross-node health check
[2026-03-15 22:10:01] [BEAT] [cecilia] load=2.73 mem=4786/8062MB temp=42.5C disk=19%
[2026-03-15 22:10:02] [FLEET] [cecilia] alice: UP temp=36C mem=3350MB disk=80%
[2026-03-15 22:10:04] [FLEET] [cecilia] octavia: DOWN (no ping response)
[2026-03-15 22:10:05] [FLEET] [cecilia] gematria: UP temp=C mem=4213MB disk=67%
[2026-03-15 22:10:06] [FLEET] [cecilia] lucidia: UP temp=58C mem=3614MB disk=33%
[2026-03-15 22:10:08] [FLEET] [cecilia] aria: DOWN (no ping response)
[2026-03-15 22:10:09] [FLEET] [cecilia] anastasia: UP temp=C mem=226MB disk=69%
[2026-03-15 22:11:01] [BEAT] [cecilia] load=1.92 mem=4862/8062MB temp=42.5C disk=19%
[2026-03-15 22:11:24] [BEAT] [cecilia] load=1.45 mem=4870/8062MB temp=42.5C disk=19%
[2026-03-15 22:11:24] [BEAT] [cecilia] load=1.45 mem=4867/8062MB temp=43.0C disk=19%
[2026-03-15 22:12:01] [BEAT] [cecilia] load=0.93 mem=4876/8062MB temp=41.9C disk=19%
[2026-03-15 22:13:01] [BEAT] [cecilia] load=0.46 mem=4880/8062MB temp=41.4C disk=19%
[2026-03-15 22:14:01] [BEAT] [cecilia] load=0.27 mem=4874/8062MB temp=41.4C disk=19%
[2026-03-15 22:15:01] [BEAT] [cecilia] load=0.26 mem=4831/8062MB temp=42.5C disk=19%
[2026-03-15 22:15:46] [DIAL] [cecilia] Switchboard unreachable
[2026-03-15 22:16:01] [BEAT] [cecilia] load=0.51 mem=4879/8062MB temp=41.4C disk=19%
[2026-03-15 22:16:34] [BEAT] [cecilia] load=0.44 mem=4879/8062MB temp=41.9C disk=19%
[2026-03-15 22:16:34] [BEAT] [cecilia] load=0.44 mem=4875/8062MB temp=40.8C disk=19%
[2026-03-15 22:17:01] [BEAT] [cecilia] load=0.27 mem=4870/8062MB temp=41.4C disk=19%
[2026-03-15 22:18:01] [BEAT] [cecilia] load=0.23 mem=4879/8062MB temp=40.8C disk=19%
[2026-03-15 22:19:01] [BEAT] [cecilia] load=0.24 mem=4865/8062MB temp=40.2C disk=19%
[2026-03-15 22:20:01] [FLEET] [cecilia] Starting cross-node health check
[2026-03-15 22:20:01] [BEAT] [cecilia] load=0.15 mem=4831/8062MB temp=41.4C disk=19%
[2026-03-15 22:20:02] [FLEET] [cecilia] alice: UP temp=35C mem=3364MB disk=80%
[2026-03-15 22:20:04] [FLEET] [cecilia] octavia: DOWN (no ping response)
[2026-03-15 22:20:05] [DIAL] [cecilia] Switchboard unreachable
[2026-03-15 22:20:05] [FLEET] [cecilia] gematria: UP temp=C mem=4209MB disk=67%
[2026-03-15 22:20:06] [FLEET] [cecilia] lucidia: UP temp=64C mem=3522MB disk=33%
[2026-03-15 22:20:08] [FLEET] [cecilia] aria: DOWN (no ping response)
[2026-03-15 22:20:09] [FLEET] [cecilia] anastasia: UP temp=C mem=277MB disk=69%
[2026-03-15 22:21:01] [BEAT] [cecilia] load=0.19 mem=4867/8062MB temp=40.2C disk=19%
[2026-03-15 22:21:42] [BEAT] [cecilia] load=0.42 mem=4868/8062MB temp=41.4C disk=19%
[2026-03-15 22:21:42] [BEAT] [cecilia] load=0.42 mem=4864/8062MB temp=41.4C disk=19%
[2026-03-15 22:22:01] [BEAT] [cecilia] load=0.39 mem=4858/8062MB temp=41.4C disk=19%
[2026-03-15 22:23:01] [BEAT] [cecilia] load=0.62 mem=4857/8062MB temp=40.8C disk=19%
[2026-03-15 22:24:01] [BEAT] [cecilia] load=0.44 mem=4860/8062MB temp=40.8C disk=19%
[2026-03-15 22:25:01] [BEAT] [cecilia] load=0.57 mem=4839/8062MB temp=40.2C disk=19%
[2026-03-15 22:26:01] [BEAT] [cecilia] load=0.63 mem=4864/8062MB temp=39.7C disk=19%
[2026-03-15 22:26:51] [BEAT] [cecilia] load=0.43 mem=4863/8062MB temp=40.8C disk=19%
[2026-03-15 22:26:51] [BEAT] [cecilia] load=0.43 mem=4859/8062MB temp=39.7C disk=19%
[2026-03-15 22:35:01] [BEAT] [cecilia] load=0.48 mem=4824/8062MB temp=39.7C disk=19%
[2026-03-15 22:35:05] [DIAL] [cecilia] Switchboard unreachable
[2026-03-15 22:36:01] [BEAT] [cecilia] load=1.19 mem=5491/8062MB temp=48.5C disk=19%
[2026-03-15 22:37:01] [BEAT] [cecilia] load=3.25 mem=5517/8062MB temp=51.8C disk=19%
[2026-03-15 22:37:09] [BEAT] [cecilia] load=3.44 mem=5518/8062MB temp=51.2C disk=19%
[2026-03-15 22:37:09] [BEAT] [cecilia] load=3.44 mem=5517/8062MB temp=51.2C disk=19%
[2026-03-15 22:38:01] [BEAT] [cecilia] load=3.77 mem=5515/8062MB temp=53.5C disk=19%
[2026-03-15 22:39:01] [BEAT] [cecilia] load=4.18 mem=5525/8062MB temp=55.1C disk=19%
[2026-03-15 22:40:02] [FLEET] [cecilia] Starting cross-node health check
[2026-03-15 22:40:02] [BEAT] [cecilia] load=4.46 mem=5488/8062MB temp=56.2C disk=19%
[2026-03-15 22:40:03] [FLEET] [cecilia] alice: UP temp=36C mem=3367MB disk=80%
[2026-03-15 22:40:03] [FLEET] [cecilia] octavia: UP temp=36C mem=6758MB disk=67%
[2026-03-15 22:40:05] [FLEET] [cecilia] gematria: UP temp=C mem=4193MB disk=67%
[2026-03-15 22:40:05] [FLEET] [cecilia] lucidia: UP temp=61C mem=3513MB disk=33%
[2026-03-15 22:40:07] [FLEET] [cecilia] aria: DOWN (no ping response)
[2026-03-15 22:40:08] [FLEET] [cecilia] anastasia: UP temp=C mem=275MB disk=69%
[2026-03-15 22:41:01] [BEAT] [cecilia] load=3.20 mem=5568/8062MB temp=46.3C disk=19%
[2026-03-15 22:42:01] [BEAT] [cecilia] load=1.35 mem=5568/8062MB temp=44.6C disk=19%
[2026-03-15 22:42:19] [BEAT] [cecilia] load=1.26 mem=5562/8062MB temp=44.1C disk=19%
[2026-03-15 22:42:19] [BEAT] [cecilia] load=1.26 mem=5560/8062MB temp=45.2C disk=19%
[2026-03-15 22:43:01] [BEAT] [cecilia] load=0.72 mem=5536/8062MB temp=43.5C disk=19%
[2026-03-15 22:44:01] [BEAT] [cecilia] load=0.60 mem=5275/8062MB temp=44.6C disk=19%
[2026-03-15 22:45:01] [BEAT] [cecilia] load=0.53 mem=5284/8062MB temp=44.1C disk=19%
[2026-03-15 22:45:47] [DIAL] [cecilia] Switchboard unreachable
[2026-03-15 22:46:01] [BEAT] [cecilia] load=2.22 mem=5141/8062MB temp=45.8C disk=19%
[2026-03-15 22:47:01] [BEAT] [cecilia] load=0.97 mem=5082/8062MB temp=43.5C disk=19%
[2026-03-15 22:47:29] [BEAT] [cecilia] load=0.65 mem=5077/8062MB temp=42.5C disk=19%
[2026-03-15 22:47:29] [BEAT] [cecilia] load=0.65 mem=5080/8062MB temp=43.0C disk=19%
[2026-03-15 22:48:01] [BEAT] [cecilia] load=0.60 mem=5079/8062MB temp=43.0C disk=19%
[2026-03-15 22:49:01] [BEAT] [cecilia] load=0.38 mem=5078/8062MB temp=41.4C disk=19%
[2026-03-15 22:50:01] [FLEET] [cecilia] Starting cross-node health check
[2026-03-15 22:50:01] [BEAT] [cecilia] load=0.19 mem=5045/8062MB temp=41.9C disk=19%
[2026-03-15 22:50:01] [FLEET] [cecilia] alice: UP temp=36C mem=3349MB disk=80%
[2026-03-15 22:50:02] [FLEET] [cecilia] octavia: UP temp=36C mem=6744MB disk=67%
[2026-03-15 22:50:03] [FLEET] [cecilia] gematria: UP temp=C mem=4197MB disk=67%
[2026-03-15 22:50:03] [FLEET] [cecilia] lucidia: UP temp=55C mem=3518MB disk=33%
[2026-03-15 22:50:05] [DIAL] [cecilia] Switchboard unreachable
[2026-03-15 22:50:05] [FLEET] [cecilia] aria: DOWN (no ping response)
[2026-03-15 22:50:06] [FLEET] [cecilia] anastasia: UP temp=C mem=275MB disk=69%
[2026-03-15 22:51:01] [BEAT] [cecilia] load=0.15 mem=5090/8062MB temp=40.8C disk=19%
[2026-03-15 22:52:01] [BEAT] [cecilia] load=1.28 mem=5085/8062MB temp=39.7C disk=19%
[2026-03-15 22:52:39] [BEAT] [cecilia] load=1.66 mem=5087/8062MB temp=40.2C disk=19%
[2026-03-15 22:52:39] [BEAT] [cecilia] load=1.66 mem=5083/8062MB temp=39.7C disk=19%
[2026-03-15 22:53:01] [BEAT] [cecilia] load=1.77 mem=5092/8062MB temp=39.7C disk=19%
[2026-03-15 22:54:01] [BEAT] [cecilia] load=0.87 mem=5089/8062MB temp=38.6C disk=19%
[2026-03-15 22:55:02] [BEAT] [cecilia] load=0.45 mem=5057/8062MB temp=39.7C disk=19%
[2026-03-15 22:56:01] [BEAT] [cecilia] load=0.25 mem=5154/8062MB temp=39.1C disk=19%
[2026-03-15 22:57:01] [BEAT] [cecilia] load=0.13 mem=5146/8062MB temp=39.7C disk=19%
[2026-03-15 22:57:48] [BEAT] [cecilia] load=0.37 mem=5197/8062MB temp=39.1C disk=19%
[2026-03-15 22:57:48] [BEAT] [cecilia] load=0.37 mem=5194/8062MB temp=38.6C disk=19%

View File

@@ -1 +1 @@
{"node":"cecilia","ts":"2026-03-16T03:26:51Z","load":0.43,"mem_free_mb":4859,"mem_total_mb":8062,"temp_c":39.7,"disk_pct":19,"throttle":"0x50000"}
{"node":"cecilia","ts":"2026-03-16T03:57:48Z","load":0.37,"mem_free_mb":5194,"mem_total_mb":8062,"temp_c":38.6,"disk_pct":19,"throttle":"0x50000"}

View File

@@ -22,7 +22,7 @@ MY_IP=$(hostname -I 2>/dev/null | awk '{print $1}' || echo "unknown")
case "$MY_IP" in
192.168.4.49*) NODE_NAME="alice" ;;
192.168.4.96*) NODE_NAME="cecilia" ;;
192.168.4.100*) NODE_NAME="octavia" ;;
192.168.4.101*) NODE_NAME="octavia" ;;
192.168.4.98*) NODE_NAME="aria" ;;
192.168.4.38*) NODE_NAME="lucidia" ;;
esac
@@ -41,7 +41,7 @@ mkdir -p "$AUTONOMY_DIR"/{state,fleet,health,tasks/pending,tasks/completed,logs}
declare -A FLEET_IPS
FLEET_IPS[alice]="192.168.4.49"
FLEET_IPS[cecilia]="192.168.4.96"
FLEET_IPS[octavia]="192.168.4.100"
FLEET_IPS[octavia]="192.168.4.101"
FLEET_IPS[aria]="192.168.4.98"
FLEET_IPS[lucidia]="192.168.4.38"
FLEET_IPS[anastasia]="174.138.44.45"

View File

@@ -31,7 +31,7 @@ class OllamaClient:
if self._session:
await self._session.close()
async def generate(self, model: str, prompt: str, timeout: int = 120) -> str:
async def generate(self, model: str, prompt: str, timeout: int = 180) -> str:
"""Run inference through Ollama. Respects concurrency semaphore."""
async with self._semaphore:
try:
@@ -41,7 +41,7 @@ class OllamaClient:
"model": model,
"prompt": prompt,
"stream": False,
"options": {"num_predict": 1024, "temperature": 0.7},
"options": {"num_predict": 512, "temperature": 0.7},
},
timeout=aiohttp.ClientTimeout(total=timeout),
) as resp:

View File

@@ -17,6 +17,8 @@ from .config import CONTROLLER_HOST, CONTROLLER_PORT, TASKS_DB
from .spawn import SpawnScheduler
from .nats_protocol import NATSBus, TaskMessage, ResultMessage, HeartbeatMessage
from .router import TaskRouter
from .pipelines import PipelineExecutor, Pipeline, PipelineStep, BUILTIN_PIPELINES
from .jobs import JobScheduler, WorkerIntegration
log = logging.getLogger("orchestrator.controller")
@@ -24,6 +26,9 @@ log = logging.getLogger("orchestrator.controller")
scheduler = SpawnScheduler()
bus = NATSBus()
router = TaskRouter()
pipeline_executor: PipelineExecutor | None = None
job_scheduler: JobScheduler | None = None
worker_integration: WorkerIntegration | None = None
# Task result store (in-memory, backed by SQLite)
_results: dict[str, ResultMessage] = {}
@@ -56,6 +61,8 @@ def _init_tasks_db():
@asynccontextmanager
async def lifespan(app: FastAPI):
global pipeline_executor, job_scheduler, worker_integration
_init_tasks_db()
await bus.connect()
@@ -66,8 +73,24 @@ async def lifespan(app: FastAPI):
# Start health check loop
asyncio.create_task(_health_check_loop())
log.info("Controller started on %s:%d", CONTROLLER_HOST, CONTROLLER_PORT)
# Initialize pipelines
pipeline_executor = PipelineExecutor(bus)
# Initialize job scheduler
job_scheduler = JobScheduler(bus)
await job_scheduler.start()
# Initialize Worker integration
worker_integration = WorkerIntegration()
await worker_integration.init()
log.info("Controller started on %s:%d (pipelines, jobs, integrations)", CONTROLLER_HOST, CONTROLLER_PORT)
yield
if job_scheduler:
await job_scheduler.stop()
if worker_integration:
await worker_integration.close()
await bus.disconnect()
log.info("Controller stopped")
@@ -90,7 +113,10 @@ app.add_middleware(
# --- Handlers ---
async def _handle_result(result: ResultMessage):
"""Process task results from nodes."""
"""Process task results from nodes. Also forwards to pipeline executor."""
# Forward to pipeline executor for chained steps
if pipeline_executor:
pipeline_executor.on_result(result)
_results[result.task_id] = result
_pending_tasks.pop(result.task_id, None)
@@ -288,7 +314,120 @@ async def health():
"""Health check endpoint."""
return {
"status": "ok",
"version": "1.0.0",
"version": "2.0.0",
"nodes": router.cluster_stats(),
"pools": scheduler.pool_stats()["total_agents"],
}
# --- Pipeline API ---
class PipelineRequest(BaseModel):
    """Request body for POST /api/pipelines.

    Either ``pipeline`` (a builtin name) or ``steps`` (custom step dicts)
    selects what to run; ``pipeline`` takes precedence when both are given.
    ``input`` must be non-empty or the request is rejected with a 400.
    """

    pipeline: str = ""  # Name of builtin pipeline
    input: str = ""  # Initial input
    steps: list[dict] | None = None  # Custom steps [{archetype, prompt_template, intent}]
# Strong references to in-flight pipeline runs. The event loop only keeps weak
# references to tasks, so a fire-and-forget task may be garbage collected
# before it finishes unless something else holds on to it.
_pipeline_tasks: set[asyncio.Task] = set()


def _on_pipeline_done(task: asyncio.Task) -> None:
    """Release a finished pipeline task and log its failure, if any."""
    _pipeline_tasks.discard(task)
    if task.cancelled():
        return
    error = task.exception()
    if error is not None:
        log.error("Pipeline run failed: %s", error, exc_info=error)


@app.post("/api/pipelines")
async def run_pipeline(req: PipelineRequest):
    """Execute a multi-step agent pipeline.

    The pipeline is started in the background and the response is returned
    immediately with status "running".

    Raises:
        HTTPException 503: the pipeline executor has not been initialised.
        HTTPException 404: ``req.pipeline`` names no builtin pipeline.
        HTTPException 400: neither ``pipeline`` nor ``steps`` was given, a
            custom step is malformed, or ``input`` is empty.
    """
    if not pipeline_executor:
        raise HTTPException(status_code=503, detail="Pipeline executor not ready")

    if req.pipeline:
        pipe = pipeline_executor.get_builtin(req.pipeline)
        if not pipe:
            raise HTTPException(status_code=404, detail=f"Pipeline '{req.pipeline}' not found. Available: {pipeline_executor.list_pipelines()}")
    elif req.steps:
        # Step dicts come straight from the client; unknown or missing keys
        # are a bad request, not a server error.
        try:
            custom_steps = [PipelineStep(**s) for s in req.steps]
        except (TypeError, ValueError) as exc:
            raise HTTPException(status_code=400, detail=f"Invalid pipeline step: {exc}") from exc
        pipe = Pipeline(
            name="custom",
            steps=custom_steps,
        )
    else:
        raise HTTPException(status_code=400, detail="Provide 'pipeline' name or 'steps' array")

    if not req.input:
        raise HTTPException(status_code=400, detail="'input' is required")

    # Run pipeline in background, return immediately
    task = asyncio.create_task(pipeline_executor.execute(pipe, req.input))
    _pipeline_tasks.add(task)
    task.add_done_callback(_on_pipeline_done)

    return {
        "pipeline_id": pipe.pipeline_id,
        "name": pipe.name,
        "steps": len(pipe.steps),
        "status": "running",
    }
@app.get("/api/pipelines")
async def list_pipelines():
    """List the builtin pipelines with their step count and step archetypes."""
    builtin = []
    for pipeline_name, pipeline in BUILTIN_PIPELINES.items():
        archetypes = [step.archetype for step in pipeline.steps]
        builtin.append(
            {"name": pipeline_name, "steps": len(pipeline.steps), "archetypes": archetypes}
        )
    return {"builtin": builtin}
# --- Jobs API ---
@app.get("/api/jobs")
async def list_jobs():
    """Report the recurring jobs and their status (empty before the scheduler exists)."""
    jobs = job_scheduler.list_jobs() if job_scheduler else []
    return {"jobs": jobs}
@app.post("/api/jobs/{name}/toggle")
async def toggle_job(name: str):
    """Flip a recurring job between enabled and disabled.

    Responds 503 while the scheduler is not initialised and 404 when no job
    has the given name.
    """
    if not job_scheduler:
        raise HTTPException(status_code=503, detail="Job scheduler not ready")

    toggled = job_scheduler.toggle_job(name)
    if toggled:
        return {"ok": True, "name": name}
    raise HTTPException(status_code=404, detail=f"Job '{name}' not found")
# --- Worker Integration API ---
@app.get("/api/workers/health")
async def worker_health():
    """Return the health report for all Cloudflare Workers (503 until the integration is ready)."""
    if worker_integration:
        return await worker_integration.check_all_workers()
    raise HTTPException(status_code=503, detail="Worker integration not ready")
@app.get("/api/workers/search/stats")
async def search_stats():
    """Return search engine stats (503 until the integration is ready)."""
    if worker_integration:
        return await worker_integration.get_search_stats()
    raise HTTPException(status_code=503, detail="Not ready")
@app.get("/api/workers/fleet")
async def fleet_status():
    """Return fleet status from the fleet API (503 until the integration is ready)."""
    if worker_integration:
        return await worker_integration.get_fleet_status()
    raise HTTPException(status_code=503, detail="Not ready")
@app.post("/api/tasks/batch")
async def submit_batch(tasks: list[TaskRequest]):
    """Submit several tasks in one call.

    Only the first 50 requests are submitted; anything beyond that is ignored.
    Each task is published on the bus, recorded as pending, and echoed back
    with its generated id and the node it was routed to.
    """
    accepted = tasks[:50]  # Max 50 per batch
    submitted = []
    for item in accepted:
        new_id = f"task-{uuid.uuid4().hex[:12]}"
        # An explicit target wins; otherwise ask the router, falling back to "".
        node = item.target_node or router.best_node(item.archetype) or ""
        message = TaskMessage(
            task_id=new_id,
            archetype=item.archetype,
            intent=item.intent,
            prompt=item.prompt,
            priority=item.priority,
            target_node=node,
        )
        await bus.publish_task(message)
        _pending_tasks[new_id] = message
        submitted.append({"task_id": new_id, "archetype": item.archetype, "target_node": node})
    return {"submitted": len(submitted), "tasks": submitted}

View File

@@ -0,0 +1,207 @@
"""
BlackRoad Agent Orchestrator — Recurring Jobs & Worker Integrations
Schedule agents to do real work: reindex search, collect analytics, monitor fleet.
"""
import asyncio
import time
import logging
import aiohttp
from dataclasses import dataclass
from .nats_protocol import NATSBus, TaskMessage
log = logging.getLogger("orchestrator.jobs")
@dataclass
class RecurringJob:
    """A prompt that is re-submitted to the orchestrator on a fixed interval.

    ``last_run`` and ``run_count`` are bookkeeping fields that ``JobScheduler``
    mutates in place each time the job is submitted.
    """

    name: str  # JobScheduler keys its job table by this name
    archetype: str  # Copied into TaskMessage.archetype on submission
    prompt: str  # Prompt text sent with every run
    interval_seconds: int  # Minimum seconds between submissions
    intent: str = "scheduled"  # Copied into TaskMessage.intent on submission
    enabled: bool = True  # Disabled jobs are skipped by the scheduler loop
    last_run: float = 0.0  # time.time() of the last submission; start() seeds it to stagger first runs
    run_count: int = 0  # Number of submissions so far
# Built-in recurring jobs
# JobScheduler copies this list into a dict keyed by job name at construction.
# Every entry uses the default intent ("scheduled") and starts enabled unless
# it sets enabled=False explicitly.
RECURRING_JOBS = [
    RecurringJob(
        name="fleet-health-check",
        archetype="security",  # Uses deepseek-r1:1.5b (fast)
        prompt="Check BlackRoad infrastructure health. List which services are up or down. Be brief — one line per service.",
        interval_seconds=600,  # Every 10 minutes
    ),
    RecurringJob(
        name="security-scan",
        archetype="security",
        prompt="Brief security check: any concerns with the BlackRoad fleet? Check auth, network, and access patterns. 3 bullet points max.",
        interval_seconds=3600,  # Every hour
    ),
    RecurringJob(
        name="code-index-refresh",
        archetype="coder",  # Uses qwen2.5-coder:3b (medium speed)
        prompt="What are the most important recent changes across BlackRoad repos? Summarize in 5 bullet points.",
        interval_seconds=1800,  # Every 30 minutes
    ),
    RecurringJob(
        name="analytics-digest",
        archetype="security",  # Fast model for simple analysis
        prompt="Summarize BlackRoad OS usage: estimated active users, top services, any anomalies. Keep it to 3 lines.",
        interval_seconds=3600,  # Every hour
    ),
    RecurringJob(
        name="creative-brief",
        archetype="security",  # Use fast model
        prompt="Write one motivational sentence about building sovereign infrastructure. Keep it under 20 words.",
        interval_seconds=7200,  # Every 2 hours
        enabled=False,  # Disabled by default — nice-to-have
    ),
]
class JobScheduler:
    """Runs recurring agent jobs on schedule.

    ``start()`` launches a background loop that wakes every 30 seconds and
    submits each enabled job whose interval has elapsed; ``stop()`` cancels
    that loop. Jobs are submitted as ``TaskMessage`` objects on the bus.
    """

    def __init__(self, bus: NATSBus):
        """Bind the scheduler to ``bus`` and load the built-in jobs by name."""
        self.bus = bus
        self.jobs = {j.name: j for j in RECURRING_JOBS}
        self._running = False
        # Handle to the background loop. Holding it keeps the task from being
        # garbage collected and lets stop() cancel it.
        self._task: asyncio.Task | None = None

    async def start(self):
        """Start the job scheduler loop. Staggers initial runs to avoid thundering herd.

        Calling start() on an already running scheduler is a no-op.
        """
        if self._running:
            return
        self._running = True
        # Stagger initial runs — offset each job by 60s. Seeding last_run in
        # the future means a job first fires one interval plus its offset
        # after startup.
        offset = 0
        for job in self.jobs.values():
            job.last_run = time.time() + offset  # Delay initial run
            offset += 60
        # Without this task the loop never executes and no job is ever run.
        self._task = asyncio.create_task(self._run_loop())
        log.info("Job scheduler started with %d jobs (staggered)", len(self.jobs))

    async def stop(self):
        """Stop the scheduler and wait for the background loop to exit."""
        self._running = False
        task, self._task = self._task, None
        if task is None:
            return
        # The loop spends most of its time in sleep(); cancel it rather than
        # waiting up to 30s for the flag to be noticed.
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass

    async def _run_loop(self):
        """Poll every 30 seconds and submit each enabled job that is due."""
        while self._running:
            now = time.time()
            for job in self.jobs.values():
                if not job.enabled:
                    continue
                if now - job.last_run >= job.interval_seconds:
                    try:
                        await self._execute_job(job)
                    except Exception:
                        # A failed publish must not kill the scheduler. The
                        # job's last_run is unchanged, so it is retried on the
                        # next pass.
                        log.exception("Job %s failed to submit", job.name)
            await asyncio.sleep(30)  # Check every 30s

    async def _execute_job(self, job: RecurringJob):
        """Submit a job as a task to the orchestrator."""
        task = TaskMessage(
            task_id=f"job-{job.name}-{int(time.time())}",
            archetype=job.archetype,
            intent=job.intent,
            prompt=job.prompt,
            priority=7,  # Lower priority than user tasks
        )
        await self.bus.publish_task(task)
        # Only record the run once the publish has succeeded.
        job.last_run = time.time()
        job.run_count += 1
        log.info("Job %s submitted (run #%d)", job.name, job.run_count)

    def list_jobs(self) -> list[dict]:
        """Return a JSON-friendly summary of every job, enabled or not."""
        return [
            {
                "name": j.name,
                "archetype": j.archetype,
                "interval": j.interval_seconds,
                "enabled": j.enabled,
                "last_run": j.last_run,
                "run_count": j.run_count,
            }
            for j in self.jobs.values()
        ]

    def toggle_job(self, name: str) -> bool:
        """Flip a job's enabled flag. Returns False when no job has that name."""
        job = self.jobs.get(name)
        if not job:
            return False
        job.enabled = not job.enabled
        log.info("Job %s %s", name, "enabled" if job.enabled else "disabled")
        return True
class WorkerIntegration:
    """Connect agent tasks to real Cloudflare Workers.

    All calls share one ``aiohttp.ClientSession``. The session is created by
    ``init()`` or lazily on first use, and released by ``close()``.
    """

    def __init__(self):
        self._session: aiohttp.ClientSession | None = None

    async def init(self):
        """Open the shared HTTP session. Safe to call more than once."""
        self._ensure_session()

    async def close(self):
        """Close the shared session, if any, so a later call can reopen it."""
        if self._session:
            await self._session.close()
            self._session = None

    def _ensure_session(self) -> aiohttp.ClientSession:
        """Return the open session, creating one when missing or closed.

        Without this, calling any method before ``init()`` failed on ``None``,
        and ``_check_one`` swallowed that failure and reported every worker
        as "down".
        """
        if self._session is None or self._session.closed:
            self._session = aiohttp.ClientSession()
        return self._session

    async def _get_json(self, url: str, timeout_s: float) -> dict:
        """GET ``url`` and decode the body as JSON, bounded by ``timeout_s`` seconds."""
        async with self._ensure_session().get(
            url,
            timeout=aiohttp.ClientTimeout(total=timeout_s),
        ) as resp:
            return await resp.json()

    async def trigger_search_reindex(self, source: str = "github") -> dict:
        """Trigger search index rebuild via the index Worker."""
        # Pass `source` as a query parameter so it is URL-encoded instead of
        # being spliced raw into the URL.
        async with self._ensure_session().post(
            "https://index.blackroad.io/api/index",
            params={"source": source},
            timeout=aiohttp.ClientTimeout(total=30),
        ) as resp:
            return await resp.json()

    async def get_fleet_status(self) -> dict:
        """Pull fleet status from the fleet API."""
        return await self._get_json("https://fleet-api.amundsonalexa.workers.dev/fleet", 10)

    async def get_search_stats(self) -> dict:
        """Pull search stats."""
        return await self._get_json("https://search.blackroad.io/api/stats", 5)

    async def get_analytics(self) -> dict:
        """Pull analytics summary."""
        return await self._get_json("https://analytics.blackroad.io/api/stats", 5)

    async def check_all_workers(self) -> dict:
        """Health check all Workers in parallel.

        Returns a ``{name: "up" | "down"}`` mapping in the order the endpoints
        are declared below.
        """
        endpoints = {
            "auth": "https://auth.blackroad.io/api/health",
            "pay": "https://pay.blackroad.io/health",
            "search": "https://search.blackroad.io/api/health",
            "portal": "https://portal.blackroad.io/api/health",
            "chat": "https://chat.blackroad.io/api/health",
            "images": "https://images.blackroad.io/api/health",
            "index": "https://index.blackroad.io/api/health",
            "analytics": "https://analytics.blackroad.io/api/health",
            "stats": "https://stats.blackroad.io/health",
            "agents": "https://agents.blackroad.io/health",
            "fleet": "https://fleet.blackroad.io/health",
        }
        # Make sure the session exists before fanning out, so the probes do
        # not each try to create one.
        self._ensure_session()
        pairs = await asyncio.gather(
            *(self._check_one(name, url) for name, url in endpoints.items())
        )
        return dict(pairs)

    async def _check_one(self, name: str, url: str) -> tuple[str, str]:
        """Probe one endpoint: "up" only for HTTP 200, "down" for anything else or any error."""
        try:
            async with self._ensure_session().get(
                url, timeout=aiohttp.ClientTimeout(total=5)
            ) as resp:
                return (name, "up" if resp.status == 200 else "down")
        except Exception:
            # Best-effort probe: timeouts, DNS and TLS failures all count as down.
            return (name, "down")

View File

@@ -0,0 +1,169 @@
"""
BlackRoad Agent Orchestrator — Task Pipelines
Chain multiple agents together. Output of one feeds into the next.
"""
import asyncio
import uuid
import time
import logging
from dataclasses import dataclass, field
from .nats_protocol import NATSBus, TaskMessage, ResultMessage
log = logging.getLogger("orchestrator.pipelines")
@dataclass
class PipelineStep:
    """One stage of a pipeline: which archetype runs it and how its prompt is built."""

    archetype: str
    # Prompt text for this stage; the literal "{input}" marks where the
    # previous step's output is substituted.
    prompt_template: str
    intent: str = "pipeline"
    target_node: str = ""
@dataclass
class Pipeline:
    """An ordered list of steps plus the run state collected while executing them."""

    name: str
    steps: list[PipelineStep]
    pipeline_id: str = ""
    status: str = "pending"
    results: list[ResultMessage] = field(default_factory=list)
    created_at: float = 0.0

    def __post_init__(self):
        """Assign a generated id and a creation timestamp when either was left falsy."""
        self.pipeline_id = self.pipeline_id or f"pipe-{uuid.uuid4().hex[:12]}"
        self.created_at = self.created_at or time.time()
# Pre-built pipelines, keyed by the slug callers use to request them.
# Every pipeline has three steps; each step's "{input}" receives the previous
# step's output (the first step receives the caller's initial input).
BUILTIN_PIPELINES = {
    "research-report": Pipeline(
        name="Research Report",
        steps=[
            PipelineStep(archetype="researcher", prompt_template="Research this topic thoroughly: {input}"),
            PipelineStep(archetype="analyst", prompt_template="Analyze these research findings and extract key insights:\n{input}"),
            PipelineStep(archetype="creative", prompt_template="Write a clear, engaging summary report based on this analysis:\n{input}"),
        ],
    ),
    "code-review": Pipeline(
        name="Code Review",
        steps=[
            PipelineStep(archetype="coder", prompt_template="Review this code for bugs and improvements:\n{input}"),
            PipelineStep(archetype="security", prompt_template="Check this code review for security vulnerabilities:\n{input}"),
            PipelineStep(archetype="coordinator", prompt_template="Summarize the code review and security findings into actionable items:\n{input}"),
        ],
    ),
    "fleet-audit": Pipeline(
        name="Fleet Audit",
        steps=[
            PipelineStep(archetype="monitor", prompt_template="Check the status of all BlackRoad infrastructure services: {input}"),
            PipelineStep(archetype="security", prompt_template="Audit these infrastructure findings for security issues:\n{input}"),
            PipelineStep(archetype="analyst", prompt_template="Produce a fleet health score and risk assessment:\n{input}"),
        ],
    ),
    "content-create": Pipeline(
        name="Content Creation",
        steps=[
            PipelineStep(archetype="researcher", prompt_template="Research this topic for a blog post: {input}"),
            PipelineStep(archetype="creative", prompt_template="Write an engaging blog post based on this research:\n{input}"),
            PipelineStep(archetype="coder", prompt_template="Format this blog post as clean HTML with proper headings and structure:\n{input}"),
        ],
    ),
    "bug-fix": Pipeline(
        name="Bug Fix",
        steps=[
            PipelineStep(archetype="coder", prompt_template="Analyze this bug report and identify the root cause:\n{input}"),
            PipelineStep(archetype="coder", prompt_template="Write a fix for this bug based on the analysis:\n{input}"),
            PipelineStep(archetype="security", prompt_template="Verify this fix doesn't introduce new vulnerabilities:\n{input}"),
        ],
    ),
}
class PipelineExecutor:
    """Executes multi-step pipelines by chaining agent tasks.

    ``execute`` publishes one task per step and waits for ``on_result`` to be
    called with the matching task id before moving on to the next step.
    """

    def __init__(self, bus: NATSBus):
        self.bus = bus
        self._active: dict[str, Pipeline] = {}  # pipelines currently inside execute()
        self._results: dict[str, ResultMessage] = {}  # results handed over by on_result()
        self._waiters: dict[str, asyncio.Event] = {}  # task_id -> event a step is blocked on

    async def execute(self, pipeline: Pipeline, initial_input: str) -> Pipeline:
        """Execute a pipeline, chaining results through each step.

        Returns the same ``pipeline`` with ``status`` set to "completed" or
        "failed" and ``results`` holding the successful steps so far. A step
        fails when no result arrives within 5 minutes or its result status is
        not "completed". Exceptions from the bus propagate after the pipeline
        is marked "failed". Bookkeeping for the run is released on every exit
        path.
        """
        pipeline.status = "running"
        self._active[pipeline.pipeline_id] = pipeline
        current_input = initial_input
        log.info("Pipeline %s started: %s (%d steps)", pipeline.pipeline_id, pipeline.name, len(pipeline.steps))
        try:
            for step_num, step in enumerate(pipeline.steps, start=1):
                task_id = f"{pipeline.pipeline_id}-step{step_num}"
                # Build prompt from template
                prompt = step.prompt_template.replace("{input}", current_input)
                task = TaskMessage(
                    task_id=task_id,
                    archetype=step.archetype,
                    intent=step.intent,
                    prompt=prompt,
                    priority=2,
                    target_node=step.target_node,
                )
                # Register the waiter BEFORE publishing so a fast reply cannot be missed.
                event = asyncio.Event()
                self._waiters[task_id] = event
                try:
                    await self.bus.publish_task(task)
                    log.info("Pipeline %s step %d/%d: %s task %s", pipeline.pipeline_id, step_num, len(pipeline.steps), step.archetype, task_id)
                    # Wait for result (timeout 5 min per step)
                    try:
                        await asyncio.wait_for(event.wait(), timeout=300)
                    except asyncio.TimeoutError:
                        log.error("Pipeline %s step %d timed out", pipeline.pipeline_id, step_num)
                        pipeline.status = "failed"
                        return pipeline
                    result = self._results.get(task_id)
                finally:
                    # Drop per-step bookkeeping whether the step succeeded,
                    # timed out or raised, so neither map grows forever.
                    self._waiters.pop(task_id, None)
                    self._results.pop(task_id, None)
                if not result or result.status != "completed":
                    log.error("Pipeline %s step %d failed: %s", pipeline.pipeline_id, step_num, result.error if result else "no result")
                    pipeline.status = "failed"
                    return pipeline
                pipeline.results.append(result)
                current_input = result.result
                log.info("Pipeline %s step %d completed in %dms", pipeline.pipeline_id, step_num, result.latency_ms)
            pipeline.status = "completed"
            log.info("Pipeline %s completed: %d steps, total %dms",
                     pipeline.pipeline_id, len(pipeline.steps),
                     sum(r.latency_ms for r in pipeline.results))
            return pipeline
        except Exception:
            # e.g. the bus refused the publish: do not leave the pipeline "running".
            pipeline.status = "failed"
            raise
        finally:
            self._active.pop(pipeline.pipeline_id, None)

    def on_result(self, result: ResultMessage):
        """Called when a task result arrives. Unblocks the pipeline step waiting on it.

        Results for task ids nobody is waiting on (non-pipeline tasks, late
        replies after a timeout, duplicates) are ignored rather than stored.
        """
        event = self._waiters.pop(result.task_id, None)
        if event is None:
            return
        self._results[result.task_id] = result
        event.set()

    def list_pipelines(self) -> list[str]:
        """Names of the built-in pipelines."""
        return list(BUILTIN_PIPELINES.keys())

    def get_builtin(self, name: str) -> Pipeline | None:
        """Return a fresh ``Pipeline`` built from the named template, or None if unknown.

        The copy gets its own id, status and results; the step objects
        themselves are shared with the template.
        """
        template = BUILTIN_PIPELINES.get(name)
        if not template:
            return None
        return Pipeline(
            name=template.name,
            steps=list(template.steps),
        )

View File

@@ -1,7 +1,6 @@
LISTEN 0 4096 127.0.0.1:33551 0.0.0.0:*
LISTEN 0 4096 127.0.0.1:9000 0.0.0.0:*
LISTEN 0 4096 127.0.0.1:45685 0.0.0.0:*
LISTEN 0 128 0.0.0.0:34001 0.0.0.0:*
LISTEN 0 4096 127.0.0.1:37175 0.0.0.0:*
LISTEN 0 32 127.0.0.1:53 0.0.0.0:*
LISTEN 0 2048 0.0.0.0:8788 0.0.0.0:* users:(("python3",pid=1429,fd=16))
LISTEN 0 5 0.0.0.0:8787 0.0.0.0:* users:(("python3",pid=1562,fd=3))
@@ -12,7 +11,7 @@ LISTEN 0 511 0.0.0.0:80 0.0.0.0:*
LISTEN 0 128 0.0.0.0:22 0.0.0.0:*
LISTEN 0 200 127.0.0.1:5432 0.0.0.0:*
LISTEN 0 32 192.168.4.96:53 0.0.0.0:*
LISTEN 0 5 0.0.0.0:7890 0.0.0.0:* users:(("python3",pid=19895,fd=5))
LISTEN 0 5 0.0.0.0:7890 0.0.0.0:* users:(("python3",pid=97297,fd=5))
LISTEN 0 5 0.0.0.0:4010 0.0.0.0:* users:(("python3",pid=1035,fd=3))
LISTEN 0 511 0.0.0.0:8080 0.0.0.0:*
LISTEN 0 511 0.0.0.0:3100 0.0.0.0:*

View File

@@ -1,19 +1,19 @@
{
"hostname": "cecilia",
"ts": "2026-03-16T03:26:52Z",
"uptime_seconds": 2322,
"ts": "2026-03-16T03:57:49Z",
"uptime_seconds": 4179,
"kernel": "6.12.62+rpt-rpi-2712",
"temp_c": 40.8,
"temp_c": 38.0,
"memory_mb": {
"total": 8062,
"used": 3217,
"free": 4845
"used": 2867,
"free": 5195
},
"disk": "81G/457G (19%)",
"load": [
0.48,
0.5,
0.47
0.37,
0.6,
0.93
],
"ollama_models": [
"deepseek-r1:1.5b",

View File

@@ -1,19 +1,19 @@
{
"hostname": "gematria",
"ts": "2026-03-16T03:26:54Z",
"uptime_seconds": 5430806,
"ts": "2026-03-16T03:57:52Z",
"uptime_seconds": 5432664,
"kernel": "5.15.0-113-generic",
"temp_c": 0,
"memory_mb": {
"total": 7937,
"used": 3312,
"free": 4177
"used": 3302,
"free": 4189
},
"disk": "52G/78G (67%)",
"load": [
3.1,
3.09,
3.09
3.26,
3.13,
3.1
],
"ollama_models": [
"qwen2.5:7b",

View File

@@ -1,50 +1,50 @@
[2026-03-15 22:04:02] [BEAT] [lucidia] load=2.23 mem=2933/8063MB temp=46.9C disk=33%
[2026-03-15 22:04:12] [DIAL] [lucidia] Switchboard unreachable
[2026-03-15 22:05:43] [BEAT] [lucidia] load=37.88 mem=3628/8063MB temp=46.3C disk=33%
[2026-03-15 22:06:01] [BEAT] [lucidia] load=29.62 mem=3648/8063MB temp=44.6C disk=33%
[2026-03-15 22:06:15] [BEAT] [lucidia] load=23.22 mem=3648/8063MB temp=45.2C disk=33%
[2026-03-15 22:06:15] [BEAT] [lucidia] load=23.22 mem=3646/8063MB temp=45.2C disk=33%
[2026-03-15 22:07:01] [BEAT] [lucidia] load=13.05 mem=3613/8063MB temp=46.9C disk=33%
[2026-03-15 22:08:01] [BEAT] [lucidia] load=5.60 mem=3604/8063MB temp=53.5C disk=33%
[2026-03-15 22:09:01] [BEAT] [lucidia] load=3.31 mem=3593/8063MB temp=47.4C disk=33%
[2026-03-15 22:10:01] [FLEET] [lucidia] Starting cross-node health check
[2026-03-15 22:10:01] [BEAT] [lucidia] load=3.25 mem=3598/8063MB temp=57.3C disk=33%
[2026-03-15 22:10:02] [FLEET] [lucidia] alice: UP temp=36C mem=3355MB disk=80%
[2026-03-15 22:10:04] [FLEET] [lucidia] octavia: DOWN (no ping response)
[2026-03-15 22:10:04] [FLEET] [lucidia] cecilia: UP temp=42C mem=4776MB disk=19%
[2026-03-15 22:10:06] [FLEET] [lucidia] gematria: UP temp=C mem=4213MB disk=67%
[2026-03-15 22:10:08] [FLEET] [lucidia] aria: DOWN (no ping response)
[2026-03-15 22:10:09] [FLEET] [lucidia] anastasia: UP temp=C mem=201MB disk=69%
[2026-03-15 22:11:01] [BEAT] [lucidia] load=2.19 mem=3609/8063MB temp=50.7C disk=33%
[2026-03-15 22:11:24] [BEAT] [lucidia] load=1.92 mem=3572/8063MB temp=50.1C disk=33%
[2026-03-15 22:11:24] [BEAT] [lucidia] load=1.92 mem=3571/8063MB temp=49.6C disk=33%
[2026-03-15 22:12:01] [BEAT] [lucidia] load=3.10 mem=3618/8063MB temp=54.0C disk=33%
[2026-03-15 22:13:01] [BEAT] [lucidia] load=1.67 mem=3562/8063MB temp=58.4C disk=33%
[2026-03-15 22:14:01] [BEAT] [lucidia] load=1.45 mem=3573/8063MB temp=52.9C disk=33%
[2026-03-15 22:15:01] [BEAT] [lucidia] load=2.51 mem=3622/8063MB temp=61.1C disk=33%
[2026-03-15 22:15:47] [DIAL] [lucidia] Switchboard unreachable
[2026-03-15 22:16:01] [BEAT] [lucidia] load=2.37 mem=3562/8063MB temp=54.5C disk=33%
[2026-03-15 22:16:34] [BEAT] [lucidia] load=3.15 mem=3577/8063MB temp=59.5C disk=33%
[2026-03-15 22:16:34] [BEAT] [lucidia] load=3.15 mem=3576/8063MB temp=61.1C disk=33%
[2026-03-15 22:17:01] [BEAT] [lucidia] load=4.58 mem=3558/8063MB temp=56.2C disk=33%
[2026-03-15 22:18:01] [BEAT] [lucidia] load=3.43 mem=3575/8063MB temp=61.1C disk=33%
[2026-03-15 22:19:01] [BEAT] [lucidia] load=2.19 mem=3537/8063MB temp=54.5C disk=33%
[2026-03-15 22:19:08] [DIAL] [lucidia] Switchboard unreachable
[2026-03-15 22:20:01] [FLEET] [lucidia] Starting cross-node health check
[2026-03-15 22:20:01] [BEAT] [lucidia] load=3.87 mem=3528/8063MB temp=64.5C disk=33%
[2026-03-15 22:20:02] [FLEET] [lucidia] alice: UP temp=36C mem=3354MB disk=80%
[2026-03-15 22:20:04] [FLEET] [lucidia] octavia: DOWN (no ping response)
[2026-03-15 22:20:04] [FLEET] [lucidia] cecilia: UP temp=40C mem=4815MB disk=19%
[2026-03-15 22:20:05] [FLEET] [lucidia] gematria: UP temp=C mem=4209MB disk=67%
[2026-03-15 22:20:07] [FLEET] [lucidia] aria: DOWN (no ping response)
[2026-03-15 22:20:08] [FLEET] [lucidia] anastasia: UP temp=C mem=277MB disk=69%
[2026-03-15 22:21:01] [BEAT] [lucidia] load=2.23 mem=3539/8063MB temp=55.6C disk=33%
[2026-03-15 22:21:42] [BEAT] [lucidia] load=3.43 mem=3513/8063MB temp=59.0C disk=33%
[2026-03-15 22:21:43] [BEAT] [lucidia] load=3.43 mem=3512/8063MB temp=59.5C disk=33%
[2026-03-15 22:22:01] [BEAT] [lucidia] load=2.59 mem=3547/8063MB temp=56.8C disk=33%
[2026-03-15 22:23:02] [BEAT] [lucidia] load=3.14 mem=3496/8063MB temp=62.2C disk=33%
[2026-03-15 22:24:01] [BEAT] [lucidia] load=2.45 mem=3507/8063MB temp=55.1C disk=33%
[2026-03-15 22:25:01] [BEAT] [lucidia] load=1.57 mem=3537/8063MB temp=55.6C disk=33%
[2026-03-15 22:26:01] [BEAT] [lucidia] load=1.95 mem=3454/8063MB temp=54.0C disk=33%
[2026-03-15 22:26:51] [BEAT] [lucidia] load=2.08 mem=3528/8063MB temp=59.5C disk=33%
[2026-03-15 22:26:52] [BEAT] [lucidia] load=2.08 mem=3521/8063MB temp=57.9C disk=33%
[2026-03-15 22:34:08] [DIAL] [lucidia] Switchboard unreachable
[2026-03-15 22:35:01] [BEAT] [lucidia] load=1.90 mem=3487/8063MB temp=56.2C disk=33%
[2026-03-15 22:36:01] [BEAT] [lucidia] load=2.06 mem=3457/8063MB temp=56.8C disk=33%
[2026-03-15 22:37:01] [BEAT] [lucidia] load=2.33 mem=3515/8063MB temp=61.7C disk=33%
[2026-03-15 22:37:09] [BEAT] [lucidia] load=3.86 mem=3515/8063MB temp=61.7C disk=33%
[2026-03-15 22:37:10] [BEAT] [lucidia] load=3.86 mem=3514/8063MB temp=62.8C disk=33%
[2026-03-15 22:38:01] [BEAT] [lucidia] load=3.72 mem=3513/8063MB temp=54.5C disk=33%
[2026-03-15 22:39:01] [BEAT] [lucidia] load=3.09 mem=3471/8063MB temp=53.5C disk=33%
[2026-03-15 22:40:01] [FLEET] [lucidia] Starting cross-node health check
[2026-03-15 22:40:01] [BEAT] [lucidia] load=2.87 mem=3474/8063MB temp=61.7C disk=33%
[2026-03-15 22:40:02] [FLEET] [lucidia] alice: UP temp=35C mem=3362MB disk=80%
[2026-03-15 22:40:04] [FLEET] [lucidia] octavia: DOWN (no ping response)
[2026-03-15 22:40:04] [FLEET] [lucidia] cecilia: UP temp=56C mem=5480MB disk=19%
[2026-03-15 22:40:06] [FLEET] [lucidia] gematria: UP temp=C mem=4188MB disk=67%
[2026-03-15 22:40:08] [FLEET] [lucidia] aria: DOWN (no ping response)
[2026-03-15 22:40:09] [FLEET] [lucidia] anastasia: UP temp=C mem=275MB disk=69%
[2026-03-15 22:41:01] [BEAT] [lucidia] load=2.12 mem=3506/8063MB temp=54.0C disk=33%
[2026-03-15 22:42:01] [BEAT] [lucidia] load=3.38 mem=3536/8063MB temp=65.0C disk=33%
[2026-03-15 22:42:19] [BEAT] [lucidia] load=3.82 mem=3546/8063MB temp=59.5C disk=33%
[2026-03-15 22:42:19] [BEAT] [lucidia] load=3.82 mem=3546/8063MB temp=59.0C disk=33%
[2026-03-15 22:43:01] [BEAT] [lucidia] load=3.45 mem=3525/8063MB temp=55.6C disk=33%
[2026-03-15 22:44:01] [BEAT] [lucidia] load=3.24 mem=3510/8063MB temp=55.6C disk=33%
[2026-03-15 22:45:01] [BEAT] [lucidia] load=2.56 mem=3490/8063MB temp=57.3C disk=33%
[2026-03-15 22:45:46] [DIAL] [lucidia] Switchboard unreachable
[2026-03-15 22:46:01] [BEAT] [lucidia] load=1.80 mem=3506/8063MB temp=58.4C disk=33%
[2026-03-15 22:47:01] [BEAT] [lucidia] load=1.77 mem=3473/8063MB temp=53.5C disk=33%
[2026-03-15 22:47:29] [BEAT] [lucidia] load=1.70 mem=3482/8063MB temp=60.6C disk=33%
[2026-03-15 22:47:30] [BEAT] [lucidia] load=1.70 mem=3484/8063MB temp=60.0C disk=33%
[2026-03-15 22:48:01] [BEAT] [lucidia] load=2.67 mem=3480/8063MB temp=55.6C disk=33%
[2026-03-15 22:49:01] [BEAT] [lucidia] load=2.76 mem=3479/8063MB temp=61.7C disk=33%
[2026-03-15 22:49:09] [DIAL] [lucidia] Switchboard unreachable
[2026-03-15 22:50:01] [FLEET] [lucidia] Starting cross-node health check
[2026-03-15 22:50:01] [BEAT] [lucidia] load=2.31 mem=3527/8063MB temp=55.6C disk=33%
[2026-03-15 22:50:02] [FLEET] [lucidia] alice: UP temp=36C mem=3349MB disk=80%
[2026-03-15 22:50:04] [FLEET] [lucidia] octavia: DOWN (no ping response)
[2026-03-15 22:50:04] [FLEET] [lucidia] cecilia: UP temp=42C mem=5049MB disk=19%
[2026-03-15 22:50:06] [FLEET] [lucidia] gematria: UP temp=C mem=4188MB disk=67%
[2026-03-15 22:50:08] [FLEET] [lucidia] aria: DOWN (no ping response)
[2026-03-15 22:50:09] [FLEET] [lucidia] anastasia: UP temp=C mem=276MB disk=69%
[2026-03-15 22:51:01] [BEAT] [lucidia] load=1.95 mem=3565/8063MB temp=55.1C disk=33%
[2026-03-15 22:52:01] [BEAT] [lucidia] load=2.08 mem=3568/8063MB temp=60.0C disk=33%
[2026-03-15 22:52:39] [BEAT] [lucidia] load=2.30 mem=3546/8063MB temp=52.4C disk=33%
[2026-03-15 22:52:39] [BEAT] [lucidia] load=2.30 mem=3546/8063MB temp=51.8C disk=33%
[2026-03-15 22:53:01] [BEAT] [lucidia] load=1.95 mem=3558/8063MB temp=51.2C disk=33%
[2026-03-15 22:54:01] [BEAT] [lucidia] load=1.88 mem=3558/8063MB temp=48.5C disk=33%
[2026-03-15 22:55:01] [BEAT] [lucidia] load=3.74 mem=3484/8063MB temp=58.4C disk=33%
[2026-03-15 22:56:01] [BEAT] [lucidia] load=1.93 mem=3486/8063MB temp=48.5C disk=33%
[2026-03-15 22:57:01] [BEAT] [lucidia] load=2.52 mem=3550/8063MB temp=48.0C disk=33%
[2026-03-15 22:57:49] [BEAT] [lucidia] load=2.51 mem=3540/8063MB temp=53.5C disk=33%
[2026-03-15 22:57:49] [BEAT] [lucidia] load=2.51 mem=3543/8063MB temp=53.5C disk=33%

View File

@@ -1,14 +1,14 @@
blackroad-gitea Up 36 minutes
road-pdns-admin Up 36 minutes (healthy)
road-pdns Up 36 minutes
road-dns-db Up 36 minutes
roadauth Up 36 minutes
roadapi Up 36 minutes
blackroad-edge-agent Up 36 minutes
blackroad.systems Up 36 minutes
blackroadai.com Up 36 minutes
blackroad-auth-gateway Up 36 minutes
blackroad-metaverse Up 36 minutes
blackroad-os Up 36 minutes
blackroad-os-carpool Up 36 minutes
pi-my-agent-1 Up 36 minutes (healthy)
blackroad-gitea Up About an hour
road-pdns-admin Up About an hour (healthy)
road-pdns Up About an hour
road-dns-db Up About an hour
roadauth Up About an hour
roadapi Up About an hour
blackroad-edge-agent Up About an hour
blackroad.systems Up About an hour
blackroadai.com Up About an hour
blackroad-auth-gateway Up About an hour
blackroad-metaverse Up About an hour
blackroad-os Up About an hour
blackroad-os-carpool Up About an hour
pi-my-agent-1 Up About an hour (healthy)

View File

@@ -1 +1 @@
{"node":"lucidia","ts":"2026-03-16T03:26:51Z","load":2.08,"mem_free_mb":3521,"mem_total_mb":8063,"temp_c":57.9,"disk_pct":33,"throttle":"0x0"}
{"node":"lucidia","ts":"2026-03-16T03:57:49Z","load":2.51,"mem_free_mb":3543,"mem_total_mb":8063,"temp_c":53.5,"disk_pct":33,"throttle":"0x0"}

View File

@@ -28,6 +28,7 @@ pironman5.service
pm2-pi.service
polkit.service
postgresql@17-main.service
road-phone.service
roadnet-failover.service
rtkit-daemon.service
serial-getty@ttyAMA10.service

View File

@@ -1,19 +1,19 @@
{
"hostname": "octavia",
"ts": "2026-03-16T03:26:52Z",
"uptime_seconds": 2370,
"ts": "2026-03-16T03:57:49Z",
"uptime_seconds": 4227,
"kernel": "6.12.62+rpt-rpi-2712",
"temp_c": 59.5,
"temp_c": 51.8,
"memory_mb": {
"total": 8063,
"used": 4549,
"free": 3513
"used": 4516,
"free": 3546
},
"disk": "73G/235G (33%)",
"load": [
2.08,
2.45,
3.35
2.51,
2.42,
2.55
],
"ollama_models": [
"qwen2.5:3b",
@@ -25,5 +25,5 @@
],
"throttle": "0x0",
"voltage": "0.8587V",
"services_running": 46
"services_running": 47
}

View File

@@ -1 +1 @@
{"node":"octavia","status":"down","ts":"2026-03-16T03:26:52Z"}
{"node":"octavia","status":"down","ts":"2026-03-16T03:57:49Z"}

View File

@@ -0,0 +1,190 @@
#!/bin/bash
# BlackRoad Fleet Coordinator — Background process that:
# 1. Monitors all nodes continuously
# 2. Auto-heals crashed services
# 3. Pushes telemetry to stats API
# 4. Syncs state between Pis
# 5. Alerts on problems
#
# Run: ./blackroad-fleet-coordinator.sh
# Cron: */5 * * * * /Users/alexa/blackroad-fleet-coordinator.sh >> ~/.blackroad/logs/coordinator.log 2>&1
#
# Environment overrides: STATS_URL, STATS_KEY.
set -euo pipefail

# This script uses NODE_IP, NODE_USER, NODE_SERVICES, PI_NODES, ALL_NODES,
# BR_SSH_OPTS and br_ssh without defining them; they are expected to come
# from this file.
source ~/.blackroad/config/nodes.sh

LOG_DIR="$HOME/.blackroad/logs"
STATE_DIR="$HOME/.blackroad/fleet-state"
mkdir -p "$LOG_DIR" "$STATE_DIR"

STATS_URL="${STATS_URL:-https://stats-blackroad.amundsonalexa.workers.dev}"
# SECURITY: the fallback below is a credential committed to the repository.
# Supply STATS_KEY from the environment and rotate this value; the fallback is
# kept only so existing cron entries keep working.
STATS_KEY="${STATS_KEY:-blackroad-stats-push-2026}"

# log MESSAGE — print MESSAGE to stdout prefixed with a local timestamp.
log() { printf "[%s] %s\n" "$(date '+%Y-%m-%d %H:%M:%S')" "$1"; }
# ── 1. PROBE ALL NODES ──
# probe_node NAME
#   Pings NAME, then collects metrics over SSH, and writes
#   $STATE_DIR/NAME.json with status "down", "ssh_fail" or "up".
#   Logs DOWN/UP transitions against the previous state file and logs
#   threshold alerts for disk (>90%), temperature (>75C) and free memory (<200MB).
probe_node() {
  local name=$1
  local ip="${NODE_IP[$name]}"
  local user="${NODE_USER[$name]:-pi}"
  local state_file="$STATE_DIR/${name}.json"
  local prev_status="unknown"
  if [[ -f "$state_file" ]]; then
    prev_status=$(python3 -c "import json;print(json.load(open('$state_file')).get('status','unknown'))" 2>/dev/null || echo "unknown")
  fi

  # Ping check. -W is seconds on Linux (iputils) but milliseconds on
  # macOS/BSD, so a bare "-W2" on a Mac waits only 2 ms and can report healthy
  # nodes as DOWN.
  local ping_wait=2
  if [[ "$(uname -s)" == "Darwin" ]]; then
    ping_wait=2000
  fi
  if ! ping -c1 -W"$ping_wait" "$ip" &>/dev/null; then
    echo "{\"name\":\"$name\",\"ip\":\"$ip\",\"status\":\"down\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" > "$state_file"
    if [[ "$prev_status" != "down" ]]; then
      log "ALERT: $name ($ip) went DOWN"
      # Could push to Slack here
    fi
    return
  fi

  # SSH probe: one round trip that prints eight '|'-separated metrics.
  # BR_SSH_OPTS is intentionally unquoted so it splits into separate options.
  local data
  data=$(ssh $BR_SSH_OPTS "${user}@${ip}" "
    load=\$(cat /proc/loadavg | awk '{print \$1}')
    temp=\$(cat /sys/class/thermal/thermal_zone0/temp 2>/dev/null | awk '{printf \"%.0f\", \$1/1000}' || echo 0)
    mem_free=\$(free -m | awk '/Mem:/ {print \$4}')
    mem_total=\$(free -m | awk '/Mem:/ {print \$2}')
    disk=\$(df / | awk 'NR==2 {print \$5}' | tr -d '%')
    uptime_s=\$(cat /proc/uptime | awk '{print int(\$1)}')
    svcs=\$(systemctl list-units --type=service --state=running --no-pager --no-legend 2>/dev/null | wc -l)
    docker_c=\$(docker ps -q 2>/dev/null | wc -l || echo 0)
    echo \"\$load|\$temp|\$mem_free|\$mem_total|\$disk|\$uptime_s|\$svcs|\$docker_c\"
  " 2>/dev/null) || data=""
  if [[ -z "$data" ]]; then
    echo "{\"name\":\"$name\",\"ip\":\"$ip\",\"status\":\"ssh_fail\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" > "$state_file"
    log "WARN: $name ($ip) ping OK but SSH failed"
    return
  fi

  local load temp mem_free mem_total disk uptime svcs docker_c
  IFS='|' read -r load temp mem_free mem_total disk uptime svcs docker_c <<< "$data"
  # A metric can come back blank: on hosts without a thermal zone awk prints
  # nothing, and the remote "|| echo 0" never fires because the pipeline's exit
  # status is awk's. A blank would make the JSON below invalid, and every
  # later status read would then fall back to "unknown". Default blanks to 0.
  load=${load:-0}
  temp=${temp:-0}
  mem_free=${mem_free:-0}
  mem_total=${mem_total:-0}
  disk=${disk:-0}
  uptime=${uptime:-0}
  svcs=${svcs:-0}
  docker_c=${docker_c:-0}

  printf '{"name":"%s","ip":"%s","status":"up","load":%s,"temp":%s,"mem_free":%s,"mem_total":%s,"disk_pct":%s,"uptime_s":%s,"services":%s,"containers":%s,"ts":"%s"}\n' \
    "$name" "$ip" "$load" "$temp" "$mem_free" "$mem_total" "$disk" "$uptime" "$svcs" "$docker_c" \
    "$(date -u +%Y-%m-%dT%H:%M:%SZ)" > "$state_file"

  # State change alerts
  if [[ "$prev_status" == "down" || "$prev_status" == "ssh_fail" ]]; then
    log "RECOVERED: $name ($ip) is back UP"
  fi

  # Threshold alerts
  if [[ "$disk" -gt 90 ]]; then log "ALERT: $name disk at ${disk}%"; fi
  if [[ "$temp" -gt 75 ]]; then log "ALERT: $name temp at ${temp}C"; fi
  if [[ "$mem_free" -lt 200 ]]; then log "ALERT: $name low memory (${mem_free}MB free)"; fi
}
# ── 2. SERVICE HEALTH CHECKS ──
# check_services NAME
#   NODE_SERVICES[NAME] is a comma-separated list of "port:label" entries.
#   A port counts as healthy when it accepts a TCP connection from here, or,
#   failing that, when the node itself reports a listener on it. The second
#   check covers services bound to localhost only (PostgreSQL, Redis).
check_services() {
  local name=$1
  local ip="${NODE_IP[$name]}"
  local services="${NODE_SERVICES[$name]:-}"
  if [[ -z "$services" ]]; then
    return
  fi

  IFS=',' read -ra svc_list <<< "$services"
  for svc in "${svc_list[@]}"; do
    # First ':'-separated field is the port, second is the label.
    local port=${svc%%:*}
    local label=${svc#*:}
    label=${label%%:*}

    # Reachable from here: nothing more to do for this service.
    if nc -z -w2 "$ip" "$port" 2>/dev/null; then
      continue
    fi

    # Not reachable remotely: ask the node whether anything listens on the port.
    local ssh_check
    ssh_check=$(br_ssh "$name" "ss -tlnp 2>/dev/null | grep -q ':$port ' && echo ok || echo down" 2>/dev/null || echo "ssh_fail")
    if [[ "$ssh_check" == "ok" ]]; then
      continue
    fi
    log "SERVICE DOWN: $name:$port ($label)"
  done
}
# ── 3. AUTO-HEAL ──
# auto_heal
#   Restarts a fixed set of services when their port or unit looks dead.
#   Every restart is best-effort: the script runs under `set -e`, so a failed
#   restart (for example because the node is unreachable) must be caught here,
#   otherwise the whole run aborts before telemetry and the summary.
auto_heal() {
  # Check Ollama on Cecilia
  if ! nc -z -w2 192.168.4.96 11434 2>/dev/null; then
    log "HEAL: Restarting Ollama on Cecilia"
    br_ssh cecilia "sudo systemctl restart ollama" 2>/dev/null \
      || log "HEAL FAILED: could not restart Ollama on Cecilia"
  fi

  # Check Gitea on Octavia
  if ! nc -z -w2 192.168.4.101 3100 2>/dev/null; then
    log "HEAL: Restarting Gitea (blackroad-git) on Octavia"
    br_ssh octavia "docker restart blackroad-git" 2>/dev/null \
      || log "HEAL FAILED: could not restart Gitea (blackroad-git) on Octavia"
  fi

  # Check cloudflared tunnels
  local node
  for node in cecilia lucidia; do
    local tunnel_ok
    # An SSH failure is treated the same as an inactive unit.
    tunnel_ok=$(br_ssh "$node" "systemctl is-active cloudflared 2>/dev/null" || echo "inactive")
    if [[ "$tunnel_ok" != "active" ]]; then
      log "HEAL: Restarting cloudflared on $node"
      br_ssh "$node" "sudo systemctl restart cloudflared" 2>/dev/null \
        || log "HEAL FAILED: could not restart cloudflared on $node"
    fi
  done

  # Check NATS on Octavia (report only; no restart is attempted).
  # An empty answer also covers the case where SSH itself failed.
  local nats_ok
  nats_ok=$(br_ssh octavia "docker ps -q -f name=nats | head -1" 2>/dev/null || echo "")
  if [[ -z "$nats_ok" ]]; then
    log "HEAL: NATS container not running on Octavia"
  fi
}
# ── 4. PUSH TELEMETRY ──
# push_telemetry
#   POSTs the state file of every node whose recorded status is "up" to the
#   stats API. Nodes without a state file or in any other status are skipped,
#   and upload failures are ignored.
push_telemetry() {
  for name in "${ALL_NODES[@]}"; do
    local state_file="$STATE_DIR/${name}.json"
    if [[ ! -f "$state_file" ]]; then
      continue
    fi

    local status
    status=$(python3 -c "import json;print(json.load(open('$state_file')).get('status','unknown'))" 2>/dev/null || echo "unknown")
    if [[ "$status" != "up" ]]; then
      continue
    fi

    # Best effort: a failed push must not stop the run.
    curl -s --max-time 5 -X POST "$STATS_URL/api/push" \
      -H "Content-Type: application/json" \
      -H "X-API-Key: $STATS_KEY" \
      -d @"$state_file" >/dev/null 2>&1 || true
  done
}
# ── 5. FLEET SUMMARY ──
# fleet_summary
#   Logs how many nodes with a state file are "up" and how many are anything
#   else. Nodes that have no state file are left out of both counts.
fleet_summary() {
  local up=0 down=0
  for name in "${ALL_NODES[@]}"; do
    local state_file="$STATE_DIR/${name}.json"
    [[ -f "$state_file" ]] || continue

    local status
    status=$(python3 -c "import json;print(json.load(open('$state_file')).get('status','unknown'))" 2>/dev/null || echo "unknown")
    case "$status" in
      up) up=$((up + 1)) ;;
      *)  down=$((down + 1)) ;;
    esac
  done
  log "FLEET: $up up, $down down ($(date))"
}
# ── MAIN ──
log "━━━ Fleet Coordinator Run ━━━"

# Pass 1: refresh the state file of every local Pi (cloud nodes are slower
# and are not probed here).
for name in "${PI_NODES[@]}"; do
  probe_node "$name"
done

# Pass 2: service checks, only on nodes the probe just recorded as "up".
# A missing or unreadable state file counts as "down".
for name in "${PI_NODES[@]}"; do
  local_status=$(python3 -c "import json;print(json.load(open('$STATE_DIR/${name}.json')).get('status','down'))" 2>/dev/null || echo "down")
  if [[ "$local_status" == "up" ]]; then
    check_services "$name"
  fi
done

# Remaining phases, in order: heal, report, summarize.
auto_heal
push_telemetry
fleet_summary

log "━━━ Done ━━━"

View File

@@ -5,7 +5,7 @@
// Own your customers, subscriptions, invoices, and payments.
// Stripe is just a dumb card charger underneath.
const VERSION = '1.0.0';
const VERSION = '2.0.0';
// ─── Schema ──────────────────────────────────────────────────────────────
const SCHEMA = [
@@ -114,6 +114,37 @@ const SCHEMA = [
`CREATE INDEX IF NOT EXISTS idx_payments_customer ON payments(customer_id)`,
`CREATE INDEX IF NOT EXISTS idx_events_type ON events(type)`,
`CREATE INDEX IF NOT EXISTS idx_events_customer ON events(customer_id)`,
// v2: API keys for customers
`CREATE TABLE IF NOT EXISTS api_keys (
id TEXT PRIMARY KEY,
customer_id TEXT NOT NULL REFERENCES customers(id),
key_hash TEXT NOT NULL,
key_prefix TEXT NOT NULL,
name TEXT DEFAULT 'default',
scopes TEXT DEFAULT '["api:read"]',
rate_limit INTEGER DEFAULT 1000,
last_used TEXT,
usage_count INTEGER DEFAULT 0,
status TEXT DEFAULT 'active',
expires_at TEXT,
created_at TEXT DEFAULT (datetime('now'))
)`,
// v2: Usage metering
`CREATE TABLE IF NOT EXISTS usage (
id INTEGER PRIMARY KEY AUTOINCREMENT,
customer_id TEXT NOT NULL REFERENCES customers(id),
api_key_id TEXT REFERENCES api_keys(id),
endpoint TEXT NOT NULL,
method TEXT DEFAULT 'GET',
status_code INTEGER,
latency_ms INTEGER,
tokens_used INTEGER DEFAULT 0,
created_at TEXT DEFAULT (datetime('now'))
)`,
`CREATE INDEX IF NOT EXISTS idx_api_keys_customer ON api_keys(customer_id)`,
`CREATE INDEX IF NOT EXISTS idx_api_keys_prefix ON api_keys(key_prefix)`,
`CREATE INDEX IF NOT EXISTS idx_usage_customer ON usage(customer_id)`,
`CREATE INDEX IF NOT EXISTS idx_usage_date ON usage(created_at)`,
];
// ─── Seed Plans ──────────────────────────────────────────────────────────
@@ -965,10 +996,129 @@ function handleHealth() {
'GET /invoices?customer_id=',
'GET /payments?customer_id=',
'POST /webhook',
'GET /keys?customer_id=',
'POST /keys',
'DELETE /keys?id=',
'GET /usage?customer_id=&days=30',
'POST /usage/record',
],
});
}
// ─── API Keys ───────────────────────────────────────────────────────────

/**
 * Handler for customer API keys.
 *
 * - GET    ?customer_id=…  lists a customer's keys (prefix and metadata only).
 * - POST   { customer_id, name?, scopes?, rate_limit?, expires_in_days? }
 *          creates a key. The raw key is returned once; only its SHA-256 hash
 *          and 12-character prefix are stored.
 * - DELETE ?id=…           marks a key as revoked.
 *
 * Invalid input (bad JSON, non-array scopes, unusable expires_in_days) is
 * answered through err() instead of throwing.
 *
 * @param {Request} request - incoming request
 * @param {object} db - database handle exposing prepare().bind().all()/run()
 * @returns {Promise<*>} whatever json()/err() produce
 */
async function handleApiKeys(request, db) {
  const url = new URL(request.url);

  // One corrupt scopes column must not break the whole listing.
  const parseScopes = (raw) => {
    try {
      const parsed = JSON.parse(raw || '[]');
      return Array.isArray(parsed) ? parsed : [];
    } catch {
      return [];
    }
  };

  if (request.method === 'GET') {
    const customerId = url.searchParams.get('customer_id');
    if (!customerId) return err('customer_id required');
    const keys = await db.prepare(
      "SELECT id, customer_id, key_prefix, name, scopes, rate_limit, last_used, usage_count, status, expires_at, created_at FROM api_keys WHERE customer_id = ? ORDER BY created_at DESC"
    ).bind(customerId).all();
    return json({ keys: keys.results.map((k) => ({ ...k, scopes: parseScopes(k.scopes) })) });
  }

  if (request.method === 'POST') {
    let body;
    try {
      body = await request.json();
    } catch {
      return err('Invalid JSON body');
    }
    if (body === null || typeof body !== 'object') return err('Invalid JSON body');

    const { customer_id, name, scopes, rate_limit, expires_in_days } = body;
    if (!customer_id) return err('customer_id required');
    if (scopes && !Array.isArray(scopes)) return err('scopes must be an array');

    // A falsy expires_in_days means "never expires". Anything else must yield
    // a real future date, otherwise toISOString() would throw a RangeError.
    let expiresAt = null;
    if (expires_in_days) {
      const days = Number(expires_in_days);
      const expiry = new Date(Date.now() + days * 86400000);
      if (!Number.isFinite(days) || days <= 0 || Number.isNaN(expiry.getTime())) {
        return err('expires_in_days must be a positive number');
      }
      expiresAt = expiry.toISOString();
    }

    // Generate API key: rp_live_ + 32 random hex chars
    const rawKey = `rp_live_${randHex(16)}`;
    const keyPrefix = rawKey.slice(0, 12);

    // Hash the key for storage (don't store raw)
    const hashBuf = await crypto.subtle.digest('SHA-256', new TextEncoder().encode(rawKey));
    const keyHash = Array.from(new Uint8Array(hashBuf), (b) => b.toString(16).padStart(2, '0')).join('');

    const id = `key_${uid()}`;
    const keyName = name || 'default';
    await db.prepare(
      'INSERT INTO api_keys (id, customer_id, key_hash, key_prefix, name, scopes, rate_limit, expires_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)'
    ).bind(id, customer_id, keyHash, keyPrefix, keyName, JSON.stringify(scopes || ['api:read']), rate_limit || 1000, expiresAt).run();

    // Return the raw key ONLY on creation — can never be retrieved again
    return json({ id, key: rawKey, prefix: keyPrefix, name: keyName, warning: 'Save this key now. It cannot be retrieved again.' }, 201);
  }

  if (request.method === 'DELETE') {
    const keyId = url.searchParams.get('id');
    if (!keyId) return err('id required');
    await db.prepare("UPDATE api_keys SET status = 'revoked' WHERE id = ?").bind(keyId).run();
    return json({ ok: true, revoked: keyId });
  }

  return err('Method not allowed', 405);
}
// ─── Usage / Metering ────────────────────────────────────────────────────

/**
 * Usage report for one customer over the last `days` days.
 *
 * Query parameters: `customer_id` (required) and `days` (optional, default 30).
 * A missing, non-numeric or non-positive `days` falls back to 30, and values
 * above 3650 are capped, so the "since" date is always valid.
 *
 * @param {Request} request - incoming request
 * @param {object} db - database handle exposing prepare().bind().all()/first()
 * @returns {Promise<*>} json() payload with totals, a per-day breakdown and
 *   the ten busiest endpoint/method pairs; err() when customer_id is missing
 */
async function handleUsage(request, db) {
  const DEFAULT_DAYS = 30;
  const MAX_DAYS = 3650;

  const url = new URL(request.url);
  const customerId = url.searchParams.get('customer_id');
  if (!customerId) return err('customer_id required');

  const requestedDays = Number.parseInt(url.searchParams.get('days') ?? '', 10);
  const days = Number.isFinite(requestedDays) && requestedDays > 0
    ? Math.min(requestedDays, MAX_DAYS)
    : DEFAULT_DAYS;
  const since = new Date(Date.now() - days * 86400000).toISOString();

  // The three reports are independent, so run them concurrently.
  const [daily, totals, endpoints] = await Promise.all([
    // Daily breakdown
    db.prepare(
      `SELECT date(created_at) as day, COUNT(*) as requests, SUM(tokens_used) as tokens,
AVG(latency_ms) as avg_latency, SUM(CASE WHEN status_code >= 400 THEN 1 ELSE 0 END) as errors
FROM usage WHERE customer_id = ? AND created_at >= ? GROUP BY day ORDER BY day DESC`
    ).bind(customerId, since).all(),
    // Totals
    db.prepare(
      `SELECT COUNT(*) as total_requests, SUM(tokens_used) as total_tokens,
AVG(latency_ms) as avg_latency, COUNT(DISTINCT endpoint) as unique_endpoints
FROM usage WHERE customer_id = ? AND created_at >= ?`
    ).bind(customerId, since).first(),
    // Top endpoints
    db.prepare(
      `SELECT endpoint, method, COUNT(*) as count, AVG(latency_ms) as avg_latency
FROM usage WHERE customer_id = ? AND created_at >= ?
GROUP BY endpoint, method ORDER BY count DESC LIMIT 10`
    ).bind(customerId, since).all(),
  ]);

  return json({
    customer_id: customerId,
    period: { days, since },
    totals,
    daily: daily.results,
    top_endpoints: endpoints.results,
  });
}
// ─── Record Usage (internal — called by API gateway) ─────────────────────

/**
 * Appends one row to the usage table and, when an API key id is supplied,
 * bumps that key's usage counter and last-used timestamp.
 *
 * Body fields: customer_id and endpoint are required. api_key_id, method,
 * status_code, latency_ms and tokens_used are optional; falsy values are
 * stored as null, 'GET', 200, 0 and 0 respectively.
 *
 * @param {Request} request - incoming request with a JSON body
 * @param {object} db - database handle exposing prepare().bind().run()
 * @returns {Promise<*>} json({ ok: true }) on success, err() when a required field is missing
 */
async function handleRecordUsage(request, db) {
  const {
    customer_id: customerId,
    api_key_id: apiKeyId,
    endpoint,
    method,
    status_code: statusCode,
    latency_ms: latencyMs,
    tokens_used: tokensUsed,
  } = await request.json();

  const hasRequiredFields = Boolean(customerId) && Boolean(endpoint);
  if (!hasRequiredFields) {
    return err('customer_id and endpoint required');
  }

  const row = [
    customerId,
    apiKeyId || null,
    endpoint,
    method || 'GET',
    statusCode || 200,
    latencyMs || 0,
    tokensUsed || 0,
  ];
  const insert = db.prepare(
    'INSERT INTO usage (customer_id, api_key_id, endpoint, method, status_code, latency_ms, tokens_used) VALUES (?, ?, ?, ?, ?, ?, ?)'
  );
  await insert.bind(...row).run();

  // Update API key usage count
  if (apiKeyId) {
    const touchKey = db.prepare(
      "UPDATE api_keys SET usage_count = usage_count + 1, last_used = datetime('now') WHERE id = ?"
    );
    await touchKey.bind(apiKeyId).run();
  }

  return json({ ok: true });
}
/**
 * Returns `bytes` cryptographically random bytes as a lowercase hex string
 * (two characters per byte).
 *
 * @param {number} bytes - number of random bytes to generate
 * @returns {string} hex string of length `bytes * 2`
 */
function randHex(bytes) {
  const buffer = crypto.getRandomValues(new Uint8Array(bytes));
  let hex = '';
  for (const byte of buffer) {
    hex += byte.toString(16).padStart(2, '0');
  }
  return hex;
}
// ─── Rate limiting ───────────────────────────────────────────────────────
const rl = new Map();
function rateLimit(ip, max = 30, windowSec = 60) {
@@ -1040,6 +1190,15 @@ export default {
case path === '/stats':
response = await handleStats(db);
break;
case path === '/keys' || path === '/api-keys':
response = await handleApiKeys(request, db);
break;
case path === '/usage':
response = await handleUsage(request, db);
break;
case path === '/usage/record' && request.method === 'POST':
response = await handleRecordUsage(request, db);
break;
default:
response = err('Not found', 404);
}