sync: 2026-03-15 23:00 — 32 files from Alexandria
Some checks failed
Lint & Format / detect (push) Failing after 32s
Monorepo Lint / lint-shell (push) Failing after 31s
Monorepo Lint / lint-js (push) Failing after 30s
Lint & Format / js-lint (push) Has been skipped
Lint & Format / py-lint (push) Has been skipped
Lint & Format / sh-lint (push) Has been skipped
Lint & Format / go-lint (push) Has been skipped
Some checks failed
Lint & Format / detect (push) Failing after 32s
Monorepo Lint / lint-shell (push) Failing after 31s
Monorepo Lint / lint-js (push) Failing after 30s
Lint & Format / js-lint (push) Has been skipped
Lint & Format / py-lint (push) Has been skipped
Lint & Format / sh-lint (push) Has been skipped
Lint & Format / go-lint (push) Has been skipped
RoadChain-SHA2048: 692327ce2e990f37 RoadChain-Identity: alexa@sovereign RoadChain-Full: 692327ce2e990f37649b83e948241ac858c0d07146c6b42043e4770d638c44d5bada5639ad82c7aa8911d7042912c1d75b6bbce9a453637621b3903dc912a3a9537696cedf7a0870e3bf962ca44677793082aaae5c5433615885ad20fab1e80417202d11e93284483551ba9558f06809d2f3fa53c00a657277d7c183abe3ba187c1af6856a455071771757cca67ff2b74c5f855f23dd8cc8f5b3596c966b2344361fcbb74843e9d9d9ad66c5321ef64ce787f9d255d11e0d4e0ee571af4e09697964e22f6f629a11279b315c9a4563860b169ad93fa500b485297516ef2ba2039f76348c0d547cfa182e9b0bccee73f5e8b7db7e33d61e8199bb4464c2c30d03
This commit is contained in:
@@ -31,3 +31,4 @@
|
||||
0 8 1 * * bash /Users/alexa/blackroad-operator/scripts/corporate-autopilot.sh >> /Users/alexa/blackroad-operator/docs/corporate/autopilot.log 2>&1
|
||||
5 6 * * * cd /Users/alexa/blackroad-os-kpis && bash reports/slack-notify.sh >> /Users/alexa/blackroad-os-kpis/data/cron.log 2>&1
|
||||
7,37 * * * * /Users/alexa/blackroad-git-sync.sh >> /Users/alexa/.blackroad/logs/git-sync.log 2>&1
|
||||
*/5 * * * * /usr/bin/flock -n /tmp/fleet-coord.lock /Users/alexa/blackroad-fleet-coordinator.sh >> /Users/alexa/.blackroad/logs/coordinator.log 2>&1
|
||||
|
||||
@@ -1,50 +1,50 @@
|
||||
[2026-03-15 22:03:01] [BEAT] [alice] load=0.62 mem=3381/3794MB temp=32.6C disk=80%
|
||||
[2026-03-15 22:04:01] [BEAT] [alice] load=0.76 mem=3382/3794MB temp=32.1C disk=80%
|
||||
[2026-03-15 22:05:01] [BEAT] [alice] load=0.68 mem=3369/3794MB temp=34.1C disk=80%
|
||||
[2026-03-15 22:06:01] [BEAT] [alice] load=0.59 mem=3378/3794MB temp=33.1C disk=80%
|
||||
[2026-03-15 22:06:16] [BEAT] [alice] load=0.46 mem=3374/3794MB temp=33.6C disk=80%
|
||||
[2026-03-15 22:06:16] [BEAT] [alice] load=0.46 mem=3372/3794MB temp=33.6C disk=80%
|
||||
[2026-03-15 22:07:01] [BEAT] [alice] load=0.41 mem=3381/3794MB temp=32.6C disk=80%
|
||||
[2026-03-15 22:08:01] [BEAT] [alice] load=0.76 mem=3378/3794MB temp=32.6C disk=80%
|
||||
[2026-03-15 22:09:01] [BEAT] [alice] load=0.49 mem=3380/3794MB temp=33.6C disk=80%
|
||||
[2026-03-15 22:10:02] [FLEET] [alice] Starting cross-node health check
|
||||
[2026-03-15 22:10:02] [BEAT] [alice] load=0.55 mem=3359/3794MB temp=36.0C disk=80%
|
||||
[2026-03-15 22:10:04] [FLEET] [alice] octavia: DOWN (no ping response)
|
||||
[2026-03-15 22:10:04] [FLEET] [alice] cecilia: UP temp=43C mem=4779MB disk=19%
|
||||
[2026-03-15 22:10:05] [FLEET] [alice] gematria: UP temp=C mem=4213MB disk=67%
|
||||
[2026-03-15 22:10:06] [FLEET] [alice] lucidia: UP temp=58C mem=3614MB disk=33%
|
||||
[2026-03-15 22:10:08] [FLEET] [alice] aria: DOWN (no ping response)
|
||||
[2026-03-15 22:10:09] [FLEET] [alice] anastasia: UP temp=C mem=201MB disk=69%
|
||||
[2026-03-15 22:11:01] [BEAT] [alice] load=0.45 mem=3380/3794MB temp=33.1C disk=80%
|
||||
[2026-03-15 22:11:25] [BEAT] [alice] load=0.83 mem=3364/3794MB temp=36.0C disk=80%
|
||||
[2026-03-15 22:11:25] [BEAT] [alice] load=0.83 mem=3362/3794MB temp=37.0C disk=80%
|
||||
[2026-03-15 22:12:01] [BEAT] [alice] load=0.80 mem=3381/3794MB temp=34.1C disk=80%
|
||||
[2026-03-15 22:13:01] [BEAT] [alice] load=1.23 mem=3379/3794MB temp=36.0C disk=80%
|
||||
[2026-03-15 22:14:01] [BEAT] [alice] load=0.78 mem=3372/3794MB temp=34.1C disk=80%
|
||||
[2026-03-15 22:15:02] [BEAT] [alice] load=0.73 mem=3336/3794MB temp=37.5C disk=80%
|
||||
[2026-03-15 22:15:48] [DIAL] [alice] Switchboard unreachable
|
||||
[2026-03-15 22:16:01] [BEAT] [alice] load=0.81 mem=3371/3794MB temp=34.6C disk=80%
|
||||
[2026-03-15 22:16:08] [DIAL] [alice] Switchboard unreachable
|
||||
[2026-03-15 22:16:34] [BEAT] [alice] load=1.11 mem=3357/3794MB temp=35.0C disk=80%
|
||||
[2026-03-15 22:16:34] [BEAT] [alice] load=1.11 mem=3371/3794MB temp=35.0C disk=80%
|
||||
[2026-03-15 22:17:01] [BEAT] [alice] load=1.13 mem=3372/3794MB temp=36.0C disk=80%
|
||||
[2026-03-15 22:18:01] [BEAT] [alice] load=0.84 mem=3378/3794MB temp=34.1C disk=80%
|
||||
[2026-03-15 22:19:01] [BEAT] [alice] load=1.27 mem=3379/3794MB temp=34.6C disk=80%
|
||||
[2026-03-15 22:20:01] [FLEET] [alice] Starting cross-node health check
|
||||
[2026-03-15 22:20:01] [BEAT] [alice] load=0.69 mem=3360/3794MB temp=36.0C disk=80%
|
||||
[2026-03-15 22:20:03] [FLEET] [alice] octavia: DOWN (no ping response)
|
||||
[2026-03-15 22:20:04] [FLEET] [alice] cecilia: UP temp=42C mem=4816MB disk=19%
|
||||
[2026-03-15 22:20:05] [FLEET] [alice] gematria: UP temp=C mem=4209MB disk=67%
|
||||
[2026-03-15 22:20:05] [FLEET] [alice] lucidia: UP temp=66C mem=3520MB disk=33%
|
||||
[2026-03-15 22:20:05] [FLEET] [alice] aria: DOWN (no ping response)
|
||||
[2026-03-15 22:20:06] [FLEET] [alice] anastasia: UP temp=C mem=277MB disk=69%
|
||||
[2026-03-15 22:21:01] [BEAT] [alice] load=0.65 mem=3379/3794MB temp=36.0C disk=80%
|
||||
[2026-03-15 22:21:43] [BEAT] [alice] load=0.63 mem=3370/3794MB temp=35.5C disk=80%
|
||||
[2026-03-15 22:21:43] [BEAT] [alice] load=0.98 mem=3368/3794MB temp=35.5C disk=80%
|
||||
[2026-03-15 22:22:02] [BEAT] [alice] load=0.83 mem=3375/3794MB temp=33.6C disk=80%
|
||||
[2026-03-15 22:23:04] [BEAT] [alice] load=1.69 mem=3379/3794MB temp=34.1C disk=80%
|
||||
[2026-03-15 22:24:01] [BEAT] [alice] load=1.14 mem=3372/3794MB temp=33.6C disk=80%
|
||||
[2026-03-15 22:25:01] [BEAT] [alice] load=0.95 mem=3370/3794MB temp=35.5C disk=80%
|
||||
[2026-03-15 22:26:01] [BEAT] [alice] load=0.97 mem=3375/3794MB temp=36.5C disk=80%
|
||||
[2026-03-15 22:26:52] [BEAT] [alice] load=0.59 mem=3373/3794MB temp=34.6C disk=80%
|
||||
[2026-03-15 22:26:52] [BEAT] [alice] load=0.59 mem=3370/3794MB temp=35.5C disk=80%
|
||||
[2026-03-15 22:40:02] [FLEET] [alice] octavia: UP temp=36C mem=6762MB disk=67%
|
||||
[2026-03-15 22:40:02] [HEAL] [alice] Healed 1 services
|
||||
[2026-03-15 22:40:03] [FLEET] [alice] cecilia: UP temp=57C mem=5488MB disk=19%
|
||||
[2026-03-15 22:40:04] [FLEET] [alice] gematria: UP temp=C mem=4192MB disk=67%
|
||||
[2026-03-15 22:40:05] [FLEET] [alice] lucidia: UP temp=62C mem=3515MB disk=33%
|
||||
[2026-03-15 22:40:07] [FLEET] [alice] aria: DOWN (no ping response)
|
||||
[2026-03-15 22:40:08] [FLEET] [alice] anastasia: UP temp=C mem=275MB disk=69%
|
||||
[2026-03-15 22:41:01] [BEAT] [alice] load=0.87 mem=3370/3794MB temp=35.5C disk=80%
|
||||
[2026-03-15 22:42:01] [BEAT] [alice] load=0.90 mem=3369/3794MB temp=34.6C disk=80%
|
||||
[2026-03-15 22:42:19] [BEAT] [alice] load=0.70 mem=3370/3794MB temp=33.6C disk=80%
|
||||
[2026-03-15 22:42:19] [BEAT] [alice] load=0.70 mem=3370/3794MB temp=34.1C disk=80%
|
||||
[2026-03-15 22:43:01] [BEAT] [alice] load=1.07 mem=3367/3794MB temp=36.5C disk=80%
|
||||
[2026-03-15 22:44:01] [BEAT] [alice] load=1.07 mem=3369/3794MB temp=35.0C disk=80%
|
||||
[2026-03-15 22:45:02] [HEAL] [alice] Service blackroad-agent is DOWN — restarting
|
||||
[2026-03-15 22:45:02] [BEAT] [alice] load=1.46 mem=3353/3794MB temp=37.5C disk=80%
|
||||
[2026-03-15 22:45:02] [HEAL] [alice] Service blackroad-agent restarted successfully
|
||||
[2026-03-15 22:45:03] [HEAL] [alice] Healed 1 services
|
||||
[2026-03-15 22:45:48] [DIAL] [alice] Switchboard unreachable
|
||||
[2026-03-15 22:46:01] [BEAT] [alice] load=4.01 mem=3367/3794MB temp=33.6C disk=80%
|
||||
[2026-03-15 22:46:28] [DIAL] [alice] Switchboard unreachable
|
||||
[2026-03-15 22:47:01] [BEAT] [alice] load=1.90 mem=3367/3794MB temp=34.6C disk=80%
|
||||
[2026-03-15 22:47:29] [BEAT] [alice] load=1.62 mem=3364/3794MB temp=36.5C disk=80%
|
||||
[2026-03-15 22:47:30] [BEAT] [alice] load=1.62 mem=3364/3794MB temp=36.0C disk=80%
|
||||
[2026-03-15 22:48:01] [BEAT] [alice] load=1.23 mem=3369/3794MB temp=33.1C disk=80%
|
||||
[2026-03-15 22:49:01] [BEAT] [alice] load=0.84 mem=3357/3794MB temp=35.0C disk=80%
|
||||
[2026-03-15 22:50:01] [FLEET] [alice] Starting cross-node health check
|
||||
[2026-03-15 22:50:01] [HEAL] [alice] Service blackroad-agent is DOWN — restarting
|
||||
[2026-03-15 22:50:01] [BEAT] [alice] load=0.87 mem=3336/3794MB temp=36.0C disk=80%
|
||||
[2026-03-15 22:50:01] [HEAL] [alice] Service blackroad-agent restarted successfully
|
||||
[2026-03-15 22:50:02] [FLEET] [alice] octavia: UP temp=37C mem=6738MB disk=67%
|
||||
[2026-03-15 22:50:02] [HEAL] [alice] Healed 1 services
|
||||
[2026-03-15 22:50:02] [FLEET] [alice] cecilia: UP temp=41C mem=5051MB disk=19%
|
||||
[2026-03-15 22:50:04] [FLEET] [alice] gematria: UP temp=C mem=4188MB disk=67%
|
||||
[2026-03-15 22:50:04] [FLEET] [alice] lucidia: UP temp=55C mem=3528MB disk=33%
|
||||
[2026-03-15 22:50:06] [FLEET] [alice] aria: DOWN (no ping response)
|
||||
[2026-03-15 22:50:07] [FLEET] [alice] anastasia: UP temp=C mem=275MB disk=69%
|
||||
[2026-03-15 22:51:01] [BEAT] [alice] load=2.30 mem=3355/3794MB temp=34.1C disk=80%
|
||||
[2026-03-15 22:52:01] [BEAT] [alice] load=1.24 mem=3360/3794MB temp=33.1C disk=80%
|
||||
[2026-03-15 22:52:39] [BEAT] [alice] load=0.96 mem=3355/3794MB temp=33.1C disk=80%
|
||||
[2026-03-15 22:52:39] [BEAT] [alice] load=0.96 mem=3355/3794MB temp=34.6C disk=80%
|
||||
[2026-03-15 22:53:01] [BEAT] [alice] load=0.92 mem=3355/3794MB temp=34.6C disk=80%
|
||||
[2026-03-15 22:54:02] [BEAT] [alice] load=1.25 mem=3358/3794MB temp=34.1C disk=80%
|
||||
[2026-03-15 22:55:01] [HEAL] [alice] Service blackroad-agent is DOWN — restarting
|
||||
[2026-03-15 22:55:01] [BEAT] [alice] load=0.78 mem=3352/3794MB temp=36.0C disk=80%
|
||||
[2026-03-15 22:55:01] [HEAL] [alice] Service blackroad-agent restarted successfully
|
||||
[2026-03-15 22:55:01] [HEAL] [alice] Healed 1 services
|
||||
[2026-03-15 22:56:02] [BEAT] [alice] load=0.90 mem=3356/3794MB temp=33.1C disk=80%
|
||||
[2026-03-15 22:57:01] [BEAT] [alice] load=1.10 mem=3354/3794MB temp=34.1C disk=80%
|
||||
[2026-03-15 22:57:49] [BEAT] [alice] load=0.74 mem=3354/3794MB temp=31.6C disk=80%
|
||||
[2026-03-15 22:57:49] [BEAT] [alice] load=0.74 mem=3354/3794MB temp=32.1C disk=80%
|
||||
|
||||
@@ -1 +1 @@
|
||||
{"node":"alice","ts":"2026-03-16T03:26:52Z","load":0.59,"mem_free_mb":3370,"mem_total_mb":3794,"temp_c":35.5,"disk_pct":80,"throttle":"0x0"}
|
||||
{"node":"alice","ts":"2026-03-16T03:57:49Z","load":0.74,"mem_free_mb":3354,"mem_total_mb":3794,"temp_c":32.1,"disk_pct":80,"throttle":"0x0"}
|
||||
|
||||
@@ -22,7 +22,7 @@ MY_IP=$(hostname -I 2>/dev/null | awk '{print $1}' || echo "unknown")
|
||||
case "$MY_IP" in
|
||||
192.168.4.49*) NODE_NAME="alice" ;;
|
||||
192.168.4.96*) NODE_NAME="cecilia" ;;
|
||||
192.168.4.100*) NODE_NAME="octavia" ;;
|
||||
192.168.4.101*) NODE_NAME="octavia" ;;
|
||||
192.168.4.98*) NODE_NAME="aria" ;;
|
||||
192.168.4.38*) NODE_NAME="lucidia" ;;
|
||||
esac
|
||||
@@ -41,7 +41,7 @@ mkdir -p "$AUTONOMY_DIR"/{state,fleet,health,tasks/pending,tasks/completed,logs}
|
||||
declare -A FLEET_IPS
|
||||
FLEET_IPS[alice]="192.168.4.49"
|
||||
FLEET_IPS[cecilia]="192.168.4.96"
|
||||
FLEET_IPS[octavia]="192.168.4.100"
|
||||
FLEET_IPS[octavia]="192.168.4.101"
|
||||
FLEET_IPS[aria]="192.168.4.98"
|
||||
FLEET_IPS[lucidia]="192.168.4.38"
|
||||
FLEET_IPS[anastasia]="174.138.44.45"
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
LISTEN 0 5 0.0.0.0:8184 0.0.0.0:*
|
||||
LISTEN 0 5 0.0.0.0:8095 0.0.0.0:* users:(("python3",pid=608,fd=3))
|
||||
LISTEN 0 5 0.0.0.0:8095 0.0.0.0:* users:(("python3",pid=18662,fd=3))
|
||||
LISTEN 0 511 0.0.0.0:8080 0.0.0.0:*
|
||||
LISTEN 0 511 0.0.0.0:8083 0.0.0.0:* users:(("node /usr/lib/n",pid=3890,fd=20))
|
||||
LISTEN 0 511 127.0.0.1:6379 0.0.0.0:*
|
||||
LISTEN 0 511 0.0.0.0:8083 0.0.0.0:* users:(("node /usr/lib/n",pid=3890,fd=20))
|
||||
LISTEN 0 5 0.0.0.0:4010 0.0.0.0:* users:(("python3",pid=610,fd=3))
|
||||
LISTEN 0 5 0.0.0.0:8011 0.0.0.0:* users:(("python3",pid=828,fd=3))
|
||||
LISTEN 0 5 0.0.0.0:8010 0.0.0.0:*
|
||||
@@ -10,7 +10,7 @@ LISTEN 0 5 0.0.0.0:8013 0.0.0.0:* users:(("python3",pid
|
||||
LISTEN 0 5 0.0.0.0:8012 0.0.0.0:*
|
||||
LISTEN 0 5 0.0.0.0:8014 0.0.0.0:*
|
||||
LISTEN 0 2048 0.0.0.0:8001 0.0.0.0:* users:(("python3",pid=617,fd=6))
|
||||
LISTEN 0 5 0.0.0.0:7890 0.0.0.0:* users:(("python3",pid=21795,fd=5))
|
||||
LISTEN 0 5 0.0.0.0:7890 0.0.0.0:* users:(("python3",pid=31841,fd=5))
|
||||
LISTEN 0 200 0.0.0.0:443 0.0.0.0:*
|
||||
LISTEN 0 1024 0.0.0.0:6333 0.0.0.0:*
|
||||
LISTEN 0 128 0.0.0.0:6334 0.0.0.0:*
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
avahi-daemon.service
|
||||
blackroad-agent.service
|
||||
blackroad-agents-proxy.service
|
||||
blackroad-agents.service
|
||||
blackroad-nats-agent.service
|
||||
blackroad-operator.service
|
||||
blackroad-salesforce-agent.service
|
||||
blackroad-status.service
|
||||
blackroad-task-queue-v2.service
|
||||
blackroad-task-worker.service
|
||||
@@ -26,6 +26,7 @@ prism-agent.service
|
||||
qdrant.service
|
||||
redis-server.service
|
||||
rng-tools-debian.service
|
||||
road-phone.service
|
||||
roadnet-failover.service
|
||||
rsyslog.service
|
||||
rtkit-daemon.service
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
{
|
||||
"hostname": "alice",
|
||||
"ts": "2026-03-16T03:26:55Z",
|
||||
"uptime_seconds": 2553,
|
||||
"ts": "2026-03-16T03:57:50Z",
|
||||
"uptime_seconds": 4408,
|
||||
"kernel": "6.1.21-v8+",
|
||||
"temp_c": 35.0,
|
||||
"temp_c": 32.1,
|
||||
"memory_mb": {
|
||||
"total": 3794,
|
||||
"used": 332,
|
||||
"free": 3373
|
||||
"used": 339,
|
||||
"free": 3355
|
||||
},
|
||||
"disk": "11G/15G (80%)",
|
||||
"load": [
|
||||
0.63,
|
||||
0.89,
|
||||
0.91
|
||||
0.74,
|
||||
1.03,
|
||||
1.13
|
||||
],
|
||||
"ollama_models": [
|
||||
"qwen2.5:3b",
|
||||
|
||||
@@ -11,7 +11,7 @@ LISTEN 0 5 0.0.0.0:8787 0.0.0.0:* users:(("python3",pid
|
||||
LISTEN 0 511 0.0.0.0:80 0.0.0.0:* users:(("nginx",pid=3461172,fd=8),("nginx",pid=3461171,fd=8))
|
||||
LISTEN 0 4096 0.0.0.0:111 0.0.0.0:* users:(("rpcbind",pid=589,fd=4),("systemd",pid=1,fd=127))
|
||||
LISTEN 0 4096 *:8080 *:* users:(("headscale",pid=2341808,fd=12))
|
||||
LISTEN 0 511 *:3000 *:* users:(("node /srv/hello",pid=1765254,fd=19))
|
||||
LISTEN 0 511 *:3000 *:* users:(("node /srv/hello",pid=1771278,fd=19))
|
||||
LISTEN 0 511 *:3001 *:* users:(("node",pid=757,fd=21))
|
||||
LISTEN 0 128 [::]:22 [::]:* users:(("sshd",pid=991,fd=8))
|
||||
LISTEN 0 511 [::]:80 [::]:* users:(("nginx",pid=3461172,fd=9),("nginx",pid=3461171,fd=9))
|
||||
|
||||
@@ -4,7 +4,6 @@ chronyd.service
|
||||
cloudflared.service
|
||||
containerd.service
|
||||
crond.service
|
||||
dbus-:1.1-org.fedoraproject.SetroubleshootPrivileged@73368.service
|
||||
dbus-broker.service
|
||||
docker.service
|
||||
droplet-agent.service
|
||||
@@ -21,7 +20,6 @@ polkit.service
|
||||
rpcbind.service
|
||||
rsyslog.service
|
||||
serial-getty@ttyS0.service
|
||||
setroubleshootd.service
|
||||
sshd.service
|
||||
systemd-journald.service
|
||||
systemd-logind.service
|
||||
|
||||
@@ -1,22 +1,22 @@
|
||||
{
|
||||
"hostname": "anastasia",
|
||||
"ts": "2026-03-16T03:26:53Z",
|
||||
"uptime_seconds": 6773506,
|
||||
"ts": "2026-03-16T03:57:50Z",
|
||||
"uptime_seconds": 6775363,
|
||||
"kernel": "5.14.0-651.el9.x86_64",
|
||||
"temp_c": 0,
|
||||
"memory_mb": {
|
||||
"total": 765,
|
||||
"used": 566,
|
||||
"free": 198
|
||||
"used": 482,
|
||||
"free": 282
|
||||
},
|
||||
"disk": "18G/25G (69%)",
|
||||
"load": [
|
||||
0.48,
|
||||
0.18,
|
||||
0.09
|
||||
0.01,
|
||||
0.03,
|
||||
0.02
|
||||
],
|
||||
"ollama_models": [],
|
||||
"throttle": "N/A",
|
||||
"voltage": "N/A",
|
||||
"services_running": 30
|
||||
"services_running": 28
|
||||
}
|
||||
|
||||
@@ -1 +1 @@
|
||||
{"node":"aria","status":"down","ts":"2026-03-16T03:26:52Z"}
|
||||
{"node":"aria","status":"down","ts":"2026-03-16T03:57:49Z"}
|
||||
|
||||
@@ -1,50 +1,50 @@
|
||||
[2026-03-15 22:04:02] [BEAT] [cecilia] load=0.37 mem=6325/8062MB temp=37.0C disk=19%
|
||||
[2026-03-15 22:05:01] [BEAT] [cecilia] load=0.37 mem=6286/8062MB temp=38.6C disk=19%
|
||||
[2026-03-15 22:05:03] [DIAL] [cecilia] Switchboard unreachable
|
||||
[2026-03-15 22:06:01] [BEAT] [cecilia] load=0.26 mem=6291/8062MB temp=37.5C disk=19%
|
||||
[2026-03-15 22:06:16] [BEAT] [cecilia] load=0.27 mem=6287/8062MB temp=37.5C disk=19%
|
||||
[2026-03-15 22:06:16] [BEAT] [cecilia] load=0.27 mem=6287/8062MB temp=38.0C disk=19%
|
||||
[2026-03-15 22:07:01] [BEAT] [cecilia] load=0.33 mem=6297/8062MB temp=38.6C disk=19%
|
||||
[2026-03-15 22:08:01] [BEAT] [cecilia] load=0.19 mem=6295/8062MB temp=39.1C disk=19%
|
||||
[2026-03-15 22:09:01] [BEAT] [cecilia] load=0.54 mem=4704/8062MB temp=44.6C disk=19%
|
||||
[2026-03-15 22:10:01] [FLEET] [cecilia] Starting cross-node health check
|
||||
[2026-03-15 22:10:01] [BEAT] [cecilia] load=2.73 mem=4786/8062MB temp=42.5C disk=19%
|
||||
[2026-03-15 22:10:02] [FLEET] [cecilia] alice: UP temp=36C mem=3350MB disk=80%
|
||||
[2026-03-15 22:10:04] [FLEET] [cecilia] octavia: DOWN (no ping response)
|
||||
[2026-03-15 22:10:05] [FLEET] [cecilia] gematria: UP temp=C mem=4213MB disk=67%
|
||||
[2026-03-15 22:10:06] [FLEET] [cecilia] lucidia: UP temp=58C mem=3614MB disk=33%
|
||||
[2026-03-15 22:10:08] [FLEET] [cecilia] aria: DOWN (no ping response)
|
||||
[2026-03-15 22:10:09] [FLEET] [cecilia] anastasia: UP temp=C mem=226MB disk=69%
|
||||
[2026-03-15 22:11:01] [BEAT] [cecilia] load=1.92 mem=4862/8062MB temp=42.5C disk=19%
|
||||
[2026-03-15 22:11:24] [BEAT] [cecilia] load=1.45 mem=4870/8062MB temp=42.5C disk=19%
|
||||
[2026-03-15 22:11:24] [BEAT] [cecilia] load=1.45 mem=4867/8062MB temp=43.0C disk=19%
|
||||
[2026-03-15 22:12:01] [BEAT] [cecilia] load=0.93 mem=4876/8062MB temp=41.9C disk=19%
|
||||
[2026-03-15 22:13:01] [BEAT] [cecilia] load=0.46 mem=4880/8062MB temp=41.4C disk=19%
|
||||
[2026-03-15 22:14:01] [BEAT] [cecilia] load=0.27 mem=4874/8062MB temp=41.4C disk=19%
|
||||
[2026-03-15 22:15:01] [BEAT] [cecilia] load=0.26 mem=4831/8062MB temp=42.5C disk=19%
|
||||
[2026-03-15 22:15:46] [DIAL] [cecilia] Switchboard unreachable
|
||||
[2026-03-15 22:16:01] [BEAT] [cecilia] load=0.51 mem=4879/8062MB temp=41.4C disk=19%
|
||||
[2026-03-15 22:16:34] [BEAT] [cecilia] load=0.44 mem=4879/8062MB temp=41.9C disk=19%
|
||||
[2026-03-15 22:16:34] [BEAT] [cecilia] load=0.44 mem=4875/8062MB temp=40.8C disk=19%
|
||||
[2026-03-15 22:17:01] [BEAT] [cecilia] load=0.27 mem=4870/8062MB temp=41.4C disk=19%
|
||||
[2026-03-15 22:18:01] [BEAT] [cecilia] load=0.23 mem=4879/8062MB temp=40.8C disk=19%
|
||||
[2026-03-15 22:19:01] [BEAT] [cecilia] load=0.24 mem=4865/8062MB temp=40.2C disk=19%
|
||||
[2026-03-15 22:20:01] [FLEET] [cecilia] Starting cross-node health check
|
||||
[2026-03-15 22:20:01] [BEAT] [cecilia] load=0.15 mem=4831/8062MB temp=41.4C disk=19%
|
||||
[2026-03-15 22:20:02] [FLEET] [cecilia] alice: UP temp=35C mem=3364MB disk=80%
|
||||
[2026-03-15 22:20:04] [FLEET] [cecilia] octavia: DOWN (no ping response)
|
||||
[2026-03-15 22:20:05] [DIAL] [cecilia] Switchboard unreachable
|
||||
[2026-03-15 22:20:05] [FLEET] [cecilia] gematria: UP temp=C mem=4209MB disk=67%
|
||||
[2026-03-15 22:20:06] [FLEET] [cecilia] lucidia: UP temp=64C mem=3522MB disk=33%
|
||||
[2026-03-15 22:20:08] [FLEET] [cecilia] aria: DOWN (no ping response)
|
||||
[2026-03-15 22:20:09] [FLEET] [cecilia] anastasia: UP temp=C mem=277MB disk=69%
|
||||
[2026-03-15 22:21:01] [BEAT] [cecilia] load=0.19 mem=4867/8062MB temp=40.2C disk=19%
|
||||
[2026-03-15 22:21:42] [BEAT] [cecilia] load=0.42 mem=4868/8062MB temp=41.4C disk=19%
|
||||
[2026-03-15 22:21:42] [BEAT] [cecilia] load=0.42 mem=4864/8062MB temp=41.4C disk=19%
|
||||
[2026-03-15 22:22:01] [BEAT] [cecilia] load=0.39 mem=4858/8062MB temp=41.4C disk=19%
|
||||
[2026-03-15 22:23:01] [BEAT] [cecilia] load=0.62 mem=4857/8062MB temp=40.8C disk=19%
|
||||
[2026-03-15 22:24:01] [BEAT] [cecilia] load=0.44 mem=4860/8062MB temp=40.8C disk=19%
|
||||
[2026-03-15 22:25:01] [BEAT] [cecilia] load=0.57 mem=4839/8062MB temp=40.2C disk=19%
|
||||
[2026-03-15 22:26:01] [BEAT] [cecilia] load=0.63 mem=4864/8062MB temp=39.7C disk=19%
|
||||
[2026-03-15 22:26:51] [BEAT] [cecilia] load=0.43 mem=4863/8062MB temp=40.8C disk=19%
|
||||
[2026-03-15 22:26:51] [BEAT] [cecilia] load=0.43 mem=4859/8062MB temp=39.7C disk=19%
|
||||
[2026-03-15 22:35:01] [BEAT] [cecilia] load=0.48 mem=4824/8062MB temp=39.7C disk=19%
|
||||
[2026-03-15 22:35:05] [DIAL] [cecilia] Switchboard unreachable
|
||||
[2026-03-15 22:36:01] [BEAT] [cecilia] load=1.19 mem=5491/8062MB temp=48.5C disk=19%
|
||||
[2026-03-15 22:37:01] [BEAT] [cecilia] load=3.25 mem=5517/8062MB temp=51.8C disk=19%
|
||||
[2026-03-15 22:37:09] [BEAT] [cecilia] load=3.44 mem=5518/8062MB temp=51.2C disk=19%
|
||||
[2026-03-15 22:37:09] [BEAT] [cecilia] load=3.44 mem=5517/8062MB temp=51.2C disk=19%
|
||||
[2026-03-15 22:38:01] [BEAT] [cecilia] load=3.77 mem=5515/8062MB temp=53.5C disk=19%
|
||||
[2026-03-15 22:39:01] [BEAT] [cecilia] load=4.18 mem=5525/8062MB temp=55.1C disk=19%
|
||||
[2026-03-15 22:40:02] [FLEET] [cecilia] Starting cross-node health check
|
||||
[2026-03-15 22:40:02] [BEAT] [cecilia] load=4.46 mem=5488/8062MB temp=56.2C disk=19%
|
||||
[2026-03-15 22:40:03] [FLEET] [cecilia] alice: UP temp=36C mem=3367MB disk=80%
|
||||
[2026-03-15 22:40:03] [FLEET] [cecilia] octavia: UP temp=36C mem=6758MB disk=67%
|
||||
[2026-03-15 22:40:05] [FLEET] [cecilia] gematria: UP temp=C mem=4193MB disk=67%
|
||||
[2026-03-15 22:40:05] [FLEET] [cecilia] lucidia: UP temp=61C mem=3513MB disk=33%
|
||||
[2026-03-15 22:40:07] [FLEET] [cecilia] aria: DOWN (no ping response)
|
||||
[2026-03-15 22:40:08] [FLEET] [cecilia] anastasia: UP temp=C mem=275MB disk=69%
|
||||
[2026-03-15 22:41:01] [BEAT] [cecilia] load=3.20 mem=5568/8062MB temp=46.3C disk=19%
|
||||
[2026-03-15 22:42:01] [BEAT] [cecilia] load=1.35 mem=5568/8062MB temp=44.6C disk=19%
|
||||
[2026-03-15 22:42:19] [BEAT] [cecilia] load=1.26 mem=5562/8062MB temp=44.1C disk=19%
|
||||
[2026-03-15 22:42:19] [BEAT] [cecilia] load=1.26 mem=5560/8062MB temp=45.2C disk=19%
|
||||
[2026-03-15 22:43:01] [BEAT] [cecilia] load=0.72 mem=5536/8062MB temp=43.5C disk=19%
|
||||
[2026-03-15 22:44:01] [BEAT] [cecilia] load=0.60 mem=5275/8062MB temp=44.6C disk=19%
|
||||
[2026-03-15 22:45:01] [BEAT] [cecilia] load=0.53 mem=5284/8062MB temp=44.1C disk=19%
|
||||
[2026-03-15 22:45:47] [DIAL] [cecilia] Switchboard unreachable
|
||||
[2026-03-15 22:46:01] [BEAT] [cecilia] load=2.22 mem=5141/8062MB temp=45.8C disk=19%
|
||||
[2026-03-15 22:47:01] [BEAT] [cecilia] load=0.97 mem=5082/8062MB temp=43.5C disk=19%
|
||||
[2026-03-15 22:47:29] [BEAT] [cecilia] load=0.65 mem=5077/8062MB temp=42.5C disk=19%
|
||||
[2026-03-15 22:47:29] [BEAT] [cecilia] load=0.65 mem=5080/8062MB temp=43.0C disk=19%
|
||||
[2026-03-15 22:48:01] [BEAT] [cecilia] load=0.60 mem=5079/8062MB temp=43.0C disk=19%
|
||||
[2026-03-15 22:49:01] [BEAT] [cecilia] load=0.38 mem=5078/8062MB temp=41.4C disk=19%
|
||||
[2026-03-15 22:50:01] [FLEET] [cecilia] Starting cross-node health check
|
||||
[2026-03-15 22:50:01] [BEAT] [cecilia] load=0.19 mem=5045/8062MB temp=41.9C disk=19%
|
||||
[2026-03-15 22:50:01] [FLEET] [cecilia] alice: UP temp=36C mem=3349MB disk=80%
|
||||
[2026-03-15 22:50:02] [FLEET] [cecilia] octavia: UP temp=36C mem=6744MB disk=67%
|
||||
[2026-03-15 22:50:03] [FLEET] [cecilia] gematria: UP temp=C mem=4197MB disk=67%
|
||||
[2026-03-15 22:50:03] [FLEET] [cecilia] lucidia: UP temp=55C mem=3518MB disk=33%
|
||||
[2026-03-15 22:50:05] [DIAL] [cecilia] Switchboard unreachable
|
||||
[2026-03-15 22:50:05] [FLEET] [cecilia] aria: DOWN (no ping response)
|
||||
[2026-03-15 22:50:06] [FLEET] [cecilia] anastasia: UP temp=C mem=275MB disk=69%
|
||||
[2026-03-15 22:51:01] [BEAT] [cecilia] load=0.15 mem=5090/8062MB temp=40.8C disk=19%
|
||||
[2026-03-15 22:52:01] [BEAT] [cecilia] load=1.28 mem=5085/8062MB temp=39.7C disk=19%
|
||||
[2026-03-15 22:52:39] [BEAT] [cecilia] load=1.66 mem=5087/8062MB temp=40.2C disk=19%
|
||||
[2026-03-15 22:52:39] [BEAT] [cecilia] load=1.66 mem=5083/8062MB temp=39.7C disk=19%
|
||||
[2026-03-15 22:53:01] [BEAT] [cecilia] load=1.77 mem=5092/8062MB temp=39.7C disk=19%
|
||||
[2026-03-15 22:54:01] [BEAT] [cecilia] load=0.87 mem=5089/8062MB temp=38.6C disk=19%
|
||||
[2026-03-15 22:55:02] [BEAT] [cecilia] load=0.45 mem=5057/8062MB temp=39.7C disk=19%
|
||||
[2026-03-15 22:56:01] [BEAT] [cecilia] load=0.25 mem=5154/8062MB temp=39.1C disk=19%
|
||||
[2026-03-15 22:57:01] [BEAT] [cecilia] load=0.13 mem=5146/8062MB temp=39.7C disk=19%
|
||||
[2026-03-15 22:57:48] [BEAT] [cecilia] load=0.37 mem=5197/8062MB temp=39.1C disk=19%
|
||||
[2026-03-15 22:57:48] [BEAT] [cecilia] load=0.37 mem=5194/8062MB temp=38.6C disk=19%
|
||||
|
||||
@@ -1 +1 @@
|
||||
{"node":"cecilia","ts":"2026-03-16T03:26:51Z","load":0.43,"mem_free_mb":4859,"mem_total_mb":8062,"temp_c":39.7,"disk_pct":19,"throttle":"0x50000"}
|
||||
{"node":"cecilia","ts":"2026-03-16T03:57:48Z","load":0.37,"mem_free_mb":5194,"mem_total_mb":8062,"temp_c":38.6,"disk_pct":19,"throttle":"0x50000"}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -22,7 +22,7 @@ MY_IP=$(hostname -I 2>/dev/null | awk '{print $1}' || echo "unknown")
|
||||
case "$MY_IP" in
|
||||
192.168.4.49*) NODE_NAME="alice" ;;
|
||||
192.168.4.96*) NODE_NAME="cecilia" ;;
|
||||
192.168.4.100*) NODE_NAME="octavia" ;;
|
||||
192.168.4.101*) NODE_NAME="octavia" ;;
|
||||
192.168.4.98*) NODE_NAME="aria" ;;
|
||||
192.168.4.38*) NODE_NAME="lucidia" ;;
|
||||
esac
|
||||
@@ -41,7 +41,7 @@ mkdir -p "$AUTONOMY_DIR"/{state,fleet,health,tasks/pending,tasks/completed,logs}
|
||||
declare -A FLEET_IPS
|
||||
FLEET_IPS[alice]="192.168.4.49"
|
||||
FLEET_IPS[cecilia]="192.168.4.96"
|
||||
FLEET_IPS[octavia]="192.168.4.100"
|
||||
FLEET_IPS[octavia]="192.168.4.101"
|
||||
FLEET_IPS[aria]="192.168.4.98"
|
||||
FLEET_IPS[lucidia]="192.168.4.38"
|
||||
FLEET_IPS[anastasia]="174.138.44.45"
|
||||
|
||||
@@ -31,7 +31,7 @@ class OllamaClient:
|
||||
if self._session:
|
||||
await self._session.close()
|
||||
|
||||
async def generate(self, model: str, prompt: str, timeout: int = 120) -> str:
|
||||
async def generate(self, model: str, prompt: str, timeout: int = 180) -> str:
|
||||
"""Run inference through Ollama. Respects concurrency semaphore."""
|
||||
async with self._semaphore:
|
||||
try:
|
||||
@@ -41,7 +41,7 @@ class OllamaClient:
|
||||
"model": model,
|
||||
"prompt": prompt,
|
||||
"stream": False,
|
||||
"options": {"num_predict": 1024, "temperature": 0.7},
|
||||
"options": {"num_predict": 512, "temperature": 0.7},
|
||||
},
|
||||
timeout=aiohttp.ClientTimeout(total=timeout),
|
||||
) as resp:
|
||||
|
||||
@@ -17,6 +17,8 @@ from .config import CONTROLLER_HOST, CONTROLLER_PORT, TASKS_DB
|
||||
from .spawn import SpawnScheduler
|
||||
from .nats_protocol import NATSBus, TaskMessage, ResultMessage, HeartbeatMessage
|
||||
from .router import TaskRouter
|
||||
from .pipelines import PipelineExecutor, Pipeline, PipelineStep, BUILTIN_PIPELINES
|
||||
from .jobs import JobScheduler, WorkerIntegration
|
||||
|
||||
log = logging.getLogger("orchestrator.controller")
|
||||
|
||||
@@ -24,6 +26,9 @@ log = logging.getLogger("orchestrator.controller")
|
||||
scheduler = SpawnScheduler()
|
||||
bus = NATSBus()
|
||||
router = TaskRouter()
|
||||
pipeline_executor: PipelineExecutor | None = None
|
||||
job_scheduler: JobScheduler | None = None
|
||||
worker_integration: WorkerIntegration | None = None
|
||||
|
||||
# Task result store (in-memory, backed by SQLite)
|
||||
_results: dict[str, ResultMessage] = {}
|
||||
@@ -56,6 +61,8 @@ def _init_tasks_db():
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
global pipeline_executor, job_scheduler, worker_integration
|
||||
|
||||
_init_tasks_db()
|
||||
await bus.connect()
|
||||
|
||||
@@ -66,8 +73,24 @@ async def lifespan(app: FastAPI):
|
||||
# Start health check loop
|
||||
asyncio.create_task(_health_check_loop())
|
||||
|
||||
log.info("Controller started on %s:%d", CONTROLLER_HOST, CONTROLLER_PORT)
|
||||
# Initialize pipelines
|
||||
pipeline_executor = PipelineExecutor(bus)
|
||||
|
||||
# Initialize job scheduler
|
||||
job_scheduler = JobScheduler(bus)
|
||||
await job_scheduler.start()
|
||||
|
||||
# Initialize Worker integration
|
||||
worker_integration = WorkerIntegration()
|
||||
await worker_integration.init()
|
||||
|
||||
log.info("Controller started on %s:%d (pipelines, jobs, integrations)", CONTROLLER_HOST, CONTROLLER_PORT)
|
||||
yield
|
||||
|
||||
if job_scheduler:
|
||||
await job_scheduler.stop()
|
||||
if worker_integration:
|
||||
await worker_integration.close()
|
||||
await bus.disconnect()
|
||||
log.info("Controller stopped")
|
||||
|
||||
@@ -90,7 +113,10 @@ app.add_middleware(
|
||||
# --- Handlers ---
|
||||
|
||||
async def _handle_result(result: ResultMessage):
|
||||
"""Process task results from nodes."""
|
||||
"""Process task results from nodes. Also forwards to pipeline executor."""
|
||||
# Forward to pipeline executor for chained steps
|
||||
if pipeline_executor:
|
||||
pipeline_executor.on_result(result)
|
||||
_results[result.task_id] = result
|
||||
_pending_tasks.pop(result.task_id, None)
|
||||
|
||||
@@ -288,7 +314,120 @@ async def health():
|
||||
"""Health check endpoint."""
|
||||
return {
|
||||
"status": "ok",
|
||||
"version": "1.0.0",
|
||||
"version": "2.0.0",
|
||||
"nodes": router.cluster_stats(),
|
||||
"pools": scheduler.pool_stats()["total_agents"],
|
||||
}
|
||||
|
||||
|
||||
# --- Pipeline API ---
|
||||
|
||||
class PipelineRequest(BaseModel):
|
||||
pipeline: str = "" # Name of builtin pipeline
|
||||
input: str = "" # Initial input
|
||||
steps: list[dict] | None = None # Custom steps [{archetype, prompt_template, intent}]
|
||||
|
||||
@app.post("/api/pipelines")
|
||||
async def run_pipeline(req: PipelineRequest):
|
||||
"""Execute a multi-step agent pipeline."""
|
||||
if not pipeline_executor:
|
||||
raise HTTPException(status_code=503, detail="Pipeline executor not ready")
|
||||
|
||||
if req.pipeline:
|
||||
pipe = pipeline_executor.get_builtin(req.pipeline)
|
||||
if not pipe:
|
||||
raise HTTPException(status_code=404, detail=f"Pipeline '{req.pipeline}' not found. Available: {pipeline_executor.list_pipelines()}")
|
||||
elif req.steps:
|
||||
pipe = Pipeline(
|
||||
name="custom",
|
||||
steps=[PipelineStep(**s) for s in req.steps],
|
||||
)
|
||||
else:
|
||||
raise HTTPException(status_code=400, detail="Provide 'pipeline' name or 'steps' array")
|
||||
|
||||
if not req.input:
|
||||
raise HTTPException(status_code=400, detail="'input' is required")
|
||||
|
||||
# Run pipeline in background, return immediately
|
||||
task = asyncio.create_task(pipeline_executor.execute(pipe, req.input))
|
||||
|
||||
return {
|
||||
"pipeline_id": pipe.pipeline_id,
|
||||
"name": pipe.name,
|
||||
"steps": len(pipe.steps),
|
||||
"status": "running",
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/pipelines")
|
||||
async def list_pipelines():
|
||||
"""List available pipelines."""
|
||||
return {
|
||||
"builtin": [
|
||||
{"name": k, "steps": len(v.steps), "archetypes": [s.archetype for s in v.steps]}
|
||||
for k, v in BUILTIN_PIPELINES.items()
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
# --- Jobs API ---
|
||||
|
||||
@app.get("/api/jobs")
|
||||
async def list_jobs():
|
||||
"""List recurring jobs and their status."""
|
||||
if not job_scheduler:
|
||||
return {"jobs": []}
|
||||
return {"jobs": job_scheduler.list_jobs()}
|
||||
|
||||
|
||||
@app.post("/api/jobs/{name}/toggle")
async def toggle_job(name: str):
    """Enable or disable a recurring job."""
    if not job_scheduler:
        raise HTTPException(status_code=503, detail="Job scheduler not ready")
    toggled = job_scheduler.toggle_job(name)
    if not toggled:
        raise HTTPException(status_code=404, detail=f"Job '{name}' not found")
    return {"ok": True, "name": name}
|
||||
|
||||
|
||||
# --- Worker Integration API ---
|
||||
|
||||
@app.get("/api/workers/health")
async def worker_health():
    """Check health of all Cloudflare Workers."""
    if worker_integration is None:
        raise HTTPException(status_code=503, detail="Worker integration not ready")
    return await worker_integration.check_all_workers()
|
||||
|
||||
|
||||
@app.get("/api/workers/search/stats")
async def search_stats():
    """Get search engine stats."""
    if worker_integration is None:
        raise HTTPException(status_code=503, detail="Not ready")
    return await worker_integration.get_search_stats()
|
||||
|
||||
|
||||
@app.get("/api/workers/fleet")
async def fleet_status():
    """Get fleet status from fleet API."""
    if worker_integration is None:
        raise HTTPException(status_code=503, detail="Not ready")
    return await worker_integration.get_fleet_status()
|
||||
|
||||
|
||||
@app.post("/api/tasks/batch")
async def submit_batch(tasks: list[TaskRequest]):
    """Submit multiple tasks at once (at most 50 per batch; extras are dropped)."""
    accepted = []
    for req in tasks[:50]:  # Max 50 per batch
        task_id = f"task-{uuid.uuid4().hex[:12]}"
        node = req.target_node or router.best_node(req.archetype) or ""
        msg = TaskMessage(
            task_id=task_id,
            archetype=req.archetype,
            intent=req.intent,
            prompt=req.prompt,
            priority=req.priority,
            target_node=node,
        )
        await bus.publish_task(msg)
        _pending_tasks[task_id] = msg
        accepted.append(
            {"task_id": task_id, "archetype": req.archetype, "target_node": node}
        )
    return {"submitted": len(accepted), "tasks": accepted}
|
||||
|
||||
207
fleet/cecilia/opt-blackroad/orchestrator/jobs.py
Normal file
207
fleet/cecilia/opt-blackroad/orchestrator/jobs.py
Normal file
@@ -0,0 +1,207 @@
|
||||
"""
|
||||
BlackRoad Agent Orchestrator — Recurring Jobs & Worker Integrations
|
||||
Schedule agents to do real work: reindex search, collect analytics, monitor fleet.
|
||||
"""
|
||||
import asyncio
|
||||
import time
|
||||
import logging
|
||||
import aiohttp
|
||||
from dataclasses import dataclass
|
||||
from .nats_protocol import NATSBus, TaskMessage
|
||||
|
||||
log = logging.getLogger("orchestrator.jobs")
|
||||
|
||||
|
||||
@dataclass
class RecurringJob:
    """A prompt submitted to an agent archetype at a fixed interval."""
    name: str                 # unique key in JobScheduler.jobs
    archetype: str            # agent archetype that handles the prompt
    prompt: str               # the prompt sent on every run
    interval_seconds: int     # seconds between runs
    intent: str = "scheduled"
    enabled: bool = True      # disabled jobs are skipped by the scheduler loop
    last_run: float = 0.0     # Unix timestamp of last submission (0 = never)
    run_count: int = 0        # how many times this job has been submitted
|
||||
|
||||
|
||||
# Built-in recurring jobs
|
||||
# Built-in recurring jobs.
# Intervals are seconds; JobScheduler staggers the initial runs so the
# jobs don't all fire at once.  Archetype choice trades answer quality
# for model speed — see per-job comments.
RECURRING_JOBS = [
    RecurringJob(
        name="fleet-health-check",
        archetype="security",  # Uses deepseek-r1:1.5b (fast)
        prompt="Check BlackRoad infrastructure health. List which services are up or down. Be brief — one line per service.",
        interval_seconds=600,  # Every 10 minutes
    ),
    RecurringJob(
        name="security-scan",
        archetype="security",
        prompt="Brief security check: any concerns with the BlackRoad fleet? Check auth, network, and access patterns. 3 bullet points max.",
        interval_seconds=3600,  # Every hour
    ),
    RecurringJob(
        name="code-index-refresh",
        archetype="coder",  # Uses qwen2.5-coder:3b (medium speed)
        prompt="What are the most important recent changes across BlackRoad repos? Summarize in 5 bullet points.",
        interval_seconds=1800,  # Every 30 minutes
    ),
    RecurringJob(
        name="analytics-digest",
        archetype="security",  # Fast model for simple analysis
        prompt="Summarize BlackRoad OS usage: estimated active users, top services, any anomalies. Keep it to 3 lines.",
        interval_seconds=3600,  # Every hour
    ),
    RecurringJob(
        name="creative-brief",
        archetype="security",  # Use fast model
        prompt="Write one motivational sentence about building sovereign infrastructure. Keep it under 20 words.",
        interval_seconds=7200,  # Every 2 hours
        enabled=False,  # Disabled by default — nice-to-have
    ),
]
|
||||
|
||||
|
||||
class JobScheduler:
    """Runs recurring agent jobs on schedule.

    Jobs are submitted to the orchestrator over the NATS bus; a single
    background asyncio task polls the job table every 30 seconds.
    """

    def __init__(self, bus: NATSBus):
        self.bus = bus
        self.jobs = {j.name: j for j in RECURRING_JOBS}
        self._running = False
        self._task: asyncio.Task | None = None  # handle to the polling loop

    async def start(self):
        """Start the job scheduler loop. Staggers initial runs to avoid thundering herd."""
        if self._task is not None and not self._task.done():
            return  # already started; calling start() twice is a no-op
        self._running = True
        # Stagger initial runs — offset each job by 60s so they don't all
        # fire the moment the loop comes up.
        offset = 0
        for job in self.jobs.values():
            job.last_run = time.time() + offset  # Delay initial run
            offset += 60
        # BUG FIX: _run_loop was defined but never launched, so no job
        # ever actually ran. Launch it here and keep the task handle so
        # stop() can cancel it.
        self._task = asyncio.create_task(self._run_loop())
        log.info("Job scheduler started with %d jobs (staggered)", len(self.jobs))

    async def stop(self):
        """Stop the scheduler loop and cancel the background task."""
        self._running = False
        if self._task is not None:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None

    async def _run_loop(self):
        # Poll every 30s; a job fires once its interval has elapsed since
        # its last (possibly staggered-into-the-future) run timestamp.
        while self._running:
            now = time.time()
            for job in self.jobs.values():
                if not job.enabled:
                    continue
                if now - job.last_run >= job.interval_seconds:
                    await self._execute_job(job)
            await asyncio.sleep(30)  # Check every 30s

    async def _execute_job(self, job: RecurringJob):
        """Submit a job as a task to the orchestrator."""
        task = TaskMessage(
            task_id=f"job-{job.name}-{int(time.time())}",
            archetype=job.archetype,
            intent=job.intent,
            prompt=job.prompt,
            priority=7,  # Lower priority than user tasks
        )
        await self.bus.publish_task(task)
        job.last_run = time.time()
        job.run_count += 1
        log.info("Job %s submitted (run #%d)", job.name, job.run_count)

    def list_jobs(self) -> list[dict]:
        """Snapshot of every job's schedule, enable flag and counters."""
        return [
            {
                "name": j.name,
                "archetype": j.archetype,
                "interval": j.interval_seconds,
                "enabled": j.enabled,
                "last_run": j.last_run,
                "run_count": j.run_count,
            }
            for j in self.jobs.values()
        ]

    def toggle_job(self, name: str) -> bool:
        """Flip a job's enabled flag. Returns False if the name is unknown."""
        job = self.jobs.get(name)
        if not job:
            return False
        job.enabled = not job.enabled
        log.info("Job %s %s", name, "enabled" if job.enabled else "disabled")
        return True
|
||||
|
||||
|
||||
class WorkerIntegration:
    """Connect agent tasks to real Cloudflare Workers over HTTP.

    `init()` must be awaited before any request method is called; `close()`
    disposes of the shared aiohttp session.
    """

    def __init__(self):
        self._session: aiohttp.ClientSession | None = None

    async def init(self):
        """Create the shared HTTP session."""
        self._session = aiohttp.ClientSession()

    async def close(self):
        """Dispose of the HTTP session (safe to call when never initialized)."""
        if self._session:
            await self._session.close()
            self._session = None

    def _http(self):
        # ROBUSTNESS FIX: fail with a clear message instead of the cryptic
        # AttributeError ("'NoneType' object has no attribute 'get'") that
        # surfaced when init() was never awaited.
        if self._session is None:
            raise RuntimeError("WorkerIntegration.init() must be awaited before use")
        return self._session

    async def trigger_search_reindex(self, source: str = "github") -> dict:
        """Trigger search index rebuild via the index Worker."""
        async with self._http().post(
            f"https://index.blackroad.io/api/index?source={source}",
            timeout=aiohttp.ClientTimeout(total=30),
        ) as resp:
            return await resp.json()

    async def get_fleet_status(self) -> dict:
        """Pull fleet status from the fleet API."""
        async with self._http().get(
            "https://fleet-api.amundsonalexa.workers.dev/fleet",
            timeout=aiohttp.ClientTimeout(total=10),
        ) as resp:
            return await resp.json()

    async def get_search_stats(self) -> dict:
        """Pull search stats."""
        async with self._http().get(
            "https://search.blackroad.io/api/stats",
            timeout=aiohttp.ClientTimeout(total=5),
        ) as resp:
            return await resp.json()

    async def get_analytics(self) -> dict:
        """Pull analytics summary."""
        async with self._http().get(
            "https://analytics.blackroad.io/api/stats",
            timeout=aiohttp.ClientTimeout(total=5),
        ) as resp:
            return await resp.json()

    async def check_all_workers(self) -> dict:
        """Health check all Workers in parallel; returns {name: "up"|"down"}."""
        endpoints = {
            "auth": "https://auth.blackroad.io/api/health",
            "pay": "https://pay.blackroad.io/health",
            "search": "https://search.blackroad.io/api/health",
            "portal": "https://portal.blackroad.io/api/health",
            "chat": "https://chat.blackroad.io/api/health",
            "images": "https://images.blackroad.io/api/health",
            "index": "https://index.blackroad.io/api/health",
            "analytics": "https://analytics.blackroad.io/api/health",
            "stats": "https://stats.blackroad.io/health",
            "agents": "https://agents.blackroad.io/health",
            "fleet": "https://fleet.blackroad.io/health",
        }

        results = {}
        probes = [self._check_one(name, url) for name, url in endpoints.items()]
        # Collect in completion order; each probe returns (name, status).
        for coro in asyncio.as_completed(probes):
            name, status = await coro
            results[name] = status

        return results

    async def _check_one(self, name: str, url: str) -> tuple[str, str]:
        """Probe one endpoint; any error (timeout, DNS, non-200) counts as down."""
        try:
            async with self._http().get(url, timeout=aiohttp.ClientTimeout(total=5)) as resp:
                return (name, "up" if resp.status == 200 else "down")
        except Exception:
            return (name, "down")
|
||||
169
fleet/cecilia/opt-blackroad/orchestrator/pipelines.py
Normal file
169
fleet/cecilia/opt-blackroad/orchestrator/pipelines.py
Normal file
@@ -0,0 +1,169 @@
|
||||
"""
|
||||
BlackRoad Agent Orchestrator — Task Pipelines
|
||||
Chain multiple agents together. Output of one feeds into the next.
|
||||
"""
|
||||
import asyncio
|
||||
import uuid
|
||||
import time
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from .nats_protocol import NATSBus, TaskMessage, ResultMessage
|
||||
|
||||
log = logging.getLogger("orchestrator.pipelines")
|
||||
|
||||
|
||||
@dataclass
class PipelineStep:
    """One stage of a pipeline: an archetype plus a prompt template."""
    archetype: str            # agent archetype that handles this step
    prompt_template: str  # Use {input} for previous step's output
    intent: str = "pipeline"
    # Optional pin to a specific node; "" presumably leaves selection to
    # the bus/router — TODO confirm against the task-routing code.
    target_node: str = ""
|
||||
|
||||
|
||||
@dataclass
class Pipeline:
    """A named sequence of steps with execution state."""
    name: str                       # human-readable name, e.g. "Research Report"
    steps: list[PipelineStep]       # executed in order; output chains via {input}
    pipeline_id: str = ""           # auto-filled "pipe-<hex>" id (see __post_init__)
    status: str = "pending"         # pending -> running -> completed | failed
    results: list[ResultMessage] = field(default_factory=list)  # one per completed step
    created_at: float = 0.0         # Unix timestamp; auto-filled when left at 0

    def __post_init__(self):
        # Fill in defaults that need runtime values (uuid / wall clock).
        if not self.pipeline_id:
            self.pipeline_id = f"pipe-{uuid.uuid4().hex[:12]}"
        if not self.created_at:
            self.created_at = time.time()
|
||||
|
||||
|
||||
# Pre-built pipelines.
# These are templates: PipelineExecutor.get_builtin() hands out a fresh
# copy per run so each execution gets its own id/status/results.
BUILTIN_PIPELINES = {
    "research-report": Pipeline(
        name="Research Report",
        steps=[
            PipelineStep("researcher", "Research this topic thoroughly: {input}"),
            PipelineStep("analyst", "Analyze these research findings and extract key insights:\n{input}"),
            PipelineStep("creative", "Write a clear, engaging summary report based on this analysis:\n{input}"),
        ],
    ),
    "code-review": Pipeline(
        name="Code Review",
        steps=[
            PipelineStep("coder", "Review this code for bugs and improvements:\n{input}"),
            PipelineStep("security", "Check this code review for security vulnerabilities:\n{input}"),
            PipelineStep("coordinator", "Summarize the code review and security findings into actionable items:\n{input}"),
        ],
    ),
    "fleet-audit": Pipeline(
        name="Fleet Audit",
        steps=[
            PipelineStep("monitor", "Check the status of all BlackRoad infrastructure services: {input}"),
            PipelineStep("security", "Audit these infrastructure findings for security issues:\n{input}"),
            PipelineStep("analyst", "Produce a fleet health score and risk assessment:\n{input}"),
        ],
    ),
    "content-create": Pipeline(
        name="Content Creation",
        steps=[
            PipelineStep("researcher", "Research this topic for a blog post: {input}"),
            PipelineStep("creative", "Write an engaging blog post based on this research:\n{input}"),
            PipelineStep("coder", "Format this blog post as clean HTML with proper headings and structure:\n{input}"),
        ],
    ),
    "bug-fix": Pipeline(
        name="Bug Fix",
        steps=[
            PipelineStep("coder", "Analyze this bug report and identify the root cause:\n{input}"),
            PipelineStep("coder", "Write a fix for this bug based on the analysis:\n{input}"),
            PipelineStep("security", "Verify this fix doesn't introduce new vulnerabilities:\n{input}"),
        ],
    ),
}
|
||||
|
||||
|
||||
class PipelineExecutor:
    """Executes multi-step pipelines by chaining agent tasks.

    Each step is published as a task on the bus; `on_result()` (wired to
    the bus's result subscription) unblocks the waiting step. The output
    of one step becomes the {input} of the next.
    """

    # Seconds to wait for a single step before declaring the pipeline failed.
    STEP_TIMEOUT = 300

    def __init__(self, bus: NATSBus):
        self.bus = bus
        self._active: dict[str, Pipeline] = {}        # pipeline_id -> running pipeline
        self._results: dict[str, ResultMessage] = {}  # task_id -> result (consumed once)
        self._waiters: dict[str, asyncio.Event] = {}  # task_id -> wake-up event

    async def execute(self, pipeline: Pipeline, initial_input: str) -> Pipeline:
        """Execute a pipeline, chaining results through each step."""
        pipeline.status = "running"
        self._active[pipeline.pipeline_id] = pipeline
        try:
            return await self._run_steps(pipeline, initial_input)
        finally:
            # BUG FIX: the pipeline was only removed from _active on the
            # success path, so failed/timed-out pipelines leaked forever.
            self._active.pop(pipeline.pipeline_id, None)

    async def _run_steps(self, pipeline: Pipeline, current_input: str) -> Pipeline:
        """Drive each step in order; sets pipeline.status and returns it."""
        log.info("Pipeline %s started: %s (%d steps)", pipeline.pipeline_id, pipeline.name, len(pipeline.steps))

        for i, step in enumerate(pipeline.steps):
            step_num = i + 1
            task_id = f"{pipeline.pipeline_id}-step{step_num}"

            # Build prompt from template — {input} is the previous step's output.
            prompt = step.prompt_template.replace("{input}", current_input)

            task = TaskMessage(
                task_id=task_id,
                archetype=step.archetype,
                intent=step.intent,
                prompt=prompt,
                priority=2,
                target_node=step.target_node,
            )

            # Register the waiter BEFORE publishing so the result can never
            # arrive without an event to set.
            event = asyncio.Event()
            self._waiters[task_id] = event

            await self.bus.publish_task(task)
            log.info("Pipeline %s step %d/%d: %s task %s", pipeline.pipeline_id, step_num, len(pipeline.steps), step.archetype, task_id)

            try:
                await asyncio.wait_for(event.wait(), timeout=self.STEP_TIMEOUT)
            except asyncio.TimeoutError:
                # BUG FIX: drop the stale waiter so a result that never
                # arrives doesn't leak an Event per failed step.
                self._waiters.pop(task_id, None)
                log.error("Pipeline %s step %d timed out", pipeline.pipeline_id, step_num)
                pipeline.status = "failed"
                return pipeline

            # BUG FIX: pop (not get) so consumed results don't accumulate forever.
            result = self._results.pop(task_id, None)
            if not result or result.status != "completed":
                log.error("Pipeline %s step %d failed: %s", pipeline.pipeline_id, step_num, result.error if result else "no result")
                pipeline.status = "failed"
                return pipeline

            pipeline.results.append(result)
            current_input = result.result
            log.info("Pipeline %s step %d completed in %dms", pipeline.pipeline_id, step_num, result.latency_ms)

        pipeline.status = "completed"
        log.info("Pipeline %s completed: %d steps, total %dms",
                 pipeline.pipeline_id, len(pipeline.steps),
                 sum(r.latency_ms for r in pipeline.results))
        return pipeline

    def on_result(self, result: ResultMessage):
        """Called when a task result arrives. Unblocks pipeline steps."""
        event = self._waiters.pop(result.task_id, None)
        if event:
            # Only retain results somebody is waiting on; late results for
            # timed-out steps (and non-pipeline tasks) would otherwise
            # accumulate in _results unboundedly.
            self._results[result.task_id] = result
            event.set()

    def list_pipelines(self) -> list[str]:
        """Names of the builtin pipeline templates."""
        return list(BUILTIN_PIPELINES.keys())

    def get_builtin(self, name: str) -> Pipeline | None:
        """Return a fresh runnable copy of a builtin pipeline, or None."""
        template = BUILTIN_PIPELINES.get(name)
        if not template:
            return None
        # Fresh copy so each run gets its own id/status/results; the step
        # objects are shared and treated as immutable templates.
        return Pipeline(
            name=template.name,
            steps=list(template.steps),
        )
|
||||
@@ -1,7 +1,6 @@
|
||||
LISTEN 0 4096 127.0.0.1:33551 0.0.0.0:*
|
||||
LISTEN 0 4096 127.0.0.1:9000 0.0.0.0:*
|
||||
LISTEN 0 4096 127.0.0.1:45685 0.0.0.0:*
|
||||
LISTEN 0 128 0.0.0.0:34001 0.0.0.0:*
|
||||
LISTEN 0 4096 127.0.0.1:37175 0.0.0.0:*
|
||||
LISTEN 0 32 127.0.0.1:53 0.0.0.0:*
|
||||
LISTEN 0 2048 0.0.0.0:8788 0.0.0.0:* users:(("python3",pid=1429,fd=16))
|
||||
LISTEN 0 5 0.0.0.0:8787 0.0.0.0:* users:(("python3",pid=1562,fd=3))
|
||||
@@ -12,7 +11,7 @@ LISTEN 0 511 0.0.0.0:80 0.0.0.0:*
|
||||
LISTEN 0 128 0.0.0.0:22 0.0.0.0:*
|
||||
LISTEN 0 200 127.0.0.1:5432 0.0.0.0:*
|
||||
LISTEN 0 32 192.168.4.96:53 0.0.0.0:*
|
||||
LISTEN 0 5 0.0.0.0:7890 0.0.0.0:* users:(("python3",pid=19895,fd=5))
|
||||
LISTEN 0 5 0.0.0.0:7890 0.0.0.0:* users:(("python3",pid=97297,fd=5))
|
||||
LISTEN 0 5 0.0.0.0:4010 0.0.0.0:* users:(("python3",pid=1035,fd=3))
|
||||
LISTEN 0 511 0.0.0.0:8080 0.0.0.0:*
|
||||
LISTEN 0 511 0.0.0.0:3100 0.0.0.0:*
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
{
|
||||
"hostname": "cecilia",
|
||||
"ts": "2026-03-16T03:26:52Z",
|
||||
"uptime_seconds": 2322,
|
||||
"ts": "2026-03-16T03:57:49Z",
|
||||
"uptime_seconds": 4179,
|
||||
"kernel": "6.12.62+rpt-rpi-2712",
|
||||
"temp_c": 40.8,
|
||||
"temp_c": 38.0,
|
||||
"memory_mb": {
|
||||
"total": 8062,
|
||||
"used": 3217,
|
||||
"free": 4845
|
||||
"used": 2867,
|
||||
"free": 5195
|
||||
},
|
||||
"disk": "81G/457G (19%)",
|
||||
"load": [
|
||||
0.48,
|
||||
0.5,
|
||||
0.47
|
||||
0.37,
|
||||
0.6,
|
||||
0.93
|
||||
],
|
||||
"ollama_models": [
|
||||
"deepseek-r1:1.5b",
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
{
|
||||
"hostname": "gematria",
|
||||
"ts": "2026-03-16T03:26:54Z",
|
||||
"uptime_seconds": 5430806,
|
||||
"ts": "2026-03-16T03:57:52Z",
|
||||
"uptime_seconds": 5432664,
|
||||
"kernel": "5.15.0-113-generic",
|
||||
"temp_c": 0,
|
||||
"memory_mb": {
|
||||
"total": 7937,
|
||||
"used": 3312,
|
||||
"free": 4177
|
||||
"used": 3302,
|
||||
"free": 4189
|
||||
},
|
||||
"disk": "52G/78G (67%)",
|
||||
"load": [
|
||||
3.1,
|
||||
3.09,
|
||||
3.09
|
||||
3.26,
|
||||
3.13,
|
||||
3.1
|
||||
],
|
||||
"ollama_models": [
|
||||
"qwen2.5:7b",
|
||||
|
||||
@@ -1,50 +1,50 @@
|
||||
[2026-03-15 22:04:02] [BEAT] [lucidia] load=2.23 mem=2933/8063MB temp=46.9C disk=33%
|
||||
[2026-03-15 22:04:12] [DIAL] [lucidia] Switchboard unreachable
|
||||
[2026-03-15 22:05:43] [BEAT] [lucidia] load=37.88 mem=3628/8063MB temp=46.3C disk=33%
|
||||
[2026-03-15 22:06:01] [BEAT] [lucidia] load=29.62 mem=3648/8063MB temp=44.6C disk=33%
|
||||
[2026-03-15 22:06:15] [BEAT] [lucidia] load=23.22 mem=3648/8063MB temp=45.2C disk=33%
|
||||
[2026-03-15 22:06:15] [BEAT] [lucidia] load=23.22 mem=3646/8063MB temp=45.2C disk=33%
|
||||
[2026-03-15 22:07:01] [BEAT] [lucidia] load=13.05 mem=3613/8063MB temp=46.9C disk=33%
|
||||
[2026-03-15 22:08:01] [BEAT] [lucidia] load=5.60 mem=3604/8063MB temp=53.5C disk=33%
|
||||
[2026-03-15 22:09:01] [BEAT] [lucidia] load=3.31 mem=3593/8063MB temp=47.4C disk=33%
|
||||
[2026-03-15 22:10:01] [FLEET] [lucidia] Starting cross-node health check
|
||||
[2026-03-15 22:10:01] [BEAT] [lucidia] load=3.25 mem=3598/8063MB temp=57.3C disk=33%
|
||||
[2026-03-15 22:10:02] [FLEET] [lucidia] alice: UP temp=36C mem=3355MB disk=80%
|
||||
[2026-03-15 22:10:04] [FLEET] [lucidia] octavia: DOWN (no ping response)
|
||||
[2026-03-15 22:10:04] [FLEET] [lucidia] cecilia: UP temp=42C mem=4776MB disk=19%
|
||||
[2026-03-15 22:10:06] [FLEET] [lucidia] gematria: UP temp=C mem=4213MB disk=67%
|
||||
[2026-03-15 22:10:08] [FLEET] [lucidia] aria: DOWN (no ping response)
|
||||
[2026-03-15 22:10:09] [FLEET] [lucidia] anastasia: UP temp=C mem=201MB disk=69%
|
||||
[2026-03-15 22:11:01] [BEAT] [lucidia] load=2.19 mem=3609/8063MB temp=50.7C disk=33%
|
||||
[2026-03-15 22:11:24] [BEAT] [lucidia] load=1.92 mem=3572/8063MB temp=50.1C disk=33%
|
||||
[2026-03-15 22:11:24] [BEAT] [lucidia] load=1.92 mem=3571/8063MB temp=49.6C disk=33%
|
||||
[2026-03-15 22:12:01] [BEAT] [lucidia] load=3.10 mem=3618/8063MB temp=54.0C disk=33%
|
||||
[2026-03-15 22:13:01] [BEAT] [lucidia] load=1.67 mem=3562/8063MB temp=58.4C disk=33%
|
||||
[2026-03-15 22:14:01] [BEAT] [lucidia] load=1.45 mem=3573/8063MB temp=52.9C disk=33%
|
||||
[2026-03-15 22:15:01] [BEAT] [lucidia] load=2.51 mem=3622/8063MB temp=61.1C disk=33%
|
||||
[2026-03-15 22:15:47] [DIAL] [lucidia] Switchboard unreachable
|
||||
[2026-03-15 22:16:01] [BEAT] [lucidia] load=2.37 mem=3562/8063MB temp=54.5C disk=33%
|
||||
[2026-03-15 22:16:34] [BEAT] [lucidia] load=3.15 mem=3577/8063MB temp=59.5C disk=33%
|
||||
[2026-03-15 22:16:34] [BEAT] [lucidia] load=3.15 mem=3576/8063MB temp=61.1C disk=33%
|
||||
[2026-03-15 22:17:01] [BEAT] [lucidia] load=4.58 mem=3558/8063MB temp=56.2C disk=33%
|
||||
[2026-03-15 22:18:01] [BEAT] [lucidia] load=3.43 mem=3575/8063MB temp=61.1C disk=33%
|
||||
[2026-03-15 22:19:01] [BEAT] [lucidia] load=2.19 mem=3537/8063MB temp=54.5C disk=33%
|
||||
[2026-03-15 22:19:08] [DIAL] [lucidia] Switchboard unreachable
|
||||
[2026-03-15 22:20:01] [FLEET] [lucidia] Starting cross-node health check
|
||||
[2026-03-15 22:20:01] [BEAT] [lucidia] load=3.87 mem=3528/8063MB temp=64.5C disk=33%
|
||||
[2026-03-15 22:20:02] [FLEET] [lucidia] alice: UP temp=36C mem=3354MB disk=80%
|
||||
[2026-03-15 22:20:04] [FLEET] [lucidia] octavia: DOWN (no ping response)
|
||||
[2026-03-15 22:20:04] [FLEET] [lucidia] cecilia: UP temp=40C mem=4815MB disk=19%
|
||||
[2026-03-15 22:20:05] [FLEET] [lucidia] gematria: UP temp=C mem=4209MB disk=67%
|
||||
[2026-03-15 22:20:07] [FLEET] [lucidia] aria: DOWN (no ping response)
|
||||
[2026-03-15 22:20:08] [FLEET] [lucidia] anastasia: UP temp=C mem=277MB disk=69%
|
||||
[2026-03-15 22:21:01] [BEAT] [lucidia] load=2.23 mem=3539/8063MB temp=55.6C disk=33%
|
||||
[2026-03-15 22:21:42] [BEAT] [lucidia] load=3.43 mem=3513/8063MB temp=59.0C disk=33%
|
||||
[2026-03-15 22:21:43] [BEAT] [lucidia] load=3.43 mem=3512/8063MB temp=59.5C disk=33%
|
||||
[2026-03-15 22:22:01] [BEAT] [lucidia] load=2.59 mem=3547/8063MB temp=56.8C disk=33%
|
||||
[2026-03-15 22:23:02] [BEAT] [lucidia] load=3.14 mem=3496/8063MB temp=62.2C disk=33%
|
||||
[2026-03-15 22:24:01] [BEAT] [lucidia] load=2.45 mem=3507/8063MB temp=55.1C disk=33%
|
||||
[2026-03-15 22:25:01] [BEAT] [lucidia] load=1.57 mem=3537/8063MB temp=55.6C disk=33%
|
||||
[2026-03-15 22:26:01] [BEAT] [lucidia] load=1.95 mem=3454/8063MB temp=54.0C disk=33%
|
||||
[2026-03-15 22:26:51] [BEAT] [lucidia] load=2.08 mem=3528/8063MB temp=59.5C disk=33%
|
||||
[2026-03-15 22:26:52] [BEAT] [lucidia] load=2.08 mem=3521/8063MB temp=57.9C disk=33%
|
||||
[2026-03-15 22:34:08] [DIAL] [lucidia] Switchboard unreachable
|
||||
[2026-03-15 22:35:01] [BEAT] [lucidia] load=1.90 mem=3487/8063MB temp=56.2C disk=33%
|
||||
[2026-03-15 22:36:01] [BEAT] [lucidia] load=2.06 mem=3457/8063MB temp=56.8C disk=33%
|
||||
[2026-03-15 22:37:01] [BEAT] [lucidia] load=2.33 mem=3515/8063MB temp=61.7C disk=33%
|
||||
[2026-03-15 22:37:09] [BEAT] [lucidia] load=3.86 mem=3515/8063MB temp=61.7C disk=33%
|
||||
[2026-03-15 22:37:10] [BEAT] [lucidia] load=3.86 mem=3514/8063MB temp=62.8C disk=33%
|
||||
[2026-03-15 22:38:01] [BEAT] [lucidia] load=3.72 mem=3513/8063MB temp=54.5C disk=33%
|
||||
[2026-03-15 22:39:01] [BEAT] [lucidia] load=3.09 mem=3471/8063MB temp=53.5C disk=33%
|
||||
[2026-03-15 22:40:01] [FLEET] [lucidia] Starting cross-node health check
|
||||
[2026-03-15 22:40:01] [BEAT] [lucidia] load=2.87 mem=3474/8063MB temp=61.7C disk=33%
|
||||
[2026-03-15 22:40:02] [FLEET] [lucidia] alice: UP temp=35C mem=3362MB disk=80%
|
||||
[2026-03-15 22:40:04] [FLEET] [lucidia] octavia: DOWN (no ping response)
|
||||
[2026-03-15 22:40:04] [FLEET] [lucidia] cecilia: UP temp=56C mem=5480MB disk=19%
|
||||
[2026-03-15 22:40:06] [FLEET] [lucidia] gematria: UP temp=C mem=4188MB disk=67%
|
||||
[2026-03-15 22:40:08] [FLEET] [lucidia] aria: DOWN (no ping response)
|
||||
[2026-03-15 22:40:09] [FLEET] [lucidia] anastasia: UP temp=C mem=275MB disk=69%
|
||||
[2026-03-15 22:41:01] [BEAT] [lucidia] load=2.12 mem=3506/8063MB temp=54.0C disk=33%
|
||||
[2026-03-15 22:42:01] [BEAT] [lucidia] load=3.38 mem=3536/8063MB temp=65.0C disk=33%
|
||||
[2026-03-15 22:42:19] [BEAT] [lucidia] load=3.82 mem=3546/8063MB temp=59.5C disk=33%
|
||||
[2026-03-15 22:42:19] [BEAT] [lucidia] load=3.82 mem=3546/8063MB temp=59.0C disk=33%
|
||||
[2026-03-15 22:43:01] [BEAT] [lucidia] load=3.45 mem=3525/8063MB temp=55.6C disk=33%
|
||||
[2026-03-15 22:44:01] [BEAT] [lucidia] load=3.24 mem=3510/8063MB temp=55.6C disk=33%
|
||||
[2026-03-15 22:45:01] [BEAT] [lucidia] load=2.56 mem=3490/8063MB temp=57.3C disk=33%
|
||||
[2026-03-15 22:45:46] [DIAL] [lucidia] Switchboard unreachable
|
||||
[2026-03-15 22:46:01] [BEAT] [lucidia] load=1.80 mem=3506/8063MB temp=58.4C disk=33%
|
||||
[2026-03-15 22:47:01] [BEAT] [lucidia] load=1.77 mem=3473/8063MB temp=53.5C disk=33%
|
||||
[2026-03-15 22:47:29] [BEAT] [lucidia] load=1.70 mem=3482/8063MB temp=60.6C disk=33%
|
||||
[2026-03-15 22:47:30] [BEAT] [lucidia] load=1.70 mem=3484/8063MB temp=60.0C disk=33%
|
||||
[2026-03-15 22:48:01] [BEAT] [lucidia] load=2.67 mem=3480/8063MB temp=55.6C disk=33%
|
||||
[2026-03-15 22:49:01] [BEAT] [lucidia] load=2.76 mem=3479/8063MB temp=61.7C disk=33%
|
||||
[2026-03-15 22:49:09] [DIAL] [lucidia] Switchboard unreachable
|
||||
[2026-03-15 22:50:01] [FLEET] [lucidia] Starting cross-node health check
|
||||
[2026-03-15 22:50:01] [BEAT] [lucidia] load=2.31 mem=3527/8063MB temp=55.6C disk=33%
|
||||
[2026-03-15 22:50:02] [FLEET] [lucidia] alice: UP temp=36C mem=3349MB disk=80%
|
||||
[2026-03-15 22:50:04] [FLEET] [lucidia] octavia: DOWN (no ping response)
|
||||
[2026-03-15 22:50:04] [FLEET] [lucidia] cecilia: UP temp=42C mem=5049MB disk=19%
|
||||
[2026-03-15 22:50:06] [FLEET] [lucidia] gematria: UP temp=C mem=4188MB disk=67%
|
||||
[2026-03-15 22:50:08] [FLEET] [lucidia] aria: DOWN (no ping response)
|
||||
[2026-03-15 22:50:09] [FLEET] [lucidia] anastasia: UP temp=C mem=276MB disk=69%
|
||||
[2026-03-15 22:51:01] [BEAT] [lucidia] load=1.95 mem=3565/8063MB temp=55.1C disk=33%
|
||||
[2026-03-15 22:52:01] [BEAT] [lucidia] load=2.08 mem=3568/8063MB temp=60.0C disk=33%
|
||||
[2026-03-15 22:52:39] [BEAT] [lucidia] load=2.30 mem=3546/8063MB temp=52.4C disk=33%
|
||||
[2026-03-15 22:52:39] [BEAT] [lucidia] load=2.30 mem=3546/8063MB temp=51.8C disk=33%
|
||||
[2026-03-15 22:53:01] [BEAT] [lucidia] load=1.95 mem=3558/8063MB temp=51.2C disk=33%
|
||||
[2026-03-15 22:54:01] [BEAT] [lucidia] load=1.88 mem=3558/8063MB temp=48.5C disk=33%
|
||||
[2026-03-15 22:55:01] [BEAT] [lucidia] load=3.74 mem=3484/8063MB temp=58.4C disk=33%
|
||||
[2026-03-15 22:56:01] [BEAT] [lucidia] load=1.93 mem=3486/8063MB temp=48.5C disk=33%
|
||||
[2026-03-15 22:57:01] [BEAT] [lucidia] load=2.52 mem=3550/8063MB temp=48.0C disk=33%
|
||||
[2026-03-15 22:57:49] [BEAT] [lucidia] load=2.51 mem=3540/8063MB temp=53.5C disk=33%
|
||||
[2026-03-15 22:57:49] [BEAT] [lucidia] load=2.51 mem=3543/8063MB temp=53.5C disk=33%
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
blackroad-gitea Up 36 minutes
|
||||
road-pdns-admin Up 36 minutes (healthy)
|
||||
road-pdns Up 36 minutes
|
||||
road-dns-db Up 36 minutes
|
||||
roadauth Up 36 minutes
|
||||
roadapi Up 36 minutes
|
||||
blackroad-edge-agent Up 36 minutes
|
||||
blackroad.systems Up 36 minutes
|
||||
blackroadai.com Up 36 minutes
|
||||
blackroad-auth-gateway Up 36 minutes
|
||||
blackroad-metaverse Up 36 minutes
|
||||
blackroad-os Up 36 minutes
|
||||
blackroad-os-carpool Up 36 minutes
|
||||
pi-my-agent-1 Up 36 minutes (healthy)
|
||||
blackroad-gitea Up About an hour
|
||||
road-pdns-admin Up About an hour (healthy)
|
||||
road-pdns Up About an hour
|
||||
road-dns-db Up About an hour
|
||||
roadauth Up About an hour
|
||||
roadapi Up About an hour
|
||||
blackroad-edge-agent Up About an hour
|
||||
blackroad.systems Up About an hour
|
||||
blackroadai.com Up About an hour
|
||||
blackroad-auth-gateway Up About an hour
|
||||
blackroad-metaverse Up About an hour
|
||||
blackroad-os Up About an hour
|
||||
blackroad-os-carpool Up About an hour
|
||||
pi-my-agent-1 Up About an hour (healthy)
|
||||
|
||||
@@ -1 +1 @@
|
||||
{"node":"lucidia","ts":"2026-03-16T03:26:51Z","load":2.08,"mem_free_mb":3521,"mem_total_mb":8063,"temp_c":57.9,"disk_pct":33,"throttle":"0x0"}
|
||||
{"node":"lucidia","ts":"2026-03-16T03:57:49Z","load":2.51,"mem_free_mb":3543,"mem_total_mb":8063,"temp_c":53.5,"disk_pct":33,"throttle":"0x0"}
|
||||
|
||||
@@ -28,6 +28,7 @@ pironman5.service
|
||||
pm2-pi.service
|
||||
polkit.service
|
||||
postgresql@17-main.service
|
||||
road-phone.service
|
||||
roadnet-failover.service
|
||||
rtkit-daemon.service
|
||||
serial-getty@ttyAMA10.service
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
{
|
||||
"hostname": "octavia",
|
||||
"ts": "2026-03-16T03:26:52Z",
|
||||
"uptime_seconds": 2370,
|
||||
"ts": "2026-03-16T03:57:49Z",
|
||||
"uptime_seconds": 4227,
|
||||
"kernel": "6.12.62+rpt-rpi-2712",
|
||||
"temp_c": 59.5,
|
||||
"temp_c": 51.8,
|
||||
"memory_mb": {
|
||||
"total": 8063,
|
||||
"used": 4549,
|
||||
"free": 3513
|
||||
"used": 4516,
|
||||
"free": 3546
|
||||
},
|
||||
"disk": "73G/235G (33%)",
|
||||
"load": [
|
||||
2.08,
|
||||
2.45,
|
||||
3.35
|
||||
2.51,
|
||||
2.42,
|
||||
2.55
|
||||
],
|
||||
"ollama_models": [
|
||||
"qwen2.5:3b",
|
||||
@@ -25,5 +25,5 @@
|
||||
],
|
||||
"throttle": "0x0",
|
||||
"voltage": "0.8587V",
|
||||
"services_running": 46
|
||||
"services_running": 47
|
||||
}
|
||||
|
||||
@@ -1 +1 @@
|
||||
{"node":"octavia","status":"down","ts":"2026-03-16T03:26:52Z"}
|
||||
{"node":"octavia","status":"down","ts":"2026-03-16T03:57:49Z"}
|
||||
|
||||
190
scripts/blackroad-fleet-coordinator.sh
Executable file
190
scripts/blackroad-fleet-coordinator.sh
Executable file
@@ -0,0 +1,190 @@
|
||||
#!/bin/bash
# BlackRoad Fleet Coordinator — Background process that:
#   1. Monitors all nodes continuously
#   2. Auto-heals crashed services
#   3. Pushes telemetry to stats API
#   4. Syncs state between Pis
#   5. Alerts on problems
#
# Run:  ./blackroad-fleet-coordinator.sh
# Cron: */5 * * * * /Users/alexa/blackroad-fleet-coordinator.sh >> ~/.blackroad/logs/coordinator.log 2>&1

set -euo pipefail

# Fail with a clear message if the node inventory is missing; under
# `set -e` a bare `source` of a nonexistent file aborts with no context.
NODES_CONFIG="$HOME/.blackroad/config/nodes.sh"
if [[ ! -f "$NODES_CONFIG" ]]; then
    echo "FATAL: node inventory not found: $NODES_CONFIG" >&2
    exit 1
fi
# shellcheck source=/dev/null
source "$NODES_CONFIG"

LOG_DIR="$HOME/.blackroad/logs"
STATE_DIR="$HOME/.blackroad/fleet-state"
mkdir -p "$LOG_DIR" "$STATE_DIR"

STATS_URL="https://stats-blackroad.amundsonalexa.workers.dev"
# NOTE(review): hard-coded push key checked into the repo — allow an
# environment override and consider moving the default to a mode-600 file.
STATS_KEY="${STATS_KEY:-blackroad-stats-push-2026}"

# log MESSAGE — timestamped line on stdout (cron redirects to the log file).
log() { printf "[%s] %s\n" "$(date '+%Y-%m-%d %H:%M:%S')" "$1"; }
|
||||
|
||||
# ── 1. PROBE ALL NODES ──
|
||||
probe_node() {
    # Probe one node: ping first, then a single SSH round-trip for metrics.
    # Writes a JSON state file and logs alerts on status transitions and
    # threshold breaches.
    local name=$1
    local ip="${NODE_IP[$name]}"
    local user="${NODE_USER[$name]:-pi}"
    local state_file="$STATE_DIR/${name}.json"
    local prev_status="unknown"
    [[ -f "$state_file" ]] && prev_status=$(python3 -c "import json;print(json.load(open('$state_file')).get('status','unknown'))" 2>/dev/null || echo "unknown")

    # Ping check — node unreachable at all?
    if ! ping -c1 -W2 "$ip" &>/dev/null; then
        echo "{\"name\":\"$name\",\"ip\":\"$ip\",\"status\":\"down\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" > "$state_file"
        if [[ "$prev_status" != "down" ]]; then
            log "ALERT: $name ($ip) went DOWN"
            # Could push to Slack here
        fi
        return
    fi

    # SSH probe: collect load, temp, memory, disk, uptime, service and
    # container counts in one round-trip, pipe-separated.
    local data
    data=$(ssh $BR_SSH_OPTS "${user}@${ip}" "
        load=\$(awk '{print \$1}' /proc/loadavg)
        temp=\$(cat /sys/class/thermal/thermal_zone0/temp 2>/dev/null | awk '{printf \"%.0f\", \$1/1000}' || echo 0)
        mem_free=\$(free -m | awk '/Mem:/ {print \$4}')
        mem_total=\$(free -m | awk '/Mem:/ {print \$2}')
        disk=\$(df / | awk 'NR==2 {print \$5}' | tr -d '%')
        uptime_s=\$(awk '{print int(\$1)}' /proc/uptime)
        svcs=\$(systemctl list-units --type=service --state=running --no-pager --no-legend 2>/dev/null | wc -l)
        docker_c=\$(docker ps -q 2>/dev/null | wc -l || echo 0)
        echo \"\$load|\$temp|\$mem_free|\$mem_total|\$disk|\$uptime_s|\$svcs|\$docker_c\"
    " 2>/dev/null) || data=""

    if [[ -z "$data" ]]; then
        echo "{\"name\":\"$name\",\"ip\":\"$ip\",\"status\":\"ssh_fail\",\"ts\":\"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"}" > "$state_file"
        log "WARN: $name ($ip) ping OK but SSH failed"
        return
    fi

    local load temp mem_free mem_total disk uptime svcs docker_c
    IFS='|' read -r load temp mem_free mem_total disk uptime svcs docker_c <<< "$data"

    # Default any empty field to 0 so the state file is always valid JSON
    # even when a remote command produced no output — bare interpolation of
    # an empty field would emit e.g. `"load":,` and break every consumer.
    load=${load:-0};       temp=${temp:-0}
    mem_free=${mem_free:-0}; mem_total=${mem_total:-0}
    disk=${disk:-0};       uptime=${uptime:-0}
    svcs=${svcs:-0};       docker_c=${docker_c:-0}

    cat > "$state_file" << EOF
{"name":"$name","ip":"$ip","status":"up","load":$load,"temp":$temp,"mem_free":$mem_free,"mem_total":$mem_total,"disk_pct":$disk,"uptime_s":$uptime,"services":$svcs,"containers":$docker_c,"ts":"$(date -u +%Y-%m-%dT%H:%M:%SZ)"}
EOF

    # State change alerts
    if [[ "$prev_status" == "down" || "$prev_status" == "ssh_fail" ]]; then
        log "RECOVERED: $name ($ip) is back UP"
    fi

    # Threshold alerts (the defaults above also keep -gt/-lt from aborting
    # the script under `set -e` when a field is empty/non-numeric)
    if [[ "$disk" -gt 90 ]]; then log "ALERT: $name disk at ${disk}%"; fi
    if [[ "$temp" -gt 75 ]]; then log "ALERT: $name temp at ${temp}C"; fi
    if [[ "$mem_free" -lt 200 ]]; then log "ALERT: $name low memory (${mem_free}MB free)"; fi
}
|
||||
|
||||
# ── 2. SERVICE HEALTH CHECKS ──
|
||||
# Some services bind to localhost only (PostgreSQL, Redis) — check via SSH
|
||||
check_services() {
    # Verify each "port:label" entry in NODE_SERVICES[name] is listening.
    # Tries a direct TCP connect first, then falls back to an SSH-side `ss`
    # check because some services (PostgreSQL, Redis) bind localhost only.
    local name=$1
    local ip="${NODE_IP[$name]}"
    local services="${NODE_SERVICES[$name]:-}"
    [[ -z "$services" ]] && return

    local -a svc_list
    local svc port label ssh_check
    IFS=',' read -ra svc_list <<< "$services"
    for svc in "${svc_list[@]}"; do
        # Parameter expansion instead of `cut` subshells; also avoids
        # `local x=$(cmd)` masking the command's exit status (SC2155).
        port="${svc%%:*}"
        label="${svc#*:}"
        if ! nc -z -w2 "$ip" "$port" 2>/dev/null; then
            ssh_check=$(br_ssh "$name" "ss -tlnp 2>/dev/null | grep -q ':$port ' && echo ok || echo down" 2>/dev/null || echo "ssh_fail")
            if [[ "$ssh_check" != "ok" ]]; then
                log "SERVICE DOWN: $name:$port ($label)"
            fi
        fi
    done
}
|
||||
|
||||
# ── 3. AUTO-HEAL ──
|
||||
auto_heal() {
    # Restart known-flaky services when their ports stop answering.
    # Every remote restart is `|| true`-guarded: under `set -euo pipefail`
    # a failed heal attempt would otherwise abort the whole coordinator run
    # before telemetry push and the fleet summary.

    # Ollama on Cecilia (IP from inventory, with the historical fallback)
    local cecilia_ip="${NODE_IP[cecilia]:-192.168.4.96}"
    if ! nc -z -w2 "$cecilia_ip" 11434 2>/dev/null; then
        log "HEAL: Restarting Ollama on Cecilia"
        br_ssh cecilia "sudo systemctl restart ollama" 2>/dev/null || true
    fi

    # Gitea on Octavia
    local octavia_ip="${NODE_IP[octavia]:-192.168.4.101}"
    if ! nc -z -w2 "$octavia_ip" 3100 2>/dev/null; then
        log "HEAL: Restarting Gitea (blackroad-git) on Octavia"
        br_ssh octavia "docker restart blackroad-git" 2>/dev/null || true
    fi

    # cloudflared tunnels on the tunnel-hosting nodes
    local node tunnel_ok
    for node in cecilia lucidia; do
        tunnel_ok=$(br_ssh "$node" "systemctl is-active cloudflared 2>/dev/null" || echo "inactive")
        if [[ "$tunnel_ok" != "active" ]]; then
            log "HEAL: Restarting cloudflared on $node"
            br_ssh "$node" "sudo systemctl restart cloudflared" 2>/dev/null || true
        fi
    done

    # NATS on Octavia — detection only; no automated restart yet
    local nats_ok
    nats_ok=$(br_ssh octavia "docker ps -q -f name=nats | head -1" 2>/dev/null || echo "")
    if [[ -z "$nats_ok" ]]; then
        log "HEAL: NATS container not running on Octavia"
    fi
}
|
||||
|
||||
# ── 4. PUSH TELEMETRY ──
|
||||
push_telemetry() {
    # POST every node's state file that reports status "up" to the stats
    # worker. Best-effort: curl failures are swallowed so one bad push
    # never blocks the rest of the run.
    local name state_file status
    for name in "${ALL_NODES[@]}"; do
        state_file="$STATE_DIR/${name}.json"
        [[ -f "$state_file" ]] || continue
        status=$(python3 -c "import json;print(json.load(open('$state_file')).get('status','unknown'))" 2>/dev/null || echo "unknown")
        [[ "$status" == "up" ]] || continue

        curl -s --max-time 5 -X POST "$STATS_URL/api/push" \
            -H "Content-Type: application/json" \
            -H "X-API-Key: $STATS_KEY" \
            -d @"$state_file" >/dev/null 2>&1 || true
    done
}
|
||||
|
||||
# ── 5. FLEET SUMMARY ──
|
||||
fleet_summary() {
    # Tally up/down nodes from the per-node state files and log one line.
    # A node with no state file (never probed) is not counted either way.
    local up=0 down=0 name state_file status
    for name in "${ALL_NODES[@]}"; do
        state_file="$STATE_DIR/${name}.json"
        [[ -f "$state_file" ]] || continue
        status=$(python3 -c "import json;print(json.load(open('$state_file')).get('status','unknown'))" 2>/dev/null || echo "unknown")
        case "$status" in
            up) up=$((up + 1)) ;;
            *)  down=$((down + 1)) ;;
        esac
    done
    log "FLEET: $up up, $down down ($(date))"
}
|
||||
|
||||
# ── MAIN ──
log "━━━ Fleet Coordinator Run ━━━"

# Probe all nodes (local Pis only — cloud nodes are slower)
for name in "${PI_NODES[@]}"; do
    probe_node "$name"
done

# Service checks — only for nodes whose most recent probe reported "up"
for name in "${PI_NODES[@]}"; do
    node_status=$(python3 -c "import json;print(json.load(open('$STATE_DIR/${name}.json')).get('status','down'))" 2>/dev/null || echo "down")
    if [[ "$node_status" == "up" ]]; then
        check_services "$name"
    fi
done

auto_heal
push_telemetry
fleet_summary

log "━━━ Done ━━━"
|
||||
@@ -5,7 +5,7 @@
|
||||
// Own your customers, subscriptions, invoices, and payments.
|
||||
// Stripe is just a dumb card charger underneath.
|
||||
|
||||
const VERSION = '1.0.0';
|
||||
const VERSION = '2.0.0';
|
||||
|
||||
// ─── Schema ──────────────────────────────────────────────────────────────
|
||||
const SCHEMA = [
|
||||
@@ -114,6 +114,37 @@ const SCHEMA = [
|
||||
`CREATE INDEX IF NOT EXISTS idx_payments_customer ON payments(customer_id)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_events_type ON events(type)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_events_customer ON events(customer_id)`,
|
||||
// v2: API keys for customers
|
||||
`CREATE TABLE IF NOT EXISTS api_keys (
|
||||
id TEXT PRIMARY KEY,
|
||||
customer_id TEXT NOT NULL REFERENCES customers(id),
|
||||
key_hash TEXT NOT NULL,
|
||||
key_prefix TEXT NOT NULL,
|
||||
name TEXT DEFAULT 'default',
|
||||
scopes TEXT DEFAULT '["api:read"]',
|
||||
rate_limit INTEGER DEFAULT 1000,
|
||||
last_used TEXT,
|
||||
usage_count INTEGER DEFAULT 0,
|
||||
status TEXT DEFAULT 'active',
|
||||
expires_at TEXT,
|
||||
created_at TEXT DEFAULT (datetime('now'))
|
||||
)`,
|
||||
// v2: Usage metering
|
||||
`CREATE TABLE IF NOT EXISTS usage (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
customer_id TEXT NOT NULL REFERENCES customers(id),
|
||||
api_key_id TEXT REFERENCES api_keys(id),
|
||||
endpoint TEXT NOT NULL,
|
||||
method TEXT DEFAULT 'GET',
|
||||
status_code INTEGER,
|
||||
latency_ms INTEGER,
|
||||
tokens_used INTEGER DEFAULT 0,
|
||||
created_at TEXT DEFAULT (datetime('now'))
|
||||
)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_api_keys_customer ON api_keys(customer_id)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_api_keys_prefix ON api_keys(key_prefix)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_usage_customer ON usage(customer_id)`,
|
||||
`CREATE INDEX IF NOT EXISTS idx_usage_date ON usage(created_at)`,
|
||||
];
|
||||
|
||||
// ─── Seed Plans ──────────────────────────────────────────────────────────
|
||||
@@ -965,10 +996,129 @@ function handleHealth() {
|
||||
'GET /invoices?customer_id=',
|
||||
'GET /payments?customer_id=',
|
||||
'POST /webhook',
|
||||
'GET /keys?customer_id=',
|
||||
'POST /keys',
|
||||
'DELETE /keys?id=',
|
||||
'GET /usage?customer_id=&days=30',
|
||||
'POST /usage/record',
|
||||
],
|
||||
});
|
||||
}
|
||||
|
||||
// ─── API Keys ───────────────────────────────────────────────────────────
|
||||
/**
 * API key management: list (GET), create (POST), revoke (DELETE).
 * Raw keys are returned exactly once on creation; only a SHA-256 hash and a
 * display prefix are stored.
 * @param {Request} request - incoming request
 * @param {object} db - presumably a Cloudflare D1 binding (.prepare/.bind) — TODO confirm
 * @returns {Promise<Response>}
 */
async function handleApiKeys(request, db) {
  const url = new URL(request.url);

  if (request.method === 'GET') {
    const customerId = url.searchParams.get('customer_id');
    if (!customerId) return err('customer_id required');
    const keys = await db.prepare(
      'SELECT id, customer_id, key_prefix, name, scopes, rate_limit, last_used, usage_count, status, expires_at, created_at FROM api_keys WHERE customer_id = ? ORDER BY created_at DESC'
    ).bind(customerId).all();
    // Tolerate malformed stored scopes: a bad row must not turn the whole
    // listing into an unhandled JSON.parse exception.
    const parseScopes = (raw) => {
      try { return JSON.parse(raw || '[]'); } catch { return []; }
    };
    return json({ keys: keys.results.map((k) => ({ ...k, scopes: parseScopes(k.scopes) })) });
  }

  if (request.method === 'POST') {
    const body = await request.json();
    const { customer_id, name, scopes, rate_limit, expires_in_days } = body;
    if (!customer_id) return err('customer_id required');
    // Validate expiry up front: a non-numeric value would otherwise make
    // toISOString() throw an unhandled RangeError (Invalid Date).
    if (expires_in_days != null && !Number.isFinite(Number(expires_in_days))) {
      return err('expires_in_days must be a number');
    }

    // Generate API key: rp_live_ + 32 random hex chars
    const rawKey = `rp_live_${randHex(16)}`;
    const keyPrefix = rawKey.slice(0, 12);
    // Hash the key for storage (don't store raw)
    const enc = new TextEncoder();
    const hashBuf = await crypto.subtle.digest('SHA-256', enc.encode(rawKey));
    const keyHash = Array.from(new Uint8Array(hashBuf)).map((b) => b.toString(16).padStart(2, '0')).join('');

    const id = `key_${uid()}`;
    const expiresAt = expires_in_days
      ? new Date(Date.now() + Number(expires_in_days) * 86400000).toISOString()
      : null;

    await db.prepare(
      'INSERT INTO api_keys (id, customer_id, key_hash, key_prefix, name, scopes, rate_limit, expires_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?)'
    ).bind(id, customer_id, keyHash, keyPrefix, name || 'default', JSON.stringify(scopes || ['api:read']), rate_limit || 1000, expiresAt).run();

    // Return the raw key ONLY on creation — can never be retrieved again
    return json({ id, key: rawKey, prefix: keyPrefix, name: name || 'default', warning: 'Save this key now. It cannot be retrieved again.' }, 201);
  }

  if (request.method === 'DELETE') {
    const keyId = url.searchParams.get('id');
    if (!keyId) return err('id required');
    // Soft revoke — the row is kept for audit/usage history.
    await db.prepare("UPDATE api_keys SET status = 'revoked' WHERE id = ?").bind(keyId).run();
    return json({ ok: true, revoked: keyId });
  }

  return err('Method not allowed', 405);
}
|
||||
|
||||
// ─── Usage / Metering ────────────────────────────────────────────────────
|
||||
/**
 * Usage analytics for one customer over a trailing window.
 * Query params: customer_id (required), days (default 30, clamped to 1–365).
 * Returns daily breakdown, overall totals, and the top-10 endpoints.
 * @param {Request} request
 * @param {object} db - presumably a Cloudflare D1 binding — TODO confirm
 * @returns {Promise<Response>}
 */
async function handleUsage(request, db) {
  const url = new URL(request.url);
  const customerId = url.searchParams.get('customer_id');
  if (!customerId) return err('customer_id required');

  // Radix-explicit parse, then clamp: a non-numeric `days` would produce
  // new Date(NaN), whose toISOString() throws an unhandled RangeError.
  const rawDays = Number.parseInt(url.searchParams.get('days') || '30', 10);
  const days = Number.isFinite(rawDays) ? Math.min(Math.max(rawDays, 1), 365) : 30;
  const since = new Date(Date.now() - days * 86400000).toISOString();

  // The three queries are independent — run them in parallel instead of
  // serially awaiting each one.
  const [daily, totals, endpoints] = await Promise.all([
    // Daily breakdown
    db.prepare(
      `SELECT date(created_at) as day, COUNT(*) as requests, SUM(tokens_used) as tokens,
              AVG(latency_ms) as avg_latency, SUM(CASE WHEN status_code >= 400 THEN 1 ELSE 0 END) as errors
       FROM usage WHERE customer_id = ? AND created_at >= ? GROUP BY day ORDER BY day DESC`
    ).bind(customerId, since).all(),
    // Totals
    db.prepare(
      `SELECT COUNT(*) as total_requests, SUM(tokens_used) as total_tokens,
              AVG(latency_ms) as avg_latency, COUNT(DISTINCT endpoint) as unique_endpoints
       FROM usage WHERE customer_id = ? AND created_at >= ?`
    ).bind(customerId, since).first(),
    // Top endpoints
    db.prepare(
      `SELECT endpoint, method, COUNT(*) as count, AVG(latency_ms) as avg_latency
       FROM usage WHERE customer_id = ? AND created_at >= ?
       GROUP BY endpoint, method ORDER BY count DESC LIMIT 10`
    ).bind(customerId, since).all(),
  ]);

  return json({
    customer_id: customerId,
    period: { days, since },
    totals,
    daily: daily.results,
    top_endpoints: endpoints.results,
  });
}
|
||||
|
||||
// ─── Record Usage (internal — called by API gateway) ─────────────────────
|
||||
/**
 * Record one API call (internal — called by the API gateway).
 * Body: { customer_id, endpoint, api_key_id?, method?, status_code?,
 *         latency_ms?, tokens_used? }
 * @param {Request} request
 * @param {object} db - presumably a Cloudflare D1 binding — TODO confirm
 * @returns {Promise<Response>}
 */
async function handleRecordUsage(request, db) {
  const body = await request.json();
  const { customer_id, api_key_id, endpoint, method, status_code, latency_ms, tokens_used } = body;
  if (!customer_id || !endpoint) return err('customer_id and endpoint required');

  // `??` (not `||`) for the numeric fields so a legitimate 0 for
  // status/latency/tokens is stored as 0 instead of being replaced by
  // the default (0 is falsy, so `|| 200` would turn status 0 into 200).
  await db.prepare(
    'INSERT INTO usage (customer_id, api_key_id, endpoint, method, status_code, latency_ms, tokens_used) VALUES (?, ?, ?, ?, ?, ?, ?)'
  ).bind(customer_id, api_key_id || null, endpoint, method || 'GET', status_code ?? 200, latency_ms ?? 0, tokens_used ?? 0).run();

  // Bump the key's usage counter and last-used timestamp.
  if (api_key_id) {
    await db.prepare(
      "UPDATE api_keys SET usage_count = usage_count + 1, last_used = datetime('now') WHERE id = ?"
    ).bind(api_key_id).run();
  }

  return json({ ok: true });
}
|
||||
|
||||
function randHex(bytes) {
|
||||
const arr = new Uint8Array(bytes);
|
||||
crypto.getRandomValues(arr);
|
||||
return Array.from(arr).map(b => b.toString(16).padStart(2, '0')).join('');
|
||||
}
|
||||
|
||||
// ─── Rate limiting ───────────────────────────────────────────────────────
|
||||
const rl = new Map();
|
||||
function rateLimit(ip, max = 30, windowSec = 60) {
|
||||
@@ -1040,6 +1190,15 @@ export default {
|
||||
case path === '/stats':
|
||||
response = await handleStats(db);
|
||||
break;
|
||||
case path === '/keys' || path === '/api-keys':
|
||||
response = await handleApiKeys(request, db);
|
||||
break;
|
||||
case path === '/usage':
|
||||
response = await handleUsage(request, db);
|
||||
break;
|
||||
case path === '/usage/record' && request.method === 'POST':
|
||||
response = await handleRecordUsage(request, db);
|
||||
break;
|
||||
default:
|
||||
response = err('Not found', 404);
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user