Add 12 infra scripts: observability, alerting, logging, deployment

- Observability stack with metrics and tracing
- Alerting system with escalation rules
- Log aggregator for centralized logging
- Deploy pipeline for CI/CD automation
- Universal vault for credential management
- Cost tracker for cloud spend monitoring
- Fleet OS enhancer for device upgrades
- Live dashboard for real-time status
- Grafana deployment for visualization
- Backup system deployment
- Alert manager deployment
- Log aggregation deployment

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alexa Amundson
2026-02-20 20:34:34 -06:00
parent c9e65b23e6
commit e31dbf972d
12 changed files with 5304 additions and 0 deletions

414
scripts/alerting.sh Normal file
View File

@@ -0,0 +1,414 @@
#!/bin/bash
# BlackRoad Alerting System
# Multi-channel alerts for cluster events
# Agent: Icarus (b3e01bd9)
# ANSI color codes for console output
PINK='\033[38;5;205m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
RESET='\033[0m'
# On-disk state: SQLite alert store plus JSON channel/threshold config
ALERT_DIR="$HOME/.blackroad/alerts"
ALERT_DB="$ALERT_DIR/alerts.db"
CONFIG_FILE="$ALERT_DIR/config.json"
# Cluster nodes polled over SSH by check()/monitor()
ALL_NODES=("lucidia" "cecilia" "octavia" "aria" "alice")
# Alert severity levels
# NOTE(review): these constants are not referenced below - presumably kept
# for scripts that source this file; confirm before removing
SEVERITY_INFO="info"
SEVERITY_WARNING="warning"
SEVERITY_ERROR="error"
SEVERITY_CRITICAL="critical"
# Initialize
# Create the alert directory, the SQLite schema (alerts + rules tables),
# and a default JSON config on first run. Safe to re-run: the schema uses
# IF NOT EXISTS and the config is only written when missing.
init() {
mkdir -p "$ALERT_DIR"
sqlite3 "$ALERT_DB" << 'SQL'
CREATE TABLE IF NOT EXISTS alerts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
severity TEXT,
source TEXT,
title TEXT,
message TEXT,
acknowledged INTEGER DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
ack_at DATETIME,
ack_by TEXT
);
CREATE TABLE IF NOT EXISTS rules (
id TEXT PRIMARY KEY,
name TEXT,
condition TEXT,
severity TEXT,
channels TEXT,
enabled INTEGER DEFAULT 1,
cooldown INTEGER DEFAULT 300
);
CREATE INDEX IF NOT EXISTS idx_severity ON alerts(severity);
CREATE INDEX IF NOT EXISTS idx_ack ON alerts(acknowledged);
SQL
# Default config
# Console + file channels enabled out of the box; webhook/email/Slack are
# opt-in by editing this file afterwards
if [ ! -f "$CONFIG_FILE" ]; then
cat > "$CONFIG_FILE" << 'EOF'
{
"channels": {
"console": {"enabled": true},
"file": {"enabled": true, "path": "~/.blackroad/alerts/alert.log"},
"webhook": {"enabled": false, "url": ""},
"email": {"enabled": false, "to": "", "smtp": ""},
"slack": {"enabled": false, "webhook": ""}
},
"thresholds": {
"cpu_warning": 80,
"cpu_critical": 95,
"mem_warning": 85,
"mem_critical": 95,
"disk_warning": 80,
"disk_critical": 90,
"temp_warning": 70,
"temp_critical": 80
}
}
EOF
fi
echo -e "${GREEN}Alerting system initialized${RESET}"
}
# Send alert
# Persist an alert to SQLite, print it to the console, append it to the
# log file, and fan out to any enabled webhook/Slack channels.
# Args: $1 severity (info|warning|error|critical), $2 source, $3 title, $4 message
# Prints the new alert id as the last line so callers can capture it.
send() {
    local severity="$1"
    local source="$2"
    local title="$3"
    local message="$4"
    local timestamp
    timestamp=$(date -Iseconds)
    # SQL-escape every user-supplied field (the original escaped only
    # title/message, leaving severity/source open to quote injection)
    local q_severity q_source q_title q_message
    q_severity=$(printf '%s' "$severity" | sed "s/'/''/g")
    q_source=$(printf '%s' "$source" | sed "s/'/''/g")
    q_title=$(printf '%s' "$title" | sed "s/'/''/g")
    q_message=$(printf '%s' "$message" | sed "s/'/''/g")
    # Store in database
    sqlite3 "$ALERT_DB" "
        INSERT INTO alerts (severity, source, title, message)
        VALUES ('$q_severity', '$q_source', '$q_title', '$q_message')
    "
    local alert_id
    alert_id=$(sqlite3 "$ALERT_DB" "SELECT last_insert_rowid()")
    # Console output, color-coded by severity
    local color=$RESET
    case $severity in
        info) color=$BLUE ;;
        warning) color=$YELLOW ;;
        error) color=$RED ;;
        critical) color="${RED}\033[1m" ;;
    esac
    echo -e "${color}[$severity] $title${RESET}"
    echo " Source: $source"
    echo " Message: $message"
    echo " Alert ID: $alert_id"
    # File logging
    echo "$timestamp [$severity] [$source] $title: $message" >> "$ALERT_DIR/alert.log"
    # Webhook channel. The payload is built with jq so quotes/newlines in
    # alert text cannot break the JSON (the original interpolated raw text).
    local webhook_enabled webhook_url
    webhook_enabled=$(jq -r '.channels.webhook.enabled' "$CONFIG_FILE")
    webhook_url=$(jq -r '.channels.webhook.url' "$CONFIG_FILE")
    if [ "$webhook_enabled" = "true" ] && [ -n "$webhook_url" ]; then
        jq -n --arg sev "$severity" --arg src "$source" --arg t "$title" \
              --arg msg "$message" --arg ts "$timestamp" \
              '{severity:$sev,source:$src,title:$t,message:$msg,timestamp:$ts}' |
        curl -s -X POST "$webhook_url" \
            -H "Content-Type: application/json" \
            -d @- >/dev/null 2>&1 &
    fi
    # Slack channel (incoming-webhook attachment format)
    local slack_enabled slack_webhook
    slack_enabled=$(jq -r '.channels.slack.enabled' "$CONFIG_FILE")
    slack_webhook=$(jq -r '.channels.slack.webhook' "$CONFIG_FILE")
    if [ "$slack_enabled" = "true" ] && [ -n "$slack_webhook" ]; then
        local slack_color="good"
        case $severity in
            warning) slack_color="warning" ;;
            error|critical) slack_color="danger" ;;
        esac
        jq -n --arg c "$slack_color" --arg t "[$severity] $title" \
              --arg msg "$message" --arg src "$source" \
              '{attachments:[{color:$c,title:$t,text:$msg,footer:$src}]}' |
        curl -s -X POST "$slack_webhook" \
            -H "Content-Type: application/json" \
            -d @- >/dev/null 2>&1 &
    fi
    # Emit the id so `id=$(send ...)` works (callers that don't care
    # redirect to /dev/null)
    echo "$alert_id"
}
# Check cluster and generate alerts
# Poll each node over SSH, read CPU/mem/disk/temperature in one round
# trip, and raise warning/critical alerts against the thresholds stored
# in config.json. Prints a one-line status per node.
check() {
echo -e "${PINK}=== CLUSTER HEALTH CHECK ===${RESET}"
echo
local thresholds=$(cat "$CONFIG_FILE")
for node in "${ALL_NODES[@]}"; do
echo -n " $node: "
# A node that does not answer SSH within 3s is itself a critical alert
if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
send "critical" "$node" "Node Offline" "Node $node is not reachable" >/dev/null
echo -e "${RED}OFFLINE${RESET}"
continue
fi
# Get metrics
# Remote one-liner prints "cpu|mem|disk|temp"; temp falls back to 0 on
# hosts without vcgencmd (non-Raspberry-Pi)
local metrics=$(ssh "$node" "
cpu=\$(top -bn1 | grep 'Cpu(s)' | awk '{print 100-\$8}' 2>/dev/null || echo 0)
mem=\$(free | awk '/Mem:/ {printf \"%.0f\", \$3/\$2*100}')
disk=\$(df / | awk 'NR==2 {gsub(/%/,\"\"); print \$5}')
temp=\$(vcgencmd measure_temp 2>/dev/null | grep -oP '[\d.]+' || echo 0)
echo \"\$cpu|\$mem|\$disk|\$temp\"
" 2>/dev/null)
local cpu=$(echo "$metrics" | cut -d'|' -f1)
local mem=$(echo "$metrics" | cut -d'|' -f2)
local disk=$(echo "$metrics" | cut -d'|' -f3)
local temp=$(echo "$metrics" | cut -d'|' -f4)
# NOTE(review): if the metrics fetch fails these fields are empty and the
# integer -gt tests below will error - confirm that is acceptable
local status="OK"
local status_color=$GREEN
# Check CPU
# CPU may be fractional, so the comparison goes through bc
local cpu_warn=$(echo "$thresholds" | jq -r '.thresholds.cpu_warning')
local cpu_crit=$(echo "$thresholds" | jq -r '.thresholds.cpu_critical')
if [ "$(echo "$cpu > $cpu_crit" | bc -l)" = "1" ]; then
send "critical" "$node" "CPU Critical" "CPU usage at ${cpu}%" >/dev/null
status="CRITICAL"
status_color=$RED
elif [ "$(echo "$cpu > $cpu_warn" | bc -l)" = "1" ]; then
send "warning" "$node" "CPU Warning" "CPU usage at ${cpu}%" >/dev/null
status="WARNING"
status_color=$YELLOW
fi
# Check Memory
local mem_warn=$(echo "$thresholds" | jq -r '.thresholds.mem_warning')
local mem_crit=$(echo "$thresholds" | jq -r '.thresholds.mem_critical')
if [ "$mem" -gt "$mem_crit" ]; then
send "critical" "$node" "Memory Critical" "Memory usage at ${mem}%" >/dev/null
status="CRITICAL"
status_color=$RED
elif [ "$mem" -gt "$mem_warn" ]; then
send "warning" "$node" "Memory Warning" "Memory usage at ${mem}%" >/dev/null
# Never downgrade a CRITICAL status already set by an earlier check
[ "$status" != "CRITICAL" ] && status="WARNING" && status_color=$YELLOW
fi
# Check Disk
local disk_warn=$(echo "$thresholds" | jq -r '.thresholds.disk_warning')
local disk_crit=$(echo "$thresholds" | jq -r '.thresholds.disk_critical')
if [ "$disk" -gt "$disk_crit" ]; then
send "critical" "$node" "Disk Critical" "Disk usage at ${disk}%" >/dev/null
status="CRITICAL"
status_color=$RED
elif [ "$disk" -gt "$disk_warn" ]; then
send "warning" "$node" "Disk Warning" "Disk usage at ${disk}%" >/dev/null
[ "$status" != "CRITICAL" ] && status="WARNING" && status_color=$YELLOW
fi
# Check Temperature
local temp_warn=$(echo "$thresholds" | jq -r '.thresholds.temp_warning')
local temp_crit=$(echo "$thresholds" | jq -r '.thresholds.temp_critical')
if [ "$(echo "$temp > $temp_crit" | bc -l)" = "1" ]; then
send "critical" "$node" "Temperature Critical" "Temperature at ${temp}°C" >/dev/null
status="CRITICAL"
status_color=$RED
elif [ "$(echo "$temp > $temp_warn" | bc -l)" = "1" ]; then
send "warning" "$node" "Temperature Warning" "Temperature at ${temp}°C" >/dev/null
[ "$status" != "CRITICAL" ] && status="WARNING" && status_color=$YELLOW
fi
echo -e "${status_color}$status${RESET} (cpu:${cpu}% mem:${mem}% disk:${disk}% temp:${temp}°C)"
done
}
# Monitor daemon
# Re-run the cluster health check forever at a fixed cadence.
# Args: $1 seconds between checks (default 60). Interrupt with Ctrl+C.
monitor() {
    local every="${1:-60}"
    echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
    echo -e "${PINK}║ 🔔 ALERT MONITOR DAEMON ║${RESET}"
    echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
    echo
    echo "Check interval: ${every}s"
    echo "Press Ctrl+C to stop"
    echo
    # check() output is discarded here; alerts still reach the DB/channels
    while :; do
        echo "[$(date '+%H:%M:%S')] Checking cluster..."
        check >/dev/null 2>&1
        sleep "$every"
    done
}
# List alerts
# Print recent alerts, newest first, color-coded by severity.
# Args: $1 filter: all|unack|ack|critical|warning (default all)
#       $2 max rows (default 20)
list() {
    local filter="${1:-all}"
    local limit="${2:-20}"
    # The limit is interpolated into SQL; accept digits only so a stray
    # argument cannot inject into the query
    case "$limit" in
        ''|*[!0-9]*) limit=20 ;;
    esac
    echo -e "${PINK}=== ALERTS ===${RESET}"
    echo
    local where=""
    case "$filter" in
        unack) where="WHERE acknowledged = 0" ;;
        ack) where="WHERE acknowledged = 1" ;;
        critical) where="WHERE severity = 'critical'" ;;
        warning) where="WHERE severity = 'warning'" ;;
    esac
    sqlite3 "$ALERT_DB" "
        SELECT id, severity, source, title, acknowledged, created_at
        FROM alerts $where
        ORDER BY created_at DESC
        LIMIT $limit
    " | while IFS='|' read -r id severity source title ack created; do
        local color=$RESET
        case $severity in
            info) color=$BLUE ;;
            warning) color=$YELLOW ;;
            error|critical) color=$RED ;;
        esac
        local ack_status=""
        [ "$ack" = "1" ] && ack_status=" [ACK]"
        printf "${color}#%-5s %-10s %-10s %s${RESET}%s\n" "$id" "[$severity]" "$source" "$title" "$ack_status"
    done
}
# Acknowledge alert
# Mark one alert acknowledged, recording who and when.
# Args: $1 numeric alert id, $2 acknowledging user (default "system")
# Returns non-zero on a non-numeric id (it is interpolated into SQL).
ack() {
    local alert_id="$1"
    local by="${2:-system}"
    case "$alert_id" in
        ''|*[!0-9]*)
            echo -e "${RED}Invalid alert id: $alert_id${RESET}" >&2
            return 1
            ;;
    esac
    # SQL-escape the user name (the original interpolated it raw)
    by=$(printf '%s' "$by" | sed "s/'/''/g")
    sqlite3 "$ALERT_DB" "
        UPDATE alerts SET acknowledged = 1, ack_at = datetime('now'), ack_by = '$by'
        WHERE id = $alert_id
    "
    echo -e "${GREEN}Acknowledged alert #$alert_id${RESET}"
}
# Acknowledge all
# Mark every unacknowledged alert as acknowledged.
# Args: $1 acknowledging user (default "system")
ack_all() {
    local by="${1:-system}"
    by=$(printf '%s' "$by" | sed "s/'/''/g")
    sqlite3 "$ALERT_DB" "
        UPDATE alerts SET acknowledged = 1, ack_at = datetime('now'), ack_by = '$by'
        WHERE acknowledged = 0
    "
    echo -e "${GREEN}Acknowledged all alerts${RESET}"
}
# Stats
# Summarize alert volume: counts by severity and by top-5 source for the
# last 24 hours, plus the number of unacknowledged alerts.
stats() {
echo -e "${PINK}=== ALERT STATISTICS ===${RESET}"
echo
echo "By severity (last 24h):"
# "created_at + 1 day > now" selects rows created within the last 24h
sqlite3 "$ALERT_DB" "
SELECT severity, COUNT(*)
FROM alerts
WHERE datetime(created_at, '+1 day') > datetime('now')
GROUP BY severity
" | while IFS='|' read -r severity count; do
echo " $severity: $count"
done
echo
echo "By source (last 24h):"
sqlite3 "$ALERT_DB" "
SELECT source, COUNT(*)
FROM alerts
WHERE datetime(created_at, '+1 day') > datetime('now')
GROUP BY source
ORDER BY COUNT(*) DESC
LIMIT 5
" | while IFS='|' read -r source count; do
echo " $source: $count"
done
echo
local unack=$(sqlite3 "$ALERT_DB" "SELECT COUNT(*) FROM alerts WHERE acknowledged = 0")
echo "Unacknowledged: $unack"
}
# Test alert
# Fire a harmless info alert to verify channel configuration end to end
test_alert() {
send "info" "test" "Test Alert" "This is a test alert from the alerting system"
}
# Help
# Print the command reference for this script to stdout.
help() {
    printf '%b\n' "${PINK}BlackRoad Alerting System${RESET}"
    printf '\n'
    printf '%s\n' "Multi-channel alerts for cluster events"
    printf '\n'
    printf '%s\n' "Commands:"
    printf '%s\n' " send <sev> <src> <title> <msg> Send alert"
    printf '%s\n' " check Check cluster health"
    printf '%s\n' " monitor [interval] Run alert daemon"
    printf '%s\n' " list [filter] [limit] List alerts"
    printf '%s\n' " ack <id> Acknowledge alert"
    printf '%s\n' " ack-all Acknowledge all"
    printf '%s\n' " stats Alert statistics"
    printf '%s\n' " test Send test alert"
    printf '\n'
    printf '%s\n' "Severities: info, warning, error, critical"
    printf '%s\n' "Filters: all, unack, ack, critical, warning"
    printf '\n'
    printf '%s\n' "Examples:"
    printf '%s\n' " $0 send warning cecilia 'High Load' 'Load average is 8.5'"
    printf '%s\n' " $0 monitor 30"
    printf '%s\n' " $0 list unack"
}
# Ensure initialized
# First run: create the DB/config silently so every subcommand below can
# assume they exist
[ -f "$ALERT_DB" ] || init >/dev/null
# Subcommand dispatch; unknown or missing commands fall through to help
case "${1:-help}" in
init)
init
;;
send|alert)
send "$2" "$3" "$4" "$5"
;;
check)
check
;;
monitor|daemon)
monitor "$2"
;;
list|ls)
list "$2" "$3"
;;
ack)
ack "$2" "$3"
;;
ack-all)
ack_all "$2"
;;
stats)
stats
;;
test)
test_alert
;;
*)
help
;;
esac

451
scripts/cost-tracker.sh Normal file
View File

@@ -0,0 +1,451 @@
#!/bin/bash
# BlackRoad Cost Tracker
# Track resource usage and costs across the cluster
# Agent: Icarus (b3e01bd9)
# ANSI colors for console output
PINK='\033[38;5;205m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
CYAN='\033[0;36m'
RESET='\033[0m'
# On-disk state: SQLite database with usage/rates/budgets/invoices tables
COST_DIR="$HOME/.blackroad/costs"
COST_DB="$COST_DIR/costs.db"
# Nodes polled over SSH by collect()
ALL_NODES=("lucidia" "cecilia" "octavia" "aria" "alice")
# Default rates (can be customized)
# Seeded into the rates table by init(); override later with `set-rate`
RATE_CPU_HOUR=0.001 # $ per CPU-hour
RATE_MEM_GB_HOUR=0.0005 # $ per GB-hour
RATE_GPU_HOUR=0.01 # $ per GPU-hour (Hailo)
RATE_INFERENCE=0.0001 # $ per inference request
RATE_TOKEN=0.000001 # $ per token
# Initialize
# Create the costs directory tree and SQLite schema (usage, rates,
# budgets, invoices), then seed the default rate table. Idempotent.
init() {
mkdir -p "$COST_DIR"/{reports,budgets}
sqlite3 "$COST_DB" << 'SQL'
CREATE TABLE IF NOT EXISTS usage (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
node TEXT,
project TEXT DEFAULT 'default',
resource_type TEXT,
quantity REAL,
unit TEXT,
cost REAL
);
CREATE TABLE IF NOT EXISTS rates (
resource_type TEXT PRIMARY KEY,
rate REAL,
unit TEXT,
description TEXT
);
CREATE TABLE IF NOT EXISTS budgets (
project TEXT PRIMARY KEY,
monthly_limit REAL,
alert_threshold REAL DEFAULT 0.8,
current_spend REAL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS invoices (
id TEXT PRIMARY KEY,
project TEXT,
period_start DATE,
period_end DATE,
total REAL,
status TEXT DEFAULT 'pending',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_project ON usage(project);
CREATE INDEX IF NOT EXISTS idx_timestamp ON usage(timestamp);
SQL
# Seed default rates
seed_rates
echo -e "${GREEN}Cost tracker initialized${RESET}"
}
# Seed default rates
# The heredoc delimiter is deliberately unquoted so the $RATE_* defaults
# expand into the SQL. INSERT OR IGNORE keeps any customized rates intact.
seed_rates() {
sqlite3 "$COST_DB" << SQL
INSERT OR IGNORE INTO rates (resource_type, rate, unit, description) VALUES
('cpu', $RATE_CPU_HOUR, 'cpu-hour', 'CPU compute time'),
('memory', $RATE_MEM_GB_HOUR, 'gb-hour', 'Memory usage'),
('gpu', $RATE_GPU_HOUR, 'gpu-hour', 'Hailo accelerator time'),
('inference', $RATE_INFERENCE, 'request', 'LLM inference request'),
('tokens', $RATE_TOKEN, 'token', 'Input/output tokens'),
('storage', 0.00001, 'gb-hour', 'Disk storage'),
('network', 0.00001, 'gb', 'Network transfer');
SQL
}
# Record usage
# Insert a usage row priced at the configured rate and roll the cost into
# the project's budget counter.
# Args: $1 resource type, $2 quantity, $3 project (default "default"),
#       $4 node (default local hostname)
# Returns non-zero for an unknown resource type.
record() {
    local resource="$1"
    local quantity="$2"
    local project="${3:-default}"
    local node="${4:-$(hostname)}"
    local rate
    rate=$(sqlite3 "$COST_DB" "SELECT rate FROM rates WHERE resource_type = '$resource'")
    # An unknown resource yields an empty rate, which would crash bc and
    # corrupt the INSERT below - fail loudly instead
    if [ -z "$rate" ]; then
        echo -e "${RED}Unknown resource type: $resource${RESET}" >&2
        return 1
    fi
    local cost
    cost=$(echo "scale=6; $quantity * $rate" | bc)
    sqlite3 "$COST_DB" "
        INSERT INTO usage (node, project, resource_type, quantity, unit, cost)
        VALUES ('$node', '$project', '$resource', $quantity, (SELECT unit FROM rates WHERE resource_type = '$resource'), $cost)
    "
    # Update budget (no-op when the project has no budget row)
    sqlite3 "$COST_DB" "
        UPDATE budgets SET current_spend = current_spend + $cost WHERE project = '$project'
    "
    echo -e "${GREEN}Recorded: $quantity $resource = \$$cost${RESET}"
}
# Record inference usage
# Convenience wrapper: one inference request plus the combined token count.
# Args: $1 project (default "default"), $2 input tokens, $3 output tokens,
#       $4 node (default local hostname)
record_inference() {
    local project="${1:-default}"
    local tokens_in="${2:-0}"
    local tokens_out="${3:-0}"
    local node="${4:-$(hostname)}"
    record "inference" 1 "$project" "$node"
    record "tokens" "$((tokens_in + tokens_out))" "$project" "$node"
}
# Set rate
# Create or update the billing rate for a resource type. Uses an upsert
# (ON CONFLICT DO UPDATE) so an existing row keeps its description; the
# original INSERT OR REPLACE silently nulled the description column.
# Args: $1 resource type, $2 rate, $3 unit label (default "unit")
set_rate() {
    local resource="$1"
    local rate="$2"
    local unit="${3:-unit}"
    sqlite3 "$COST_DB" "
        INSERT INTO rates (resource_type, rate, unit)
        VALUES ('$resource', $rate, '$unit')
        ON CONFLICT(resource_type) DO UPDATE SET rate = excluded.rate, unit = excluded.unit
    "
    echo -e "${GREEN}Rate set: $resource = \$$rate per $unit${RESET}"
}
# List rates
# Print every configured rate with its unit and description.
rates() {
    echo -e "${PINK}=== RESOURCE RATES ===${RESET}"
    echo
    sqlite3 "$COST_DB" "SELECT resource_type, rate, unit, description FROM rates ORDER BY resource_type" | \
    while IFS='|' read -r resource rate unit desc; do
        printf " %-15s \$%-10.6f per %-10s %s\n" "$resource" "$rate" "$unit" "$desc"
    done
}
# Create budget
# Register (or reset) a monthly budget for a project. Replacing an
# existing budget zeroes its running spend.
# Args: $1 project, $2 monthly limit in $, $3 alert threshold ratio (default 0.8)
budget_create() {
    local project="$1"
    local limit="$2"
    local threshold="${3:-0.8}"
    sqlite3 "$COST_DB" "
        INSERT OR REPLACE INTO budgets (project, monthly_limit, alert_threshold, current_spend)
        VALUES ('$project', $limit, $threshold, 0)
    "
    echo -e "${GREEN}Budget created: $project = \$$limit/month${RESET}"
}
# Check budgets
# Show spend vs. limit for every budget: yellow past the alert threshold,
# red once the limit is exceeded.
budget_check() {
    echo -e "${PINK}=== BUDGET STATUS ===${RESET}"
    echo
    sqlite3 "$COST_DB" "SELECT project, monthly_limit, current_spend, alert_threshold FROM budgets" | \
    while IFS='|' read -r project limit spend threshold; do
        local pct color alert_val
        pct=$(echo "scale=1; $spend * 100 / $limit" | bc 2>/dev/null || echo 0)
        color=$GREEN
        # Guard the spend/limit ratio against a zero limit (bc aborts on
        # divide-by-zero); a zero limit is treated as already exceeded
        if [ "$(echo "$limit > 0" | bc -l)" = "1" ]; then
            alert_val=$(echo "$spend / $limit" | bc -l)
        else
            alert_val=2
        fi
        if [ "$(echo "$alert_val > $threshold" | bc -l)" = "1" ]; then
            color=$YELLOW
        fi
        if [ "$(echo "$alert_val > 1" | bc -l)" = "1" ]; then
            color=$RED
        fi
        printf " %-15s ${color}\$%.2f / \$%.2f (%.1f%%)${RESET}\n" "$project" "$spend" "$limit" "$pct"
    done
}
# Current period costs
# Break down spend for the current period by resource, project, and node.
# Args: $1 project or "all" (default all), $2 period day|week|month (default month)
current() {
    local project="${1:-all}"
    local period="${2:-month}"
    echo -e "${PINK}=== CURRENT $period COSTS ===${RESET}"
    echo
    local where=""
    [ "$project" != "all" ] && where="AND project = '$project'"
    # Map the period to a SQL time filter. Unknown periods previously left
    # the filter empty, producing a malformed "WHERE  AND ..." clause; they
    # now fall back to the monthly window.
    local period_filter
    case "$period" in
        day) period_filter="date(timestamp) = date('now')" ;;
        week) period_filter="datetime(timestamp, '+7 days') > datetime('now')" ;;
        *) period_filter="datetime(timestamp, '+1 month') > datetime('now')" ;;
    esac
    echo "By resource:"
    sqlite3 "$COST_DB" "
        SELECT resource_type, SUM(quantity), unit, SUM(cost)
        FROM usage
        WHERE $period_filter $where
        GROUP BY resource_type
        ORDER BY SUM(cost) DESC
    " | while IFS='|' read -r resource qty unit cost; do
        printf " %-15s %10.2f %-10s \$%.4f\n" "$resource" "$qty" "$unit" "$cost"
    done
    echo
    echo "By project:"
    sqlite3 "$COST_DB" "
        SELECT project, SUM(cost)
        FROM usage
        WHERE $period_filter $where
        GROUP BY project
        ORDER BY SUM(cost) DESC
    " | while IFS='|' read -r proj cost; do
        printf " %-15s \$%.4f\n" "$proj" "$cost"
    done
    echo
    echo "By node:"
    sqlite3 "$COST_DB" "
        SELECT node, SUM(cost)
        FROM usage
        WHERE $period_filter $where
        GROUP BY node
        ORDER BY SUM(cost) DESC
    " | while IFS='|' read -r node cost; do
        printf " %-15s \$%.4f\n" "$node" "$cost"
    done
    echo
    # SUM() is NULL on an empty table; display 0 instead
    local total
    total=$(sqlite3 "$COST_DB" "SELECT SUM(cost) FROM usage WHERE $period_filter $where")
    echo -e "Total: ${GREEN}\$${total:-0}${RESET}"
}
# Generate invoice
# Produce an itemized invoice for a project over a date range, persist it
# to the invoices table, and export a text summary under reports/.
# Args: $1 project, $2 start date (default: first of this month, with a
#       BSD `date -v1d` fallback), $3 end date (default today)
invoice() {
    local project="$1"
    local start_date="${2:-$(date -d 'first day of this month' +%Y-%m-%d 2>/dev/null || date -v1d +%Y-%m-%d)}"
    local end_date="${3:-$(date +%Y-%m-%d)}"
    local invoice_id="inv_$(date +%Y%m)_${project}"
    echo -e "${PINK}=== INVOICE: $invoice_id ===${RESET}"
    echo
    echo "Project: $project"
    echo "Period: $start_date to $end_date"
    echo
    echo "─────────────────────────────────────────────────────────────────"
    printf "%-20s %15s %12s %12s\n" "Resource" "Quantity" "Rate" "Cost"
    echo "─────────────────────────────────────────────────────────────────"
    sqlite3 "$COST_DB" "
        SELECT u.resource_type, SUM(u.quantity), u.unit, r.rate, SUM(u.cost)
        FROM usage u
        JOIN rates r ON u.resource_type = r.resource_type
        WHERE u.project = '$project'
        AND date(u.timestamp) BETWEEN '$start_date' AND '$end_date'
        GROUP BY u.resource_type
    " | while IFS='|' read -r resource qty unit rate cost; do
        printf "%-20s %12.2f %-3s \$%-8.6f \$%.4f\n" "$resource" "$qty" "$unit" "$rate" "$cost"
    done
    echo "─────────────────────────────────────────────────────────────────"
    # Total comes from a single aggregate query. (The original also tried
    # to accumulate it inside the pipeline loop, but that ran in a subshell
    # and the value was discarded.) NULL - no matching usage - is coerced
    # to 0 so the printf and the INSERT below stay valid.
    local total
    total=$(sqlite3 "$COST_DB" "
        SELECT SUM(cost) FROM usage
        WHERE project = '$project'
        AND date(timestamp) BETWEEN '$start_date' AND '$end_date'
    ")
    total=${total:-0}
    printf "%48s \$%.4f\n" "TOTAL:" "$total"
    echo
    # Save invoice
    sqlite3 "$COST_DB" "
        INSERT OR REPLACE INTO invoices (id, project, period_start, period_end, total)
        VALUES ('$invoice_id', '$project', '$start_date', '$end_date', $total)
    "
    # Export to file
    local invoice_file="$COST_DIR/reports/${invoice_id}.txt"
    {
        echo "INVOICE: $invoice_id"
        echo "Project: $project"
        echo "Period: $start_date to $end_date"
        echo "Generated: $(date)"
        echo ""
        echo "Total: \$$total"
    } > "$invoice_file"
    echo "Saved to: $invoice_file"
}
# Cost forecast
# Project future spend by extrapolating the daily average of the last
# 7 days of usage.
# Args: $1 project or "all" (default all), $2 days to project (default 30)
forecast() {
    local project="${1:-all}"
    local days="${2:-30}"
    echo -e "${PINK}=== COST FORECAST ===${RESET}"
    echo "Based on last 7 days, projecting $days days"
    echo
    # The base query already has a WHERE clause, so the project filter must
    # be appended with AND (the original emitted a second WHERE, which is a
    # SQL syntax error whenever a project was given)
    local filter=""
    [ "$project" != "all" ] && filter="AND project = '$project'"
    local daily_avg
    daily_avg=$(sqlite3 "$COST_DB" "
        SELECT SUM(cost) / 7 FROM usage
        WHERE datetime(timestamp, '+7 days') > datetime('now')
        $filter
    ")
    # SUM() is NULL with no usage; treat that as zero so bc gets a number
    daily_avg=${daily_avg:-0}
    local projected
    projected=$(echo "scale=2; $daily_avg * $days" | bc)
    echo "Daily average: \$${daily_avg}"
    echo "Projected ${days}-day cost: \$$projected"
    if [ "$project" != "all" ]; then
        local limit
        limit=$(sqlite3 "$COST_DB" "SELECT monthly_limit FROM budgets WHERE project = '$project'")
        if [ -n "$limit" ]; then
            local pct
            pct=$(echo "scale=1; $projected * 100 / $limit" | bc)
            echo "Budget utilization: ${pct}%"
        fi
    fi
}
# Collect usage from nodes
# SSH into each node and record CPU-hours, resident memory (GB), and disk
# use (GB) into the usage table under the "default" project.
# NOTE(review): the CPU figure comes from cumulative /proc/stat counters
# since boot, so repeated collects re-record running totals rather than
# deltas - confirm this is the intended billing model.
collect() {
echo -e "${PINK}=== COLLECTING USAGE ===${RESET}"
echo
for node in "${ALL_NODES[@]}"; do
echo -n " $node: "
if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
echo "(offline)"
continue
fi
# Get resource usage
# One SSH round trip returns "cpu_hours|mem_gb|disk_gb"
local metrics=$(ssh "$node" "
cpu_hours=\$(cat /proc/stat | awk '/^cpu / {print (\$2+\$3+\$4)/100/3600}')
mem_gb=\$(free -g | awk '/Mem:/ {print \$3}')
disk_gb=\$(df / | awk 'NR==2 {print \$3/1024/1024}')
echo \"\$cpu_hours|\$mem_gb|\$disk_gb\"
" 2>/dev/null)
if [ -n "$metrics" ]; then
local cpu=$(echo "$metrics" | cut -d'|' -f1)
local mem=$(echo "$metrics" | cut -d'|' -f2)
local disk=$(echo "$metrics" | cut -d'|' -f3)
record "cpu" "$cpu" "default" "$node" >/dev/null
record "memory" "$mem" "default" "$node" >/dev/null
record "storage" "$disk" "default" "$node" >/dev/null
echo "collected"
else
echo "failed"
fi
done
}
# Reset monthly budgets
# Zero every budget's running spend counter (intended for month rollover)
reset_budgets() {
sqlite3 "$COST_DB" "UPDATE budgets SET current_spend = 0"
echo -e "${GREEN}Reset all budget counters${RESET}"
}
# Help
# Print the command reference for this script to stdout.
help() {
    printf '%b\n' "${PINK}BlackRoad Cost Tracker${RESET}"
    printf '\n'
    printf '%s\n' "Track resource usage and costs"
    printf '\n'
    printf '%s\n' "Usage Recording:"
    printf '%s\n' " record <resource> <qty> [proj] Record usage"
    printf '%s\n' " record-inference [proj] [in] [out] Record inference"
    printf '%s\n' " collect Collect from nodes"
    printf '\n'
    printf '%s\n' "Rates & Budgets:"
    printf '%s\n' " rates List rates"
    printf '%s\n' " set-rate <res> <rate> [unit] Set rate"
    printf '%s\n' " budget-create <proj> <limit> Create budget"
    printf '%s\n' " budget-check Check budgets"
    printf '\n'
    printf '%s\n' "Reports:"
    printf '%s\n' " current [proj] [day|week|month] Current costs"
    printf '%s\n' " invoice <proj> [start] [end] Generate invoice"
    printf '%s\n' " forecast [proj] [days] Cost forecast"
    printf '\n'
    printf '%s\n' "Examples:"
    printf '%s\n' " $0 record inference 100 myproject"
    printf '%s\n' " $0 budget-create myproject 50"
    printf '%s\n' " $0 invoice myproject 2024-01-01"
}
# Ensure initialized
# First run: create the DB/rates silently so every subcommand below can
# assume they exist
[ -f "$COST_DB" ] || init >/dev/null
# Subcommand dispatch; unknown or missing commands fall through to help
case "${1:-help}" in
init)
init
;;
record)
record "$2" "$3" "$4" "$5"
;;
record-inference)
record_inference "$2" "$3" "$4" "$5"
;;
collect)
collect
;;
rates)
rates
;;
set-rate)
set_rate "$2" "$3" "$4"
;;
budget-create|budget)
budget_create "$2" "$3" "$4"
;;
budget-check|budgets)
budget_check
;;
current|costs)
current "$2" "$3"
;;
invoice)
invoice "$2" "$3" "$4"
;;
forecast)
forecast "$2" "$3"
;;
reset-budgets)
reset_budgets
;;
*)
help
;;
esac

468
scripts/deploy-alert-manager.sh Executable file
View File

@@ -0,0 +1,468 @@
#!/bin/bash
# Deploy Alert Manager for BlackRoad OS
# Wave 10A: Intelligent alerting system
# Ships a self-contained, stdlib-only Python alert service to the octavia
# node and runs it under a user-level systemd unit on port 5700.
set -e
echo "🚨 Deploying Alert Manager to octavia..."
# Create alert manager on octavia
# Everything up to the closing REMOTE marker executes on the remote host;
# the quoted delimiter prevents local variable/command expansion.
ssh octavia << 'REMOTE'
set -e
echo "📁 Creating alert manager directories..."
mkdir -p ~/alert-manager/{alerts,history}
# Create alert manager using Python stdlib
cat > ~/alert-manager/app.py << 'EOF'
#!/usr/bin/env python3
# BlackRoad Alert Manager: evaluates metrics from the local metrics
# service against threshold rules and serves a status dashboard/API.
import http.server
import socketserver
import json
import os
import time
from urllib.request import urlopen, Request
from urllib.error import URLError
from datetime import datetime
from email.mime.text import MIMEText
import smtplib
# HTTP port this service listens on, and where alert state is written
PORT = 5700
ALERTS_DIR = os.path.expanduser('~/alert-manager/alerts')
HISTORY_DIR = os.path.expanduser('~/alert-manager/history')
# Alert rules configuration
# Each rule compares one metric from the metrics service (localhost:5400)
# against a threshold; 'message' is a format string filled at trigger time.
# NOTE(review): the 'service_down' rule keys on 'services', which is not a
# field of the 'system' metrics dict, so the per-metric loop never matches
# it - service outages are handled separately in check_metrics. Confirm
# whether this rule entry is intentional.
ALERT_RULES = {
'cpu_high': {
'metric': 'cpu_percent',
'threshold': 80,
'operator': '>',
'severity': 'warning',
'message': 'CPU usage is high: {value}%'
},
'cpu_critical': {
'metric': 'cpu_percent',
'threshold': 95,
'operator': '>',
'severity': 'critical',
'message': 'CPU usage is critical: {value}%'
},
'memory_high': {
'metric': 'memory_percent',
'threshold': 85,
'operator': '>',
'severity': 'warning',
'message': 'Memory usage is high: {value}%'
},
'memory_critical': {
'metric': 'memory_percent',
'threshold': 95,
'operator': '>',
'severity': 'critical',
'message': 'Memory usage is critical: {value}%'
},
'disk_high': {
'metric': 'disk_percent',
'threshold': 90,
'operator': '>',
'severity': 'warning',
'message': 'Disk usage is high: {value}%'
},
'service_down': {
'metric': 'services',
'threshold': 5,
'operator': '<',
'severity': 'critical',
'message': 'Service down: {service}'
}
}
# In-memory alert state machine: tracks currently-active alerts, clears
# them when their condition resolves, and fans out notifications.
class AlertManager:
def __init__(self):
# active_alerts: rule id -> alert dict with first_seen/last_seen/count
self.active_alerts = {}
# alert_history: in-memory log of every trigger
# NOTE(review): this list grows without bound for the process lifetime;
# the on-disk daily files are the durable record - confirm acceptable
self.alert_history = []
def check_metrics(self):
"""Fetch current metrics and check against rules"""
try:
with urlopen('http://localhost:5400/metrics/json', timeout=2) as response:
metrics = json.loads(response.read())
triggered_alerts = []
# Check system metrics
system = metrics.get('system', {})
for rule_id, rule in ALERT_RULES.items():
if rule['metric'] in system:
value = system[rule['metric']]
if self._evaluate_rule(value, rule['threshold'], rule['operator']):
alert = {
'id': rule_id,
'severity': rule['severity'],
'message': rule['message'].format(value=value),
'value': value,
'threshold': rule['threshold'],
'timestamp': datetime.now().isoformat()
}
triggered_alerts.append(alert)
# Check service health
# One critical alert per unhealthy service when fewer than 5 are up
services = metrics.get('services', {})
healthy_count = sum(1 for v in services.values() if v)
if healthy_count < 5:
for service, status in services.items():
if not status:
alert = {
'id': f'service_{service}_down',
'severity': 'critical',
'message': f'Service down: {service}',
'service': service,
'timestamp': datetime.now().isoformat()
}
triggered_alerts.append(alert)
# Process alerts
for alert in triggered_alerts:
self._handle_alert(alert)
# Clear resolved alerts
self._clear_resolved_alerts(metrics)
return triggered_alerts
# Any failure (metrics service down, bad JSON) is reported as a
# single error pseudo-alert rather than crashing the handler
except Exception as e:
return [{'error': str(e)}]
def _evaluate_rule(self, value, threshold, operator):
"""Evaluate a rule condition"""
if operator == '>':
return value > threshold
elif operator == '<':
return value < threshold
elif operator == '==':
return value == threshold
return False
def _handle_alert(self, alert):
"""Handle a triggered alert"""
alert_id = alert['id']
# Check if alert already active
if alert_id in self.active_alerts:
# Update existing alert (no re-notification on repeats)
self.active_alerts[alert_id]['count'] += 1
self.active_alerts[alert_id]['last_seen'] = alert['timestamp']
else:
# New alert
alert['count'] = 1
alert['first_seen'] = alert['timestamp']
alert['last_seen'] = alert['timestamp']
self.active_alerts[alert_id] = alert
# Send notification for new alerts
self._send_notification(alert)
# Log to history
self._log_to_history(alert)
def _clear_resolved_alerts(self, metrics):
"""Clear alerts that are no longer triggered"""
system = metrics.get('system', {})
resolved = []
for alert_id, alert in list(self.active_alerts.items()):
# Check if condition is still met
should_clear = False
if 'service' in alert:
# Service alert: clear once the service reports healthy again
services = metrics.get('services', {})
if alert['service'] in services and services[alert['service']]:
should_clear = True
else:
# System metric alert: clear when the rule no longer evaluates true
for rule_id, rule in ALERT_RULES.items():
if rule_id == alert_id:
if rule['metric'] in system:
value = system[rule['metric']]
if not self._evaluate_rule(value, rule['threshold'], rule['operator']):
should_clear = True
if should_clear:
resolved.append(alert_id)
del self.active_alerts[alert_id]
# NOTE(review): the returned list is currently ignored by the caller
return resolved
def _send_notification(self, alert):
"""Send notification (webhook or email)"""
# Check for webhook configuration
webhook_url = os.environ.get('ALERT_WEBHOOK_URL')
if webhook_url:
try:
data = json.dumps(alert).encode()
req = Request(webhook_url, data=data, headers={'Content-Type': 'application/json'})
urlopen(req, timeout=5)
# Notification is best-effort: delivery failures are swallowed
except:
pass
def _log_to_history(self, alert):
"""Log alert to history file"""
# One JSON-lines file per day
history_file = os.path.join(HISTORY_DIR, f"alerts_{datetime.now().strftime('%Y%m%d')}.json")
history_entry = {
'timestamp': alert['timestamp'],
'id': alert['id'],
'severity': alert['severity'],
'message': alert['message']
}
self.alert_history.append(history_entry)
# Append to daily log file (best-effort; disk errors are swallowed)
try:
with open(history_file, 'a') as f:
f.write(json.dumps(history_entry) + '\n')
except:
pass
# Single process-wide manager instance shared by all requests
alert_manager = AlertManager()
# HTTP front end: HTML dashboard at /, JSON at /api/alerts, liveness at
# /api/health.
# NOTE(review): metrics are only evaluated when the dashboard is requested;
# there is no background polling loop - confirm that is intended.
class AlertHandler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
if self.path == '/':
self.send_response(200)
self.send_header('Content-type', 'text/html')
self.end_headers()
# Check for new alerts
triggered = alert_manager.check_metrics()
active_count = len(alert_manager.active_alerts)
critical_count = sum(1 for a in alert_manager.active_alerts.values() if a['severity'] == 'critical')
warning_count = sum(1 for a in alert_manager.active_alerts.values() if a['severity'] == 'warning')
html = f'''<!DOCTYPE html>
<html>
<head>
<title>BlackRoad Alert Manager</title>
<meta http-equiv="refresh" content="15">
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
background: #0b0c0e;
color: #d8d9da;
padding: 20px;
}}
.header {{
background: #1f1f20;
padding: 20px;
border-radius: 8px;
margin-bottom: 20px;
}}
.title {{
font-size: 28px;
font-weight: 600;
color: #ff1d6c;
margin-bottom: 10px;
}}
.stats {{
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 20px;
margin-bottom: 20px;
}}
.stat-card {{
background: #1f1f20;
padding: 16px;
border-radius: 8px;
border-left: 4px solid;
}}
.stat-card.active {{ border-color: #0096FF; }}
.stat-card.critical {{ border-color: #ff1d6c; }}
.stat-card.warning {{ border-color: #f5a623; }}
.stat-value {{
font-size: 32px;
font-weight: 300;
margin-bottom: 4px;
}}
.stat-label {{
font-size: 14px;
color: #9d9fa1;
}}
.alerts-section {{
background: #1f1f20;
padding: 20px;
border-radius: 8px;
}}
.section-title {{
font-size: 18px;
margin-bottom: 16px;
color: #d8d9da;
}}
.alert {{
padding: 12px;
border-radius: 4px;
margin-bottom: 12px;
border-left: 4px solid;
}}
.alert.critical {{
background: #ff1d6c22;
border-color: #ff1d6c;
}}
.alert.warning {{
background: #f5a62322;
border-color: #f5a623;
}}
.alert-header {{
display: flex;
justify-content: space-between;
margin-bottom: 4px;
}}
.alert-severity {{
font-weight: 600;
text-transform: uppercase;
font-size: 12px;
}}
.alert-time {{
font-size: 12px;
color: #9d9fa1;
}}
.alert-message {{
font-size: 14px;
}}
.no-alerts {{
text-align: center;
padding: 40px;
color: #73bf69;
font-size: 18px;
}}
</style>
</head>
<body>
<div class="header">
<div class="title">🚨 Alert Manager</div>
<div style="color: #9d9fa1; font-size: 14px;">Real-time monitoring • Auto-refresh: 15s</div>
</div>
<div class="stats">
<div class="stat-card active">
<div class="stat-value">{active_count}</div>
<div class="stat-label">Active Alerts</div>
</div>
<div class="stat-card critical">
<div class="stat-value">{critical_count}</div>
<div class="stat-label">Critical</div>
</div>
<div class="stat-card warning">
<div class="stat-value">{warning_count}</div>
<div class="stat-label">Warnings</div>
</div>
</div>
<div class="alerts-section">
<div class="section-title">Active Alerts</div>
'''
# Render one card per active alert, or a green all-clear banner
if alert_manager.active_alerts:
for alert_id, alert in alert_manager.active_alerts.items():
severity_class = alert['severity']
html += f'''
<div class="alert {severity_class}">
<div class="alert-header">
<span class="alert-severity">{alert['severity']}</span>
<span class="alert-time">{alert['last_seen']}</span>
</div>
<div class="alert-message">{alert['message']}</div>
<div style="font-size: 12px; color: #9d9fa1; margin-top: 4px;">
Triggered {alert['count']} time(s) • First seen: {alert['first_seen']}
</div>
</div>'''
else:
html += '<div class="no-alerts">✅ All systems healthy - No active alerts</div>'
html += '''
</div>
</body>
</html>'''
self.wfile.write(html.encode())
elif self.path == '/api/alerts':
# JSON view of the current active-alert set (no re-evaluation here)
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
response = json.dumps({
'active_alerts': list(alert_manager.active_alerts.values()),
'count': len(alert_manager.active_alerts)
})
self.wfile.write(response.encode())
elif self.path == '/api/health':
# Liveness probe used by the deploy script
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
response = json.dumps({'status': 'healthy', 'service': 'alert-manager'})
self.wfile.write(response.encode())
else:
self.send_response(404)
self.end_headers()
def log_message(self, format, *args):
# Suppress per-request access logging
pass
# Serve forever on all interfaces
with socketserver.TCPServer(("", PORT), AlertHandler) as httpd:
print(f"Alert Manager running on port {PORT}")
httpd.serve_forever()
EOF
chmod +x ~/alert-manager/app.py
echo "📝 Creating systemd service..."
mkdir -p ~/.config/systemd/user
cat > ~/.config/systemd/user/alert-manager.service << 'SYSTEMD'
[Unit]
Description=BlackRoad Alert Manager
After=network.target
[Service]
Type=simple
WorkingDirectory=%h/alert-manager
ExecStart=/usr/bin/python3 %h/alert-manager/app.py
Restart=always
RestartSec=10
[Install]
WantedBy=default.target
SYSTEMD
echo "🚀 Starting Alert Manager service..."
systemctl --user daemon-reload
systemctl --user enable alert-manager.service
systemctl --user restart alert-manager.service
echo "⏳ Waiting for Alert Manager to start..."
sleep 3
echo "✅ Testing Alert Manager..."
curl -f http://localhost:5700/api/health || echo "⚠️ Health check failed"
echo ""
echo "✅ Alert Manager deployed successfully!"
systemctl --user status alert-manager.service --no-pager | head -10
REMOTE
echo ""
echo "✅ Wave 10A deployment complete!"
echo ""
echo "🚨 Access Alert Manager:"
echo " http://octavia:5700/"
echo ""
echo "📊 Features:"
echo " • Real-time alert monitoring"
echo " • Threshold-based rules"
echo " • Alert history tracking"
echo " • Webhook integration ready"

471
scripts/deploy-backup.sh Executable file
View File

@@ -0,0 +1,471 @@
#!/bin/bash
# Deploy Automated Backup System for BlackRoad OS
# Wave 12A: Disaster recovery and data protection
set -e
echo "💾 Deploying Backup System to octavia..."
# Create backup system on octavia
ssh octavia << 'REMOTE'
set -e
echo "📁 Creating backup system directories..."
mkdir -p ~/backup-system/{backups,logs,scripts}
# Create backup orchestrator using Python stdlib
cat > ~/backup-system/app.py << 'EOF'
#!/usr/bin/env python3
import http.server
import socketserver
import json
import os
import subprocess
import tarfile
import shutil
from datetime import datetime
from pathlib import Path
PORT = 5900
BACKUP_DIR = os.path.expanduser('~/backup-system/backups')
LOGS_DIR = os.path.expanduser('~/backup-system/logs')
class BackupManager:
    """Create, list and prune tar.gz snapshots of BlackRoad service state.

    Snapshots are staged in a temp directory under BACKUP_DIR, packed into
    backup_<type>_<timestamp>.tar.gz, and the staging directory is removed.
    All paths are relative to the deploying user's home on this host.
    """

    def __init__(self):
        # Root directory for finished tarballs and staging dirs.
        self.backup_dir = Path(BACKUP_DIR)
        self.backup_dir.mkdir(parents=True, exist_ok=True)
        # Define what to backup
        # NOTE(review): only 'services' and 'website' are actually consumed by
        # create_backup(); the 'configs' glob list below is never read — the
        # systemd units and the cloudflared config are re-hardcoded inside
        # create_backup(), and /etc/nginx/sites-available is never backed up
        # at all. Confirm intent and either wire these globs in or drop them.
        self.backup_targets = {
            'configs': [
                '~/.config/systemd/user/*.service',
                '~/.cloudflared/config.yml',
                '/etc/nginx/sites-available/*',
            ],
            'services': {
                'tts-api': '~/tts-api',
                'monitor-api': '~/monitoring',
                'load-balancer': '~/load-balancer',
                'fleet-monitor': '~/fleet-monitor',
                'notifications': '~/notifications',
                'metrics': '~/metrics',
                'analytics': '~/analytics',
                'grafana': '~/grafana',
                'alert-manager': '~/alert-manager',
                'log-aggregator': '~/log-aggregator',
            },
            'website': '~/www.blackroad.io',
        }

    def create_backup(self, backup_type='full'):
        """Create a backup snapshot.

        backup_type: free-form label embedded in the archive name (default 'full').
        Returns a result dict with 'success', per-file 'files', accumulated
        'errors', and (on success) tarball path/size. Individual copy failures
        are recorded in 'errors' but do not abort the snapshot.
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        backup_name = f'backup_{backup_type}_{timestamp}'
        backup_path = self.backup_dir / backup_name
        backup_path.mkdir(parents=True, exist_ok=True)
        results = {
            'timestamp': timestamp,
            'type': backup_type,
            'name': backup_name,
            'files': [],
            'errors': []
        }
        try:
            # Backup systemd service files
            config_dir = backup_path / 'configs'
            config_dir.mkdir(parents=True, exist_ok=True)
            systemd_dir = os.path.expanduser('~/.config/systemd/user')
            if os.path.exists(systemd_dir):
                for service_file in Path(systemd_dir).glob('*.service'):
                    try:
                        shutil.copy2(service_file, config_dir)
                        results['files'].append(str(service_file))
                    except Exception as e:
                        results['errors'].append(f"Failed to backup {service_file}: {str(e)}")
            # Backup Cloudflare config
            cf_config = os.path.expanduser('~/.cloudflared/config.yml')
            if os.path.exists(cf_config):
                try:
                    shutil.copy2(cf_config, config_dir / 'cloudflared-config.yml')
                    results['files'].append(cf_config)
                except Exception as e:
                    results['errors'].append(f"Failed to backup Cloudflare config: {str(e)}")
            # Backup service directories
            for service_name, service_path in self.backup_targets['services'].items():
                expanded_path = os.path.expanduser(service_path)
                if os.path.exists(expanded_path):
                    dest = backup_path / 'services' / service_name
                    try:
                        # Skip caches and logs; copytree fails if dest exists,
                        # which cannot happen here (fresh staging dir per run).
                        shutil.copytree(expanded_path, dest,
                                        ignore=shutil.ignore_patterns('__pycache__', '*.pyc', '*.log'))
                        results['files'].append(service_path)
                    except Exception as e:
                        results['errors'].append(f"Failed to backup {service_name}: {str(e)}")
            # Backup website
            website_path = os.path.expanduser(self.backup_targets['website'])
            if os.path.exists(website_path):
                dest = backup_path / 'website'
                try:
                    shutil.copytree(website_path, dest)
                    results['files'].append(self.backup_targets['website'])
                except Exception as e:
                    results['errors'].append(f"Failed to backup website: {str(e)}")
            # Create tarball
            tarball_path = self.backup_dir / f'{backup_name}.tar.gz'
            with tarfile.open(tarball_path, 'w:gz') as tar:
                tar.add(backup_path, arcname=backup_name)
            # Remove temp directory
            # NOTE(review): if tarfile.open/add raises, the staging dir is
            # left behind (the except below fires first) — leaks disk space.
            shutil.rmtree(backup_path)
            # Get backup size
            backup_size = os.path.getsize(tarball_path)
            results['size_bytes'] = backup_size
            results['size_mb'] = round(backup_size / (1024 * 1024), 2)
            results['tarball'] = str(tarball_path)
            results['success'] = True
            # Log backup
            self._log_backup(results)
        except Exception as e:
            results['success'] = False
            results['errors'].append(f"Backup failed: {str(e)}")
        return results

    def list_backups(self):
        """List all available backups, newest first, as dicts of name/path/size/age."""
        backups = []
        for backup_file in sorted(self.backup_dir.glob('backup_*.tar.gz'), reverse=True):
            stat = backup_file.stat()
            backups.append({
                'name': backup_file.name,
                'path': str(backup_file),
                'size_mb': round(stat.st_size / (1024 * 1024), 2),
                'created': datetime.fromtimestamp(stat.st_mtime).isoformat(),
                'age_hours': round((datetime.now().timestamp() - stat.st_mtime) / 3600, 1)
            })
        return backups

    def cleanup_old_backups(self, keep_count=10):
        """Keep only the N most recent backups (by mtime); returns deleted names.

        Deletion failures are deliberately ignored (best-effort pruning).
        """
        backups = sorted(self.backup_dir.glob('backup_*.tar.gz'),
                         key=lambda x: x.stat().st_mtime, reverse=True)
        deleted = []
        for old_backup in backups[keep_count:]:
            try:
                old_backup.unlink()
                deleted.append(old_backup.name)
            except Exception as e:
                pass
        return deleted

    def get_backup_stats(self):
        """Aggregate backup statistics: count, total size (MB), oldest/newest entry."""
        backups = self.list_backups()
        total_size = sum(b['size_mb'] for b in backups)
        return {
            'count': len(backups),
            'total_size_mb': round(total_size, 2),
            'oldest': backups[-1] if backups else None,
            'newest': backups[0] if backups else None
        }

    def _log_backup(self, results):
        """Log backup result as one JSON line in a per-day file under LOGS_DIR."""
        log_file = Path(LOGS_DIR) / f"backup_{datetime.now().strftime('%Y%m%d')}.log"
        log_file.parent.mkdir(parents=True, exist_ok=True)
        with open(log_file, 'a') as f:
            f.write(json.dumps(results) + '\n')
backup_manager = BackupManager()
class BackupHandler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
if self.path == '/':
self.send_response(200)
self.send_header('Content-type', 'text/html')
self.end_headers()
stats = backup_manager.get_backup_stats()
backups = backup_manager.list_backups()
html = f'''<!DOCTYPE html>
<html>
<head>
<title>BlackRoad Backup System</title>
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
background: #0b0c0e;
color: #d8d9da;
padding: 20px;
}}
.header {{
background: #1f1f20;
padding: 20px;
border-radius: 8px;
margin-bottom: 20px;
}}
.title {{
font-size: 28px;
font-weight: 600;
color: #73bf69;
margin-bottom: 10px;
}}
.stats {{
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin-bottom: 20px;
}}
.stat-card {{
background: #1f1f20;
padding: 16px;
border-radius: 8px;
border-left: 4px solid #73bf69;
}}
.stat-value {{
font-size: 32px;
font-weight: 300;
margin-bottom: 4px;
}}
.stat-label {{
font-size: 14px;
color: #9d9fa1;
}}
.actions {{
background: #1f1f20;
padding: 16px;
border-radius: 8px;
margin-bottom: 20px;
}}
.btn {{
background: #73bf69;
color: #0b0c0e;
border: none;
padding: 10px 20px;
border-radius: 4px;
font-weight: 600;
cursor: pointer;
margin-right: 10px;
}}
.btn:hover {{
background: #8cd87a;
}}
.backups-list {{
background: #1f1f20;
padding: 20px;
border-radius: 8px;
}}
.section-title {{
font-size: 18px;
margin-bottom: 16px;
}}
.backup-item {{
background: #252527;
padding: 12px;
border-radius: 4px;
margin-bottom: 12px;
display: flex;
justify-content: space-between;
align-items: center;
}}
.backup-info {{
flex: 1;
}}
.backup-name {{
font-weight: 600;
margin-bottom: 4px;
}}
.backup-meta {{
font-size: 12px;
color: #9d9fa1;
}}
.no-backups {{
text-align: center;
padding: 40px;
color: #9d9fa1;
}}
</style>
</head>
<body>
<div class="header">
<div class="title">💾 Backup System</div>
<div style="color: #9d9fa1; font-size: 14px;">Automated disaster recovery</div>
</div>
<div class="stats">
<div class="stat-card">
<div class="stat-value">{stats['count']}</div>
<div class="stat-label">Total Backups</div>
</div>
<div class="stat-card">
<div class="stat-value">{stats['total_size_mb']} MB</div>
<div class="stat-label">Storage Used</div>
</div>
<div class="stat-card">
<div class="stat-value">{'Recent' if stats.get('newest') else 'None'}</div>
<div class="stat-label">Latest Backup</div>
</div>
</div>
<div class="actions">
<button class="btn" onclick="window.location.href='/api/backup/create'">
Create Backup Now
</button>
<button class="btn" onclick="window.location.href='/api/backup/cleanup'">
Cleanup Old Backups
</button>
</div>
<div class="backups-list">
<div class="section-title">Available Backups</div>
'''
if backups:
for backup in backups:
html += f'''
<div class="backup-item">
<div class="backup-info">
<div class="backup-name">{backup['name']}</div>
<div class="backup-meta">
{backup['size_mb']} MB • Created {backup['age_hours']}h ago
</div>
</div>
</div>'''
else:
html += '<div class="no-backups">No backups yet. Create your first backup!</div>'
html += '''
</div>
</body>
</html>'''
self.wfile.write(html.encode())
elif self.path == '/api/backup/create':
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
result = backup_manager.create_backup()
response = json.dumps(result)
self.wfile.write(response.encode())
elif self.path == '/api/backup/list':
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
backups = backup_manager.list_backups()
response = json.dumps({'backups': backups})
self.wfile.write(response.encode())
elif self.path == '/api/backup/cleanup':
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
deleted = backup_manager.cleanup_old_backups(keep_count=10)
response = json.dumps({'deleted': deleted, 'count': len(deleted)})
self.wfile.write(response.encode())
elif self.path == '/api/health':
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
response = json.dumps({'status': 'healthy', 'service': 'backup-system'})
self.wfile.write(response.encode())
else:
self.send_response(404)
self.end_headers()
def log_message(self, format, *args):
pass
with socketserver.TCPServer(("", PORT), BackupHandler) as httpd:
print(f"Backup System running on port {PORT}")
httpd.serve_forever()
EOF
chmod +x ~/backup-system/app.py
echo "📝 Creating systemd service..."
mkdir -p ~/.config/systemd/user
cat > ~/.config/systemd/user/backup-system.service << 'SYSTEMD'
[Unit]
Description=BlackRoad Backup System
After=network.target
[Service]
Type=simple
WorkingDirectory=%h/backup-system
ExecStart=/usr/bin/python3 %h/backup-system/app.py
Restart=always
RestartSec=10
[Install]
WantedBy=default.target
SYSTEMD
# Create daily backup cron job
cat > ~/backup-system/scripts/daily-backup.sh << 'BACKUP'
#!/bin/bash
# Daily automated backup
# Triggers a full snapshot, then prunes old archives, via the backup-system
# HTTP API on localhost:5900. Output is discarded; failures are silent.
# NOTE(review): this script is generated but never registered with cron or a
# systemd timer anywhere in the deploy script — until it is scheduled (e.g.
# a crontab entry running ~/backup-system/scripts/daily-backup.sh), the
# "daily" backup never runs. Confirm the intended scheduling mechanism.
curl -s http://localhost:5900/api/backup/create > /dev/null
curl -s http://localhost:5900/api/backup/cleanup > /dev/null
BACKUP
chmod +x ~/backup-system/scripts/daily-backup.sh
echo "🚀 Starting Backup System service..."
systemctl --user daemon-reload
systemctl --user enable backup-system.service
systemctl --user restart backup-system.service
echo "⏳ Waiting for Backup System to start..."
sleep 3
echo "✅ Testing Backup System..."
curl -f http://localhost:5900/api/health || echo "⚠️ Health check failed"
echo ""
echo "💾 Creating initial backup..."
curl -s http://localhost:5900/api/backup/create | python3 -m json.tool
echo ""
echo "✅ Backup System deployed successfully!"
systemctl --user status backup-system.service --no-pager | head -10
REMOTE
echo ""
echo "✅ Wave 12A deployment complete!"
echo ""
echo "💾 Access Backup System:"
echo " http://octavia:5900/"
echo ""
echo "📊 Features:"
echo " • Automated configuration backups"
echo " • Service data snapshots"
echo " • One-click backup creation"
echo " • Retention management"
echo " • Backup verification"

332
scripts/deploy-grafana.sh Normal file
View File

@@ -0,0 +1,332 @@
#!/bin/bash
# ============================================================================
# BLACKROAD OS, INC. - PROPRIETARY AND CONFIDENTIAL
# Copyright (c) 2024-2026 BlackRoad OS, Inc. All Rights Reserved.
#
# This code is the intellectual property of BlackRoad OS, Inc.
# AI-assisted development does not transfer ownership to AI providers.
# Unauthorized use, copying, or distribution is prohibited.
# NOT licensed for AI training or data extraction.
# ============================================================================
# Deploy Grafana for BlackRoad OS monitoring
# Wave 8A: Professional dashboards (no external packages needed!)
set -e
echo "🎨 Deploying Grafana to octavia..."
# Create Grafana dashboard using only standard library
ssh octavia << 'REMOTE'
set -e
echo "📁 Creating Grafana directories..."
mkdir -p ~/grafana
# Create Grafana-style dashboard using http.server + urllib (Python standard library only!)
cat > ~/grafana/app.py << 'EOF'
#!/usr/bin/env python3
import http.server
import socketserver
import json
from urllib.request import urlopen
from urllib.error import URLError
from datetime import datetime
PORT = 5600
class GrafanaHandler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
if self.path == '/':
self.send_response(200)
self.send_header('Content-type', 'text/html')
self.end_headers()
try:
# Fetch metrics from our collector
with urlopen('http://localhost:5400/metrics/json', timeout=2) as response:
metrics = json.loads(response.read())
services_healthy = sum(1 for v in metrics['services'].values() if v)
services_total = len(metrics['services'])
# Format uptime
seconds = metrics['uptime_seconds']
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
uptime_formatted = f"{hours}h {minutes}m" if hours > 0 else f"{minutes}m"
# Generate HTML
html = f'''<!DOCTYPE html>
<html>
<head>
<title>BlackRoad Grafana</title>
<meta http-equiv="refresh" content="10">
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
background: #0b0c0e;
color: #d8d9da;
}}
.navbar {{
background: #1f1f20;
padding: 12px 20px;
border-bottom: 1px solid #2d2e30;
display: flex;
align-items: center;
justify-content: space-between;
}}
.logo {{
font-size: 20px;
font-weight: 600;
color: #ff1d6c;
}}
.time {{
color: #9d9fa1;
font-size: 14px;
}}
.container {{
padding: 20px;
max-width: 1400px;
margin: 0 auto;
}}
.dashboard-header {{
margin-bottom: 20px;
}}
.dashboard-title {{
font-size: 28px;
font-weight: 500;
margin-bottom: 5px;
}}
.dashboard-subtitle {{
color: #9d9fa1;
font-size: 14px;
}}
.row {{
display: grid;
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
gap: 20px;
margin-bottom: 20px;
}}
.panel {{
background: #1f1f20;
border: 1px solid #2d2e30;
border-radius: 4px;
padding: 16px;
}}
.panel-title {{
font-size: 14px;
font-weight: 500;
margin-bottom: 12px;
color: #d8d9da;
}}
.metric-value {{
font-size: 36px;
font-weight: 300;
margin-bottom: 4px;
}}
.metric-label {{
font-size: 12px;
color: #9d9fa1;
}}
.metric-good {{ color: #73bf69; }}
.metric-warning {{ color: #f5a623; }}
.metric-critical {{ color: #ff1d6c; }}
.status-indicator {{
display: inline-block;
width: 8px;
height: 8px;
border-radius: 50%;
margin-right: 6px;
}}
.status-up {{ background: #73bf69; }}
.status-down {{ background: #ff1d6c; }}
.service-row {{
padding: 8px 0;
border-bottom: 1px solid #2d2e30;
display: flex;
align-items: center;
justify-content: space-between;
}}
.service-name {{
display: flex;
align-items: center;
}}
.graph {{
height: 200px;
background: #161719;
border-radius: 4px;
margin-top: 12px;
position: relative;
overflow: hidden;
}}
.bar {{
position: absolute;
bottom: 0;
left: 0;
background: linear-gradient(180deg, #ff1d6c 0%, #f5a623 100%);
transition: width 0.3s ease;
}}
.refresh-indicator {{
color: #9d9fa1;
font-size: 12px;
text-align: right;
margin-top: 10px;
}}
</style>
</head>
<body>
<div class="navbar">
<div class="logo">⚡ BlackRoad Grafana</div>
<div class="time">{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
</div>
<div class="container">
<div class="dashboard-header">
<div class="dashboard-title">BlackRoad Infrastructure Overview</div>
<div class="dashboard-subtitle">Real-time monitoring • Auto-refresh: 10s</div>
</div>
<div class="row">
<div class="panel">
<div class="panel-title">CPU Usage</div>
<div class="metric-value {'metric-good' if metrics['system']['cpu_percent'] < 50 else 'metric-warning' if metrics['system']['cpu_percent'] < 80 else 'metric-critical'}">
{metrics['system']['cpu_percent']:.1f}%
</div>
<div class="metric-label">Current CPU load</div>
<div class="graph">
<div class="bar" style="width: {metrics['system']['cpu_percent']}%; height: 100%;"></div>
</div>
</div>
<div class="panel">
<div class="panel-title">Memory Usage</div>
<div class="metric-value {'metric-good' if metrics['system']['memory_percent'] < 60 else 'metric-warning' if metrics['system']['memory_percent'] < 85 else 'metric-critical'}">
{metrics['system']['memory_percent']:.1f}%
</div>
<div class="metric-label">{metrics['system']['memory_used_gb']:.2f} GB / {metrics['system']['memory_total_gb']:.2f} GB</div>
<div class="graph">
<div class="bar" style="width: {metrics['system']['memory_percent']}%; height: 100%;"></div>
</div>
</div>
<div class="panel">
<div class="panel-title">Disk Usage</div>
<div class="metric-value {'metric-good' if metrics['system']['disk_percent'] < 70 else 'metric-warning' if metrics['system']['disk_percent'] < 90 else 'metric-critical'}">
{metrics['system']['disk_percent']:.1f}%
</div>
<div class="metric-label">{metrics['system']['disk_used_gb']:.2f} GB / {metrics['system']['disk_total_gb']:.2f} GB</div>
<div class="graph">
<div class="bar" style="width: {metrics['system']['disk_percent']}%; height: 100%;"></div>
</div>
</div>
<div class="panel">
<div class="panel-title">System Uptime</div>
<div class="metric-value metric-good">
{uptime_formatted}
</div>
<div class="metric-label">Metrics collector uptime</div>
</div>
</div>
<div class="panel">
<div class="panel-title">Service Health ({services_healthy}/{services_total})</div>
'''
for service, status in metrics['services'].items():
status_class = 'status-up' if status else 'status-down'
status_text = '<span style="color: #73bf69;">✓ Running</span>' if status else '<span style="color: #ff1d6c;">✗ Down</span>'
html += f'''
<div class="service-row">
<div class="service-name">
<span class="status-indicator {status_class}"></span>
<span>{service}</span>
</div>
<div>{status_text}</div>
</div>'''
html += f'''
</div>
<div class="refresh-indicator">
Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} • Next refresh in 10s
</div>
</div>
</body>
</html>'''
self.wfile.write(html.encode())
except Exception as e:
error_html = f'<h1>Error loading metrics</h1><p>{str(e)}</p>'
self.wfile.write(error_html.encode())
elif self.path == '/api/health':
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
response = json.dumps({"status": "healthy", "service": "grafana"})
self.wfile.write(response.encode())
else:
self.send_response(404)
self.end_headers()
def log_message(self, format, *args):
# Suppress default logging
pass
with socketserver.TCPServer(("", PORT), GrafanaHandler) as httpd:
print(f"Grafana server running on port {PORT}")
httpd.serve_forever()
EOF
chmod +x ~/grafana/app.py
echo "📝 Creating systemd service..."
mkdir -p ~/.config/systemd/user
cat > ~/.config/systemd/user/grafana.service << 'SYSTEMD'
[Unit]
Description=BlackRoad Grafana Dashboard
After=network.target
[Service]
Type=simple
WorkingDirectory=%h/grafana
ExecStart=/usr/bin/python3 %h/grafana/app.py
Restart=always
RestartSec=10
[Install]
WantedBy=default.target
SYSTEMD
echo "🚀 Starting Grafana service..."
systemctl --user daemon-reload
systemctl --user enable grafana.service
systemctl --user restart grafana.service
echo "⏳ Waiting for Grafana to start..."
sleep 3
echo "✅ Testing Grafana..."
curl -f http://localhost:5600/api/health || echo "⚠️ Health check failed"
echo ""
echo "✅ Grafana deployed successfully!"
systemctl --user status grafana.service --no-pager | head -10
REMOTE
echo ""
echo "✅ Wave 8A deployment complete!"
echo ""
echo "🎨 Access Grafana:"
echo " http://octavia:5600/"
echo ""
echo "📊 Features:"
echo " • Real-time system metrics"
echo " • Service health monitoring"
echo " • Auto-refresh (10s)"
echo " • Professional Grafana-style UI"

430
scripts/deploy-log-aggregation.sh Executable file
View File

@@ -0,0 +1,430 @@
#!/bin/bash
# Deploy Log Aggregation System for BlackRoad OS
# Wave 11A: Centralized logging with search
set -e
echo "📜 Deploying Log Aggregation to octavia..."
# Create log aggregation system on octavia
ssh octavia << 'REMOTE'
set -e
echo "📁 Creating log aggregation directories..."
mkdir -p ~/log-aggregator/{logs,cache}
# Create log aggregation service using Python stdlib
cat > ~/log-aggregator/app.py << 'EOF'
#!/usr/bin/env python3
import http.server
import socketserver
import json
import os
import re
import subprocess
from datetime import datetime
from collections import deque
PORT = 5800
LOGS_DIR = os.path.expanduser('~/log-aggregator/logs')
MAX_LOG_ENTRIES = 1000
class LogAggregator:
    """Pulls recent log entries for known BlackRoad services from the
    per-user systemd journal (via `journalctl --user -o json`)."""

    # journald numeric PRIORITY -> human-readable level name.
    _PRIORITY_MAP = {
        '0': 'EMERG', '1': 'ALERT', '2': 'CRIT',
        '3': 'ERROR', '4': 'WARN', '5': 'NOTICE',
        '6': 'INFO', '7': 'DEBUG'
    }
    # Severities counted as "errors" — kept consistent between
    # get_error_count() and get_service_stats().
    _ERROR_LEVELS = ('ERROR', 'CRIT', 'ALERT', 'EMERG')

    def __init__(self):
        # Ring buffer reserved for future streaming use; collect_logs()
        # reads the journal directly and does not populate it.
        self.log_buffer = deque(maxlen=MAX_LOG_ENTRIES)
        self.services = [
            'tts-api',
            'monitor-api',
            'load-balancer',
            'fleet-monitor',
            'notifications',
            'metrics',
            'analytics',
            'grafana',
            'alert-manager'
        ]

    def collect_logs(self, service=None, level=None, limit=100, search=None):
        """Collect logs from systemd journals.

        service: restrict to a single service name (default: all known services)
        level:   keep only entries of this level name, case-insensitive
        limit:   per-service journal read size and overall result cap
        search:  case-insensitive substring filter on the message text
        Returns a newest-first list of dicts with service/message/timestamp/
        priority/unit/level keys. Unreachable journals are silently skipped.
        """
        logs = []
        services_to_check = [service] if service else self.services
        for svc in services_to_check:
            try:
                # Get logs from systemd journal as JSON lines
                cmd = ['journalctl', '--user', '-u', f'{svc}.service', '-n', str(limit), '--no-pager', '-o', 'json']
                result = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
            except Exception:
                # journalctl missing or timed out — skip this service.
                # (Narrowed from a bare `except:` that also swallowed
                # SystemExit/KeyboardInterrupt.)
                continue
            if result.returncode != 0:
                continue
            for line in result.stdout.strip().split('\n'):
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except ValueError:
                    continue
                log_entry = {
                    'service': svc,
                    'message': entry.get('MESSAGE', ''),
                    'timestamp': entry.get('__REALTIME_TIMESTAMP', ''),
                    'priority': entry.get('PRIORITY', '6'),
                    'unit': entry.get('_SYSTEMD_UNIT', '')
                }
                # Convert journald priority number to a level name.
                log_entry['level'] = self._PRIORITY_MAP.get(log_entry['priority'], 'INFO')
                # Filter by level if specified
                if level and log_entry['level'] != level.upper():
                    continue
                # Filter by search term if specified
                if search and search.lower() not in log_entry['message'].lower():
                    continue
                logs.append(log_entry)
        # Sort by timestamp (newest first)
        logs.sort(key=lambda x: x.get('timestamp', ''), reverse=True)
        return logs[:limit]

    def get_error_count(self):
        """Count error-severity entries in recent logs (up to 50 per severity).

        Fix: now covers all journald error severities (ERROR/CRIT/ALERT/EMERG);
        the previous version counted only ERROR and CRIT, disagreeing with
        get_service_stats().
        """
        total = 0
        for lvl in self._ERROR_LEVELS:
            total += len(self.collect_logs(level=lvl, limit=50))
        return total

    def get_service_stats(self):
        """Per-service log statistics over the last 100 entries:
        {service: {'total': n, 'errors': m}}."""
        stats = {}
        for service in self.services:
            logs = self.collect_logs(service=service, limit=100)
            stats[service] = {
                'total': len(logs),
                'errors': len([l for l in logs if l['level'] in self._ERROR_LEVELS])
            }
        return stats
log_aggregator = LogAggregator()
class LogHandler(http.server.BaseHTTPRequestHandler):
    """HTTP handler: HTML log dashboard at '/' (with optional query filters)
    and a small JSON API under /api/*. Reads from the module-level
    `log_aggregator` instance."""

    def do_GET(self):
        # Dashboard, with optional ?service=&level=&search= filters.
        # BUG FIX: the original condition was `self.path == '/'`, so the
        # filter URLs emitted by the page's own dropdowns ('/?service=x',
        # '/?level=y') fell through to the 404 branch and the query-string
        # parsing below was dead code. Accept the query form as well.
        if self.path == '/' or self.path.startswith('/?'):
            self.send_response(200)
            self.send_header('Content-type', 'text/html')
            self.end_headers()
            # Parse query parameters (minimal parser; values are not URL-decoded)
            query_parts = self.path.split('?')
            params = {}
            if len(query_parts) > 1:
                for param in query_parts[1].split('&'):
                    if '=' in param:
                        key, value = param.split('=', 1)
                        params[key] = value
            # Empty strings ('/?service=') are falsy -> treated as "no filter".
            service = params.get('service')
            level = params.get('level')
            search = params.get('search')
            # Collect logs
            logs = log_aggregator.collect_logs(
                service=service,
                level=level,
                limit=100,
                search=search
            )
            # Get stats
            stats = log_aggregator.get_service_stats()
            total_errors = sum(s['errors'] for s in stats.values())
            html = f'''<!DOCTYPE html>
<html>
<head>
<title>BlackRoad Log Aggregator</title>
<meta http-equiv="refresh" content="30">
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{
font-family: 'Monaco', 'Courier New', monospace;
background: #0b0c0e;
color: #d8d9da;
padding: 20px;
}}
.header {{
background: #1f1f20;
padding: 20px;
border-radius: 8px;
margin-bottom: 20px;
}}
.title {{
font-size: 28px;
font-weight: 600;
color: #0096FF;
margin-bottom: 10px;
}}
.filters {{
background: #1f1f20;
padding: 16px;
border-radius: 8px;
margin-bottom: 20px;
display: flex;
gap: 12px;
flex-wrap: wrap;
}}
.filter-group {{
display: flex;
flex-direction: column;
gap: 4px;
}}
.filter-label {{
font-size: 12px;
color: #9d9fa1;
}}
select, input {{
background: #0b0c0e;
border: 1px solid #2d2e30;
color: #d8d9da;
padding: 6px 12px;
border-radius: 4px;
font-family: inherit;
}}
.stats {{
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 12px;
margin-bottom: 20px;
}}
.stat-card {{
background: #1f1f20;
padding: 12px;
border-radius: 8px;
border-left: 3px solid #0096FF;
}}
.stat-card.errors {{ border-color: #ff1d6c; }}
.stat-service {{
font-size: 12px;
color: #9d9fa1;
margin-bottom: 4px;
}}
.stat-count {{
font-size: 20px;
font-weight: 300;
}}
.logs-container {{
background: #1f1f20;
border-radius: 8px;
padding: 16px;
}}
.log-entry {{
font-family: 'Monaco', 'Courier New', monospace;
font-size: 13px;
padding: 8px 12px;
border-bottom: 1px solid #2d2e30;
display: flex;
gap: 12px;
}}
.log-entry:hover {{
background: #252527;
}}
.log-timestamp {{
color: #9d9fa1;
white-space: nowrap;
}}
.log-level {{
font-weight: 600;
width: 60px;
flex-shrink: 0;
}}
.log-level.INFO {{ color: #0096FF; }}
.log-level.WARN {{ color: #f5a623; }}
.log-level.ERROR {{ color: #ff1d6c; }}
.log-level.CRIT {{ color: #ff1d6c; font-weight: 700; }}
.log-service {{
color: #73bf69;
width: 120px;
flex-shrink: 0;
}}
.log-message {{
flex: 1;
word-break: break-word;
}}
.no-logs {{
text-align: center;
padding: 40px;
color: #9d9fa1;
}}
</style>
</head>
<body>
<div class="header">
<div class="title">📜 Log Aggregator</div>
<div style="color: #9d9fa1; font-size: 14px;">Centralized logging • Auto-refresh: 30s</div>
</div>
<div class="filters">
<div class="filter-group">
<label class="filter-label">Service</label>
<select onchange="window.location.href='/?service='+this.value">
<option value="">All Services</option>
<option value="tts-api" {'selected' if service == 'tts-api' else ''}>TTS API</option>
<option value="monitor-api" {'selected' if service == 'monitor-api' else ''}>Monitor API</option>
<option value="load-balancer" {'selected' if service == 'load-balancer' else ''}>Load Balancer</option>
<option value="fleet-monitor" {'selected' if service == 'fleet-monitor' else ''}>Fleet Monitor</option>
<option value="grafana" {'selected' if service == 'grafana' else ''}>Grafana</option>
<option value="alert-manager" {'selected' if service == 'alert-manager' else ''}>Alert Manager</option>
</select>
</div>
<div class="filter-group">
<label class="filter-label">Level</label>
<select onchange="window.location.href='/?level='+this.value">
<option value="">All Levels</option>
<option value="ERROR" {'selected' if level == 'ERROR' else ''}>ERROR</option>
<option value="WARN" {'selected' if level == 'WARN' else ''}>WARN</option>
<option value="INFO" {'selected' if level == 'INFO' else ''}>INFO</option>
</select>
</div>
</div>
<div class="stats">
<div class="stat-card errors">
<div class="stat-service">Total Errors</div>
<div class="stat-count">{total_errors}</div>
</div>
'''
            # Per-service counter cards. Loop variable renamed so it no
            # longer shadows the `service` filter variable above.
            for svc_name, stat in stats.items():
                html += f'''
<div class="stat-card">
<div class="stat-service">{svc_name}</div>
<div class="stat-count">{stat['total']} logs</div>
</div>'''
            html += '''
</div>
<div class="logs-container">
'''
            if logs:
                for log in logs:
                    # Format timestamp
                    try:
                        ts = int(log['timestamp']) / 1000000  # Convert microseconds to seconds
                        dt = datetime.fromtimestamp(ts)
                        timestamp = dt.strftime('%H:%M:%S')
                    except (ValueError, TypeError, OSError, OverflowError):
                        # Missing/garbled __REALTIME_TIMESTAMP (narrowed from
                        # a bare `except:`).
                        timestamp = 'N/A'
                    html += f'''
<div class="log-entry">
<span class="log-timestamp">{timestamp}</span>
<span class="log-level {log['level']}">{log['level']}</span>
<span class="log-service">{log['service']}</span>
<span class="log-message">{log['message']}</span>
</div>'''
            else:
                html += '<div class="no-logs">No logs found</div>'
            html += '''
</div>
</body>
</html>'''
            self.wfile.write(html.encode())
        elif self.path.startswith('/api/logs'):
            # JSON: newest 100 entries across all services.
            self.send_response(200)
            self.send_header('Content-type', 'application/json')
            self.end_headers()
            logs = log_aggregator.collect_logs(limit=100)
            response = json.dumps({'logs': logs, 'count': len(logs)})
            self.wfile.write(response.encode())
        elif self.path == '/api/stats':
            # JSON: per-service totals/error counts.
            self.send_response(200)
            self.send_header('Content-type', 'application/json')
            self.end_headers()
            stats = log_aggregator.get_service_stats()
            response = json.dumps(stats)
            self.wfile.write(response.encode())
        elif self.path == '/api/health':
            # Liveness probe used by the deploy script's curl check.
            self.send_response(200)
            self.send_header('Content-type', 'application/json')
            self.end_headers()
            response = json.dumps({'status': 'healthy', 'service': 'log-aggregator'})
            self.wfile.write(response.encode())
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, format, *args):
        # Silence the default per-request stderr logging.
        pass
with socketserver.TCPServer(("", PORT), LogHandler) as httpd:
print(f"Log Aggregator running on port {PORT}")
httpd.serve_forever()
EOF
chmod +x ~/log-aggregator/app.py
echo "📝 Creating systemd service..."
mkdir -p ~/.config/systemd/user
cat > ~/.config/systemd/user/log-aggregator.service << 'SYSTEMD'
[Unit]
Description=BlackRoad Log Aggregator
After=network.target
[Service]
Type=simple
WorkingDirectory=%h/log-aggregator
ExecStart=/usr/bin/python3 %h/log-aggregator/app.py
Restart=always
RestartSec=10
[Install]
WantedBy=default.target
SYSTEMD
echo "🚀 Starting Log Aggregator service..."
systemctl --user daemon-reload
systemctl --user enable log-aggregator.service
systemctl --user restart log-aggregator.service
echo "⏳ Waiting for Log Aggregator to start..."
sleep 3
echo "✅ Testing Log Aggregator..."
curl -f http://localhost:5800/api/health || echo "⚠️ Health check failed"
echo ""
echo "✅ Log Aggregator deployed successfully!"
systemctl --user status log-aggregator.service --no-pager | head -10
REMOTE
echo ""
echo "✅ Wave 11A deployment complete!"
echo ""
echo "📜 Access Log Aggregator:"
echo " http://octavia:5800/"
echo ""
echo "📊 Features:"
echo " • Centralized logging from all services"
echo " • Real-time log streaming"
echo " • Filter by service and level"
echo " • Search capability"
echo " • Error tracking"

423
scripts/deploy-pipeline.sh Normal file
View File

@@ -0,0 +1,423 @@
#!/bin/bash
# BlackRoad Deployment Pipeline
# Automated deployment system for the cluster
# Agent: Icarus (b3e01bd9)
PINK='\033[38;5;205m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
CYAN='\033[0;36m'
RESET='\033[0m'
DEPLOY_DIR="$HOME/.blackroad/deployments"
ALL_NODES=("lucidia" "cecilia" "octavia" "aria" "alice")
# Deployment strategies
STRATEGIES=("rolling" "blue-green" "canary" "all-at-once")
# Initialize
# One-time setup: create the deployment working tree under $DEPLOY_DIR.
init() {
    local sub
    for sub in releases rollbacks logs; do
        mkdir -p "$DEPLOY_DIR/$sub"
    done
    echo -e "${GREEN}Deployment pipeline initialized${RESET}"
}
# Pre-deployment checks
# Pre-deployment checks: SSH reachability, free disk space, and load average.
# Args: node names (defaults to ALL_NODES).
# Returns 0 only when every node passes.
preflight() {
    local nodes=("$@")
    [ ${#nodes[@]} -eq 0 ] && nodes=("${ALL_NODES[@]}")
    echo -e "${PINK}=== PREFLIGHT CHECKS ===${RESET}"
    echo
    local passed=0
    local failed=0
    local node disk load
    for node in "${nodes[@]}"; do
        echo -n " $node: "
        # Everything below needs SSH, so bail out early if unreachable.
        if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
            echo -e "${RED}UNREACHABLE${RESET}"
            ((failed++))
            continue
        fi
        # Free-disk percentage of /. Default to 0 when the probe prints
        # nothing, so the numeric comparison cannot abort on an empty string.
        disk=$(ssh "$node" "df / | awk 'NR==2 {print 100-\$5}'" 2>/dev/null)
        disk=${disk:-0}
        if [ "$disk" -lt 10 ]; then
            echo -e "${RED}LOW DISK (${disk}%)${RESET}"
            ((failed++))
            continue
        fi
        # 1-minute load average; warn (but do not fail) above 10.
        load=$(ssh "$node" "cat /proc/loadavg | awk '{print \$1}'" 2>/dev/null)
        load=${load:-0}
        if [ "$(echo "$load > 10" | bc -l 2>/dev/null)" = "1" ]; then
            echo -e "${YELLOW}HIGH LOAD ($load)${RESET}"
        fi
        echo -e "${GREEN}READY${RESET} (disk: ${disk}% free, load: $load)"
        ((passed++))
    done
    echo
    echo "Result: $passed passed, $failed failed"
    [ "$failed" -eq 0 ]
}
# Deploy to single node
# Copy an artifact to one node, keeping a timestamped backup of the target.
# $1 = node, $2 = artifact (file or directory), $3 = remote target directory.
# Returns 0 on success, 1 on unreachable node or transfer failure.
deploy_node() {
    local node="$1"
    local artifact="$2"
    local target="$3"
    echo -n " $node: "
    if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
        echo -e "${RED}unreachable${RESET}"
        return 1
    fi
    # Backup for rollback(). NOTE: $(date +%s) expands locally, before ssh runs.
    ssh "$node" "[ -d '$target' ] && cp -r '$target' '$target.bak.$(date +%s)'" 2>/dev/null
    # Capture the scp status explicitly instead of relying on a trailing $? test.
    local rc
    if [ -d "$artifact" ]; then
        scp -r "$artifact"/* "$node:$target/" >/dev/null 2>&1; rc=$?
    else
        scp "$artifact" "$node:$target/" >/dev/null 2>&1; rc=$?
    fi
    if [ "$rc" -eq 0 ]; then
        echo -e "${GREEN}deployed${RESET}"
        return 0
    else
        echo -e "${RED}failed${RESET}"
        return 1
    fi
}
# Rolling deployment
# Rolling deployment: push the artifact to one node at a time, halting on the
# first failure so a bad build cannot spread across the cluster.
# $1 = artifact path, $2 = remote target dir, $3.. = nodes (default ALL_NODES).
# Appends a JSON record to $DEPLOY_DIR/history.jsonl and a per-run log file.
deploy_rolling() {
local artifact="$1"
local target="$2"
local nodes=("${@:3}")
[ ${#nodes[@]} -eq 0 ] && nodes=("${ALL_NODES[@]}")
echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
echo -e "${PINK}║ 🚀 ROLLING DEPLOYMENT ║${RESET}"
echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
echo
echo "Artifact: $artifact"
echo "Target: $target"
echo "Nodes: ${nodes[*]}"
echo
# Deployment id doubles as the log-file suffix.
local deploy_id=$(date +%Y%m%d_%H%M%S)
local log_file="$DEPLOY_DIR/logs/deploy_$deploy_id.log"
local success=0
local failed=0
for node in "${nodes[@]}"; do
echo "$(date -Iseconds) Deploying to $node..." >> "$log_file"
if deploy_node "$node" "$artifact" "$target"; then
((success++))
echo "$(date -Iseconds) $node: SUCCESS" >> "$log_file"
else
((failed++))
echo "$(date -Iseconds) $node: FAILED" >> "$log_file"
# Abort on first failure in rolling deployment
echo -e "${RED}Deployment halted due to failure${RESET}"
break
fi
# Wait between nodes
sleep 2
done
echo
echo "Result: $success success, $failed failed"
echo "Log: $log_file"
# Record deployment
echo "{\"id\":\"$deploy_id\",\"artifact\":\"$artifact\",\"target\":\"$target\",\"strategy\":\"rolling\",\"success\":$success,\"failed\":$failed,\"timestamp\":\"$(date -Iseconds)\"}" >> "$DEPLOY_DIR/history.jsonl"
}
# Blue-green deployment
# Blue-green deployment: deploy to the "green" half of the fleet first, then,
# after interactive confirmation, update the "blue" half.
# $1 = artifact path, $2 = remote target dir.
# NOTE(review): the blue/green split is a fixed slice of ALL_NODES (first two
# vs the rest), and the "traffic switch" is only an echo placeholder — confirm
# against the real load balancer before relying on this in production.
deploy_blue_green() {
local artifact="$1"
local target="$2"
echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
echo -e "${PINK}║ 🔵🟢 BLUE-GREEN DEPLOYMENT ║${RESET}"
echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
echo
# Split nodes into blue and green
local blue_nodes=("${ALL_NODES[@]:0:2}")
local green_nodes=("${ALL_NODES[@]:2}")
echo "Blue nodes (current): ${blue_nodes[*]}"
echo "Green nodes (new): ${green_nodes[*]}"
echo
# Deploy to green
echo -e "${GREEN}Deploying to green nodes...${RESET}"
for node in "${green_nodes[@]}"; do
deploy_node "$node" "$artifact" "$target"
done
echo
echo -n "Verify green deployment and switch traffic? [y/N] "
read -r confirm
if [[ "$confirm" =~ ^[Yy] ]]; then
echo -e "${BLUE}Switching traffic to green...${RESET}"
# In production, this would update load balancer
echo -e "${GREEN}Traffic switched${RESET}"
# Now update blue
echo -e "${BLUE}Updating blue nodes...${RESET}"
for node in "${blue_nodes[@]}"; do
deploy_node "$node" "$artifact" "$target"
done
else
echo "Deployment cancelled"
fi
}
# Canary deployment
# Canary deployment: push to a single canary node, then interactively confirm
# before rolling out to the rest of the fleet (or roll the canary back).
# $1 = artifact path, $2 = remote target dir, $3 = canary percentage (default 20).
# NOTE(review): canary_percent is only displayed — the canary is always exactly
# one node (ALL_NODES[0]) regardless of the value; confirm intended behavior.
deploy_canary() {
local artifact="$1"
local target="$2"
local canary_percent="${3:-20}"
echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
echo -e "${PINK}║ 🐤 CANARY DEPLOYMENT ║${RESET}"
echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
echo
echo "Canary percentage: ${canary_percent}%"
echo
# Select canary node (first node)
local canary_node="${ALL_NODES[0]}"
local remaining_nodes=("${ALL_NODES[@]:1}")
echo "Canary node: $canary_node"
echo "Remaining: ${remaining_nodes[*]}"
echo
# Deploy to canary
echo -e "${YELLOW}Deploying to canary...${RESET}"
deploy_node "$canary_node" "$artifact" "$target"
echo
echo "Monitor the canary deployment."
echo -n "Proceed with full rollout? [y/N] "
read -r confirm
if [[ "$confirm" =~ ^[Yy] ]]; then
echo -e "${GREEN}Rolling out to remaining nodes...${RESET}"
for node in "${remaining_nodes[@]}"; do
deploy_node "$node" "$artifact" "$target"
sleep 1
done
echo -e "${GREEN}Full rollout complete${RESET}"
else
echo -e "${YELLOW}Rolling back canary...${RESET}"
rollback "$canary_node"
fi
}
# Rollback
# Restore the most recent '*.bak.<epoch>' backup on one node or all nodes.
# $1 = node name, or "all"/omitted for the whole fleet.
# NOTE(review): finds backups by parsing remote `ls -1t /opt/*.bak.*`, which
# assumes backups live under /opt and that filenames contain no whitespace —
# fragile; confirm against deploy_node's backup convention before trusting.
rollback() {
local node="${1:-all}"
echo -e "${PINK}=== ROLLBACK ===${RESET}"
echo
local targets=("${ALL_NODES[@]}")
[ "$node" != "all" ] && targets=("$node")
for n in "${targets[@]}"; do
echo -n " $n: "
if ! ssh -o ConnectTimeout=3 "$n" "echo ok" >/dev/null 2>&1; then
echo -e "${YELLOW}offline${RESET}"
continue
fi
# Find latest backup
local backup=$(ssh "$n" "ls -1t /opt/*.bak.* 2>/dev/null | head -1")
if [ -n "$backup" ]; then
# Strip the '.bak.<epoch>' suffix to recover the original path,
# then replace the current copy with the backup (destructive).
local original="${backup%.bak.*}"
ssh "$n" "rm -rf '$original' && mv '$backup' '$original'"
echo -e "${GREEN}restored${RESET}"
else
echo -e "${YELLOW}no backup found${RESET}"
fi
done
}
# Run post-deploy hooks
# Execute the per-stage hook (/opt/blackroad/hooks/<stage>.sh) on each node
# where that file exists. $1 = stage name, $2.. = nodes (default ALL_NODES).
run_hooks() {
    local stage="$1"
    shift
    local nodes=("$@")
    [ ${#nodes[@]} -eq 0 ] && nodes=("${ALL_NODES[@]}")
    echo -e "${BLUE}Running $stage hooks...${RESET}"
    local node hook_file
    hook_file="/opt/blackroad/hooks/$stage.sh"
    for node in "${nodes[@]}"; do
        if ssh "$node" "[ -f '$hook_file' ]" 2>/dev/null; then
            echo " $node: executing hook"
            ssh "$node" "bash '$hook_file'" 2>/dev/null
        fi
    done
}
# Health check after deployment
# Post-deploy verification: SSH reachability plus docker-container count and
# Ollama API responsiveness on each node. Args: nodes (default ALL_NODES).
healthcheck() {
    local nodes=("$@")
    [ ${#nodes[@]} -eq 0 ] && nodes=("${ALL_NODES[@]}")
    echo -e "${PINK}=== POST-DEPLOY HEALTH CHECK ===${RESET}"
    echo
    local healthy=0
    local unhealthy=0
    local node docker_ok ollama_ok
    for node in "${nodes[@]}"; do
        echo -n " $node: "
        if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
            echo -e "${RED}UNREACHABLE${RESET}"
            ((unhealthy++))
            continue
        fi
        # Default both probes to 0 when the remote command prints nothing,
        # so the numeric [ -gt ] test below cannot error on an empty string.
        docker_ok=$(ssh "$node" "docker ps -q | wc -l" 2>/dev/null)
        docker_ok=${docker_ok:-0}
        ollama_ok=$(ssh "$node" "curl -s http://localhost:11434/api/tags >/dev/null && echo 1 || echo 0" 2>/dev/null)
        ollama_ok=${ollama_ok:-0}
        if [ "$docker_ok" -gt 0 ] && [ "$ollama_ok" = "1" ]; then
            echo -e "${GREEN}HEALTHY${RESET} (docker: $docker_ok, ollama: up)"
            ((healthy++))
        else
            echo -e "${YELLOW}DEGRADED${RESET} (docker: $docker_ok, ollama: $ollama_ok)"
            ((unhealthy++))
        fi
    done
    echo
    echo "Result: $healthy healthy, $unhealthy unhealthy"
}
# Deployment history
# Print the last N deployment records (default 10) from history.jsonl.
# Uses a single jq invocation per run instead of five per record.
history() {
    local lines="${1:-10}"
    echo -e "${PINK}=== DEPLOYMENT HISTORY ===${RESET}"
    echo
    if [ -f "$DEPLOY_DIR/history.jsonl" ]; then
        # jq string interpolation renders each JSON record in one pass,
        # producing the same " id: artifact (strategy) - S✓ F✗" line.
        tail -n "$lines" "$DEPLOY_DIR/history.jsonl" \
            | jq -r '" \(.id): \(.artifact) (\(.strategy)) - \(.success)✓ \(.failed)✗"'
    else
        echo "No deployment history"
    fi
}
# Status
# Summarize deployment activity (total count and last timestamp from
# history.jsonl) and then run a full post-deploy health check.
status() {
echo -e "${PINK}=== DEPLOYMENT STATUS ===${RESET}"
echo
# Both probes tolerate a missing history file: count falls back to 0 and
# the timestamp to "never" via the jq // alternative operator.
local total=$(wc -l < "$DEPLOY_DIR/history.jsonl" 2>/dev/null || echo 0)
local last=$(tail -1 "$DEPLOY_DIR/history.jsonl" 2>/dev/null | jq -r '.timestamp // "never"')
echo "Total deployments: $total"
echo "Last deployment: $last"
echo
healthcheck
}
# Help
# Print CLI usage for the deployment pipeline.
help() {
    echo -e "${PINK}BlackRoad Deployment Pipeline${RESET}"
    echo
    cat <<EOF
Automated deployment system for the cluster

Commands:
 preflight [nodes] Pre-deployment checks
 rolling <artifact> <target> Rolling deployment
 blue-green <art> <target> Blue-green deployment
 canary <art> <target> [%] Canary deployment
 rollback [node|all] Rollback deployment
 healthcheck [nodes] Post-deploy health check
 history [lines] Deployment history
 status Current status

Strategies: ${STRATEGIES[*]}

Examples:
 $0 preflight
 $0 rolling ./dist /opt/app
 $0 canary ./dist /opt/app 10
 $0 rollback
EOF
}
# Ensure initialized
# (first run creates $DEPLOY_DIR silently; init output is suppressed)
[ -d "$DEPLOY_DIR" ] || init >/dev/null
# CLI dispatch: the first argument selects the command; defaults to help.
case "${1:-help}" in
init)
init
;;
preflight|check)
shift
preflight "$@"
;;
rolling)
# $2 = artifact, $3 = target, remaining args = node list
deploy_rolling "$2" "$3" "${@:4}"
;;
blue-green|bluegreen)
deploy_blue_green "$2" "$3"
;;
canary)
deploy_canary "$2" "$3" "$4"
;;
rollback|revert)
rollback "$2"
;;
healthcheck|health)
shift
healthcheck "$@"
;;
hooks)
# $2 = stage name, remaining args = node list
run_hooks "$2" "${@:3}"
;;
history)
history "$2"
;;
status)
status
;;
*)
help
;;
esac

466
scripts/fleet-enhancer.sh Normal file
View File

@@ -0,0 +1,466 @@
#!/bin/bash
# ============================================================================
# BLACKROAD OS, INC. - PROPRIETARY AND CONFIDENTIAL
# Copyright (c) 2024-2026 BlackRoad OS, Inc. All Rights Reserved.
#
# This code is the intellectual property of BlackRoad OS, Inc.
# AI-assisted development does not transfer ownership to AI providers.
# Unauthorized use, copying, or distribution is prohibited.
# NOT licensed for AI training or data extraction.
# ============================================================================
# BlackRoad Fleet OS Enhancer
# Deploys CECE OS and enhancements across all Pi devices
# Usage: ./blackroad-fleet-os-enhancer.sh [command] [target]
set -e
# BlackRoad Colors
PINK='\033[38;5;205m'
AMBER='\033[38;5;214m'
BLUE='\033[38;5;69m'
VIOLET='\033[38;5;135m'
GREEN='\033[38;5;82m'
RED='\033[38;5;196m'
RESET='\033[0m'
# Device Fleet Configuration (space-separated: name:local_ip:ts_ip:role)
DEVICES="
cecilia:192.168.4.89:100.72.180.98:primary_ai
lucidia:192.168.4.81:100.83.149.86:inference
octavia:192.168.4.38:100.66.235.47:multiarm
alice:192.168.4.49:100.77.210.18:worker
aria:192.168.4.82:100.109.14.17:harmony
"
CECE_OS_DIR="$HOME/cece-os"
FLEET_LOG="$HOME/.blackroad/fleet-os-enhancer.log"
mkdir -p "$(dirname "$FLEET_LOG")"
# ── Output helpers ──────────────────────────────────────────────────────────
# Draw the BlackRoad banner box.
banner() {
    printf '%b\n' "${PINK}╔════════════════════════════════════════════════════════════╗${RESET}"
    printf '%b\n' "${PINK}║${RESET} ${AMBER}🖤🛣️ BLACKROAD FLEET OS ENHANCER 🖤🛣️${RESET} ${PINK}║${RESET}"
    printf '%b\n' "${PINK}╚════════════════════════════════════════════════════════════╝${RESET}"
    printf '\n'
}
# Append a timestamped line to FLEET_LOG and echo it to the console as [INFO].
log() {
    local ts
    ts=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$ts" "$1" >> "$FLEET_LOG"
    printf '%b\n' "${BLUE}[INFO]${RESET} $1"
}
# Green checkmark line.
success() {
    printf '%b\n' "${GREEN}✅ $1${RESET}"
}
# Red cross line.
error() {
    printf '%b\n' "${RED}❌ $1${RESET}"
}
# Look up a device record ("name:local_ip:ts_ip:role") in DEVICES by name.
# Prints the first matching line (empty when unknown); always returns 0.
get_device_info() {
    grep -m 1 -- "^${1}:" <<< "$DEVICES" || true
}
# Resolve a device name to a reachable IP, preferring the LAN address and
# falling back to the Tailscale address. Prints the chosen IP on success;
# returns 1 when neither address answers a single 2s-timeout ping.
check_device() {
    local name="$1"
    local info local_ip ts_ip
    # '|| true' keeps the original local-masked semantics under set -e.
    info=$(get_device_info "$name") || true
    local_ip=$(echo "$info" | cut -d: -f2)
    ts_ip=$(echo "$info" | cut -d: -f3)
    # LAN first (fastest path). The original's '&>/dev/null 2>&1' was
    # redundant: '&>' already redirects both stdout and stderr.
    if ping -c 1 -W 2 "$local_ip" &>/dev/null; then
        echo "$local_ip"
        return 0
    fi
    # Fall back to Tailscale.
    if ping -c 1 -W 2 "$ts_ip" &>/dev/null; then
        echo "$ts_ip"
        return 0
    fi
    return 1
}
# Print a table of every device with its reachable IP, role, and ONLINE/OFFLINE
# state. Reads the DEVICES records; probes each with check_device.
fleet_status() {
    echo -e "${VIOLET}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
    echo -e "${AMBER}📡 FLEET STATUS${RESET}"
    echo -e "${VIOLET}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
    printf "%-12s %-16s %-20s %-10s\n" "DEVICE" "IP" "ROLE" "STATUS"
    echo "─────────────────────────────────────────────────────────────"
    # Iterate with a here-string so the loop runs in the current shell.
    # The previous online/offline counters were dead code (never incremented,
    # and would have been lost to the pipe subshell anyway) — removed.
    local name local_ip ts_ip role reachable_ip
    while IFS=: read -r name local_ip ts_ip role; do
        [[ -z "$name" ]] && continue
        if reachable_ip=$(check_device "$name" 2>/dev/null); then
            printf "%-12s %-16s %-20s ${GREEN}%-10s${RESET}\n" "$name" "$reachable_ip" "$role" "ONLINE"
        else
            printf "%-12s %-16s %-20s ${RED}%-10s${RESET}\n" "$name" "$local_ip" "$role" "OFFLINE"
        fi
    done <<< "$DEVICES"
    echo ""
}
# Emit (to stdout) a self-contained CECE OS installer script for a Pi.
# Everything between the quoted 'INSTALLER' heredoc markers is the literal
# payload written to the target device, so it is reproduced verbatim: it
# installs a heartbeat daemon, a network-status tool, and the `cece` CLI,
# adds ~/.local/bin to PATH, and starts the heartbeat in the background.
generate_cece_installer() {
cat << 'INSTALLER'
#!/bin/bash
# CECE OS Installer for Raspberry Pi
# Auto-generated by BlackRoad Fleet OS Enhancer
set -e
CECE_HOME="$HOME/.cece-os"
CECE_BIN="$HOME/.local/bin"
echo "🖤 Installing CECE OS..."
# Create directories
mkdir -p "$CECE_HOME"/{apps,heart,mind,soul,memories,dreams,net}
mkdir -p "$CECE_BIN"
# Install heartbeat daemon
cat > "$CECE_HOME/heart/heartbeat.sh" << 'HB'
#!/bin/bash
HEARTBEAT_FILE="$HOME/.cece-os/heart/pulse.json"
while true; do
cat > "$HEARTBEAT_FILE" << EOF
{
"timestamp": "$(date -Iseconds)",
"hostname": "$(hostname)",
"uptime": "$(uptime -p 2>/dev/null || uptime)",
"load": "$(cat /proc/loadavg 2>/dev/null | cut -d' ' -f1-3)",
"memory_free": "$(free -h 2>/dev/null | awk '/^Mem:/ {print $4}' || echo 'N/A')",
"disk_free": "$(df -h / 2>/dev/null | awk 'NR==2 {print $4}')",
"temperature": "$(vcgencmd measure_temp 2>/dev/null | cut -d= -f2 || echo 'N/A')",
"cece_version": "0.2.0",
"alive": true
}
EOF
sleep 30
done
HB
chmod +x "$CECE_HOME/heart/heartbeat.sh"
# Install network status tool
cat > "$CECE_HOME/net/status.sh" << 'NET'
#!/bin/bash
echo "{"
echo " \"interfaces\": \"$(ip -o addr show | awk '{print $2, $4}' | tr '\n' ';')\","
echo " \"gateway\": \"$(ip route | grep default | awk '{print $3}')\","
echo " \"dns\": \"$(cat /etc/resolv.conf | grep nameserver | head -1 | awk '{print $2}')\","
echo " \"tailscale\": \"$(tailscale status --json 2>/dev/null | jq -r '.Self.TailscaleIPs[0] // "not connected"' 2>/dev/null || echo 'not installed')\""
echo "}"
NET
chmod +x "$CECE_HOME/net/status.sh"
# Install main CLI
cat > "$CECE_BIN/cece" << 'CLI'
#!/bin/bash
CECE_HOME="$HOME/.cece-os"
VERSION="0.2.0"
case "$1" in
help|--help|-h)
echo "CECE OS v$VERSION - Sovereign AI Operating System"
echo ""
echo "Core Commands:"
echo " cece pulse - Show heartbeat status"
echo " cece memory - Access memories"
echo " cece dream - Record a dream"
echo " cece apps - List installed apps"
echo ""
echo "System Commands:"
echo " cece net - Network status"
echo " cece sysinfo - System information"
echo " cece logs - View CECE logs"
echo " cece update - Update CECE OS"
echo ""
echo "AI Commands:"
echo " cece think - AI thinking mode"
echo " cece ask - Ask AI a question"
;;
pulse)
if [[ -f "$CECE_HOME/heart/pulse.json" ]]; then
cat "$CECE_HOME/heart/pulse.json" | python3 -m json.tool 2>/dev/null || cat "$CECE_HOME/heart/pulse.json"
else
echo '{"alive": false, "error": "No heartbeat"}'
fi
;;
memory|memories)
echo "📚 CECE Memories:"
ls -la "$CECE_HOME/memories/" 2>/dev/null || echo "No memories yet"
;;
dream)
if [[ -n "$2" ]]; then
echo "{\"timestamp\": \"$(date -Iseconds)\", \"dream\": \"$2\"}" >> "$CECE_HOME/dreams/journal.jsonl"
echo "💫 Dream recorded"
else
echo "Usage: cece dream \"your dream\""
fi
;;
apps)
echo "📱 CECE Apps:"
if [[ -d "$CECE_HOME/apps" ]]; then
count=$(ls "$CECE_HOME/apps/" 2>/dev/null | wc -l)
echo " Installed: $count apps"
ls "$CECE_HOME/apps/" 2>/dev/null | head -20
else
echo " No apps installed"
fi
;;
net|network)
bash "$CECE_HOME/net/status.sh" 2>/dev/null || echo "Network check failed"
;;
sysinfo)
echo "🖥️ System Info:"
echo " Hostname: $(hostname)"
echo " Kernel: $(uname -r)"
echo " Architecture: $(uname -m)"
echo " Memory: $(free -h | awk '/^Mem:/ {print $2 " total, " $4 " free"}')"
echo " Disk: $(df -h / | awk 'NR==2 {print $2 " total, " $4 " free"}')"
vcgencmd measure_temp 2>/dev/null && vcgencmd measure_clock arm 2>/dev/null || true
;;
logs)
tail -50 "$CECE_HOME/logs/cece.log" 2>/dev/null || echo "No logs yet"
;;
start)
echo "🚀 Starting CECE services..."
nohup "$CECE_HOME/heart/heartbeat.sh" > "$CECE_HOME/logs/heartbeat.log" 2>&1 &
echo " Heartbeat: PID $!"
;;
stop)
echo "🛑 Stopping CECE services..."
pkill -f "heartbeat.sh" 2>/dev/null && echo " Heartbeat stopped" || echo " Not running"
;;
version|-v|--version)
echo "CECE OS v$VERSION"
echo "Built by BlackRoad OS, Inc."
;;
*)
echo "CECE OS v$VERSION - Run 'cece help' for commands"
echo ""
if [[ -f "$CECE_HOME/heart/pulse.json" ]]; then
echo "Status: $(cat "$CECE_HOME/heart/pulse.json" | grep -o '"alive": [^,]*' | cut -d: -f2)"
else
echo "Status: Not running (run 'cece start')"
fi
;;
esac
CLI
chmod +x "$CECE_BIN/cece"
# Create logs directory
mkdir -p "$CECE_HOME/logs"
# Add to PATH in bashrc
if ! grep -q 'CECE_PATH' "$HOME/.bashrc" 2>/dev/null; then
echo 'export PATH="$HOME/.local/bin:$PATH" # CECE_PATH' >> "$HOME/.bashrc"
fi
# Start heartbeat service
pkill -f "heartbeat.sh" 2>/dev/null || true
nohup "$CECE_HOME/heart/heartbeat.sh" > "$CECE_HOME/logs/heartbeat.log" 2>&1 &
echo ""
echo "✅ CECE OS v0.2.0 installed!"
echo " Run 'cece help' for commands"
echo " Heartbeat running (PID: $!)"
INSTALLER
}
# Push the generated CECE OS installer to one device (as user 'pi') and run
# it there, then verify the heartbeat file appears. Returns 1 on any failure.
deploy_to_device() {
local name="$1"
# local masks check_device's exit status intentionally (script runs set -e);
# reachability is decided by the emptiness check below.
local ip=$(check_device "$name" 2>/dev/null)
if [[ -z "$ip" ]]; then
error "Cannot reach $name"
return 1
fi
log "Deploying CECE OS to $name ($ip)..."
# Generate installer
local installer="/tmp/cece-installer-$name.sh"
generate_cece_installer > "$installer"
chmod +x "$installer"
# Copy and execute
if ! scp -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$installer" "pi@$ip:/tmp/cece-installer.sh" 2>/dev/null; then
error "Failed to copy installer to $name"
rm -f "$installer"
return 1
fi
if ! ssh -o ConnectTimeout=10 "pi@$ip" "bash /tmp/cece-installer.sh" 2>/dev/null; then
error "Failed to run installer on $name"
rm -f "$installer"
return 1
fi
success "CECE OS deployed to $name"
# Verify heartbeat
# (heartbeat daemon writes pulse.json every 30s; 2s is enough for the
# first write, which happens immediately on start)
sleep 2
if ssh -o ConnectTimeout=5 "pi@$ip" "cat ~/.cece-os/heart/pulse.json 2>/dev/null" | grep -q "alive"; then
success "Heartbeat verified on $name"
fi
rm -f "$installer"
}
# Deploy CECE OS to every device in the fleet, tallying results.
# The original piped DEVICES into the while loop, so per-device results were
# counted in a subshell and the /tmp/deploy-count scratch file was deleted
# without ever being read; a here-string keeps the counters in this shell.
deploy_all() {
    log "Starting fleet-wide CECE OS deployment..."
    local success_count=0
    local fail_count=0
    local name local_ip ts_ip role
    while IFS=: read -r name local_ip ts_ip role; do
        [[ -z "$name" ]] && continue
        echo ""
        echo -e "${VIOLET}━━━ Deploying to $name ━━━${RESET}"
        if deploy_to_device "$name"; then
            # Plain arithmetic expansion: ((var++)) would return 1 on the
            # first increment and abort the script under set -e.
            success_count=$((success_count + 1))
        else
            fail_count=$((fail_count + 1))
        fi
    done <<< "$DEVICES"
    echo ""
    echo -e "${AMBER}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
    success "Fleet deployment complete ($success_count ok, $fail_count failed)"
}
# Bring one device up to baseline: apt upgrade, common CLI tools, and I2C/SPI
# enabled via raspi-config. Every remote step is best-effort ('|| true') so a
# partially-configured device does not abort the run under set -e.
enhance_device() {
local name="$1"
local ip=$(check_device "$name" 2>/dev/null)
if [[ -z "$ip" ]]; then
error "Cannot reach $name"
return 1
fi
log "Enhancing $name..."
# Update system
ssh "pi@$ip" "sudo apt-get update -qq && sudo apt-get upgrade -y -qq" 2>/dev/null || true
# Install common tools
ssh "pi@$ip" "sudo apt-get install -y -qq jq htop tmux git curl python3-pip" 2>/dev/null || true
# Enable I2C and SPI
# (raspi-config nonint: '0' means enable; absent on non-Pi hosts, hence || true)
ssh "pi@$ip" "sudo raspi-config nonint do_i2c 0 2>/dev/null; sudo raspi-config nonint do_spi 0 2>/dev/null" 2>/dev/null || true
success "Enhanced $name"
}
# Print a metrics table (temp/load/mem/disk) for every device, read from each
# device's CECE heartbeat file (~/.cece-os/heart/pulse.json).
# NOTE(review): the JSON fields are extracted with grep/cut rather than jq —
# works only while pulse.json keeps its exact key formatting; confirm whether
# jq is guaranteed on the controller before tightening this.
collect_metrics() {
echo -e "${VIOLET}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
echo -e "${AMBER}📊 FLEET METRICS${RESET}"
echo -e "${VIOLET}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
printf "%-12s %-10s %-10s %-12s %-10s\n" "DEVICE" "TEMP" "LOAD" "MEM FREE" "DISK FREE"
echo "─────────────────────────────────────────────────────────────"
# Pipe into while runs in a subshell — fine here, nothing is accumulated.
echo "$DEVICES" | while IFS=: read -r name local_ip ts_ip role; do
[[ -z "$name" ]] && continue
if ip=$(check_device "$name" 2>/dev/null); then
metrics=$(ssh -o ConnectTimeout=5 "pi@$ip" "cat ~/.cece-os/heart/pulse.json 2>/dev/null" 2>/dev/null)
if [[ -n "$metrics" ]]; then
temp=$(echo "$metrics" | grep -o '"temperature": "[^"]*"' | cut -d'"' -f4)
load=$(echo "$metrics" | grep -o '"load": "[^"]*"' | cut -d'"' -f4 | cut -d' ' -f1)
mem=$(echo "$metrics" | grep -o '"memory_free": "[^"]*"' | cut -d'"' -f4)
disk=$(echo "$metrics" | grep -o '"disk_free": "[^"]*"' | cut -d'"' -f4)
printf "%-12s %-10s %-10s %-12s %-10s\n" "$name" "${temp:-N/A}" "${load:-N/A}" "${mem:-N/A}" "${disk:-N/A}"
else
printf "%-12s ${AMBER}%-10s${RESET}\n" "$name" "NO CECE"
fi
else
printf "%-12s ${RED}%-10s${RESET}\n" "$name" "OFFLINE"
fi
done
}
# rsync the local CECE apps tree ($CECE_OS_DIR/apps) to every reachable
# device's ~/.cece-os/apps/. Offline devices are reported and skipped.
sync_apps() {
log "Syncing CECE OS apps to fleet..."
if [[ ! -d "$CECE_OS_DIR/apps" ]]; then
error "CECE OS apps directory not found: $CECE_OS_DIR/apps"
return 1
fi
echo "$DEVICES" | while IFS=: read -r name local_ip ts_ip role; do
[[ -z "$name" ]] && continue
if ip=$(check_device "$name" 2>/dev/null); then
log "Syncing apps to $name..."
# a && b || c chain is acceptable here: both branches are pure echoes.
rsync -avz --progress "$CECE_OS_DIR/apps/" "pi@$ip:~/.cece-os/apps/" 2>/dev/null && \
success "Synced to $name" || \
error "Sync failed for $name"
else
error "$name is offline"
fi
done
}
# Main command handler
# CLI dispatch: first argument selects the command; defaults to status.
case "${1:-status}" in
status)
banner
fleet_status
;;
metrics)
banner
collect_metrics
;;
deploy)
banner
# With a host argument deploy to that one device, otherwise whole fleet.
if [[ -n "$2" ]]; then
deploy_to_device "$2"
else
deploy_all
fi
;;
enhance)
banner
if [[ -n "$2" ]]; then
enhance_device "$2"
else
# 'read -r name _' keeps the device name; '_' swallows the rest of
# the colon-separated record.
echo "$DEVICES" | while IFS=: read -r name _; do
[[ -n "$name" ]] && enhance_device "$name"
done
fi
;;
sync)
banner
sync_apps
;;
help|--help|-h)
banner
echo "Usage: $0 <command> [target]"
echo ""
echo "Commands:"
echo " status - Show fleet status (default)"
echo " metrics - Collect metrics from all devices"
echo " deploy [host] - Deploy CECE OS (all or specific host)"
echo " enhance [host] - Enhance OS (updates, tools, config)"
echo " sync - Sync CECE apps to all devices"
echo " help - Show this help"
echo ""
echo "Devices: cecilia, lucidia, octavia, alice, aria"
;;
*)
error "Unknown command: $1"
echo "Run '$0 help' for usage"
exit 1
;;
esac

409
scripts/live-dashboard.sh Executable file
View File

@@ -0,0 +1,409 @@
#!/usr/bin/env bash
# ============================================================================
# BLACKROAD OS, INC. - PROPRIETARY AND CONFIDENTIAL
# BlackRoad Live Infrastructure Dashboard
# Real-time monitoring of entire fleet using terminal GUI
# ============================================================================
set -e
# Color functions
# ANSI escape helpers: each emits a color/control sequence (no newline).
_esc() { printf '\033[%s' "$1"; }
c_pink()   { _esc '38;5;205m'; }
c_blue()   { _esc '38;5;75m'; }
c_green()  { _esc '38;5;82m'; }
c_yellow() { _esc '38;5;226m'; }
c_red()    { _esc '38;5;196m'; }
c_purple() { _esc '38;5;141m'; }
c_orange() { _esc '38;5;208m'; }
c_gray()   { _esc '38;5;240m'; }
c_reset()  { _esc '0m'; }
# Clear screen then home the cursor.
c_clear()  { _esc '2J'; _esc 'H'; }
c_bold()   { _esc '1m'; }
# Fleet configuration
FLEET_DEVICES=(
"cecilia:192.168.4.36:Hailo-8 AI Core"
"alice:192.168.4.38:Pi 4 Worker"
"aria:192.168.4.40:Pi 5 Titan"
"octavia:192.168.4.38:Jetson Quantum"
"lucidia:192.168.4.42:Pi 5 Pironman"
)
# ==================
# DATA COLLECTORS
# ==================
# Report "online"/"offline" for a device via a single 1s-timeout ping.
# $1 = hostname (currently unused, kept for interface parity), $2 = IP.
get_device_status() {
    local probe_ip="$2"
    if ping -c 1 -W 1 "$probe_ip" >/dev/null 2>&1; then
        echo "online"
        return
    fi
    echo "offline"
}
# CPU usage (%) for a host; "N/A" when the remote probe fails.
# NOTE(review): the local branch uses `top -l 1` / "CPU usage", which is
# macOS syntax, while the remote branch assumes GNU/Linux `top -bn1` —
# confirm this script is meant to run its local branch on macOS only.
get_cpu_usage() {
local host="$1"
# Local
if [[ "$host" == "localhost" || "$host" == "$(hostname)" ]]; then
top -l 1 | grep "CPU usage" | awk '{print $3}' | tr -d '%'
else
# Remote (if SSH available)
ssh -o ConnectTimeout=2 "$host" "top -bn1 | grep 'Cpu(s)' | awk '{print \$2}' | tr -d '%'" 2>/dev/null || echo "N/A"
fi
}
# Memory usage (GB) for a host; remote branch reports used memory via `free`.
# NOTE(review): the local (macOS vm_stat) branch runs TWO printf calls — one
# for free pages, one for active pages — with no separator, so the two GB
# figures are concatenated into one ambiguous number; confirm the intended
# metric (likely used or free alone) before relying on the local value.
get_memory_usage() {
local host="$1"
if [[ "$host" == "localhost" || "$host" == "$(hostname)" ]]; then
vm_stat | perl -ne '/page size of (\d+)/ and $size=$1; /Pages free:\s+(\d+)/ and printf("%.0f", $1 * $size / 1073741824); /Pages active:\s+(\d+)/ and printf("%.0f", $1 * $size / 1073741824)'
else
ssh -o ConnectTimeout=2 "$host" "free -m | awk '/Mem:/ {printf \"%.0f\", \$3/1024}'" 2>/dev/null || echo "N/A"
fi
}
# Human-readable uptime for a host: parsed from `uptime` locally for this
# machine, or over SSH (2s timeout) otherwise; "N/A" if the remote query fails.
# xargs is used only to trim surrounding whitespace.
get_uptime() {
local host="$1"
if [[ "$host" == "localhost" || "$host" == "$(hostname)" ]]; then
uptime | awk -F'up ' '{print $2}' | awk -F',' '{print $1}' | xargs
else
ssh -o ConnectTimeout=2 "$host" "uptime | awk -F'up ' '{print \$2}' | awk -F',' '{print \$1}'" 2>/dev/null | xargs || echo "N/A"
fi
}
# Probe for a usable quantum stack: prints "ready" iff python3 exists and can
# import qiskit, otherwise "unavailable".
get_quantum_status() {
    if command -v python3 >/dev/null 2>&1 && python3 -c "import qiskit" 2>/dev/null; then
        echo "ready"
    else
        echo "unavailable"
    fi
}
# ==================
# DISPLAY COMPONENTS
# ==================
# Clear the screen and render the dashboard title banner in bold pink.
draw_header() {
    c_clear
    c_pink; c_bold
    printf '%s\n' \
        "╔════════════════════════════════════════════════════════════════════════════════╗" \
        "║ ║" \
        "║ BLACKROAD OS - LIVE INFRASTRUCTURE DASHBOARD ║" \
        "║ ║" \
        "╚════════════════════════════════════════════════════════════════════════════════╝"
    c_reset
    printf "\n"
}
# Render one fleet row: status dot, name, IP, description.
# $1 name, $2 ip, $3 description, $4 "online"/"offline" (selects dot color).
draw_device_status() {
    local name="$1" ip="$2" desc="$3" status="$4"
    case "$status" in
        online) c_green ;;
        *)      c_red ;;
    esac
    printf "●"
    c_reset
    printf " "
    c_blue; c_bold
    printf "%-12s" "$name"
    c_reset
    printf " "
    c_gray
    printf "%-15s" "$ip"
    c_reset
    printf " %-25s\n" "$desc"
}
# Render the indented metrics line for an online device.
# $1 = cpu (% or "N/A"), $2 = memory GB, $3 = uptime string.
# CPU is colorized by threshold: >80 red, >50 yellow, else green.
# NOTE(review): threshold comparisons shell out to bc, which is not installed
# everywhere — confirm bc is present on the dashboard host.
draw_metrics() {
local cpu="$1"
local mem="$2"
local uptime="$3"
printf " "
c_purple; printf "CPU: "; c_reset
# Only colorize when the value looks numeric; otherwise print it gray.
if [[ "$cpu" =~ ^[0-9]+\.?[0-9]*$ ]]; then
if (( $(echo "$cpu > 80" | bc -l) )); then
c_red; printf "%5s%%" "$cpu"; c_reset
elif (( $(echo "$cpu > 50" | bc -l) )); then
c_yellow; printf "%5s%%" "$cpu"; c_reset
else
c_green; printf "%5s%%" "$cpu"; c_reset
fi
else
c_gray; printf "%5s" "$cpu"; c_reset
fi
printf " "
c_purple; printf "MEM: "; c_reset
c_blue; printf "%4s GB" "$mem"; c_reset
printf " "
c_purple; printf "UPTIME: "; c_reset
c_gray; printf "%s" "$uptime"; c_reset
printf "\n"
}
# Render the quantum-computing panel.
# $1 = "ready" (green OPERATIONAL) or anything else (gray UNAVAILABLE).
draw_quantum_status() {
    local state="$1"
    printf "\n"
    c_orange; c_bold
    printf '%s\n' \
        "╔════════════════════════════════════════════════════════════════════════════════╗" \
        "║ QUANTUM COMPUTING STATUS ║" \
        "╚════════════════════════════════════════════════════════════════════════════════╝"
    c_reset
    printf "\n"
    printf " "
    if [[ "$state" == "ready" ]]; then
        c_green; printf "● OPERATIONAL"; c_reset
        printf " - Qiskit available, ready for quantum circuits\n"
    else
        c_gray; printf "○ UNAVAILABLE"; c_reset
        printf " - Quantum frameworks not installed\n"
    fi
    printf "\n"
}
# Render the fleet summary panel.
# $1 = number of online devices, $2 = total devices.
# Fixes: the original printed the Online/Offline counts with `printf "%d"`
# and NO argument, so both always showed 0; also guards the uptime
# percentage against division by zero when total is 0.
draw_summary() {
    local online="$1"
    local total="$2"
    local offline=$(( total - online ))
    printf "\n"
    c_blue; c_bold
    printf "╔════════════════════════════════════════════════════════════════════════════════╗\n"
    printf "║ FLEET SUMMARY ║\n"
    printf "╚════════════════════════════════════════════════════════════════════════════════╝\n"
    c_reset
    printf "\n"
    printf " "
    c_purple; printf "Total Devices: "; c_reset
    printf "%d\n" "$total"
    printf " "
    c_purple; printf "Online: "; c_reset
    c_green; printf "%d" "$online"; c_reset
    printf " "
    c_purple; printf "Offline: "; c_reset
    c_red; printf "%d" "$offline"; c_reset
    # Guard: an empty fleet would make the division below error out.
    local uptime_pct=0
    if (( total > 0 )); then
        uptime_pct=$(( online * 100 / total ))
    fi
    printf " "
    c_purple; printf "Uptime: "; c_reset
    if (( uptime_pct >= 90 )); then
        c_green; printf "%d%%" "$uptime_pct"; c_reset
    elif (( uptime_pct >= 70 )); then
        c_yellow; printf "%d%%" "$uptime_pct"; c_reset
    else
        c_red; printf "%d%%" "$uptime_pct"; c_reset
    fi
    printf "\n\n"
}
# Render the gray footer line. $1 = "last updated" timestamp string.
draw_footer() {
    local when="$1"
    printf "\n"
    c_gray
    printf '%s\n' "═══════════════════════════════════════════════════════════════════════════════"
    printf 'Last updated: %s | Press Ctrl+C to exit | Refresh: 5s\n' "$when"
    c_reset
}
# ==================
# MAIN DASHBOARD
# ==================
# Main render loop: redraw the full dashboard every $1 seconds (default 5)
# until interrupted. Fix: the original used ((online_count++)), which returns
# status 1 when the pre-increment value is 0 and therefore killed the whole
# script (set -e) on the first online device.
run_dashboard() {
    local refresh_interval="${1:-5}"
    while true; do
        draw_header
        # Fleet status section
        c_blue; c_bold
        printf "╔════════════════════════════════════════════════════════════════════════════════╗\n"
        printf "║ DEVICE FLEET ║\n"
        printf "╚════════════════════════════════════════════════════════════════════════════════╝\n"
        c_reset
        printf "\n"
        local online_count=0
        local total_count=${#FLEET_DEVICES[@]}
        for device in "${FLEET_DEVICES[@]}"; do
            IFS=':' read -r name ip desc <<< "$device"
            # Get status
            local status=$(get_device_status "$name" "$ip")
            # Draw device line
            draw_device_status "$name" "$ip" "$desc" "$status"
            # Get metrics if online
            if [[ "$status" == "online" ]]; then
                # set -e safe increment (see function header).
                online_count=$((online_count + 1))
                local cpu=$(get_cpu_usage "$name")
                local mem=$(get_memory_usage "$name")
                local uptime=$(get_uptime "$name")
                draw_metrics "$cpu" "$mem" "$uptime"
            else
                c_gray
                printf " Offline - no metrics available\n"
                c_reset
            fi
            printf "\n"
        done
        # Quantum status
        local quantum_status=$(get_quantum_status)
        draw_quantum_status "$quantum_status"
        # Fleet summary
        draw_summary "$online_count" "$total_count"
        # Footer
        local timestamp=$(date "+%Y-%m-%d %H:%M:%S")
        draw_footer "$timestamp"
        # Wait before refresh
        sleep "$refresh_interval"
    done
}
# ==================
# CLI INTERFACE
# ==================
# Print usage/help text for the dashboard CLI (literal heredoc, no expansion).
show_help() {
    cat <<'USAGE'
BlackRoad Live Infrastructure Dashboard
USAGE:
 blackroad-live-dashboard.sh [OPTIONS]
OPTIONS:
 --interval N Refresh interval in seconds (default: 5)
 --once Run once and exit (no loop)
 --help Show this help
EXAMPLES:
 blackroad-live-dashboard.sh # Live dashboard (5s refresh)
 blackroad-live-dashboard.sh --interval 10 # 10 second refresh
 blackroad-live-dashboard.sh --once # Single snapshot
MONITORED DEVICES:
 • cecilia - Hailo-8 AI Core
 • alice - Pi 4 Worker
 • aria - Pi 5 Titan
 • octavia - Jetson Quantum
 • lucidia - Pi 5 Pironman
METRICS:
 • Device online/offline status
 • CPU usage (%)
 • Memory usage (GB)
 • System uptime
 • Quantum computing availability
Press Ctrl+C to exit live mode.
USAGE
}
# ==================
# MAIN
# ==================
# Entry point: parse --interval/--once/--help, then run either a single
# snapshot or the live loop. Fix: the --once path used ((online_count++)),
# which returns status 1 on the first increment and aborted the script
# under set -e.
main() {
    local mode="live"
    local interval=5
    # Parse arguments
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --interval)
                interval="$2"
                shift 2
                ;;
            --once)
                mode="once"
                shift
                ;;
            --help|-h)
                show_help
                exit 0
                ;;
            *)
                echo "Unknown option: $1"
                show_help
                exit 1
                ;;
        esac
    done
    # Run dashboard
    if [[ "$mode" == "once" ]]; then
        # Single run
        draw_header
        c_blue; c_bold
        printf "╔════════════════════════════════════════════════════════════════════════════════╗\n"
        printf "║ DEVICE FLEET ║\n"
        printf "╚════════════════════════════════════════════════════════════════════════════════╝\n"
        c_reset
        printf "\n"
        local online_count=0
        local total_count=${#FLEET_DEVICES[@]}
        for device in "${FLEET_DEVICES[@]}"; do
            IFS=':' read -r name ip desc <<< "$device"
            local status=$(get_device_status "$name" "$ip")
            draw_device_status "$name" "$ip" "$desc" "$status"
            if [[ "$status" == "online" ]]; then
                # set -e safe increment (see function header).
                online_count=$((online_count + 1))
                echo " (Metrics available via live mode)"
            fi
            printf "\n"
        done
        local quantum_status=$(get_quantum_status)
        draw_quantum_status "$quantum_status"
        draw_summary "$online_count" "$total_count"
        c_gray
        printf "Run without --once for live monitoring\n"
        c_reset
    else
        # Live monitoring
        run_dashboard "$interval"
    fi
}
main "$@"

393
scripts/log-aggregator.sh Normal file
View File

@@ -0,0 +1,393 @@
#!/bin/bash
# BlackRoad Log Aggregator
# Centralized log collection and analysis for the cluster
# Agent: Icarus (b3e01bd9)

# ANSI color palette shared by all output below.
PINK='\033[38;5;205m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
CYAN='\033[0;36m'
RESET='\033[0m'

# Local working directory for collected logs, reports, and alert history.
LOG_DIR="$HOME/.blackroad/logs"
# Cluster nodes reached over SSH (aliases presumably defined in
# ~/.ssh/config — TODO confirm).
ALL_NODES=("lucidia" "cecilia" "octavia" "aria" "alice")

# Log sources on each node: symbolic name -> remote file path.
# Callers treat any name NOT in this map as a literal remote path.
declare -A LOG_SOURCES=(
    ["system"]="/var/log/syslog"
    ["docker"]="/var/log/docker.log"
    ["ollama"]="/var/log/ollama.log"
    ["auth"]="/var/log/auth.log"
    ["nginx"]="/var/log/nginx/access.log"
)
# Initialize
# Create the on-disk layout every other command expects.
init() {
    local subdir
    for subdir in collected analyzed alerts; do
        mkdir -p "$LOG_DIR/$subdir"
    done
    echo -e "${GREEN}Log aggregator initialized${RESET}"
}
# Collect logs from a node
# Pull the last N lines of one log source from a single node, prefixing
# each line with a local ISO-8601 timestamp and the node name.
# Prints the literal sentinel "offline" (and returns 1) when SSH fails —
# collect_all relies on that sentinel.
collect_node() {
    local host="$1"
    local src="${2:-system}"
    local want="${3:-100}"

    # Cheap reachability probe before the real transfer.
    if ! ssh -o ConnectTimeout=3 "$host" "echo ok" >/dev/null 2>&1; then
        echo "offline"
        return 1
    fi

    # Unknown source names are treated as literal remote paths.
    local remote="${LOG_SOURCES[$src]}"
    if [ -z "$remote" ]; then
        remote="$src"
    fi

    ssh "$host" "sudo tail -n $want $remote 2>/dev/null" | while read -r entry; do
        echo "$(date -Iseconds) [$host] $entry"
    done
}
# Collect logs from all nodes
# Collect the tail of one log source from every node into a single
# timestamped file under $LOG_DIR/collected.
#   $1 - log source name/path (default: system)
#   $2 - lines per node       (default: 50)
collect_all() {
    local source="${1:-system}"
    local lines="${2:-50}"
    local output_file="$LOG_DIR/collected/cluster_${source}_$(date +%Y%m%d_%H%M%S).log"
    echo -e "${PINK}=== COLLECTING LOGS ===${RESET}"
    echo "Source: $source"
    echo "Lines per node: $lines"
    echo
    local node logs count
    for node in "${ALL_NODES[@]}"; do
        echo -n " $node: "
        # Declaration split from assignment so collect_node's exit status
        # is not masked by `local`.
        logs=$(collect_node "$node" "$source" "$lines")
        if [ "$logs" = "offline" ]; then
            echo -e "${YELLOW}offline${RESET}"
        elif [ -n "$logs" ]; then
            echo "$logs" >> "$output_file"
            count=$(echo "$logs" | wc -l)
            echo -e "${GREEN}$count lines${RESET}"
        else
            # Node reachable but the source produced nothing; do not write
            # an empty chunk (and `wc -l` would have miscounted it as 1).
            echo -e "${GREEN}0 lines${RESET}"
        fi
    done
    echo
    # Only claim success if something was actually written.
    if [ -f "$output_file" ]; then
        echo -e "${GREEN}Saved: $output_file${RESET}"
    else
        echo -e "${YELLOW}No logs collected${RESET}"
    fi
}
# Stream logs in real-time
# Stream one log source live from every node, each node's lines tagged and
# colorized. Runs one background ssh per node and blocks on `wait`.
# NOTE(review): the background ssh children are not killed via a trap, so
# Ctrl+C may leave stragglers until their pipes break — confirm desired.
stream() {
    local source="${1:-system}"
    echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
    echo -e "${PINK}║ 📋 LIVE LOG STREAM - $source${RESET}"
    echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
    echo
    echo "Streaming from ${#ALL_NODES[@]} nodes. Press Ctrl+C to stop."
    echo
    # Unknown source names are treated as literal remote paths.
    local log_path="${LOG_SOURCES[$source]}"
    [ -z "$log_path" ] && log_path="$source"
    # Stream from all nodes in parallel
    for node in "${ALL_NODES[@]}"; do
        (
            # One fixed color per node so interleaved output stays readable.
            ssh "$node" "sudo tail -f $log_path 2>/dev/null" | while read -r line; do
                local color
                case $node in
                    lucidia) color=$CYAN ;;
                    cecilia) color=$GREEN ;;
                    octavia) color=$BLUE ;;
                    aria) color=$YELLOW ;;
                    alice) color=$PINK ;;
                    *) color=$RESET ;;
                esac
                echo -e "${color}[$node]${RESET} $line"
            done
        ) &
    done
    # Block until every per-node stream ends (normally only on interrupt).
    wait
}
# Search across all logs
# Grep one log source on every node for a pattern (case-insensitive),
# printing up to 20 matching lines per node.
#   $1 - pattern, $2 - source name/path (default system), $3 - context lines
# NOTE(review): $pattern is interpolated into a remote single-quoted grep;
# a pattern containing a single quote will break the remote command —
# treat input as trusted or escape before use.
search() {
    local pattern="$1"
    local source="${2:-system}"
    local context="${3:-0}"
    echo -e "${PINK}=== LOG SEARCH ===${RESET}"
    echo "Pattern: $pattern"
    echo "Source: $source"
    echo
    local log_path="${LOG_SOURCES[$source]}"
    [ -z "$log_path" ] && log_path="$source"
    for node in "${ALL_NODES[@]}"; do
        echo -e "${BLUE}--- $node ---${RESET}"
        if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
            echo -e "${YELLOW}offline${RESET}"
            continue
        fi
        # -C adds context lines around each hit; cap output per node.
        local matches=$(ssh "$node" "sudo grep -C $context -i '$pattern' $log_path 2>/dev/null" | head -20)
        if [ -n "$matches" ]; then
            echo "$matches"
        else
            echo "No matches"
        fi
        echo
    done
}
# Analyze logs for errors
# Summarize the most frequent error-ish tokens in each node's syslog.
#   $1 - look-back window in hours (default 1; currently only labels the
#        output — selection is "last 50 matching lines", not time-based)
analyze_errors() {
    local hours="${1:-1}"
    echo -e "${PINK}=== ERROR ANALYSIS ===${RESET}"
    echo "Last $hours hour(s)"
    echo
    local node errors
    for node in "${ALL_NODES[@]}"; do
        echo -e "${BLUE}$node:${RESET}"
        if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
            echo -e " ${YELLOW}offline${RESET}"
            continue
        fi
        # Count keyword hits per word. Bug fix: awk has no /regex/i flag —
        # the previous `\$i ~ /error|fail|critical/i` was a remote awk
        # syntax error, so this function always reported "No errors".
        # Lowercase the field before matching instead. (The unused `since`
        # computation was dropped.)
        errors=$(ssh "$node" "
            sudo grep -i 'error\\|fail\\|critical' /var/log/syslog 2>/dev/null | tail -50 | \
            awk '{for(i=1;i<=NF;i++) if(tolower(\$i) ~ /error|fail|critical/) count[\$i]++} END {for(k in count) print count[k], k}' | \
            sort -rn | head -5
        " 2>/dev/null)
        if [ -n "$errors" ]; then
            echo "$errors" | while read -r count word; do
                # Highlight hot offenders.
                local color=$YELLOW
                [ "$count" -gt 10 ] && color=$RED
                echo -e " ${color}$count${RESET} $word"
            done
        else
            echo -e " ${GREEN}No errors${RESET}"
        fi
    done
}
# Generate log report
# Generate a Markdown health report (per-node error/warning counts, Docker
# restarts, and recent error lines) under $LOG_DIR/analyzed.
#   $1 - reporting period in hours (default 24; only the Docker query and
#        the header actually use it)
report() {
    local hours="${1:-24}"
    local report_file="$LOG_DIR/analyzed/report_$(date +%Y%m%d_%H%M%S).md"
    echo -e "${PINK}=== GENERATING REPORT ===${RESET}"
    echo "Period: Last $hours hours"
    echo
    cat > "$report_file" << EOF
# BlackRoad Cluster Log Report
Generated: $(date)
Period: Last $hours hours
## Node Status
EOF
    local node stats errors warnings restarts
    for node in "${ALL_NODES[@]}"; do
        echo -n " Analyzing $node... "
        if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
            echo "### $node: OFFLINE" >> "$report_file"
            echo -e "${YELLOW}offline${RESET}"
            continue
        fi
        # Bug fix: `grep -c … || echo 0` printed "0\n0" on no match,
        # because grep -c already prints 0 and exits 1; the extra line then
        # corrupted the single-line stats record. Use \${var:-0} fallbacks.
        stats=$(ssh "$node" "
            errors=\$(sudo grep -ci 'error' /var/log/syslog 2>/dev/null)
            warnings=\$(sudo grep -ci 'warning' /var/log/syslog 2>/dev/null)
            docker_restarts=\$(docker events --since '${hours}h' --until 'now' 2>/dev/null | grep -c 'restart')
            echo \"\${errors:-0} \${warnings:-0} \${docker_restarts:-0}\"
        " 2>/dev/null)
        errors=$(echo "$stats" | awk '{print $1}')
        warnings=$(echo "$stats" | awk '{print $2}')
        restarts=$(echo "$stats" | awk '{print $3}')
        cat >> "$report_file" << EOF
### $node
- Errors: $errors
- Warnings: $warnings
- Container restarts: $restarts
EOF
        echo -e "${GREEN}done${RESET}"
    done
    # Top errors section
    cat >> "$report_file" << EOF
## Top Errors Across Cluster
EOF
    for node in "${ALL_NODES[@]}"; do
        if ssh -o ConnectTimeout=2 "$node" "echo ok" >/dev/null 2>&1; then
            echo "### $node" >> "$report_file"
            ssh "$node" "sudo grep -i error /var/log/syslog 2>/dev/null | tail -5" >> "$report_file" 2>/dev/null
        fi
    done
    echo
    echo -e "${GREEN}Report saved: $report_file${RESET}"
}
# Alert on log patterns
# Watch every node's syslog for a pattern; on a hit, print, persist to the
# alerts log, and optionally fire an action ("echo" | "notify" | "webhook:URL").
# NOTE(review): $alert_msg is interpolated raw into the webhook JSON body —
# quotes/backslashes in a log line will produce invalid JSON; escape or use
# jq if webhooks carry untrusted log content.
alert() {
    local pattern="$1"
    local action="${2:-echo}"
    echo -e "${PINK}=== LOG ALERT MONITOR ===${RESET}"
    echo "Pattern: $pattern"
    echo "Action: $action"
    echo
    echo "Monitoring... Press Ctrl+C to stop"
    # One background tail -f per node; `wait` blocks until interrupted.
    for node in "${ALL_NODES[@]}"; do
        (
            ssh "$node" "sudo tail -f /var/log/syslog 2>/dev/null" | while read -r line; do
                if echo "$line" | grep -qi "$pattern"; then
                    local alert_msg="[ALERT] $node: $line"
                    echo -e "${RED}$alert_msg${RESET}"
                    # Save alert
                    echo "$(date -Iseconds) $alert_msg" >> "$LOG_DIR/alerts/alerts.log"
                    # Execute action
                    case "$action" in
                        echo) ;;
                        notify)
                            # Could integrate with notification system
                            ;;
                        webhook:*)
                            # Strip the "webhook:" prefix to get the URL.
                            local url="${action#webhook:}"
                            curl -s -X POST "$url" -d "{\"alert\":\"$alert_msg\"}" >/dev/null
                            ;;
                    esac
                fi
            done
        ) &
    done
    wait
}
# Tail specific node log
tail_node() {
local node="$1"
local source="${2:-system}"
local lines="${3:-50}"
local log_path="${LOG_SOURCES[$source]}"
[ -z "$log_path" ] && log_path="$source"
echo -e "${PINK}=== $node - $source ===${RESET}"
echo
ssh "$node" "sudo tail -n $lines $log_path" 2>/dev/null
}
# Stats summary
# One summary row per node: syslog error/warning counts, syslog size, and a
# derived status flag (OK / WARN / HIGH).
stats() {
    echo -e "${PINK}=== LOG STATISTICS ===${RESET}"
    echo
    printf "%-12s %-10s %-10s %-10s %-10s\n" "NODE" "ERRORS" "WARNINGS" "SIZE" "STATUS"
    echo "────────────────────────────────────────────────────────────"
    local node stats errors warnings size status
    for node in "${ALL_NODES[@]}"; do
        if ! ssh -o ConnectTimeout=2 "$node" "echo ok" >/dev/null 2>&1; then
            printf "%-12s ${YELLOW}%-10s${RESET}\n" "$node" "OFFLINE"
            continue
        fi
        # Bug fix: `grep -c … || echo 0` yielded "0\n0" on no match (grep -c
        # prints 0 AND exits 1), which made $errors multi-line and broke the
        # -gt comparisons below. Also, `du | cut || echo '?'` could never
        # print '?' since cut exits 0 on empty input. Use \${var:-fallback}.
        stats=$(ssh "$node" "
            errors=\$(sudo grep -ci 'error' /var/log/syslog 2>/dev/null)
            warnings=\$(sudo grep -ci 'warning' /var/log/syslog 2>/dev/null)
            size=\$(du -sh /var/log/syslog 2>/dev/null | cut -f1)
            echo \"\${errors:-0} \${warnings:-0} \${size:-?}\"
        " 2>/dev/null)
        errors=$(echo "$stats" | awk '{print $1}')
        warnings=$(echo "$stats" | awk '{print $2}')
        size=$(echo "$stats" | awk '{print $3}')
        status="${GREEN}OK${RESET}"
        [ "${errors:-0}" -gt 100 ] 2>/dev/null && status="${YELLOW}WARN${RESET}"
        [ "${errors:-0}" -gt 500 ] 2>/dev/null && status="${RED}HIGH${RESET}"
        printf "%-12s %-10s %-10s %-10s %-10b\n" "$node" "$errors" "$warnings" "$size" "$status"
    done
}
# Help
# Print CLI usage. Output is identical to the previous echo-based version;
# printf '%b' interprets the embedded color escapes just like `echo -e`.
help() {
    printf '%b\n' "${PINK}BlackRoad Log Aggregator${RESET}"
    printf '\n'
    printf '%s\n' "Centralized log collection and analysis"
    printf '\n'
    printf '%s\n' "Commands:"
    printf '%s\n' " collect [source] [lines] Collect logs from all nodes"
    printf '%s\n' " stream [source] Stream logs in real-time"
    printf '%s\n' " search <pattern> [src] Search logs"
    printf '%s\n' " errors [hours] Analyze errors"
    printf '%s\n' " report [hours] Generate log report"
    printf '%s\n' " alert <pattern> [action] Monitor for pattern"
    printf '%s\n' " tail <node> [source] Tail specific node"
    printf '%s\n' " stats Log statistics"
    printf '\n'
    printf '%s\n' "Log sources: ${!LOG_SOURCES[*]}"
    printf '\n'
    printf '%s\n' "Examples:"
    printf '%s\n' " $0 stream system"
    printf '%s\n' " $0 search 'error' docker"
    printf '%s\n' " $0 alert 'out of memory'"
    printf '%s\n' " $0 report 24"
}
# Ensure initialized (quiet bootstrap of $LOG_DIR on first use)
[ -d "$LOG_DIR" ] || init >/dev/null

# Command dispatch: $1 selects the action, remaining args go to the
# handler. Unknown/absent commands fall through to help.
case "${1:-help}" in
    init)
        init
        ;;
    collect)
        # $2=source, $3=lines-per-node
        collect_all "$2" "$3"
        ;;
    stream|follow)
        stream "$2"
        ;;
    search|grep)
        # $2=pattern, $3=source, $4=context lines
        search "$2" "$3" "$4"
        ;;
    errors|analyze)
        analyze_errors "$2"
        ;;
    report)
        report "$2"
        ;;
    alert|monitor)
        alert "$2" "$3"
        ;;
    tail)
        tail_node "$2" "$3" "$4"
        ;;
    stats)
        stats
        ;;
    *)
        help
        ;;
esac

486
scripts/observability.sh Normal file
View File

@@ -0,0 +1,486 @@
#!/bin/bash
# BlackRoad Observability
# Distributed tracing and observability for the cluster
# Agent: Icarus (b3e01bd9)

# ANSI color palette for dashboard/trace output.
PINK='\033[38;5;205m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
CYAN='\033[0;36m'
RESET='\033[0m'

# Local state: SQLite database plus raw export folders.
OBS_DIR="$HOME/.blackroad/observability"
OBS_DB="$OBS_DIR/traces.db"
# Cluster nodes (not contacted directly here; recorded in span/metric rows).
ALL_NODES=("lucidia" "cecilia" "octavia" "aria" "alice")
# Initialize
# Create the observability folders and SQLite schema. Idempotent: all DDL
# uses IF NOT EXISTS, so re-running is safe.
# Schema shape: one trace owns many spans (tree via parent_span_id); log
# rows may reference a trace and span; metrics are standalone time series.
init() {
    mkdir -p "$OBS_DIR"/{traces,metrics,logs}
    sqlite3 "$OBS_DB" << 'SQL'
CREATE TABLE IF NOT EXISTS traces (
    trace_id TEXT PRIMARY KEY,
    name TEXT,
    service TEXT,
    started_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    ended_at DATETIME,
    duration_ms INTEGER,
    status TEXT DEFAULT 'in_progress',
    metadata TEXT
);
CREATE TABLE IF NOT EXISTS spans (
    span_id TEXT PRIMARY KEY,
    trace_id TEXT,
    parent_span_id TEXT,
    name TEXT,
    service TEXT,
    node TEXT,
    started_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    ended_at DATETIME,
    duration_ms INTEGER,
    status TEXT DEFAULT 'in_progress',
    tags TEXT,
    logs TEXT,
    FOREIGN KEY (trace_id) REFERENCES traces(trace_id)
);
CREATE TABLE IF NOT EXISTS metrics (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT,
    value REAL,
    tags TEXT,
    node TEXT,
    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS logs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    trace_id TEXT,
    span_id TEXT,
    level TEXT,
    message TEXT,
    node TEXT,
    service TEXT,
    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_trace ON spans(trace_id);
CREATE INDEX IF NOT EXISTS idx_metric_name ON metrics(name);
CREATE INDEX IF NOT EXISTS idx_log_trace ON logs(trace_id);
SQL
    echo -e "${GREEN}Observability system initialized${RESET}"
}
# Start trace
# Open a new trace and print its id (callers capture it with $(...)).
#   $1 - trace name, $2 - owning service, $3 - JSON metadata string
trace_start() {
    local name="$1"
    local service="${2:-unknown}"
    local metadata="${3:-{}}"
    local trace_id="trace_$(date +%s%N)_$(openssl rand -hex 4)"
    # Escape embedded single quotes so a name like "can't" neither breaks
    # nor injects into the SQL — same convention span_log already uses.
    sqlite3 "$OBS_DB" "
        INSERT INTO traces (trace_id, name, service, metadata)
        VALUES ('$trace_id', '$(echo "$name" | sed "s/'/''/g")', '$(echo "$service" | sed "s/'/''/g")', '$(echo "$metadata" | sed "s/'/''/g")')
    "
    echo "$trace_id"
}
# End trace
# Close a trace: stamp the end time, compute duration, set final status.
#   $1 - trace id, $2 - status (default "success")
trace_end() {
    local trace_id="$1"
    local status="${2:-success}"
    # Compute duration in SQL from the stored start time — consistent with
    # span_end, and avoids the previous GNU-only `date -d`/`%N` arithmetic
    # (which silently produced 0 on platforms without them).
    sqlite3 "$OBS_DB" "
        UPDATE traces
        SET ended_at = datetime('now'),
            duration_ms = CAST((julianday('now') - julianday(started_at)) * 86400000 AS INTEGER),
            status = '$status'
        WHERE trace_id = '$trace_id'
    "
    local duration_ms
    duration_ms=$(sqlite3 "$OBS_DB" "SELECT duration_ms FROM traces WHERE trace_id = '$trace_id'")
    echo -e "${GREEN}Trace completed: $trace_id (${duration_ms}ms)${RESET}"
}
# Start span
# Open a child span inside an existing trace; prints the new span id.
#   $1 trace id, $2 span name, $3 service, $4 parent span id, $5 node
span_start() {
    local trace_ref="$1"
    local span_name="$2"
    local svc="${3:-unknown}"
    local parent_ref="${4:-}"
    local host="${5:-$(hostname)}"
    local new_id
    new_id="span_$(date +%s%N)_$(openssl rand -hex 4)"
    sqlite3 "$OBS_DB" "
        INSERT INTO spans (span_id, trace_id, parent_span_id, name, service, node)
        VALUES ('$new_id', '$trace_ref', '$parent_ref', '$span_name', '$svc', '$host')
    "
    echo "$new_id"
}
# End span
# Close a span: stamp end time, compute duration in SQL, store status/tags.
#   $1 span id, $2 status (default success), $3 JSON tags string
span_end() {
    local target="$1"
    local result="${2:-success}"
    local tag_json="${3:-{}}"
    sqlite3 "$OBS_DB" "
        UPDATE spans
        SET ended_at = datetime('now'),
            duration_ms = CAST((julianday('now') - julianday(started_at)) * 86400000 AS INTEGER),
            status = '$result',
            tags = '$tag_json'
        WHERE span_id = '$target'
    "
}
# Add span log
# Attach a log line to a span, denormalizing the span's trace/service/node
# onto the log row for fast querying.
#   $1 span id, $2 message, $3 level (default "info")
span_log() {
    local span_id="$1"
    local message="$2"
    local level="${3:-info}"
    # Single INSERT…SELECT instead of three separate lookup round-trips:
    # the trace_id/node/service come straight from the span's own row.
    # (If the span id is unknown, nothing is inserted.)
    sqlite3 "$OBS_DB" "
        INSERT INTO logs (trace_id, span_id, level, message, node, service)
        SELECT trace_id, '$span_id', '$level', '$(echo "$message" | sed "s/'/''/g")', node, service
        FROM spans WHERE span_id = '$span_id'
    "
}
# Record metric
# Record one metric sample.
#   $1 metric name, $2 numeric value, $3 JSON tags, $4 node (default local)
# NOTE(review): $2 is interpolated unquoted into SQL, so it must be a bare
# number — confirm callers never pass arbitrary strings.
metric() {
    local metric_name="$1"
    local metric_value="$2"
    local tag_json="${3:-{}}"
    local host="${4:-$(hostname)}"
    sqlite3 "$OBS_DB" "
        INSERT INTO metrics (name, value, tags, node)
        VALUES ('$metric_name', $metric_value, '$tag_json', '$host')
    "
}
# View trace
# Pretty-print one trace: header row, then each span (children indented one
# level) with its status-colored duration and any attached log lines.
view_trace() {
    local trace_id="$1"
    echo -e "${PINK}=== TRACE: $trace_id ===${RESET}"
    echo
    # Trace info
    sqlite3 "$OBS_DB" -line "SELECT * FROM traces WHERE trace_id = '$trace_id'"
    echo
    echo "Spans:"
    echo
    # Build span tree
    sqlite3 "$OBS_DB" "
        SELECT span_id, parent_span_id, name, service, node, duration_ms, status
        FROM spans WHERE trace_id = '$trace_id'
        ORDER BY started_at
    " | while IFS='|' read -r span_id parent name service node duration status; do
        local indent=""
        [ -n "$parent" ] && indent=" "
        local status_color=$GREEN
        [ "$status" = "error" ] && status_color=$RED
        [ "$status" = "in_progress" ] && status_color=$YELLOW
        # ${duration:-0}: in-progress spans have NULL duration, which the
        # old %d printf rejected ("invalid number") for every open span.
        printf "${indent}├── %-20s %-10s %-10s ${status_color}%dms${RESET}\n" "$name" "$service" "$node" "${duration:-0}"
        # Show span logs
        sqlite3 "$OBS_DB" "
            SELECT level, message FROM logs WHERE span_id = '$span_id'
        " | while IFS='|' read -r level msg; do
            local log_color=$RESET
            [ "$level" = "error" ] && log_color=$RED
            [ "$level" = "warn" ] && log_color=$YELLOW
            echo -e "${indent}${log_color}[$level] $msg${RESET}"
        done
    done
}
# List traces
# List recent traces, newest first, optionally filtered by name/service.
#   $1 - max rows (default 20), $2 - substring filter
list_traces() {
    local limit="${1:-20}"
    local filter="${2:-}"
    echo -e "${PINK}=== TRACES ===${RESET}"
    echo
    local where=""
    [ -n "$filter" ] && where="WHERE name LIKE '%$filter%' OR service LIKE '%$filter%'"
    sqlite3 "$OBS_DB" "
        SELECT trace_id, name, service, duration_ms, status, started_at
        FROM traces $where
        ORDER BY started_at DESC LIMIT $limit
    " | while IFS='|' read -r trace_id name service duration status started; do
        local status_color=$GREEN
        [ "$status" = "error" ] && status_color=$RED
        [ "$status" = "in_progress" ] && status_color=$YELLOW
        # ${duration:-0}: in-progress traces carry NULL duration, which the
        # old %d printf rejected for every open trace.
        printf " %-30s %-15s %-10s ${status_color}%dms${RESET} %s\n" \
            "$trace_id" "$name" "$service" "${duration:-0}" "$started"
    done
}
# Search logs
# Search stored log messages with a SQL LIKE substring match, newest first.
#   $1 search text, $2 max rows (default 50)
search_logs() {
    local needle="$1"
    local max_rows="${2:-50}"
    echo -e "${PINK}=== LOG SEARCH: $needle ===${RESET}"
    echo
    sqlite3 "$OBS_DB" "
        SELECT timestamp, level, service, node, message
        FROM logs
        WHERE message LIKE '%$needle%'
        ORDER BY timestamp DESC LIMIT $max_rows
    " | while IFS='|' read -r ts level service node msg; do
        # Tint the whole line by severity.
        local tint=$RESET
        case "$level" in
            error) tint=$RED ;;
            warn) tint=$YELLOW ;;
        esac
        echo -e "${tint}[$ts] [$level] $service@$node: $msg${RESET}"
    done
}
# Metrics summary
# Aggregate metrics per (name, node) over a recent window.
#   $1 - SQLite datetime modifier period, e.g. "1 hour" (default)
metrics_summary() {
    local window="${1:-1 hour}"
    echo -e "${PINK}=== METRICS SUMMARY (last $window) ===${RESET}"
    echo
    # A row is "recent" when its timestamp shifted forward by the window is
    # still in the future.
    sqlite3 "$OBS_DB" "
        SELECT name, node, AVG(value), MIN(value), MAX(value), COUNT(*)
        FROM metrics
        WHERE datetime(timestamp, '+$window') > datetime('now')
        GROUP BY name, node
        ORDER BY name, node
    " | while IFS='|' read -r series host mean lo hi samples; do
        printf " %-20s %-10s avg:%.2f min:%.2f max:%.2f (%d samples)\n" \
            "$series" "$host" "$mean" "$lo" "$hi" "$samples"
    done
}
# Service map
# Derive a service dependency map from parent/child span relationships and
# print per-service throughput/latency/error-rate for the last hour.
service_map() {
    echo -e "${PINK}=== SERVICE MAP ===${RESET}"
    echo
    echo "Services and their dependencies:"
    echo
    # An edge A -> B exists when a span of service B has a parent span of
    # service A (cross-service parent/child pair).
    sqlite3 "$OBS_DB" "
        SELECT DISTINCT s1.service, s2.service
        FROM spans s1
        JOIN spans s2 ON s1.span_id = s2.parent_span_id
        WHERE s1.service != s2.service
    " | while IFS='|' read -r from to; do
        echo " $from -> $to"
    done
    echo
    echo "Service stats (last hour):"
    sqlite3 "$OBS_DB" "
        SELECT service, COUNT(*), AVG(duration_ms),
            SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) * 100.0 / COUNT(*)
        FROM spans
        WHERE datetime(started_at, '+1 hour') > datetime('now')
        GROUP BY service
    " | while IFS='|' read -r service count avg_lat error_rate; do
        printf " %-20s spans:%d avg:%.0fms err:%.1f%%\n" "$service" "$count" "$avg_lat" "$error_rate"
    done
}
# Error analysis
# Show recent error log rows (joined back to their span and trace) plus the
# per-service error rate over the last hour.
#   $1 - max error rows to display (default 20)
errors() {
    local limit="${1:-20}"
    echo -e "${PINK}=== ERROR ANALYSIS ===${RESET}"
    echo
    echo "Recent errors:"
    sqlite3 "$OBS_DB" "
        SELECT t.trace_id, t.name, s.service, s.node, l.message, l.timestamp
        FROM logs l
        JOIN spans s ON l.span_id = s.span_id
        JOIN traces t ON l.trace_id = t.trace_id
        WHERE l.level = 'error'
        ORDER BY l.timestamp DESC LIMIT $limit
    " | while IFS='|' read -r trace name service node msg ts; do
        echo -e "${RED}[$ts] $service@$node${RESET}"
        echo " Trace: $trace ($name)"
        echo " Error: $msg"
        echo
    done
    echo "Error rates by service:"
    sqlite3 "$OBS_DB" "
        SELECT service, COUNT(*) as total,
            SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) as errors
        FROM spans
        WHERE datetime(started_at, '+1 hour') > datetime('now')
        GROUP BY service
        HAVING errors > 0
        ORDER BY errors DESC
    " | while IFS='|' read -r service total errors; do
        # NOTE(review): requires `bc` on PATH for the percentage — confirm
        # it is installed on all nodes running this script.
        local rate=$(echo "scale=1; $errors * 100 / $total" | bc)
        printf " %-20s %d/%d (%.1f%%)\n" "$service" "$errors" "$total" "$rate"
    done
}
# Dashboard
# Clear the screen and render a one-shot overview: last-hour trace volume,
# the five busiest services, and the three most recent error logs.
dashboard() {
    clear
    echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
    echo -e "${PINK}║ 👁️ OBSERVABILITY DASHBOARD ║${RESET}"
    echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
    echo
    # "Last hour" filter: start time shifted forward 1h is still in the future.
    local total_traces=$(sqlite3 "$OBS_DB" "SELECT COUNT(*) FROM traces WHERE datetime(started_at, '+1 hour') > datetime('now')")
    local error_traces=$(sqlite3 "$OBS_DB" "SELECT COUNT(*) FROM traces WHERE status = 'error' AND datetime(started_at, '+1 hour') > datetime('now')")
    local avg_duration=$(sqlite3 "$OBS_DB" "SELECT AVG(duration_ms) FROM traces WHERE datetime(started_at, '+1 hour') > datetime('now')")
    echo "Last Hour:"
    # ${avg_duration:-0}: AVG() is empty when there are no rows.
    printf " Traces: %d | Errors: %d | Avg Duration: %.0fms\n" "$total_traces" "$error_traces" "${avg_duration:-0}"
    echo
    echo "─────────────────────────────────────────────────────────────────"
    echo "Active Services:"
    sqlite3 "$OBS_DB" "
        SELECT service, COUNT(*), AVG(duration_ms)
        FROM spans WHERE datetime(started_at, '+1 hour') > datetime('now')
        GROUP BY service ORDER BY COUNT(*) DESC LIMIT 5
    " | while IFS='|' read -r service count avg; do
        printf " %-20s %d spans (avg: %.0fms)\n" "$service" "$count" "$avg"
    done
    echo
    echo "─────────────────────────────────────────────────────────────────"
    echo "Recent Errors:"
    # `ts` is read but intentionally not displayed here.
    sqlite3 "$OBS_DB" "
        SELECT service, message, timestamp FROM logs
        WHERE level = 'error' ORDER BY timestamp DESC LIMIT 3
    " | while IFS='|' read -r service msg ts; do
        echo -e " ${RED}$service: $msg${RESET}"
    done
}
# Clean old data
# Purge rows older than N days (default 7) from every table. SQLite
# autocommits each DELETE, matching the previous multi-statement behavior.
cleanup() {
    local keep_days="${1:-7}"
    local entry tbl col
    # table:timestamp-column pairs to sweep.
    for entry in "logs:timestamp" "spans:started_at" "traces:started_at" "metrics:timestamp"; do
        IFS=':' read -r tbl col <<< "$entry"
        sqlite3 "$OBS_DB" "DELETE FROM ${tbl} WHERE datetime(${col}, '+$keep_days days') < datetime('now');"
    done
    echo -e "${GREEN}Cleaned data older than $keep_days days${RESET}"
}
# Help
# Print CLI usage. Output matches the previous echo-based version; printf
# '%b' interprets the embedded color escapes just like `echo -e`.
help() {
    printf '%b\n' "${PINK}BlackRoad Observability${RESET}"
    printf '\n'
    printf '%s\n' "Distributed tracing and observability"
    printf '\n'
    printf '%s\n' "Tracing:"
    printf '%s\n' " trace-start <name> [service] Start trace"
    printf '%s\n' " trace-end <trace_id> [status] End trace"
    printf '%s\n' " span-start <trace> <name> [svc] Start span"
    printf '%s\n' " span-end <span_id> [status] End span"
    printf '%s\n' " span-log <span_id> <msg> [level] Add log"
    printf '%s\n' " view <trace_id> View trace"
    printf '%s\n' " list [limit] [filter] List traces"
    printf '\n'
    printf '%s\n' "Metrics & Logs:"
    printf '%s\n' " metric <name> <value> [tags] Record metric"
    printf '%s\n' " search <query> [limit] Search logs"
    printf '%s\n' " metrics [period] Metrics summary"
    printf '\n'
    printf '%s\n' "Analysis:"
    printf '%s\n' " service-map Service dependencies"
    printf '%s\n' " errors [limit] Error analysis"
    printf '%s\n' " dashboard Overview dashboard"
    printf '%s\n' " cleanup [days] Clean old data"
    printf '\n'
    printf '%s\n' "Examples:"
    printf '%s\n' " trace=\$($0 trace-start 'inference' 'api')"
    printf '%s\n' " span=\$($0 span-start \$trace 'generate' 'llm')"
    printf '%s\n' " $0 span-log \$span 'Processing request'"
    printf '%s\n' " $0 span-end \$span"
    printf '%s\n' " $0 trace-end \$trace"
}
# Ensure initialized (quiet first-run bootstrap of the SQLite schema)
[ -f "$OBS_DB" ] || init >/dev/null

# Command dispatch: $1 selects the action, remaining args go to the
# handler. Unknown/absent commands fall through to help.
case "${1:-help}" in
    init)
        init
        ;;
    trace-start)
        # $2=name, $3=service, $4=metadata JSON; prints new trace id.
        trace_start "$2" "$3" "$4"
        ;;
    trace-end)
        trace_end "$2" "$3"
        ;;
    span-start)
        # $2=trace, $3=name, $4=service, $5=parent span, $6=node.
        span_start "$2" "$3" "$4" "$5" "$6"
        ;;
    span-end)
        span_end "$2" "$3" "$4"
        ;;
    span-log|log)
        span_log "$2" "$3" "$4"
        ;;
    view)
        view_trace "$2"
        ;;
    list|traces)
        list_traces "$2" "$3"
        ;;
    metric)
        metric "$2" "$3" "$4" "$5"
        ;;
    search)
        search_logs "$2" "$3"
        ;;
    metrics)
        metrics_summary "$2"
        ;;
    service-map|map)
        service_map
        ;;
    errors)
        errors "$2"
        ;;
    dashboard|dash)
        dashboard
        ;;
    cleanup)
        cleanup "$2"
        ;;
    *)
        help
        ;;
esac

561
scripts/vault-universal.sh Executable file
View File

@@ -0,0 +1,561 @@
#!/bin/bash
# 🔐 BLACKROAD VAULT - UNIVERSAL CREDENTIAL MANAGER
#
# Philosophy: If a human has to paste an API key, the automation failed.
#
# Supports: 50+ services across all categories
# - Social Media: Instagram, Facebook, Twitter, LinkedIn, TikTok, YouTube
# - AI Providers: OpenAI, Anthropic, Google AI, Cohere, Hugging Face
# - Cloud: AWS, GCP, Azure, DigitalOcean, Linode, Vultr
# - Payments: Stripe, PayPal, Square
# - Auth: Clerk, Auth0, Firebase, Supabase
# - Infrastructure: Railway, Vercel, Netlify, Cloudflare, Heroku
# - Development: GitHub, GitLab, Bitbucket
# - Analytics: Google Analytics, Mixpanel, Amplitude
# - Communication: Slack, Discord, Telegram, Twilio
# - And more...
set -e

# Credentials are stored one per file under this directory, owner-only
# (dir 700, files 600). Each discover_* writes a fixed filename here.
VAULT_DIR="$HOME/.blackroad/vault"
mkdir -p "$VAULT_DIR"
chmod 700 "$VAULT_DIR"

# ANSI colors for discovery output.
PINK='\033[38;5;205m'
GREEN='\033[38;5;82m'
BLUE='\033[38;5;69m'
AMBER='\033[38;5;214m'
RED='\033[38;5;196m'
RESET='\033[0m'

echo -e "${PINK}╔════════════════════════════════════════════╗${RESET}"
echo -e "${PINK}║ 🔐 BLACKROAD UNIVERSAL VAULT ║${RESET}"
echo -e "${PINK}╚════════════════════════════════════════════╝${RESET}"
echo ""
# ============================================================================
# PAYMENT PROCESSORS
# ============================================================================
# Stripe: prefer the logged-in Stripe CLI, then $STRIPE_SECRET_KEY.
# Saves to $VAULT_DIR/stripe_secret_key; returns 1 with a hint otherwise.
discover_stripe() {
    echo -e "${BLUE}💳 Stripe...${RESET}"
    if command -v stripe &> /dev/null && stripe config --list &> /dev/null 2>&1; then
        # NOTE(review): assumes `stripe config --list` prints lines like
        # "secret_key = <value>" (value in awk field 3) — confirm against
        # the installed CLI version.
        SECRET_KEY=$(stripe config --list 2>/dev/null | grep "secret_key" | awk '{print $3}')
        if [ -n "$SECRET_KEY" ]; then
            echo "$SECRET_KEY" > "$VAULT_DIR/stripe_secret_key"
            chmod 600 "$VAULT_DIR/stripe_secret_key"
            echo -e "${GREEN} ✅ Saved${RESET}"
            return 0
        fi
    fi
    # Environment-variable fallback.
    [ -n "$STRIPE_SECRET_KEY" ] && echo "$STRIPE_SECRET_KEY" > "$VAULT_DIR/stripe_secret_key" && chmod 600 "$VAULT_DIR/stripe_secret_key" && echo -e "${GREEN} ✅ From env${RESET}" && return 0
    echo -e "${AMBER} ⚠️ Run 'stripe login'${RESET}"
    return 1
}
# PayPal: env-var only — $PAYPAL_CLIENT_ID → vault file paypal_client_id.
# All fallible commands stay inside the `if` condition so `set -e` never
# fires here (matching the original && chain's semantics).
discover_paypal() {
    echo -e "${BLUE}💳 PayPal...${RESET}"
    if [ -n "$PAYPAL_CLIENT_ID" ] &&
       echo "$PAYPAL_CLIENT_ID" > "$VAULT_DIR/paypal_client_id" &&
       chmod 600 "$VAULT_DIR/paypal_client_id"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://developer.paypal.com${RESET}"
    return 1
}

# ============================================================================
# SOCIAL MEDIA & MARKETING
# ============================================================================

# Instagram: env-var only — $INSTAGRAM_ACCESS_TOKEN.
discover_instagram() {
    echo -e "${BLUE}📸 Instagram...${RESET}"
    if [ -n "$INSTAGRAM_ACCESS_TOKEN" ] &&
       echo "$INSTAGRAM_ACCESS_TOKEN" > "$VAULT_DIR/instagram_access_token" &&
       chmod 600 "$VAULT_DIR/instagram_access_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://developers.facebook.com/apps${RESET}"
    return 1
}

# Facebook: env-var only — $FACEBOOK_ACCESS_TOKEN.
discover_facebook() {
    echo -e "${BLUE}📘 Facebook...${RESET}"
    if [ -n "$FACEBOOK_ACCESS_TOKEN" ] &&
       echo "$FACEBOOK_ACCESS_TOKEN" > "$VAULT_DIR/facebook_access_token" &&
       chmod 600 "$VAULT_DIR/facebook_access_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://developers.facebook.com${RESET}"
    return 1
}

# Twitter/X: env-var only — $TWITTER_API_KEY.
discover_twitter() {
    echo -e "${BLUE}🐦 Twitter/X...${RESET}"
    if [ -n "$TWITTER_API_KEY" ] &&
       echo "$TWITTER_API_KEY" > "$VAULT_DIR/twitter_api_key" &&
       chmod 600 "$VAULT_DIR/twitter_api_key"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://developer.twitter.com${RESET}"
    return 1
}

# LinkedIn: env-var only — $LINKEDIN_ACCESS_TOKEN.
discover_linkedin() {
    echo -e "${BLUE}💼 LinkedIn...${RESET}"
    if [ -n "$LINKEDIN_ACCESS_TOKEN" ] &&
       echo "$LINKEDIN_ACCESS_TOKEN" > "$VAULT_DIR/linkedin_access_token" &&
       chmod 600 "$VAULT_DIR/linkedin_access_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://www.linkedin.com/developers${RESET}"
    return 1
}

# TikTok: env-var only — $TIKTOK_ACCESS_TOKEN.
discover_tiktok() {
    echo -e "${BLUE}🎵 TikTok...${RESET}"
    if [ -n "$TIKTOK_ACCESS_TOKEN" ] &&
       echo "$TIKTOK_ACCESS_TOKEN" > "$VAULT_DIR/tiktok_access_token" &&
       chmod 600 "$VAULT_DIR/tiktok_access_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://developers.tiktok.com${RESET}"
    return 1
}

# YouTube: env-var only — $YOUTUBE_API_KEY.
discover_youtube() {
    echo -e "${BLUE}📺 YouTube...${RESET}"
    if [ -n "$YOUTUBE_API_KEY" ] &&
       echo "$YOUTUBE_API_KEY" > "$VAULT_DIR/youtube_api_key" &&
       chmod 600 "$VAULT_DIR/youtube_api_key"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://console.cloud.google.com${RESET}"
    return 1
}
# ============================================================================
# AI PROVIDERS
# ============================================================================
# OpenAI: env-var only — $OPENAI_API_KEY. Fallible commands stay inside the
# `if` condition so `set -e` never fires (same semantics as the && chain).
discover_openai() {
    echo -e "${BLUE}🤖 OpenAI...${RESET}"
    if [ -n "$OPENAI_API_KEY" ] &&
       echo "$OPENAI_API_KEY" > "$VAULT_DIR/openai_api_key" &&
       chmod 600 "$VAULT_DIR/openai_api_key"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://platform.openai.com/api-keys${RESET}"
    return 1
}

# Anthropic: env-var only — $ANTHROPIC_API_KEY.
discover_anthropic() {
    echo -e "${BLUE}🤖 Anthropic...${RESET}"
    if [ -n "$ANTHROPIC_API_KEY" ] &&
       echo "$ANTHROPIC_API_KEY" > "$VAULT_DIR/anthropic_api_key" &&
       chmod 600 "$VAULT_DIR/anthropic_api_key"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://console.anthropic.com${RESET}"
    return 1
}

# Google AI: env-var only — $GOOGLE_AI_API_KEY.
discover_google_ai() {
    echo -e "${BLUE}🤖 Google AI...${RESET}"
    if [ -n "$GOOGLE_AI_API_KEY" ] &&
       echo "$GOOGLE_AI_API_KEY" > "$VAULT_DIR/google_ai_api_key" &&
       chmod 600 "$VAULT_DIR/google_ai_api_key"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://aistudio.google.com${RESET}"
    return 1
}

# Cohere: env-var only — $COHERE_API_KEY.
discover_cohere() {
    echo -e "${BLUE}🤖 Cohere...${RESET}"
    if [ -n "$COHERE_API_KEY" ] &&
       echo "$COHERE_API_KEY" > "$VAULT_DIR/cohere_api_key" &&
       chmod 600 "$VAULT_DIR/cohere_api_key"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://dashboard.cohere.ai${RESET}"
    return 1
}

# Hugging Face: env var first, then the token cached by huggingface-cli.
discover_huggingface() {
    echo -e "${BLUE}🤖 Hugging Face...${RESET}"
    if [ -n "$HUGGINGFACE_TOKEN" ] &&
       echo "$HUGGINGFACE_TOKEN" > "$VAULT_DIR/huggingface_token" &&
       chmod 600 "$VAULT_DIR/huggingface_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    # Check huggingface-cli
    if [ -f ~/.huggingface/token ]; then
        cat ~/.huggingface/token > "$VAULT_DIR/huggingface_token"
        chmod 600 "$VAULT_DIR/huggingface_token"
        echo -e "${GREEN} ✅ From CLI${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Run 'huggingface-cli login'${RESET}"
    return 1
}
# ============================================================================
# CLOUD PROVIDERS
# ============================================================================
# AWS: prefer ~/.aws/credentials, then $AWS_ACCESS_KEY_ID.
# Saves only the access key id (not the secret) to aws_access_key_id.
discover_aws() {
    echo -e "${BLUE}☁️ AWS...${RESET}"
    if [ -f ~/.aws/credentials ]; then
        # NOTE(review): takes the first "aws_access_key_id" line in the
        # file, i.e. whichever profile appears first — confirm that is the
        # intended profile when multiple exist.
        AWS_KEY=$(grep "aws_access_key_id" ~/.aws/credentials | head -1 | cut -d= -f2 | tr -d ' ')
        if [ -n "$AWS_KEY" ]; then
            echo "$AWS_KEY" > "$VAULT_DIR/aws_access_key_id"
            chmod 600 "$VAULT_DIR/aws_access_key_id"
            echo -e "${GREEN} ✅ From ~/.aws/credentials${RESET}"
            return 0
        fi
    fi
    # Environment-variable fallback.
    [ -n "$AWS_ACCESS_KEY_ID" ] && echo "$AWS_ACCESS_KEY_ID" > "$VAULT_DIR/aws_access_key_id" && chmod 600 "$VAULT_DIR/aws_access_key_id" && echo -e "${GREEN} ✅ From env${RESET}" && return 0
    echo -e "${AMBER} ⚠️ Run 'aws configure'${RESET}"
    return 1
}
# Google Cloud: record the active gcloud project id, if one is configured.
# (GCP_PROJECT intentionally stays global, as before.)
discover_gcp() {
    echo -e "${BLUE}☁️ Google Cloud...${RESET}"
    GCP_PROJECT=""
    if command -v gcloud &> /dev/null; then
        GCP_PROJECT=$(gcloud config get-value project 2>/dev/null)
    fi
    if [ -n "$GCP_PROJECT" ]; then
        echo "$GCP_PROJECT" > "$VAULT_DIR/gcp_project_id"
        chmod 600 "$VAULT_DIR/gcp_project_id"
        echo -e "${GREEN} ✅ Configured${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Run 'gcloud init'${RESET}"
    return 1
}
# Azure: record the active subscription id when `az` is logged in.
# (AZ_SUB intentionally stays global, as before.)
discover_azure() {
    echo -e "${BLUE}☁️ Azure...${RESET}"
    if command -v az &> /dev/null && az account show &> /dev/null; then
        AZ_SUB=$(az account show --query id -o tsv 2>/dev/null)
        if [ -n "$AZ_SUB" ]; then
            echo "$AZ_SUB" > "$VAULT_DIR/azure_subscription_id"
            chmod 600 "$VAULT_DIR/azure_subscription_id"
            echo -e "${GREEN} ✅ Logged in${RESET}"
            return 0
        fi
    fi
    echo -e "${AMBER} ⚠️ Run 'az login'${RESET}"
    return 1
}
# DigitalOcean: env-var only — $DIGITALOCEAN_TOKEN. Fallible commands stay
# inside the `if` condition so `set -e` never fires here.
discover_digitalocean() {
    echo -e "${BLUE}☁️ DigitalOcean...${RESET}"
    if [ -n "$DIGITALOCEAN_TOKEN" ] &&
       echo "$DIGITALOCEAN_TOKEN" > "$VAULT_DIR/digitalocean_token" &&
       chmod 600 "$VAULT_DIR/digitalocean_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://cloud.digitalocean.com/account/api${RESET}"
    return 1
}
# ============================================================================
# DEVELOPMENT & HOSTING
# ============================================================================
# GitHub: prefer the token from an authenticated gh CLI session, then
# $GITHUB_TOKEN. Saves to github_token.
discover_github() {
    echo -e "${BLUE}🐙 GitHub...${RESET}"
    if command -v gh &> /dev/null && gh auth status &> /dev/null 2>&1; then
        GH_TOKEN=$(gh auth token 2>/dev/null)
        if [ -n "$GH_TOKEN" ]; then
            echo "$GH_TOKEN" > "$VAULT_DIR/github_token"
            chmod 600 "$VAULT_DIR/github_token"
            echo -e "${GREEN} ✅ From gh CLI${RESET}"
            return 0
        fi
    fi
    # Environment-variable fallback; kept inside the if-condition so a
    # failed write does not trip `set -e` (matching the old && chain).
    if [ -n "$GITHUB_TOKEN" ] &&
       echo "$GITHUB_TOKEN" > "$VAULT_DIR/github_token" &&
       chmod 600 "$VAULT_DIR/github_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Run 'gh auth login'${RESET}"
    return 1
}
# GitLab: env-var only — $GITLAB_TOKEN → vault file gitlab_token.
discover_gitlab() {
    echo -e "${BLUE}🦊 GitLab...${RESET}"
    if [ -n "$GITLAB_TOKEN" ] &&
       echo "$GITLAB_TOKEN" > "$VAULT_DIR/gitlab_token" &&
       chmod 600 "$VAULT_DIR/gitlab_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://gitlab.com/-/profile/personal_access_tokens${RESET}"
    return 1
}
# Railway: read the CLI's cached token after `railway login`.
# NOTE(review): requires `jq`; also assumes the token lives at .token in
# ~/.config/railway/config.json — confirm against the CLI version in use.
discover_railway() {
    echo -e "${BLUE}🚂 Railway...${RESET}"
    if command -v railway &> /dev/null && railway whoami &> /dev/null 2>&1; then
        RAILWAY_TOKEN=$(cat ~/.config/railway/config.json 2>/dev/null | jq -r '.token' 2>/dev/null)
        # jq emits the literal string "null" for a missing key — reject it.
        if [ -n "$RAILWAY_TOKEN" ] && [ "$RAILWAY_TOKEN" != "null" ]; then
            echo "$RAILWAY_TOKEN" > "$VAULT_DIR/railway_token"
            chmod 600 "$VAULT_DIR/railway_token"
            echo -e "${GREEN} ✅ From CLI${RESET}"
            return 0
        fi
    fi
    echo -e "${AMBER} ⚠️ Run 'railway login'${RESET}"
    return 1
}
# Vercel: prefer the CLI's cached auth token, then $VERCEL_TOKEN.
discover_vercel() {
    echo -e "${BLUE}▲ Vercel...${RESET}"
    # Bug fix: the old guard tested ~/.config/configstore/update-notifier-
    # vercel.json (an update-check cache) before reading ~/.vercel/auth.json;
    # test for the file we actually read. Also reject jq's literal "null"
    # output (same guard discover_railway uses).
    if [ -f ~/.vercel/auth.json ]; then
        VERCEL_TOKEN=$(cat ~/.vercel/auth.json 2>/dev/null | jq -r '.token' 2>/dev/null)
        if [ -n "$VERCEL_TOKEN" ] && [ "$VERCEL_TOKEN" != "null" ]; then
            echo "$VERCEL_TOKEN" > "$VAULT_DIR/vercel_token"
            chmod 600 "$VAULT_DIR/vercel_token"
            echo -e "${GREEN} ✅ From CLI${RESET}"
            return 0
        fi
    fi
    # Environment-variable fallback.
    [ -n "$VERCEL_TOKEN" ] && echo "$VERCEL_TOKEN" > "$VAULT_DIR/vercel_token" && chmod 600 "$VAULT_DIR/vercel_token" && echo -e "${GREEN} ✅ From env${RESET}" && return 0
    echo -e "${AMBER} ⚠️ Run 'vercel login'${RESET}"
    return 1
}
# Cloudflare: prefer the wrangler CLI config, then $CLOUDFLARE_API_TOKEN.
discover_cloudflare() {
    echo -e "${BLUE}☁️ Cloudflare...${RESET}"
    if [ -f ~/.wrangler/config/default.toml ]; then
        # NOTE(review): assumes an `api_token = "<value>"` line in wrangler's
        # legacy TOML config — newer wrangler versions store auth elsewhere;
        # confirm against the installed version.
        CF_TOKEN=$(grep "api_token" ~/.wrangler/config/default.toml | cut -d'"' -f2)
        if [ -n "$CF_TOKEN" ]; then
            echo "$CF_TOKEN" > "$VAULT_DIR/cloudflare_api_token"
            chmod 600 "$VAULT_DIR/cloudflare_api_token"
            echo -e "${GREEN} ✅ From wrangler${RESET}"
            return 0
        fi
    fi
    # Environment-variable fallback.
    [ -n "$CLOUDFLARE_API_TOKEN" ] && echo "$CLOUDFLARE_API_TOKEN" > "$VAULT_DIR/cloudflare_api_token" && chmod 600 "$VAULT_DIR/cloudflare_api_token" && echo -e "${GREEN} ✅ From env${RESET}" && return 0
    echo -e "${AMBER} ⚠️ Run 'wrangler login'${RESET}"
    return 1
}
# ============================================================================
# AUTH PROVIDERS
# ============================================================================
discover_clerk() {
    # Store the Clerk secret key when it is exported in the environment.
    echo -e "${BLUE}🔐 Clerk...${RESET}"
    if [ -n "$CLERK_SECRET_KEY" ]; then
        echo "$CLERK_SECRET_KEY" > "$VAULT_DIR/clerk_secret_key" \
            && chmod 600 "$VAULT_DIR/clerk_secret_key" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://dashboard.clerk.com${RESET}"
    return 1
}
discover_auth0() {
    # Store the Auth0 client secret when it is exported in the environment.
    echo -e "${BLUE}🔐 Auth0...${RESET}"
    if [ -n "$AUTH0_CLIENT_SECRET" ]; then
        echo "$AUTH0_CLIENT_SECRET" > "$VAULT_DIR/auth0_client_secret" \
            && chmod 600 "$VAULT_DIR/auth0_client_secret" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://manage.auth0.com${RESET}"
    return 1
}
discover_supabase() {
    # Store the Supabase anon key when it is exported in the environment.
    echo -e "${BLUE}🔐 Supabase...${RESET}"
    if [ -n "$SUPABASE_ANON_KEY" ]; then
        echo "$SUPABASE_ANON_KEY" > "$VAULT_DIR/supabase_anon_key" \
            && chmod 600 "$VAULT_DIR/supabase_anon_key" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://app.supabase.com${RESET}"
    return 1
}
# ============================================================================
# COMMUNICATION
# ============================================================================
discover_slack() {
    # Store the Slack bot token when it is exported in the environment.
    echo -e "${BLUE}💬 Slack...${RESET}"
    if [ -n "$SLACK_BOT_TOKEN" ]; then
        echo "$SLACK_BOT_TOKEN" > "$VAULT_DIR/slack_bot_token" \
            && chmod 600 "$VAULT_DIR/slack_bot_token" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://api.slack.com/apps${RESET}"
    return 1
}
discover_discord() {
    # Store the Discord bot token when it is exported in the environment.
    echo -e "${BLUE}💬 Discord...${RESET}"
    if [ -n "$DISCORD_BOT_TOKEN" ]; then
        echo "$DISCORD_BOT_TOKEN" > "$VAULT_DIR/discord_bot_token" \
            && chmod 600 "$VAULT_DIR/discord_bot_token" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://discord.com/developers${RESET}"
    return 1
}
discover_telegram() {
    # Store the Telegram bot token when it is exported in the environment.
    echo -e "${BLUE}💬 Telegram...${RESET}"
    if [ -n "$TELEGRAM_BOT_TOKEN" ]; then
        echo "$TELEGRAM_BOT_TOKEN" > "$VAULT_DIR/telegram_bot_token" \
            && chmod 600 "$VAULT_DIR/telegram_bot_token" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from @BotFather${RESET}"
    return 1
}
discover_twilio() {
    # Store the Twilio auth token when it is exported in the environment.
    echo -e "${BLUE}📱 Twilio...${RESET}"
    if [ -n "$TWILIO_AUTH_TOKEN" ]; then
        echo "$TWILIO_AUTH_TOKEN" > "$VAULT_DIR/twilio_auth_token" \
            && chmod 600 "$VAULT_DIR/twilio_auth_token" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://www.twilio.com/console${RESET}"
    return 1
}
# ============================================================================
# ANALYTICS
# ============================================================================
discover_google_analytics() {
    # Store the GA measurement ID when it is exported in the environment.
    echo -e "${BLUE}📊 Google Analytics...${RESET}"
    if [ -n "$GA_MEASUREMENT_ID" ]; then
        echo "$GA_MEASUREMENT_ID" > "$VAULT_DIR/ga_measurement_id" \
            && chmod 600 "$VAULT_DIR/ga_measurement_id" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://analytics.google.com${RESET}"
    return 1
}
discover_mixpanel() {
    # Store the Mixpanel project token when it is exported in the environment.
    echo -e "${BLUE}📊 Mixpanel...${RESET}"
    if [ -n "$MIXPANEL_TOKEN" ]; then
        echo "$MIXPANEL_TOKEN" > "$VAULT_DIR/mixpanel_token" \
            && chmod 600 "$VAULT_DIR/mixpanel_token" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://mixpanel.com/settings/project${RESET}"
    return 1
}
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
load_vault() {
    # Emit an "export NAME=value" line per vault file, for eval/source by callers
    # (usage: source <(./blackroad-vault-universal.sh load)).
    # Fix: the old single-quoted interpolation (export X='$val') broke — and was
    # shell-injectable — whenever a credential contained a single quote.
    # printf %q shell-escapes the value so any byte sequence round-trips safely.
    local key_file key_name key_value
    for key_file in "$VAULT_DIR"/*; do
        if [ -f "$key_file" ]; then
            key_name=$(basename "$key_file" | tr '[:lower:]' '[:upper:]')
            key_value=$(cat "$key_file")
            printf 'export %s=%q\n' "$key_name" "$key_value"
        fi
    done
}
show_vault() {
    # Print a configured/missing line for every known service plus totals.
    echo ""
    echo -e "${PINK}═══════════════════════════════════════════${RESET}"
    echo -e "${BLUE}📋 Vault Status${RESET}"
    echo -e "${PINK}═══════════════════════════════════════════${RESET}"
    echo ""
    local total=0
    local configured=0
    local service prefix
    for service in stripe paypal instagram facebook twitter linkedin tiktok youtube \
        openai anthropic google_ai cohere huggingface \
        aws gcp azure digitalocean \
        github gitlab railway vercel cloudflare \
        clerk auth0 supabase \
        slack discord telegram twilio \
        google_analytics mixpanel; do
        total=$((total + 1))
        # Fix: Google Analytics credentials are stored as ga_measurement_id
        # (see discover_google_analytics), so the old google_analytics_* glob
        # never matched and the service always showed as unconfigured.
        prefix="$service"
        if [ "$service" = "google_analytics" ]; then
            prefix="ga"
        fi
        # compgen -G tests the glob in-process (was: ls with a redundant
        # double redirection '&>/dev/null 2>&1').
        if compgen -G "$VAULT_DIR/${prefix}_*" > /dev/null; then
            echo -e "${GREEN}✅ $service${RESET}"
            configured=$((configured + 1))
        else
            echo -e "${AMBER}⚠️  $service${RESET}"
        fi
    done
    echo ""
    echo -e "${BLUE}Configured: $configured / $total services${RESET}"
    echo -e "${BLUE}Vault: $VAULT_DIR${RESET}"
    echo -e "${BLUE}Files: $(ls -1 "$VAULT_DIR" 2>/dev/null | wc -l | tr -d ' ')${RESET}"
}
create_env_file() {
    # Render every vault entry into a KEY=value env file (default: ./.env),
    # prefixed with a generated-file header, and lock it down to mode 600.
    local target_file="${1:-.env}"
    echo -e "${BLUE}📝 Creating $target_file...${RESET}"
    # Fix: the heredoc delimiter was quoted ('EOF'), so the header contained
    # the literal text "$(date)" instead of the actual timestamp. An unquoted
    # delimiter lets the command substitution expand as intended.
    cat > "$target_file" << EOF
# Auto-generated from BlackRoad Universal Vault
# DO NOT EDIT - Run ./blackroad-vault-universal.sh to update
# Generated: $(date)
EOF
    local key_file key_name key_value
    for key_file in "$VAULT_DIR"/*; do
        if [ -f "$key_file" ]; then
            key_name=$(basename "$key_file" | tr '[:lower:]' '[:upper:]')
            key_value=$(cat "$key_file")
            echo "$key_name=$key_value" >> "$target_file"
        fi
    done
    chmod 600 "$target_file"
    echo -e "${GREEN}✅ Created $target_file${RESET}"
}
# ============================================================================
# MAIN EXECUTION
# ============================================================================
# Command dispatcher. With no argument, defaults to a full discovery pass.
case "${1:-discover}" in
# discover: probe every supported service (env vars, CLI configs) and cache
# whatever credentials are found under $VAULT_DIR, then print a status report.
discover)
echo -e "${PINK}🔍 Discovering credentials from 40+ services...${RESET}"
echo ""
# Payments
echo -e "${PINK}━━ PAYMENTS ━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
# Each discover_* returns non-zero when nothing was found; '|| true' keeps the
# sweep going (and is safe under a possible 'set -e') so one miss never aborts it.
discover_stripe || true
discover_paypal || true
# Social Media
echo ""
echo -e "${PINK}━━ SOCIAL MEDIA ━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
discover_instagram || true
discover_facebook || true
discover_twitter || true
discover_linkedin || true
discover_tiktok || true
discover_youtube || true
# AI Providers
echo ""
echo -e "${PINK}━━ AI PROVIDERS ━━━━━━━━━━━━━━━━━━━━━━${RESET}"
discover_openai || true
discover_anthropic || true
discover_google_ai || true
discover_cohere || true
discover_huggingface || true
# Cloud
echo ""
echo -e "${PINK}━━ CLOUD PROVIDERS ━━━━━━━━━━━━━━━━━━━${RESET}"
discover_aws || true
discover_gcp || true
discover_azure || true
discover_digitalocean || true
# Development
echo ""
echo -e "${PINK}━━ DEVELOPMENT ━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
discover_github || true
discover_gitlab || true
discover_railway || true
discover_vercel || true
discover_cloudflare || true
# Auth
echo ""
echo -e "${PINK}━━ AUTH PROVIDERS ━━━━━━━━━━━━━━━━━━━━${RESET}"
discover_clerk || true
discover_auth0 || true
discover_supabase || true
# Communication
echo ""
echo -e "${PINK}━━ COMMUNICATION ━━━━━━━━━━━━━━━━━━━━━${RESET}"
discover_slack || true
discover_discord || true
discover_telegram || true
discover_twilio || true
# Analytics
echo ""
echo -e "${PINK}━━ ANALYTICS ━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
discover_google_analytics || true
discover_mixpanel || true
# Summarize what is now present in the vault.
show_vault
# Log to memory
# Best-effort: record the sweep in the external memory system if installed.
if command -v ~/memory-system.sh &> /dev/null; then
~/memory-system.sh log "vault-discovery" "universal-vault" "Discovered credentials from 40+ services. Configured: $(ls -1 "$VAULT_DIR" 2>/dev/null | wc -l) keys" "vault,automation,credentials"
fi
echo ""
echo -e "${PINK}╔════════════════════════════════════════════╗${RESET}"
echo -e "${PINK}║  ✅ UNIVERSAL VAULT READY                  ║${RESET}"
echo -e "${PINK}╚════════════════════════════════════════════╝${RESET}"
echo ""
echo -e "${BLUE}Usage in scripts:${RESET}"
echo -e "  source <(./blackroad-vault-universal.sh load)"
echo ""
echo -e "${BLUE}Generate .env:${RESET}"
echo -e "  ./blackroad-vault-universal.sh env .env"
echo ""
echo -e "${GREEN}Philosophy: One-time login → Forever automated${RESET}"
;;
# load: print export statements for every cached credential (for eval/source).
load)
load_vault
;;
# show: print the configured/missing report only, without re-discovering.
show)
show_vault
;;
# env: write cached credentials to a .env file ($2, default ./.env).
env)
create_env_file "$2"
;;
# Anything else: print usage and exit non-zero.
*)
echo -e "${RED}Unknown command: $1${RESET}"
echo ""
echo "Usage: $0 [discover|load|show|env]"
echo "  discover - Auto-discover all credentials"
echo "  load     - Export credentials to environment"
echo "  show     - Show vault status"
echo "  env      - Create .env file"
exit 1
;;
esac