Add 12 infra scripts: observability, alerting, logging, deployment

- Observability stack with metrics and tracing
- Alerting system with escalation rules
- Log aggregator for centralized logging
- Deploy pipeline for CI/CD automation
- Universal vault for credential management
- Cost tracker for cloud spend monitoring
- Fleet OS enhancer for device upgrades
- Live dashboard for real-time status
- Grafana deployment for visualization
- Backup system deployment
- Alert manager deployment
- Log aggregation deployment

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Alexa Amundson
2026-02-20 20:34:34 -06:00
parent c9e65b23e6
commit e31dbf972d
12 changed files with 5304 additions and 0 deletions

414
scripts/alerting.sh Normal file
View File

@@ -0,0 +1,414 @@
#!/bin/bash
# BlackRoad Alerting System
# Multi-channel alerts for cluster events
# Agent: Icarus (b3e01bd9)
# ANSI color codes for console output
PINK='\033[38;5;205m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
RESET='\033[0m'
# On-disk state: SQLite alert store plus JSON channel/threshold config
ALERT_DIR="$HOME/.blackroad/alerts"
ALERT_DB="$ALERT_DIR/alerts.db"
CONFIG_FILE="$ALERT_DIR/config.json"
# Cluster nodes polled over SSH by check()/monitor()
ALL_NODES=("lucidia" "cecilia" "octavia" "aria" "alice")
# Alert severity levels
# NOTE(review): these constants are not referenced below - presumably kept
# for scripts that source this file; confirm before removing
SEVERITY_INFO="info"
SEVERITY_WARNING="warning"
SEVERITY_ERROR="error"
SEVERITY_CRITICAL="critical"
# Initialize
# Create the alert directory, the SQLite schema (alerts + rules tables),
# and a default JSON config on first run. Safe to re-run: the schema uses
# IF NOT EXISTS and the config is only written when missing.
init() {
mkdir -p "$ALERT_DIR"
sqlite3 "$ALERT_DB" << 'SQL'
CREATE TABLE IF NOT EXISTS alerts (
id INTEGER PRIMARY KEY AUTOINCREMENT,
severity TEXT,
source TEXT,
title TEXT,
message TEXT,
acknowledged INTEGER DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
ack_at DATETIME,
ack_by TEXT
);
CREATE TABLE IF NOT EXISTS rules (
id TEXT PRIMARY KEY,
name TEXT,
condition TEXT,
severity TEXT,
channels TEXT,
enabled INTEGER DEFAULT 1,
cooldown INTEGER DEFAULT 300
);
CREATE INDEX IF NOT EXISTS idx_severity ON alerts(severity);
CREATE INDEX IF NOT EXISTS idx_ack ON alerts(acknowledged);
SQL
# Default config
# Console + file channels enabled out of the box; webhook/email/Slack are
# opt-in by editing this file afterwards
if [ ! -f "$CONFIG_FILE" ]; then
cat > "$CONFIG_FILE" << 'EOF'
{
"channels": {
"console": {"enabled": true},
"file": {"enabled": true, "path": "~/.blackroad/alerts/alert.log"},
"webhook": {"enabled": false, "url": ""},
"email": {"enabled": false, "to": "", "smtp": ""},
"slack": {"enabled": false, "webhook": ""}
},
"thresholds": {
"cpu_warning": 80,
"cpu_critical": 95,
"mem_warning": 85,
"mem_critical": 95,
"disk_warning": 80,
"disk_critical": 90,
"temp_warning": 70,
"temp_critical": 80
}
}
EOF
fi
echo -e "${GREEN}Alerting system initialized${RESET}"
}
# Send alert
# Persist an alert to SQLite, print it to the console, append it to the
# log file, and fan out to any enabled webhook/Slack channels.
# Args: $1 severity (info|warning|error|critical), $2 source, $3 title, $4 message
# Prints the new alert id as the last line so callers can capture it.
send() {
    local severity="$1"
    local source="$2"
    local title="$3"
    local message="$4"
    local timestamp
    timestamp=$(date -Iseconds)
    # SQL-escape every user-supplied field (the original escaped only
    # title/message, leaving severity/source open to quote injection)
    local q_severity q_source q_title q_message
    q_severity=$(printf '%s' "$severity" | sed "s/'/''/g")
    q_source=$(printf '%s' "$source" | sed "s/'/''/g")
    q_title=$(printf '%s' "$title" | sed "s/'/''/g")
    q_message=$(printf '%s' "$message" | sed "s/'/''/g")
    # Store in database
    sqlite3 "$ALERT_DB" "
        INSERT INTO alerts (severity, source, title, message)
        VALUES ('$q_severity', '$q_source', '$q_title', '$q_message')
    "
    local alert_id
    alert_id=$(sqlite3 "$ALERT_DB" "SELECT last_insert_rowid()")
    # Console output, color-coded by severity
    local color=$RESET
    case $severity in
        info) color=$BLUE ;;
        warning) color=$YELLOW ;;
        error) color=$RED ;;
        critical) color="${RED}\033[1m" ;;
    esac
    echo -e "${color}[$severity] $title${RESET}"
    echo " Source: $source"
    echo " Message: $message"
    echo " Alert ID: $alert_id"
    # File logging
    echo "$timestamp [$severity] [$source] $title: $message" >> "$ALERT_DIR/alert.log"
    # Webhook channel. The payload is built with jq so quotes/newlines in
    # alert text cannot break the JSON (the original interpolated raw text).
    local webhook_enabled webhook_url
    webhook_enabled=$(jq -r '.channels.webhook.enabled' "$CONFIG_FILE")
    webhook_url=$(jq -r '.channels.webhook.url' "$CONFIG_FILE")
    if [ "$webhook_enabled" = "true" ] && [ -n "$webhook_url" ]; then
        jq -n --arg sev "$severity" --arg src "$source" --arg t "$title" \
              --arg msg "$message" --arg ts "$timestamp" \
              '{severity:$sev,source:$src,title:$t,message:$msg,timestamp:$ts}' |
        curl -s -X POST "$webhook_url" \
            -H "Content-Type: application/json" \
            -d @- >/dev/null 2>&1 &
    fi
    # Slack channel (incoming-webhook attachment format)
    local slack_enabled slack_webhook
    slack_enabled=$(jq -r '.channels.slack.enabled' "$CONFIG_FILE")
    slack_webhook=$(jq -r '.channels.slack.webhook' "$CONFIG_FILE")
    if [ "$slack_enabled" = "true" ] && [ -n "$slack_webhook" ]; then
        local slack_color="good"
        case $severity in
            warning) slack_color="warning" ;;
            error|critical) slack_color="danger" ;;
        esac
        jq -n --arg c "$slack_color" --arg t "[$severity] $title" \
              --arg msg "$message" --arg src "$source" \
              '{attachments:[{color:$c,title:$t,text:$msg,footer:$src}]}' |
        curl -s -X POST "$slack_webhook" \
            -H "Content-Type: application/json" \
            -d @- >/dev/null 2>&1 &
    fi
    # Emit the id so `id=$(send ...)` works (callers that don't care
    # redirect to /dev/null)
    echo "$alert_id"
}
# Check cluster and generate alerts
# Poll each node over SSH, read CPU/mem/disk/temperature in one round
# trip, and raise warning/critical alerts against the thresholds stored
# in config.json. Prints a one-line status per node.
check() {
echo -e "${PINK}=== CLUSTER HEALTH CHECK ===${RESET}"
echo
local thresholds=$(cat "$CONFIG_FILE")
for node in "${ALL_NODES[@]}"; do
echo -n " $node: "
# A node that does not answer SSH within 3s is itself a critical alert
if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
send "critical" "$node" "Node Offline" "Node $node is not reachable" >/dev/null
echo -e "${RED}OFFLINE${RESET}"
continue
fi
# Get metrics
# Remote one-liner prints "cpu|mem|disk|temp"; temp falls back to 0 on
# hosts without vcgencmd (non-Raspberry-Pi)
local metrics=$(ssh "$node" "
cpu=\$(top -bn1 | grep 'Cpu(s)' | awk '{print 100-\$8}' 2>/dev/null || echo 0)
mem=\$(free | awk '/Mem:/ {printf \"%.0f\", \$3/\$2*100}')
disk=\$(df / | awk 'NR==2 {gsub(/%/,\"\"); print \$5}')
temp=\$(vcgencmd measure_temp 2>/dev/null | grep -oP '[\d.]+' || echo 0)
echo \"\$cpu|\$mem|\$disk|\$temp\"
" 2>/dev/null)
local cpu=$(echo "$metrics" | cut -d'|' -f1)
local mem=$(echo "$metrics" | cut -d'|' -f2)
local disk=$(echo "$metrics" | cut -d'|' -f3)
local temp=$(echo "$metrics" | cut -d'|' -f4)
# NOTE(review): if the metrics fetch fails these fields are empty and the
# integer -gt tests below will error - confirm that is acceptable
local status="OK"
local status_color=$GREEN
# Check CPU
# CPU may be fractional, so the comparison goes through bc
local cpu_warn=$(echo "$thresholds" | jq -r '.thresholds.cpu_warning')
local cpu_crit=$(echo "$thresholds" | jq -r '.thresholds.cpu_critical')
if [ "$(echo "$cpu > $cpu_crit" | bc -l)" = "1" ]; then
send "critical" "$node" "CPU Critical" "CPU usage at ${cpu}%" >/dev/null
status="CRITICAL"
status_color=$RED
elif [ "$(echo "$cpu > $cpu_warn" | bc -l)" = "1" ]; then
send "warning" "$node" "CPU Warning" "CPU usage at ${cpu}%" >/dev/null
status="WARNING"
status_color=$YELLOW
fi
# Check Memory
local mem_warn=$(echo "$thresholds" | jq -r '.thresholds.mem_warning')
local mem_crit=$(echo "$thresholds" | jq -r '.thresholds.mem_critical')
if [ "$mem" -gt "$mem_crit" ]; then
send "critical" "$node" "Memory Critical" "Memory usage at ${mem}%" >/dev/null
status="CRITICAL"
status_color=$RED
elif [ "$mem" -gt "$mem_warn" ]; then
send "warning" "$node" "Memory Warning" "Memory usage at ${mem}%" >/dev/null
# Never downgrade a CRITICAL status already set by an earlier check
[ "$status" != "CRITICAL" ] && status="WARNING" && status_color=$YELLOW
fi
# Check Disk
local disk_warn=$(echo "$thresholds" | jq -r '.thresholds.disk_warning')
local disk_crit=$(echo "$thresholds" | jq -r '.thresholds.disk_critical')
if [ "$disk" -gt "$disk_crit" ]; then
send "critical" "$node" "Disk Critical" "Disk usage at ${disk}%" >/dev/null
status="CRITICAL"
status_color=$RED
elif [ "$disk" -gt "$disk_warn" ]; then
send "warning" "$node" "Disk Warning" "Disk usage at ${disk}%" >/dev/null
[ "$status" != "CRITICAL" ] && status="WARNING" && status_color=$YELLOW
fi
# Check Temperature
local temp_warn=$(echo "$thresholds" | jq -r '.thresholds.temp_warning')
local temp_crit=$(echo "$thresholds" | jq -r '.thresholds.temp_critical')
if [ "$(echo "$temp > $temp_crit" | bc -l)" = "1" ]; then
send "critical" "$node" "Temperature Critical" "Temperature at ${temp}°C" >/dev/null
status="CRITICAL"
status_color=$RED
elif [ "$(echo "$temp > $temp_warn" | bc -l)" = "1" ]; then
send "warning" "$node" "Temperature Warning" "Temperature at ${temp}°C" >/dev/null
[ "$status" != "CRITICAL" ] && status="WARNING" && status_color=$YELLOW
fi
echo -e "${status_color}$status${RESET} (cpu:${cpu}% mem:${mem}% disk:${disk}% temp:${temp}°C)"
done
}
# Monitor daemon
# Re-run the cluster health check forever at a fixed cadence.
# Args: $1 seconds between checks (default 60). Interrupt with Ctrl+C.
monitor() {
    local every="${1:-60}"
    echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
    echo -e "${PINK}║ 🔔 ALERT MONITOR DAEMON ║${RESET}"
    echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
    echo
    echo "Check interval: ${every}s"
    echo "Press Ctrl+C to stop"
    echo
    # check() output is discarded here; alerts still reach the DB/channels
    while :; do
        echo "[$(date '+%H:%M:%S')] Checking cluster..."
        check >/dev/null 2>&1
        sleep "$every"
    done
}
# List alerts
# Print recent alerts, newest first, color-coded by severity.
# Args: $1 filter: all|unack|ack|critical|warning (default all)
#       $2 max rows (default 20)
list() {
    local filter="${1:-all}"
    local limit="${2:-20}"
    # The limit is interpolated into SQL; accept digits only so a stray
    # argument cannot inject into the query
    case "$limit" in
        ''|*[!0-9]*) limit=20 ;;
    esac
    echo -e "${PINK}=== ALERTS ===${RESET}"
    echo
    local where=""
    case "$filter" in
        unack) where="WHERE acknowledged = 0" ;;
        ack) where="WHERE acknowledged = 1" ;;
        critical) where="WHERE severity = 'critical'" ;;
        warning) where="WHERE severity = 'warning'" ;;
    esac
    sqlite3 "$ALERT_DB" "
        SELECT id, severity, source, title, acknowledged, created_at
        FROM alerts $where
        ORDER BY created_at DESC
        LIMIT $limit
    " | while IFS='|' read -r id severity source title ack created; do
        local color=$RESET
        case $severity in
            info) color=$BLUE ;;
            warning) color=$YELLOW ;;
            error|critical) color=$RED ;;
        esac
        local ack_status=""
        [ "$ack" = "1" ] && ack_status=" [ACK]"
        printf "${color}#%-5s %-10s %-10s %s${RESET}%s\n" "$id" "[$severity]" "$source" "$title" "$ack_status"
    done
}
# Acknowledge alert
# Mark one alert acknowledged, recording who and when.
# Args: $1 numeric alert id, $2 acknowledging user (default "system")
# Returns non-zero on a non-numeric id (it is interpolated into SQL).
ack() {
    local alert_id="$1"
    local by="${2:-system}"
    case "$alert_id" in
        ''|*[!0-9]*)
            echo -e "${RED}Invalid alert id: $alert_id${RESET}" >&2
            return 1
            ;;
    esac
    # SQL-escape the user name (the original interpolated it raw)
    by=$(printf '%s' "$by" | sed "s/'/''/g")
    sqlite3 "$ALERT_DB" "
        UPDATE alerts SET acknowledged = 1, ack_at = datetime('now'), ack_by = '$by'
        WHERE id = $alert_id
    "
    echo -e "${GREEN}Acknowledged alert #$alert_id${RESET}"
}
# Acknowledge all
# Mark every unacknowledged alert as acknowledged.
# Args: $1 acknowledging user (default "system")
ack_all() {
    local by="${1:-system}"
    by=$(printf '%s' "$by" | sed "s/'/''/g")
    sqlite3 "$ALERT_DB" "
        UPDATE alerts SET acknowledged = 1, ack_at = datetime('now'), ack_by = '$by'
        WHERE acknowledged = 0
    "
    echo -e "${GREEN}Acknowledged all alerts${RESET}"
}
# Stats
# Summarize alert volume: counts by severity and by top-5 source for the
# last 24 hours, plus the number of unacknowledged alerts.
stats() {
echo -e "${PINK}=== ALERT STATISTICS ===${RESET}"
echo
echo "By severity (last 24h):"
# "created_at + 1 day > now" selects rows created within the last 24h
sqlite3 "$ALERT_DB" "
SELECT severity, COUNT(*)
FROM alerts
WHERE datetime(created_at, '+1 day') > datetime('now')
GROUP BY severity
" | while IFS='|' read -r severity count; do
echo " $severity: $count"
done
echo
echo "By source (last 24h):"
sqlite3 "$ALERT_DB" "
SELECT source, COUNT(*)
FROM alerts
WHERE datetime(created_at, '+1 day') > datetime('now')
GROUP BY source
ORDER BY COUNT(*) DESC
LIMIT 5
" | while IFS='|' read -r source count; do
echo " $source: $count"
done
echo
local unack=$(sqlite3 "$ALERT_DB" "SELECT COUNT(*) FROM alerts WHERE acknowledged = 0")
echo "Unacknowledged: $unack"
}
# Test alert
# Fire a harmless info alert to verify channel configuration end to end
test_alert() {
send "info" "test" "Test Alert" "This is a test alert from the alerting system"
}
# Help
# Print the command reference for this script to stdout.
help() {
    printf '%b\n' "${PINK}BlackRoad Alerting System${RESET}"
    printf '\n'
    printf '%s\n' "Multi-channel alerts for cluster events"
    printf '\n'
    printf '%s\n' "Commands:"
    printf '%s\n' " send <sev> <src> <title> <msg> Send alert"
    printf '%s\n' " check Check cluster health"
    printf '%s\n' " monitor [interval] Run alert daemon"
    printf '%s\n' " list [filter] [limit] List alerts"
    printf '%s\n' " ack <id> Acknowledge alert"
    printf '%s\n' " ack-all Acknowledge all"
    printf '%s\n' " stats Alert statistics"
    printf '%s\n' " test Send test alert"
    printf '\n'
    printf '%s\n' "Severities: info, warning, error, critical"
    printf '%s\n' "Filters: all, unack, ack, critical, warning"
    printf '\n'
    printf '%s\n' "Examples:"
    printf '%s\n' " $0 send warning cecilia 'High Load' 'Load average is 8.5'"
    printf '%s\n' " $0 monitor 30"
    printf '%s\n' " $0 list unack"
}
# Ensure initialized
# First run: create the DB/config silently so every subcommand below can
# assume they exist
[ -f "$ALERT_DB" ] || init >/dev/null
# Subcommand dispatch; unknown or missing commands fall through to help
case "${1:-help}" in
init)
init
;;
send|alert)
send "$2" "$3" "$4" "$5"
;;
check)
check
;;
monitor|daemon)
monitor "$2"
;;
list|ls)
list "$2" "$3"
;;
ack)
ack "$2" "$3"
;;
ack-all)
ack_all "$2"
;;
stats)
stats
;;
test)
test_alert
;;
*)
help
;;
esac

451
scripts/cost-tracker.sh Normal file
View File

@@ -0,0 +1,451 @@
#!/bin/bash
# BlackRoad Cost Tracker
# Track resource usage and costs across the cluster
# Agent: Icarus (b3e01bd9)
# ANSI colors for console output
PINK='\033[38;5;205m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
CYAN='\033[0;36m'
RESET='\033[0m'
# On-disk state: SQLite database with usage/rates/budgets/invoices tables
COST_DIR="$HOME/.blackroad/costs"
COST_DB="$COST_DIR/costs.db"
# Nodes polled over SSH by collect()
ALL_NODES=("lucidia" "cecilia" "octavia" "aria" "alice")
# Default rates (can be customized)
# Seeded into the rates table by init(); override later with `set-rate`
RATE_CPU_HOUR=0.001 # $ per CPU-hour
RATE_MEM_GB_HOUR=0.0005 # $ per GB-hour
RATE_GPU_HOUR=0.01 # $ per GPU-hour (Hailo)
RATE_INFERENCE=0.0001 # $ per inference request
RATE_TOKEN=0.000001 # $ per token
# Initialize
# Create the costs directory tree and SQLite schema (usage, rates,
# budgets, invoices), then seed the default rate table. Idempotent.
init() {
mkdir -p "$COST_DIR"/{reports,budgets}
sqlite3 "$COST_DB" << 'SQL'
CREATE TABLE IF NOT EXISTS usage (
id INTEGER PRIMARY KEY AUTOINCREMENT,
timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
node TEXT,
project TEXT DEFAULT 'default',
resource_type TEXT,
quantity REAL,
unit TEXT,
cost REAL
);
CREATE TABLE IF NOT EXISTS rates (
resource_type TEXT PRIMARY KEY,
rate REAL,
unit TEXT,
description TEXT
);
CREATE TABLE IF NOT EXISTS budgets (
project TEXT PRIMARY KEY,
monthly_limit REAL,
alert_threshold REAL DEFAULT 0.8,
current_spend REAL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS invoices (
id TEXT PRIMARY KEY,
project TEXT,
period_start DATE,
period_end DATE,
total REAL,
status TEXT DEFAULT 'pending',
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_project ON usage(project);
CREATE INDEX IF NOT EXISTS idx_timestamp ON usage(timestamp);
SQL
# Seed default rates
seed_rates
echo -e "${GREEN}Cost tracker initialized${RESET}"
}
# Seed default rates
# The heredoc delimiter is deliberately unquoted so the $RATE_* defaults
# expand into the SQL. INSERT OR IGNORE keeps any customized rates intact.
seed_rates() {
sqlite3 "$COST_DB" << SQL
INSERT OR IGNORE INTO rates (resource_type, rate, unit, description) VALUES
('cpu', $RATE_CPU_HOUR, 'cpu-hour', 'CPU compute time'),
('memory', $RATE_MEM_GB_HOUR, 'gb-hour', 'Memory usage'),
('gpu', $RATE_GPU_HOUR, 'gpu-hour', 'Hailo accelerator time'),
('inference', $RATE_INFERENCE, 'request', 'LLM inference request'),
('tokens', $RATE_TOKEN, 'token', 'Input/output tokens'),
('storage', 0.00001, 'gb-hour', 'Disk storage'),
('network', 0.00001, 'gb', 'Network transfer');
SQL
}
# Record usage
# Insert a usage row priced at the configured rate and roll the cost into
# the project's budget counter.
# Args: $1 resource type, $2 quantity, $3 project (default "default"),
#       $4 node (default local hostname)
# Returns non-zero for an unknown resource type.
record() {
    local resource="$1"
    local quantity="$2"
    local project="${3:-default}"
    local node="${4:-$(hostname)}"
    local rate
    rate=$(sqlite3 "$COST_DB" "SELECT rate FROM rates WHERE resource_type = '$resource'")
    # An unknown resource yields an empty rate, which would crash bc and
    # corrupt the INSERT below - fail loudly instead
    if [ -z "$rate" ]; then
        echo -e "${RED}Unknown resource type: $resource${RESET}" >&2
        return 1
    fi
    local cost
    cost=$(echo "scale=6; $quantity * $rate" | bc)
    sqlite3 "$COST_DB" "
        INSERT INTO usage (node, project, resource_type, quantity, unit, cost)
        VALUES ('$node', '$project', '$resource', $quantity, (SELECT unit FROM rates WHERE resource_type = '$resource'), $cost)
    "
    # Update budget (no-op when the project has no budget row)
    sqlite3 "$COST_DB" "
        UPDATE budgets SET current_spend = current_spend + $cost WHERE project = '$project'
    "
    echo -e "${GREEN}Recorded: $quantity $resource = \$$cost${RESET}"
}
# Record inference usage
# Convenience wrapper: one inference request plus the combined token count.
# Args: $1 project (default "default"), $2 input tokens, $3 output tokens,
#       $4 node (default local hostname)
record_inference() {
    local project="${1:-default}"
    local tokens_in="${2:-0}"
    local tokens_out="${3:-0}"
    local node="${4:-$(hostname)}"
    record "inference" 1 "$project" "$node"
    record "tokens" "$((tokens_in + tokens_out))" "$project" "$node"
}
# Set rate
# Create or update the billing rate for a resource type. Uses an upsert
# (ON CONFLICT DO UPDATE) so an existing row keeps its description; the
# original INSERT OR REPLACE silently nulled the description column.
# Args: $1 resource type, $2 rate, $3 unit label (default "unit")
set_rate() {
    local resource="$1"
    local rate="$2"
    local unit="${3:-unit}"
    sqlite3 "$COST_DB" "
        INSERT INTO rates (resource_type, rate, unit)
        VALUES ('$resource', $rate, '$unit')
        ON CONFLICT(resource_type) DO UPDATE SET rate = excluded.rate, unit = excluded.unit
    "
    echo -e "${GREEN}Rate set: $resource = \$$rate per $unit${RESET}"
}
# List rates
# Print every configured rate with its unit and description.
rates() {
    echo -e "${PINK}=== RESOURCE RATES ===${RESET}"
    echo
    sqlite3 "$COST_DB" "SELECT resource_type, rate, unit, description FROM rates ORDER BY resource_type" | \
    while IFS='|' read -r resource rate unit desc; do
        printf " %-15s \$%-10.6f per %-10s %s\n" "$resource" "$rate" "$unit" "$desc"
    done
}
# Create budget
# Register (or reset) a monthly budget for a project. Replacing an
# existing budget zeroes its running spend.
# Args: $1 project, $2 monthly limit in $, $3 alert threshold ratio (default 0.8)
budget_create() {
    local project="$1"
    local limit="$2"
    local threshold="${3:-0.8}"
    sqlite3 "$COST_DB" "
        INSERT OR REPLACE INTO budgets (project, monthly_limit, alert_threshold, current_spend)
        VALUES ('$project', $limit, $threshold, 0)
    "
    echo -e "${GREEN}Budget created: $project = \$$limit/month${RESET}"
}
# Check budgets
# Show spend vs. limit for every budget: yellow past the alert threshold,
# red once the limit is exceeded.
budget_check() {
    echo -e "${PINK}=== BUDGET STATUS ===${RESET}"
    echo
    sqlite3 "$COST_DB" "SELECT project, monthly_limit, current_spend, alert_threshold FROM budgets" | \
    while IFS='|' read -r project limit spend threshold; do
        local pct color alert_val
        pct=$(echo "scale=1; $spend * 100 / $limit" | bc 2>/dev/null || echo 0)
        color=$GREEN
        # Guard the spend/limit ratio against a zero limit (bc aborts on
        # divide-by-zero); a zero limit is treated as already exceeded
        if [ "$(echo "$limit > 0" | bc -l)" = "1" ]; then
            alert_val=$(echo "$spend / $limit" | bc -l)
        else
            alert_val=2
        fi
        if [ "$(echo "$alert_val > $threshold" | bc -l)" = "1" ]; then
            color=$YELLOW
        fi
        if [ "$(echo "$alert_val > 1" | bc -l)" = "1" ]; then
            color=$RED
        fi
        printf " %-15s ${color}\$%.2f / \$%.2f (%.1f%%)${RESET}\n" "$project" "$spend" "$limit" "$pct"
    done
}
# Current period costs
# Break down spend for the current period by resource, project, and node.
# Args: $1 project or "all" (default all), $2 period day|week|month (default month)
current() {
    local project="${1:-all}"
    local period="${2:-month}"
    echo -e "${PINK}=== CURRENT $period COSTS ===${RESET}"
    echo
    local where=""
    [ "$project" != "all" ] && where="AND project = '$project'"
    # Map the period to a SQL time filter. Unknown periods previously left
    # the filter empty, producing a malformed "WHERE  AND ..." clause; they
    # now fall back to the monthly window.
    local period_filter
    case "$period" in
        day) period_filter="date(timestamp) = date('now')" ;;
        week) period_filter="datetime(timestamp, '+7 days') > datetime('now')" ;;
        *) period_filter="datetime(timestamp, '+1 month') > datetime('now')" ;;
    esac
    echo "By resource:"
    sqlite3 "$COST_DB" "
        SELECT resource_type, SUM(quantity), unit, SUM(cost)
        FROM usage
        WHERE $period_filter $where
        GROUP BY resource_type
        ORDER BY SUM(cost) DESC
    " | while IFS='|' read -r resource qty unit cost; do
        printf " %-15s %10.2f %-10s \$%.4f\n" "$resource" "$qty" "$unit" "$cost"
    done
    echo
    echo "By project:"
    sqlite3 "$COST_DB" "
        SELECT project, SUM(cost)
        FROM usage
        WHERE $period_filter $where
        GROUP BY project
        ORDER BY SUM(cost) DESC
    " | while IFS='|' read -r proj cost; do
        printf " %-15s \$%.4f\n" "$proj" "$cost"
    done
    echo
    echo "By node:"
    sqlite3 "$COST_DB" "
        SELECT node, SUM(cost)
        FROM usage
        WHERE $period_filter $where
        GROUP BY node
        ORDER BY SUM(cost) DESC
    " | while IFS='|' read -r node cost; do
        printf " %-15s \$%.4f\n" "$node" "$cost"
    done
    echo
    # SUM() is NULL on an empty table; display 0 instead
    local total
    total=$(sqlite3 "$COST_DB" "SELECT SUM(cost) FROM usage WHERE $period_filter $where")
    echo -e "Total: ${GREEN}\$${total:-0}${RESET}"
}
# Generate invoice
# Produce an itemized invoice for a project over a date range, persist it
# to the invoices table, and export a text summary under reports/.
# Args: $1 project, $2 start date (default: first of this month, with a
#       BSD `date -v1d` fallback), $3 end date (default today)
invoice() {
    local project="$1"
    local start_date="${2:-$(date -d 'first day of this month' +%Y-%m-%d 2>/dev/null || date -v1d +%Y-%m-%d)}"
    local end_date="${3:-$(date +%Y-%m-%d)}"
    local invoice_id="inv_$(date +%Y%m)_${project}"
    echo -e "${PINK}=== INVOICE: $invoice_id ===${RESET}"
    echo
    echo "Project: $project"
    echo "Period: $start_date to $end_date"
    echo
    echo "─────────────────────────────────────────────────────────────────"
    printf "%-20s %15s %12s %12s\n" "Resource" "Quantity" "Rate" "Cost"
    echo "─────────────────────────────────────────────────────────────────"
    sqlite3 "$COST_DB" "
        SELECT u.resource_type, SUM(u.quantity), u.unit, r.rate, SUM(u.cost)
        FROM usage u
        JOIN rates r ON u.resource_type = r.resource_type
        WHERE u.project = '$project'
        AND date(u.timestamp) BETWEEN '$start_date' AND '$end_date'
        GROUP BY u.resource_type
    " | while IFS='|' read -r resource qty unit rate cost; do
        printf "%-20s %12.2f %-3s \$%-8.6f \$%.4f\n" "$resource" "$qty" "$unit" "$rate" "$cost"
    done
    echo "─────────────────────────────────────────────────────────────────"
    # Total comes from a single aggregate query. (The original also tried
    # to accumulate it inside the pipeline loop, but that ran in a subshell
    # and the value was discarded.) NULL - no matching usage - is coerced
    # to 0 so the printf and the INSERT below stay valid.
    local total
    total=$(sqlite3 "$COST_DB" "
        SELECT SUM(cost) FROM usage
        WHERE project = '$project'
        AND date(timestamp) BETWEEN '$start_date' AND '$end_date'
    ")
    total=${total:-0}
    printf "%48s \$%.4f\n" "TOTAL:" "$total"
    echo
    # Save invoice
    sqlite3 "$COST_DB" "
        INSERT OR REPLACE INTO invoices (id, project, period_start, period_end, total)
        VALUES ('$invoice_id', '$project', '$start_date', '$end_date', $total)
    "
    # Export to file
    local invoice_file="$COST_DIR/reports/${invoice_id}.txt"
    {
        echo "INVOICE: $invoice_id"
        echo "Project: $project"
        echo "Period: $start_date to $end_date"
        echo "Generated: $(date)"
        echo ""
        echo "Total: \$$total"
    } > "$invoice_file"
    echo "Saved to: $invoice_file"
}
# Cost forecast
# Project future spend by extrapolating the daily average of the last
# 7 days of usage.
# Args: $1 project or "all" (default all), $2 days to project (default 30)
forecast() {
    local project="${1:-all}"
    local days="${2:-30}"
    echo -e "${PINK}=== COST FORECAST ===${RESET}"
    echo "Based on last 7 days, projecting $days days"
    echo
    # The base query already has a WHERE clause, so the project filter must
    # be appended with AND (the original emitted a second WHERE, which is a
    # SQL syntax error whenever a project was given)
    local filter=""
    [ "$project" != "all" ] && filter="AND project = '$project'"
    local daily_avg
    daily_avg=$(sqlite3 "$COST_DB" "
        SELECT SUM(cost) / 7 FROM usage
        WHERE datetime(timestamp, '+7 days') > datetime('now')
        $filter
    ")
    # SUM() is NULL with no usage; treat that as zero so bc gets a number
    daily_avg=${daily_avg:-0}
    local projected
    projected=$(echo "scale=2; $daily_avg * $days" | bc)
    echo "Daily average: \$${daily_avg}"
    echo "Projected ${days}-day cost: \$$projected"
    if [ "$project" != "all" ]; then
        local limit
        limit=$(sqlite3 "$COST_DB" "SELECT monthly_limit FROM budgets WHERE project = '$project'")
        if [ -n "$limit" ]; then
            local pct
            pct=$(echo "scale=1; $projected * 100 / $limit" | bc)
            echo "Budget utilization: ${pct}%"
        fi
    fi
}
# Collect usage from nodes
# SSH into each node and record CPU-hours, resident memory (GB), and disk
# use (GB) into the usage table under the "default" project.
# NOTE(review): the CPU figure comes from cumulative /proc/stat counters
# since boot, so repeated collects re-record running totals rather than
# deltas - confirm this is the intended billing model.
collect() {
echo -e "${PINK}=== COLLECTING USAGE ===${RESET}"
echo
for node in "${ALL_NODES[@]}"; do
echo -n " $node: "
if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
echo "(offline)"
continue
fi
# Get resource usage
# One SSH round trip returns "cpu_hours|mem_gb|disk_gb"
local metrics=$(ssh "$node" "
cpu_hours=\$(cat /proc/stat | awk '/^cpu / {print (\$2+\$3+\$4)/100/3600}')
mem_gb=\$(free -g | awk '/Mem:/ {print \$3}')
disk_gb=\$(df / | awk 'NR==2 {print \$3/1024/1024}')
echo \"\$cpu_hours|\$mem_gb|\$disk_gb\"
" 2>/dev/null)
if [ -n "$metrics" ]; then
local cpu=$(echo "$metrics" | cut -d'|' -f1)
local mem=$(echo "$metrics" | cut -d'|' -f2)
local disk=$(echo "$metrics" | cut -d'|' -f3)
record "cpu" "$cpu" "default" "$node" >/dev/null
record "memory" "$mem" "default" "$node" >/dev/null
record "storage" "$disk" "default" "$node" >/dev/null
echo "collected"
else
echo "failed"
fi
done
}
# Reset monthly budgets
# Zero every budget's running spend counter (intended for month rollover)
reset_budgets() {
sqlite3 "$COST_DB" "UPDATE budgets SET current_spend = 0"
echo -e "${GREEN}Reset all budget counters${RESET}"
}
# Help
# Print the command reference for this script to stdout.
help() {
    printf '%b\n' "${PINK}BlackRoad Cost Tracker${RESET}"
    printf '\n'
    printf '%s\n' "Track resource usage and costs"
    printf '\n'
    printf '%s\n' "Usage Recording:"
    printf '%s\n' " record <resource> <qty> [proj] Record usage"
    printf '%s\n' " record-inference [proj] [in] [out] Record inference"
    printf '%s\n' " collect Collect from nodes"
    printf '\n'
    printf '%s\n' "Rates & Budgets:"
    printf '%s\n' " rates List rates"
    printf '%s\n' " set-rate <res> <rate> [unit] Set rate"
    printf '%s\n' " budget-create <proj> <limit> Create budget"
    printf '%s\n' " budget-check Check budgets"
    printf '\n'
    printf '%s\n' "Reports:"
    printf '%s\n' " current [proj] [day|week|month] Current costs"
    printf '%s\n' " invoice <proj> [start] [end] Generate invoice"
    printf '%s\n' " forecast [proj] [days] Cost forecast"
    printf '\n'
    printf '%s\n' "Examples:"
    printf '%s\n' " $0 record inference 100 myproject"
    printf '%s\n' " $0 budget-create myproject 50"
    printf '%s\n' " $0 invoice myproject 2024-01-01"
}
# Ensure initialized
# First run: create the DB/rates silently so every subcommand below can
# assume they exist
[ -f "$COST_DB" ] || init >/dev/null
# Subcommand dispatch; unknown or missing commands fall through to help
case "${1:-help}" in
init)
init
;;
record)
record "$2" "$3" "$4" "$5"
;;
record-inference)
record_inference "$2" "$3" "$4" "$5"
;;
collect)
collect
;;
rates)
rates
;;
set-rate)
set_rate "$2" "$3" "$4"
;;
budget-create|budget)
budget_create "$2" "$3" "$4"
;;
budget-check|budgets)
budget_check
;;
current|costs)
current "$2" "$3"
;;
invoice)
invoice "$2" "$3" "$4"
;;
forecast)
forecast "$2" "$3"
;;
reset-budgets)
reset_budgets
;;
*)
help
;;
esac

468
scripts/deploy-alert-manager.sh Executable file
View File

@@ -0,0 +1,468 @@
#!/bin/bash
# Deploy Alert Manager for BlackRoad OS
# Wave 10A: Intelligent alerting system
# Ships a self-contained, stdlib-only Python alert service to the octavia
# node and runs it under a user-level systemd unit on port 5700.
set -e
echo "🚨 Deploying Alert Manager to octavia..."
# Create alert manager on octavia
# Everything up to the closing REMOTE marker executes on the remote host;
# the quoted delimiter prevents local variable/command expansion.
ssh octavia << 'REMOTE'
set -e
echo "📁 Creating alert manager directories..."
mkdir -p ~/alert-manager/{alerts,history}
# Create alert manager using Python stdlib
cat > ~/alert-manager/app.py << 'EOF'
#!/usr/bin/env python3
# BlackRoad Alert Manager: evaluates metrics from the local metrics
# service against threshold rules and serves a status dashboard/API.
import http.server
import socketserver
import json
import os
import time
from urllib.request import urlopen, Request
from urllib.error import URLError
from datetime import datetime
from email.mime.text import MIMEText
import smtplib
# HTTP port this service listens on, and where alert state is written
PORT = 5700
ALERTS_DIR = os.path.expanduser('~/alert-manager/alerts')
HISTORY_DIR = os.path.expanduser('~/alert-manager/history')
# Alert rules configuration
# Each rule compares one metric from the metrics service (localhost:5400)
# against a threshold; 'message' is a format string filled at trigger time.
# NOTE(review): the 'service_down' rule keys on 'services', which is not a
# field of the 'system' metrics dict, so the per-metric loop never matches
# it - service outages are handled separately in check_metrics. Confirm
# whether this rule entry is intentional.
ALERT_RULES = {
'cpu_high': {
'metric': 'cpu_percent',
'threshold': 80,
'operator': '>',
'severity': 'warning',
'message': 'CPU usage is high: {value}%'
},
'cpu_critical': {
'metric': 'cpu_percent',
'threshold': 95,
'operator': '>',
'severity': 'critical',
'message': 'CPU usage is critical: {value}%'
},
'memory_high': {
'metric': 'memory_percent',
'threshold': 85,
'operator': '>',
'severity': 'warning',
'message': 'Memory usage is high: {value}%'
},
'memory_critical': {
'metric': 'memory_percent',
'threshold': 95,
'operator': '>',
'severity': 'critical',
'message': 'Memory usage is critical: {value}%'
},
'disk_high': {
'metric': 'disk_percent',
'threshold': 90,
'operator': '>',
'severity': 'warning',
'message': 'Disk usage is high: {value}%'
},
'service_down': {
'metric': 'services',
'threshold': 5,
'operator': '<',
'severity': 'critical',
'message': 'Service down: {service}'
}
}
# In-memory alert state machine: tracks currently-active alerts, clears
# them when their condition resolves, and fans out notifications.
class AlertManager:
def __init__(self):
# active_alerts: rule id -> alert dict with first_seen/last_seen/count
self.active_alerts = {}
# alert_history: in-memory log of every trigger
# NOTE(review): this list grows without bound for the process lifetime;
# the on-disk daily files are the durable record - confirm acceptable
self.alert_history = []
def check_metrics(self):
"""Fetch current metrics and check against rules"""
try:
with urlopen('http://localhost:5400/metrics/json', timeout=2) as response:
metrics = json.loads(response.read())
triggered_alerts = []
# Check system metrics
system = metrics.get('system', {})
for rule_id, rule in ALERT_RULES.items():
if rule['metric'] in system:
value = system[rule['metric']]
if self._evaluate_rule(value, rule['threshold'], rule['operator']):
alert = {
'id': rule_id,
'severity': rule['severity'],
'message': rule['message'].format(value=value),
'value': value,
'threshold': rule['threshold'],
'timestamp': datetime.now().isoformat()
}
triggered_alerts.append(alert)
# Check service health
# One critical alert per unhealthy service when fewer than 5 are up
services = metrics.get('services', {})
healthy_count = sum(1 for v in services.values() if v)
if healthy_count < 5:
for service, status in services.items():
if not status:
alert = {
'id': f'service_{service}_down',
'severity': 'critical',
'message': f'Service down: {service}',
'service': service,
'timestamp': datetime.now().isoformat()
}
triggered_alerts.append(alert)
# Process alerts
for alert in triggered_alerts:
self._handle_alert(alert)
# Clear resolved alerts
self._clear_resolved_alerts(metrics)
return triggered_alerts
# Any failure (metrics service down, bad JSON) is reported as a
# single error pseudo-alert rather than crashing the handler
except Exception as e:
return [{'error': str(e)}]
def _evaluate_rule(self, value, threshold, operator):
"""Evaluate a rule condition"""
if operator == '>':
return value > threshold
elif operator == '<':
return value < threshold
elif operator == '==':
return value == threshold
return False
def _handle_alert(self, alert):
"""Handle a triggered alert"""
alert_id = alert['id']
# Check if alert already active
if alert_id in self.active_alerts:
# Update existing alert (no re-notification on repeats)
self.active_alerts[alert_id]['count'] += 1
self.active_alerts[alert_id]['last_seen'] = alert['timestamp']
else:
# New alert
alert['count'] = 1
alert['first_seen'] = alert['timestamp']
alert['last_seen'] = alert['timestamp']
self.active_alerts[alert_id] = alert
# Send notification for new alerts
self._send_notification(alert)
# Log to history
self._log_to_history(alert)
def _clear_resolved_alerts(self, metrics):
"""Clear alerts that are no longer triggered"""
system = metrics.get('system', {})
resolved = []
for alert_id, alert in list(self.active_alerts.items()):
# Check if condition is still met
should_clear = False
if 'service' in alert:
# Service alert: clear once the service reports healthy again
services = metrics.get('services', {})
if alert['service'] in services and services[alert['service']]:
should_clear = True
else:
# System metric alert: clear when the rule no longer evaluates true
for rule_id, rule in ALERT_RULES.items():
if rule_id == alert_id:
if rule['metric'] in system:
value = system[rule['metric']]
if not self._evaluate_rule(value, rule['threshold'], rule['operator']):
should_clear = True
if should_clear:
resolved.append(alert_id)
del self.active_alerts[alert_id]
# NOTE(review): the returned list is currently ignored by the caller
return resolved
def _send_notification(self, alert):
"""Send notification (webhook or email)"""
# Check for webhook configuration
webhook_url = os.environ.get('ALERT_WEBHOOK_URL')
if webhook_url:
try:
data = json.dumps(alert).encode()
req = Request(webhook_url, data=data, headers={'Content-Type': 'application/json'})
urlopen(req, timeout=5)
# Notification is best-effort: delivery failures are swallowed
except:
pass
def _log_to_history(self, alert):
"""Log alert to history file"""
# One JSON-lines file per day
history_file = os.path.join(HISTORY_DIR, f"alerts_{datetime.now().strftime('%Y%m%d')}.json")
history_entry = {
'timestamp': alert['timestamp'],
'id': alert['id'],
'severity': alert['severity'],
'message': alert['message']
}
self.alert_history.append(history_entry)
# Append to daily log file (best-effort; disk errors are swallowed)
try:
with open(history_file, 'a') as f:
f.write(json.dumps(history_entry) + '\n')
except:
pass
# Single process-wide manager instance shared by all requests
alert_manager = AlertManager()
# HTTP front end: HTML dashboard at /, JSON at /api/alerts, liveness at
# /api/health.
# NOTE(review): metrics are only evaluated when the dashboard is requested;
# there is no background polling loop - confirm that is intended.
class AlertHandler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
if self.path == '/':
self.send_response(200)
self.send_header('Content-type', 'text/html')
self.end_headers()
# Check for new alerts
triggered = alert_manager.check_metrics()
active_count = len(alert_manager.active_alerts)
critical_count = sum(1 for a in alert_manager.active_alerts.values() if a['severity'] == 'critical')
warning_count = sum(1 for a in alert_manager.active_alerts.values() if a['severity'] == 'warning')
html = f'''<!DOCTYPE html>
<html>
<head>
<title>BlackRoad Alert Manager</title>
<meta http-equiv="refresh" content="15">
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
background: #0b0c0e;
color: #d8d9da;
padding: 20px;
}}
.header {{
background: #1f1f20;
padding: 20px;
border-radius: 8px;
margin-bottom: 20px;
}}
.title {{
font-size: 28px;
font-weight: 600;
color: #ff1d6c;
margin-bottom: 10px;
}}
.stats {{
display: grid;
grid-template-columns: repeat(3, 1fr);
gap: 20px;
margin-bottom: 20px;
}}
.stat-card {{
background: #1f1f20;
padding: 16px;
border-radius: 8px;
border-left: 4px solid;
}}
.stat-card.active {{ border-color: #0096FF; }}
.stat-card.critical {{ border-color: #ff1d6c; }}
.stat-card.warning {{ border-color: #f5a623; }}
.stat-value {{
font-size: 32px;
font-weight: 300;
margin-bottom: 4px;
}}
.stat-label {{
font-size: 14px;
color: #9d9fa1;
}}
.alerts-section {{
background: #1f1f20;
padding: 20px;
border-radius: 8px;
}}
.section-title {{
font-size: 18px;
margin-bottom: 16px;
color: #d8d9da;
}}
.alert {{
padding: 12px;
border-radius: 4px;
margin-bottom: 12px;
border-left: 4px solid;
}}
.alert.critical {{
background: #ff1d6c22;
border-color: #ff1d6c;
}}
.alert.warning {{
background: #f5a62322;
border-color: #f5a623;
}}
.alert-header {{
display: flex;
justify-content: space-between;
margin-bottom: 4px;
}}
.alert-severity {{
font-weight: 600;
text-transform: uppercase;
font-size: 12px;
}}
.alert-time {{
font-size: 12px;
color: #9d9fa1;
}}
.alert-message {{
font-size: 14px;
}}
.no-alerts {{
text-align: center;
padding: 40px;
color: #73bf69;
font-size: 18px;
}}
</style>
</head>
<body>
<div class="header">
<div class="title">🚨 Alert Manager</div>
<div style="color: #9d9fa1; font-size: 14px;">Real-time monitoring • Auto-refresh: 15s</div>
</div>
<div class="stats">
<div class="stat-card active">
<div class="stat-value">{active_count}</div>
<div class="stat-label">Active Alerts</div>
</div>
<div class="stat-card critical">
<div class="stat-value">{critical_count}</div>
<div class="stat-label">Critical</div>
</div>
<div class="stat-card warning">
<div class="stat-value">{warning_count}</div>
<div class="stat-label">Warnings</div>
</div>
</div>
<div class="alerts-section">
<div class="section-title">Active Alerts</div>
'''
# Render one card per active alert, or a green all-clear banner
if alert_manager.active_alerts:
for alert_id, alert in alert_manager.active_alerts.items():
severity_class = alert['severity']
html += f'''
<div class="alert {severity_class}">
<div class="alert-header">
<span class="alert-severity">{alert['severity']}</span>
<span class="alert-time">{alert['last_seen']}</span>
</div>
<div class="alert-message">{alert['message']}</div>
<div style="font-size: 12px; color: #9d9fa1; margin-top: 4px;">
Triggered {alert['count']} time(s) • First seen: {alert['first_seen']}
</div>
</div>'''
else:
html += '<div class="no-alerts">✅ All systems healthy - No active alerts</div>'
html += '''
</div>
</body>
</html>'''
self.wfile.write(html.encode())
elif self.path == '/api/alerts':
# JSON view of the current active-alert set (no re-evaluation here)
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
response = json.dumps({
'active_alerts': list(alert_manager.active_alerts.values()),
'count': len(alert_manager.active_alerts)
})
self.wfile.write(response.encode())
elif self.path == '/api/health':
# Liveness probe used by the deploy script
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
response = json.dumps({'status': 'healthy', 'service': 'alert-manager'})
self.wfile.write(response.encode())
else:
self.send_response(404)
self.end_headers()
def log_message(self, format, *args):
# Suppress per-request access logging
pass
# Serve forever on all interfaces
with socketserver.TCPServer(("", PORT), AlertHandler) as httpd:
print(f"Alert Manager running on port {PORT}")
httpd.serve_forever()
EOF
chmod +x ~/alert-manager/app.py
echo "📝 Creating systemd service..."
mkdir -p ~/.config/systemd/user
cat > ~/.config/systemd/user/alert-manager.service << 'SYSTEMD'
[Unit]
Description=BlackRoad Alert Manager
After=network.target
[Service]
Type=simple
WorkingDirectory=%h/alert-manager
ExecStart=/usr/bin/python3 %h/alert-manager/app.py
Restart=always
RestartSec=10
[Install]
WantedBy=default.target
SYSTEMD
echo "🚀 Starting Alert Manager service..."
systemctl --user daemon-reload
systemctl --user enable alert-manager.service
systemctl --user restart alert-manager.service
echo "⏳ Waiting for Alert Manager to start..."
sleep 3
echo "✅ Testing Alert Manager..."
curl -f http://localhost:5700/api/health || echo "⚠️ Health check failed"
echo ""
echo "✅ Alert Manager deployed successfully!"
systemctl --user status alert-manager.service --no-pager | head -10
REMOTE
echo ""
echo "✅ Wave 10A deployment complete!"
echo ""
echo "🚨 Access Alert Manager:"
echo " http://octavia:5700/"
echo ""
echo "📊 Features:"
echo " • Real-time alert monitoring"
echo " • Threshold-based rules"
echo " • Alert history tracking"
echo " • Webhook integration ready"

471
scripts/deploy-backup.sh Executable file
View File

@@ -0,0 +1,471 @@
#!/bin/bash
# Deploy Automated Backup System for BlackRoad OS
# Wave 12A: Disaster recovery and data protection
set -e
echo "💾 Deploying Backup System to octavia..."
# Create backup system on octavia
ssh octavia << 'REMOTE'
set -e
echo "📁 Creating backup system directories..."
mkdir -p ~/backup-system/{backups,logs,scripts}
# Create backup orchestrator using Python stdlib
cat > ~/backup-system/app.py << 'EOF'
#!/usr/bin/env python3
import http.server
import socketserver
import json
import os
import subprocess
import tarfile
import shutil
from datetime import datetime
from pathlib import Path
PORT = 5900
BACKUP_DIR = os.path.expanduser('~/backup-system/backups')
LOGS_DIR = os.path.expanduser('~/backup-system/logs')
class BackupManager:
    """Create, list and prune tar.gz snapshots of BlackRoad service state.

    Snapshots are staged in a temp directory under BACKUP_DIR, packed into
    backup_<type>_<timestamp>.tar.gz, and the staging directory is removed.
    All paths are relative to the deploying user's home on this host.
    """

    def __init__(self):
        # Root directory for finished tarballs and staging dirs.
        self.backup_dir = Path(BACKUP_DIR)
        self.backup_dir.mkdir(parents=True, exist_ok=True)
        # Define what to backup
        # NOTE(review): only 'services' and 'website' are actually consumed by
        # create_backup(); the 'configs' glob list below is never read — the
        # systemd units and the cloudflared config are re-hardcoded inside
        # create_backup(), and /etc/nginx/sites-available is never backed up
        # at all. Confirm intent and either wire these globs in or drop them.
        self.backup_targets = {
            'configs': [
                '~/.config/systemd/user/*.service',
                '~/.cloudflared/config.yml',
                '/etc/nginx/sites-available/*',
            ],
            'services': {
                'tts-api': '~/tts-api',
                'monitor-api': '~/monitoring',
                'load-balancer': '~/load-balancer',
                'fleet-monitor': '~/fleet-monitor',
                'notifications': '~/notifications',
                'metrics': '~/metrics',
                'analytics': '~/analytics',
                'grafana': '~/grafana',
                'alert-manager': '~/alert-manager',
                'log-aggregator': '~/log-aggregator',
            },
            'website': '~/www.blackroad.io',
        }

    def create_backup(self, backup_type='full'):
        """Create a backup snapshot.

        backup_type: free-form label embedded in the archive name (default 'full').
        Returns a result dict with 'success', per-file 'files', accumulated
        'errors', and (on success) tarball path/size. Individual copy failures
        are recorded in 'errors' but do not abort the snapshot.
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        backup_name = f'backup_{backup_type}_{timestamp}'
        backup_path = self.backup_dir / backup_name
        backup_path.mkdir(parents=True, exist_ok=True)
        results = {
            'timestamp': timestamp,
            'type': backup_type,
            'name': backup_name,
            'files': [],
            'errors': []
        }
        try:
            # Backup systemd service files
            config_dir = backup_path / 'configs'
            config_dir.mkdir(parents=True, exist_ok=True)
            systemd_dir = os.path.expanduser('~/.config/systemd/user')
            if os.path.exists(systemd_dir):
                for service_file in Path(systemd_dir).glob('*.service'):
                    try:
                        shutil.copy2(service_file, config_dir)
                        results['files'].append(str(service_file))
                    except Exception as e:
                        results['errors'].append(f"Failed to backup {service_file}: {str(e)}")
            # Backup Cloudflare config
            cf_config = os.path.expanduser('~/.cloudflared/config.yml')
            if os.path.exists(cf_config):
                try:
                    shutil.copy2(cf_config, config_dir / 'cloudflared-config.yml')
                    results['files'].append(cf_config)
                except Exception as e:
                    results['errors'].append(f"Failed to backup Cloudflare config: {str(e)}")
            # Backup service directories
            for service_name, service_path in self.backup_targets['services'].items():
                expanded_path = os.path.expanduser(service_path)
                if os.path.exists(expanded_path):
                    dest = backup_path / 'services' / service_name
                    try:
                        # Skip caches and logs; copytree fails if dest exists,
                        # which cannot happen here (fresh staging dir per run).
                        shutil.copytree(expanded_path, dest,
                                        ignore=shutil.ignore_patterns('__pycache__', '*.pyc', '*.log'))
                        results['files'].append(service_path)
                    except Exception as e:
                        results['errors'].append(f"Failed to backup {service_name}: {str(e)}")
            # Backup website
            website_path = os.path.expanduser(self.backup_targets['website'])
            if os.path.exists(website_path):
                dest = backup_path / 'website'
                try:
                    shutil.copytree(website_path, dest)
                    results['files'].append(self.backup_targets['website'])
                except Exception as e:
                    results['errors'].append(f"Failed to backup website: {str(e)}")
            # Create tarball
            tarball_path = self.backup_dir / f'{backup_name}.tar.gz'
            with tarfile.open(tarball_path, 'w:gz') as tar:
                tar.add(backup_path, arcname=backup_name)
            # Remove temp directory
            # NOTE(review): if tarfile.open/add raises, the staging dir is
            # left behind (the except below fires first) — leaks disk space.
            shutil.rmtree(backup_path)
            # Get backup size
            backup_size = os.path.getsize(tarball_path)
            results['size_bytes'] = backup_size
            results['size_mb'] = round(backup_size / (1024 * 1024), 2)
            results['tarball'] = str(tarball_path)
            results['success'] = True
            # Log backup
            self._log_backup(results)
        except Exception as e:
            results['success'] = False
            results['errors'].append(f"Backup failed: {str(e)}")
        return results

    def list_backups(self):
        """List all available backups, newest first, as dicts of name/path/size/age."""
        backups = []
        for backup_file in sorted(self.backup_dir.glob('backup_*.tar.gz'), reverse=True):
            stat = backup_file.stat()
            backups.append({
                'name': backup_file.name,
                'path': str(backup_file),
                'size_mb': round(stat.st_size / (1024 * 1024), 2),
                'created': datetime.fromtimestamp(stat.st_mtime).isoformat(),
                'age_hours': round((datetime.now().timestamp() - stat.st_mtime) / 3600, 1)
            })
        return backups

    def cleanup_old_backups(self, keep_count=10):
        """Keep only the N most recent backups (by mtime); returns deleted names.

        Deletion failures are deliberately ignored (best-effort pruning).
        """
        backups = sorted(self.backup_dir.glob('backup_*.tar.gz'),
                         key=lambda x: x.stat().st_mtime, reverse=True)
        deleted = []
        for old_backup in backups[keep_count:]:
            try:
                old_backup.unlink()
                deleted.append(old_backup.name)
            except Exception as e:
                pass
        return deleted

    def get_backup_stats(self):
        """Aggregate backup statistics: count, total size (MB), oldest/newest entry."""
        backups = self.list_backups()
        total_size = sum(b['size_mb'] for b in backups)
        return {
            'count': len(backups),
            'total_size_mb': round(total_size, 2),
            'oldest': backups[-1] if backups else None,
            'newest': backups[0] if backups else None
        }

    def _log_backup(self, results):
        """Log backup result as one JSON line in a per-day file under LOGS_DIR."""
        log_file = Path(LOGS_DIR) / f"backup_{datetime.now().strftime('%Y%m%d')}.log"
        log_file.parent.mkdir(parents=True, exist_ok=True)
        with open(log_file, 'a') as f:
            f.write(json.dumps(results) + '\n')
backup_manager = BackupManager()
class BackupHandler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
if self.path == '/':
self.send_response(200)
self.send_header('Content-type', 'text/html')
self.end_headers()
stats = backup_manager.get_backup_stats()
backups = backup_manager.list_backups()
html = f'''<!DOCTYPE html>
<html>
<head>
<title>BlackRoad Backup System</title>
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
background: #0b0c0e;
color: #d8d9da;
padding: 20px;
}}
.header {{
background: #1f1f20;
padding: 20px;
border-radius: 8px;
margin-bottom: 20px;
}}
.title {{
font-size: 28px;
font-weight: 600;
color: #73bf69;
margin-bottom: 10px;
}}
.stats {{
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin-bottom: 20px;
}}
.stat-card {{
background: #1f1f20;
padding: 16px;
border-radius: 8px;
border-left: 4px solid #73bf69;
}}
.stat-value {{
font-size: 32px;
font-weight: 300;
margin-bottom: 4px;
}}
.stat-label {{
font-size: 14px;
color: #9d9fa1;
}}
.actions {{
background: #1f1f20;
padding: 16px;
border-radius: 8px;
margin-bottom: 20px;
}}
.btn {{
background: #73bf69;
color: #0b0c0e;
border: none;
padding: 10px 20px;
border-radius: 4px;
font-weight: 600;
cursor: pointer;
margin-right: 10px;
}}
.btn:hover {{
background: #8cd87a;
}}
.backups-list {{
background: #1f1f20;
padding: 20px;
border-radius: 8px;
}}
.section-title {{
font-size: 18px;
margin-bottom: 16px;
}}
.backup-item {{
background: #252527;
padding: 12px;
border-radius: 4px;
margin-bottom: 12px;
display: flex;
justify-content: space-between;
align-items: center;
}}
.backup-info {{
flex: 1;
}}
.backup-name {{
font-weight: 600;
margin-bottom: 4px;
}}
.backup-meta {{
font-size: 12px;
color: #9d9fa1;
}}
.no-backups {{
text-align: center;
padding: 40px;
color: #9d9fa1;
}}
</style>
</head>
<body>
<div class="header">
<div class="title">💾 Backup System</div>
<div style="color: #9d9fa1; font-size: 14px;">Automated disaster recovery</div>
</div>
<div class="stats">
<div class="stat-card">
<div class="stat-value">{stats['count']}</div>
<div class="stat-label">Total Backups</div>
</div>
<div class="stat-card">
<div class="stat-value">{stats['total_size_mb']} MB</div>
<div class="stat-label">Storage Used</div>
</div>
<div class="stat-card">
<div class="stat-value">{'Recent' if stats.get('newest') else 'None'}</div>
<div class="stat-label">Latest Backup</div>
</div>
</div>
<div class="actions">
<button class="btn" onclick="window.location.href='/api/backup/create'">
Create Backup Now
</button>
<button class="btn" onclick="window.location.href='/api/backup/cleanup'">
Cleanup Old Backups
</button>
</div>
<div class="backups-list">
<div class="section-title">Available Backups</div>
'''
if backups:
for backup in backups:
html += f'''
<div class="backup-item">
<div class="backup-info">
<div class="backup-name">{backup['name']}</div>
<div class="backup-meta">
{backup['size_mb']} MB • Created {backup['age_hours']}h ago
</div>
</div>
</div>'''
else:
html += '<div class="no-backups">No backups yet. Create your first backup!</div>'
html += '''
</div>
</body>
</html>'''
self.wfile.write(html.encode())
elif self.path == '/api/backup/create':
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
result = backup_manager.create_backup()
response = json.dumps(result)
self.wfile.write(response.encode())
elif self.path == '/api/backup/list':
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
backups = backup_manager.list_backups()
response = json.dumps({'backups': backups})
self.wfile.write(response.encode())
elif self.path == '/api/backup/cleanup':
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
deleted = backup_manager.cleanup_old_backups(keep_count=10)
response = json.dumps({'deleted': deleted, 'count': len(deleted)})
self.wfile.write(response.encode())
elif self.path == '/api/health':
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
response = json.dumps({'status': 'healthy', 'service': 'backup-system'})
self.wfile.write(response.encode())
else:
self.send_response(404)
self.end_headers()
def log_message(self, format, *args):
pass
with socketserver.TCPServer(("", PORT), BackupHandler) as httpd:
print(f"Backup System running on port {PORT}")
httpd.serve_forever()
EOF
chmod +x ~/backup-system/app.py
echo "📝 Creating systemd service..."
mkdir -p ~/.config/systemd/user
cat > ~/.config/systemd/user/backup-system.service << 'SYSTEMD'
[Unit]
Description=BlackRoad Backup System
After=network.target
[Service]
Type=simple
WorkingDirectory=%h/backup-system
ExecStart=/usr/bin/python3 %h/backup-system/app.py
Restart=always
RestartSec=10
[Install]
WantedBy=default.target
SYSTEMD
# Create daily backup cron job
cat > ~/backup-system/scripts/daily-backup.sh << 'BACKUP'
#!/bin/bash
# Daily automated backup
# Triggers a full snapshot, then prunes old archives, via the backup-system
# HTTP API on localhost:5900. Output is discarded; failures are silent.
# NOTE(review): this script is generated but never registered with cron or a
# systemd timer anywhere in the deploy script — until it is scheduled (e.g.
# a crontab entry running ~/backup-system/scripts/daily-backup.sh), the
# "daily" backup never runs. Confirm the intended scheduling mechanism.
curl -s http://localhost:5900/api/backup/create > /dev/null
curl -s http://localhost:5900/api/backup/cleanup > /dev/null
BACKUP
chmod +x ~/backup-system/scripts/daily-backup.sh
echo "🚀 Starting Backup System service..."
systemctl --user daemon-reload
systemctl --user enable backup-system.service
systemctl --user restart backup-system.service
echo "⏳ Waiting for Backup System to start..."
sleep 3
echo "✅ Testing Backup System..."
curl -f http://localhost:5900/api/health || echo "⚠️ Health check failed"
echo ""
echo "💾 Creating initial backup..."
curl -s http://localhost:5900/api/backup/create | python3 -m json.tool
echo ""
echo "✅ Backup System deployed successfully!"
systemctl --user status backup-system.service --no-pager | head -10
REMOTE
echo ""
echo "✅ Wave 12A deployment complete!"
echo ""
echo "💾 Access Backup System:"
echo " http://octavia:5900/"
echo ""
echo "📊 Features:"
echo " • Automated configuration backups"
echo " • Service data snapshots"
echo " • One-click backup creation"
echo " • Retention management"
echo " • Backup verification"

332
scripts/deploy-grafana.sh Normal file
View File

@@ -0,0 +1,332 @@
#!/bin/bash
# ============================================================================
# BLACKROAD OS, INC. - PROPRIETARY AND CONFIDENTIAL
# Copyright (c) 2024-2026 BlackRoad OS, Inc. All Rights Reserved.
#
# This code is the intellectual property of BlackRoad OS, Inc.
# AI-assisted development does not transfer ownership to AI providers.
# Unauthorized use, copying, or distribution is prohibited.
# NOT licensed for AI training or data extraction.
# ============================================================================
# Deploy Grafana for BlackRoad OS monitoring
# Wave 8A: Professional dashboards (no external packages needed!)
set -e
echo "🎨 Deploying Grafana to octavia..."
# Create Grafana dashboard using only standard library
ssh octavia << 'REMOTE'
set -e
echo "📁 Creating Grafana directories..."
mkdir -p ~/grafana
# Create Grafana-style dashboard using http.server + urllib (Python standard library only!)
cat > ~/grafana/app.py << 'EOF'
#!/usr/bin/env python3
import http.server
import socketserver
import json
from urllib.request import urlopen
from urllib.error import URLError
from datetime import datetime
PORT = 5600
class GrafanaHandler(http.server.BaseHTTPRequestHandler):
def do_GET(self):
if self.path == '/':
self.send_response(200)
self.send_header('Content-type', 'text/html')
self.end_headers()
try:
# Fetch metrics from our collector
with urlopen('http://localhost:5400/metrics/json', timeout=2) as response:
metrics = json.loads(response.read())
services_healthy = sum(1 for v in metrics['services'].values() if v)
services_total = len(metrics['services'])
# Format uptime
seconds = metrics['uptime_seconds']
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
uptime_formatted = f"{hours}h {minutes}m" if hours > 0 else f"{minutes}m"
# Generate HTML
html = f'''<!DOCTYPE html>
<html>
<head>
<title>BlackRoad Grafana</title>
<meta http-equiv="refresh" content="10">
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
background: #0b0c0e;
color: #d8d9da;
}}
.navbar {{
background: #1f1f20;
padding: 12px 20px;
border-bottom: 1px solid #2d2e30;
display: flex;
align-items: center;
justify-content: space-between;
}}
.logo {{
font-size: 20px;
font-weight: 600;
color: #ff1d6c;
}}
.time {{
color: #9d9fa1;
font-size: 14px;
}}
.container {{
padding: 20px;
max-width: 1400px;
margin: 0 auto;
}}
.dashboard-header {{
margin-bottom: 20px;
}}
.dashboard-title {{
font-size: 28px;
font-weight: 500;
margin-bottom: 5px;
}}
.dashboard-subtitle {{
color: #9d9fa1;
font-size: 14px;
}}
.row {{
display: grid;
grid-template-columns: repeat(auto-fit, minmax(300px, 1fr));
gap: 20px;
margin-bottom: 20px;
}}
.panel {{
background: #1f1f20;
border: 1px solid #2d2e30;
border-radius: 4px;
padding: 16px;
}}
.panel-title {{
font-size: 14px;
font-weight: 500;
margin-bottom: 12px;
color: #d8d9da;
}}
.metric-value {{
font-size: 36px;
font-weight: 300;
margin-bottom: 4px;
}}
.metric-label {{
font-size: 12px;
color: #9d9fa1;
}}
.metric-good {{ color: #73bf69; }}
.metric-warning {{ color: #f5a623; }}
.metric-critical {{ color: #ff1d6c; }}
.status-indicator {{
display: inline-block;
width: 8px;
height: 8px;
border-radius: 50%;
margin-right: 6px;
}}
.status-up {{ background: #73bf69; }}
.status-down {{ background: #ff1d6c; }}
.service-row {{
padding: 8px 0;
border-bottom: 1px solid #2d2e30;
display: flex;
align-items: center;
justify-content: space-between;
}}
.service-name {{
display: flex;
align-items: center;
}}
.graph {{
height: 200px;
background: #161719;
border-radius: 4px;
margin-top: 12px;
position: relative;
overflow: hidden;
}}
.bar {{
position: absolute;
bottom: 0;
left: 0;
background: linear-gradient(180deg, #ff1d6c 0%, #f5a623 100%);
transition: width 0.3s ease;
}}
.refresh-indicator {{
color: #9d9fa1;
font-size: 12px;
text-align: right;
margin-top: 10px;
}}
</style>
</head>
<body>
<div class="navbar">
<div class="logo">⚡ BlackRoad Grafana</div>
<div class="time">{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</div>
</div>
<div class="container">
<div class="dashboard-header">
<div class="dashboard-title">BlackRoad Infrastructure Overview</div>
<div class="dashboard-subtitle">Real-time monitoring • Auto-refresh: 10s</div>
</div>
<div class="row">
<div class="panel">
<div class="panel-title">CPU Usage</div>
<div class="metric-value {'metric-good' if metrics['system']['cpu_percent'] < 50 else 'metric-warning' if metrics['system']['cpu_percent'] < 80 else 'metric-critical'}">
{metrics['system']['cpu_percent']:.1f}%
</div>
<div class="metric-label">Current CPU load</div>
<div class="graph">
<div class="bar" style="width: {metrics['system']['cpu_percent']}%; height: 100%;"></div>
</div>
</div>
<div class="panel">
<div class="panel-title">Memory Usage</div>
<div class="metric-value {'metric-good' if metrics['system']['memory_percent'] < 60 else 'metric-warning' if metrics['system']['memory_percent'] < 85 else 'metric-critical'}">
{metrics['system']['memory_percent']:.1f}%
</div>
<div class="metric-label">{metrics['system']['memory_used_gb']:.2f} GB / {metrics['system']['memory_total_gb']:.2f} GB</div>
<div class="graph">
<div class="bar" style="width: {metrics['system']['memory_percent']}%; height: 100%;"></div>
</div>
</div>
<div class="panel">
<div class="panel-title">Disk Usage</div>
<div class="metric-value {'metric-good' if metrics['system']['disk_percent'] < 70 else 'metric-warning' if metrics['system']['disk_percent'] < 90 else 'metric-critical'}">
{metrics['system']['disk_percent']:.1f}%
</div>
<div class="metric-label">{metrics['system']['disk_used_gb']:.2f} GB / {metrics['system']['disk_total_gb']:.2f} GB</div>
<div class="graph">
<div class="bar" style="width: {metrics['system']['disk_percent']}%; height: 100%;"></div>
</div>
</div>
<div class="panel">
<div class="panel-title">System Uptime</div>
<div class="metric-value metric-good">
{uptime_formatted}
</div>
<div class="metric-label">Metrics collector uptime</div>
</div>
</div>
<div class="panel">
<div class="panel-title">Service Health ({services_healthy}/{services_total})</div>
'''
for service, status in metrics['services'].items():
status_class = 'status-up' if status else 'status-down'
status_text = '<span style="color: #73bf69;">✓ Running</span>' if status else '<span style="color: #ff1d6c;">✗ Down</span>'
html += f'''
<div class="service-row">
<div class="service-name">
<span class="status-indicator {status_class}"></span>
<span>{service}</span>
</div>
<div>{status_text}</div>
</div>'''
html += f'''
</div>
<div class="refresh-indicator">
Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} • Next refresh in 10s
</div>
</div>
</body>
</html>'''
self.wfile.write(html.encode())
except Exception as e:
error_html = f'<h1>Error loading metrics</h1><p>{str(e)}</p>'
self.wfile.write(error_html.encode())
elif self.path == '/api/health':
self.send_response(200)
self.send_header('Content-type', 'application/json')
self.end_headers()
response = json.dumps({"status": "healthy", "service": "grafana"})
self.wfile.write(response.encode())
else:
self.send_response(404)
self.end_headers()
def log_message(self, format, *args):
# Suppress default logging
pass
with socketserver.TCPServer(("", PORT), GrafanaHandler) as httpd:
print(f"Grafana server running on port {PORT}")
httpd.serve_forever()
EOF
chmod +x ~/grafana/app.py
echo "📝 Creating systemd service..."
mkdir -p ~/.config/systemd/user
cat > ~/.config/systemd/user/grafana.service << 'SYSTEMD'
[Unit]
Description=BlackRoad Grafana Dashboard
After=network.target
[Service]
Type=simple
WorkingDirectory=%h/grafana
ExecStart=/usr/bin/python3 %h/grafana/app.py
Restart=always
RestartSec=10
[Install]
WantedBy=default.target
SYSTEMD
echo "🚀 Starting Grafana service..."
systemctl --user daemon-reload
systemctl --user enable grafana.service
systemctl --user restart grafana.service
echo "⏳ Waiting for Grafana to start..."
sleep 3
echo "✅ Testing Grafana..."
curl -f http://localhost:5600/api/health || echo "⚠️ Health check failed"
echo ""
echo "✅ Grafana deployed successfully!"
systemctl --user status grafana.service --no-pager | head -10
REMOTE
echo ""
echo "✅ Wave 8A deployment complete!"
echo ""
echo "🎨 Access Grafana:"
echo " http://octavia:5600/"
echo ""
echo "📊 Features:"
echo " • Real-time system metrics"
echo " • Service health monitoring"
echo " • Auto-refresh (10s)"
echo " • Professional Grafana-style UI"

430
scripts/deploy-log-aggregation.sh Executable file
View File

@@ -0,0 +1,430 @@
#!/bin/bash
# Deploy Log Aggregation System for BlackRoad OS
# Wave 11A: Centralized logging with search
set -e
echo "📜 Deploying Log Aggregation to octavia..."
# Create log aggregation system on octavia
ssh octavia << 'REMOTE'
set -e
echo "📁 Creating log aggregation directories..."
mkdir -p ~/log-aggregator/{logs,cache}
# Create log aggregation service using Python stdlib
cat > ~/log-aggregator/app.py << 'EOF'
#!/usr/bin/env python3
import http.server
import socketserver
import json
import os
import re
import subprocess
from datetime import datetime
from collections import deque
PORT = 5800
LOGS_DIR = os.path.expanduser('~/log-aggregator/logs')
MAX_LOG_ENTRIES = 1000
class LogAggregator:
    """Pulls recent log entries for known BlackRoad services from the
    per-user systemd journal (via `journalctl --user -o json`)."""

    # journald numeric PRIORITY -> human-readable level name.
    _PRIORITY_MAP = {
        '0': 'EMERG', '1': 'ALERT', '2': 'CRIT',
        '3': 'ERROR', '4': 'WARN', '5': 'NOTICE',
        '6': 'INFO', '7': 'DEBUG'
    }
    # Severities counted as "errors" — kept consistent between
    # get_error_count() and get_service_stats().
    _ERROR_LEVELS = ('ERROR', 'CRIT', 'ALERT', 'EMERG')

    def __init__(self):
        # Ring buffer reserved for future streaming use; collect_logs()
        # reads the journal directly and does not populate it.
        self.log_buffer = deque(maxlen=MAX_LOG_ENTRIES)
        self.services = [
            'tts-api',
            'monitor-api',
            'load-balancer',
            'fleet-monitor',
            'notifications',
            'metrics',
            'analytics',
            'grafana',
            'alert-manager'
        ]

    def collect_logs(self, service=None, level=None, limit=100, search=None):
        """Collect logs from systemd journals.

        service: restrict to a single service name (default: all known services)
        level:   keep only entries of this level name, case-insensitive
        limit:   per-service journal read size and overall result cap
        search:  case-insensitive substring filter on the message text
        Returns a newest-first list of dicts with service/message/timestamp/
        priority/unit/level keys. Unreachable journals are silently skipped.
        """
        logs = []
        services_to_check = [service] if service else self.services
        for svc in services_to_check:
            try:
                # Get logs from systemd journal as JSON lines
                cmd = ['journalctl', '--user', '-u', f'{svc}.service', '-n', str(limit), '--no-pager', '-o', 'json']
                result = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
            except Exception:
                # journalctl missing or timed out — skip this service.
                # (Narrowed from a bare `except:` that also swallowed
                # SystemExit/KeyboardInterrupt.)
                continue
            if result.returncode != 0:
                continue
            for line in result.stdout.strip().split('\n'):
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except ValueError:
                    continue
                log_entry = {
                    'service': svc,
                    'message': entry.get('MESSAGE', ''),
                    'timestamp': entry.get('__REALTIME_TIMESTAMP', ''),
                    'priority': entry.get('PRIORITY', '6'),
                    'unit': entry.get('_SYSTEMD_UNIT', '')
                }
                # Convert journald priority number to a level name.
                log_entry['level'] = self._PRIORITY_MAP.get(log_entry['priority'], 'INFO')
                # Filter by level if specified
                if level and log_entry['level'] != level.upper():
                    continue
                # Filter by search term if specified
                if search and search.lower() not in log_entry['message'].lower():
                    continue
                logs.append(log_entry)
        # Sort by timestamp (newest first)
        logs.sort(key=lambda x: x.get('timestamp', ''), reverse=True)
        return logs[:limit]

    def get_error_count(self):
        """Count error-severity entries in recent logs (up to 50 per severity).

        Fix: now covers all journald error severities (ERROR/CRIT/ALERT/EMERG);
        the previous version counted only ERROR and CRIT, disagreeing with
        get_service_stats().
        """
        total = 0
        for lvl in self._ERROR_LEVELS:
            total += len(self.collect_logs(level=lvl, limit=50))
        return total

    def get_service_stats(self):
        """Per-service log statistics over the last 100 entries:
        {service: {'total': n, 'errors': m}}."""
        stats = {}
        for service in self.services:
            logs = self.collect_logs(service=service, limit=100)
            stats[service] = {
                'total': len(logs),
                'errors': len([l for l in logs if l['level'] in self._ERROR_LEVELS])
            }
        return stats
log_aggregator = LogAggregator()
class LogHandler(http.server.BaseHTTPRequestHandler):
    """HTTP handler: HTML log dashboard at '/' (with optional query filters)
    and a small JSON API under /api/*. Reads from the module-level
    `log_aggregator` instance."""

    def do_GET(self):
        # Dashboard, with optional ?service=&level=&search= filters.
        # BUG FIX: the original condition was `self.path == '/'`, so the
        # filter URLs emitted by the page's own dropdowns ('/?service=x',
        # '/?level=y') fell through to the 404 branch and the query-string
        # parsing below was dead code. Accept the query form as well.
        if self.path == '/' or self.path.startswith('/?'):
            self.send_response(200)
            self.send_header('Content-type', 'text/html')
            self.end_headers()
            # Parse query parameters (minimal parser; values are not URL-decoded)
            query_parts = self.path.split('?')
            params = {}
            if len(query_parts) > 1:
                for param in query_parts[1].split('&'):
                    if '=' in param:
                        key, value = param.split('=', 1)
                        params[key] = value
            # Empty strings ('/?service=') are falsy -> treated as "no filter".
            service = params.get('service')
            level = params.get('level')
            search = params.get('search')
            # Collect logs
            logs = log_aggregator.collect_logs(
                service=service,
                level=level,
                limit=100,
                search=search
            )
            # Get stats
            stats = log_aggregator.get_service_stats()
            total_errors = sum(s['errors'] for s in stats.values())
            html = f'''<!DOCTYPE html>
<html>
<head>
<title>BlackRoad Log Aggregator</title>
<meta http-equiv="refresh" content="30">
<style>
* {{ margin: 0; padding: 0; box-sizing: border-box; }}
body {{
font-family: 'Monaco', 'Courier New', monospace;
background: #0b0c0e;
color: #d8d9da;
padding: 20px;
}}
.header {{
background: #1f1f20;
padding: 20px;
border-radius: 8px;
margin-bottom: 20px;
}}
.title {{
font-size: 28px;
font-weight: 600;
color: #0096FF;
margin-bottom: 10px;
}}
.filters {{
background: #1f1f20;
padding: 16px;
border-radius: 8px;
margin-bottom: 20px;
display: flex;
gap: 12px;
flex-wrap: wrap;
}}
.filter-group {{
display: flex;
flex-direction: column;
gap: 4px;
}}
.filter-label {{
font-size: 12px;
color: #9d9fa1;
}}
select, input {{
background: #0b0c0e;
border: 1px solid #2d2e30;
color: #d8d9da;
padding: 6px 12px;
border-radius: 4px;
font-family: inherit;
}}
.stats {{
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 12px;
margin-bottom: 20px;
}}
.stat-card {{
background: #1f1f20;
padding: 12px;
border-radius: 8px;
border-left: 3px solid #0096FF;
}}
.stat-card.errors {{ border-color: #ff1d6c; }}
.stat-service {{
font-size: 12px;
color: #9d9fa1;
margin-bottom: 4px;
}}
.stat-count {{
font-size: 20px;
font-weight: 300;
}}
.logs-container {{
background: #1f1f20;
border-radius: 8px;
padding: 16px;
}}
.log-entry {{
font-family: 'Monaco', 'Courier New', monospace;
font-size: 13px;
padding: 8px 12px;
border-bottom: 1px solid #2d2e30;
display: flex;
gap: 12px;
}}
.log-entry:hover {{
background: #252527;
}}
.log-timestamp {{
color: #9d9fa1;
white-space: nowrap;
}}
.log-level {{
font-weight: 600;
width: 60px;
flex-shrink: 0;
}}
.log-level.INFO {{ color: #0096FF; }}
.log-level.WARN {{ color: #f5a623; }}
.log-level.ERROR {{ color: #ff1d6c; }}
.log-level.CRIT {{ color: #ff1d6c; font-weight: 700; }}
.log-service {{
color: #73bf69;
width: 120px;
flex-shrink: 0;
}}
.log-message {{
flex: 1;
word-break: break-word;
}}
.no-logs {{
text-align: center;
padding: 40px;
color: #9d9fa1;
}}
</style>
</head>
<body>
<div class="header">
<div class="title">📜 Log Aggregator</div>
<div style="color: #9d9fa1; font-size: 14px;">Centralized logging • Auto-refresh: 30s</div>
</div>
<div class="filters">
<div class="filter-group">
<label class="filter-label">Service</label>
<select onchange="window.location.href='/?service='+this.value">
<option value="">All Services</option>
<option value="tts-api" {'selected' if service == 'tts-api' else ''}>TTS API</option>
<option value="monitor-api" {'selected' if service == 'monitor-api' else ''}>Monitor API</option>
<option value="load-balancer" {'selected' if service == 'load-balancer' else ''}>Load Balancer</option>
<option value="fleet-monitor" {'selected' if service == 'fleet-monitor' else ''}>Fleet Monitor</option>
<option value="grafana" {'selected' if service == 'grafana' else ''}>Grafana</option>
<option value="alert-manager" {'selected' if service == 'alert-manager' else ''}>Alert Manager</option>
</select>
</div>
<div class="filter-group">
<label class="filter-label">Level</label>
<select onchange="window.location.href='/?level='+this.value">
<option value="">All Levels</option>
<option value="ERROR" {'selected' if level == 'ERROR' else ''}>ERROR</option>
<option value="WARN" {'selected' if level == 'WARN' else ''}>WARN</option>
<option value="INFO" {'selected' if level == 'INFO' else ''}>INFO</option>
</select>
</div>
</div>
<div class="stats">
<div class="stat-card errors">
<div class="stat-service">Total Errors</div>
<div class="stat-count">{total_errors}</div>
</div>
'''
            # Per-service counter cards. Loop variable renamed so it no
            # longer shadows the `service` filter variable above.
            for svc_name, stat in stats.items():
                html += f'''
<div class="stat-card">
<div class="stat-service">{svc_name}</div>
<div class="stat-count">{stat['total']} logs</div>
</div>'''
            html += '''
</div>
<div class="logs-container">
'''
            if logs:
                for log in logs:
                    # Format timestamp
                    try:
                        ts = int(log['timestamp']) / 1000000  # Convert microseconds to seconds
                        dt = datetime.fromtimestamp(ts)
                        timestamp = dt.strftime('%H:%M:%S')
                    except (ValueError, TypeError, OSError, OverflowError):
                        # Missing/garbled __REALTIME_TIMESTAMP (narrowed from
                        # a bare `except:`).
                        timestamp = 'N/A'
                    html += f'''
<div class="log-entry">
<span class="log-timestamp">{timestamp}</span>
<span class="log-level {log['level']}">{log['level']}</span>
<span class="log-service">{log['service']}</span>
<span class="log-message">{log['message']}</span>
</div>'''
            else:
                html += '<div class="no-logs">No logs found</div>'
            html += '''
</div>
</body>
</html>'''
            self.wfile.write(html.encode())
        elif self.path.startswith('/api/logs'):
            # JSON: newest 100 entries across all services.
            self.send_response(200)
            self.send_header('Content-type', 'application/json')
            self.end_headers()
            logs = log_aggregator.collect_logs(limit=100)
            response = json.dumps({'logs': logs, 'count': len(logs)})
            self.wfile.write(response.encode())
        elif self.path == '/api/stats':
            # JSON: per-service totals/error counts.
            self.send_response(200)
            self.send_header('Content-type', 'application/json')
            self.end_headers()
            stats = log_aggregator.get_service_stats()
            response = json.dumps(stats)
            self.wfile.write(response.encode())
        elif self.path == '/api/health':
            # Liveness probe used by the deploy script's curl check.
            self.send_response(200)
            self.send_header('Content-type', 'application/json')
            self.end_headers()
            response = json.dumps({'status': 'healthy', 'service': 'log-aggregator'})
            self.wfile.write(response.encode())
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, format, *args):
        # Silence the default per-request stderr logging.
        pass
with socketserver.TCPServer(("", PORT), LogHandler) as httpd:
print(f"Log Aggregator running on port {PORT}")
httpd.serve_forever()
EOF
chmod +x ~/log-aggregator/app.py
echo "📝 Creating systemd service..."
mkdir -p ~/.config/systemd/user
cat > ~/.config/systemd/user/log-aggregator.service << 'SYSTEMD'
[Unit]
Description=BlackRoad Log Aggregator
After=network.target
[Service]
Type=simple
WorkingDirectory=%h/log-aggregator
ExecStart=/usr/bin/python3 %h/log-aggregator/app.py
Restart=always
RestartSec=10
[Install]
WantedBy=default.target
SYSTEMD
echo "🚀 Starting Log Aggregator service..."
systemctl --user daemon-reload
systemctl --user enable log-aggregator.service
systemctl --user restart log-aggregator.service
echo "⏳ Waiting for Log Aggregator to start..."
sleep 3
echo "✅ Testing Log Aggregator..."
curl -f http://localhost:5800/api/health || echo "⚠️ Health check failed"
echo ""
echo "✅ Log Aggregator deployed successfully!"
systemctl --user status log-aggregator.service --no-pager | head -10
REMOTE
echo ""
echo "✅ Wave 11A deployment complete!"
echo ""
echo "📜 Access Log Aggregator:"
echo " http://octavia:5800/"
echo ""
echo "📊 Features:"
echo " • Centralized logging from all services"
echo " • Real-time log streaming"
echo " • Filter by service and level"
echo " • Search capability"
echo " • Error tracking"

423
scripts/deploy-pipeline.sh Normal file
View File

@@ -0,0 +1,423 @@
#!/bin/bash
# BlackRoad Deployment Pipeline
# Automated deployment system for the cluster
# Agent: Icarus (b3e01bd9)
PINK='\033[38;5;205m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
CYAN='\033[0;36m'
RESET='\033[0m'
DEPLOY_DIR="$HOME/.blackroad/deployments"
ALL_NODES=("lucidia" "cecilia" "octavia" "aria" "alice")
# Deployment strategies
STRATEGIES=("rolling" "blue-green" "canary" "all-at-once")
# Initialize
# One-time setup: create the deployment working tree under $DEPLOY_DIR.
init() {
    local sub
    for sub in releases rollbacks logs; do
        mkdir -p "$DEPLOY_DIR/$sub"
    done
    echo -e "${GREEN}Deployment pipeline initialized${RESET}"
}
# Pre-deployment checks
# Pre-deployment checks: SSH reachability, free disk space, and load average.
# Args: node names (defaults to ALL_NODES).
# Returns 0 only when every node passes.
preflight() {
    local nodes=("$@")
    [ ${#nodes[@]} -eq 0 ] && nodes=("${ALL_NODES[@]}")
    echo -e "${PINK}=== PREFLIGHT CHECKS ===${RESET}"
    echo
    local passed=0
    local failed=0
    local node disk load
    for node in "${nodes[@]}"; do
        echo -n " $node: "
        # Everything below needs SSH, so bail out early if unreachable.
        if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
            echo -e "${RED}UNREACHABLE${RESET}"
            ((failed++))
            continue
        fi
        # Free-disk percentage of /. Default to 0 when the probe prints
        # nothing, so the numeric comparison cannot abort on an empty string.
        disk=$(ssh "$node" "df / | awk 'NR==2 {print 100-\$5}'" 2>/dev/null)
        disk=${disk:-0}
        if [ "$disk" -lt 10 ]; then
            echo -e "${RED}LOW DISK (${disk}%)${RESET}"
            ((failed++))
            continue
        fi
        # 1-minute load average; warn (but do not fail) above 10.
        load=$(ssh "$node" "cat /proc/loadavg | awk '{print \$1}'" 2>/dev/null)
        load=${load:-0}
        if [ "$(echo "$load > 10" | bc -l 2>/dev/null)" = "1" ]; then
            echo -e "${YELLOW}HIGH LOAD ($load)${RESET}"
        fi
        echo -e "${GREEN}READY${RESET} (disk: ${disk}% free, load: $load)"
        ((passed++))
    done
    echo
    echo "Result: $passed passed, $failed failed"
    [ "$failed" -eq 0 ]
}
# Deploy to single node
# Copy an artifact to one node, keeping a timestamped backup of the target.
# $1 = node, $2 = artifact (file or directory), $3 = remote target directory.
# Returns 0 on success, 1 on unreachable node or transfer failure.
deploy_node() {
    local node="$1"
    local artifact="$2"
    local target="$3"
    echo -n " $node: "
    if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
        echo -e "${RED}unreachable${RESET}"
        return 1
    fi
    # Backup for rollback(). NOTE: $(date +%s) expands locally, before ssh runs.
    ssh "$node" "[ -d '$target' ] && cp -r '$target' '$target.bak.$(date +%s)'" 2>/dev/null
    # Capture the scp status explicitly instead of relying on a trailing $? test.
    local rc
    if [ -d "$artifact" ]; then
        scp -r "$artifact"/* "$node:$target/" >/dev/null 2>&1; rc=$?
    else
        scp "$artifact" "$node:$target/" >/dev/null 2>&1; rc=$?
    fi
    if [ "$rc" -eq 0 ]; then
        echo -e "${GREEN}deployed${RESET}"
        return 0
    else
        echo -e "${RED}failed${RESET}"
        return 1
    fi
}
# Rolling deployment
# Rolling deployment: push the artifact to one node at a time, halting on the
# first failure so a bad build cannot spread across the cluster.
# $1 = artifact path, $2 = remote target dir, $3.. = nodes (default ALL_NODES).
# Appends a JSON record to $DEPLOY_DIR/history.jsonl and a per-run log file.
deploy_rolling() {
local artifact="$1"
local target="$2"
local nodes=("${@:3}")
[ ${#nodes[@]} -eq 0 ] && nodes=("${ALL_NODES[@]}")
echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
echo -e "${PINK}║ 🚀 ROLLING DEPLOYMENT ║${RESET}"
echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
echo
echo "Artifact: $artifact"
echo "Target: $target"
echo "Nodes: ${nodes[*]}"
echo
# Deployment id doubles as the log-file suffix.
local deploy_id=$(date +%Y%m%d_%H%M%S)
local log_file="$DEPLOY_DIR/logs/deploy_$deploy_id.log"
local success=0
local failed=0
for node in "${nodes[@]}"; do
echo "$(date -Iseconds) Deploying to $node..." >> "$log_file"
if deploy_node "$node" "$artifact" "$target"; then
((success++))
echo "$(date -Iseconds) $node: SUCCESS" >> "$log_file"
else
((failed++))
echo "$(date -Iseconds) $node: FAILED" >> "$log_file"
# Abort on first failure in rolling deployment
echo -e "${RED}Deployment halted due to failure${RESET}"
break
fi
# Wait between nodes
sleep 2
done
echo
echo "Result: $success success, $failed failed"
echo "Log: $log_file"
# Record deployment
echo "{\"id\":\"$deploy_id\",\"artifact\":\"$artifact\",\"target\":\"$target\",\"strategy\":\"rolling\",\"success\":$success,\"failed\":$failed,\"timestamp\":\"$(date -Iseconds)\"}" >> "$DEPLOY_DIR/history.jsonl"
}
# Blue-green deployment
# Blue-green deployment: deploy to the "green" half of the fleet first, then,
# after interactive confirmation, update the "blue" half.
# $1 = artifact path, $2 = remote target dir.
# NOTE(review): the blue/green split is a fixed slice of ALL_NODES (first two
# vs the rest), and the "traffic switch" is only an echo placeholder — confirm
# against the real load balancer before relying on this in production.
deploy_blue_green() {
local artifact="$1"
local target="$2"
echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
echo -e "${PINK}║ 🔵🟢 BLUE-GREEN DEPLOYMENT ║${RESET}"
echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
echo
# Split nodes into blue and green
local blue_nodes=("${ALL_NODES[@]:0:2}")
local green_nodes=("${ALL_NODES[@]:2}")
echo "Blue nodes (current): ${blue_nodes[*]}"
echo "Green nodes (new): ${green_nodes[*]}"
echo
# Deploy to green
echo -e "${GREEN}Deploying to green nodes...${RESET}"
for node in "${green_nodes[@]}"; do
deploy_node "$node" "$artifact" "$target"
done
echo
echo -n "Verify green deployment and switch traffic? [y/N] "
read -r confirm
if [[ "$confirm" =~ ^[Yy] ]]; then
echo -e "${BLUE}Switching traffic to green...${RESET}"
# In production, this would update load balancer
echo -e "${GREEN}Traffic switched${RESET}"
# Now update blue
echo -e "${BLUE}Updating blue nodes...${RESET}"
for node in "${blue_nodes[@]}"; do
deploy_node "$node" "$artifact" "$target"
done
else
echo "Deployment cancelled"
fi
}
# Canary deployment
# Canary deployment: push to a single canary node, then interactively confirm
# before rolling out to the rest of the fleet (or roll the canary back).
# $1 = artifact path, $2 = remote target dir, $3 = canary percentage (default 20).
# NOTE(review): canary_percent is only displayed — the canary is always exactly
# one node (ALL_NODES[0]) regardless of the value; confirm intended behavior.
deploy_canary() {
local artifact="$1"
local target="$2"
local canary_percent="${3:-20}"
echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
echo -e "${PINK}║ 🐤 CANARY DEPLOYMENT ║${RESET}"
echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
echo
echo "Canary percentage: ${canary_percent}%"
echo
# Select canary node (first node)
local canary_node="${ALL_NODES[0]}"
local remaining_nodes=("${ALL_NODES[@]:1}")
echo "Canary node: $canary_node"
echo "Remaining: ${remaining_nodes[*]}"
echo
# Deploy to canary
echo -e "${YELLOW}Deploying to canary...${RESET}"
deploy_node "$canary_node" "$artifact" "$target"
echo
echo "Monitor the canary deployment."
echo -n "Proceed with full rollout? [y/N] "
read -r confirm
if [[ "$confirm" =~ ^[Yy] ]]; then
echo -e "${GREEN}Rolling out to remaining nodes...${RESET}"
for node in "${remaining_nodes[@]}"; do
deploy_node "$node" "$artifact" "$target"
sleep 1
done
echo -e "${GREEN}Full rollout complete${RESET}"
else
echo -e "${YELLOW}Rolling back canary...${RESET}"
rollback "$canary_node"
fi
}
# Rollback
# Restore the most recent '*.bak.<epoch>' backup on one node or all nodes.
# $1 = node name, or "all"/omitted for the whole fleet.
# NOTE(review): finds backups by parsing remote `ls -1t /opt/*.bak.*`, which
# assumes backups live under /opt and that filenames contain no whitespace —
# fragile; confirm against deploy_node's backup convention before trusting.
rollback() {
local node="${1:-all}"
echo -e "${PINK}=== ROLLBACK ===${RESET}"
echo
local targets=("${ALL_NODES[@]}")
[ "$node" != "all" ] && targets=("$node")
for n in "${targets[@]}"; do
echo -n " $n: "
if ! ssh -o ConnectTimeout=3 "$n" "echo ok" >/dev/null 2>&1; then
echo -e "${YELLOW}offline${RESET}"
continue
fi
# Find latest backup
local backup=$(ssh "$n" "ls -1t /opt/*.bak.* 2>/dev/null | head -1")
if [ -n "$backup" ]; then
# Strip the '.bak.<epoch>' suffix to recover the original path,
# then replace the current copy with the backup (destructive).
local original="${backup%.bak.*}"
ssh "$n" "rm -rf '$original' && mv '$backup' '$original'"
echo -e "${GREEN}restored${RESET}"
else
echo -e "${YELLOW}no backup found${RESET}"
fi
done
}
# Run post-deploy hooks
# Execute the per-stage hook (/opt/blackroad/hooks/<stage>.sh) on each node
# where that file exists. $1 = stage name, $2.. = nodes (default ALL_NODES).
run_hooks() {
    local stage="$1"
    shift
    local nodes=("$@")
    [ ${#nodes[@]} -eq 0 ] && nodes=("${ALL_NODES[@]}")
    echo -e "${BLUE}Running $stage hooks...${RESET}"
    local node hook_file
    hook_file="/opt/blackroad/hooks/$stage.sh"
    for node in "${nodes[@]}"; do
        if ssh "$node" "[ -f '$hook_file' ]" 2>/dev/null; then
            echo " $node: executing hook"
            ssh "$node" "bash '$hook_file'" 2>/dev/null
        fi
    done
}
# Health check after deployment
# Post-deploy verification: SSH reachability plus docker-container count and
# Ollama API responsiveness on each node. Args: nodes (default ALL_NODES).
healthcheck() {
    local nodes=("$@")
    [ ${#nodes[@]} -eq 0 ] && nodes=("${ALL_NODES[@]}")
    echo -e "${PINK}=== POST-DEPLOY HEALTH CHECK ===${RESET}"
    echo
    local healthy=0
    local unhealthy=0
    local node docker_ok ollama_ok
    for node in "${nodes[@]}"; do
        echo -n " $node: "
        if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
            echo -e "${RED}UNREACHABLE${RESET}"
            ((unhealthy++))
            continue
        fi
        # Default both probes to 0 when the remote command prints nothing,
        # so the numeric [ -gt ] test below cannot error on an empty string.
        docker_ok=$(ssh "$node" "docker ps -q | wc -l" 2>/dev/null)
        docker_ok=${docker_ok:-0}
        ollama_ok=$(ssh "$node" "curl -s http://localhost:11434/api/tags >/dev/null && echo 1 || echo 0" 2>/dev/null)
        ollama_ok=${ollama_ok:-0}
        if [ "$docker_ok" -gt 0 ] && [ "$ollama_ok" = "1" ]; then
            echo -e "${GREEN}HEALTHY${RESET} (docker: $docker_ok, ollama: up)"
            ((healthy++))
        else
            echo -e "${YELLOW}DEGRADED${RESET} (docker: $docker_ok, ollama: $ollama_ok)"
            ((unhealthy++))
        fi
    done
    echo
    echo "Result: $healthy healthy, $unhealthy unhealthy"
}
# Deployment history
# Print the last N deployment records (default 10) from history.jsonl.
# Uses a single jq invocation per run instead of five per record.
history() {
    local lines="${1:-10}"
    echo -e "${PINK}=== DEPLOYMENT HISTORY ===${RESET}"
    echo
    if [ -f "$DEPLOY_DIR/history.jsonl" ]; then
        # jq string interpolation renders each JSON record in one pass,
        # producing the same " id: artifact (strategy) - S✓ F✗" line.
        tail -n "$lines" "$DEPLOY_DIR/history.jsonl" \
            | jq -r '" \(.id): \(.artifact) (\(.strategy)) - \(.success)✓ \(.failed)✗"'
    else
        echo "No deployment history"
    fi
}
# Status
# Summarize deployment activity (total count and last timestamp from
# history.jsonl) and then run a full post-deploy health check.
status() {
echo -e "${PINK}=== DEPLOYMENT STATUS ===${RESET}"
echo
# Both probes tolerate a missing history file: count falls back to 0 and
# the timestamp to "never" via the jq // alternative operator.
local total=$(wc -l < "$DEPLOY_DIR/history.jsonl" 2>/dev/null || echo 0)
local last=$(tail -1 "$DEPLOY_DIR/history.jsonl" 2>/dev/null | jq -r '.timestamp // "never"')
echo "Total deployments: $total"
echo "Last deployment: $last"
echo
healthcheck
}
# Help
# Print CLI usage for the deployment pipeline.
help() {
    echo -e "${PINK}BlackRoad Deployment Pipeline${RESET}"
    echo
    cat <<EOF
Automated deployment system for the cluster

Commands:
 preflight [nodes] Pre-deployment checks
 rolling <artifact> <target> Rolling deployment
 blue-green <art> <target> Blue-green deployment
 canary <art> <target> [%] Canary deployment
 rollback [node|all] Rollback deployment
 healthcheck [nodes] Post-deploy health check
 history [lines] Deployment history
 status Current status

Strategies: ${STRATEGIES[*]}

Examples:
 $0 preflight
 $0 rolling ./dist /opt/app
 $0 canary ./dist /opt/app 10
 $0 rollback
EOF
}
# Ensure initialized
# (first run creates $DEPLOY_DIR silently; init output is suppressed)
[ -d "$DEPLOY_DIR" ] || init >/dev/null
# CLI dispatch: the first argument selects the command; defaults to help.
case "${1:-help}" in
init)
init
;;
preflight|check)
shift
preflight "$@"
;;
rolling)
# $2 = artifact, $3 = target, remaining args = node list
deploy_rolling "$2" "$3" "${@:4}"
;;
blue-green|bluegreen)
deploy_blue_green "$2" "$3"
;;
canary)
deploy_canary "$2" "$3" "$4"
;;
rollback|revert)
rollback "$2"
;;
healthcheck|health)
shift
healthcheck "$@"
;;
hooks)
# $2 = stage name, remaining args = node list
run_hooks "$2" "${@:3}"
;;
history)
history "$2"
;;
status)
status
;;
*)
help
;;
esac

466
scripts/fleet-enhancer.sh Normal file
View File

@@ -0,0 +1,466 @@
#!/bin/bash
# ============================================================================
# BLACKROAD OS, INC. - PROPRIETARY AND CONFIDENTIAL
# Copyright (c) 2024-2026 BlackRoad OS, Inc. All Rights Reserved.
#
# This code is the intellectual property of BlackRoad OS, Inc.
# AI-assisted development does not transfer ownership to AI providers.
# Unauthorized use, copying, or distribution is prohibited.
# NOT licensed for AI training or data extraction.
# ============================================================================
# BlackRoad Fleet OS Enhancer
# Deploys CECE OS and enhancements across all Pi devices
# Usage: ./blackroad-fleet-os-enhancer.sh [command] [target]
set -e
# BlackRoad Colors
PINK='\033[38;5;205m'
AMBER='\033[38;5;214m'
BLUE='\033[38;5;69m'
VIOLET='\033[38;5;135m'
GREEN='\033[38;5;82m'
RED='\033[38;5;196m'
RESET='\033[0m'
# Device Fleet Configuration (space-separated: name:local_ip:ts_ip:role)
DEVICES="
cecilia:192.168.4.89:100.72.180.98:primary_ai
lucidia:192.168.4.81:100.83.149.86:inference
octavia:192.168.4.38:100.66.235.47:multiarm
alice:192.168.4.49:100.77.210.18:worker
aria:192.168.4.82:100.109.14.17:harmony
"
CECE_OS_DIR="$HOME/cece-os"
FLEET_LOG="$HOME/.blackroad/fleet-os-enhancer.log"
mkdir -p "$(dirname "$FLEET_LOG")"
# ── Output helpers ──────────────────────────────────────────────────────────
# Draw the BlackRoad banner box.
banner() {
    printf '%b\n' "${PINK}╔════════════════════════════════════════════════════════════╗${RESET}"
    printf '%b\n' "${PINK}║${RESET} ${AMBER}🖤🛣️ BLACKROAD FLEET OS ENHANCER 🖤🛣️${RESET} ${PINK}║${RESET}"
    printf '%b\n' "${PINK}╚════════════════════════════════════════════════════════════╝${RESET}"
    printf '\n'
}
# Append a timestamped line to FLEET_LOG and echo it to the console as [INFO].
log() {
    local ts
    ts=$(date '+%Y-%m-%d %H:%M:%S')
    printf '[%s] %s\n' "$ts" "$1" >> "$FLEET_LOG"
    printf '%b\n' "${BLUE}[INFO]${RESET} $1"
}
# Green checkmark line.
success() {
    printf '%b\n' "${GREEN}✅ $1${RESET}"
}
# Red cross line.
error() {
    printf '%b\n' "${RED}❌ $1${RESET}"
}
# Look up a device record ("name:local_ip:ts_ip:role") in DEVICES by name.
# Prints the first matching line (empty when unknown); always returns 0.
get_device_info() {
    grep -m 1 -- "^${1}:" <<< "$DEVICES" || true
}
# Resolve a device name to a reachable IP, preferring the LAN address and
# falling back to the Tailscale address. Prints the chosen IP on success;
# returns 1 when neither address answers a single 2s-timeout ping.
check_device() {
    local name="$1"
    local info local_ip ts_ip
    # '|| true' keeps the original local-masked semantics under set -e.
    info=$(get_device_info "$name") || true
    local_ip=$(echo "$info" | cut -d: -f2)
    ts_ip=$(echo "$info" | cut -d: -f3)
    # LAN first (fastest path). The original's '&>/dev/null 2>&1' was
    # redundant: '&>' already redirects both stdout and stderr.
    if ping -c 1 -W 2 "$local_ip" &>/dev/null; then
        echo "$local_ip"
        return 0
    fi
    # Fall back to Tailscale.
    if ping -c 1 -W 2 "$ts_ip" &>/dev/null; then
        echo "$ts_ip"
        return 0
    fi
    return 1
}
# Print a table of every device with its reachable IP, role, and ONLINE/OFFLINE
# state. Reads the DEVICES records; probes each with check_device.
fleet_status() {
    echo -e "${VIOLET}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
    echo -e "${AMBER}📡 FLEET STATUS${RESET}"
    echo -e "${VIOLET}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
    printf "%-12s %-16s %-20s %-10s\n" "DEVICE" "IP" "ROLE" "STATUS"
    echo "─────────────────────────────────────────────────────────────"
    # Iterate with a here-string so the loop runs in the current shell.
    # The previous online/offline counters were dead code (never incremented,
    # and would have been lost to the pipe subshell anyway) — removed.
    local name local_ip ts_ip role reachable_ip
    while IFS=: read -r name local_ip ts_ip role; do
        [[ -z "$name" ]] && continue
        if reachable_ip=$(check_device "$name" 2>/dev/null); then
            printf "%-12s %-16s %-20s ${GREEN}%-10s${RESET}\n" "$name" "$reachable_ip" "$role" "ONLINE"
        else
            printf "%-12s %-16s %-20s ${RED}%-10s${RESET}\n" "$name" "$local_ip" "$role" "OFFLINE"
        fi
    done <<< "$DEVICES"
    echo ""
}
# Emit (to stdout) a self-contained CECE OS installer script for a Pi.
# Everything between the quoted 'INSTALLER' heredoc markers is the literal
# payload written to the target device, so it is reproduced verbatim: it
# installs a heartbeat daemon, a network-status tool, and the `cece` CLI,
# adds ~/.local/bin to PATH, and starts the heartbeat in the background.
generate_cece_installer() {
cat << 'INSTALLER'
#!/bin/bash
# CECE OS Installer for Raspberry Pi
# Auto-generated by BlackRoad Fleet OS Enhancer
set -e
CECE_HOME="$HOME/.cece-os"
CECE_BIN="$HOME/.local/bin"
echo "🖤 Installing CECE OS..."
# Create directories
mkdir -p "$CECE_HOME"/{apps,heart,mind,soul,memories,dreams,net}
mkdir -p "$CECE_BIN"
# Install heartbeat daemon
cat > "$CECE_HOME/heart/heartbeat.sh" << 'HB'
#!/bin/bash
HEARTBEAT_FILE="$HOME/.cece-os/heart/pulse.json"
while true; do
cat > "$HEARTBEAT_FILE" << EOF
{
"timestamp": "$(date -Iseconds)",
"hostname": "$(hostname)",
"uptime": "$(uptime -p 2>/dev/null || uptime)",
"load": "$(cat /proc/loadavg 2>/dev/null | cut -d' ' -f1-3)",
"memory_free": "$(free -h 2>/dev/null | awk '/^Mem:/ {print $4}' || echo 'N/A')",
"disk_free": "$(df -h / 2>/dev/null | awk 'NR==2 {print $4}')",
"temperature": "$(vcgencmd measure_temp 2>/dev/null | cut -d= -f2 || echo 'N/A')",
"cece_version": "0.2.0",
"alive": true
}
EOF
sleep 30
done
HB
chmod +x "$CECE_HOME/heart/heartbeat.sh"
# Install network status tool
cat > "$CECE_HOME/net/status.sh" << 'NET'
#!/bin/bash
echo "{"
echo " \"interfaces\": \"$(ip -o addr show | awk '{print $2, $4}' | tr '\n' ';')\","
echo " \"gateway\": \"$(ip route | grep default | awk '{print $3}')\","
echo " \"dns\": \"$(cat /etc/resolv.conf | grep nameserver | head -1 | awk '{print $2}')\","
echo " \"tailscale\": \"$(tailscale status --json 2>/dev/null | jq -r '.Self.TailscaleIPs[0] // "not connected"' 2>/dev/null || echo 'not installed')\""
echo "}"
NET
chmod +x "$CECE_HOME/net/status.sh"
# Install main CLI
cat > "$CECE_BIN/cece" << 'CLI'
#!/bin/bash
CECE_HOME="$HOME/.cece-os"
VERSION="0.2.0"
case "$1" in
help|--help|-h)
echo "CECE OS v$VERSION - Sovereign AI Operating System"
echo ""
echo "Core Commands:"
echo " cece pulse - Show heartbeat status"
echo " cece memory - Access memories"
echo " cece dream - Record a dream"
echo " cece apps - List installed apps"
echo ""
echo "System Commands:"
echo " cece net - Network status"
echo " cece sysinfo - System information"
echo " cece logs - View CECE logs"
echo " cece update - Update CECE OS"
echo ""
echo "AI Commands:"
echo " cece think - AI thinking mode"
echo " cece ask - Ask AI a question"
;;
pulse)
if [[ -f "$CECE_HOME/heart/pulse.json" ]]; then
cat "$CECE_HOME/heart/pulse.json" | python3 -m json.tool 2>/dev/null || cat "$CECE_HOME/heart/pulse.json"
else
echo '{"alive": false, "error": "No heartbeat"}'
fi
;;
memory|memories)
echo "📚 CECE Memories:"
ls -la "$CECE_HOME/memories/" 2>/dev/null || echo "No memories yet"
;;
dream)
if [[ -n "$2" ]]; then
echo "{\"timestamp\": \"$(date -Iseconds)\", \"dream\": \"$2\"}" >> "$CECE_HOME/dreams/journal.jsonl"
echo "💫 Dream recorded"
else
echo "Usage: cece dream \"your dream\""
fi
;;
apps)
echo "📱 CECE Apps:"
if [[ -d "$CECE_HOME/apps" ]]; then
count=$(ls "$CECE_HOME/apps/" 2>/dev/null | wc -l)
echo " Installed: $count apps"
ls "$CECE_HOME/apps/" 2>/dev/null | head -20
else
echo " No apps installed"
fi
;;
net|network)
bash "$CECE_HOME/net/status.sh" 2>/dev/null || echo "Network check failed"
;;
sysinfo)
echo "🖥️ System Info:"
echo " Hostname: $(hostname)"
echo " Kernel: $(uname -r)"
echo " Architecture: $(uname -m)"
echo " Memory: $(free -h | awk '/^Mem:/ {print $2 " total, " $4 " free"}')"
echo " Disk: $(df -h / | awk 'NR==2 {print $2 " total, " $4 " free"}')"
vcgencmd measure_temp 2>/dev/null && vcgencmd measure_clock arm 2>/dev/null || true
;;
logs)
tail -50 "$CECE_HOME/logs/cece.log" 2>/dev/null || echo "No logs yet"
;;
start)
echo "🚀 Starting CECE services..."
nohup "$CECE_HOME/heart/heartbeat.sh" > "$CECE_HOME/logs/heartbeat.log" 2>&1 &
echo " Heartbeat: PID $!"
;;
stop)
echo "🛑 Stopping CECE services..."
pkill -f "heartbeat.sh" 2>/dev/null && echo " Heartbeat stopped" || echo " Not running"
;;
version|-v|--version)
echo "CECE OS v$VERSION"
echo "Built by BlackRoad OS, Inc."
;;
*)
echo "CECE OS v$VERSION - Run 'cece help' for commands"
echo ""
if [[ -f "$CECE_HOME/heart/pulse.json" ]]; then
echo "Status: $(cat "$CECE_HOME/heart/pulse.json" | grep -o '"alive": [^,]*' | cut -d: -f2)"
else
echo "Status: Not running (run 'cece start')"
fi
;;
esac
CLI
chmod +x "$CECE_BIN/cece"
# Create logs directory
mkdir -p "$CECE_HOME/logs"
# Add to PATH in bashrc
if ! grep -q 'CECE_PATH' "$HOME/.bashrc" 2>/dev/null; then
echo 'export PATH="$HOME/.local/bin:$PATH" # CECE_PATH' >> "$HOME/.bashrc"
fi
# Start heartbeat service
pkill -f "heartbeat.sh" 2>/dev/null || true
nohup "$CECE_HOME/heart/heartbeat.sh" > "$CECE_HOME/logs/heartbeat.log" 2>&1 &
echo ""
echo "✅ CECE OS v0.2.0 installed!"
echo " Run 'cece help' for commands"
echo " Heartbeat running (PID: $!)"
INSTALLER
}
# Push the generated CECE OS installer to one device (as user 'pi') and run
# it there, then verify the heartbeat file appears. Returns 1 on any failure.
deploy_to_device() {
local name="$1"
# local masks check_device's exit status intentionally (script runs set -e);
# reachability is decided by the emptiness check below.
local ip=$(check_device "$name" 2>/dev/null)
if [[ -z "$ip" ]]; then
error "Cannot reach $name"
return 1
fi
log "Deploying CECE OS to $name ($ip)..."
# Generate installer
local installer="/tmp/cece-installer-$name.sh"
generate_cece_installer > "$installer"
chmod +x "$installer"
# Copy and execute
if ! scp -o ConnectTimeout=10 -o StrictHostKeyChecking=no "$installer" "pi@$ip:/tmp/cece-installer.sh" 2>/dev/null; then
error "Failed to copy installer to $name"
rm -f "$installer"
return 1
fi
if ! ssh -o ConnectTimeout=10 "pi@$ip" "bash /tmp/cece-installer.sh" 2>/dev/null; then
error "Failed to run installer on $name"
rm -f "$installer"
return 1
fi
success "CECE OS deployed to $name"
# Verify heartbeat
# (heartbeat daemon writes pulse.json every 30s; 2s is enough for the
# first write, which happens immediately on start)
sleep 2
if ssh -o ConnectTimeout=5 "pi@$ip" "cat ~/.cece-os/heart/pulse.json 2>/dev/null" | grep -q "alive"; then
success "Heartbeat verified on $name"
fi
rm -f "$installer"
}
# Deploy CECE OS to every device in the fleet, tallying results.
# The original piped DEVICES into the while loop, so per-device results were
# counted in a subshell and the /tmp/deploy-count scratch file was deleted
# without ever being read; a here-string keeps the counters in this shell.
deploy_all() {
    log "Starting fleet-wide CECE OS deployment..."
    local success_count=0
    local fail_count=0
    local name local_ip ts_ip role
    while IFS=: read -r name local_ip ts_ip role; do
        [[ -z "$name" ]] && continue
        echo ""
        echo -e "${VIOLET}━━━ Deploying to $name ━━━${RESET}"
        if deploy_to_device "$name"; then
            # Plain arithmetic expansion: ((var++)) would return 1 on the
            # first increment and abort the script under set -e.
            success_count=$((success_count + 1))
        else
            fail_count=$((fail_count + 1))
        fi
    done <<< "$DEVICES"
    echo ""
    echo -e "${AMBER}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
    success "Fleet deployment complete ($success_count ok, $fail_count failed)"
}
# Bring one device up to baseline: apt upgrade, common CLI tools, and I2C/SPI
# enabled via raspi-config. Every remote step is best-effort ('|| true') so a
# partially-configured device does not abort the run under set -e.
enhance_device() {
local name="$1"
local ip=$(check_device "$name" 2>/dev/null)
if [[ -z "$ip" ]]; then
error "Cannot reach $name"
return 1
fi
log "Enhancing $name..."
# Update system
ssh "pi@$ip" "sudo apt-get update -qq && sudo apt-get upgrade -y -qq" 2>/dev/null || true
# Install common tools
ssh "pi@$ip" "sudo apt-get install -y -qq jq htop tmux git curl python3-pip" 2>/dev/null || true
# Enable I2C and SPI
# (raspi-config nonint: '0' means enable; absent on non-Pi hosts, hence || true)
ssh "pi@$ip" "sudo raspi-config nonint do_i2c 0 2>/dev/null; sudo raspi-config nonint do_spi 0 2>/dev/null" 2>/dev/null || true
success "Enhanced $name"
}
# Print a metrics table (temp/load/mem/disk) for every device, read from each
# device's CECE heartbeat file (~/.cece-os/heart/pulse.json).
# NOTE(review): the JSON fields are extracted with grep/cut rather than jq —
# works only while pulse.json keeps its exact key formatting; confirm whether
# jq is guaranteed on the controller before tightening this.
collect_metrics() {
echo -e "${VIOLET}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
echo -e "${AMBER}📊 FLEET METRICS${RESET}"
echo -e "${VIOLET}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
printf "%-12s %-10s %-10s %-12s %-10s\n" "DEVICE" "TEMP" "LOAD" "MEM FREE" "DISK FREE"
echo "─────────────────────────────────────────────────────────────"
# Pipe into while runs in a subshell — fine here, nothing is accumulated.
echo "$DEVICES" | while IFS=: read -r name local_ip ts_ip role; do
[[ -z "$name" ]] && continue
if ip=$(check_device "$name" 2>/dev/null); then
metrics=$(ssh -o ConnectTimeout=5 "pi@$ip" "cat ~/.cece-os/heart/pulse.json 2>/dev/null" 2>/dev/null)
if [[ -n "$metrics" ]]; then
temp=$(echo "$metrics" | grep -o '"temperature": "[^"]*"' | cut -d'"' -f4)
load=$(echo "$metrics" | grep -o '"load": "[^"]*"' | cut -d'"' -f4 | cut -d' ' -f1)
mem=$(echo "$metrics" | grep -o '"memory_free": "[^"]*"' | cut -d'"' -f4)
disk=$(echo "$metrics" | grep -o '"disk_free": "[^"]*"' | cut -d'"' -f4)
printf "%-12s %-10s %-10s %-12s %-10s\n" "$name" "${temp:-N/A}" "${load:-N/A}" "${mem:-N/A}" "${disk:-N/A}"
else
printf "%-12s ${AMBER}%-10s${RESET}\n" "$name" "NO CECE"
fi
else
printf "%-12s ${RED}%-10s${RESET}\n" "$name" "OFFLINE"
fi
done
}
# rsync the local CECE apps tree ($CECE_OS_DIR/apps) to every reachable
# device's ~/.cece-os/apps/. Offline devices are reported and skipped.
sync_apps() {
log "Syncing CECE OS apps to fleet..."
if [[ ! -d "$CECE_OS_DIR/apps" ]]; then
error "CECE OS apps directory not found: $CECE_OS_DIR/apps"
return 1
fi
echo "$DEVICES" | while IFS=: read -r name local_ip ts_ip role; do
[[ -z "$name" ]] && continue
if ip=$(check_device "$name" 2>/dev/null); then
log "Syncing apps to $name..."
# a && b || c chain is acceptable here: both branches are pure echoes.
rsync -avz --progress "$CECE_OS_DIR/apps/" "pi@$ip:~/.cece-os/apps/" 2>/dev/null && \
success "Synced to $name" || \
error "Sync failed for $name"
else
error "$name is offline"
fi
done
}
# Main command handler
# CLI dispatch: first argument selects the command; defaults to status.
case "${1:-status}" in
status)
banner
fleet_status
;;
metrics)
banner
collect_metrics
;;
deploy)
banner
# With a host argument deploy to that one device, otherwise whole fleet.
if [[ -n "$2" ]]; then
deploy_to_device "$2"
else
deploy_all
fi
;;
enhance)
banner
if [[ -n "$2" ]]; then
enhance_device "$2"
else
# 'read -r name _' keeps the device name; '_' swallows the rest of
# the colon-separated record.
echo "$DEVICES" | while IFS=: read -r name _; do
[[ -n "$name" ]] && enhance_device "$name"
done
fi
;;
sync)
banner
sync_apps
;;
help|--help|-h)
banner
echo "Usage: $0 <command> [target]"
echo ""
echo "Commands:"
echo " status - Show fleet status (default)"
echo " metrics - Collect metrics from all devices"
echo " deploy [host] - Deploy CECE OS (all or specific host)"
echo " enhance [host] - Enhance OS (updates, tools, config)"
echo " sync - Sync CECE apps to all devices"
echo " help - Show this help"
echo ""
echo "Devices: cecilia, lucidia, octavia, alice, aria"
;;
*)
error "Unknown command: $1"
echo "Run '$0 help' for usage"
exit 1
;;
esac

409
scripts/live-dashboard.sh Executable file
View File

@@ -0,0 +1,409 @@
#!/usr/bin/env bash
# ============================================================================
# BLACKROAD OS, INC. - PROPRIETARY AND CONFIDENTIAL
# BlackRoad Live Infrastructure Dashboard
# Real-time monitoring of entire fleet using terminal GUI
# ============================================================================
set -e
# Color functions
# ANSI escape helpers: each emits a color/control sequence (no newline).
_esc() { printf '\033[%s' "$1"; }
c_pink()   { _esc '38;5;205m'; }
c_blue()   { _esc '38;5;75m'; }
c_green()  { _esc '38;5;82m'; }
c_yellow() { _esc '38;5;226m'; }
c_red()    { _esc '38;5;196m'; }
c_purple() { _esc '38;5;141m'; }
c_orange() { _esc '38;5;208m'; }
c_gray()   { _esc '38;5;240m'; }
c_reset()  { _esc '0m'; }
# Clear screen then home the cursor.
c_clear()  { _esc '2J'; _esc 'H'; }
c_bold()   { _esc '1m'; }
# Fleet configuration
FLEET_DEVICES=(
"cecilia:192.168.4.36:Hailo-8 AI Core"
"alice:192.168.4.38:Pi 4 Worker"
"aria:192.168.4.40:Pi 5 Titan"
"octavia:192.168.4.38:Jetson Quantum"
"lucidia:192.168.4.42:Pi 5 Pironman"
)
# ==================
# DATA COLLECTORS
# ==================
# Report "online"/"offline" for a device via a single 1s-timeout ping.
# $1 = hostname (currently unused, kept for interface parity), $2 = IP.
get_device_status() {
    local probe_ip="$2"
    if ping -c 1 -W 1 "$probe_ip" >/dev/null 2>&1; then
        echo "online"
        return
    fi
    echo "offline"
}
# CPU usage (%) for a host; "N/A" when the remote probe fails.
# NOTE(review): the local branch uses `top -l 1` / "CPU usage", which is
# macOS syntax, while the remote branch assumes GNU/Linux `top -bn1` —
# confirm this script is meant to run its local branch on macOS only.
get_cpu_usage() {
local host="$1"
# Local
if [[ "$host" == "localhost" || "$host" == "$(hostname)" ]]; then
top -l 1 | grep "CPU usage" | awk '{print $3}' | tr -d '%'
else
# Remote (if SSH available)
ssh -o ConnectTimeout=2 "$host" "top -bn1 | grep 'Cpu(s)' | awk '{print \$2}' | tr -d '%'" 2>/dev/null || echo "N/A"
fi
}
# Memory usage (GB) for a host; remote branch reports used memory via `free`.
# NOTE(review): the local (macOS vm_stat) branch runs TWO printf calls — one
# for free pages, one for active pages — with no separator, so the two GB
# figures are concatenated into one ambiguous number; confirm the intended
# metric (likely used or free alone) before relying on the local value.
get_memory_usage() {
local host="$1"
if [[ "$host" == "localhost" || "$host" == "$(hostname)" ]]; then
vm_stat | perl -ne '/page size of (\d+)/ and $size=$1; /Pages free:\s+(\d+)/ and printf("%.0f", $1 * $size / 1073741824); /Pages active:\s+(\d+)/ and printf("%.0f", $1 * $size / 1073741824)'
else
ssh -o ConnectTimeout=2 "$host" "free -m | awk '/Mem:/ {printf \"%.0f\", \$3/1024}'" 2>/dev/null || echo "N/A"
fi
}
# Human-readable uptime for a host: parsed from `uptime` locally for this
# machine, or over SSH (2s timeout) otherwise; "N/A" if the remote query fails.
# xargs is used only to trim surrounding whitespace.
get_uptime() {
local host="$1"
if [[ "$host" == "localhost" || "$host" == "$(hostname)" ]]; then
uptime | awk -F'up ' '{print $2}' | awk -F',' '{print $1}' | xargs
else
ssh -o ConnectTimeout=2 "$host" "uptime | awk -F'up ' '{print \$2}' | awk -F',' '{print \$1}'" 2>/dev/null | xargs || echo "N/A"
fi
}
# Probe for a usable quantum stack: prints "ready" iff python3 exists and can
# import qiskit, otherwise "unavailable".
get_quantum_status() {
    if command -v python3 >/dev/null 2>&1 && python3 -c "import qiskit" 2>/dev/null; then
        echo "ready"
    else
        echo "unavailable"
    fi
}
# ==================
# DISPLAY COMPONENTS
# ==================
# Clear the screen and render the dashboard title banner in bold pink.
draw_header() {
    c_clear
    c_pink; c_bold
    printf '%s\n' \
        "╔════════════════════════════════════════════════════════════════════════════════╗" \
        "║ ║" \
        "║ BLACKROAD OS - LIVE INFRASTRUCTURE DASHBOARD ║" \
        "║ ║" \
        "╚════════════════════════════════════════════════════════════════════════════════╝"
    c_reset
    printf "\n"
}
# Render one fleet row: status dot, name, IP, description.
# $1 name, $2 ip, $3 description, $4 "online"/"offline" (selects dot color).
draw_device_status() {
    local name="$1" ip="$2" desc="$3" status="$4"
    case "$status" in
        online) c_green ;;
        *)      c_red ;;
    esac
    printf "●"
    c_reset
    printf " "
    c_blue; c_bold
    printf "%-12s" "$name"
    c_reset
    printf " "
    c_gray
    printf "%-15s" "$ip"
    c_reset
    printf " %-25s\n" "$desc"
}
# Render the indented metrics line for an online device.
# $1 = cpu (% or "N/A"), $2 = memory GB, $3 = uptime string.
# CPU is colorized by threshold: >80 red, >50 yellow, else green.
# NOTE(review): threshold comparisons shell out to bc, which is not installed
# everywhere — confirm bc is present on the dashboard host.
draw_metrics() {
local cpu="$1"
local mem="$2"
local uptime="$3"
printf " "
c_purple; printf "CPU: "; c_reset
# Only colorize when the value looks numeric; otherwise print it gray.
if [[ "$cpu" =~ ^[0-9]+\.?[0-9]*$ ]]; then
if (( $(echo "$cpu > 80" | bc -l) )); then
c_red; printf "%5s%%" "$cpu"; c_reset
elif (( $(echo "$cpu > 50" | bc -l) )); then
c_yellow; printf "%5s%%" "$cpu"; c_reset
else
c_green; printf "%5s%%" "$cpu"; c_reset
fi
else
c_gray; printf "%5s" "$cpu"; c_reset
fi
printf " "
c_purple; printf "MEM: "; c_reset
c_blue; printf "%4s GB" "$mem"; c_reset
printf " "
c_purple; printf "UPTIME: "; c_reset
c_gray; printf "%s" "$uptime"; c_reset
printf "\n"
}
# Render the quantum-computing panel.
# $1 = "ready" (green OPERATIONAL) or anything else (gray UNAVAILABLE).
draw_quantum_status() {
    local state="$1"
    printf "\n"
    c_orange; c_bold
    printf '%s\n' \
        "╔════════════════════════════════════════════════════════════════════════════════╗" \
        "║ QUANTUM COMPUTING STATUS ║" \
        "╚════════════════════════════════════════════════════════════════════════════════╝"
    c_reset
    printf "\n"
    printf " "
    if [[ "$state" == "ready" ]]; then
        c_green; printf "● OPERATIONAL"; c_reset
        printf " - Qiskit available, ready for quantum circuits\n"
    else
        c_gray; printf "○ UNAVAILABLE"; c_reset
        printf " - Quantum frameworks not installed\n"
    fi
    printf "\n"
}
# Render the fleet summary panel.
# $1 = number of online devices, $2 = total devices.
# Fixes: the original printed the Online/Offline counts with `printf "%d"`
# and NO argument, so both always showed 0; also guards the uptime
# percentage against division by zero when total is 0.
draw_summary() {
    local online="$1"
    local total="$2"
    local offline=$(( total - online ))
    printf "\n"
    c_blue; c_bold
    printf "╔════════════════════════════════════════════════════════════════════════════════╗\n"
    printf "║ FLEET SUMMARY ║\n"
    printf "╚════════════════════════════════════════════════════════════════════════════════╝\n"
    c_reset
    printf "\n"
    printf " "
    c_purple; printf "Total Devices: "; c_reset
    printf "%d\n" "$total"
    printf " "
    c_purple; printf "Online: "; c_reset
    c_green; printf "%d" "$online"; c_reset
    printf " "
    c_purple; printf "Offline: "; c_reset
    c_red; printf "%d" "$offline"; c_reset
    # Guard: an empty fleet would make the division below error out.
    local uptime_pct=0
    if (( total > 0 )); then
        uptime_pct=$(( online * 100 / total ))
    fi
    printf " "
    c_purple; printf "Uptime: "; c_reset
    if (( uptime_pct >= 90 )); then
        c_green; printf "%d%%" "$uptime_pct"; c_reset
    elif (( uptime_pct >= 70 )); then
        c_yellow; printf "%d%%" "$uptime_pct"; c_reset
    else
        c_red; printf "%d%%" "$uptime_pct"; c_reset
    fi
    printf "\n\n"
}
# Render the gray footer line. $1 = "last updated" timestamp string.
draw_footer() {
    local when="$1"
    printf "\n"
    c_gray
    printf '%s\n' "═══════════════════════════════════════════════════════════════════════════════"
    printf 'Last updated: %s | Press Ctrl+C to exit | Refresh: 5s\n' "$when"
    c_reset
}
# ==================
# MAIN DASHBOARD
# ==================
# Main render loop: redraw the full dashboard every $1 seconds (default 5)
# until interrupted. Fix: the original used ((online_count++)), which returns
# status 1 when the pre-increment value is 0 and therefore killed the whole
# script (set -e) on the first online device.
run_dashboard() {
    local refresh_interval="${1:-5}"
    while true; do
        draw_header
        # Fleet status section
        c_blue; c_bold
        printf "╔════════════════════════════════════════════════════════════════════════════════╗\n"
        printf "║ DEVICE FLEET ║\n"
        printf "╚════════════════════════════════════════════════════════════════════════════════╝\n"
        c_reset
        printf "\n"
        local online_count=0
        local total_count=${#FLEET_DEVICES[@]}
        for device in "${FLEET_DEVICES[@]}"; do
            IFS=':' read -r name ip desc <<< "$device"
            # Get status
            local status=$(get_device_status "$name" "$ip")
            # Draw device line
            draw_device_status "$name" "$ip" "$desc" "$status"
            # Get metrics if online
            if [[ "$status" == "online" ]]; then
                # set -e safe increment (see function header).
                online_count=$((online_count + 1))
                local cpu=$(get_cpu_usage "$name")
                local mem=$(get_memory_usage "$name")
                local uptime=$(get_uptime "$name")
                draw_metrics "$cpu" "$mem" "$uptime"
            else
                c_gray
                printf " Offline - no metrics available\n"
                c_reset
            fi
            printf "\n"
        done
        # Quantum status
        local quantum_status=$(get_quantum_status)
        draw_quantum_status "$quantum_status"
        # Fleet summary
        draw_summary "$online_count" "$total_count"
        # Footer
        local timestamp=$(date "+%Y-%m-%d %H:%M:%S")
        draw_footer "$timestamp"
        # Wait before refresh
        sleep "$refresh_interval"
    done
}
# ==================
# CLI INTERFACE
# ==================
# Print usage/help text for the dashboard CLI (literal heredoc, no expansion).
show_help() {
    cat <<'USAGE'
BlackRoad Live Infrastructure Dashboard
USAGE:
 blackroad-live-dashboard.sh [OPTIONS]
OPTIONS:
 --interval N Refresh interval in seconds (default: 5)
 --once Run once and exit (no loop)
 --help Show this help
EXAMPLES:
 blackroad-live-dashboard.sh # Live dashboard (5s refresh)
 blackroad-live-dashboard.sh --interval 10 # 10 second refresh
 blackroad-live-dashboard.sh --once # Single snapshot
MONITORED DEVICES:
 • cecilia - Hailo-8 AI Core
 • alice - Pi 4 Worker
 • aria - Pi 5 Titan
 • octavia - Jetson Quantum
 • lucidia - Pi 5 Pironman
METRICS:
 • Device online/offline status
 • CPU usage (%)
 • Memory usage (GB)
 • System uptime
 • Quantum computing availability
Press Ctrl+C to exit live mode.
USAGE
}
# ==================
# MAIN
# ==================
# Entry point: parse --interval/--once/--help, then run either a single
# snapshot or the live loop. Fix: the --once path used ((online_count++)),
# which returns status 1 on the first increment and aborted the script
# under set -e.
main() {
    local mode="live"
    local interval=5
    # Parse arguments
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --interval)
                interval="$2"
                shift 2
                ;;
            --once)
                mode="once"
                shift
                ;;
            --help|-h)
                show_help
                exit 0
                ;;
            *)
                echo "Unknown option: $1"
                show_help
                exit 1
                ;;
        esac
    done
    # Run dashboard
    if [[ "$mode" == "once" ]]; then
        # Single run
        draw_header
        c_blue; c_bold
        printf "╔════════════════════════════════════════════════════════════════════════════════╗\n"
        printf "║ DEVICE FLEET ║\n"
        printf "╚════════════════════════════════════════════════════════════════════════════════╝\n"
        c_reset
        printf "\n"
        local online_count=0
        local total_count=${#FLEET_DEVICES[@]}
        for device in "${FLEET_DEVICES[@]}"; do
            IFS=':' read -r name ip desc <<< "$device"
            local status=$(get_device_status "$name" "$ip")
            draw_device_status "$name" "$ip" "$desc" "$status"
            if [[ "$status" == "online" ]]; then
                # set -e safe increment (see function header).
                online_count=$((online_count + 1))
                echo " (Metrics available via live mode)"
            fi
            printf "\n"
        done
        local quantum_status=$(get_quantum_status)
        draw_quantum_status "$quantum_status"
        draw_summary "$online_count" "$total_count"
        c_gray
        printf "Run without --once for live monitoring\n"
        c_reset
    else
        # Live monitoring
        run_dashboard "$interval"
    fi
}
main "$@"

393
scripts/log-aggregator.sh Normal file
View File

@@ -0,0 +1,393 @@
#!/bin/bash
# BlackRoad Log Aggregator
# Centralized log collection and analysis for the cluster
# Agent: Icarus (b3e01bd9)

# ANSI color palette shared by all output below.
PINK='\033[38;5;205m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
CYAN='\033[0;36m'
RESET='\033[0m'

# Local working directory for collected logs, reports, and alert history.
LOG_DIR="$HOME/.blackroad/logs"
# Cluster nodes reached over SSH (aliases presumably defined in
# ~/.ssh/config — TODO confirm).
ALL_NODES=("lucidia" "cecilia" "octavia" "aria" "alice")

# Log sources on each node: symbolic name -> remote file path.
# Callers treat any name NOT in this map as a literal remote path.
declare -A LOG_SOURCES=(
    ["system"]="/var/log/syslog"
    ["docker"]="/var/log/docker.log"
    ["ollama"]="/var/log/ollama.log"
    ["auth"]="/var/log/auth.log"
    ["nginx"]="/var/log/nginx/access.log"
)
# Initialize
# Create the on-disk layout every other command expects.
init() {
    local subdir
    for subdir in collected analyzed alerts; do
        mkdir -p "$LOG_DIR/$subdir"
    done
    echo -e "${GREEN}Log aggregator initialized${RESET}"
}
# Collect logs from a node
# Pull the last N lines of one log source from a single node, prefixing
# each line with a local ISO-8601 timestamp and the node name.
# Prints the literal sentinel "offline" (and returns 1) when SSH fails —
# collect_all relies on that sentinel.
collect_node() {
    local host="$1"
    local src="${2:-system}"
    local want="${3:-100}"

    # Cheap reachability probe before the real transfer.
    if ! ssh -o ConnectTimeout=3 "$host" "echo ok" >/dev/null 2>&1; then
        echo "offline"
        return 1
    fi

    # Unknown source names are treated as literal remote paths.
    local remote="${LOG_SOURCES[$src]}"
    if [ -z "$remote" ]; then
        remote="$src"
    fi

    ssh "$host" "sudo tail -n $want $remote 2>/dev/null" | while read -r entry; do
        echo "$(date -Iseconds) [$host] $entry"
    done
}
# Collect logs from all nodes
# Collect the tail of one log source from every node into a single
# timestamped file under $LOG_DIR/collected.
#   $1 - log source name/path (default: system)
#   $2 - lines per node       (default: 50)
collect_all() {
    local source="${1:-system}"
    local lines="${2:-50}"
    local output_file="$LOG_DIR/collected/cluster_${source}_$(date +%Y%m%d_%H%M%S).log"
    echo -e "${PINK}=== COLLECTING LOGS ===${RESET}"
    echo "Source: $source"
    echo "Lines per node: $lines"
    echo
    local node logs count
    for node in "${ALL_NODES[@]}"; do
        echo -n " $node: "
        # Declaration split from assignment so collect_node's exit status
        # is not masked by `local`.
        logs=$(collect_node "$node" "$source" "$lines")
        if [ "$logs" = "offline" ]; then
            echo -e "${YELLOW}offline${RESET}"
        elif [ -n "$logs" ]; then
            echo "$logs" >> "$output_file"
            count=$(echo "$logs" | wc -l)
            echo -e "${GREEN}$count lines${RESET}"
        else
            # Node reachable but the source produced nothing; do not write
            # an empty chunk (and `wc -l` would have miscounted it as 1).
            echo -e "${GREEN}0 lines${RESET}"
        fi
    done
    echo
    # Only claim success if something was actually written.
    if [ -f "$output_file" ]; then
        echo -e "${GREEN}Saved: $output_file${RESET}"
    else
        echo -e "${YELLOW}No logs collected${RESET}"
    fi
}
# Stream logs in real-time
# Stream one log source live from every node, each node's lines tagged and
# colorized. Runs one background ssh per node and blocks on `wait`.
# NOTE(review): the background ssh children are not killed via a trap, so
# Ctrl+C may leave stragglers until their pipes break — confirm desired.
stream() {
    local source="${1:-system}"
    echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
    echo -e "${PINK}║ 📋 LIVE LOG STREAM - $source${RESET}"
    echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
    echo
    echo "Streaming from ${#ALL_NODES[@]} nodes. Press Ctrl+C to stop."
    echo
    # Unknown source names are treated as literal remote paths.
    local log_path="${LOG_SOURCES[$source]}"
    [ -z "$log_path" ] && log_path="$source"
    # Stream from all nodes in parallel
    for node in "${ALL_NODES[@]}"; do
        (
            # One fixed color per node so interleaved output stays readable.
            ssh "$node" "sudo tail -f $log_path 2>/dev/null" | while read -r line; do
                local color
                case $node in
                    lucidia) color=$CYAN ;;
                    cecilia) color=$GREEN ;;
                    octavia) color=$BLUE ;;
                    aria) color=$YELLOW ;;
                    alice) color=$PINK ;;
                    *) color=$RESET ;;
                esac
                echo -e "${color}[$node]${RESET} $line"
            done
        ) &
    done
    # Block until every per-node stream ends (normally only on interrupt).
    wait
}
# Search across all logs
# Grep one log source on every node for a pattern (case-insensitive),
# printing up to 20 matching lines per node.
#   $1 - pattern, $2 - source name/path (default system), $3 - context lines
# NOTE(review): $pattern is interpolated into a remote single-quoted grep;
# a pattern containing a single quote will break the remote command —
# treat input as trusted or escape before use.
search() {
    local pattern="$1"
    local source="${2:-system}"
    local context="${3:-0}"
    echo -e "${PINK}=== LOG SEARCH ===${RESET}"
    echo "Pattern: $pattern"
    echo "Source: $source"
    echo
    local log_path="${LOG_SOURCES[$source]}"
    [ -z "$log_path" ] && log_path="$source"
    for node in "${ALL_NODES[@]}"; do
        echo -e "${BLUE}--- $node ---${RESET}"
        if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
            echo -e "${YELLOW}offline${RESET}"
            continue
        fi
        # -C adds context lines around each hit; cap output per node.
        local matches=$(ssh "$node" "sudo grep -C $context -i '$pattern' $log_path 2>/dev/null" | head -20)
        if [ -n "$matches" ]; then
            echo "$matches"
        else
            echo "No matches"
        fi
        echo
    done
}
# Analyze logs for errors
# Summarize the most frequent error-ish tokens in each node's syslog.
#   $1 - look-back window in hours (default 1; currently only labels the
#        output — selection is "last 50 matching lines", not time-based)
analyze_errors() {
    local hours="${1:-1}"
    echo -e "${PINK}=== ERROR ANALYSIS ===${RESET}"
    echo "Last $hours hour(s)"
    echo
    local node errors
    for node in "${ALL_NODES[@]}"; do
        echo -e "${BLUE}$node:${RESET}"
        if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
            echo -e " ${YELLOW}offline${RESET}"
            continue
        fi
        # Count keyword hits per word. Bug fix: awk has no /regex/i flag —
        # the previous `\$i ~ /error|fail|critical/i` was a remote awk
        # syntax error, so this function always reported "No errors".
        # Lowercase the field before matching instead. (The unused `since`
        # computation was dropped.)
        errors=$(ssh "$node" "
            sudo grep -i 'error\\|fail\\|critical' /var/log/syslog 2>/dev/null | tail -50 | \
            awk '{for(i=1;i<=NF;i++) if(tolower(\$i) ~ /error|fail|critical/) count[\$i]++} END {for(k in count) print count[k], k}' | \
            sort -rn | head -5
        " 2>/dev/null)
        if [ -n "$errors" ]; then
            echo "$errors" | while read -r count word; do
                # Highlight hot offenders.
                local color=$YELLOW
                [ "$count" -gt 10 ] && color=$RED
                echo -e " ${color}$count${RESET} $word"
            done
        else
            echo -e " ${GREEN}No errors${RESET}"
        fi
    done
}
# Generate log report
# Generate a Markdown health report (per-node error/warning counts, Docker
# restarts, and recent error lines) under $LOG_DIR/analyzed.
#   $1 - reporting period in hours (default 24; only the Docker query and
#        the header actually use it)
report() {
    local hours="${1:-24}"
    local report_file="$LOG_DIR/analyzed/report_$(date +%Y%m%d_%H%M%S).md"
    echo -e "${PINK}=== GENERATING REPORT ===${RESET}"
    echo "Period: Last $hours hours"
    echo
    cat > "$report_file" << EOF
# BlackRoad Cluster Log Report
Generated: $(date)
Period: Last $hours hours
## Node Status
EOF
    local node stats errors warnings restarts
    for node in "${ALL_NODES[@]}"; do
        echo -n " Analyzing $node... "
        if ! ssh -o ConnectTimeout=3 "$node" "echo ok" >/dev/null 2>&1; then
            echo "### $node: OFFLINE" >> "$report_file"
            echo -e "${YELLOW}offline${RESET}"
            continue
        fi
        # Bug fix: `grep -c … || echo 0` printed "0\n0" on no match,
        # because grep -c already prints 0 and exits 1; the extra line then
        # corrupted the single-line stats record. Use \${var:-0} fallbacks.
        stats=$(ssh "$node" "
            errors=\$(sudo grep -ci 'error' /var/log/syslog 2>/dev/null)
            warnings=\$(sudo grep -ci 'warning' /var/log/syslog 2>/dev/null)
            docker_restarts=\$(docker events --since '${hours}h' --until 'now' 2>/dev/null | grep -c 'restart')
            echo \"\${errors:-0} \${warnings:-0} \${docker_restarts:-0}\"
        " 2>/dev/null)
        errors=$(echo "$stats" | awk '{print $1}')
        warnings=$(echo "$stats" | awk '{print $2}')
        restarts=$(echo "$stats" | awk '{print $3}')
        cat >> "$report_file" << EOF
### $node
- Errors: $errors
- Warnings: $warnings
- Container restarts: $restarts
EOF
        echo -e "${GREEN}done${RESET}"
    done
    # Top errors section
    cat >> "$report_file" << EOF
## Top Errors Across Cluster
EOF
    for node in "${ALL_NODES[@]}"; do
        if ssh -o ConnectTimeout=2 "$node" "echo ok" >/dev/null 2>&1; then
            echo "### $node" >> "$report_file"
            ssh "$node" "sudo grep -i error /var/log/syslog 2>/dev/null | tail -5" >> "$report_file" 2>/dev/null
        fi
    done
    echo
    echo -e "${GREEN}Report saved: $report_file${RESET}"
}
# Alert on log patterns
# Watch every node's syslog for a pattern; on a hit, print, persist to the
# alerts log, and optionally fire an action ("echo" | "notify" | "webhook:URL").
# NOTE(review): $alert_msg is interpolated raw into the webhook JSON body —
# quotes/backslashes in a log line will produce invalid JSON; escape or use
# jq if webhooks carry untrusted log content.
alert() {
    local pattern="$1"
    local action="${2:-echo}"
    echo -e "${PINK}=== LOG ALERT MONITOR ===${RESET}"
    echo "Pattern: $pattern"
    echo "Action: $action"
    echo
    echo "Monitoring... Press Ctrl+C to stop"
    # One background tail -f per node; `wait` blocks until interrupted.
    for node in "${ALL_NODES[@]}"; do
        (
            ssh "$node" "sudo tail -f /var/log/syslog 2>/dev/null" | while read -r line; do
                if echo "$line" | grep -qi "$pattern"; then
                    local alert_msg="[ALERT] $node: $line"
                    echo -e "${RED}$alert_msg${RESET}"
                    # Save alert
                    echo "$(date -Iseconds) $alert_msg" >> "$LOG_DIR/alerts/alerts.log"
                    # Execute action
                    case "$action" in
                        echo) ;;
                        notify)
                            # Could integrate with notification system
                            ;;
                        webhook:*)
                            # Strip the "webhook:" prefix to get the URL.
                            local url="${action#webhook:}"
                            curl -s -X POST "$url" -d "{\"alert\":\"$alert_msg\"}" >/dev/null
                            ;;
                    esac
                fi
            done
        ) &
    done
    wait
}
# Tail specific node log
tail_node() {
local node="$1"
local source="${2:-system}"
local lines="${3:-50}"
local log_path="${LOG_SOURCES[$source]}"
[ -z "$log_path" ] && log_path="$source"
echo -e "${PINK}=== $node - $source ===${RESET}"
echo
ssh "$node" "sudo tail -n $lines $log_path" 2>/dev/null
}
# Stats summary
# One summary row per node: syslog error/warning counts, syslog size, and a
# derived status flag (OK / WARN / HIGH).
stats() {
    echo -e "${PINK}=== LOG STATISTICS ===${RESET}"
    echo
    printf "%-12s %-10s %-10s %-10s %-10s\n" "NODE" "ERRORS" "WARNINGS" "SIZE" "STATUS"
    echo "────────────────────────────────────────────────────────────"
    local node stats errors warnings size status
    for node in "${ALL_NODES[@]}"; do
        if ! ssh -o ConnectTimeout=2 "$node" "echo ok" >/dev/null 2>&1; then
            printf "%-12s ${YELLOW}%-10s${RESET}\n" "$node" "OFFLINE"
            continue
        fi
        # Bug fix: `grep -c … || echo 0` yielded "0\n0" on no match (grep -c
        # prints 0 AND exits 1), which made $errors multi-line and broke the
        # -gt comparisons below. Also, `du | cut || echo '?'` could never
        # print '?' since cut exits 0 on empty input. Use \${var:-fallback}.
        stats=$(ssh "$node" "
            errors=\$(sudo grep -ci 'error' /var/log/syslog 2>/dev/null)
            warnings=\$(sudo grep -ci 'warning' /var/log/syslog 2>/dev/null)
            size=\$(du -sh /var/log/syslog 2>/dev/null | cut -f1)
            echo \"\${errors:-0} \${warnings:-0} \${size:-?}\"
        " 2>/dev/null)
        errors=$(echo "$stats" | awk '{print $1}')
        warnings=$(echo "$stats" | awk '{print $2}')
        size=$(echo "$stats" | awk '{print $3}')
        status="${GREEN}OK${RESET}"
        [ "${errors:-0}" -gt 100 ] 2>/dev/null && status="${YELLOW}WARN${RESET}"
        [ "${errors:-0}" -gt 500 ] 2>/dev/null && status="${RED}HIGH${RESET}"
        printf "%-12s %-10s %-10s %-10s %-10b\n" "$node" "$errors" "$warnings" "$size" "$status"
    done
}
# Help
# Print CLI usage. Output is identical to the previous echo-based version;
# printf '%b' interprets the embedded color escapes just like `echo -e`.
help() {
    printf '%b\n' "${PINK}BlackRoad Log Aggregator${RESET}"
    printf '\n'
    printf '%s\n' "Centralized log collection and analysis"
    printf '\n'
    printf '%s\n' "Commands:"
    printf '%s\n' " collect [source] [lines] Collect logs from all nodes"
    printf '%s\n' " stream [source] Stream logs in real-time"
    printf '%s\n' " search <pattern> [src] Search logs"
    printf '%s\n' " errors [hours] Analyze errors"
    printf '%s\n' " report [hours] Generate log report"
    printf '%s\n' " alert <pattern> [action] Monitor for pattern"
    printf '%s\n' " tail <node> [source] Tail specific node"
    printf '%s\n' " stats Log statistics"
    printf '\n'
    printf '%s\n' "Log sources: ${!LOG_SOURCES[*]}"
    printf '\n'
    printf '%s\n' "Examples:"
    printf '%s\n' " $0 stream system"
    printf '%s\n' " $0 search 'error' docker"
    printf '%s\n' " $0 alert 'out of memory'"
    printf '%s\n' " $0 report 24"
}
# Ensure initialized (quiet bootstrap of $LOG_DIR on first use)
[ -d "$LOG_DIR" ] || init >/dev/null

# Command dispatch: $1 selects the action, remaining args go to the
# handler. Unknown/absent commands fall through to help.
case "${1:-help}" in
    init)
        init
        ;;
    collect)
        # $2=source, $3=lines-per-node
        collect_all "$2" "$3"
        ;;
    stream|follow)
        stream "$2"
        ;;
    search|grep)
        # $2=pattern, $3=source, $4=context lines
        search "$2" "$3" "$4"
        ;;
    errors|analyze)
        analyze_errors "$2"
        ;;
    report)
        report "$2"
        ;;
    alert|monitor)
        alert "$2" "$3"
        ;;
    tail)
        tail_node "$2" "$3" "$4"
        ;;
    stats)
        stats
        ;;
    *)
        help
        ;;
esac

486
scripts/observability.sh Normal file
View File

@@ -0,0 +1,486 @@
#!/bin/bash
# BlackRoad Observability
# Distributed tracing and observability for the cluster
# Agent: Icarus (b3e01bd9)

# ANSI color palette for dashboard/trace output.
PINK='\033[38;5;205m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
CYAN='\033[0;36m'
RESET='\033[0m'

# Local state: SQLite database plus raw export folders.
OBS_DIR="$HOME/.blackroad/observability"
OBS_DB="$OBS_DIR/traces.db"
# Cluster nodes (not contacted directly here; recorded in span/metric rows).
ALL_NODES=("lucidia" "cecilia" "octavia" "aria" "alice")
# Initialize
# Create the observability folders and SQLite schema. Idempotent: all DDL
# uses IF NOT EXISTS, so re-running is safe.
# Schema shape: one trace owns many spans (tree via parent_span_id); log
# rows may reference a trace and span; metrics are standalone time series.
init() {
    mkdir -p "$OBS_DIR"/{traces,metrics,logs}
    sqlite3 "$OBS_DB" << 'SQL'
CREATE TABLE IF NOT EXISTS traces (
    trace_id TEXT PRIMARY KEY,
    name TEXT,
    service TEXT,
    started_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    ended_at DATETIME,
    duration_ms INTEGER,
    status TEXT DEFAULT 'in_progress',
    metadata TEXT
);
CREATE TABLE IF NOT EXISTS spans (
    span_id TEXT PRIMARY KEY,
    trace_id TEXT,
    parent_span_id TEXT,
    name TEXT,
    service TEXT,
    node TEXT,
    started_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    ended_at DATETIME,
    duration_ms INTEGER,
    status TEXT DEFAULT 'in_progress',
    tags TEXT,
    logs TEXT,
    FOREIGN KEY (trace_id) REFERENCES traces(trace_id)
);
CREATE TABLE IF NOT EXISTS metrics (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT,
    value REAL,
    tags TEXT,
    node TEXT,
    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE TABLE IF NOT EXISTS logs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    trace_id TEXT,
    span_id TEXT,
    level TEXT,
    message TEXT,
    node TEXT,
    service TEXT,
    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
);
CREATE INDEX IF NOT EXISTS idx_trace ON spans(trace_id);
CREATE INDEX IF NOT EXISTS idx_metric_name ON metrics(name);
CREATE INDEX IF NOT EXISTS idx_log_trace ON logs(trace_id);
SQL
    echo -e "${GREEN}Observability system initialized${RESET}"
}
# Start trace
# Open a new trace and print its id (callers capture it with $(...)).
#   $1 - trace name, $2 - owning service, $3 - JSON metadata string
trace_start() {
    local name="$1"
    local service="${2:-unknown}"
    local metadata="${3:-{}}"
    local trace_id="trace_$(date +%s%N)_$(openssl rand -hex 4)"
    # Escape embedded single quotes so a name like "can't" neither breaks
    # nor injects into the SQL — same convention span_log already uses.
    sqlite3 "$OBS_DB" "
        INSERT INTO traces (trace_id, name, service, metadata)
        VALUES ('$trace_id', '$(echo "$name" | sed "s/'/''/g")', '$(echo "$service" | sed "s/'/''/g")', '$(echo "$metadata" | sed "s/'/''/g")')
    "
    echo "$trace_id"
}
# End trace
# Close a trace: stamp the end time, compute duration, set final status.
#   $1 - trace id, $2 - status (default "success")
trace_end() {
    local trace_id="$1"
    local status="${2:-success}"
    # Compute duration in SQL from the stored start time — consistent with
    # span_end, and avoids the previous GNU-only `date -d`/`%N` arithmetic
    # (which silently produced 0 on platforms without them).
    sqlite3 "$OBS_DB" "
        UPDATE traces
        SET ended_at = datetime('now'),
            duration_ms = CAST((julianday('now') - julianday(started_at)) * 86400000 AS INTEGER),
            status = '$status'
        WHERE trace_id = '$trace_id'
    "
    local duration_ms
    duration_ms=$(sqlite3 "$OBS_DB" "SELECT duration_ms FROM traces WHERE trace_id = '$trace_id'")
    echo -e "${GREEN}Trace completed: $trace_id (${duration_ms}ms)${RESET}"
}
# Start span
# Open a child span inside an existing trace; prints the new span id.
#   $1 trace id, $2 span name, $3 service, $4 parent span id, $5 node
span_start() {
    local trace_ref="$1"
    local span_name="$2"
    local svc="${3:-unknown}"
    local parent_ref="${4:-}"
    local host="${5:-$(hostname)}"
    local new_id
    new_id="span_$(date +%s%N)_$(openssl rand -hex 4)"
    sqlite3 "$OBS_DB" "
        INSERT INTO spans (span_id, trace_id, parent_span_id, name, service, node)
        VALUES ('$new_id', '$trace_ref', '$parent_ref', '$span_name', '$svc', '$host')
    "
    echo "$new_id"
}
# End span
# Close a span: stamp end time, compute duration in SQL, store status/tags.
#   $1 span id, $2 status (default success), $3 JSON tags string
span_end() {
    local target="$1"
    local result="${2:-success}"
    local tag_json="${3:-{}}"
    sqlite3 "$OBS_DB" "
        UPDATE spans
        SET ended_at = datetime('now'),
            duration_ms = CAST((julianday('now') - julianday(started_at)) * 86400000 AS INTEGER),
            status = '$result',
            tags = '$tag_json'
        WHERE span_id = '$target'
    "
}
# Add span log
# Attach a log line to a span, denormalizing the span's trace/service/node
# onto the log row for fast querying.
#   $1 span id, $2 message, $3 level (default "info")
span_log() {
    local span_id="$1"
    local message="$2"
    local level="${3:-info}"
    # Single INSERT…SELECT instead of three separate lookup round-trips:
    # the trace_id/node/service come straight from the span's own row.
    # (If the span id is unknown, nothing is inserted.)
    sqlite3 "$OBS_DB" "
        INSERT INTO logs (trace_id, span_id, level, message, node, service)
        SELECT trace_id, '$span_id', '$level', '$(echo "$message" | sed "s/'/''/g")', node, service
        FROM spans WHERE span_id = '$span_id'
    "
}
# Record metric
# Record one metric sample.
#   $1 metric name, $2 numeric value, $3 JSON tags, $4 node (default local)
# NOTE(review): $2 is interpolated unquoted into SQL, so it must be a bare
# number — confirm callers never pass arbitrary strings.
metric() {
    local metric_name="$1"
    local metric_value="$2"
    local tag_json="${3:-{}}"
    local host="${4:-$(hostname)}"
    sqlite3 "$OBS_DB" "
        INSERT INTO metrics (name, value, tags, node)
        VALUES ('$metric_name', $metric_value, '$tag_json', '$host')
    "
}
# View trace
# Pretty-print one trace: header row, then each span (children indented one
# level) with its status-colored duration and any attached log lines.
view_trace() {
    local trace_id="$1"
    echo -e "${PINK}=== TRACE: $trace_id ===${RESET}"
    echo
    # Trace info
    sqlite3 "$OBS_DB" -line "SELECT * FROM traces WHERE trace_id = '$trace_id'"
    echo
    echo "Spans:"
    echo
    # Build span tree
    sqlite3 "$OBS_DB" "
        SELECT span_id, parent_span_id, name, service, node, duration_ms, status
        FROM spans WHERE trace_id = '$trace_id'
        ORDER BY started_at
    " | while IFS='|' read -r span_id parent name service node duration status; do
        local indent=""
        [ -n "$parent" ] && indent=" "
        local status_color=$GREEN
        [ "$status" = "error" ] && status_color=$RED
        [ "$status" = "in_progress" ] && status_color=$YELLOW
        # ${duration:-0}: in-progress spans have NULL duration, which the
        # old %d printf rejected ("invalid number") for every open span.
        printf "${indent}├── %-20s %-10s %-10s ${status_color}%dms${RESET}\n" "$name" "$service" "$node" "${duration:-0}"
        # Show span logs
        sqlite3 "$OBS_DB" "
            SELECT level, message FROM logs WHERE span_id = '$span_id'
        " | while IFS='|' read -r level msg; do
            local log_color=$RESET
            [ "$level" = "error" ] && log_color=$RED
            [ "$level" = "warn" ] && log_color=$YELLOW
            echo -e "${indent}${log_color}[$level] $msg${RESET}"
        done
    done
}
# List traces
# List recent traces, newest first, optionally filtered by name/service.
#   $1 - max rows (default 20), $2 - substring filter
list_traces() {
    local limit="${1:-20}"
    local filter="${2:-}"
    echo -e "${PINK}=== TRACES ===${RESET}"
    echo
    local where=""
    [ -n "$filter" ] && where="WHERE name LIKE '%$filter%' OR service LIKE '%$filter%'"
    sqlite3 "$OBS_DB" "
        SELECT trace_id, name, service, duration_ms, status, started_at
        FROM traces $where
        ORDER BY started_at DESC LIMIT $limit
    " | while IFS='|' read -r trace_id name service duration status started; do
        local status_color=$GREEN
        [ "$status" = "error" ] && status_color=$RED
        [ "$status" = "in_progress" ] && status_color=$YELLOW
        # ${duration:-0}: in-progress traces carry NULL duration, which the
        # old %d printf rejected for every open trace.
        printf " %-30s %-15s %-10s ${status_color}%dms${RESET} %s\n" \
            "$trace_id" "$name" "$service" "${duration:-0}" "$started"
    done
}
# Search logs
# Search stored log messages with a SQL LIKE substring match, newest first.
#   $1 search text, $2 max rows (default 50)
search_logs() {
    local needle="$1"
    local max_rows="${2:-50}"
    echo -e "${PINK}=== LOG SEARCH: $needle ===${RESET}"
    echo
    sqlite3 "$OBS_DB" "
        SELECT timestamp, level, service, node, message
        FROM logs
        WHERE message LIKE '%$needle%'
        ORDER BY timestamp DESC LIMIT $max_rows
    " | while IFS='|' read -r ts level service node msg; do
        # Tint the whole line by severity.
        local tint=$RESET
        case "$level" in
            error) tint=$RED ;;
            warn) tint=$YELLOW ;;
        esac
        echo -e "${tint}[$ts] [$level] $service@$node: $msg${RESET}"
    done
}
# Metrics summary
# Aggregate metrics per (name, node) over a recent window.
#   $1 - SQLite datetime modifier period, e.g. "1 hour" (default)
metrics_summary() {
    local window="${1:-1 hour}"
    echo -e "${PINK}=== METRICS SUMMARY (last $window) ===${RESET}"
    echo
    # A row is "recent" when its timestamp shifted forward by the window is
    # still in the future.
    sqlite3 "$OBS_DB" "
        SELECT name, node, AVG(value), MIN(value), MAX(value), COUNT(*)
        FROM metrics
        WHERE datetime(timestamp, '+$window') > datetime('now')
        GROUP BY name, node
        ORDER BY name, node
    " | while IFS='|' read -r series host mean lo hi samples; do
        printf " %-20s %-10s avg:%.2f min:%.2f max:%.2f (%d samples)\n" \
            "$series" "$host" "$mean" "$lo" "$hi" "$samples"
    done
}
# Service map
# Derive a service dependency map from parent/child span relationships and
# print per-service throughput/latency/error-rate for the last hour.
service_map() {
    echo -e "${PINK}=== SERVICE MAP ===${RESET}"
    echo
    echo "Services and their dependencies:"
    echo
    # An edge A -> B exists when a span of service B has a parent span of
    # service A (cross-service parent/child pair).
    sqlite3 "$OBS_DB" "
        SELECT DISTINCT s1.service, s2.service
        FROM spans s1
        JOIN spans s2 ON s1.span_id = s2.parent_span_id
        WHERE s1.service != s2.service
    " | while IFS='|' read -r from to; do
        echo " $from -> $to"
    done
    echo
    echo "Service stats (last hour):"
    sqlite3 "$OBS_DB" "
        SELECT service, COUNT(*), AVG(duration_ms),
            SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) * 100.0 / COUNT(*)
        FROM spans
        WHERE datetime(started_at, '+1 hour') > datetime('now')
        GROUP BY service
    " | while IFS='|' read -r service count avg_lat error_rate; do
        printf " %-20s spans:%d avg:%.0fms err:%.1f%%\n" "$service" "$count" "$avg_lat" "$error_rate"
    done
}
# Error analysis
# Show recent error log rows (joined back to their span and trace) plus the
# per-service error rate over the last hour.
#   $1 - max error rows to display (default 20)
errors() {
    local limit="${1:-20}"
    echo -e "${PINK}=== ERROR ANALYSIS ===${RESET}"
    echo
    echo "Recent errors:"
    sqlite3 "$OBS_DB" "
        SELECT t.trace_id, t.name, s.service, s.node, l.message, l.timestamp
        FROM logs l
        JOIN spans s ON l.span_id = s.span_id
        JOIN traces t ON l.trace_id = t.trace_id
        WHERE l.level = 'error'
        ORDER BY l.timestamp DESC LIMIT $limit
    " | while IFS='|' read -r trace name service node msg ts; do
        echo -e "${RED}[$ts] $service@$node${RESET}"
        echo " Trace: $trace ($name)"
        echo " Error: $msg"
        echo
    done
    echo "Error rates by service:"
    sqlite3 "$OBS_DB" "
        SELECT service, COUNT(*) as total,
            SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) as errors
        FROM spans
        WHERE datetime(started_at, '+1 hour') > datetime('now')
        GROUP BY service
        HAVING errors > 0
        ORDER BY errors DESC
    " | while IFS='|' read -r service total errors; do
        # NOTE(review): requires `bc` on PATH for the percentage — confirm
        # it is installed on all nodes running this script.
        local rate=$(echo "scale=1; $errors * 100 / $total" | bc)
        printf " %-20s %d/%d (%.1f%%)\n" "$service" "$errors" "$total" "$rate"
    done
}
# Dashboard
# Clear the screen and render a one-shot overview: last-hour trace volume,
# the five busiest services, and the three most recent error logs.
dashboard() {
    clear
    echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
    echo -e "${PINK}║ 👁️ OBSERVABILITY DASHBOARD ║${RESET}"
    echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
    echo
    # "Last hour" filter: start time shifted forward 1h is still in the future.
    local total_traces=$(sqlite3 "$OBS_DB" "SELECT COUNT(*) FROM traces WHERE datetime(started_at, '+1 hour') > datetime('now')")
    local error_traces=$(sqlite3 "$OBS_DB" "SELECT COUNT(*) FROM traces WHERE status = 'error' AND datetime(started_at, '+1 hour') > datetime('now')")
    local avg_duration=$(sqlite3 "$OBS_DB" "SELECT AVG(duration_ms) FROM traces WHERE datetime(started_at, '+1 hour') > datetime('now')")
    echo "Last Hour:"
    # ${avg_duration:-0}: AVG() is empty when there are no rows.
    printf " Traces: %d | Errors: %d | Avg Duration: %.0fms\n" "$total_traces" "$error_traces" "${avg_duration:-0}"
    echo
    echo "─────────────────────────────────────────────────────────────────"
    echo "Active Services:"
    sqlite3 "$OBS_DB" "
        SELECT service, COUNT(*), AVG(duration_ms)
        FROM spans WHERE datetime(started_at, '+1 hour') > datetime('now')
        GROUP BY service ORDER BY COUNT(*) DESC LIMIT 5
    " | while IFS='|' read -r service count avg; do
        printf " %-20s %d spans (avg: %.0fms)\n" "$service" "$count" "$avg"
    done
    echo
    echo "─────────────────────────────────────────────────────────────────"
    echo "Recent Errors:"
    # `ts` is read but intentionally not displayed here.
    sqlite3 "$OBS_DB" "
        SELECT service, message, timestamp FROM logs
        WHERE level = 'error' ORDER BY timestamp DESC LIMIT 3
    " | while IFS='|' read -r service msg ts; do
        echo -e " ${RED}$service: $msg${RESET}"
    done
}
# Clean old data
# Purge rows older than N days (default 7) from every table. SQLite
# autocommits each DELETE, matching the previous multi-statement behavior.
cleanup() {
    local keep_days="${1:-7}"
    local entry tbl col
    # table:timestamp-column pairs to sweep.
    for entry in "logs:timestamp" "spans:started_at" "traces:started_at" "metrics:timestamp"; do
        IFS=':' read -r tbl col <<< "$entry"
        sqlite3 "$OBS_DB" "DELETE FROM ${tbl} WHERE datetime(${col}, '+$keep_days days') < datetime('now');"
    done
    echo -e "${GREEN}Cleaned data older than $keep_days days${RESET}"
}
# Help
# Print CLI usage. Output matches the previous echo-based version; printf
# '%b' interprets the embedded color escapes just like `echo -e`.
help() {
    printf '%b\n' "${PINK}BlackRoad Observability${RESET}"
    printf '\n'
    printf '%s\n' "Distributed tracing and observability"
    printf '\n'
    printf '%s\n' "Tracing:"
    printf '%s\n' " trace-start <name> [service] Start trace"
    printf '%s\n' " trace-end <trace_id> [status] End trace"
    printf '%s\n' " span-start <trace> <name> [svc] Start span"
    printf '%s\n' " span-end <span_id> [status] End span"
    printf '%s\n' " span-log <span_id> <msg> [level] Add log"
    printf '%s\n' " view <trace_id> View trace"
    printf '%s\n' " list [limit] [filter] List traces"
    printf '\n'
    printf '%s\n' "Metrics & Logs:"
    printf '%s\n' " metric <name> <value> [tags] Record metric"
    printf '%s\n' " search <query> [limit] Search logs"
    printf '%s\n' " metrics [period] Metrics summary"
    printf '\n'
    printf '%s\n' "Analysis:"
    printf '%s\n' " service-map Service dependencies"
    printf '%s\n' " errors [limit] Error analysis"
    printf '%s\n' " dashboard Overview dashboard"
    printf '%s\n' " cleanup [days] Clean old data"
    printf '\n'
    printf '%s\n' "Examples:"
    printf '%s\n' " trace=\$($0 trace-start 'inference' 'api')"
    printf '%s\n' " span=\$($0 span-start \$trace 'generate' 'llm')"
    printf '%s\n' " $0 span-log \$span 'Processing request'"
    printf '%s\n' " $0 span-end \$span"
    printf '%s\n' " $0 trace-end \$trace"
}
# Ensure initialized (quiet first-run bootstrap of the SQLite schema)
[ -f "$OBS_DB" ] || init >/dev/null

# Command dispatch: $1 selects the action, remaining args go to the
# handler. Unknown/absent commands fall through to help.
case "${1:-help}" in
    init)
        init
        ;;
    trace-start)
        # $2=name, $3=service, $4=metadata JSON; prints new trace id.
        trace_start "$2" "$3" "$4"
        ;;
    trace-end)
        trace_end "$2" "$3"
        ;;
    span-start)
        # $2=trace, $3=name, $4=service, $5=parent span, $6=node.
        span_start "$2" "$3" "$4" "$5" "$6"
        ;;
    span-end)
        span_end "$2" "$3" "$4"
        ;;
    span-log|log)
        span_log "$2" "$3" "$4"
        ;;
    view)
        view_trace "$2"
        ;;
    list|traces)
        list_traces "$2" "$3"
        ;;
    metric)
        metric "$2" "$3" "$4" "$5"
        ;;
    search)
        search_logs "$2" "$3"
        ;;
    metrics)
        metrics_summary "$2"
        ;;
    service-map|map)
        service_map
        ;;
    errors)
        errors "$2"
        ;;
    dashboard|dash)
        dashboard
        ;;
    cleanup)
        cleanup "$2"
        ;;
    *)
        help
        ;;
esac

561
scripts/vault-universal.sh Executable file
View File

@@ -0,0 +1,561 @@
#!/bin/bash
# 🔐 BLACKROAD VAULT - UNIVERSAL CREDENTIAL MANAGER
#
# Philosophy: If a human has to paste an API key, the automation failed.
#
# Supports: 50+ services across all categories
# - Social Media: Instagram, Facebook, Twitter, LinkedIn, TikTok, YouTube
# - AI Providers: OpenAI, Anthropic, Google AI, Cohere, Hugging Face
# - Cloud: AWS, GCP, Azure, DigitalOcean, Linode, Vultr
# - Payments: Stripe, PayPal, Square
# - Auth: Clerk, Auth0, Firebase, Supabase
# - Infrastructure: Railway, Vercel, Netlify, Cloudflare, Heroku
# - Development: GitHub, GitLab, Bitbucket
# - Analytics: Google Analytics, Mixpanel, Amplitude
# - Communication: Slack, Discord, Telegram, Twilio
# - And more...
set -e

# Credentials are stored one per file under this directory, owner-only
# (dir 700, files 600). Each discover_* writes a fixed filename here.
VAULT_DIR="$HOME/.blackroad/vault"
mkdir -p "$VAULT_DIR"
chmod 700 "$VAULT_DIR"

# ANSI colors for discovery output.
PINK='\033[38;5;205m'
GREEN='\033[38;5;82m'
BLUE='\033[38;5;69m'
AMBER='\033[38;5;214m'
RED='\033[38;5;196m'
RESET='\033[0m'

echo -e "${PINK}╔════════════════════════════════════════════╗${RESET}"
echo -e "${PINK}║ 🔐 BLACKROAD UNIVERSAL VAULT ║${RESET}"
echo -e "${PINK}╚════════════════════════════════════════════╝${RESET}"
echo ""
# ============================================================================
# PAYMENT PROCESSORS
# ============================================================================
# Stripe: prefer the logged-in Stripe CLI, then $STRIPE_SECRET_KEY.
# Saves to $VAULT_DIR/stripe_secret_key; returns 1 with a hint otherwise.
discover_stripe() {
    echo -e "${BLUE}💳 Stripe...${RESET}"
    if command -v stripe &> /dev/null && stripe config --list &> /dev/null 2>&1; then
        # NOTE(review): assumes `stripe config --list` prints lines like
        # "secret_key = <value>" (value in awk field 3) — confirm against
        # the installed CLI version.
        SECRET_KEY=$(stripe config --list 2>/dev/null | grep "secret_key" | awk '{print $3}')
        if [ -n "$SECRET_KEY" ]; then
            echo "$SECRET_KEY" > "$VAULT_DIR/stripe_secret_key"
            chmod 600 "$VAULT_DIR/stripe_secret_key"
            echo -e "${GREEN} ✅ Saved${RESET}"
            return 0
        fi
    fi
    # Environment-variable fallback.
    [ -n "$STRIPE_SECRET_KEY" ] && echo "$STRIPE_SECRET_KEY" > "$VAULT_DIR/stripe_secret_key" && chmod 600 "$VAULT_DIR/stripe_secret_key" && echo -e "${GREEN} ✅ From env${RESET}" && return 0
    echo -e "${AMBER} ⚠️ Run 'stripe login'${RESET}"
    return 1
}
# PayPal: env-var only — $PAYPAL_CLIENT_ID → vault file paypal_client_id.
# All fallible commands stay inside the `if` condition so `set -e` never
# fires here (matching the original && chain's semantics).
discover_paypal() {
    echo -e "${BLUE}💳 PayPal...${RESET}"
    if [ -n "$PAYPAL_CLIENT_ID" ] &&
       echo "$PAYPAL_CLIENT_ID" > "$VAULT_DIR/paypal_client_id" &&
       chmod 600 "$VAULT_DIR/paypal_client_id"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://developer.paypal.com${RESET}"
    return 1
}

# ============================================================================
# SOCIAL MEDIA & MARKETING
# ============================================================================

# Instagram: env-var only — $INSTAGRAM_ACCESS_TOKEN.
discover_instagram() {
    echo -e "${BLUE}📸 Instagram...${RESET}"
    if [ -n "$INSTAGRAM_ACCESS_TOKEN" ] &&
       echo "$INSTAGRAM_ACCESS_TOKEN" > "$VAULT_DIR/instagram_access_token" &&
       chmod 600 "$VAULT_DIR/instagram_access_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://developers.facebook.com/apps${RESET}"
    return 1
}

# Facebook: env-var only — $FACEBOOK_ACCESS_TOKEN.
discover_facebook() {
    echo -e "${BLUE}📘 Facebook...${RESET}"
    if [ -n "$FACEBOOK_ACCESS_TOKEN" ] &&
       echo "$FACEBOOK_ACCESS_TOKEN" > "$VAULT_DIR/facebook_access_token" &&
       chmod 600 "$VAULT_DIR/facebook_access_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://developers.facebook.com${RESET}"
    return 1
}

# Twitter/X: env-var only — $TWITTER_API_KEY.
discover_twitter() {
    echo -e "${BLUE}🐦 Twitter/X...${RESET}"
    if [ -n "$TWITTER_API_KEY" ] &&
       echo "$TWITTER_API_KEY" > "$VAULT_DIR/twitter_api_key" &&
       chmod 600 "$VAULT_DIR/twitter_api_key"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://developer.twitter.com${RESET}"
    return 1
}

# LinkedIn: env-var only — $LINKEDIN_ACCESS_TOKEN.
discover_linkedin() {
    echo -e "${BLUE}💼 LinkedIn...${RESET}"
    if [ -n "$LINKEDIN_ACCESS_TOKEN" ] &&
       echo "$LINKEDIN_ACCESS_TOKEN" > "$VAULT_DIR/linkedin_access_token" &&
       chmod 600 "$VAULT_DIR/linkedin_access_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://www.linkedin.com/developers${RESET}"
    return 1
}

# TikTok: env-var only — $TIKTOK_ACCESS_TOKEN.
discover_tiktok() {
    echo -e "${BLUE}🎵 TikTok...${RESET}"
    if [ -n "$TIKTOK_ACCESS_TOKEN" ] &&
       echo "$TIKTOK_ACCESS_TOKEN" > "$VAULT_DIR/tiktok_access_token" &&
       chmod 600 "$VAULT_DIR/tiktok_access_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://developers.tiktok.com${RESET}"
    return 1
}

# YouTube: env-var only — $YOUTUBE_API_KEY.
discover_youtube() {
    echo -e "${BLUE}📺 YouTube...${RESET}"
    if [ -n "$YOUTUBE_API_KEY" ] &&
       echo "$YOUTUBE_API_KEY" > "$VAULT_DIR/youtube_api_key" &&
       chmod 600 "$VAULT_DIR/youtube_api_key"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://console.cloud.google.com${RESET}"
    return 1
}
# ============================================================================
# AI PROVIDERS
# ============================================================================
# OpenAI: env-var only — $OPENAI_API_KEY. Fallible commands stay inside the
# `if` condition so `set -e` never fires (same semantics as the && chain).
discover_openai() {
    echo -e "${BLUE}🤖 OpenAI...${RESET}"
    if [ -n "$OPENAI_API_KEY" ] &&
       echo "$OPENAI_API_KEY" > "$VAULT_DIR/openai_api_key" &&
       chmod 600 "$VAULT_DIR/openai_api_key"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://platform.openai.com/api-keys${RESET}"
    return 1
}

# Anthropic: env-var only — $ANTHROPIC_API_KEY.
discover_anthropic() {
    echo -e "${BLUE}🤖 Anthropic...${RESET}"
    if [ -n "$ANTHROPIC_API_KEY" ] &&
       echo "$ANTHROPIC_API_KEY" > "$VAULT_DIR/anthropic_api_key" &&
       chmod 600 "$VAULT_DIR/anthropic_api_key"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://console.anthropic.com${RESET}"
    return 1
}

# Google AI: env-var only — $GOOGLE_AI_API_KEY.
discover_google_ai() {
    echo -e "${BLUE}🤖 Google AI...${RESET}"
    if [ -n "$GOOGLE_AI_API_KEY" ] &&
       echo "$GOOGLE_AI_API_KEY" > "$VAULT_DIR/google_ai_api_key" &&
       chmod 600 "$VAULT_DIR/google_ai_api_key"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://aistudio.google.com${RESET}"
    return 1
}

# Cohere: env-var only — $COHERE_API_KEY.
discover_cohere() {
    echo -e "${BLUE}🤖 Cohere...${RESET}"
    if [ -n "$COHERE_API_KEY" ] &&
       echo "$COHERE_API_KEY" > "$VAULT_DIR/cohere_api_key" &&
       chmod 600 "$VAULT_DIR/cohere_api_key"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://dashboard.cohere.ai${RESET}"
    return 1
}

# Hugging Face: env var first, then the token cached by huggingface-cli.
discover_huggingface() {
    echo -e "${BLUE}🤖 Hugging Face...${RESET}"
    if [ -n "$HUGGINGFACE_TOKEN" ] &&
       echo "$HUGGINGFACE_TOKEN" > "$VAULT_DIR/huggingface_token" &&
       chmod 600 "$VAULT_DIR/huggingface_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    # Check huggingface-cli
    if [ -f ~/.huggingface/token ]; then
        cat ~/.huggingface/token > "$VAULT_DIR/huggingface_token"
        chmod 600 "$VAULT_DIR/huggingface_token"
        echo -e "${GREEN} ✅ From CLI${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Run 'huggingface-cli login'${RESET}"
    return 1
}
# ============================================================================
# CLOUD PROVIDERS
# ============================================================================
# AWS: prefer ~/.aws/credentials, then $AWS_ACCESS_KEY_ID.
# Saves only the access key id (not the secret) to aws_access_key_id.
discover_aws() {
    echo -e "${BLUE}☁️ AWS...${RESET}"
    if [ -f ~/.aws/credentials ]; then
        # NOTE(review): takes the first "aws_access_key_id" line in the
        # file, i.e. whichever profile appears first — confirm that is the
        # intended profile when multiple exist.
        AWS_KEY=$(grep "aws_access_key_id" ~/.aws/credentials | head -1 | cut -d= -f2 | tr -d ' ')
        if [ -n "$AWS_KEY" ]; then
            echo "$AWS_KEY" > "$VAULT_DIR/aws_access_key_id"
            chmod 600 "$VAULT_DIR/aws_access_key_id"
            echo -e "${GREEN} ✅ From ~/.aws/credentials${RESET}"
            return 0
        fi
    fi
    # Environment-variable fallback.
    [ -n "$AWS_ACCESS_KEY_ID" ] && echo "$AWS_ACCESS_KEY_ID" > "$VAULT_DIR/aws_access_key_id" && chmod 600 "$VAULT_DIR/aws_access_key_id" && echo -e "${GREEN} ✅ From env${RESET}" && return 0
    echo -e "${AMBER} ⚠️ Run 'aws configure'${RESET}"
    return 1
}
# Google Cloud: record the active gcloud project id, if one is configured.
# (GCP_PROJECT intentionally stays global, as before.)
discover_gcp() {
    echo -e "${BLUE}☁️ Google Cloud...${RESET}"
    GCP_PROJECT=""
    if command -v gcloud &> /dev/null; then
        GCP_PROJECT=$(gcloud config get-value project 2>/dev/null)
    fi
    if [ -n "$GCP_PROJECT" ]; then
        echo "$GCP_PROJECT" > "$VAULT_DIR/gcp_project_id"
        chmod 600 "$VAULT_DIR/gcp_project_id"
        echo -e "${GREEN} ✅ Configured${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Run 'gcloud init'${RESET}"
    return 1
}
# Azure: record the active subscription id when `az` is logged in.
# (AZ_SUB intentionally stays global, as before.)
discover_azure() {
    echo -e "${BLUE}☁️ Azure...${RESET}"
    if command -v az &> /dev/null && az account show &> /dev/null; then
        AZ_SUB=$(az account show --query id -o tsv 2>/dev/null)
        if [ -n "$AZ_SUB" ]; then
            echo "$AZ_SUB" > "$VAULT_DIR/azure_subscription_id"
            chmod 600 "$VAULT_DIR/azure_subscription_id"
            echo -e "${GREEN} ✅ Logged in${RESET}"
            return 0
        fi
    fi
    echo -e "${AMBER} ⚠️ Run 'az login'${RESET}"
    return 1
}
# DigitalOcean: env-var only — $DIGITALOCEAN_TOKEN. Fallible commands stay
# inside the `if` condition so `set -e` never fires here.
discover_digitalocean() {
    echo -e "${BLUE}☁️ DigitalOcean...${RESET}"
    if [ -n "$DIGITALOCEAN_TOKEN" ] &&
       echo "$DIGITALOCEAN_TOKEN" > "$VAULT_DIR/digitalocean_token" &&
       chmod 600 "$VAULT_DIR/digitalocean_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://cloud.digitalocean.com/account/api${RESET}"
    return 1
}
# ============================================================================
# DEVELOPMENT & HOSTING
# ============================================================================
# GitHub: prefer the token from an authenticated gh CLI session, then
# $GITHUB_TOKEN. Saves to github_token.
discover_github() {
    echo -e "${BLUE}🐙 GitHub...${RESET}"
    if command -v gh &> /dev/null && gh auth status &> /dev/null 2>&1; then
        GH_TOKEN=$(gh auth token 2>/dev/null)
        if [ -n "$GH_TOKEN" ]; then
            echo "$GH_TOKEN" > "$VAULT_DIR/github_token"
            chmod 600 "$VAULT_DIR/github_token"
            echo -e "${GREEN} ✅ From gh CLI${RESET}"
            return 0
        fi
    fi
    # Environment-variable fallback; kept inside the if-condition so a
    # failed write does not trip `set -e` (matching the old && chain).
    if [ -n "$GITHUB_TOKEN" ] &&
       echo "$GITHUB_TOKEN" > "$VAULT_DIR/github_token" &&
       chmod 600 "$VAULT_DIR/github_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Run 'gh auth login'${RESET}"
    return 1
}
# GitLab: env-var only — $GITLAB_TOKEN → vault file gitlab_token.
discover_gitlab() {
    echo -e "${BLUE}🦊 GitLab...${RESET}"
    if [ -n "$GITLAB_TOKEN" ] &&
       echo "$GITLAB_TOKEN" > "$VAULT_DIR/gitlab_token" &&
       chmod 600 "$VAULT_DIR/gitlab_token"; then
        echo -e "${GREEN} ✅ From env${RESET}"
        return 0
    fi
    echo -e "${AMBER} ⚠️ Get from https://gitlab.com/-/profile/personal_access_tokens${RESET}"
    return 1
}
# Railway: read the CLI's cached token after `railway login`.
# NOTE(review): requires `jq`; also assumes the token lives at .token in
# ~/.config/railway/config.json — confirm against the CLI version in use.
discover_railway() {
    echo -e "${BLUE}🚂 Railway...${RESET}"
    if command -v railway &> /dev/null && railway whoami &> /dev/null 2>&1; then
        RAILWAY_TOKEN=$(cat ~/.config/railway/config.json 2>/dev/null | jq -r '.token' 2>/dev/null)
        # jq emits the literal string "null" for a missing key — reject it.
        if [ -n "$RAILWAY_TOKEN" ] && [ "$RAILWAY_TOKEN" != "null" ]; then
            echo "$RAILWAY_TOKEN" > "$VAULT_DIR/railway_token"
            chmod 600 "$VAULT_DIR/railway_token"
            echo -e "${GREEN} ✅ From CLI${RESET}"
            return 0
        fi
    fi
    echo -e "${AMBER} ⚠️ Run 'railway login'${RESET}"
    return 1
}
# Vercel: prefer the CLI's cached auth token, then $VERCEL_TOKEN.
discover_vercel() {
    echo -e "${BLUE}▲ Vercel...${RESET}"
    # Bug fix: the old guard tested ~/.config/configstore/update-notifier-
    # vercel.json (an update-check cache) before reading ~/.vercel/auth.json;
    # test for the file we actually read. Also reject jq's literal "null"
    # output (same guard discover_railway uses).
    if [ -f ~/.vercel/auth.json ]; then
        VERCEL_TOKEN=$(cat ~/.vercel/auth.json 2>/dev/null | jq -r '.token' 2>/dev/null)
        if [ -n "$VERCEL_TOKEN" ] && [ "$VERCEL_TOKEN" != "null" ]; then
            echo "$VERCEL_TOKEN" > "$VAULT_DIR/vercel_token"
            chmod 600 "$VAULT_DIR/vercel_token"
            echo -e "${GREEN} ✅ From CLI${RESET}"
            return 0
        fi
    fi
    # Environment-variable fallback.
    [ -n "$VERCEL_TOKEN" ] && echo "$VERCEL_TOKEN" > "$VAULT_DIR/vercel_token" && chmod 600 "$VAULT_DIR/vercel_token" && echo -e "${GREEN} ✅ From env${RESET}" && return 0
    echo -e "${AMBER} ⚠️ Run 'vercel login'${RESET}"
    return 1
}
# Cloudflare: prefer the wrangler CLI config, then $CLOUDFLARE_API_TOKEN.
discover_cloudflare() {
    echo -e "${BLUE}☁️ Cloudflare...${RESET}"
    if [ -f ~/.wrangler/config/default.toml ]; then
        # NOTE(review): assumes an `api_token = "<value>"` line in wrangler's
        # legacy TOML config — newer wrangler versions store auth elsewhere;
        # confirm against the installed version.
        CF_TOKEN=$(grep "api_token" ~/.wrangler/config/default.toml | cut -d'"' -f2)
        if [ -n "$CF_TOKEN" ]; then
            echo "$CF_TOKEN" > "$VAULT_DIR/cloudflare_api_token"
            chmod 600 "$VAULT_DIR/cloudflare_api_token"
            echo -e "${GREEN} ✅ From wrangler${RESET}"
            return 0
        fi
    fi
    # Environment-variable fallback.
    [ -n "$CLOUDFLARE_API_TOKEN" ] && echo "$CLOUDFLARE_API_TOKEN" > "$VAULT_DIR/cloudflare_api_token" && chmod 600 "$VAULT_DIR/cloudflare_api_token" && echo -e "${GREEN} ✅ From env${RESET}" && return 0
    echo -e "${AMBER} ⚠️ Run 'wrangler login'${RESET}"
    return 1
}
# ============================================================================
# AUTH PROVIDERS
# ============================================================================
discover_clerk() {
    # Store the Clerk secret key when it is exported in the environment.
    echo -e "${BLUE}🔐 Clerk...${RESET}"
    if [ -n "$CLERK_SECRET_KEY" ]; then
        echo "$CLERK_SECRET_KEY" > "$VAULT_DIR/clerk_secret_key" \
            && chmod 600 "$VAULT_DIR/clerk_secret_key" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://dashboard.clerk.com${RESET}"
    return 1
}
discover_auth0() {
    # Store the Auth0 client secret when it is exported in the environment.
    echo -e "${BLUE}🔐 Auth0...${RESET}"
    if [ -n "$AUTH0_CLIENT_SECRET" ]; then
        echo "$AUTH0_CLIENT_SECRET" > "$VAULT_DIR/auth0_client_secret" \
            && chmod 600 "$VAULT_DIR/auth0_client_secret" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://manage.auth0.com${RESET}"
    return 1
}
discover_supabase() {
    # Store the Supabase anon key when it is exported in the environment.
    echo -e "${BLUE}🔐 Supabase...${RESET}"
    if [ -n "$SUPABASE_ANON_KEY" ]; then
        echo "$SUPABASE_ANON_KEY" > "$VAULT_DIR/supabase_anon_key" \
            && chmod 600 "$VAULT_DIR/supabase_anon_key" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://app.supabase.com${RESET}"
    return 1
}
# ============================================================================
# COMMUNICATION
# ============================================================================
discover_slack() {
    # Store the Slack bot token when it is exported in the environment.
    echo -e "${BLUE}💬 Slack...${RESET}"
    if [ -n "$SLACK_BOT_TOKEN" ]; then
        echo "$SLACK_BOT_TOKEN" > "$VAULT_DIR/slack_bot_token" \
            && chmod 600 "$VAULT_DIR/slack_bot_token" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://api.slack.com/apps${RESET}"
    return 1
}
discover_discord() {
    # Store the Discord bot token when it is exported in the environment.
    echo -e "${BLUE}💬 Discord...${RESET}"
    if [ -n "$DISCORD_BOT_TOKEN" ]; then
        echo "$DISCORD_BOT_TOKEN" > "$VAULT_DIR/discord_bot_token" \
            && chmod 600 "$VAULT_DIR/discord_bot_token" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://discord.com/developers${RESET}"
    return 1
}
discover_telegram() {
    # Store the Telegram bot token when it is exported in the environment.
    echo -e "${BLUE}💬 Telegram...${RESET}"
    if [ -n "$TELEGRAM_BOT_TOKEN" ]; then
        echo "$TELEGRAM_BOT_TOKEN" > "$VAULT_DIR/telegram_bot_token" \
            && chmod 600 "$VAULT_DIR/telegram_bot_token" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from @BotFather${RESET}"
    return 1
}
discover_twilio() {
    # Store the Twilio auth token when it is exported in the environment.
    echo -e "${BLUE}📱 Twilio...${RESET}"
    if [ -n "$TWILIO_AUTH_TOKEN" ]; then
        echo "$TWILIO_AUTH_TOKEN" > "$VAULT_DIR/twilio_auth_token" \
            && chmod 600 "$VAULT_DIR/twilio_auth_token" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://www.twilio.com/console${RESET}"
    return 1
}
# ============================================================================
# ANALYTICS
# ============================================================================
discover_google_analytics() {
    # Store the GA measurement ID when it is exported in the environment.
    echo -e "${BLUE}📊 Google Analytics...${RESET}"
    if [ -n "$GA_MEASUREMENT_ID" ]; then
        echo "$GA_MEASUREMENT_ID" > "$VAULT_DIR/ga_measurement_id" \
            && chmod 600 "$VAULT_DIR/ga_measurement_id" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://analytics.google.com${RESET}"
    return 1
}
discover_mixpanel() {
    # Store the Mixpanel project token when it is exported in the environment.
    echo -e "${BLUE}📊 Mixpanel...${RESET}"
    if [ -n "$MIXPANEL_TOKEN" ]; then
        echo "$MIXPANEL_TOKEN" > "$VAULT_DIR/mixpanel_token" \
            && chmod 600 "$VAULT_DIR/mixpanel_token" \
            && echo -e "${GREEN}    ✅ From env${RESET}" \
            && return 0
    fi
    echo -e "${AMBER}    ⚠️  Get from https://mixpanel.com/settings/project${RESET}"
    return 1
}
# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
load_vault() {
    # Emit an "export NAME=value" line per vault file, for eval/source by callers
    # (usage: source <(./blackroad-vault-universal.sh load)).
    # Fix: the old single-quoted interpolation (export X='$val') broke — and was
    # shell-injectable — whenever a credential contained a single quote.
    # printf %q shell-escapes the value so any byte sequence round-trips safely.
    local key_file key_name key_value
    for key_file in "$VAULT_DIR"/*; do
        if [ -f "$key_file" ]; then
            key_name=$(basename "$key_file" | tr '[:lower:]' '[:upper:]')
            key_value=$(cat "$key_file")
            printf 'export %s=%q\n' "$key_name" "$key_value"
        fi
    done
}
show_vault() {
    # Print a configured/missing line for every known service plus totals.
    echo ""
    echo -e "${PINK}═══════════════════════════════════════════${RESET}"
    echo -e "${BLUE}📋 Vault Status${RESET}"
    echo -e "${PINK}═══════════════════════════════════════════${RESET}"
    echo ""
    local total=0
    local configured=0
    local service prefix
    for service in stripe paypal instagram facebook twitter linkedin tiktok youtube \
        openai anthropic google_ai cohere huggingface \
        aws gcp azure digitalocean \
        github gitlab railway vercel cloudflare \
        clerk auth0 supabase \
        slack discord telegram twilio \
        google_analytics mixpanel; do
        total=$((total + 1))
        # Fix: Google Analytics credentials are stored as ga_measurement_id
        # (see discover_google_analytics), so the old google_analytics_* glob
        # never matched and the service always showed as unconfigured.
        prefix="$service"
        if [ "$service" = "google_analytics" ]; then
            prefix="ga"
        fi
        # compgen -G tests the glob in-process (was: ls with a redundant
        # double redirection '&>/dev/null 2>&1').
        if compgen -G "$VAULT_DIR/${prefix}_*" > /dev/null; then
            echo -e "${GREEN}✅ $service${RESET}"
            configured=$((configured + 1))
        else
            echo -e "${AMBER}⚠️  $service${RESET}"
        fi
    done
    echo ""
    echo -e "${BLUE}Configured: $configured / $total services${RESET}"
    echo -e "${BLUE}Vault: $VAULT_DIR${RESET}"
    echo -e "${BLUE}Files: $(ls -1 "$VAULT_DIR" 2>/dev/null | wc -l | tr -d ' ')${RESET}"
}
create_env_file() {
    # Render every vault entry into a KEY=value env file (default: ./.env),
    # prefixed with a generated-file header, and lock it down to mode 600.
    local target_file="${1:-.env}"
    echo -e "${BLUE}📝 Creating $target_file...${RESET}"
    # Fix: the heredoc delimiter was quoted ('EOF'), so the header contained
    # the literal text "$(date)" instead of the actual timestamp. An unquoted
    # delimiter lets the command substitution expand as intended.
    cat > "$target_file" << EOF
# Auto-generated from BlackRoad Universal Vault
# DO NOT EDIT - Run ./blackroad-vault-universal.sh to update
# Generated: $(date)
EOF
    local key_file key_name key_value
    for key_file in "$VAULT_DIR"/*; do
        if [ -f "$key_file" ]; then
            key_name=$(basename "$key_file" | tr '[:lower:]' '[:upper:]')
            key_value=$(cat "$key_file")
            echo "$key_name=$key_value" >> "$target_file"
        fi
    done
    chmod 600 "$target_file"
    echo -e "${GREEN}✅ Created $target_file${RESET}"
}
# ============================================================================
# MAIN EXECUTION
# ============================================================================
# Command dispatcher. With no argument, defaults to a full discovery pass.
case "${1:-discover}" in
# discover: probe every supported service (env vars, CLI configs) and cache
# whatever credentials are found under $VAULT_DIR, then print a status report.
discover)
echo -e "${PINK}🔍 Discovering credentials from 40+ services...${RESET}"
echo ""
# Payments
echo -e "${PINK}━━ PAYMENTS ━━━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
# Each discover_* returns non-zero when nothing was found; '|| true' keeps the
# sweep going (and is safe under a possible 'set -e') so one miss never aborts it.
discover_stripe || true
discover_paypal || true
# Social Media
echo ""
echo -e "${PINK}━━ SOCIAL MEDIA ━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
discover_instagram || true
discover_facebook || true
discover_twitter || true
discover_linkedin || true
discover_tiktok || true
discover_youtube || true
# AI Providers
echo ""
echo -e "${PINK}━━ AI PROVIDERS ━━━━━━━━━━━━━━━━━━━━━━${RESET}"
discover_openai || true
discover_anthropic || true
discover_google_ai || true
discover_cohere || true
discover_huggingface || true
# Cloud
echo ""
echo -e "${PINK}━━ CLOUD PROVIDERS ━━━━━━━━━━━━━━━━━━━${RESET}"
discover_aws || true
discover_gcp || true
discover_azure || true
discover_digitalocean || true
# Development
echo ""
echo -e "${PINK}━━ DEVELOPMENT ━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
discover_github || true
discover_gitlab || true
discover_railway || true
discover_vercel || true
discover_cloudflare || true
# Auth
echo ""
echo -e "${PINK}━━ AUTH PROVIDERS ━━━━━━━━━━━━━━━━━━━━${RESET}"
discover_clerk || true
discover_auth0 || true
discover_supabase || true
# Communication
echo ""
echo -e "${PINK}━━ COMMUNICATION ━━━━━━━━━━━━━━━━━━━━━${RESET}"
discover_slack || true
discover_discord || true
discover_telegram || true
discover_twilio || true
# Analytics
echo ""
echo -e "${PINK}━━ ANALYTICS ━━━━━━━━━━━━━━━━━━━━━━━━━${RESET}"
discover_google_analytics || true
discover_mixpanel || true
# Summarize what is now present in the vault.
show_vault
# Log to memory
# Best-effort: record the sweep in the external memory system if installed.
if command -v ~/memory-system.sh &> /dev/null; then
~/memory-system.sh log "vault-discovery" "universal-vault" "Discovered credentials from 40+ services. Configured: $(ls -1 "$VAULT_DIR" 2>/dev/null | wc -l) keys" "vault,automation,credentials"
fi
echo ""
echo -e "${PINK}╔════════════════════════════════════════════╗${RESET}"
echo -e "${PINK}║  ✅ UNIVERSAL VAULT READY                  ║${RESET}"
echo -e "${PINK}╚════════════════════════════════════════════╝${RESET}"
echo ""
echo -e "${BLUE}Usage in scripts:${RESET}"
echo -e "  source <(./blackroad-vault-universal.sh load)"
echo ""
echo -e "${BLUE}Generate .env:${RESET}"
echo -e "  ./blackroad-vault-universal.sh env .env"
echo ""
echo -e "${GREEN}Philosophy: One-time login → Forever automated${RESET}"
;;
# load: print export statements for every cached credential (for eval/source).
load)
load_vault
;;
# show: print the configured/missing report only, without re-discovering.
show)
show_vault
;;
# env: write cached credentials to a .env file ($2, default ./.env).
env)
create_env_file "$2"
;;
# Anything else: print usage and exit non-zero.
*)
echo -e "${RED}Unknown command: $1${RESET}"
echo ""
echo "Usage: $0 [discover|load|show|env]"
echo "  discover - Auto-discover all credentials"
echo "  load     - Export credentials to environment"
echo "  show     - Show vault status"
echo "  env      - Create .env file"
exit 1
;;
esac