bin/ 230 CLI tools (ask-*, br-*, agent-*, roadid, carpool) scripts/ 99 automation scripts fleet/ Node configs and deployment workers/ Cloudflare Worker sources (roadpay, road-search, squad webhooks) roadc/ RoadC programming language roadnet/ Mesh network (5 APs, WireGuard) operator/ Memory system scripts config/ System configs dotfiles/ Shell configs docs/ Documentation BlackRoad OS — Pave Tomorrow. RoadChain-SHA2048: d1a24f55318d338b RoadChain-Identity: alexa@sovereign RoadChain-Full: d1a24f55318d338b24b60bad7be39286379c76ae5470817482100cb0ddbbcb97e147d07ac7243da0a9f0363e4e5c833d612b9c0df3a3cd20802465420278ef74875a5b77f55af6fe42a931b8b635b3d0d0b6bde9abf33dc42eea52bc03c951406d8cbe49f1a3d29b26a94dade05e9477f34a7d4d4c6ec4005c3c2ac54e73a68440c512c8e83fd9b1fe234750b898ef8f4032c23db173961fe225e67a0432b5293a9714f76c5c57ed5fdf35b9fb40fd73c03ebf88b7253c6a0575f5afb6a6b49b3bda310602fb1ef676859962dad2aebbb2875814b30eee0a8ba195e482d4cbc91d8819e7f38f6db53e8063401649c77bb994371473cabfb917fb53e8cbe73d60
521 lines
16 KiB
Bash
Executable File
521 lines
16 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# BlackRoad Observability
# Distributed tracing and observability for the cluster.
#
# Run with no arguments (or "help") for the list of sub-commands.

# -e: stop on the first unhandled command failure.
# pipefail: a failing sqlite3 on the left of "sqlite3 ... | while read" makes
# the whole pipeline fail instead of being hidden by the loop's status.
set -eo pipefail

# Source centralized config when it exists.
NODES_CONFIG="$HOME/.blackroad/config/nodes.sh"
if [[ -f "$NODES_CONFIG" ]]; then
  # shellcheck source=/dev/null
  source "$NODES_CONFIG"
fi

# Colour escapes used in the script's output. Anything the config already
# defined (even as an empty string) is kept; only variables that are still
# unset receive the built-in value, so the colours are always defined whether
# or not the config file provides them.
: "${PINK=\033[38;5;205m}"
: "${GREEN=\033[0;32m}"
: "${BLUE=\033[0;34m}"
: "${YELLOW=\033[1;33m}"
: "${RED=\033[0;31m}"
: "${CYAN=\033[0;36m}"
: "${RESET=\033[0m}"

# Dependency check
for dep in sqlite3; do
  if ! command -v "$dep" &>/dev/null; then
    echo "ERROR: $dep required" >&2
    exit 1
  fi
done

# macOS-compatible nanosecond timestamp
#
# Print the current time as <epoch seconds><9 sub-second digits>.
# Resolution, best first:
#   1. gdate +%s%N when gdate is installed
#   2. date +%s%N when the system date produces an all-digit result
#   3. whole seconds padded with nine zeros (dates without %N support emit a
#      non-numeric string or fail, which is detected below)
# Outputs: the timestamp on stdout.
_timestamp_ns() {
  if command -v gdate &>/dev/null; then
    gdate +%s%N
    return
  fi

  local stamp
  stamp=$(date +%s%N 2>/dev/null) || stamp=""
  if [[ "$stamp" =~ ^[0-9]+$ ]]; then
    printf '%s\n' "$stamp"
  else
    printf '%s000000000\n' "$(date +%s)"
  fi
}

# SQL-safe string escaping (prevent injection)
#
# Double every single quote in $1 so the result can be placed between single
# quotes in a SQL string literal.
# Arguments: $1 - raw text
# Outputs:   escaped text followed by a newline on stdout
_sql_escape() {
  # printf instead of echo: echo would treat values such as "-n" or "-e" as
  # options and print nothing / alter the text. The quote character is held
  # in a variable so the substitution needs no backslash quoting.
  local q="'"
  printf '%s\n' "${1//$q/$q$q}"
}

# Storage locations. OBS_DIR may be supplied through the environment (useful
# for tests or a non-default data directory); otherwise it lives under
# ~/.blackroad. The database file is always traces.db inside OBS_DIR.
OBS_DIR="${OBS_DIR:-$HOME/.blackroad/observability}"
OBS_DB="$OBS_DIR/traces.db"

# Initialize
#
# Create the data directories and the SQLite schema (tables traces, spans,
# metrics, logs plus their indexes). Safe to run repeatedly: mkdir -p is used
# and every CREATE statement carries IF NOT EXISTS.
#
# Globals: OBS_DIR, OBS_DB, GREEN, RESET (read)
# Outputs: one confirmation line on stdout
init() {
  mkdir -p "$OBS_DIR"/{traces,metrics,logs}

  # The PRAGMA statements come first so the busy timeout covers the schema
  # statements run in this same sqlite3 session. sqlite3's stdout is
  # discarded so that any rows the PRAGMAs report do not end up in front of
  # the confirmation message; errors still reach stderr.
  sqlite3 "$OBS_DB" >/dev/null << 'SQL'
PRAGMA journal_mode=WAL;
PRAGMA busy_timeout=5000;

CREATE TABLE IF NOT EXISTS traces (
    trace_id TEXT PRIMARY KEY,
    name TEXT,
    service TEXT,
    started_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    ended_at DATETIME,
    duration_ms INTEGER,
    status TEXT DEFAULT 'in_progress',
    metadata TEXT
);

CREATE TABLE IF NOT EXISTS spans (
    span_id TEXT PRIMARY KEY,
    trace_id TEXT,
    parent_span_id TEXT,
    name TEXT,
    service TEXT,
    node TEXT,
    started_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    ended_at DATETIME,
    duration_ms INTEGER,
    status TEXT DEFAULT 'in_progress',
    tags TEXT,
    logs TEXT,
    FOREIGN KEY (trace_id) REFERENCES traces(trace_id)
);

CREATE TABLE IF NOT EXISTS metrics (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    name TEXT,
    value REAL,
    tags TEXT,
    node TEXT,
    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE TABLE IF NOT EXISTS logs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    trace_id TEXT,
    span_id TEXT,
    level TEXT,
    message TEXT,
    node TEXT,
    service TEXT,
    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
);

CREATE INDEX IF NOT EXISTS idx_trace ON spans(trace_id);
CREATE INDEX IF NOT EXISTS idx_span_parent ON spans(parent_span_id);
CREATE INDEX IF NOT EXISTS idx_metric_name ON metrics(name);
CREATE INDEX IF NOT EXISTS idx_metric_ts ON metrics(timestamp);
CREATE INDEX IF NOT EXISTS idx_log_trace ON logs(trace_id);
CREATE INDEX IF NOT EXISTS idx_log_span ON logs(span_id);
CREATE INDEX IF NOT EXISTS idx_log_level ON logs(level);
CREATE INDEX IF NOT EXISTS idx_trace_status ON traces(status);
CREATE INDEX IF NOT EXISTS idx_trace_started ON traces(started_at);
CREATE INDEX IF NOT EXISTS idx_span_started ON spans(started_at);
SQL

  echo -e "${GREEN}Observability system initialized${RESET}"
}

# Start trace
#
# Insert a new row into `traces` and print its generated id.
# Arguments:
#   $1 - trace name (required)
#   $2 - service name (default: unknown)
#   $3 - metadata text, stored as given (default: {})
# Globals: OBS_DB (read)
# Outputs: the new trace id on stdout
trace_start() {
  local raw_name="${1:?trace name required}"

  # The default is applied in two steps on purpose: the one-liner
  # "${3:-{}}" is parsed by bash as "${3:-{}" followed by a literal "}",
  # which appends a stray brace whenever metadata IS supplied.
  local raw_meta="${3:-}"
  [[ -n "$raw_meta" ]] || raw_meta='{}'

  local name service metadata
  name=$(_sql_escape "$raw_name")
  service=$(_sql_escape "${2:-unknown}")
  metadata=$(_sql_escape "$raw_meta")

  # Random suffix keeps ids unique when two traces start within the same
  # timestamp. Fall back to $RANDOM when openssl is unavailable so the
  # suffix is never empty.
  local suffix
  suffix=$(openssl rand -hex 4 2>/dev/null) || suffix=""
  [[ -n "$suffix" ]] || suffix=$(printf '%04x%04x' "$RANDOM" "$RANDOM")

  local trace_id
  trace_id="trace_$(_timestamp_ns)_${suffix}"

  sqlite3 "$OBS_DB" "
    INSERT INTO traces (trace_id, name, service, metadata)
    VALUES ('$trace_id', '$name', '$service', '$metadata')
  "

  echo "$trace_id"
}

# End trace
#
# Mark a trace as finished: set ended_at, compute duration_ms and store the
# final status, then report the duration.
# Arguments:
#   $1 - trace id (required)
#   $2 - final status (default: success)
# Globals: OBS_DB, GREEN, RESET (read)
# Outputs: confirmation line on stdout; error on stderr for an unknown id
# Returns: 0 on success, 1 when no trace has the given id
trace_end() {
  local trace_id="${1:?trace_id required}"
  local status="${2:-success}"

  local safe_id safe_status
  safe_id=$(_sql_escape "$trace_id")
  safe_status=$(_sql_escape "$status")

  # Duration is computed by SQLite from the stored started_at. The UPDATE
  # and the read-back run in one sqlite3 invocation; the SELECT yields no
  # row (empty output) when the id does not exist.
  local duration_ms
  duration_ms=$(sqlite3 "$OBS_DB" "
    UPDATE traces
       SET ended_at = datetime('now'),
           duration_ms = CAST((julianday('now') - julianday(started_at)) * 86400000 AS INTEGER),
           status = '$safe_status'
     WHERE trace_id = '$safe_id';
    SELECT COALESCE(duration_ms, 0) FROM traces WHERE trace_id = '$safe_id';
  ") || return

  if [[ -z "$duration_ms" ]]; then
    echo "ERROR: unknown trace: $trace_id" >&2
    return 1
  fi

  echo -e "${GREEN}Trace completed: $trace_id (${duration_ms}ms)${RESET}"
}

# Start span
#
# Insert a new row into `spans` and print its generated id.
# Arguments:
#   $1 - trace id the span belongs to (required)
#   $2 - span name (required)
#   $3 - service name (default: unknown)
#   $4 - parent span id (default: none)
#   $5 - node name (default: output of hostname)
# Globals: OBS_DB (read)
# Outputs: the new span id on stdout
span_start() {
  local trace_id name service parent node
  trace_id=$(_sql_escape "${1:?trace_id required}")
  name=$(_sql_escape "${2:?span name required}")
  service=$(_sql_escape "${3:-unknown}")
  parent=$(_sql_escape "${4:-}")
  node=$(_sql_escape "${5:-$(hostname)}")

  # Random suffix keeps ids unique when two spans start within the same
  # timestamp. Fall back to $RANDOM when openssl is unavailable so the
  # suffix is never empty.
  local suffix
  suffix=$(openssl rand -hex 4 2>/dev/null) || suffix=""
  [[ -n "$suffix" ]] || suffix=$(printf '%04x%04x' "$RANDOM" "$RANDOM")

  local span_id
  span_id="span_$(_timestamp_ns)_${suffix}"

  # NULLIF stores a missing parent as NULL rather than an empty string, so
  # root spans are distinguishable with "parent_span_id IS NULL".
  sqlite3 "$OBS_DB" "
    INSERT INTO spans (span_id, trace_id, parent_span_id, name, service, node)
    VALUES ('$span_id', '$trace_id', NULLIF('$parent', ''), '$name', '$service', '$node')
  "

  echo "$span_id"
}

# End span
#
# Mark a span as finished: set ended_at, compute duration_ms, and store the
# final status and tags.
# Arguments:
#   $1 - span id (required)
#   $2 - final status (default: success)
#   $3 - tags text, stored as given (default: {})
# Globals: OBS_DB (read)
span_end() {
  # The default is applied in two steps on purpose: "${3:-{}}" is parsed by
  # bash as "${3:-{}" followed by a literal "}", which appends a stray brace
  # whenever tags ARE supplied.
  local raw_tags="${3:-}"
  [[ -n "$raw_tags" ]] || raw_tags='{}'

  local span_id status tags
  span_id=$(_sql_escape "${1:?span_id required}")
  status=$(_sql_escape "${2:-success}")
  tags=$(_sql_escape "$raw_tags")

  # Duration is computed by SQLite from the stored started_at.
  sqlite3 "$OBS_DB" "
    UPDATE spans
       SET ended_at = datetime('now'),
           duration_ms = CAST((julianday('now') - julianday(started_at)) * 86400000 AS INTEGER),
           status = '$status',
           tags = '$tags'
     WHERE span_id = '$span_id'
  "
}

# Add span log
#
# Attach a log line to a span. The log row copies trace_id, node and service
# from the span's own row.
# Arguments:
#   $1 - span id (required)
#   $2 - message (required)
#   $3 - level (default: info)
# Globals: OBS_DB (read)
# Outputs: error on stderr when the span does not exist
# Returns: 0 on success, 1 when no span has the given id
span_log() {
  local span_id message level
  span_id=$(_sql_escape "${1:?span_id required}")
  message=$(_sql_escape "${2:?message required}")
  level=$(_sql_escape "${3:-info}")

  # INSERT ... SELECT inserts nothing when the span id is unknown, so the
  # row count is read back with changes() in the same session and an
  # unknown span is reported instead of being silently dropped.
  local inserted
  inserted=$(sqlite3 "$OBS_DB" "
    INSERT INTO logs (trace_id, span_id, level, message, node, service)
    SELECT trace_id, '$span_id', '$level', '$message', node, service
      FROM spans WHERE span_id = '$span_id';
    SELECT changes();
  ") || return

  if [[ "$inserted" == 0 ]]; then
    echo "ERROR: unknown span: $1" >&2
    return 1
  fi
}

# Record metric
#
# Insert one sample into `metrics`.
# Arguments:
#   $1 - metric name (required)
#   $2 - numeric value (required; optional sign, digits, optional fraction)
#   $3 - tags text, stored as given (default: {})
#   $4 - node name (default: output of hostname)
# Globals: OBS_DB (read)
# Returns: 0 on success, 1 when the value is not numeric
metric() {
  local raw_name="${1:?metric name required}"
  local value="${2:?value required}"

  # Validate before anything else: the value is placed into the SQL
  # statement unquoted, so only a plain number may pass.
  if [[ ! "$value" =~ ^-?[0-9]*\.?[0-9]+$ ]]; then
    echo "ERROR: value must be numeric" >&2
    return 1
  fi

  # The default is applied in two steps on purpose: "${3:-{}}" is parsed by
  # bash as "${3:-{}" followed by a literal "}", which appends a stray brace
  # whenever tags ARE supplied.
  local raw_tags="${3:-}"
  [[ -n "$raw_tags" ]] || raw_tags='{}'

  local name tags node
  name=$(_sql_escape "$raw_name")
  tags=$(_sql_escape "$raw_tags")
  node=$(_sql_escape "${4:-$(hostname)}")

  sqlite3 "$OBS_DB" "
    INSERT INTO metrics (name, value, tags, node)
    VALUES ('$name', $value, '$tags', '$node')
  "
}

# View trace
#
# Print one trace: its row from `traces` (sqlite3 -line format), then every
# span of the trace ordered by start time, each followed by the log lines
# recorded for that span.
# Arguments: $1 - trace id (required)
# Globals:   OBS_DB, PINK, GREEN, RED, YELLOW, RESET (read)
# Outputs:   the formatted trace on stdout
view_trace() {
  local trace_id="${1:?trace_id required}"

  # The id comes from the command line; escape it before it goes into SQL.
  local safe_id
  safe_id=$(_sql_escape "$trace_id")

  printf '%b=== TRACE: %s ===%b\n' "$PINK" "$trace_id" "$RESET"
  echo

  # Trace info
  sqlite3 "$OBS_DB" -line "SELECT * FROM traces WHERE trace_id = '$safe_id'"

  echo
  echo "Spans:"
  echo

  local span_id parent name service node duration status
  local indent status_color safe_span level msg log_color

  # Build span tree (children are shown with one level of indentation)
  sqlite3 "$OBS_DB" "
    SELECT span_id, parent_span_id, name, service, node, duration_ms, status
      FROM spans WHERE trace_id = '$safe_id'
     ORDER BY started_at
  " | while IFS='|' read -r span_id parent name service node duration status; do
    indent=""
    if [[ -n "$parent" ]]; then
      indent=" "
    fi

    case "$status" in
      error)       status_color=$RED ;;
      in_progress) status_color=$YELLOW ;;
      *)           status_color=$GREEN ;;
    esac

    # Data is passed as arguments, never as part of the format string, so a
    # "%" or backslash in a span name cannot corrupt the output. An empty
    # duration (span still running) is printed as 0 by %d.
    printf '%s├── %-20s %-10s %-10s %b%dms%b\n' \
      "$indent" "$name" "$service" "$node" "$status_color" "$duration" "$RESET"

    # Show span logs. stdin is redirected so this sqlite3 can never consume
    # the span rows the outer loop is still reading from the pipe.
    safe_span=$(_sql_escape "$span_id")
    sqlite3 "$OBS_DB" "
      SELECT level, message FROM logs WHERE span_id = '$safe_span'
    " </dev/null | while IFS='|' read -r level msg; do
      case "$level" in
        error) log_color=$RED ;;
        warn)  log_color=$YELLOW ;;
        *)     log_color=$RESET ;;
      esac

      printf '%s│ %b[%s] %s%b\n' "$indent" "$log_color" "$level" "$msg" "$RESET"
    done
  done
}

# List traces
#
# Print the most recent traces, newest first.
# Arguments:
#   $1 - maximum number of rows (default: 20; must be a non-negative integer)
#   $2 - optional substring matched against trace name or service with LIKE
#        (so "%" and "_" in the filter act as SQL wildcards)
# Globals: OBS_DB, PINK, GREEN, RED, YELLOW, RESET (read)
# Returns: 1 when the limit is not a non-negative integer
list_traces() {
  local limit="${1:-20}"
  local filter="${2:-}"

  # Both arguments end up inside the SQL text: the limit must be digits only
  # and the filter is quote-escaped.
  if [[ ! "$limit" =~ ^[0-9]+$ ]]; then
    echo "ERROR: limit must be a non-negative integer" >&2
    return 1
  fi

  local where=""
  if [[ -n "$filter" ]]; then
    local safe_filter
    safe_filter=$(_sql_escape "$filter")
    where="WHERE name LIKE '%$safe_filter%' OR service LIKE '%$safe_filter%'"
  fi

  echo -e "${PINK}=== TRACES ===${RESET}"
  echo

  local trace_id name service duration status started status_color
  sqlite3 "$OBS_DB" "
    SELECT trace_id, name, service, duration_ms, status, started_at
      FROM traces $where
     ORDER BY started_at DESC LIMIT $limit
  " | while IFS='|' read -r trace_id name service duration status started; do
    case "$status" in
      error)       status_color=$RED ;;
      in_progress) status_color=$YELLOW ;;
      *)           status_color=$GREEN ;;
    esac

    # Colours are passed through %b so trace data never becomes part of the
    # format string. An empty duration (trace still running) prints as 0.
    printf ' %-30s %-15s %-10s %b%dms%b %s\n' \
      "$trace_id" "$name" "$service" "$status_color" "$duration" "$RESET" "$started"
  done
}

# Search logs
#
# Print log rows whose message contains the query, newest first.
# Arguments:
#   $1 - substring to look for, matched with LIKE (so "%" and "_" in the
#        query act as SQL wildcards; an empty query matches every row)
#   $2 - maximum number of rows (default: 50; must be a non-negative integer)
# Globals: OBS_DB, PINK, RED, YELLOW, RESET (read)
# Returns: 1 when the limit is not a non-negative integer
search_logs() {
  local query="${1:-}"
  local limit="${2:-50}"

  # Both arguments end up inside the SQL text: the limit must be digits only
  # and the query is quote-escaped.
  if [[ ! "$limit" =~ ^[0-9]+$ ]]; then
    echo "ERROR: limit must be a non-negative integer" >&2
    return 1
  fi
  local safe_query
  safe_query=$(_sql_escape "$query")

  printf '%b=== LOG SEARCH: %s ===%b\n' "$PINK" "$query" "$RESET"
  echo

  local ts level service node msg color
  sqlite3 "$OBS_DB" "
    SELECT timestamp, level, service, node, message
      FROM logs
     WHERE message LIKE '%$safe_query%'
     ORDER BY timestamp DESC LIMIT $limit
  " | while IFS='|' read -r ts level service node msg; do
    case "$level" in
      error) color=$RED ;;
      warn)  color=$YELLOW ;;
      *)     color=$RESET ;;
    esac

    # %s keeps backslashes in a stored message from being interpreted.
    printf '%b[%s] [%s] %s@%s: %s%b\n' \
      "$color" "$ts" "$level" "$service" "$node" "$msg" "$RESET"
  done
}

# Metrics summary
#
# Print average, minimum, maximum and sample count per (metric name, node)
# for samples newer than the given period.
# Arguments:
#   $1 - look-back period written as "<number> <unit>", unit being one of
#        second/minute/hour/day/month/year with an optional trailing "s"
#        (default: "1 hour")
# Globals: OBS_DB, PINK, RESET (read)
# Returns: 1 when the period does not have the expected form
metrics_summary() {
  local period="${1:-1 hour}"

  # The period is pasted into a SQLite date modifier ('+<period>'), so only
  # the "<number> <unit>" shape is accepted. The pattern lives in a variable
  # because a literal space cannot appear unquoted on the right of =~.
  local period_re='^[0-9]+(\.[0-9]+)? (second|minute|hour|day|month|year)s?$'
  if [[ ! "$period" =~ $period_re ]]; then
    echo "ERROR: period must look like '1 hour' or '30 minutes'" >&2
    return 1
  fi

  echo -e "${PINK}=== METRICS SUMMARY (last $period) ===${RESET}"
  echo

  local name node avg min max count
  sqlite3 "$OBS_DB" "
    SELECT name, node, AVG(value), MIN(value), MAX(value), COUNT(*)
      FROM metrics
     WHERE datetime(timestamp, '+$period') > datetime('now')
     GROUP BY name, node
     ORDER BY name, node
  " | while IFS='|' read -r name node avg min max count; do
    printf ' %-20s %-10s avg:%.2f min:%.2f max:%.2f (%d samples)\n' \
      "$name" "$node" "$avg" "$min" "$max" "$count"
  done
}

# Service map
#
# Print two sections:
#   1. caller -> callee service pairs, derived from spans whose parent span
#      belongs to a different service
#   2. per-service span count, average duration and error percentage for
#      spans started within the last hour
# Globals: OBS_DB, PINK, RESET (read)
service_map() {
  echo -e "${PINK}=== SERVICE MAP ===${RESET}"
  echo

  echo "Services and their dependencies:"
  echo

  local from to
  sqlite3 "$OBS_DB" "
    SELECT DISTINCT s1.service, s2.service
      FROM spans s1
      JOIN spans s2 ON s1.span_id = s2.parent_span_id
     WHERE s1.service != s2.service
  " | while IFS='|' read -r from to; do
    printf ' %s -> %s\n' "$from" "$to"
  done

  echo
  echo "Service stats (last hour):"

  local service count avg_lat error_rate
  sqlite3 "$OBS_DB" "
    SELECT service, COUNT(*), AVG(duration_ms),
           SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) * 100.0 / COUNT(*)
      FROM spans
     WHERE datetime(started_at, '+1 hour') > datetime('now')
     GROUP BY service
  " | while IFS='|' read -r service count avg_lat error_rate; do
    # avg_lat is empty when no span of the service has finished yet; printf
    # renders an empty numeric argument as 0.
    printf ' %-20s spans:%d avg:%.0fms err:%.1f%%\n' \
      "$service" "$count" "$avg_lat" "$error_rate"
  done
}

# Error analysis
#
# Print the most recent error-level log lines with their trace and span
# context, followed by the error rate per service for spans started within
# the last hour (only services with at least one failed span are listed).
# Arguments:
#   $1 - maximum number of recent errors (default: 20; non-negative integer)
# Globals: OBS_DB, PINK, RED, RESET (read)
# Returns: 1 when the limit is not a non-negative integer
errors() {
  local limit="${1:-20}"

  # The limit is placed into the SQL text, so it must be digits only.
  if [[ ! "$limit" =~ ^[0-9]+$ ]]; then
    echo "ERROR: limit must be a non-negative integer" >&2
    return 1
  fi

  echo -e "${PINK}=== ERROR ANALYSIS ===${RESET}"
  echo

  echo "Recent errors:"
  local trace name service node msg ts
  sqlite3 "$OBS_DB" "
    SELECT t.trace_id, t.name, s.service, s.node, l.message, l.timestamp
      FROM logs l
      JOIN spans s ON l.span_id = s.span_id
      JOIN traces t ON l.trace_id = t.trace_id
     WHERE l.level = 'error'
     ORDER BY l.timestamp DESC LIMIT $limit
  " | while IFS='|' read -r trace name service node msg ts; do
    printf '%b[%s] %s@%s%b\n' "$RED" "$ts" "$service" "$node" "$RESET"
    printf ' Trace: %s (%s)\n' "$trace" "$name"
    printf ' Error: %s\n' "$msg"
    echo
  done

  echo "Error rates by service:"
  # The percentage is computed by SQLite in the same query, which removes
  # the per-row call to the external bc program.
  local total failed rate
  sqlite3 "$OBS_DB" "
    SELECT service, COUNT(*) AS total,
           SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) AS errors,
           SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) * 100.0 / COUNT(*)
      FROM spans
     WHERE datetime(started_at, '+1 hour') > datetime('now')
     GROUP BY service
    HAVING errors > 0
     ORDER BY errors DESC
  " | while IFS='|' read -r service total failed rate; do
    printf ' %-20s %d/%d (%.1f%%)\n' "$service" "$failed" "$total" "$rate"
  done
}

# Dashboard
#
# Print a one-screen overview for the last hour: trace count, failed-trace
# count and average trace duration; the five services with the most spans;
# and the three most recent error-level log lines.
# Globals: OBS_DB, PINK, RED, RESET (read)
dashboard() {
  # Only clear when writing to a terminal, and never let a failing clear
  # (for example TERM unset) abort the command under "set -e".
  if [[ -t 1 ]]; then
    clear || true
  fi

  echo -e "${PINK}╔══════════════════════════════════════════════════════════════╗${RESET}"
  echo -e "${PINK}║ 👁️ OBSERVABILITY DASHBOARD ║${RESET}"
  echo -e "${PINK}╚══════════════════════════════════════════════════════════════╝${RESET}"
  echo

  # One query for all three headline numbers. Declaration is separate from
  # assignment so a failing sqlite3 is not masked by "local". COALESCE turns
  # the NULLs produced by an empty window into 0.
  local stats total_traces error_traces avg_duration
  stats=$(sqlite3 "$OBS_DB" "
    SELECT COUNT(*),
           COALESCE(SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END), 0),
           COALESCE(AVG(duration_ms), 0)
      FROM traces
     WHERE datetime(started_at, '+1 hour') > datetime('now')
  ") || return
  IFS='|' read -r total_traces error_traces avg_duration <<< "$stats"

  echo "Last Hour:"
  printf ' Traces: %d | Errors: %d | Avg Duration: %.0fms\n' \
    "$total_traces" "$error_traces" "${avg_duration:-0}"
  echo

  echo "─────────────────────────────────────────────────────────────────"
  echo "Active Services:"
  local service count avg
  sqlite3 "$OBS_DB" "
    SELECT service, COUNT(*), AVG(duration_ms)
      FROM spans WHERE datetime(started_at, '+1 hour') > datetime('now')
     GROUP BY service ORDER BY COUNT(*) DESC LIMIT 5
  " | while IFS='|' read -r service count avg; do
    printf ' %-20s %d spans (avg: %.0fms)\n' "$service" "$count" "$avg"
  done

  echo
  echo "─────────────────────────────────────────────────────────────────"
  echo "Recent Errors:"
  local msg ts
  sqlite3 "$OBS_DB" "
    SELECT service, message, timestamp FROM logs
     WHERE level = 'error' ORDER BY timestamp DESC LIMIT 3
  " | while IFS='|' read -r service msg ts; do
    printf ' %b%s: %s%b\n' "$RED" "$service" "$msg" "$RESET"
  done
}

# Clean old data
#
# Delete logs, spans, traces and metrics older than the given number of days.
# Arguments:
#   $1 - age threshold in days (default: 7; must be a non-negative integer)
# Globals: OBS_DB, GREEN, RESET (read)
# Outputs: one confirmation line on stdout
# Returns: 1 when the argument is not a non-negative integer
cleanup() {
  local days="${1:-7}"

  # The value is pasted into SQLite date modifiers of DELETE statements, so
  # anything other than plain digits is rejected before touching the data.
  if [[ ! "$days" =~ ^[0-9]+$ ]]; then
    echo "ERROR: days must be a non-negative integer" >&2
    return 1
  fi

  # A single transaction: either all four tables are pruned or none is.
  sqlite3 "$OBS_DB" "
    BEGIN;
    DELETE FROM logs WHERE datetime(timestamp, '+$days days') < datetime('now');
    DELETE FROM spans WHERE datetime(started_at, '+$days days') < datetime('now');
    DELETE FROM traces WHERE datetime(started_at, '+$days days') < datetime('now');
    DELETE FROM metrics WHERE datetime(timestamp, '+$days days') < datetime('now');
    COMMIT;
  "

  echo -e "${GREEN}Cleaned data older than $days days${RESET}"
}

# Help
#
# Print the command reference and a short usage example to stdout.
# Globals: PINK, RESET (read); $0 is shown in the example commands.
help() {
  echo -e "${PINK}BlackRoad Observability${RESET}"

  # Plain lines are emitted by one printf; each argument is one output line.
  printf '%s\n' \
    "" \
    "Distributed tracing and observability" \
    "" \
    "Tracing:" \
    " trace-start <name> [service] Start trace" \
    " trace-end <trace_id> [status] End trace" \
    " span-start <trace> <name> [svc] Start span" \
    " span-end <span_id> [status] End span" \
    " span-log <span_id> <msg> [level] Add log" \
    " view <trace_id> View trace" \
    " list [limit] [filter] List traces" \
    "" \
    "Metrics & Logs:" \
    " metric <name> <value> [tags] Record metric" \
    " search <query> [limit] Search logs" \
    " metrics [period] Metrics summary" \
    "" \
    "Analysis:" \
    " service-map Service dependencies" \
    " errors [limit] Error analysis" \
    " dashboard Overview dashboard" \
    " cleanup [days] Clean old data" \
    "" \
    "Examples:" \
    " trace=\$($0 trace-start 'inference' 'api')" \
    " span=\$($0 span-start \$trace 'generate' 'llm')" \
    " $0 span-log \$span 'Processing request'" \
    " $0 span-end \$span" \
    " $0 trace-end \$trace"
}

# Command dispatcher
#
# Map the first command-line word to its function and pass the remaining
# words through unchanged. With no arguments the help text is shown.
# Arguments: $1 - sub-command; $2.. - arguments of that sub-command
# Returns:   the status of the sub-command; 1 for an unknown sub-command
main() {
  local cmd="${1:-help}"
  if (( $# > 0 )); then
    shift
  fi

  # help and init are handled before the auto-initialisation below: help
  # needs no database, and an explicit init must not run the schema twice.
  case "$cmd" in
    help|-h|--help)
      help
      return 0
      ;;
    init)
      init
      return
      ;;
  esac

  # Ensure initialized
  [[ -f "$OBS_DB" ]] || init >/dev/null

  case "$cmd" in
    trace-start)
      trace_start "$@"
      ;;
    trace-end)
      trace_end "$@"
      ;;
    span-start)
      span_start "$@"
      ;;
    span-end)
      span_end "$@"
      ;;
    span-log|log)
      span_log "$@"
      ;;
    view)
      view_trace "$@"
      ;;
    list|traces)
      list_traces "$@"
      ;;
    metric)
      metric "$@"
      ;;
    search)
      search_logs "$@"
      ;;
    metrics)
      metrics_summary "$@"
      ;;
    service-map|map)
      service_map
      ;;
    errors)
      errors "$@"
      ;;
    dashboard|dash)
      dashboard
      ;;
    cleanup)
      cleanup "$@"
      ;;
    *)
      # A mistyped command is reported on stderr with a non-zero status so
      # calling scripts notice it, instead of printing help and exiting 0.
      echo "ERROR: unknown command: $cmd" >&2
      help >&2
      return 1
      ;;
  esac
}

main "$@"