#!/usr/bin/env bash
# ============================================================================
# BLACKROAD OS, INC. - PROPRIETARY AND CONFIDENTIAL
# Copyright (c) 2025-2026 BlackRoad OS, Inc. All Rights Reserved.
# ============================================================================
# br-infer - Production AI inference across the fleet
# Smart routing with health checks, retries, and metrics
# Usage: br-infer [options] "prompt"

set -eo pipefail

# Optional fleet configuration. This script reads NODE_IP[<node>] and the color
# variables (PINK, AMBER, GREEN, RED, BLUE, RESET) but never defines them;
# presumably they come from this file -- confirm against nodes.sh.
source "$HOME/.blackroad/config/nodes.sh" 2>/dev/null || true

INFER_DIR="$HOME/.blackroad/inference"
INFER_DB="$INFER_DIR/inference.db"
CACHE_DIR="$INFER_DIR/cache"

OLLAMA_NODES=(cecilia lucidia alice)
DEFAULT_MODEL="${BR_DEFAULT_MODEL:-llama3.2}"
MAX_RETRIES=2
TIMEOUT_S=120

#######################################
# Run SQL against the inference database.
# Arguments: passed through to sqlite3 (SQL text and/or options); SQL may
#            also be supplied on stdin.
# Outputs:   query results on stdout; sqlite3 diagnostics are discarded.
# Returns:   sqlite3's exit status.
#######################################
_sql() {
  # busy_timeout is a per-connection setting, so it is applied on every
  # invocation instead of once at schema creation.
  sqlite3 -cmd ".timeout 5000" "$INFER_DB" "$@" 2>/dev/null
}

#######################################
# Escape a value for use inside a single-quoted SQL string literal.
# Arguments: $1 - raw text
# Outputs:   text with every ' doubled
#######################################
_sql_escape() {
  local quote="'"
  # printf instead of echo so values such as "-n" or "-e" survive intact.
  printf '%s\n' "${1//$quote/$quote$quote}"
}

#######################################
# Print the current time in milliseconds since the epoch.
# Tries `date +%s%3N`, then python3, then falls back to whole seconds * 1000.
# Outputs:   integer milliseconds
#######################################
_now_ms() {
  local now
  now=$(date +%s%3N 2>/dev/null) || now=""
  # A date without %N support can exit 0 and still print non-digits, so the
  # value itself is validated rather than the exit status.
  if [[ ! "$now" =~ ^[0-9]+$ ]]; then
    now=$(python3 -c 'import time; print(int(time.time()*1000))' 2>/dev/null) || now=""
  fi
  if [[ ! "$now" =~ ^[0-9]+$ ]]; then
    now=$(($(date +%s) * 1000))
  fi
  printf '%s\n' "$now"
}

#######################################
# Test whether a value appears in a list.
# Arguments: $1 - needle; remaining args - list items
# Returns:   0 if found, 1 otherwise
#######################################
_in_list() {
  local needle="$1" item
  shift
  for item in "$@"; do
    [[ "$item" == "$needle" ]] && return 0
  done
  return 1
}

#######################################
# Create the inference database schema (idempotent).
# Outputs:   nothing; PRAGMA result rows are discarded so they cannot leak
#            into a command's stdout.
#######################################
init_db() {
  _sql >/dev/null <<'SQL'
PRAGMA journal_mode=WAL;
CREATE TABLE IF NOT EXISTS inferences (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
  node TEXT,
  model TEXT,
  prompt_len INTEGER,
  response_len INTEGER,
  latency_ms INTEGER,
  status TEXT,
  cached INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS node_health (
  node TEXT PRIMARY KEY,
  last_check DATETIME,
  status TEXT DEFAULT 'unknown',
  avg_latency_ms REAL DEFAULT 0,
  success_count INTEGER DEFAULT 0,
  fail_count INTEGER DEFAULT 0,
  models TEXT
);
CREATE INDEX IF NOT EXISTS idx_infer_ts ON inferences(timestamp);
CREATE INDEX IF NOT EXISTS idx_infer_node ON inferences(node);
SQL
}

#######################################
# Make sure the data directories and database exist.
# Called only for commands that do real work, so help/usage has no side
# effects and does not need sqlite3.
#######################################
_ensure_storage() {
  mkdir -p "$INFER_DIR" "$CACHE_DIR"
  [[ -f "$INFER_DB" ]] || init_db
}

#######################################
# Health check all nodes, update DB.
# Probes /api/tags on every node in OLLAMA_NODES that has a NODE_IP entry,
# prints one table row per node, and records the result in node_health.
# Also reports whether a local server answers on localhost:11434.
# Outputs:   status table on stdout
#######################################
cmd_health() {
  printf '%bChecking fleet inference health...%b\n\n' "$AMBER" "$RESET"
  printf ' %-12s %-8s %-10s %-8s %s\n' "NODE" "STATUS" "LATENCY" "MODELS" "ENDPOINT"
  printf ' %-12s %-8s %-10s %-8s %s\n' "────" "──────" "───────" "──────" "────────"

  local node ip start_ms end_ms lat tags count model_list
  for node in "${OLLAMA_NODES[@]}"; do
    ip="${NODE_IP[$node]:-}"
    [[ -n "$ip" ]] || continue

    printf ' %-12s ' "$node"
    start_ms=$(_now_ms)
    # A down node makes curl fail; that must not trip `set -e`, otherwise the
    # DOWN row below would never be printed.
    tags=$(curl -sf --connect-timeout 3 --max-time 5 "http://${ip}:11434/api/tags" 2>/dev/null) || tags=""
    end_ms=$(_now_ms)
    lat=$((end_ms - start_ms))

    if [[ -n "$tags" ]]; then
      count=$(printf '%s' "$tags" | jq '.models | length' 2>/dev/null || echo "0")
      model_list=$(printf '%s' "$tags" | jq -r '[.models[].name] | join(",")' 2>/dev/null || echo "")
      printf '%b%-8s%b %-10s %-8s %s\n' "$GREEN" "UP" "$RESET" "${lat}ms" "$count" "http://${ip}:11434"
      # INSERT OR IGNORE + UPDATE (rather than INSERT OR REPLACE) keeps the
      # success/fail counters that do_infer accumulates on this row.
      _sql "INSERT OR IGNORE INTO node_health (node) VALUES ('$node');
            UPDATE node_health
               SET last_check = datetime('now'), status = 'up',
                   avg_latency_ms = $lat, models = '$(_sql_escape "$model_list")'
             WHERE node = '$node'" || true
    else
      printf '%b%-8s%b %-10s %-8s %s\n' "$RED" "DOWN" "$RESET" "—" "—" "http://${ip}:11434"
      _sql "INSERT OR IGNORE INTO node_health (node) VALUES ('$node');
            UPDATE node_health
               SET last_check = datetime('now'), status = 'down'
             WHERE node = '$node'" || true
    fi
  done

  # Also check localhost
  printf ' %-12s ' "localhost"
  if curl -sf --connect-timeout 1 "http://localhost:11434/api/tags" &>/dev/null; then
    printf '%b%-8s%b\n' "$GREEN" "UP" "$RESET"
  else
    printf '%b%-8s%b\n' "$AMBER" "N/A" "$RESET"
  fi
  echo
}

#######################################
# Find best available node (least recorded latency, currently reachable).
# A reachable local server always wins; otherwise each fleet node is probed
# and the one with the lowest avg_latency_ms in node_health is chosen
# (500 is assumed when no 'up' record exists).
# Arguments: $1  - model name (accepted for interface compatibility; it does
#                  not influence routing)
#            $2… - optional node names to exclude, e.g. nodes already tried
# Outputs:   "<node>:<ip>" on stdout
# Returns:   0 if a node was found, 1 otherwise
#######################################
find_best_node() {
  if (($# > 0)); then shift; fi
  local -a excluded=("$@")

  # Check local first
  if ! _in_list "localhost" "${excluded[@]}" \
    && curl -sf --connect-timeout 1 "http://localhost:11434/api/tags" &>/dev/null; then
    echo "localhost:localhost"
    return 0
  fi

  local best_node="" best_ip="" best_lat=999999
  local node ip cached_lat
  for node in "${OLLAMA_NODES[@]}"; do
    if _in_list "$node" "${excluded[@]}"; then continue; fi
    ip="${NODE_IP[$node]:-}"
    [[ -n "$ip" ]] || continue

    # Quick reachability check
    curl -sf --connect-timeout 2 "http://${ip}:11434/api/tags" &>/dev/null || continue

    cached_lat=$(_sql "SELECT avg_latency_ms FROM node_health WHERE node='$node' AND status='up'") || cached_lat=""
    cached_lat="${cached_lat:-500}"
    cached_lat="${cached_lat%.*}" # avg_latency_ms is REAL; compare the integer part
    if [[ "$cached_lat" -lt "$best_lat" ]]; then
      best_lat="$cached_lat"
      best_node="$node"
      best_ip="$ip"
    fi
  done

  if [[ -n "$best_node" ]]; then
    echo "${best_node}:${best_ip}"
    return 0
  fi
  return 1
}

#######################################
# Core inference: send one non-streaming /api/generate request to a node and
# record the outcome in the inferences and node_health tables.
# Arguments: $1 - model name
#            $2 - prompt text
#            $3 - "<node>:<ip>" as produced by find_best_node
# Outputs:   the response text on stdout
# Returns:   0 when a non-empty response was received, 1 otherwise
#######################################
do_infer() {
  local model="$1"
  local prompt="$2"
  local node_info="$3"
  local node="${node_info%%:*}"
  local ip="${node_info#*:}"
  local host="http://${ip}:11434"

  local payload response text start_ms end_ms lat
  # jq builds the body so the prompt is always correctly JSON-escaped.
  payload=$(jq -n --arg m "$model" --arg p "$prompt" '{model: $m, prompt: $p, stream: false}') || return 1

  start_ms=$(_now_ms)
  response=$(curl -sf --max-time "$TIMEOUT_S" "${host}/api/generate" -d "$payload" 2>/dev/null) || response=""
  end_ms=$(_now_ms)
  lat=$((end_ms - start_ms))

  text=$(printf '%s' "$response" | jq -r '.response // empty' 2>/dev/null) || text=""

  # Metrics are best-effort: a failed write must not change the result.
  if [[ -n "$text" ]]; then
    _sql "INSERT INTO inferences (node, model, prompt_len, response_len, latency_ms, status) VALUES ('$node', '$(_sql_escape "$model")', ${#prompt}, ${#text}, $lat, 'success')" || true
    # Running mean: both right-hand sides see the pre-update success_count.
    _sql "UPDATE node_health SET success_count = success_count + 1, avg_latency_ms = (avg_latency_ms * success_count + $lat) / (success_count + 1) WHERE node='$node'" || true
    printf '%s\n' "$text"
    return 0
  fi

  _sql "INSERT INTO inferences (node, model, prompt_len, response_len, latency_ms, status) VALUES ('$node', '$(_sql_escape "$model")', ${#prompt}, 0, $lat, 'fail')" || true
  _sql "UPDATE node_health SET fail_count = fail_count + 1 WHERE node='$node'" || true
  return 1
}

#######################################
# Main query with retries and fallback.
# Makes up to MAX_RETRIES + 1 attempts, each on a node that has not been
# tried yet.
# Arguments: [-m|--model MODEL] [-s|--system TEXT] [-j|--json] prompt...
#            The prompt is read from stdin when none is given and stdin is
#            not a terminal.
# Outputs:   response text (or a JSON object with -j) on stdout; progress
#            and errors on stderr
# Returns:   0 on success, 1 on usage error or when every attempt failed
#######################################
cmd_query() {
  local model="$DEFAULT_MODEL"
  local system_prompt=""
  local json_output=false

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -m | --model)
        [[ $# -ge 2 ]] || {
          echo "br-infer: option $1 requires a value" >&2
          return 1
        }
        model="$2"
        shift 2
        ;;
      -s | --system)
        [[ $# -ge 2 ]] || {
          echo "br-infer: option $1 requires a value" >&2
          return 1
        }
        system_prompt="$2"
        shift 2
        ;;
      -j | --json)
        json_output=true
        shift
        ;;
      *) break ;;
    esac
  done

  local prompt="$*"

  # Read from stdin if no prompt
  if [[ -z "$prompt" && ! -t 0 ]]; then
    prompt=$(cat)
  fi
  if [[ -z "$prompt" ]]; then
    echo "Usage: br-infer \"prompt\"" >&2
    return 1
  fi

  # Prepend system prompt if given. $'\n\n' yields real newlines; "\n\n"
  # inside double quotes would send a literal backslash-n to the model.
  if [[ -n "$system_prompt" ]]; then
    prompt="${system_prompt}"$'\n\n'"${prompt}"
  fi

  local attempt=0
  local -a tried_nodes=()
  local node_info node result

  while ((attempt <= MAX_RETRIES)); do
    # Excluding nodes already tried is what makes a retry an actual fallback.
    if ! node_info=$(find_best_node "$model" "${tried_nodes[@]}"); then
      if ((${#tried_nodes[@]} == 0)); then
        printf '%bNo inference nodes available%b\n' "$RED" "$RESET" >&2
        return 1
      fi
      break # every reachable node has been tried
    fi

    node="${node_info%%:*}"
    tried_nodes+=("$node")

    if ((attempt > 0)); then
      printf '%bRetrying on %s...%b\n' "$AMBER" "$node" "$RESET" >&2
    fi
    printf '%b→ %s%b (model: %s)\n' "$PINK" "$node" "$RESET" "$model" >&2

    if result=$(do_infer "$model" "$prompt" "$node_info"); then
      if $json_output; then
        jq -n --arg node "$node" --arg model "$model" --arg response "$result" \
          '{node: $node, model: $model, response: $response}'
      else
        printf '%s\n' "$result"
      fi
      return 0
    fi

    # Not ((attempt++)): that returns status 1 when attempt is 0, which
    # aborts the script under `set -e`.
    attempt=$((attempt + 1))
  done

  printf '%bAll inference attempts failed%b\n' "$RED" "$RESET" >&2
  return 1
}

#######################################
# Batch inference: run every prompt in a file through cmd_query.
# Blank lines and lines starting with '#' are skipped.
# Arguments: [-m|--model MODEL] <file>
# Outputs:   one "---/PROMPT/RESPONSE" record per success on stdout;
#            progress and a summary on stderr
# Returns:   1 on usage error, 0 otherwise
#######################################
cmd_batch() {
  local model="$DEFAULT_MODEL"
  local input_file=""

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -m | --model)
        [[ $# -ge 2 ]] || {
          echo "br-infer: option $1 requires a value" >&2
          return 1
        }
        model="$2"
        shift 2
        ;;
      *)
        input_file="$1"
        shift
        ;;
    esac
  done

  if [[ -z "$input_file" || ! -f "$input_file" ]]; then
    echo "Usage: br-infer batch [-m model] <file>" >&2
    echo " File should contain one prompt per line" >&2
    return 1
  fi

  local total=0 success=0 fail=0
  local line result
  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    [[ -z "$line" || "$line" == "#"* ]] && continue
    # Plain arithmetic assignment: ((total++)) returns 1 when total is 0 and
    # would abort the script under `set -e`.
    total=$((total + 1))
    printf '%b[%d] %b' "$AMBER" "$total" "$RESET" >&2
    if result=$(cmd_query -m "$model" "$line" 2>/dev/null); then
      success=$((success + 1))
      printf '%bOK%b\n' "$GREEN" "$RESET" >&2
      echo "---"
      echo "PROMPT: $line"
      echo "RESPONSE: $result"
      echo ""
    else
      fail=$((fail + 1))
      printf '%bFAIL%b\n' "$RED" "$RESET" >&2
    fi
  done <"$input_file"

  printf '\n%bBatch complete: %d/%d succeeded%b\n' "$PINK" "$success" "$total" "$RESET" >&2
}

#######################################
# Print aggregate statistics from the inferences table: totals, success
# rate, average latency, per-node and per-model breakdowns, last 10 requests.
# Outputs:   report on stdout
#######################################
cmd_stats() {
  printf '%b╔══════════════════════════════════════════╗%b\n' "$PINK" "$RESET"
  printf '%b║ Inference Statistics ║%b\n' "$PINK" "$RESET"
  printf '%b╚══════════════════════════════════════════╝%b\n\n' "$PINK" "$RESET"

  local total success avg_lat total_chars
  total=$(_sql "SELECT COUNT(*) FROM inferences") || total=""
  success=$(_sql "SELECT COUNT(*) FROM inferences WHERE status='success'") || success=""
  avg_lat=$(_sql "SELECT CAST(AVG(latency_ms) AS INTEGER) FROM inferences WHERE status='success'") || avg_lat=""
  # response_len stores ${#text}, i.e. characters rather than tokens.
  total_chars=$(_sql "SELECT SUM(response_len) FROM inferences WHERE status='success'") || total_chars=""

  printf ' Total requests: %s\n' "${total:-0}"
  printf ' Success rate: '
  if [[ "${total:-0}" -gt 0 ]]; then
    printf '%s%%\n' "$(echo "scale=1; ${success:-0} * 100 / $total" | bc 2>/dev/null || echo "?")"
  else
    printf 'N/A\n'
  fi
  printf ' Avg latency: %sms\n' "${avg_lat:-0}"
  printf ' Total chars out: %s\n\n' "${total_chars:-0}"

  local node model cnt avg_l succ lat status ts icon

  printf ' %bBy node:%b\n' "$BLUE" "$RESET"
  _sql "SELECT node, COUNT(*), CAST(AVG(latency_ms) AS INTEGER), SUM(CASE WHEN status='success' THEN 1 ELSE 0 END) FROM inferences GROUP BY node ORDER BY COUNT(*) DESC" \
    | while IFS='|' read -r node cnt avg_l succ; do
      printf ' %-12s %4d reqs avg:%4dms ok:%d\n' "$node" "$cnt" "$avg_l" "$succ"
    done
  echo ""

  printf ' %bBy model:%b\n' "$BLUE" "$RESET"
  _sql "SELECT model, COUNT(*), CAST(AVG(latency_ms) AS INTEGER) FROM inferences WHERE status='success' GROUP BY model ORDER BY COUNT(*) DESC" \
    | while IFS='|' read -r model cnt avg_l; do
      printf ' %-25s %4d reqs avg:%4dms\n' "$model" "$cnt" "$avg_l"
    done
  echo ""

  printf ' %bLast 10:%b\n' "$BLUE" "$RESET"
  _sql "SELECT node, model, latency_ms, status, timestamp FROM inferences ORDER BY id DESC LIMIT 10" \
    | while IFS='|' read -r node model lat status ts; do
      icon="+"
      [[ "$status" == "fail" ]] && icon="x"
      printf ' %s %-10s %-20s %5dms %s\n' "$icon" "$node" "$model" "$lat" "$ts"
    done
  echo
}

#######################################
# List available models across fleet: one "<model> <node>" row for every
# model reported by every reachable node, sorted by model name.
# Outputs:   table on stdout
#######################################
cmd_models() {
  printf '%bFleet Models:%b\n\n' "$PINK" "$RESET"
  printf ' %-30s %s\n' "MODEL" "AVAILABLE ON"
  printf ' %-30s %s\n' "─────" "────────────"

  local node ip tags m n
  for node in "${OLLAMA_NODES[@]}"; do
    ip="${NODE_IP[$node]:-}"
    [[ -n "$ip" ]] || continue
    tags=$(curl -sf --connect-timeout 3 "http://${ip}:11434/api/tags" 2>/dev/null) || continue
    # Emit "<model>|<node>" pairs; an unparsable reply just contributes no rows.
    printf '%s' "$tags" | jq -r --arg n "$node" '.models[] | "\(.name)|\($n)"' 2>/dev/null || true
  done | sort | while IFS='|' read -r m n; do
    printf ' %-30s %s\n' "$m" "$n"
  done
  echo
}

#######################################
# Print help text.
# NOTE(review): the heredoc opener and the first help lines (title, USAGE
# heading, the query line and the "batch ... <file>" synopsis) were missing
# from the source and have been reconstructed from the file header; the
# column alignment is also a reconstruction. Confirm the wording.
#######################################
usage() {
  cat <<EOF
${PINK}br-infer${RESET} - Production AI inference across the fleet

${BLUE}USAGE:${RESET}
  br-infer [options] "prompt"        Query with smart routing
  br-infer batch [-m model] <file>   Batch inference (one prompt per line)
  br-infer health                    Check fleet health
  br-infer models                    List available models
  br-infer stats                     Inference statistics

${BLUE}OPTIONS:${RESET}
  -m, --model MODEL   Model to use (default: ${DEFAULT_MODEL})
  -s, --system TEXT   System prompt
  -j, --json          JSON output

${GREEN}EXAMPLES:${RESET}
  br-infer "explain TCP in one sentence"
  br-infer -m mistral "write a haiku about code"
  br-infer -j "what is 2+2"
  echo "hello" | br-infer
  br-infer batch prompts.txt
  br-infer health
  br-infer stats
EOF
}

#######################################
# Entry point: dispatch on the first argument; anything that is not a known
# subcommand is treated as a query.
#######################################
main() {
  case "${1:-}" in
    -h | --help | help | "")
      usage
      return 0
      ;;
  esac

  _ensure_storage

  case "$1" in
    health | h) cmd_health ;;
    stats | s) cmd_stats ;;
    models | m) cmd_models ;;
    batch | b)
      shift
      cmd_batch "$@"
      ;;
    *) cmd_query "$@" ;;
  esac
}

main "$@"