blackroad/bin/br-infer

#!/usr/bin/env bash
# ============================================================================
# BLACKROAD OS, INC. - PROPRIETARY AND CONFIDENTIAL
# Copyright (c) 2025-2026 BlackRoad OS, Inc. All Rights Reserved.
# ============================================================================
# br-infer - Production AI inference across the fleet
# Smart routing with health checks, retries, and metrics
# Usage: br-infer [options] "prompt"
# Strict mode: stop on the first unhandled failure, and let a pipeline fail
# when any of its stages fails.
set -e -o pipefail

# Optional fleet configuration. A missing or failing file is tolerated and its
# diagnostics are discarded.
# NOTE(review): NODE_IP and the color variables (PINK, AMBER, ...) are not set
# anywhere in this file, so presumably this is where they come from - confirm.
. "${HOME}/.blackroad/config/nodes.sh" 2>/dev/null || :

# On-disk state: working directory, metrics database and cache directory.
INFER_DIR="${HOME}/.blackroad/inference"
INFER_DB="${INFER_DIR}/inference.db"
CACHE_DIR="${INFER_DIR}/cache"
mkdir -p "${INFER_DIR}" "${CACHE_DIR}"

# Run sqlite3 against the metrics database.
# Globals:   INFER_DB (read)
# Arguments: passed through to sqlite3 (typically one SQL string); without a
#            SQL argument sqlite3 reads its statements from stdin
# Outputs:   query results on stdout; sqlite3's stderr is discarded
# Returns:   sqlite3's exit status
_sql() {
  # The busy timeout is a per-connection setting, so it has to be applied on
  # every invocation. ".timeout" sets it without emitting a result row (the
  # equivalent PRAGMA would print "5000" into the query output).
  sqlite3 -cmd '.timeout 5000' "$INFER_DB" "$@" 2>/dev/null
}
# Escape a value for use inside a single-quoted SQL string literal by doubling
# every single quote.
# Arguments: $1 - raw text
# Outputs:   the escaped text on stdout, followed by a newline
_sql_escape() {
  local quote="'"
  # printf rather than echo: echo would treat values such as "-n" or "-e" as
  # options and print nothing for them.
  printf '%s\n' "${1//$quote/$quote$quote}"
}

# Fleet nodes that are probed for an Ollama endpoint, in this order.
OLLAMA_NODES=(cecilia lucidia alice)

# Tunables, each overridable from the environment:
#   BR_DEFAULT_MODEL - model used when -m/--model is not given
#   BR_MAX_RETRIES   - additional attempts after the first one
#   BR_TIMEOUT_S     - per-request time limit in seconds (curl --max-time)
DEFAULT_MODEL="${BR_DEFAULT_MODEL:-llama3.2}"
MAX_RETRIES="${BR_MAX_RETRIES:-2}"
TIMEOUT_S="${BR_TIMEOUT_S:-120}"

# Both values are used in numeric contexts, so anything that is not a plain
# non-negative integer falls back to the built-in default.
[[ "$MAX_RETRIES" =~ ^[0-9]+$ ]] || MAX_RETRIES=2
[[ "$TIMEOUT_S" =~ ^[0-9]+$ ]] || TIMEOUT_S=120

# Create the metrics schema and switch the database to WAL journaling.
# Every statement is idempotent (IF NOT EXISTS), so re-running is harmless.
# Globals:  INFER_DB (read, for the error message; the database itself is
#           reached through _sql)
# Outputs:  nothing on stdout; one diagnostic line on stderr on failure
# Returns:  0 on success, 1 when the schema could not be applied
init_db() {
  # The PRAGMA statements return their resulting value as a row ("5000",
  # "wal"). stdout is discarded so those rows cannot end up in front of the
  # inference output on the first run. busy_timeout comes first so that it is
  # already in effect for the statements that follow on this connection.
  if ! _sql >/dev/null <<'SQL'
PRAGMA busy_timeout=5000;
CREATE TABLE IF NOT EXISTS inferences (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP,
    node TEXT,
    model TEXT,
    prompt_len INTEGER,
    response_len INTEGER,
    latency_ms INTEGER,
    status TEXT,
    cached INTEGER DEFAULT 0
);
CREATE TABLE IF NOT EXISTS node_health (
    node TEXT PRIMARY KEY,
    last_check DATETIME,
    status TEXT DEFAULT 'unknown',
    avg_latency_ms REAL DEFAULT 0,
    success_count INTEGER DEFAULT 0,
    fail_count INTEGER DEFAULT 0,
    models TEXT
);
CREATE INDEX IF NOT EXISTS idx_infer_ts ON inferences(timestamp);
CREATE INDEX IF NOT EXISTS idx_infer_node ON inferences(node);
PRAGMA journal_mode=WAL;
SQL
  then
    printf 'br-infer: could not initialize database %s\n' "$INFER_DB" >&2
    return 1
  fi
}

# Current time in milliseconds since the epoch, for latency measurements.
# Outputs: an integer on stdout; 0 when no clock source produced a number.
_br_health_now_ms() {
  local ms
  ms=$(date +%s%3N 2>/dev/null) || ms=""
  # Not every date implementation supports %N; some print non-numeric text
  # and still exit 0, so the result is validated before it is trusted.
  if [[ ! "$ms" =~ ^[0-9]+$ ]]; then
    ms=$(python3 -c 'import time; print(int(time.time()*1000))' 2>/dev/null) || ms=""
  fi
  [[ "$ms" =~ ^[0-9]+$ ]] || ms=0
  printf '%s\n' "$ms"
}

# Probe every configured fleet node plus localhost, print a status table and
# store each fleet node's result in the node_health table.
# Globals:  OLLAMA_NODES, NODE_IP, AMBER, GREEN, RED, RESET (read)
# Outputs:  the status table on stdout
# Returns:  0
cmd_health() {
  local node ip endpoint tags start_ms end_ms lat count model_list

  printf '%bChecking fleet inference health...%b\n\n' "$AMBER" "$RESET"
  printf '  %-12s %-8s %-10s %-8s %s\n' "NODE" "STATUS" "LATENCY" "MODELS" "ENDPOINT"
  printf '  %-12s %-8s %-10s %-8s %s\n' "────" "──────" "───────" "──────" "────────"

  for node in "${OLLAMA_NODES[@]}"; do
    ip="${NODE_IP[$node]:-}"
    # Nodes without a configured address are left out of the table.
    [[ -z "$ip" ]] && continue
    endpoint="http://${ip}:11434"

    printf '  %-12s ' "$node"

    start_ms=$(_br_health_now_ms)
    # An unreachable node is an expected outcome, not an error: without the
    # fallback the failed assignment would terminate the script under `set -e`
    # before the DOWN row is printed.
    tags=$(curl -sf --connect-timeout 3 --max-time 5 "${endpoint}/api/tags" 2>/dev/null) || tags=""
    end_ms=$(_br_health_now_ms)
    lat=$((end_ms - start_ms))

    # NOTE(review): INSERT OR REPLACE rewrites the whole row, so columns not
    # listed below (success_count, fail_count, ...) return to their schema
    # defaults on every health check. Confirm that this reset is intended.
    if [[ -n "$tags" ]]; then
      count=$(jq '.models | length' <<<"$tags" 2>/dev/null) || count=0
      model_list=$(jq -r '[.models[].name] | join(",")' <<<"$tags" 2>/dev/null) || model_list=""

      printf '%b%-8s%b %-10s %-8s %s\n' "$GREEN" "UP" "$RESET" "${lat}ms" "$count" "$endpoint"

      # Persisting the result is best-effort; a database problem must not cut
      # the table short.
      _sql "INSERT OR REPLACE INTO node_health (node, last_check, status, avg_latency_ms, models)
            VALUES ('$node', datetime('now'), 'up', $lat, '$(_sql_escape "$model_list")')" || true
    else
      printf '%b%-8s%b %-10s %-8s %s\n' "$RED" "DOWN" "$RESET" "—" "—" "$endpoint"

      _sql "INSERT OR REPLACE INTO node_health (node, last_check, status)
            VALUES ('$node', datetime('now'), 'down')" || true
    fi
  done

  # localhost is only reported; no node_health row is written for it.
  printf '  %-12s ' "localhost"
  if curl -sf --connect-timeout 1 "http://localhost:11434/api/tags" &>/dev/null; then
    printf '%b%-8s%b\n' "$GREEN" "UP" "$RESET"
  else
    printf '%b%-8s%b\n' "$AMBER" "N/A" "$RESET"
  fi
  echo
}

# Choose the node that should serve the next request.
# localhost is returned as soon as its /api/tags endpoint answers. Otherwise
# every reachable fleet node is ranked by the avg_latency_ms stored for it in
# node_health (rows with status 'up' only; 500 when there is no usable value)
# and the lowest value wins; on a tie the node listed first in OLLAMA_NODES
# is kept.
# Globals:   OLLAMA_NODES, NODE_IP (read)
# Arguments: $1  - model name; accepted for interface compatibility but not
#                  consulted when choosing a node
#            $2… - optional node names to leave out, e.g. nodes that already
#                  failed during the current request
# Outputs:   "<node>:<ip>" on stdout ("localhost:localhost" for the local
#            endpoint)
# Returns:   0 when a node was selected, 1 when none is available
find_best_node() {
  local node ip cached_lat name
  local best_node="" best_ip="" best_lat=999999
  local -A skip=()

  for name in "${@:2}"; do
    if [[ -n "$name" ]]; then
      skip["$name"]=1
    fi
  done

  # The local endpoint always wins when it is up and not excluded.
  if [[ -z "${skip[localhost]:-}" ]] \
    && curl -sf --connect-timeout 1 "http://localhost:11434/api/tags" &>/dev/null; then
    echo "localhost:localhost"
    return 0
  fi

  for node in "${OLLAMA_NODES[@]}"; do
    [[ -n "${skip[$node]:-}" ]] && continue
    ip="${NODE_IP[$node]:-}"
    [[ -z "$ip" ]] && continue

    # Only nodes that answer right now are candidates.
    curl -sf --connect-timeout 2 "http://${ip}:11434/api/tags" &>/dev/null || continue

    cached_lat=$(_sql "SELECT avg_latency_ms FROM node_health WHERE node='$node' AND status='up'" 2>/dev/null) || cached_lat=""
    # avg_latency_ms is stored as REAL; compare on the integer part only and
    # fall back to 500 when there is no row or the value is not a number.
    cached_lat="${cached_lat%%.*}"
    [[ "$cached_lat" =~ ^[0-9]+$ ]] || cached_lat=500

    # 10# forces base 10 so a value with leading zeros is not read as octal.
    if (( 10#$cached_lat < best_lat )); then
      best_lat=$((10#$cached_lat))
      best_node="$node"
      best_ip="$ip"
    fi
  done

  if [[ -n "$best_node" ]]; then
    echo "${best_node}:${best_ip}"
    return 0
  fi

  return 1
}

# Core inference
do_infer() {
  local model="$1"
  local prompt="$2"
  local node_info="$3"

  local node="${node_info%%:*}"
  local ip="${node_info#*:}"

  local host="http://${ip}:11434"
  [[ "$ip" == "localhost" ]] && host="http://localhost:11434"

  local start_ms=$(date +%s%3N 2>/dev/null || python3 -c 'import time; print(int(time.time()*1000))')

  local response
  response=$(curl -sf --max-time "$TIMEOUT_S" "${host}/api/generate" \
    -d "$(jq -n --arg m "$model" --arg p "$prompt" '{model: $m, prompt: $p, stream: false}')" 2>/dev/null)

  local end_ms=$(date +%s%3N 2>/dev/null || python3 -c 'import time; print(int(time.time()*1000))')
  local lat=$((end_ms - start_ms))

  local text
  text=$(echo "$response" | jq -r '.response // empty' 2>/dev/null)

  if [[ -n "$text" ]]; then
    # Record success
    _sql "INSERT INTO inferences (node, model, prompt_len, response_len, latency_ms, status)
          VALUES ('$node', '$(_sql_escape "$model")', ${#prompt}, ${#text}, $lat, 'success')"
    _sql "UPDATE node_health SET success_count = success_count + 1,
          avg_latency_ms = (avg_latency_ms * success_count + $lat) / (success_count + 1)
          WHERE node='$node'"
    echo "$text"
    return 0
  else
    _sql "INSERT INTO inferences (node, model, prompt_len, response_len, latency_ms, status)
          VALUES ('$node', '$(_sql_escape "$model")', ${#prompt}, 0, $lat, 'fail')"
    _sql "UPDATE node_health SET fail_count = fail_count + 1 WHERE node='$node'"
    return 1
  fi
}

# Run one prompt, falling back to other nodes when an attempt fails.
# Globals:   DEFAULT_MODEL, MAX_RETRIES, RED, AMBER, PINK, RESET (read)
# Options:   -m|--model MODEL   model to use (default: DEFAULT_MODEL)
#            -s|--system TEXT   text placed in front of the prompt
#            -j|--json          print a JSON object instead of plain text
# Arguments: the remaining words form the prompt; when there are none and
#            stdin is not a terminal, the prompt is read from stdin
# Outputs:   the response on stdout; progress and errors on stderr
# Returns:   0 on success, 1 on a usage error, when no node is available or
#            when every attempt failed
cmd_query() {
  local model="$DEFAULT_MODEL"
  local system_prompt=""
  local json_output=false

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -m|--model)
        # Without this check `shift 2` fails and the script dies silently.
        [[ $# -ge 2 ]] || { echo "br-infer: $1 requires a value" >&2; return 1; }
        model="$2"
        shift 2
        ;;
      -s|--system)
        [[ $# -ge 2 ]] || { echo "br-infer: $1 requires a value" >&2; return 1; }
        system_prompt="$2"
        shift 2
        ;;
      -j|--json)
        json_output=true
        shift
        ;;
      *)
        break
        ;;
    esac
  done

  local prompt="$*"

  # Read from stdin if no prompt
  if [[ -z "$prompt" && ! -t 0 ]]; then
    prompt=$(cat)
  fi

  if [[ -z "$prompt" ]]; then
    echo "Usage: br-infer \"prompt\"" >&2
    return 1
  fi

  # Separate the system prompt with a real blank line; "\n" inside double
  # quotes would be sent as the two characters backslash and n.
  if [[ -n "$system_prompt" ]]; then
    prompt="${system_prompt}"$'\n\n'"${prompt}"
  fi

  local attempt=0 node_info node result tried already
  local -a tried_nodes=()

  while (( attempt <= MAX_RETRIES )); do
    # The already-tried nodes are passed as extra arguments so that a
    # find_best_node implementation that honors them can offer another node;
    # the skip check below covers implementations that ignore them.
    if ! node_info=$(find_best_node "$model" "${tried_nodes[@]}"); then
      if (( ${#tried_nodes[@]} > 0 )); then
        break
      fi
      printf '%bNo inference nodes available%b\n' "$RED" "$RESET" >&2
      return 1
    fi

    node="${node_info%%:*}"

    # Skip already-tried nodes
    already=false
    for tried in "${tried_nodes[@]}"; do
      if [[ "$tried" == "$node" ]]; then
        already=true
        break
      fi
    done
    if $already; then
      # Plain assignment instead of ((attempt++)): the arithmetic command
      # returns status 1 when the old value is 0, which ends the script
      # under `set -e`.
      attempt=$((attempt + 1))
      continue
    fi

    tried_nodes+=("$node")

    if (( attempt > 0 )); then
      printf '%bRetrying on %s...%b\n' "$AMBER" "$node" "$RESET" >&2
    fi
    printf '%b→ %s%b (model: %s)\n' "$PINK" "$node" "$RESET" "$model" >&2

    if result=$(do_infer "$model" "$prompt" "$node_info"); then
      if $json_output; then
        jq -n --arg node "$node" --arg model "$model" --arg response "$result" \
          '{node: $node, model: $model, response: $response}'
      else
        printf '%s\n' "$result"
      fi
      return 0
    fi

    attempt=$((attempt + 1))
  done

  printf '%bAll inference attempts failed%b\n' "$RED" "$RESET" >&2
  return 1
}

# Run every prompt of a file through cmd_query.
# Blank lines and lines starting with "#" are skipped.
# Globals:   DEFAULT_MODEL, AMBER, GREEN, RED, PINK, RESET (read)
# Options:   -m|--model MODEL   model used for every prompt
# Arguments: path of a file with one prompt per line
# Outputs:   for each successful prompt a "---" / PROMPT / RESPONSE record on
#            stdout; progress and the final summary on stderr
# Returns:   0 once the file was processed (failed prompts are only counted
#            in the summary), 1 on a usage error
cmd_batch() {
  local model="$DEFAULT_MODEL"
  local input_file=""

  while [[ $# -gt 0 ]]; do
    case "$1" in
      -m|--model)
        # Without this check `shift 2` fails and the script dies silently.
        [[ $# -ge 2 ]] || { echo "br-infer: $1 requires a value" >&2; return 1; }
        model="$2"
        shift 2
        ;;
      *)
        input_file="$1"
        shift
        ;;
    esac
  done

  if [[ -z "$input_file" || ! -f "$input_file" ]]; then
    echo "Usage: br-infer batch [-m model] <file>" >&2
    echo "  File should contain one prompt per line" >&2
    return 1
  fi

  local total=0 success=0 line result
  # `|| [[ -n "$line" ]]` keeps a final line that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    if [[ -z "$line" || "$line" == "#"* ]]; then
      continue
    fi
    # Plain assignments instead of ((total++)): the arithmetic command
    # returns status 1 when the old value is 0, which ends the script under
    # `set -e` on the very first prompt.
    total=$((total + 1))

    printf '%b[%d] %b' "$AMBER" "$total" "$RESET" >&2
    # stdin is detached so nothing started by cmd_query can consume the
    # remaining lines of the input file.
    if result=$(cmd_query -m "$model" "$line" 2>/dev/null </dev/null); then
      success=$((success + 1))
      printf '%bOK%b\n' "$GREEN" "$RESET" >&2
      echo "---"
      echo "PROMPT: $line"
      echo "RESPONSE: $result"
      echo ""
    else
      printf '%bFAIL%b\n' "$RED" "$RESET" >&2
    fi
  done < "$input_file"

  printf '\n%bBatch complete: %d/%d succeeded%b\n' "$PINK" "$success" "$total" "$RESET" >&2
}

# Print request statistics from the inferences table: overall totals, a
# per-node and a per-model breakdown, and the ten most recent requests.
# Globals:  PINK, BLUE, RESET (read)
# Outputs:  the report on stdout
cmd_stats() {
  local n_total n_ok mean_ms chars_out rate

  printf '%b╔══════════════════════════════════════════╗%b\n%b║      Inference Statistics                 ║%b\n%b╚══════════════════════════════════════════╝%b\n\n' \
    "$PINK" "$RESET" "$PINK" "$RESET" "$PINK" "$RESET"

  # Query failures are tolerated here; an empty result is shown as 0.
  n_total=$(_sql "SELECT COUNT(*) FROM inferences") || true
  n_ok=$(_sql "SELECT COUNT(*) FROM inferences WHERE status='success'") || true
  mean_ms=$(_sql "SELECT CAST(AVG(latency_ms) AS INTEGER) FROM inferences WHERE status='success'") || true
  chars_out=$(_sql "SELECT SUM(response_len) FROM inferences WHERE status='success'") || true

  # The percentage is computed with bc; "?" is shown when bc cannot be run.
  rate="N/A"
  if [[ "${n_total:-0}" -gt 0 ]]; then
    rate="$(bc 2>/dev/null <<<"scale=1; ${n_ok:-0} * 100 / $n_total" || echo "?")%"
  fi

  printf '  Total requests:     %s\n' "${n_total:-0}"
  printf '  Success rate:       %s\n' "$rate"
  printf '  Avg latency:        %sms\n' "${mean_ms:-0}"
  printf '  Total chars out:    %s\n\n' "${chars_out:-0}"

  # sqlite3 separates columns with "|", which is what the loops split on.
  printf '  %bBy node:%b\n' "$BLUE" "$RESET"
  _sql "SELECT node, COUNT(*), CAST(AVG(latency_ms) AS INTEGER), SUM(CASE WHEN status='success' THEN 1 ELSE 0 END) FROM inferences GROUP BY node ORDER BY COUNT(*) DESC" \
    | while IFS='|' read -r host requests mean ok; do
        printf '    %-12s %4d reqs  avg:%4dms  ok:%d\n' "$host" "$requests" "$mean" "$ok"
      done

  printf '\n'
  printf '  %bBy model:%b\n' "$BLUE" "$RESET"
  _sql "SELECT model, COUNT(*), CAST(AVG(latency_ms) AS INTEGER) FROM inferences WHERE status='success' GROUP BY model ORDER BY COUNT(*) DESC" \
    | while IFS='|' read -r name requests mean; do
        printf '    %-25s %4d reqs  avg:%4dms\n' "$name" "$requests" "$mean"
      done

  printf '\n'
  printf '  %bLast 10:%b\n' "$BLUE" "$RESET"
  _sql "SELECT node, model, latency_ms, status, timestamp FROM inferences ORDER BY id DESC LIMIT 10" \
    | while IFS='|' read -r host name elapsed state stamp; do
        # "x" marks a failed request, "+" everything else.
        case "$state" in
          fail) mark="x" ;;
          *) mark="+" ;;
        esac
        printf '    %s %-10s %-20s %5dms  %s\n' "$mark" "$host" "$name" "$elapsed" "$stamp"
      done
  printf '\n'
}

# List every model offered by the fleet, one row per model, together with
# the comma-separated nodes that serve it.
# Globals:  OLLAMA_NODES, NODE_IP, PINK, RESET (read)
# Outputs:  the table on stdout, sorted by model name; nodes appear in
#           OLLAMA_NODES order
# Returns:  0, also when no node could be reached
cmd_models() {
  printf '%bFleet Models:%b\n\n' "$PINK" "$RESET"
  printf '  %-30s %s\n' "MODEL" "AVAILABLE ON"
  printf '  %-30s %s\n' "─────" "────────────"

  local node ip tags name nodes
  local -A model_nodes=()

  for node in "${OLLAMA_NODES[@]}"; do
    ip="${NODE_IP[$node]:-}"
    [[ -z "$ip" ]] && continue

    # Unreachable nodes are simply left out.
    tags=$(curl -sf --connect-timeout 3 "http://${ip}:11434/api/tags" 2>/dev/null) || continue

    # Process substitution keeps the loop in the current shell so the map
    # survives it, and a jq failure on one node does not affect the others.
    while IFS= read -r name; do
      if [[ -z "$name" ]]; then
        continue
      fi
      if [[ -n "${model_nodes[$name]:-}" ]]; then
        model_nodes["$name"]+=",${node}"
      else
        model_nodes["$name"]="$node"
      fi
    done < <(jq -r '.models[].name' <<<"$tags" 2>/dev/null)
  done

  for name in "${!model_nodes[@]}"; do
    printf '%s|%s\n' "$name" "${model_nodes[$name]}"
  done | sort | while IFS='|' read -r name nodes; do
    printf '  %-30s %s\n' "$name" "$nodes"
  done
  echo
}

# Print the help text to stdout.
# Globals:  PINK, BLUE, GREEN, RESET, DEFAULT_MODEL (read)
usage() {
  # Every other message in this script passes the color variables through
  # printf '%b'. Doing the same here means values that hold a literal escape
  # sequence (such as "\033[0m") are rendered instead of printed verbatim;
  # values that already contain the control characters pass through as-is.
  local pink blue green reset
  pink=$(printf '%b' "${PINK:-}")
  blue=$(printf '%b' "${BLUE:-}")
  green=$(printf '%b' "${GREEN:-}")
  reset=$(printf '%b' "${RESET:-}")

  cat <<EOF
${pink}br-infer${reset} - Production AI inference across the fleet

${blue}USAGE:${reset}
  br-infer [options] "prompt"       Run inference
  br-infer batch <file>             Batch inference (one prompt per line)
  br-infer health                   Check fleet health
  br-infer models                   List available models
  br-infer stats                    Inference statistics

${blue}OPTIONS:${reset}
  -m, --model MODEL    Model to use (default: ${DEFAULT_MODEL})
  -s, --system TEXT     System prompt
  -j, --json            JSON output

${green}EXAMPLES:${reset}
  br-infer "explain TCP in one sentence"
  br-infer -m mistral "write a haiku about code"
  br-infer -j "what is 2+2"
  echo "hello" | br-infer
  br-infer batch prompts.txt
  br-infer health
  br-infer stats
EOF
}

# Create the schema on first use, i.e. when the database file does not exist.
if [[ ! -f "$INFER_DB" ]]; then
  init_db
fi

# Dispatch on the first argument. Anything that is not a known subcommand is
# handed to cmd_query together with all remaining arguments.
case "${1-}" in
  "" | help | -h | --help)
    usage
    ;;
  h | health)
    cmd_health
    ;;
  s | stats)
    cmd_stats
    ;;
  m | models)
    cmd_models
    ;;
  b | batch)
    shift
    cmd_batch "$@"
    ;;
  *)
    cmd_query "$@"
    ;;
esac