mirror of
https://github.com/blackboxprogramming/blackroad-os-kpis.git
synced 2026-03-18 06:34:02 -05:00
RoadChain-SHA2048: f59eb7b3bb74e603 RoadChain-Identity: alexa@sovereign RoadChain-Full: f59eb7b3bb74e60325e3dc2bde2dce2d3f77e4afdadb4b559bf44b95318aac5e44854cc056bec1414243177e469feccdf8a5bf21697916f004706ac784fc70a9ce221703ff29c91581884e5903b5d4a6127a3b570eda54801cf7641a17b13490bb8a3d3be04ee01a96030087800f62f02f47e6ca3d76a3e832c8cdfbeefa3ffbd57acc133d9a7d684161e565dd53636a48410cd38d322620c3fac516a79e5831edf3dab2e81c484f583628c617f85f066351f403163cea6e939484ab33bdaaaa27b23695999aa0e68ae9bff10bf9dfbabcd2785b286600e940359f8e2122c708ed62e7a358accafd224da40151104d77017d4c91fa5b35cce8ca7a728d1b37e0
82 lines
2.8 KiB
Bash
82 lines
2.8 KiB
Bash
#!/bin/bash
# Collect service-level KPIs from fleet: Ollama, Docker, PostgreSQL, Nginx, systemd
#
# For every entry in $FLEET_NODES the probe script (services-probe.py, next to
# this file) is streamed over SSH to a remote `python3 -`, and whatever it
# prints is stored as that node's JSON value. Nodes that return nothing are
# recorded as {"status": "offline"}. Fleet-wide totals are then computed and
# the result is written to the file named by `snapshot_file services`.
#
# Relies on names that are not defined in this file (presumably provided by
# ../lib/common.sh -- confirm there):
#   functions: log, ok, err, snapshot_file, get_ssh_user
#   variables: FLEET_NODES (whitespace-separated "node:ip" entries),
#              TIMESTAMP, TODAY

source "$(dirname "$0")/../lib/common.sh"

# Best-effort collection: one unreachable node must not abort the whole run.
set +e

log "Collecting service KPIs..."

OUT=$(snapshot_file services)
PROBE_SCRIPT="$(dirname "$0")/services-probe.py"

# Without the probe script every ssh call would fail on the stdin redirect and
# each node would be misreported as "offline", so stop early instead.
if [ ! -r "$PROBE_SCRIPT" ]; then
  err "Probe script not readable: $PROBE_SCRIPT"
  exit 1
fi

# NOTE(review): StrictHostKeyChecking=no disables host-key verification;
# kept as in the original, but worth confirming this is intended.
ssh_opts=(
  -o ConnectTimeout=3
  -o ServerAliveInterval=3
  -o ServerAliveCountMax=2
  -o StrictHostKeyChecking=no
  -o BatchMode=yes
)

# The JSON object is assembled textually: "<node>": <probe output>, ...
nodes_json='{'
sep=''

# Word splitting of $FLEET_NODES is intentional: one "node:ip" entry per word.
for entry in $FLEET_NODES; do
  # Field 1 and field 2 of the colon-separated entry (same results as
  # `cut -d: -f1` / `cut -d: -f2`, including for entries without a colon).
  node=${entry%%:*}
  ip=${entry#*:}
  ip=${ip%%:*}
  user=$(get_ssh_user "$node")

  log "Probing services on $node..."

  # The probe source is fed to the remote interpreter on stdin. SSH errors are
  # discarded on purpose; an empty result is what marks the node as offline.
  result=$(ssh "${ssh_opts[@]}" "$user@$ip" "python3 -" < "$PROBE_SCRIPT" 2>/dev/null)

  if [ -n "$result" ]; then
    nodes_json+="${sep}\"${node}\": ${result}"
    ok "$node: services probed"
  else
    nodes_json+="${sep}\"${node}\": {\"status\": \"offline\"}"
    err "$node: offline"
  fi
  sep=', '
done

nodes_json+='}'

# Aggregate
#
# The Python program is a single-quoted shell string, so nothing is expanded
# into its source. The node JSON arrives on stdin and the scalar values as
# argv; backslash escapes or quotes inside probe output, paths or timestamps
# therefore cannot corrupt the program or the data.
if python3 -c '
import json
import sys

out_path, collected_at, today = sys.argv[1:4]
nodes = json.load(sys.stdin)
online = [v for v in nodes.values() if v.get("status") != "offline"]


def total(section, field):
    """Sum a nested per-node counter, treating missing values as 0."""
    return sum(v.get(section, {}).get(field, 0) for v in online)


def flat_total(field):
    """Sum a top-level per-node counter, treating missing values as 0."""
    return sum(v.get(field, 0) for v in online)


output = {
    "source": "services",
    "collected_at": collected_at,
    "date": today,
    "totals": {
        "ollama_models": total("ollama", "count"),
        "ollama_size_gb": round(total("ollama", "size_gb"), 1),
        "docker_containers": total("docker", "running"),
        "docker_images": total("docker", "images"),
        "postgres_dbs": total("postgres", "databases"),
        "nginx_sites": total("nginx", "sites"),
        "systemd_services": total("systemd", "services"),
        "systemd_timers": total("systemd", "timers"),
        "systemd_failed": total("systemd", "failed"),
        "processes": flat_total("processes"),
        "network_connections": flat_total("connections"),
        "swap_used_mb": total("swap", "used_mb"),
        "swap_total_mb": total("swap", "total_mb"),
        # Largest value reported by any node rather than a sum.
        "tailscale_peers": max(
            (v.get("tailscale_peers", 0) for v in online), default=0
        ),
    },
    "nodes": nodes,
}

with open(out_path, "w") as f:
    json.dump(output, f, indent=2)
' "$OUT" "$TIMESTAMP" "$TODAY" <<<"$nodes_json"; then
  ok "Services collected"
else
  # Previously stderr was discarded and success was reported unconditionally.
  err "Services aggregation failed"
fi