# blackroad-os-prism-console/.github/workflows/health-monitor.yml

# Periodic health probe with automatic restart and incident reporting.
name: 'Health Monitor & Auto-Heal'

on:
  # Allows the workflow to be started manually.
  workflow_dispatch:
  # Cron expression below fires every 5 minutes.
  schedule:
    - cron: "*/5 * * * *"

jobs:
  # Single job: probe the service, try to heal it, and report an incident.
  health-check:
    runs-on: ubuntu-latest
    # Grant the token exactly what the steps use: the auto-restart fallback
    # pushes an empty commit (contents) and the incident step creates or
    # comments on issues. Without this block the job inherits the repository
    # default, which may be read-only and would make both calls fail.
    permissions:
      contents: write
      issues: write

    steps:
      # Check out the repository so later steps can look for railway.json /
      # railway.toml and, if needed, push an empty commit. The action is
      # referenced by a full commit SHA rather than a tag.
      - name: Checkout
        uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332

      # Probe the service's production health endpoint and publish the result
      # as step outputs: service_name, is_railway (only set when a Railway
      # config file exists), health_url, and health_status (ok|failed|unknown).
      - name: Check Service Health
        id: health
        run: |
          echo "🏥 Checking service health..."

          # The service name is the repository name with the owner prefix removed.
          SERVICE_NAME="${GITHUB_REPOSITORY#*/}"
          echo "service_name=$SERVICE_NAME" >> "$GITHUB_OUTPUT"

          # A Railway config file in the checkout marks this as a Railway service.
          if [ -f "railway.json" ] || [ -f "railway.toml" ]; then
            echo "is_railway=true" >> "$GITHUB_OUTPUT"
          fi

          # Known production health endpoints, keyed by repository name.
          # This would normally come from Railway via their API.
          HEALTH_URL=""
          case "$SERVICE_NAME" in
            "blackroad-os-web")
              HEALTH_URL="https://www.blackroad.io/health"
              ;;
            "blackroad-os-prism-console")
              HEALTH_URL="https://app.blackroad.io/health"
              ;;
            "blackroad-os-agents")
              HEALTH_URL="https://agents.blackroad.io/health"
              ;;
            "blackroad-os-operator")
              HEALTH_URL="https://ops.blackroad.io/health"
              ;;
            "blackroad-api")
              HEALTH_URL="https://api.blackroad.io/health"
              ;;
          esac

          echo "health_url=$HEALTH_URL" >> "$GITHUB_OUTPUT"

          # Nothing to probe for repositories without a known endpoint.
          if [ -z "$HEALTH_URL" ]; then
            echo "health_status=unknown" >> "$GITHUB_OUTPUT"
            echo "⚠️ No health URL configured for $SERVICE_NAME"
            exit 0
          fi

          echo "Testing: $HEALTH_URL"

          MAX_RETRIES=3
          ATTEMPT=1
          HEALTH_OK=false

          while [ "$ATTEMPT" -le "$MAX_RETRIES" ]; do
            # curl prints 000 by itself when no HTTP response arrives, so
            # `|| true` only keeps errexit from aborting the step (echoing a
            # second "000" would log "HTTP 000000"). The timeouts stop a hung
            # endpoint from stalling the job.
            HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
              --connect-timeout 5 --max-time 15 "$HEALTH_URL" || true)
            HTTP_CODE="${HTTP_CODE:-000}"

            if [ "$HTTP_CODE" = "200" ]; then
              echo "✅ Health check passed (HTTP $HTTP_CODE)"
              HEALTH_OK=true
              break
            fi

            echo "⚠️ Health check failed (HTTP $HTTP_CODE) - Retry $ATTEMPT/$MAX_RETRIES"

            # Back off only when another attempt will follow.
            if [ "$ATTEMPT" -lt "$MAX_RETRIES" ]; then
              sleep 10
            fi
            ATTEMPT=$((ATTEMPT + 1))
          done

          if [ "$HEALTH_OK" = "true" ]; then
            echo "health_status=ok" >> "$GITHUB_OUTPUT"
          else
            echo "health_status=failed" >> "$GITHUB_OUTPUT"
            echo "❌ Service is unhealthy after $MAX_RETRIES attempts"
            # Deliberately no `exit 1`: a step `if:` without a status function
            # gets an implicit success() check, so failing here would skip the
            # restart and verification steps that key off health_status. The
            # job still fails if recovery verification fails.
          fi

      # Redeploy the unhealthy service through the Railway CLI. If that fails,
      # push an empty commit, which is meant to trigger a full redeploy.
      - name: Auto-Restart Service
        # `!cancelled()` replaces the implicit success() check, so this step
        # runs whenever the health step reported `failed`, even if an earlier
        # step ended with a non-zero exit code.
        if: >-
          ${{
            !cancelled()
            && steps.health.outputs.health_status == 'failed'
            && steps.health.outputs.is_railway == 'true'
          }}
        env:
          RAILWAY_TOKEN: ${{ secrets.RAILWAY_TOKEN }}
        run: |
          echo "🔄 Attempting to restart unhealthy service..."

          npm install -g @railway/cli

          # Status output is informational only; never fail the step on it.
          railway status || true

          if ! railway up --detach; then
            echo "⚠️ Restart failed, triggering full redeploy..."

            # Hosted runners have no git identity configured; without one
            # `git commit` aborts with "Author identity unknown".
            git config user.name "github-actions[bot]"
            git config user.email "41898282+github-actions[bot]@users.noreply.github.com"

            # One -m per paragraph keeps the script's indentation out of the
            # message, so the trailer starts at column 0.
            git commit --allow-empty \
              -m "🤖 Auto-heal: Force redeploy due to health check failure" \
              -m $'Health check failed after multiple retries\nTriggering full redeployment' \
              -m "Co-Authored-By: Claude <noreply@anthropic.com>"
            git push
          fi

          echo "✅ Restart initiated"

      # Wait one minute, then probe the health endpoint once more. A non-200
      # response fails the step (and therefore the job).
      - name: Verify Recovery
        # `!cancelled()` replaces the implicit success() check, so verification
        # still runs when an earlier step ended with a non-zero exit code.
        if: ${{ !cancelled() && steps.health.outputs.health_status == 'failed' }}
        env:
          # Passed through the environment instead of being spliced into the
          # script text by `${{ }}` expansion.
          HEALTH_URL: ${{ steps.health.outputs.health_url }}
        run: |
          echo "🔍 Waiting for service recovery..."
          sleep 60

          # Nothing to verify without an endpoint.
          if [ -z "$HEALTH_URL" ]; then
            exit 0
          fi

          # curl prints 000 by itself when no HTTP response arrives; `|| true`
          # only keeps errexit from aborting before the message below.
          HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" \
            --connect-timeout 5 --max-time 15 "$HEALTH_URL" || true)
          HTTP_CODE="${HTTP_CODE:-000}"

          if [ "$HTTP_CODE" != "200" ]; then
            echo "❌ Service still unhealthy after restart (HTTP $HTTP_CODE)"
            exit 1
          fi

          echo "✅ Service recovered successfully!"

      # Open a "Service Down" issue, or comment on the existing open one, when
      # the job failed and the health probe reported the service unhealthy.
      - name: Create Incident Issue
        # Require a failed health probe as well, so an unrelated failure (for
        # example checkout) does not raise a false "Service Down" incident.
        if: ${{ failure() && steps.health.outputs.health_status == 'failed' }}
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea
        env:
          # Step outputs are handed over as environment variables so they are
          # never interpolated into the JavaScript source.
          SERVICE_NAME: ${{ steps.health.outputs.service_name }}
          HEALTH_URL: ${{ steps.health.outputs.health_url }}
          RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
        with:
          script: |
            const { owner, repo } = context.repo;
            // Fall back to the repository name: an empty service name would
            // make the title search below match every open issue.
            const service = process.env.SERVICE_NAME || repo;
            const healthUrl = process.env.HEALTH_URL;
            const runUrl = process.env.RUN_URL;

            const issue = {
              owner,
              repo,
              title: `🚨 Service Down: ${service}`,
              body: `## Service Health Alert

            **Service:** \`${service}\`
            **Health URL:** ${healthUrl || 'Not configured'}
            **Status:** Unhealthy
            **Auto-Restart:** Failed

            ### Timeline
            - Health check failed after 3 retries
            - Auto-restart attempted
            - Recovery verification failed

            ### Next Steps
            1. Check Railway logs for errors
            2. Review recent deployments
            3. Verify environment variables
            4. Check external dependencies (databases, APIs)

            ### Monitoring
            - Workflow run: ${runUrl}

            This incident was automatically detected and requires manual intervention.

            cc: @blackboxprogramming

            ---
            🤖 Auto-generated incident report`,
              labels: ['incident', 'service-down', 'urgent', 'auto-detected']
            };

            // Look for an incident that is already open for this service.
            const { data: openIssues } = await github.rest.issues.listForRepo({
              owner,
              repo,
              state: 'open',
              labels: 'service-down',
              per_page: 100
            });

            // The issues listing also returns pull requests; skip those.
            const existingIncident = openIssues.find(
              (i) => !i.pull_request && i.title.includes(service)
            );

            if (!existingIncident) {
              const { data: newIssue } = await github.rest.issues.create(issue);
              console.log('Created incident:', newIssue.html_url);
            } else {
              // Append to the open incident instead of filing a duplicate.
              console.log('Updating existing incident:', existingIncident.html_url);
              await github.rest.issues.createComment({
                owner,
                repo,
                issue_number: existingIncident.number,
                body: `🔄 Service still down. Auto-restart attempt failed.

            Health check time: ${new Date().toISOString()}

            Recovery actions taken:
            - Retry health check (3 attempts)
            - Railway restart command
            - Force redeploy trigger

            Manual intervention required.`
              });
            }