From f3abe91a999d49df7d1fcaa6d311d61c80d17f61 Mon Sep 17 00:00:00 2001 From: Victoria Godsoe Date: Tue, 13 Jan 2026 11:39:05 -0800 Subject: [PATCH 1/5] Add monitoring for LIRA self-hosted CI runners --- .../workflows/monitor_selfhosted_runners.yml | 76 +++++++++++++++++++ .github/workflows/runner_heartbeat.yml | 32 ++++++++ 2 files changed, 108 insertions(+) create mode 100644 .github/workflows/monitor_selfhosted_runners.yml create mode 100644 .github/workflows/runner_heartbeat.yml diff --git a/.github/workflows/monitor_selfhosted_runners.yml b/.github/workflows/monitor_selfhosted_runners.yml new file mode 100644 index 0000000..cfaf761 --- /dev/null +++ b/.github/workflows/monitor_selfhosted_runners.yml @@ -0,0 +1,76 @@ +name: Monitor Self-Hosted Runners + +on: + schedule: + - cron: "0 1 * * 1" # every Monday at 1AM UTC + workflow_dispatch: + push: + branches: + - vgodsoe/monitor-runners # Temporary: remove before merging + paths: + - '.github/workflows/monitor_selfhosted_runners.yml' + +jobs: + check: + runs-on: ubuntu-latest + + steps: + - name: Get list of org runners + id: runners + run: | + curl -s \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + https://api.github.com/orgs/${{ github.repository_owner }}/actions/runners \ + > runners.json + + echo "names=$(jq -r '.runners[].name' runners.json | tr '\n' ',')" >> $GITHUB_OUTPUT + + - name: Get artifacts + id: artifacts + run: | + curl -s \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + https://api.github.com/repos/${{ github.repository }}/actions/artifacts \ + > artifacts.json + + - name: Check each runner heartbeat + id: check + run: | + missing="" + now=$(date -u +%s) + + for r in $(echo "${{ steps.runners.outputs.names }}" | tr ',' ' '); do + [ -z "$r" ] && continue + echo "Checking $r" + + ts=$(jq -r --arg r "$r" '.artifacts[] | select(.name=="heartbeat-"+$r) | .updated_at' artifacts.json) + + if [ -z "$ts" ] || [ "$ts" == "null" ]; then + echo "No heartbeat found for $r" + missing="$missing $r" + continue + fi + + hb=$(date -d "$ts" +%s) + diff=$((now - hb)) + + if [ $diff -gt 691200 ]; then # 8 days (weekly + 1 day buffer) + echo "Heartbeat stale for $r (last seen: $ts)" + missing="$missing $r" + fi + done + + echo "missing=$missing" >> $GITHUB_OUTPUT + + - name: Fail if any runner missing + if: steps.check.outputs.missing != '' + run: | + echo "Missing or stale runners: ${{ steps.check.outputs.missing }}" + exit 1 + + - name: Send Teams alert + if: failure() + run: | + curl -H "Content-Type: application/json" \ + -d "{\"text\": \"⚠️ Self-hosted runner(s) offline: ${{ steps.check.outputs.missing }}\"}" \ + ${{ secrets.TEAMS_WEBHOOK_URL }} diff --git a/.github/workflows/runner_heartbeat.yml b/.github/workflows/runner_heartbeat.yml new file mode 100644 index 0000000..a68c2e9 --- /dev/null +++ b/.github/workflows/runner_heartbeat.yml @@ -0,0 +1,32 @@ +name: Runner Heartbeat + +on: + schedule: + - cron: "0 1 * * 0" # every Sunday at 1AM UTC + workflow_dispatch: + push: + branches: + - vgodsoe/monitor-runners # Temporary: remove before merging + paths: + - '.github/workflows/runner_heartbeat.yml' + +permissions: + actions: write # Required to upload artifacts + +jobs: + heartbeat: + runs-on: [self-hosted] + steps: + - name: Create heartbeat file + shell: bash + run: | + timestamp=$(date -Iseconds) + runner="${{ runner.name }}" + echo "$timestamp" > "heartbeat-$runner.txt" + + - name: Upload heartbeat artifact + uses: actions/upload-artifact@v4 + with: + name: heartbeat-${{ runner.name }} + path: heartbeat-${{ runner.name }}.txt + retention-days: 14 From 0eff37a1a7077ff10d2dba25d7f0057aea1187d3 Mon Sep 17 00:00:00 2001 From: Victoria Godsoe Date: Tue, 13 Jan 2026 11:42:42 -0800 Subject: [PATCH 2/5] Change from bash to pwsh so it'll work on windows and linux --- .github/workflows/runner_heartbeat.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/runner_heartbeat.yml b/.github/workflows/runner_heartbeat.yml index a68c2e9..1b50f42 100644 --- a/.github/workflows/runner_heartbeat.yml +++ b/.github/workflows/runner_heartbeat.yml @@ -18,11 +18,11 @@ jobs: runs-on: [self-hosted] steps: - name: Create heartbeat file - shell: bash + shell: pwsh run: | - timestamp=$(date -Iseconds) - runner="${{ runner.name }}" - echo "$timestamp" > "heartbeat-$runner.txt" + $timestamp = Get-Date -Format "yyyy-MM-ddTHH:mm:ssK" + $runner = "${{ runner.name }}" + "$timestamp" | Out-File -FilePath "heartbeat-$runner.txt" -Encoding utf8 - name: Upload heartbeat artifact uses: actions/upload-artifact@v4 From 7d7d2ecbe6f248e4cb055bd26d9ecaa8e7b846ed Mon Sep 17 00:00:00 2001 From: Victoria Godsoe Date: Tue, 13 Jan 2026 11:45:01 -0800 Subject: [PATCH 3/5] powershell --- .github/workflows/runner_heartbeat.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/runner_heartbeat.yml b/.github/workflows/runner_heartbeat.yml index 1b50f42..0e564e7 100644 --- a/.github/workflows/runner_heartbeat.yml +++ b/.github/workflows/runner_heartbeat.yml @@ -18,7 +18,7 @@ jobs: runs-on: [self-hosted] steps: - name: Create heartbeat file - shell: pwsh + shell: powershell run: | $timestamp = Get-Date -Format "yyyy-MM-ddTHH:mm:ssK" $runner = "${{ runner.name }}" From a0266f372a44eae957eddf3c94d45b0652d8feb3 Mon Sep 17 00:00:00 2001 From: Victoria Godsoe Date: Tue, 13 Jan 2026 11:49:49 -0800 Subject: [PATCH 4/5] Tested and ready to deploy --- .github/workflows/monitor_selfhosted_runners.yml | 7 +------ .github/workflows/runner_heartbeat.yml | 5 ----- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/.github/workflows/monitor_selfhosted_runners.yml b/.github/workflows/monitor_selfhosted_runners.yml index cfaf761..c756a5d 100644 --- a/.github/workflows/monitor_selfhosted_runners.yml +++ b/.github/workflows/monitor_selfhosted_runners.yml @@ -2,13 +2,8 @@ name: Monitor Self-Hosted Runners on: schedule: - - cron: "0 1 * * 1" # every Monday at 1AM UTC + - cron: "0 4 * * 0" # every Sunday at 4AM UTC workflow_dispatch: - push: - branches: - - vgodsoe/monitor-runners # Temporary: remove before merging - paths: - - '.github/workflows/monitor_selfhosted_runners.yml' jobs: check: diff --git a/.github/workflows/runner_heartbeat.yml b/.github/workflows/runner_heartbeat.yml index 0e564e7..9a0e60a 100644 --- a/.github/workflows/runner_heartbeat.yml +++ b/.github/workflows/runner_heartbeat.yml @@ -4,11 +4,6 @@ on: schedule: - cron: "0 1 * * 0" # every Sunday at 1AM UTC workflow_dispatch: - push: - branches: - - vgodsoe/monitor-runners # Temporary: remove before merging - paths: - - '.github/workflows/runner_heartbeat.yml' permissions: actions: write # Required to upload artifacts From 968a0e5e9545325c5500de95e029035628e05d26 Mon Sep 17 00:00:00 2001 From: Victoria Godsoe Date: Tue, 13 Jan 2026 13:51:37 -0800 Subject: [PATCH 5/5] Fixing the workflows for the runners and ensuring the teams plugin works --- .../workflows/monitor_selfhosted_runners.yml | 82 ++++++++++++++++--- .github/workflows/runner_heartbeat.yml | 20 ++++- 2 files changed, 90 insertions(+), 12 deletions(-) diff --git a/.github/workflows/monitor_selfhosted_runners.yml b/.github/workflows/monitor_selfhosted_runners.yml index c756a5d..aca7c05 100644 --- a/.github/workflows/monitor_selfhosted_runners.yml +++ b/.github/workflows/monitor_selfhosted_runners.yml @@ -2,23 +2,47 @@ name: Monitor Self-Hosted Runners on: schedule: - - cron: "0 4 * * 0" # every Sunday at 4AM UTC + - cron: "0 1 * * 1" # every Monday at 1AM UTC workflow_dispatch: +permissions: + actions: read # Required to read artifacts + jobs: check: runs-on: ubuntu-latest steps: - - name: Get list of org runners + - name: Get list of runners id: runners run: | + # Try to get repo-level runners first + curl -s \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + https://api.github.com/repos/${{ github.repository }}/actions/runners \ + > repo_runners.json + + # Also get org-level runners curl -s \ -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ https://api.github.com/orgs/${{ github.repository_owner }}/actions/runners \ - > runners.json - - echo "names=$(jq -r '.runners[].name' runners.json | tr '\n' ',')" >> $GITHUB_OUTPUT + > org_runners.json + + # Combine and deduplicate runner names + repo_names=$(jq -r '.runners[]?.name // empty' repo_runners.json 2>/dev/null || echo "") + org_names=$(jq -r '.runners[]?.name // empty' org_runners.json 2>/dev/null || echo "") + + # Fallback to known runner list if API fails + known_runners="xsjevo04" + + if [ -n "$repo_names" ] || [ -n "$org_names" ]; then + all_names=$(echo -e "$repo_names\n$org_names" | sort -u | tr '\n' ',' | sed 's/,$//') + echo "names=$all_names" >> $GITHUB_OUTPUT + echo "Using API discovered runners: $all_names" + else + echo "names=$known_runners" >> $GITHUB_OUTPUT + echo "Using fallback runner list: $known_runners" + fi - name: Get artifacts id: artifacts @@ -38,7 +62,7 @@ jobs: [ -z "$r" ] && continue echo "Checking $r" - ts=$(jq -r --arg r "$r" '.artifacts[] | select(.name=="heartbeat-"+$r) | .updated_at' artifacts.json) + ts=$(jq -r --arg r "$r" '[.artifacts[] | select(.name=="heartbeat-"+$r)] | sort_by(.updated_at) | last | .updated_at // empty' artifacts.json) if [ -z "$ts" ] || [ "$ts" == "null" ]; then echo "No heartbeat found for $r" @@ -63,9 +87,47 @@ jobs: echo "Missing or stale runners: ${{ steps.check.outputs.missing }}" exit 1 - - name: Send Teams alert + - name: Send Teams alert via Power Automate if: failure() run: | - curl -H "Content-Type: application/json" \ - -d "{\"text\": \"⚠️ Self-hosted runner(s) offline: ${{ steps.check.outputs.missing }}\"}" \ - ${{ secrets.TEAMS_WEBHOOK_URL }} + echo "Sending Teams alert via Power Automate..." + response=$(curl -s -w "%{http_code}" -X POST \ + -H "Content-Type: application/json" \ + -d '{ + "type": "AdaptiveCard", + "version": "1.3", + "body": [ + { + "type": "TextBlock", + "text": "⚠️ Self-hosted Runner Alert", + "weight": "bolder", + "size": "medium", + "color": "attention" + }, + { + "type": "TextBlock", + "text": "The following runners are offline or missing heartbeats:", + "wrap": true + }, + { + "type": "TextBlock", + "text": "${{ steps.check.outputs.missing }}", + "wrap": true, + "fontType": "monospace" + } + ] + }' \ + "${{ secrets.TEAMS_WEBHOOK_URL }}") + + http_code="${response: -3}" + response_body="${response%???}" + + echo "HTTP Status: $http_code" + echo "Response: $response_body" + + + if [ "$http_code" != "200" ] && [ "$http_code" != "202" ]; then + echo "Power Automate webhook failed with status $http_code" + else + echo "Teams alert sent successfully via Power Automate" + fi diff --git a/.github/workflows/runner_heartbeat.yml b/.github/workflows/runner_heartbeat.yml index 9a0e60a..390f483 100644 --- a/.github/workflows/runner_heartbeat.yml +++ b/.github/workflows/runner_heartbeat.yml @@ -1,5 +1,7 @@ name: Runner Heartbeat +# Change to trigger push + on: schedule: - cron: "0 1 * * 0" # every Sunday at 1AM UTC @@ -10,15 +12,29 @@ permissions: jobs: heartbeat: - runs-on: [self-hosted] + strategy: + matrix: + runner: + - xsjevo04 + fail-fast: false # Continue even if one runner fails + runs-on: [self-hosted, "${{ matrix.runner }}"] steps: - - name: Create heartbeat file + - name: Create heartbeat file (Windows) + if: runner.os == 'Windows' shell: powershell run: | $timestamp = Get-Date -Format "yyyy-MM-ddTHH:mm:ssK" $runner = "${{ runner.name }}" "$timestamp" | Out-File -FilePath "heartbeat-$runner.txt" -Encoding utf8 + - name: Create heartbeat file (Linux) + if: runner.os == 'Linux' + shell: bash + run: | + timestamp=$(date -Iseconds) + runner="${{ runner.name }}" + echo "$timestamp" > "heartbeat-$runner.txt" + - name: Upload heartbeat artifact uses: actions/upload-artifact@v4 with: