about · summary · refs · log · tree · commit · diff · stats
path: root/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
diff options
context:
space:
mode:
Diffstat (limited to 'terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl')
-rw-r--r-- terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl 32
1 files changed, 32 insertions, 0 deletions
diff --git a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
index 4918a5f5bd..d851628fcd 100644
--- a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
+++ b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
@@ -188,6 +188,24 @@ job "${job_name}" {
data = <<EOH
---
groups:
+- name: "Jenkins Job Health Exporter"  # rule group: two threshold alerts on the jenkins_job_* metrics
+ rules:
+ - alert: JenkinsJobHealthExporterFailures  # critical alert on the jenkins_job_failure metric
+ expr: jenkins_job_failure{id=~".*"} >= 10  # NOTE(review): ".*" also matches an empty/missing id, so this matcher appears to filter nothing - confirm intent
+ for: 0m  # pending duration of zero minutes
+ labels:
+ severity: critical
+ annotations:
+ summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
+ description: "Job: {{ $labels.id }}"  # prints the id label of the matching series; id is written by the metric_relabel_configs of the 'Jenkins Job Health Exporter' scrape job
+ - alert: JenkinsJobHealthExporterUnstable  # same shape as the alert above, but on jenkins_job_unstable and at warning severity
+ expr: jenkins_job_unstable{id=~".*"} >= 10  # NOTE(review): same apparently no-op id matcher as the failure alert - confirm intent
+ for: 0m  # pending duration of zero minutes
+ labels:
+ severity: warning
+ annotations:
+ summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
+ description: "Job: {{ $labels.id }}"  # prints the id label of the matching series
- name: "Consul"
rules:
- alert: ConsulServiceHealthcheckFailed
@@ -523,6 +541,20 @@ scrape_configs:
- targets: [ '10.32.8.16:8080' ]
- targets: [ '10.32.8.17:8080' ]
+ - job_name: 'Jenkins Job Health Exporter'  # scrape job with one static target; renames its vpp*/csit* metrics to jenkins_job_<suffix> and keeps the prefix in "id"
+ static_configs:
+ - targets: [ '10.30.51.32:9186' ]
+ metric_relabel_configs:  # two rules share one regex: the first saves the metric-name prefix into "id", the second rewrites the metric name
+ - source_labels: [ __name__ ]  # keep this rule first: the next rule overwrites __name__, which this rule reads
+ regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'  # group 1: name prefix starting with vpp or csit; group 2: one of the listed suffixes
+ action: replace
+ replacement: '$1'
+ target_label: id  # id receives group 1 (the vpp*/csit* prefix)
+ - source_labels: [ __name__ ]
+ regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'  # same pattern as the rule above
+ replacement: 'jenkins_job_$2'  # e.g. <prefix>_failure becomes jenkins_job_failure, the name the alert rules query
+ target_label: __name__  # NOTE(review): no explicit action here, unlike the first rule - presumably relies on the default (replace); confirm
+
- job_name: 'Node Exporter'
static_configs:
- targets: [ '10.30.51.28:9100' ]