From 8843893ca7531cbb2212a5ed79882909c8374381 Mon Sep 17 00:00:00 2001
From: pmikus <pmikus@cisco.com>
Date: Wed, 28 Jul 2021 10:44:43 +0000
Subject: Infra: upgrade monitoring solution

+ bump versions
- remove cadvisor

Signed-off-by: pmikus <pmikus@cisco.com>
Change-Id: I3de95531f1b09f7254152254e92f225dde653e45
---
 fdio.infra.ansible/nomad.yaml                      |  2 -
 .../roles/prometheus_exporter/defaults/main.yaml   | 32 +++++++---
 .../roles/prometheus_exporter/handlers/main.yaml   | 16 -----
 .../roles/prometheus_exporter/tasks/main.yaml      | 69 +++++++++++++++++++--
 .../prometheus_exporter/tasks/ubuntu_bionic.yaml   | 33 ----------
 .../prometheus_exporter/tasks/ubuntu_focal.yaml    | 33 ----------
 fdio.infra.ansible/vpp_device.yaml                 |  2 -
 fdio.infra.terraform/1n_nmd/main.tf                |  6 +-
 .../1n_nmd/prometheus/conf/nomad/prometheus.hcl    | 34 +----------
 .../1n_nmd/prometheus/variables.tf                 |  2 +-
 fdio.infra.terraform/1n_nmd/terraform.tfstate      | 70 ++++++++++-----------
 .../1n_nmd/terraform.tfstate.backup                | 71 ++++++++++++----------
 12 files changed, 164 insertions(+), 206 deletions(-)
 delete mode 100644 fdio.infra.ansible/roles/prometheus_exporter/handlers/main.yaml
 delete mode 100644 fdio.infra.ansible/roles/prometheus_exporter/tasks/ubuntu_bionic.yaml
 delete mode 100644 fdio.infra.ansible/roles/prometheus_exporter/tasks/ubuntu_focal.yaml

diff --git a/fdio.infra.ansible/nomad.yaml b/fdio.infra.ansible/nomad.yaml
index dc13386756..1ebbb417cd 100644
--- a/fdio.infra.ansible/nomad.yaml
+++ b/fdio.infra.ansible/nomad.yaml
@@ -26,7 +26,5 @@
       tags: prometheus_exporter
     - role: jenkins_job_health_exporter
       tags: jenkins_job_health_exporter
-    - role: cadvisor
-      tags: cadvisor
     - role: cleanup
       tags: cleanup
diff --git a/fdio.infra.ansible/roles/prometheus_exporter/defaults/main.yaml b/fdio.infra.ansible/roles/prometheus_exporter/defaults/main.yaml
index eb2b94cb26..e97ad2b0c7 100644
--- a/fdio.infra.ansible/roles/prometheus_exporter/defaults/main.yaml
+++ b/fdio.infra.ansible/roles/prometheus_exporter/defaults/main.yaml
@@ -2,16 +2,30 @@
 # file: roles/prometheus_exporter/defaults/main.yaml
 
 # Inst - Exporters.
-ne_packages: "{{ ne_packages_by_distro[ansible_distribution | lower][ansible_machine] }}"
+packages: "{{ packages_base + packages_by_distro[ansible_distribution | lower] + packages_by_arch[ansible_machine] }}"
 
-ne_packages_by_distro:
+packages_base:
+  - []
+
+packages_by_distro:
   ubuntu:
-    aarch64: "http://ports.ubuntu.com/pool/universe/p/prometheus-node-exporter/prometheus-node-exporter_1.0.1+ds-1_arm64.deb"
-    x86_64: "http://archive.ubuntu.com/ubuntu/pool/universe/p/prometheus-node-exporter/prometheus-node-exporter_1.0.1+ds-1_amd64.deb"
+    - "python3-docker"
+    - "python3-dockerpty"
 
-be_packages: "{{ be_packages_by_distro[ansible_distribution | lower][ansible_machine] }}"
+packages_by_arch:
+  aarch64:
+    - []
+  x86_64:
+    - []
 
-be_packages_by_distro:
-  ubuntu:
-    aarch64: "http://ports.ubuntu.com/pool/universe/p/prometheus-blackbox-exporter/prometheus-blackbox-exporter_0.17.0+ds-1_arm64.deb"
-    x86_64: "http://archive.ubuntu.com/ubuntu/pool/universe/p/prometheus-blackbox-exporter/prometheus-blackbox-exporter_0.17.0+ds-1_amd64.deb"
+ne_image: "{{ ne_image_by_arch[ansible_machine] }}"
+
+ne_image_by_arch:
+  aarch64: "prom/node-exporter:v1.2.0"
+  x86_64: "prom/node-exporter:v1.2.0"
+
+be_image: "{{ be_image_by_arch[ansible_machine] }}"
+
+be_image_by_arch:
+  aarch64: "prom/blackbox-exporter:v0.19.0"
+  x86_64: "prom/blackbox-exporter:v0.19.0"
diff --git a/fdio.infra.ansible/roles/prometheus_exporter/handlers/main.yaml b/fdio.infra.ansible/roles/prometheus_exporter/handlers/main.yaml
deleted file mode 100644
index 9c374eaa61..0000000000
--- a/fdio.infra.ansible/roles/prometheus_exporter/handlers/main.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
----
-# file roles/prometheus_exporter/handlers/main.yaml
-
-- name: Restart Prometheus Node Exporter
-  systemd:
-    daemon_reload: true
-    enabled: true
-    name: "prometheus-node-exporter"
-    state: "restarted"
-
-- name: Restart Prometheus Blackbox Exporter
-  systemd:
-    daemon_reload: true
-    enabled: true
-    name: "prometheus-blackbox-exporter"
-    state: "restarted"
\ No newline at end of file
diff --git a/fdio.infra.ansible/roles/prometheus_exporter/tasks/main.yaml b/fdio.infra.ansible/roles/prometheus_exporter/tasks/main.yaml
index b38215c4a2..3f4b563352 100644
--- a/fdio.infra.ansible/roles/prometheus_exporter/tasks/main.yaml
+++ b/fdio.infra.ansible/roles/prometheus_exporter/tasks/main.yaml
@@ -1,15 +1,72 @@
 ---
 # file: roles/prometheus_exporter/tasks/main.yaml
 
-- include_tasks: "{{ ansible_distribution|lower }}_{{ ansible_distribution_release }}.yaml"
+- name: Inst - Update Package Cache (APT)
+  apt:
+    update_cache: yes
+    cache_valid_time: 3600
+  when:
+    - ansible_distribution|lower == 'ubuntu'
   tags:
     - prometheus-inst
 
+- name: Inst - Prerequisites
+  package:
+    name: "{{ packages | flatten(levels=1) }}"
+    state: latest
+  tags:
+    - prometheus-inst
+
+- name: Inst - Start a NodeExporter container
+  docker_container:
+    name: "NodeExporter"
+    image: "{{ ne_image }}"
+    state: "started"
+    restart_policy: "unless-stopped"
+    detach: yes
+    ports:
+     - "9100:9100"
+    privileged: yes
+    command:
+      - "--path.procfs=/host/proc"
+      - "--path.rootfs=/rootfs"
+      - "--path.sysfs=/host/sys"
+      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
+    volumes:
+      - "/:/rootfs:ro"
+      - "/proc:/host/proc:ro"
+      - "/sys:/host/sys:ro"
+  tags:
+    - prometheus-inst
+
+- name: Inst - Create a Config Directory
+  ansible.builtin.file:
+    path: "/etc/prometheus/"
+    state: "directory"
+    mode: "0755"
+  tags:
+    - prometheus-conf-blackbox-exporter
+
 - name: Conf - Prometheus Blackbox Exporter
   copy:
-   src: 'files/blackbox.yml'
-   dest: '/etc/prometheus/blackbox.yml'
-  notify:
-    - "Restart Prometheus Blackbox Exporter"
+   src: "files/blackbox.yml"
+   dest: "/etc/prometheus/blackbox.yml"
+  tags:
+    - prometheus-conf-blackbox-exporter
+
+- name: Inst - Start a BlackBoxExporter container
+  docker_container:
+    name: "BlackBoxExporter"
+    image: "{{ be_image }}"
+    state: "started"
+    restart_policy: "unless-stopped"
+    detach: yes
+    ports:
+     - "9115:9115"
+    privileged: yes
+    command:
+      - "--config.file=/config/blackbox.yml"
+    volumes:
+      - "/etc/prometheus/blackbox.yml:/config/blackbox.yml:ro"
   tags:
-    - prometheus-conf-blackbox-exporter
\ No newline at end of file
+    - prometheus-inst
\ No newline at end of file
diff --git a/fdio.infra.ansible/roles/prometheus_exporter/tasks/ubuntu_bionic.yaml b/fdio.infra.ansible/roles/prometheus_exporter/tasks/ubuntu_bionic.yaml
deleted file mode 100644
index 566753e272..0000000000
--- a/fdio.infra.ansible/roles/prometheus_exporter/tasks/ubuntu_bionic.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
----
-# file: roles/prometheus_exporter/tasks/ubuntu_bionic.yaml
-
-- name: Inst - Update Package Cache (APT)
-  apt:
-    update_cache: yes
-    cache_valid_time: 3600
-  tags:
-    - prometheus-inst-prerequisites
-
-- name: Inst - Prerequisites
-  package:
-    name: "init-system-helpers"
-    default_release: "bionic-backports"
-    state: latest
-  tags:
-    - prometheus-inst-prerequisites
-
-- name: Inst - Prometheus Node Exporter
-  apt:
-    deb: "{{ ne_packages }}"
-  notify:
-    - "Restart Prometheus Node Exporter"
-  tags:
-    - prometheus-inst-node-exporter
-
-- name: Inst - Prometheus Blackbox Exporter
-  apt:
-    deb: "{{ be_packages }}"
-  notify:
-    - "Restart Prometheus Blackbox Exporter"
-  tags:
-    - prometheus-inst-blackbox-exporter
\ No newline at end of file
diff --git a/fdio.infra.ansible/roles/prometheus_exporter/tasks/ubuntu_focal.yaml b/fdio.infra.ansible/roles/prometheus_exporter/tasks/ubuntu_focal.yaml
deleted file mode 100644
index 3d7064355e..0000000000
--- a/fdio.infra.ansible/roles/prometheus_exporter/tasks/ubuntu_focal.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
----
-# file: roles/prometheus_exporter/tasks/ubuntu_focal.yaml
-
-- name: Inst - Update Package Cache (APT)
-  apt:
-    update_cache: yes
-    cache_valid_time: 3600
-  tags:
-    - prometheus-inst-prerequisites
-
-- name: Inst - Prerequisites
-  package:
-    name: "init-system-helpers"
-    default_release: "focal-backports"
-    state: latest
-  tags:
-    - prometheus-inst-prerequisites
-
-- name: Inst - Prometheus Node Exporter
-  apt:
-    deb: "{{ ne_packages }}"
-  notify:
-    - "Restart Prometheus Node Exporter"
-  tags:
-    - prometheus-inst-node-exporter
-
-- name: Inst - Prometheus Blackbox Exporter
-  apt:
-    deb: "{{ be_packages }}"
-  notify:
-    - "Restart Prometheus Blackbox Exporter"
-  tags:
-    - prometheus-inst-blackbox-exporter
\ No newline at end of file
diff --git a/fdio.infra.ansible/vpp_device.yaml b/fdio.infra.ansible/vpp_device.yaml
index 9e334e1749..ae54669ae9 100644
--- a/fdio.infra.ansible/vpp_device.yaml
+++ b/fdio.infra.ansible/vpp_device.yaml
@@ -30,8 +30,6 @@
       tags: prometheus_exporter
     - role: jenkins_job_health_exporter
       tags: jenkins_job_health_exporter
-    - role: cadvisor
-      tags: cadvisor
     - role: vpp_device
       tags: vpp_device
     - role: kernel_vm
diff --git a/fdio.infra.terraform/1n_nmd/main.tf b/fdio.infra.terraform/1n_nmd/main.tf
index 1d5c3e4f7d..d48f12a046 100644
--- a/fdio.infra.terraform/1n_nmd/main.tf
+++ b/fdio.infra.terraform/1n_nmd/main.tf
@@ -76,7 +76,7 @@ module "minio" {
   minio_service_name             = "storage"
   minio_host                     = "http://10.32.8.1{4...7}"
   minio_port                     = 9000
-  minio_container_image          = "minio/minio:RELEASE.2021-07-08T01-15-01Z"
+  minio_container_image          = "minio/minio:RELEASE.2021-07-27T02-40-15Z"
   minio_vault_secret             = {
     use_vault_provider           = false,
     vault_kv_policy_name         = "kv-secret",
@@ -91,7 +91,7 @@ module "minio" {
 
   # minio client
   mc_job_name                    = "prod-mc"
-  mc_container_image             = "minio/mc:RELEASE.2021-07-08T01-15-01Z"
+  mc_container_image             = "minio/mc:RELEASE.2021-07-27T02-40-15Z"
   mc_extra_commands              = [
     "mc policy set public LOCALMINIO/logs.fd.io",
     "mc policy set public LOCALMINIO/docs.fd.io",
@@ -140,7 +140,7 @@ module "prometheus" {
   }
   prometheus_data_dir            = "/data/"
   prometheus_use_host_volume     = true
-  prometheus_version             = "2.24.0"
+  prometheus_version             = "2.28.1"
   prometheus_cpu                 = 2000
   prometheus_mem                 = 8192
   prometheus_port                = 9090
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl b/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl
index adc30318c4..3d0b2c2eef 100644
--- a/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl
+++ b/fdio.infra.terraform/1n_nmd/prometheus/conf/nomad/prometheus.hcl
@@ -175,7 +175,7 @@ job "${job_name}" {
         args            = [
           "--config.file=secrets/prometheus.yml",
           "--storage.tsdb.path=${data_dir}prometheus/",
-          "--storage.tsdb.retention.time=15d"
+          "--storage.tsdb.retention.time=7d"
         ]
       }
 
@@ -265,14 +265,6 @@ groups:
     annotations:
       summary: "Prometheus target missing (instance {{ $labels.instance }})."
       description: "A Prometheus target has disappeared. An exporter might be crashed."
-  - alert: HostHighCpuLoad
-    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 95
-    for: 0m
-    labels:
-      severity: warning
-    annotations:
-      summary: "Host high CPU load (instance {{ $labels.instance }})."
-      description: "CPU load is > 95%."
   - alert: HostOutOfMemory
     expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
     for: 2m
@@ -540,30 +532,6 @@ scrape_configs:
         replacement: localhost:9115
     metrics_path: /probe
 
-  - job_name: 'cAdvisor Exporter'
-    static_configs:
-      - targets: [ '10.30.51.28:8080' ]
-      - targets: [ '10.30.51.29:8080' ]
-      - targets: [ '10.30.51.30:8080' ]
-      #- targets: [ '10.30.51.32:8080' ]
-      - targets: [ '10.30.51.33:8080' ]
-      - targets: [ '10.30.51.34:8080' ]
-      - targets: [ '10.30.51.35:8080' ]
-      - targets: [ '10.30.51.39:8080' ]
-      - targets: [ '10.30.51.40:8080' ]
-      - targets: [ '10.30.51.50:8080' ]
-      - targets: [ '10.30.51.51:8080' ]
-      - targets: [ '10.30.51.65:8080' ]
-      - targets: [ '10.30.51.66:8080' ]
-      - targets: [ '10.30.51.67:8080' ]
-      - targets: [ '10.30.51.68:8080' ]
-      - targets: [ '10.30.51.70:8080' ]
-      - targets: [ '10.30.51.71:8080' ]
-      - targets: [ '10.32.8.14:8080' ]
-      - targets: [ '10.32.8.15:8080' ]
-      - targets: [ '10.32.8.16:8080' ]
-      - targets: [ '10.32.8.17:8080' ]
-
   - job_name: 'Jenkins Job Health Exporter'
     static_configs:
       - targets: [ '10.30.51.32:9186' ]
diff --git a/fdio.infra.terraform/1n_nmd/prometheus/variables.tf b/fdio.infra.terraform/1n_nmd/prometheus/variables.tf
index a509533ccd..55ffa33856 100644
--- a/fdio.infra.terraform/1n_nmd/prometheus/variables.tf
+++ b/fdio.infra.terraform/1n_nmd/prometheus/variables.tf
@@ -33,7 +33,7 @@ variable "prometheus_service_name" {
 variable "prometheus_version" {
   description = "Prometheus version"
   type        = string
-  default     = "v2.24.0"
+  default     = "v2.28.1"
 }
 
 variable "prometheus_use_canary" {
diff --git a/fdio.infra.terraform/1n_nmd/terraform.tfstate b/fdio.infra.terraform/1n_nmd/terraform.tfstate
index 2a90a2d90e..7e4875858d 100644
--- a/fdio.infra.terraform/1n_nmd/terraform.tfstate
+++ b/fdio.infra.terraform/1n_nmd/terraform.tfstate
@@ -1,7 +1,7 @@
 {
   "version": 4,
-  "terraform_version": "1.0.0",
-  "serial": 1170,
+  "terraform_version": "1.0.2",
+  "serial": 1178,
   "lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9",
   "outputs": {},
   "resources": [
@@ -197,15 +197,15 @@
           "schema_version": 0,
           "attributes": {
             "filename": null,
-            "id": "558e3a2e96b2b0f9d94e57f74a784d89020ce8d1d877e7adb82301065dfbacb7",
-            "rendered": "job \"prod-mc\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"batch\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-mc\" {\n    task \"prod-task1-create-buckets\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver        = \"docker\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image       = \"minio/mc:RELEASE.2021-07-08T01-15-01Z\"\n        entrypoint  = [\n          \"/bin/sh\",\n          \"-c\",\n          \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n        ]\n        dns_servers  = [ \"${attr.unique.network.ip-address}\" ]\n        privileged   = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n        \n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n        \n        \n      }\n    }\n  }\n}\n",
+            "id": "e9e956124e6fed4268bec199b9d7fd7aaff506e906b53621c6dc0e3116c40f52",
+            "rendered": "job \"prod-mc\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"batch\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-mc\" {\n    task \"prod-task1-create-buckets\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver        = \"docker\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image       = \"minio/mc:RELEASE.2021-07-27T02-40-15Z\"\n        entrypoint  = [\n          \"/bin/sh\",\n          \"-c\",\n          \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n        ]\n        dns_servers  = [ \"${attr.unique.network.ip-address}\" ]\n        privileged   = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n        \n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n        \n        \n      }\n    }\n  }\n}\n",
             "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"batch\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-mc\" {\n    task \"prod-task1-create-buckets\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver        = \"docker\"\n\n      %{ if use_vault_provider }\n      vault {\n        policies    = \"${vault_kv_policy_name}\"\n      }\n     %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image       = \"${image}\"\n        entrypoint  = [\n          \"/bin/sh\",\n          \"-c\",\n          \"${command}\"\n        ]\n        dns_servers  = [ \"$${attr.unique.network.ip-address}\" ]\n        privileged   = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n        %{ if use_vault_provider }\n        {{ with secret \"${vault_kv_path}\" }}\n        MINIO_ACCESS_KEY = \"{{ .Data.data.${vault_kv_field_access_key} }}\"\n        MINIO_SECRET_KEY = \"{{ .Data.data.${vault_kv_field_secret_key} }}\"\n        {{ end }}\n        %{ else }\n        MINIO_ACCESS_KEY = \"${access_key}\"\n        MINIO_SECRET_KEY = \"${secret_key}\"\n        %{ endif }\n        ${ envs }\n      }\n    }\n  }\n}\n",
             "vars": {
               "access_key": "minio",
               "command": "mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage",
               "datacenters": "yul1",
               "envs": "",
-              "image": "minio/mc:RELEASE.2021-07-08T01-15-01Z",
+              "image": "minio/mc:RELEASE.2021-07-27T02-40-15Z",
               "job_name": "prod-mc",
               "minio_port": "9000",
               "minio_service_name": "storage",
@@ -229,8 +229,8 @@
           "schema_version": 0,
           "attributes": {
             "filename": null,
-            "id": "3df2a58bed278738e5280a087b9d56413910e767c60d6702506bd7820cfdd5b4",
-            "rendered": "job \"prod-minio\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type        = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # All groups in this job should be scheduled on different hosts.\n  constraint {\n    operator          = \"distinct_hosts\"\n    value             = \"true\"\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-minio\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count                = 4\n\n    # https://www.nomadproject.io/docs/job-specification/volume\n    \n    volume \"prod-volume1-minio\" {\n      type               = \"host\"\n      read_only          = false\n      source             = \"prod-volume-data1-1\"\n    }\n    \n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval           = \"30m\"\n      attempts           = 40\n      delay              = \"15s\"\n      mode               = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-minio\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver             = \"docker\"\n\n    \n      volume_mount {\n        volume           = \"prod-volume1-minio\"\n        destination      = \"/data/\"\n        read_only        = false\n      }\n    \n\n    \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image            = \"minio/minio:RELEASE.2021-07-08T01-15-01Z\"\n        dns_servers      = [ \"172.17.0.1\" ]\n        network_mode     = \"host\"\n        command          = \"server\"\n        args             = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n        port_map {\n          http           = 9000\n        }\n        privileged       = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n\n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n\n        MINIO_BROWSER=\"off\"\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name       = \"storage\"\n        port       = \"http\"\n        tags       = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name     = \"Min.io Server HTTP Check Live\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/live\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n        check {\n          name     = \"Min.io Server HTTP Check Ready\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/ready\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu        = 40000\n        memory     = 40000\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"http\" {\n            static = 9000\n          }\n        }\n      }\n    }\n  }\n}\n",
+            "id": "68243169c0df15ef6b56803101ed0c744a84545c34df366ee4036052c6292ef3",
+            "rendered": "job \"prod-minio\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type        = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # All groups in this job should be scheduled on different hosts.\n  constraint {\n    operator          = \"distinct_hosts\"\n    value             = \"true\"\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-minio\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count                = 4\n\n    # https://www.nomadproject.io/docs/job-specification/volume\n    \n    volume \"prod-volume1-minio\" {\n      type               = \"host\"\n      read_only          = false\n      source             = \"prod-volume-data1-1\"\n    }\n    \n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval           = \"30m\"\n      attempts           = 40\n      delay              = \"15s\"\n      mode               = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-minio\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver             = \"docker\"\n\n    \n      volume_mount {\n        volume           = \"prod-volume1-minio\"\n        destination      = \"/data/\"\n        read_only        = false\n      }\n    \n\n    \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image            = \"minio/minio:RELEASE.2021-07-27T02-40-15Z\"\n        dns_servers      = [ \"172.17.0.1\" ]\n        network_mode     = \"host\"\n        command          = \"server\"\n        args             = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n        port_map {\n          http           = 9000\n        }\n        privileged       = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n\n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n\n        MINIO_BROWSER=\"off\"\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name       = \"storage\"\n        port       = \"http\"\n        tags       = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name     = \"Min.io Server HTTP Check Live\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/live\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n        check {\n          name     = \"Min.io Server HTTP Check Ready\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/ready\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu        = 40000\n        memory     = 40000\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"http\" {\n            static = 9000\n          }\n        }\n      }\n    }\n  }\n}\n",
             "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type        = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # All groups in this job should be scheduled on different hosts.\n  constraint {\n    operator          = \"distinct_hosts\"\n    value             = \"true\"\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-minio\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count                = ${group_count}\n\n    # https://www.nomadproject.io/docs/job-specification/volume\n    %{ if use_host_volume }\n    volume \"prod-volume1-minio\" {\n      type               = \"host\"\n      read_only          = false\n      source             = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval           = \"30m\"\n      attempts           = 40\n      delay              = \"15s\"\n      mode               = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-minio\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver             = \"docker\"\n\n    %{ if use_host_volume }\n      volume_mount {\n        volume           = \"prod-volume1-minio\"\n        destination      = \"${data_dir}\"\n        read_only        = false\n      }\n    %{ endif }\n\n    %{ if use_vault_provider }\n      vault {\n        policies         = \"${vault_kv_policy_name}\"\n      }\n    %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image            = \"${image}\"\n        dns_servers      = [ \"172.17.0.1\" ]\n        network_mode     = \"host\"\n        command          = \"server\"\n        args             = [ \"${host}:${port}${data_dir}\" ]\n        port_map {\n          http           = ${port}\n        }\n        privileged       = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n%{ if use_vault_provider }\n{{ with secret \"${vault_kv_path}\" }}\n        MINIO_ACCESS_KEY = \"{{ .Data.data.${vault_kv_field_access_key} }}\"\n        MINIO_SECRET_KEY = \"{{ .Data.data.${vault_kv_field_secret_key} }}\"\n{{ end }}\n%{ else }\n        MINIO_ACCESS_KEY = \"${access_key}\"\n        MINIO_SECRET_KEY = \"${secret_key}\"\n%{ endif }\n        ${ envs }\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name       = \"${service_name}\"\n        port       = \"http\"\n        tags       = [ \"storage$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name     = \"Min.io Server HTTP Check Live\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/live\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n        check {\n          name     = \"Min.io Server HTTP Check Ready\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/ready\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu        = ${cpu}\n        memory     = ${memory}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"http\" {\n            static = ${port}\n          }\n        }\n      }\n    }\n  }\n}\n",
             "vars": {
               "access_key": "minio",
@@ -242,7 +242,7 @@
               "group_count": "4",
               "host": "http://10.32.8.1{4...7}",
               "host_volume": "prod-volume-data1-1",
-              "image": "minio/minio:RELEASE.2021-07-08T01-15-01Z",
+              "image": "minio/minio:RELEASE.2021-07-27T02-40-15Z",
               "job_name": "prod-minio",
               "memory": "40000",
               "memory_proxy": "128",
@@ -270,27 +270,23 @@
           "schema_version": 0,
           "attributes": {
             "allocation_ids": [
-              "da0b1991-d59e-dfe5-3dec-ec64dd9b8ec2",
-              "3a53d56a-fb8e-7218-99fe-0798241aea0d",
-              "59658812-d06d-29bd-e5f3-d35f6ff4def9",
-              "263d4e38-c44c-26c0-451d-c5b1679f5250",
-              "b66d7688-6fdb-2f49-f5d5-24719386531f",
-              "7b3c5ce9-996e-0dc3-3e9e-aced1dd928ca",
-              "732074da-1082-4cc3-8717-e83421a97739",
-              "3172e318-19f8-c9d7-17af-7cab03429e6f"
+              "8e200c5b-97ee-20a0-3383-96af1adaef6a",
+              "44e1a0f0-5328-eafb-3beb-bfd53af2b618",
+              "a6695ea0-2563-72ae-1791-a08ef933ff72",
+              "51126acb-945c-2629-5787-76b85677ddd0"
             ],
             "datacenters": [
               "yul1"
             ],
-            "deployment_id": "2d18d058-a30a-a944-feb9-3e2ed8804600",
+            "deployment_id": "33d8013c-7c7c-6828-a204-58d9286b61ef",
             "deployment_status": "successful",
             "deregister_on_destroy": true,
             "deregister_on_id_change": true,
             "detach": false,
             "id": "prod-minio",
-            "jobspec": "job \"prod-minio\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type        = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # All groups in this job should be scheduled on different hosts.\n  constraint {\n    operator          = \"distinct_hosts\"\n    value             = \"true\"\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-minio\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count                = 4\n\n    # https://www.nomadproject.io/docs/job-specification/volume\n    \n    volume \"prod-volume1-minio\" {\n      type               = \"host\"\n      read_only          = false\n      source             = \"prod-volume-data1-1\"\n    }\n    \n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval           = \"30m\"\n      attempts           = 40\n      delay              = \"15s\"\n      mode               = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-minio\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver             = \"docker\"\n\n    \n      volume_mount {\n        volume           = \"prod-volume1-minio\"\n        destination      = \"/data/\"\n        read_only        = false\n      }\n    \n\n    \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image            = \"minio/minio:RELEASE.2021-07-08T01-15-01Z\"\n        dns_servers      = [ \"172.17.0.1\" ]\n        network_mode     = \"host\"\n        command          = \"server\"\n        args             = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n        port_map {\n          http           = 9000\n        }\n        privileged       = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n\n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n\n        MINIO_BROWSER=\"off\"\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name       = \"storage\"\n        port       = \"http\"\n        tags       = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name     = \"Min.io Server HTTP Check Live\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/live\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n        check {\n          name     = \"Min.io Server HTTP Check Ready\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/ready\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu        = 40000\n        memory     = 40000\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"http\" {\n            static = 9000\n          }\n        }\n      }\n    }\n  }\n}\n",
+            "jobspec": "job \"prod-minio\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type        = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # All groups in this job should be scheduled on different hosts.\n  constraint {\n    operator          = \"distinct_hosts\"\n    value             = \"true\"\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-minio\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count                = 4\n\n    # https://www.nomadproject.io/docs/job-specification/volume\n    \n    volume \"prod-volume1-minio\" {\n      type               = \"host\"\n      read_only          = false\n      source             = \"prod-volume-data1-1\"\n    }\n    \n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval           = \"30m\"\n      attempts           = 40\n      delay              = \"15s\"\n      mode               = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-minio\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver             = \"docker\"\n\n    \n      volume_mount {\n        volume           = \"prod-volume1-minio\"\n        destination      = \"/data/\"\n        read_only        = false\n      }\n    \n\n    \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image            = \"minio/minio:RELEASE.2021-07-27T02-40-15Z\"\n        dns_servers      = [ \"172.17.0.1\" ]\n        network_mode     = \"host\"\n        command          = \"server\"\n        args             = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n        port_map {\n          http           = 9000\n        }\n        privileged       = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n\n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n\n        MINIO_BROWSER=\"off\"\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name       = \"storage\"\n        port       = \"http\"\n        tags       = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name     = \"Min.io Server HTTP Check Live\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/live\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n        check {\n          name     = \"Min.io Server HTTP Check Ready\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/ready\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu        = 40000\n        memory     = 40000\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"http\" {\n            static = 9000\n          }\n        }\n      }\n    }\n  }\n}\n",
             "json": null,
-            "modify_index": "8769570",
+            "modify_index": "8949013",
             "name": "prod-minio",
             "namespace": "default",
             "policy_override": null,
@@ -434,9 +430,9 @@
           "schema_version": 0,
           "attributes": {
             "filename": null,
-            "id": "113240c48bda39b1664958a0b9fb6058f6e613ae612cc61ff958d1a78c94c46b",
-            "rendered": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
-            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = ${group_count}\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    %{ if use_host_volume }\n    volume \"prod-volume1-${service_name}\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"$${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      %{ if use_host_volume }\n      volume_mount {\n        volume          = \"prod-volume1-${service_name}\"\n        destination     = \"${data_dir}\"\n        read_only       = false\n      }\n      %{ endif }\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-${version}.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=${data_dir}prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
+            "id": "46dfbb29c15ad00720c6f4ce699529b7934bdd666bb7656d7c17b128e7609f07",
+            "rendered": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.28.1.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=7d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.28.1/prometheus-2.28.1.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
+            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = ${group_count}\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    %{ if use_host_volume }\n    volume \"prod-volume1-${service_name}\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"$${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      %{ if use_host_volume }\n      volume_mount {\n        volume          = \"prod-volume1-${service_name}\"\n        destination     = \"${data_dir}\"\n        read_only       = false\n      }\n      %{ endif }\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-${version}.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=${data_dir}prometheus/\",\n          \"--storage.tsdb.retention.time=7d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
             "vars": {
               "cpu": "2000",
               "data_dir": "/data/",
@@ -447,11 +443,11 @@
               "mem": "8192",
               "port": "9090",
               "service_name": "prometheus",
-              "url": "https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz",
+              "url": "https://github.com/prometheus/prometheus/releases/download/v2.28.1/prometheus-2.28.1.linux-amd64.tar.gz",
               "use_canary": "true",
               "use_host_volume": "true",
               "use_vault_provider": "false",
-              "version": "2.24.0"
+              "version": "2.28.1"
             }
           },
           "sensitive_attributes": []
@@ -469,23 +465,27 @@
           "schema_version": 0,
           "attributes": {
             "allocation_ids": [
-              "20d22188-d04e-2f7d-57e7-62b79c39156d",
-              "050b4a00-1192-4a2b-4a06-ca8916d09597",
-              "dd7ade6e-43c4-cac3-3294-9bd32d6b9764",
-              "037ac779-c647-2dba-5b64-d11e2243066c"
+              "06f831b0-4ee5-0f2b-5888-081f69d25efc",
+              "fd5bd3d9-7d24-5c99-d426-3f9e95a0ea3c",
+              "7c689e4b-4601-0299-4d5e-f906a3e1b30b",
+              "f355c50b-7702-e65b-e629-ffa41678b880",
+              "d4eae934-c539-fc71-b7db-16740286ea39",
+              "fe8466ca-c32a-d6a0-86f4-b12b7816a3ac",
+              "87ec193b-dc4d-2d39-8241-86a38ddaf827",
+              "ea93f680-7e8b-6e3e-d33a-5b0c71a73dd0"
             ],
             "datacenters": [
               "yul1"
             ],
-            "deployment_id": "9a4c9dd4-e3b9-33db-26f7-6e1e05962d90",
+            "deployment_id": "c27f5df9-0819-f33a-fb30-180edcd2ffcd",
             "deployment_status": "successful",
             "deregister_on_destroy": true,
             "deregister_on_id_change": true,
             "detach": false,
             "id": "prod-prometheus",
-            "jobspec": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
+            "jobspec": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.28.1.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=7d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.28.1/prometheus-2.28.1.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
             "json": null,
-            "modify_index": "7575038",
+            "modify_index": "8949350",
             "name": "prod-prometheus",
             "namespace": "default",
             "policy_override": null,
@@ -569,10 +569,10 @@
           "schema_version": 0,
           "attributes": {
             "allocation_ids": [
-              "7d13dda4-e47f-e2b5-5ebe-e4c2c57f6a66",
-              "e41391a1-bfbf-f1cf-831d-d87da6da8b94",
-              "c9753985-b97f-5d58-79f3-843b63411333",
-              "aa511cae-fa5b-1127-1718-819660c5702b"
+              "4194fecb-5e83-0b00-09d9-7561ee7d57f4",
+              "314b725e-3907-d1f5-2538-72c7343f8cda",
+              "d89ab9c0-4c2c-8430-f7ea-b609fc8285ac",
+              "055428eb-e662-5e07-14f1-afc32a2a491f"
             ],
             "datacenters": [
               "yul1"
diff --git a/fdio.infra.terraform/1n_nmd/terraform.tfstate.backup b/fdio.infra.terraform/1n_nmd/terraform.tfstate.backup
index c58139acf0..e6522a5c42 100644
--- a/fdio.infra.terraform/1n_nmd/terraform.tfstate.backup
+++ b/fdio.infra.terraform/1n_nmd/terraform.tfstate.backup
@@ -1,7 +1,7 @@
 {
   "version": 4,
-  "terraform_version": "1.0.0",
-  "serial": 1167,
+  "terraform_version": "1.0.2",
+  "serial": 1176,
   "lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9",
   "outputs": {},
   "resources": [
@@ -59,7 +59,7 @@
             "datacenters": [
               "yul1"
             ],
-            "deployment_id": "1cd42f80-d43d-96cf-bce9-c1f77872e48b",
+            "deployment_id": "4dc972fd-cb2c-51d8-f730-08b5f7554ceb",
             "deployment_status": "successful",
             "deregister_on_destroy": true,
             "deregister_on_id_change": true,
@@ -67,7 +67,7 @@
             "id": "prod-alertmanager",
             "jobspec": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-slack-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: jenkins-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'jenkins-slack-receiver'\n\n  - match_re:\n      service: .*\n    receiver: default-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n  slack_configs:\n  - api_url: 'TE07RD1V1/B01U1NV9HV3/hKZXJJ74g2JcISq4K3QC1eG9'\n    channel: '#fdio-jobs-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: 'default-slack-receiver'\n  slack_configs:\n  - api_url: 'TE07RD1V1/B01UUK23B6C/hZTcCu42FUv8d6rtirHtcYIi'\n    channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
             "json": null,
-            "modify_index": "8219920",
+            "modify_index": "8259282",
             "name": "prod-alertmanager",
             "namespace": "default",
             "policy_override": null,
@@ -197,15 +197,15 @@
           "schema_version": 0,
           "attributes": {
             "filename": null,
-            "id": "ca0c95bbe91c4ac393d5cd5bef8efb90fb5f176f266ba233542fd4338b51c6cc",
-            "rendered": "job \"prod-mc\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"batch\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-mc\" {\n    task \"prod-task1-create-buckets\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver        = \"docker\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image       = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n        entrypoint  = [\n          \"/bin/sh\",\n          \"-c\",\n          \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n        ]\n        dns_servers  = [ \"${attr.unique.network.ip-address}\" ]\n        privileged   = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n        \n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n        \n        \n      }\n    }\n  }\n}\n",
+            "id": "e9e956124e6fed4268bec199b9d7fd7aaff506e906b53621c6dc0e3116c40f52",
+            "rendered": "job \"prod-mc\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"batch\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-mc\" {\n    task \"prod-task1-create-buckets\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver        = \"docker\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image       = \"minio/mc:RELEASE.2021-07-27T02-40-15Z\"\n        entrypoint  = [\n          \"/bin/sh\",\n          \"-c\",\n          \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n        ]\n        dns_servers  = [ \"${attr.unique.network.ip-address}\" ]\n        privileged   = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n        \n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n        \n        \n      }\n    }\n  }\n}\n",
             "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"batch\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-mc\" {\n    task \"prod-task1-create-buckets\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver        = \"docker\"\n\n      %{ if use_vault_provider }\n      vault {\n        policies    = \"${vault_kv_policy_name}\"\n      }\n     %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image       = \"${image}\"\n        entrypoint  = [\n          \"/bin/sh\",\n          \"-c\",\n          \"${command}\"\n        ]\n        dns_servers  = [ \"$${attr.unique.network.ip-address}\" ]\n        privileged   = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n        %{ if use_vault_provider }\n        {{ with secret \"${vault_kv_path}\" }}\n        MINIO_ACCESS_KEY = \"{{ .Data.data.${vault_kv_field_access_key} }}\"\n        MINIO_SECRET_KEY = \"{{ .Data.data.${vault_kv_field_secret_key} }}\"\n        {{ end }}\n        %{ else }\n        MINIO_ACCESS_KEY = \"${access_key}\"\n        MINIO_SECRET_KEY = \"${secret_key}\"\n        %{ endif }\n        ${ envs }\n      }\n    }\n  }\n}\n",
             "vars": {
               "access_key": "minio",
               "command": "mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage",
               "datacenters": "yul1",
               "envs": "",
-              "image": "minio/mc:RELEASE.2020-12-10T01-26-17Z",
+              "image": "minio/mc:RELEASE.2021-07-27T02-40-15Z",
               "job_name": "prod-mc",
               "minio_port": "9000",
               "minio_service_name": "storage",
@@ -229,8 +229,8 @@
           "schema_version": 0,
           "attributes": {
             "filename": null,
-            "id": "159eaa53aac72f9e698b899822c1bc37e57a815560e0c075a5ce4222752e71e4",
-            "rendered": "job \"prod-minio\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type        = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # All groups in this job should be scheduled on different hosts.\n  constraint {\n    operator          = \"distinct_hosts\"\n    value             = \"true\"\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-minio\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count                = 4\n\n    # https://www.nomadproject.io/docs/job-specification/volume\n    \n    volume \"prod-volume1-minio\" {\n      type               = \"host\"\n      read_only          = false\n      source             = \"prod-volume-data1-1\"\n    }\n    \n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval           = \"30m\"\n      attempts           = 40\n      delay              = \"15s\"\n      mode               = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-minio\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver             = \"docker\"\n\n    \n      volume_mount {\n        volume           = \"prod-volume1-minio\"\n        destination      = \"/data/\"\n        read_only        = false\n      }\n    \n\n    \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image            = \"minio/minio:RELEASE.2020-12-03T05-49-24Z\"\n        dns_servers      = [ \"172.17.0.1\" ]\n        network_mode     = \"host\"\n        command          = \"server\"\n        args             = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n        port_map {\n          http           = 9000\n        }\n        privileged       = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n\n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n\n        MINIO_BROWSER=\"off\"\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name       = \"storage\"\n        port       = \"http\"\n        tags       = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name     = \"Min.io Server HTTP Check Live\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/live\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n        check {\n          name     = \"Min.io Server HTTP Check Ready\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/ready\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu        = 40000\n        memory     = 40000\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"http\" {\n            static = 9000\n          }\n        }\n      }\n    }\n  }\n}\n",
+            "id": "68243169c0df15ef6b56803101ed0c744a84545c34df366ee4036052c6292ef3",
+            "rendered": "job \"prod-minio\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type        = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # All groups in this job should be scheduled on different hosts.\n  constraint {\n    operator          = \"distinct_hosts\"\n    value             = \"true\"\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-minio\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count                = 4\n\n    # https://www.nomadproject.io/docs/job-specification/volume\n    \n    volume \"prod-volume1-minio\" {\n      type               = \"host\"\n      read_only          = false\n      source             = \"prod-volume-data1-1\"\n    }\n    \n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval           = \"30m\"\n      attempts           = 40\n      delay              = \"15s\"\n      mode               = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-minio\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver             = \"docker\"\n\n    \n      volume_mount {\n        volume           = \"prod-volume1-minio\"\n        destination      = \"/data/\"\n        read_only        = false\n      }\n    \n\n    \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image            = \"minio/minio:RELEASE.2021-07-27T02-40-15Z\"\n        dns_servers      = [ \"172.17.0.1\" ]\n        network_mode     = \"host\"\n        command          = \"server\"\n        args             = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n        port_map {\n          http           = 9000\n        }\n        privileged       = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n\n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n\n        MINIO_BROWSER=\"off\"\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name       = \"storage\"\n        port       = \"http\"\n        tags       = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name     = \"Min.io Server HTTP Check Live\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/live\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n        check {\n          name     = \"Min.io Server HTTP Check Ready\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/ready\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu        = 40000\n        memory     = 40000\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"http\" {\n            static = 9000\n          }\n        }\n      }\n    }\n  }\n}\n",
             "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type        = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # All groups in this job should be scheduled on different hosts.\n  constraint {\n    operator          = \"distinct_hosts\"\n    value             = \"true\"\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-minio\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count                = ${group_count}\n\n    # https://www.nomadproject.io/docs/job-specification/volume\n    %{ if use_host_volume }\n    volume \"prod-volume1-minio\" {\n      type               = \"host\"\n      read_only          = false\n      source             = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval           = \"30m\"\n      attempts           = 40\n      delay              = \"15s\"\n      mode               = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-minio\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver             = \"docker\"\n\n    %{ if use_host_volume }\n      volume_mount {\n        volume           = \"prod-volume1-minio\"\n        destination      = \"${data_dir}\"\n        read_only        = false\n      }\n    %{ endif }\n\n    %{ if use_vault_provider }\n      vault {\n        policies         = \"${vault_kv_policy_name}\"\n      }\n    %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image            = \"${image}\"\n        dns_servers      = [ \"172.17.0.1\" ]\n        network_mode     = \"host\"\n        command          = \"server\"\n        args             = [ \"${host}:${port}${data_dir}\" ]\n        port_map {\n          http           = ${port}\n        }\n        privileged       = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n%{ if use_vault_provider }\n{{ with secret \"${vault_kv_path}\" }}\n        MINIO_ACCESS_KEY = \"{{ .Data.data.${vault_kv_field_access_key} }}\"\n        MINIO_SECRET_KEY = \"{{ .Data.data.${vault_kv_field_secret_key} }}\"\n{{ end }}\n%{ else }\n        MINIO_ACCESS_KEY = \"${access_key}\"\n        MINIO_SECRET_KEY = \"${secret_key}\"\n%{ endif }\n        ${ envs }\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name       = \"${service_name}\"\n        port       = \"http\"\n        tags       = [ \"storage$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name     = \"Min.io Server HTTP Check Live\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/live\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n        check {\n          name     = \"Min.io Server HTTP Check Ready\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/ready\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu        = ${cpu}\n        memory     = ${memory}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"http\" {\n            static = ${port}\n          }\n        }\n      }\n    }\n  }\n}\n",
             "vars": {
               "access_key": "minio",
@@ -242,7 +242,7 @@
               "group_count": "4",
               "host": "http://10.32.8.1{4...7}",
               "host_volume": "prod-volume-data1-1",
-              "image": "minio/minio:RELEASE.2020-12-03T05-49-24Z",
+              "image": "minio/minio:RELEASE.2021-07-27T02-40-15Z",
               "job_name": "prod-minio",
               "memory": "40000",
               "memory_proxy": "128",
@@ -270,22 +270,23 @@
           "schema_version": 0,
           "attributes": {
             "allocation_ids": [
-              "bd315c5e-035d-3a08-698d-d119fb48035f",
-              "e82e0119-4d2e-89eb-baa3-0ec6bde71661",
-              "24e462a6-6d55-a109-47bf-4c83ad5596a7"
+              "8e200c5b-97ee-20a0-3383-96af1adaef6a",
+              "44e1a0f0-5328-eafb-3beb-bfd53af2b618",
+              "a6695ea0-2563-72ae-1791-a08ef933ff72",
+              "51126acb-945c-2629-5787-76b85677ddd0"
             ],
             "datacenters": [
               "yul1"
             ],
-            "deployment_id": "3d2b1511-51ee-f7e8-5e23-36a0e1022776",
+            "deployment_id": "33d8013c-7c7c-6828-a204-58d9286b61ef",
             "deployment_status": "successful",
             "deregister_on_destroy": true,
             "deregister_on_id_change": true,
             "detach": false,
             "id": "prod-minio",
-            "jobspec": "job \"prod-minio\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type        = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # All groups in this job should be scheduled on different hosts.\n  constraint {\n    operator          = \"distinct_hosts\"\n    value             = \"true\"\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-minio\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count                = 4\n\n    # https://www.nomadproject.io/docs/job-specification/volume\n    \n    volume \"prod-volume1-minio\" {\n      type               = \"host\"\n      read_only          = false\n      source             = \"prod-volume-data1-1\"\n    }\n    \n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval           = \"30m\"\n      attempts           = 40\n      delay              = \"15s\"\n      mode               = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-minio\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver             = \"docker\"\n\n    \n      volume_mount {\n        volume           = \"prod-volume1-minio\"\n        destination      = \"/data/\"\n        read_only        = false\n      }\n    \n\n    \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image            = \"minio/minio:RELEASE.2020-12-03T05-49-24Z\"\n        dns_servers      = [ \"172.17.0.1\" ]\n        network_mode     = \"host\"\n        command          = \"server\"\n        args             = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n        port_map {\n          http           = 9000\n        }\n        privileged       = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n\n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n\n        MINIO_BROWSER=\"off\"\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name       = \"storage\"\n        port       = \"http\"\n        tags       = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name     = \"Min.io Server HTTP Check Live\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/live\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n        check {\n          name     = \"Min.io Server HTTP Check Ready\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/ready\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu        = 40000\n        memory     = 40000\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"http\" {\n            static = 9000\n          }\n        }\n      }\n    }\n  }\n}\n",
+            "jobspec": "job \"prod-minio\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type        = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # All groups in this job should be scheduled on different hosts.\n  constraint {\n    operator          = \"distinct_hosts\"\n    value             = \"true\"\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-minio\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count                = 4\n\n    # https://www.nomadproject.io/docs/job-specification/volume\n    \n    volume \"prod-volume1-minio\" {\n      type               = \"host\"\n      read_only          = false\n      source             = \"prod-volume-data1-1\"\n    }\n    \n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval           = \"30m\"\n      attempts           = 40\n      delay              = \"15s\"\n      mode               = \"delay\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task.html\n    #\n    task \"prod-task1-minio\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver             = \"docker\"\n\n    \n      volume_mount {\n        volume           = \"prod-volume1-minio\"\n        destination      = \"/data/\"\n        read_only        = false\n      }\n    \n\n    \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image            = \"minio/minio:RELEASE.2021-07-27T02-40-15Z\"\n        dns_servers      = [ \"172.17.0.1\" ]\n        network_mode     = \"host\"\n        command          = \"server\"\n        args             = [ \"http://10.32.8.1{4...7}:9000/data/\" ]\n        port_map {\n          http           = 9000\n        }\n        privileged       = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n\n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n\n        MINIO_BROWSER=\"off\"\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name       = \"storage\"\n        port       = \"http\"\n        tags       = [ \"storage${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name     = \"Min.io Server HTTP Check Live\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/live\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n        check {\n          name     = \"Min.io Server HTTP Check Ready\"\n          type     = \"http\"\n          port     = \"http\"\n          protocol = \"http\"\n          method   = \"GET\"\n          path     = \"/minio/health/ready\"\n          interval = \"10s\"\n          timeout  = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu        = 40000\n        memory     = 40000\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"http\" {\n            static = 9000\n          }\n        }\n      }\n    }\n  }\n}\n",
             "json": null,
-            "modify_index": "7541139",
+            "modify_index": "8949013",
             "name": "prod-minio",
             "namespace": "default",
             "policy_override": null,
@@ -366,8 +367,7 @@
           "schema_version": 0,
           "attributes": {
             "allocation_ids": [
-              "0cb0b594-09c2-bad6-fca4-0fbc2479ef3e",
-              "99dab0de-0e97-4ad8-56eb-ffdd062ef656"
+              "0cb0b594-09c2-bad6-fca4-0fbc2479ef3e"
             ],
             "datacenters": [
               "yul1"
@@ -430,9 +430,9 @@
           "schema_version": 0,
           "attributes": {
             "filename": null,
-            "id": "113240c48bda39b1664958a0b9fb6058f6e613ae612cc61ff958d1a78c94c46b",
-            "rendered": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
-            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = ${group_count}\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    %{ if use_host_volume }\n    volume \"prod-volume1-${service_name}\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"$${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      %{ if use_host_volume }\n      volume_mount {\n        volume          = \"prod-volume1-${service_name}\"\n        destination     = \"${data_dir}\"\n        read_only       = false\n      }\n      %{ endif }\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-${version}.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=${data_dir}prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
+            "id": "c41b13aaa5709d1a72ae0408a62304996e89730e7cb3311ca798d5a807a6295f",
+            "rendered": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.28.1.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=7d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.28.1/prometheus-2.28.1.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
+            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = ${group_count}\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    %{ if use_host_volume }\n    volume \"prod-volume1-${service_name}\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"$${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      %{ if use_host_volume }\n      volume_mount {\n        volume          = \"prod-volume1-${service_name}\"\n        destination     = \"${data_dir}\"\n        read_only       = false\n      }\n      %{ endif }\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-${version}.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=${data_dir}prometheus/\",\n          \"--storage.tsdb.retention.time=7d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
             "vars": {
               "cpu": "2000",
               "data_dir": "/data/",
@@ -443,11 +443,11 @@
               "mem": "8192",
               "port": "9090",
               "service_name": "prometheus",
-              "url": "https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz",
+              "url": "https://github.com/prometheus/prometheus/releases/download/v2.28.1/prometheus-2.28.1.linux-amd64.tar.gz",
               "use_canary": "true",
               "use_host_volume": "true",
               "use_vault_provider": "false",
-              "version": "2.24.0"
+              "version": "2.28.1"
             }
           },
           "sensitive_attributes": []
@@ -465,22 +465,27 @@
           "schema_version": 0,
           "attributes": {
             "allocation_ids": [
-              "dd7ade6e-43c4-cac3-3294-9bd32d6b9764",
-              "d4ad3ff3-7b2d-b257-e24e-93955addefad",
-              "037ac779-c647-2dba-5b64-d11e2243066c"
+              "d4eae934-c539-fc71-b7db-16740286ea39",
+              "fe8466ca-c32a-d6a0-86f4-b12b7816a3ac",
+              "87ec193b-dc4d-2d39-8241-86a38ddaf827",
+              "ea93f680-7e8b-6e3e-d33a-5b0c71a73dd0",
+              "cc55245c-e81e-a5ba-fd64-899e4ef63f7e",
+              "828cc735-f166-62f9-64e5-7fcc3aa0010d",
+              "284ef377-252c-c03b-a4ab-76c7fba69b92",
+              "69e2ab96-b63c-24a3-555b-5fa95bf46e55"
             ],
             "datacenters": [
               "yul1"
             ],
-            "deployment_id": "9a4c9dd4-e3b9-33db-26f7-6e1e05962d90",
+            "deployment_id": "1526c7d4-16b3-45d7-4eec-a436cc272c7d",
             "deployment_status": "successful",
             "deregister_on_destroy": true,
             "deregister_on_id_change": true,
             "detach": false,
             "id": "prod-prometheus",
-            "jobspec": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
+            "jobspec": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The reschedule stanza specifies the group's rescheduling strategy. If\n  # specified at the job level, the configuration will apply to all groups\n  # within the job. If the reschedule stanza is present on both the job and the\n  # group, they are merged with the group stanza taking the highest precedence\n  # and then the job.\n  reschedule {\n    delay             = \"30s\"\n    delay_function    = \"constant\"\n    unlimited         = true\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The restart stanza configures a tasks's behavior on task failure. Restarts\n    # happen on the client that is running the task.\n    #\n    # https://www.nomadproject.io/docs/job-specification/restart\n    #\n    restart {\n      interval  = \"30m\"\n      attempts  = 40\n      delay     = \"15s\"\n      mode      = \"delay\"\n    }\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.28.1.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=7d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.28.1/prometheus-2.28.1.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e jenkins_job_success{id=~\".*\"}\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
             "json": null,
-            "modify_index": "7575038",
+            "modify_index": "8949256",
             "name": "prod-prometheus",
             "namespace": "default",
             "policy_override": null,
@@ -564,10 +569,10 @@
           "schema_version": 0,
           "attributes": {
             "allocation_ids": [
-              "516e389c-e3ce-92ed-217d-68fceb56f61e",
-              "93db39cc-26d0-ef8a-02af-f10d0e7d8db0",
-              "a8ee42a1-ff17-8a57-6276-75dc9498653e",
-              "a497e3ee-2d8b-2750-620e-223d23114ba0"
+              "4194fecb-5e83-0b00-09d9-7561ee7d57f4",
+              "314b725e-3907-d1f5-2538-72c7343f8cda",
+              "d89ab9c0-4c2c-8430-f7ea-b609fc8285ac",
+              "055428eb-e662-5e07-14f1-afc32a2a491f"
             ],
             "datacenters": [
               "yul1"
-- 
cgit 1.2.3-korg