Infra: JenkinsJobHealthExporter

- Integration of Jenkins Job checker Signed-off-by: pmikus <pmikus@cisco.com> Change-Id: I822039cb64a3a352b49314ddab7c6099af3fe644
author: pmikus <pmikus@cisco.com> 2021-02-05 14:51:43 +0000
committer: Peter Mikus <pmikus@cisco.com> 2021-02-10 09:04:46 +0000
commit: 0017c9d8372ef306ac73aae22bb0d17631c944d2 (patch)
tree: d24d4ff9ee33b4a31cdddfba89d2ae9a4b2e0fdd /terraform-ci-infra/1n_nmd/terraform.tfstate
parent: 60b531215d36e2402b1b6c768bd4fd4d4b210fd0 (diff)
1 files changed, 31 insertions, 27 deletions
diff --git a/terraform-ci-infra/1n_nmd/terraform.tfstate b/terraform-ci-infra/1n_nmd/terraform.tfstate
index d60f10206f..576e2aac58 100644
--- a/terraform-ci-infra/1n_nmd/terraform.tfstate
+++ b/terraform-ci-infra/1n_nmd/terraform.tfstate
@@ -1,7 +1,7 @@
 {
   "version": 4,
   "terraform_version": "0.14.5",
-  "serial": 1042,
+  "serial": 1112,
   "lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9",
   "outputs": {},
   "resources": [
@@ -16,20 +16,23 @@
           "schema_version": 0,
           "attributes": {
             "filename": null,
-            "id": "1ea874db11980359d80aada65912759b6acd9a7399f011d823a72eb7c5b4a329",
-            "rendered": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\nglobal:\n  # The API URL to use for Slack notifications.\n  slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  # This routes performs a regular expression match on alert labels to\n  # catch alerts that are related to a list of services.\n  - match_re:\n      service: .*\n    receiver: default-receiver\n    # The service has a sub-route for critical alerts, any alerts\n    # that do not match, i.e. severity != critical, fall-back to the\n    # parent node and are sent to 'team-X-mails'\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  # Apply inhibition if the alertname is the same.\n  # CAUTION:\n  #   If all label names listed in `equal` are missing\n  #   from both the source and target alerts,\n  #   the inhibition rule will apply!\n  equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n  slack_configs:\n  - channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
-            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = ${group_count}\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"$${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\nglobal:\n  # The API URL to use for Slack notifications.\n  slack_api_url: 'https://hooks.slack.com/services/${slack_api_key}'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: '${default_receiver}'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  # This routes performs a regular expression match on alert labels to\n  # catch alerts that are related to a list of services.\n  - match_re:\n      service: .*\n    receiver: ${default_receiver}\n    # The service has a sub-route for critical alerts, any alerts\n    # that do not match, i.e. severity != critical, fall-back to the\n    # parent node and are sent to 'team-X-mails'\n    routes:\n    - match:\n        severity: critical\n      receiver: '${default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  # Apply inhibition if the alertname is the same.\n  # CAUTION:\n  #   If all label names listed in `equal` are missing\n  #   from both the source and target alerts,\n  #   the inhibition rule will apply!\n  equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: '${default_receiver}'\n  slack_configs:\n  - channel: '#${slack_channel}'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
+            "id": "9078b0e92799cb1fa8e6a6a0f7c9950fd4c0b563b3e98fd4168ebd9c770ba033",
+            "rendered": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-slack-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: jenkins-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'jenkins-slack-receiver'\n\n  - match_re:\n      service: .*\n    receiver: default-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/'\n    channel: '#fdio-jobs-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: 'default-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/'\n    channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
+            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = ${group_count}\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"$${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: '${slack_default_receiver}'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: ${slack_jenkins_receiver}\n    routes:\n    - match:\n        severity: critical\n      receiver: '${slack_jenkins_receiver}'\n\n  - match_re:\n      service: .*\n    receiver: ${slack_default_receiver}\n    routes:\n    - match:\n        severity: critical\n      receiver: '${slack_default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: '${slack_jenkins_receiver}'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'\n    channel: '#${slack_jenkins_channel}'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: '${slack_default_receiver}'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'\n    channel: '#${slack_default_channel}'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
             "vars": {
               "cpu": "1000",
               "datacenters": "yul1",
-              "default_receiver": "default-receiver",
               "group_count": "1",
               "job_name": "prod-alertmanager",
               "mem": "1024",
               "port": "9093",
               "service_name": "alertmanager",
-              "slack_api_key": "XX",
-              "slack_channel": "fdio-infra-monitoring",
+              "slack_default_api_key": "TE07RD1V1/B01L7PQK9S8/pbADGhhhj60JSxHRi3K0NoW6",
+              "slack_default_channel": "fdio-infra-monitoring",
+              "slack_default_receiver": "default-slack-receiver",
+              "slack_jenkins_api_key": "TE07RD1V1/B01LPL8KM0F/JZQh0gF5U5SoNYnedBknzdHz",
+              "slack_jenkins_channel": "fdio-jobs-monitoring",
+              "slack_jenkins_receiver": "jenkins-slack-receiver",
               "url": "https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz",
               "use_canary": "true",
               "use_vault_provider": "false",
@@ -51,21 +54,20 @@
           "schema_version": 0,
           "attributes": {
             "allocation_ids": [
-              "e0aa0e03-a425-2ec8-06ad-ccf89f0f4ee3",
-              "bf013613-1ac7-6155-69e0-51ff57719549"
+              "c5e80ff9-5f53-8022-0179-5b628bba986f"
             ],
             "datacenters": [
               "yul1"
             ],
-            "deployment_id": "d132f4bd-ef31-5406-97e6-8831ad4a3db5",
+            "deployment_id": "84b19f76-906d-acdb-a7e9-041a07ad72e1",
             "deployment_status": "successful",
             "deregister_on_destroy": true,
             "deregister_on_id_change": true,
             "detach": false,
             "id": "prod-alertmanager",
-            "jobspec": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\nglobal:\n  # The API URL to use for Slack notifications.\n  slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  # This routes performs a regular expression match on alert labels to\n  # catch alerts that are related to a list of services.\n  - match_re:\n      service: .*\n    receiver: default-receiver\n    # The service has a sub-route for critical alerts, any alerts\n    # that do not match, i.e. severity != critical, fall-back to the\n    # parent node and are sent to 'team-X-mails'\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  # Apply inhibition if the alertname is the same.\n  # CAUTION:\n  #   If all label names listed in `equal` are missing\n  #   from both the source and target alerts,\n  #   the inhibition rule will apply!\n  equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n  slack_configs:\n  - channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
+            "jobspec": "job \"prod-alertmanager\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-alertmanager\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count             = 1\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute       = \"${attr.cpu.arch}\"\n      operator        = \"!=\"\n      value           = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-alertmanager\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver          = \"exec\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command       = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n        args          = [\n          \"--config.file=secrets/alertmanager.yml\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alertmanager.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n#  # CA certificate to validate the server certificate with.\n#  ca_file: \u003cfilepath\u003e ]\n#\n#  # Certificate and key files for client cert authentication to the server.\n#  cert_file: \u003cfilepath\u003e\n#  key_file: \u003cfilepath\u003e\n#\n#  # ServerName extension to indicate the name of the server.\n#  # http://tools.ietf.org/html/rfc4366#section-3.1\n#  server_name: \u003cstring\u003e\n#\n#  # Disable validation of the server certificate.\n#  insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n  receiver: 'default-slack-receiver'\n\n  # The labels by which incoming alerts are grouped together. For example,\n  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n  # be batched into a single group.\n  #\n  # To aggregate by all possible labels use '...' as the sole label name.\n  # This effectively disables aggregation entirely, passing through all\n  # alerts as-is. This is unlikely to be what you want, unless you have\n  # a very low alert volume or your upstream notification system performs\n  # its own grouping. Example: group_by: [...]\n  group_by: ['alertname']\n\n  # When a new group of alerts is created by an incoming alert, wait at\n  # least 'group_wait' to send the initial notification.\n  # This way ensures that you get multiple alerts for the same group that start\n  # firing shortly after another are batched together on the first\n  # notification.\n  group_wait: 30s\n\n  # When the first notification was sent, wait 'group_interval' to send a batch\n  # of new alerts that started firing for that group.\n  group_interval: 5m\n\n  # If an alert has successfully been sent, wait 'repeat_interval' to\n  # resend them.\n  repeat_interval: 3h\n\n  # All the above attributes are inherited by all child routes and can\n  # overwritten on each.\n  # The child route trees.\n  routes:\n  - match_re:\n      alertname: JenkinsJob.*\n    receiver: jenkins-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'jenkins-slack-receiver'\n\n  - match_re:\n      service: .*\n    receiver: default-slack-receiver\n    routes:\n    - match:\n        severity: critical\n      receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n    severity: 'critical'\n  target_match:\n    severity: 'warning'\n  equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/'\n    channel: '#fdio-jobs-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\n\n- name: 'default-slack-receiver'\n  slack_configs:\n  - api_url: 'https://hooks.slack.com/services/'\n    channel: '#fdio-infra-monitoring'\n    send_resolved: true\n    icon_url: https://avatars3.githubusercontent.com/u/3380462\n    title: |-\n     [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n     {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n       {{\" \"}}(\n       {{- with .CommonLabels.Remove .GroupLabels.Names }}\n         {{- range $index, $label := .SortedPairs -}}\n           {{ if $index }}, {{ end }}\n           {{- $label.Name }}=\"{{ $label.Value -}}\"\n         {{- end }}\n       {{- end -}}\n       )\n     {{- end }}\n    text: \u003e-\n     {{ range .Alerts -}}\n     *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n     *Description:* {{ .Annotations.description }}\n\n     *Details:*\n       {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n       {{ end }}\n     {{ end }}\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"alertmanager\"\n        port            = \"alertmanager\"\n        tags            = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Alertmanager Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 1000\n        memory          = 1024\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"alertmanager\" {\n            static      = 9093\n          }\n        }\n      }\n    }\n  }\n}",
             "json": null,
-            "modify_index": "7202773",
+            "modify_index": "7224418",
             "name": "prod-alertmanager",
             "namespace": "default",
             "policy_override": null,
@@ -268,7 +270,7 @@
           "schema_version": 0,
           "attributes": {
             "allocation_ids": [
-              "7e9cea8d-a5e5-47db-7f9c-653c59c93928"
+              "a1bc1a10-95a5-3c33-a46d-849522abe4fe"
             ],
             "datacenters": [
               "yul1"
@@ -281,7 +283,7 @@
             "id": "prod-mc",
             "jobspec": "job \"prod-mc\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers.html\n  #\n  type        = \"batch\"\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group.html\n  #\n  group \"prod-group1-mc\" {\n    task \"prod-task1-create-buckets\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver        = \"docker\"\n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        image       = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n        entrypoint  = [\n          \"/bin/sh\",\n          \"-c\",\n          \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n        ]\n        dns_servers  = [ \"${attr.unique.network.ip-address}\" ]\n        privileged   = false\n      }\n\n      # The env stanza configures a list of environment variables to populate\n      # the task's environment before starting.\n      env {\n        \n        MINIO_ACCESS_KEY = \"minio\"\n        MINIO_SECRET_KEY = \"minio123\"\n        \n        \n      }\n    }\n  }\n}\n",
             "json": null,
-            "modify_index": "7202809",
+            "modify_index": "7254549",
             "name": "prod-mc",
             "namespace": "default",
             "policy_override": null,
@@ -477,9 +479,9 @@
           "schema_version": 0,
           "attributes": {
             "filename": null,
-            "id": "f5c5b3316512492fa4e1caea185bafeeeb51b619e76b08ab473ca9523c2b8268",
-            "rendered": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
-            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = ${group_count}\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    %{ if use_host_volume }\n    volume \"prod-volume1-${service_name}\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"$${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      %{ if use_host_volume }\n      volume_mount {\n        volume          = \"prod-volume1-${service_name}\"\n        destination     = \"${data_dir}\"\n        read_only       = false\n      }\n      %{ endif }\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-${version}.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=${data_dir}prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
+            "id": "f4fe3d282a60afb0ef6713540a32d855d65713707e68cf8636a1afd38521b604",
+            "rendered": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
+            "template": "job \"${job_name}\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"${datacenters}\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n%{ if use_canary }\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n%{ endif }\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-${service_name}\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = ${group_count}\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    %{ if use_host_volume }\n    volume \"prod-volume1-${service_name}\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"${host_volume}\"\n    }\n    %{ endif }\n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"$${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-${service_name}\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      %{ if use_host_volume }\n      volume_mount {\n        volume          = \"prod-volume1-${service_name}\"\n        destination     = \"${data_dir}\"\n        read_only       = false\n      }\n      %{ endif }\n\n      %{ if use_vault_provider }\n      vault {\n        policies        = \"${vault_kv_policy_name}\"\n      }\n      %{ endif }\n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-${version}.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=${data_dir}prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"${url}\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"${service_name}\"\n        port            = \"${service_name}\"\n        tags            = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = ${cpu}\n        memory          = ${mem}\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"${service_name}\" {\n            static      = ${port}\n          }\n        }\n      }\n    }\n  }\n}",
             "vars": {
               "cpu": "2000",
               "data_dir": "/data/",
@@ -512,23 +514,26 @@
           "schema_version": 0,
           "attributes": {
             "allocation_ids": [
-              "81af62a3-09d6-5e29-a63a-e13952f9caec",
-              "1516d874-0001-78b1-f6ec-727067af2f29",
-              "fcf4df05-3083-e96a-9e8b-0856c0fed8f9",
-              "6eaada1d-5bea-7b95-6564-7c0ff31b3053"
+              "fc9ab3e3-7854-65d5-0c82-28283383976a",
+              "5e526c5d-f7b5-f275-357c-48a739a96071",
+              "d816ca29-4cf4-f9be-4d48-edbddac472ae",
+              "794f7531-14f5-c1c3-9bf3-88936ee335e3",
+              "b1edacab-e57f-e014-4d4c-50e563c81ab7",
+              "2ee67e9b-5bc1-ca2b-c3c4-0509e02ee954",
+              "922f75f9-39c1-c232-9ef6-81d68a03c719"
             ],
             "datacenters": [
               "yul1"
             ],
-            "deployment_id": "4cbf8370-f2d7-16c2-d500-f6495d43537d",
+            "deployment_id": "0a36a612-38de-5e1c-448e-72e29a2fb5f3",
             "deployment_status": "successful",
             "deregister_on_destroy": true,
             "deregister_on_id_change": true,
             "detach": false,
             "id": "prod-prometheus",
-            "jobspec": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
+            "jobspec": "job \"prod-prometheus\" {\n  # The \"region\" parameter specifies the region in which to execute the job.\n  # If omitted, this inherits the default region name of \"global\".\n  # region = \"global\"\n  #\n  # The \"datacenters\" parameter specifies the list of datacenters which should\n  # be considered when placing this task. This must be provided.\n  datacenters         = \"yul1\"\n\n  # The \"type\" parameter controls the type of job, which impacts the scheduler's\n  # decision on placement. This configuration is optional and defaults to\n  # \"service\". For a full list of job types and their differences, please see\n  # the online documentation.\n  #\n  # For more information, please see the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/jobspec/schedulers\n  #\n  type                = \"service\"\n\n  update {\n    # The \"max_parallel\" parameter specifies the maximum number of updates to\n    # perform in parallel. In this case, this specifies to update a single task\n    # at a time.\n    max_parallel      = 1\n\n    health_check      = \"checks\"\n\n    # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n    # must be in the healthy state before it is marked as healthy and unblocks\n    # further allocations from being updated.\n    min_healthy_time  = \"10s\"\n\n    # The \"healthy_deadline\" parameter specifies the deadline in which the\n    # allocation must be marked as healthy after which the allocation is\n    # automatically transitioned to unhealthy. Transitioning to unhealthy will\n    # fail the deployment and potentially roll back the job if \"auto_revert\" is\n    # set to true.\n    healthy_deadline  = \"3m\"\n\n    # The \"progress_deadline\" parameter specifies the deadline in which an\n    # allocation must be marked as healthy. The deadline begins when the first\n    # allocation for the deployment is created and is reset whenever an allocation\n    # as part of the deployment transitions to a healthy state. If no allocation\n    # transitions to the healthy state before the progress deadline, the\n    # deployment is marked as failed.\n    progress_deadline = \"10m\"\n\n\n    # The \"canary\" parameter specifies that changes to the job that would result\n    # in destructive updates should create the specified number of canaries\n    # without stopping any previous allocations. Once the operator determines the\n    # canaries are healthy, they can be promoted which unblocks a rolling update\n    # of the remaining allocations at a rate of \"max_parallel\".\n    #\n    # Further, setting \"canary\" equal to the count of the task group allows\n    # blue/green deployments. When the job is updated, a full set of the new\n    # version is deployed and upon promotion the old version is stopped.\n    canary            = 1\n\n    # Specifies if the job should auto-promote to the canary version when all\n    # canaries become healthy during a deployment. Defaults to false which means\n    # canaries must be manually updated with the nomad deployment promote\n    # command.\n    auto_promote      = true\n\n    # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n    # last stable job on deployment failure. A job is marked as stable if all the\n    # allocations as part of its deployment were marked healthy.\n    auto_revert       = true\n\n  }\n\n  # The \"group\" stanza defines a series of tasks that should be co-located on\n  # the same Nomad client. Any task within a group will be placed on the same\n  # client.\n  #\n  # For more information and examples on the \"group\" stanza, please see\n  # the online documentation at:\n  #\n  #     https://www.nomadproject.io/docs/job-specification/group\n  #\n  group \"prod-group1-prometheus\" {\n    # The \"count\" parameter specifies the number of the task groups that should\n    # be running under this group. This value must be non-negative and defaults\n    # to 1.\n    count               = 4\n\n    # The volume stanza allows the group to specify that it requires a given\n    # volume from the cluster.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/volume\n    #\n    \n    volume \"prod-volume1-prometheus\" {\n      type              = \"host\"\n      read_only         = false\n      source            = \"prod-volume-data1-1\"\n    }\n    \n\n    # The constraint allows restricting the set of eligible nodes. Constraints\n    # may filter on attributes or client metadata.\n    #\n    # For more information and examples on the \"volume\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/constraint\n    #\n    constraint {\n      attribute         = \"${attr.cpu.arch}\"\n      operator          = \"!=\"\n      value             = \"arm64\"\n    }\n\n    # The \"task\" stanza creates an individual unit of work, such as a Docker\n    # container, web application, or batch processing.\n    #\n    # For more information and examples on the \"task\" stanza, please see\n    # the online documentation at:\n    #\n    #     https://www.nomadproject.io/docs/job-specification/task\n    #\n    task \"prod-task1-prometheus\" {\n      # The \"driver\" parameter specifies the task driver that should be used to\n      # run the task.\n      driver            = \"exec\"\n\n      \n      volume_mount {\n        volume          = \"prod-volume1-prometheus\"\n        destination     = \"/data/\"\n        read_only       = false\n      }\n      \n\n      \n\n      # The \"config\" stanza specifies the driver configuration, which is passed\n      # directly to the driver to start the task. The details of configurations\n      # are specific to each driver, so please see specific driver\n      # documentation for more information.\n      config {\n        command         = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n        args            = [\n          \"--config.file=secrets/prometheus.yml\",\n          \"--storage.tsdb.path=/data/prometheus/\",\n          \"--storage.tsdb.retention.time=15d\"\n        ]\n      }\n\n      # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n      # such as a file, tarball, or binary. Nomad downloads artifacts using the\n      # popular go-getter library, which permits downloading artifacts from a\n      # variety of locations using a URL as the input source.\n      #\n      # For more information and examples on the \"artifact\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/artifact\n      #\n      artifact {\n        source          = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n      }\n\n      # The \"template\" stanza instructs Nomad to manage a template, such as\n      # a configuration file or script. This template can optionally pull data\n      # from Consul or Vault to populate runtime configuration data.\n      #\n      # For more information and examples on the \"template\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/template\n      #\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/alerts.yml\"\n        left_delimiter  = \"{{{\"\n        right_delimiter = \"}}}\"\n        data            = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n  rules:\n  - alert: JenkinsJobHealthExporterFailures\n    expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n  - alert: JenkinsJobHealthExporterUnstable\n    expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n      description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n  rules:\n  - alert: ConsulServiceHealthcheckFailed\n    expr: consul_catalog_service_node_healthy == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n      description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n  - alert: ConsulMissingMasterNode\n    expr: consul_raft_peers \u003c 3\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n      description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n  - alert: ConsulAgentUnhealthy\n    expr: consul_health_node_status{status=\"critical\"} == 1\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n      description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n  rules:\n  - alert: NodeDown\n    expr: up == 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n      description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n  - alert: HostHighCpuLoad\n    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n      description: \"CPU load is \u003e 95%.\"\n  - alert: HostOutOfMemory\n    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n      description: \"Node memory is filling up (\u003c 10% left).\"\n  - alert: HostOomKillDetected\n    expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n      description: \"OOM kill detected.\"\n  - alert: HostMemoryUnderMemoryPressure\n    expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n      description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n  - alert: HostOutOfDiskSpace\n    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n      description: \"Disk is almost full (\u003c 10% left).\"\n  - alert: HostRaidDiskFailure\n    expr: node_md_disks{state=\"failed\"} \u003e 0\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n      description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n  - alert: HostConntrackLimit\n    expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n      description: \"The number of conntrack is approching limit.\"\n  - alert: HostNetworkInterfaceSaturated\n    expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n    for: 1m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n      description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n  - alert: HostSystemdServiceCrashed\n    expr: node_systemd_unit_state{state=\"failed\"} == 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n      description: \"SystemD service crashed.\"\n  - alert: HostEdacCorrectableErrorsDetected\n    expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: info\n    annotations:\n      summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n  - alert: HostEdacUncorrectableErrorsDetected\n    expr: node_edac_uncorrectable_errors_total \u003e 0\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n      description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n  rules:\n  - alert: MinioDiskOffline\n    expr: minio_offline_disks \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n      description: \"Minio disk is offline.\"\n  - alert: MinioStorageSpaceExhausted\n    expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n    for: 2m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n      description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n  rules:\n  - alert: PrometheusConfigurationReloadFailure\n    expr: prometheus_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"Prometheus configuration reload error.\"\n  - alert: PrometheusTooManyRestarts\n    expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n      description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n  - alert: PrometheusAlertmanagerConfigurationReloadFailure\n    expr: alertmanager_config_last_reload_successful != 1\n    for: 0m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n      description: \"AlertManager configuration reload error.\"\n  - alert: PrometheusRuleEvaluationFailures\n    expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n  - alert: PrometheusTargetScrapingSlow\n    expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n    for: 5m\n    labels:\n      severity: warning\n    annotations:\n      summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n      description: \"Prometheus is scraping exporters slowly.\"\n  - alert: PrometheusTsdbCompactionsFailed\n    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n  - alert: PrometheusTsdbHeadTruncationsFailed\n    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n  - alert: PrometheusTsdbWalCorruptions\n    expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n  - alert: PrometheusTsdbWalTruncationsFailed\n    expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n    for: 0m\n    labels:\n      severity: critical\n    annotations:\n      summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n      description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n      }\n\n      template {\n        change_mode     = \"noop\"\n        change_signal   = \"SIGINT\"\n        destination     = \"secrets/prometheus.yml\"\n        data            = \u003c\u003cEOH\n---\nglobal:\n  scrape_interval:     5s\n  scrape_timeout:      5s\n  evaluation_interval: 5s\n\nalerting:\n  alertmanagers:\n  - consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\nrule_files:\n  - 'alerts.yml'\n\nscrape_configs:\n\n  - job_name: 'Nomad Cluster'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'nomad-client', 'nomad' ]\n    relabel_configs:\n    - source_labels: [__meta_consul_tags]\n      regex: '(.*)http(.*)'\n      action: keep\n    metrics_path: /v1/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Consul Cluster'\n    static_configs:\n      - targets: [ '10.30.51.28:8500' ]\n      - targets: [ '10.30.51.29:8500' ]\n      - targets: [ '10.30.51.30:8500' ]\n      - targets: [ '10.30.51.32:8500' ]\n      - targets: [ '10.30.51.33:8500' ]\n      - targets: [ '10.30.51.34:8500' ]\n      - targets: [ '10.30.51.35:8500' ]\n      - targets: [ '10.30.51.39:8500' ]\n      - targets: [ '10.30.51.40:8500' ]\n      - targets: [ '10.30.51.50:8500' ]\n      - targets: [ '10.30.51.51:8500' ]\n      - targets: [ '10.30.51.65:8500' ]\n      - targets: [ '10.30.51.66:8500' ]\n      - targets: [ '10.30.51.67:8500' ]\n      - targets: [ '10.30.51.68:8500' ]\n      - targets: [ '10.30.51.70:8500' ]\n      - targets: [ '10.30.51.71:8500' ]\n      - targets: [ '10.32.8.14:8500' ]\n      - targets: [ '10.32.8.15:8500' ]\n      - targets: [ '10.32.8.16:8500' ]\n      - targets: [ '10.32.8.17:8500' ]\n    metrics_path: /v1/agent/metrics\n    params:\n      format: [ 'prometheus' ]\n\n  - job_name: 'Blackbox Exporter (icmp)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n      - targets: [ '10.30.51.32' ]\n    params:\n      module: [ 'icmp_v4' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'Blackbox Exporter (http)'\n    static_configs:\n      - targets: [ 'gerrit.fd.io' ]\n      - targets: [ 'jenkins.fd.io' ]\n    params:\n      module: [ 'http_2xx' ]\n    relabel_configs:\n      - source_labels: [__address__]\n        target_label: __param_target\n      - source_labels: [__param_target]\n        target_label: instance\n      - target_label: __address__\n        replacement: localhost:9115\n    metrics_path: /probe\n\n  - job_name: 'cAdvisor Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:8080' ]\n      - targets: [ '10.30.51.29:8080' ]\n      - targets: [ '10.30.51.30:8080' ]\n      #- targets: [ '10.30.51.32:8080' ]\n      - targets: [ '10.30.51.33:8080' ]\n      - targets: [ '10.30.51.34:8080' ]\n      - targets: [ '10.30.51.35:8080' ]\n      - targets: [ '10.30.51.39:8080' ]\n      - targets: [ '10.30.51.40:8080' ]\n      - targets: [ '10.30.51.50:8080' ]\n      - targets: [ '10.30.51.51:8080' ]\n      - targets: [ '10.30.51.65:8080' ]\n      - targets: [ '10.30.51.66:8080' ]\n      - targets: [ '10.30.51.67:8080' ]\n      - targets: [ '10.30.51.68:8080' ]\n      - targets: [ '10.30.51.70:8080' ]\n      - targets: [ '10.30.51.71:8080' ]\n      - targets: [ '10.32.8.14:8080' ]\n      - targets: [ '10.32.8.15:8080' ]\n      - targets: [ '10.32.8.16:8080' ]\n      - targets: [ '10.32.8.17:8080' ]\n\n  - job_name: 'Jenkins Job Health Exporter'\n    static_configs:\n      - targets: [ '10.30.51.32:9186' ]\n    metric_relabel_configs:\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        action: replace\n        replacement: '$1'\n        target_label: id\n      - source_labels: [ __name__ ]\n        regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n        replacement: 'jenkins_job_$2'\n        target_label: __name__\n\n  - job_name: 'Node Exporter'\n    static_configs:\n      - targets: [ '10.30.51.28:9100' ]\n      - targets: [ '10.30.51.29:9100' ]\n      - targets: [ '10.30.51.30:9100' ]\n      - targets: [ '10.30.51.32:9100' ]\n      - targets: [ '10.30.51.33:9100' ]\n      - targets: [ '10.30.51.34:9100' ]\n      - targets: [ '10.30.51.35:9100' ]\n      - targets: [ '10.30.51.39:9100' ]\n      - targets: [ '10.30.51.40:9100' ]\n      - targets: [ '10.30.51.50:9100' ]\n      - targets: [ '10.30.51.51:9100' ]\n      - targets: [ '10.30.51.65:9100' ]\n      - targets: [ '10.30.51.66:9100' ]\n      - targets: [ '10.30.51.67:9100' ]\n      - targets: [ '10.30.51.68:9100' ]\n      - targets: [ '10.30.51.70:9100' ]\n      - targets: [ '10.30.51.71:9100' ]\n      - targets: [ '10.32.8.14:9100' ]\n      - targets: [ '10.32.8.15:9100' ]\n      - targets: [ '10.32.8.16:9100' ]\n      - targets: [ '10.32.8.17:9100' ]\n\n  - job_name: 'Alertmanager'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'alertmanager' ]\n\n  - job_name: 'Grafana'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'grafana' ]\n\n  - job_name: 'Prometheus'\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'prometheus' ]\n\n  - job_name: 'Minio'\n    bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n    consul_sd_configs:\n    - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n      services: [ 'storage' ]\n    metrics_path: /minio/prometheus/metrics\nEOH\n      }\n\n      # The service stanza instructs Nomad to register a service with Consul.\n      #\n      # For more information and examples on the \"task\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/service\n      #\n      service {\n        name            = \"prometheus\"\n        port            = \"prometheus\"\n        tags            = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n        check {\n          name          = \"Prometheus Check Live\"\n          type          = \"http\"\n          path          = \"/-/healthy\"\n          interval      = \"10s\"\n          timeout       = \"2s\"\n        }\n      }\n\n      # The \"resources\" stanza describes the requirements a task needs to\n      # execute. Resource requirements include memory, network, cpu, and more.\n      # This ensures the task will execute on a machine that contains enough\n      # resource capacity.\n      #\n      # For more information and examples on the \"resources\" stanza, please see\n      # the online documentation at:\n      #\n      #     https://www.nomadproject.io/docs/job-specification/resources\n      #\n      resources {\n        cpu             = 2000\n        memory          = 8192\n        # The network stanza specifies the networking requirements for the task\n        # group, including the network mode and port allocations. When scheduling\n        # jobs in Nomad they are provisioned across your fleet of machines along\n        # with other jobs and services. Because you don't know in advance what host\n        # your job will be provisioned on, Nomad will provide your tasks with\n        # network configuration when they start up.\n        #\n        # For more information and examples on the \"template\" stanza, please see\n        # the online documentation at:\n        #\n        #     https://www.nomadproject.io/docs/job-specification/network\n        #\n        network {\n          port \"prometheus\" {\n            static      = 9090\n          }\n        }\n      }\n    }\n  }\n}",
             "json": null,
-            "modify_index": "7201194",
+            "modify_index": "7254539",
             "name": "prod-prometheus",
             "namespace": "default",
             "policy_override": null,
@@ -610,10 +615,9 @@
           "schema_version": 0,
           "attributes": {
             "allocation_ids": [
+              "a51e95c4-8ff0-47d9-6f88-f475942141bc",
               "0239788e-a5eb-b54d-0cf2-2404b3ec7c53",
-              "ecfcd9ad-b959-0fa4-c1ec-8b82195830f1",
-              "190f5699-0d10-7f85-ac68-96927702070b",
-              "2fbbef45-f00a-1367-08c4-c2239de9e78d"
+              "190f5699-0d10-7f85-ac68-96927702070b"
             ],
             "datacenters": [
               "yul1"
author	pmikus <pmikus@cisco.com>	2021-02-05 14:51:43 +0000
committer	Peter Mikus <pmikus@cisco.com>	2021-02-10 09:04:46 +0000
commit	0017c9d8372ef306ac73aae22bb0d17631c944d2 (patch)
tree	d24d4ff9ee33b4a31cdddfba89d2ae9a4b2e0fdd /terraform-ci-infra/1n_nmd/terraform.tfstate
parent	60b531215d36e2402b1b6c768bd4fd4d4b210fd0 (diff)