aboutsummaryrefslogtreecommitdiffstats
path: root/terraform-ci-infra
diff options
context:
space:
mode:
authorpmikus <pmikus@cisco.com>2021-02-05 14:51:43 +0000
committerPeter Mikus <pmikus@cisco.com>2021-02-10 09:04:46 +0000
commit0017c9d8372ef306ac73aae22bb0d17631c944d2 (patch)
treed24d4ff9ee33b4a31cdddfba89d2ae9a4b2e0fdd /terraform-ci-infra
parent60b531215d36e2402b1b6c768bd4fd4d4b210fd0 (diff)
Infra: JenkinsJobHealthExporter
- Integration of Jenkins Job checker Signed-off-by: pmikus <pmikus@cisco.com> Change-Id: I822039cb64a3a352b49314ddab7c6099af3fe644
Diffstat (limited to 'terraform-ci-infra')
-rw-r--r--terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl64
-rw-r--r--terraform-ci-infra/1n_nmd/alertmanager/main.tf31
-rw-r--r--terraform-ci-infra/1n_nmd/alertmanager/variables.tf34
-rw-r--r--terraform-ci-infra/1n_nmd/main.tf38
-rw-r--r--terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl32
-rw-r--r--terraform-ci-infra/1n_nmd/terraform.tfstate58
-rw-r--r--terraform-ci-infra/1n_nmd/terraform.tfstate.backup168
7 files changed, 208 insertions, 217 deletions
diff --git a/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl
index fafe623c85..40d84e337a 100644
--- a/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl
+++ b/terraform-ci-infra/1n_nmd/alertmanager/conf/nomad/alertmanager.hcl
@@ -160,10 +160,6 @@ job "${job_name}" {
left_delimiter = "{{{"
right_delimiter = "}}}"
data = <<EOH
-global:
- # The API URL to use for Slack notifications.
- slack_api_url: 'https://hooks.slack.com/services/${slack_api_key}'
-
# The directory from which notification templates are read.
templates:
- '/etc/alertmanager/template/*.tmpl'
@@ -185,7 +181,7 @@ templates:
# The root route on which each incoming alert enters.
route:
- receiver: '${default_receiver}'
+ receiver: '${slack_default_receiver}'
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
@@ -217,18 +213,21 @@ route:
# overwritten on each.
# The child route trees.
routes:
- # This routes performs a regular expression match on alert labels to
- # catch alerts that are related to a list of services.
+ - match_re:
+ alertname: JenkinsJob.*
+ receiver: ${slack_jenkins_receiver}
+ routes:
+ - match:
+ severity: critical
+ receiver: '${slack_jenkins_receiver}'
+
- match_re:
service: .*
- receiver: ${default_receiver}
- # The service has a sub-route for critical alerts, any alerts
- # that do not match, i.e. severity != critical, fall-back to the
- # parent node and are sent to 'team-X-mails'
+ receiver: ${slack_default_receiver}
routes:
- match:
severity: critical
- receiver: '${default_receiver}'
+ receiver: '${slack_default_receiver}'
# Inhibition rules allow to mute a set of alerts given that another alert is
# firing.
@@ -239,17 +238,42 @@ inhibit_rules:
severity: 'critical'
target_match:
severity: 'warning'
- # Apply inhibition if the alertname is the same.
- # CAUTION:
- # If all label names listed in `equal` are missing
- # from both the source and target alerts,
- # the inhibition rule will apply!
- equal: ['alertname', 'cluster', 'service']
+ equal: ['alertname', 'instance']
receivers:
-- name: '${default_receiver}'
+- name: '${slack_jenkins_receiver}'
+ slack_configs:
+ - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'
+ channel: '#${slack_jenkins_channel}'
+ send_resolved: true
+ icon_url: https://avatars3.githubusercontent.com/u/3380462
+ title: |-
+ [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}
+ {{- if gt (len .CommonLabels) (len .GroupLabels) -}}
+ {{" "}}(
+ {{- with .CommonLabels.Remove .GroupLabels.Names }}
+ {{- range $index, $label := .SortedPairs -}}
+ {{ if $index }}, {{ end }}
+ {{- $label.Name }}="{{ $label.Value -}}"
+ {{- end }}
+ {{- end -}}
+ )
+ {{- end }}
+ text: >-
+ {{ range .Alerts -}}
+ *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}
+
+ *Description:* {{ .Annotations.description }}
+
+ *Details:*
+ {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
+ {{ end }}
+ {{ end }}
+
+- name: '${slack_default_receiver}'
slack_configs:
- - channel: '#${slack_channel}'
+ - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'
+ channel: '#${slack_default_channel}'
send_resolved: true
icon_url: https://avatars3.githubusercontent.com/u/3380462
title: |-
diff --git a/terraform-ci-infra/1n_nmd/alertmanager/main.tf b/terraform-ci-infra/1n_nmd/alertmanager/main.tf
index 8307601669..9525aabc0c 100644
--- a/terraform-ci-infra/1n_nmd/alertmanager/main.tf
+++ b/terraform-ci-infra/1n_nmd/alertmanager/main.tf
@@ -14,20 +14,23 @@ locals {
data "template_file" "nomad_job_alertmanager" {
template = file("${path.module}/conf/nomad/alertmanager.hcl")
vars = {
- datacenters = local.datacenters
- url = local.alertmanager_url
- job_name = var.alertmanager_job_name
- use_canary = var.alertmanager_use_canary
- group_count = var.alertmanager_group_count
- service_name = var.alertmanager_service_name
- use_vault_provider = var.alertmanager_vault_secret.use_vault_provider
- version = var.alertmanager_version
- cpu = var.alertmanager_cpu
- mem = var.alertmanager_mem
- port = var.alertmanager_port
- slack_api_key = var.alertmanager_slack_api_key
- slack_channel = var.alertmanager_slack_channel
- default_receiver = var.alertmanager_default_receiver
+ datacenters = local.datacenters
+ url = local.alertmanager_url
+ job_name = var.alertmanager_job_name
+ use_canary = var.alertmanager_use_canary
+ group_count = var.alertmanager_group_count
+ service_name = var.alertmanager_service_name
+ use_vault_provider = var.alertmanager_vault_secret.use_vault_provider
+ version = var.alertmanager_version
+ cpu = var.alertmanager_cpu
+ mem = var.alertmanager_mem
+ port = var.alertmanager_port
+ slack_jenkins_api_key = var.alertmanager_slack_jenkins_api_key
+ slack_jenkins_channel = var.alertmanager_slack_jenkins_channel
+ slack_jenkins_receiver = var.alertmanager_slack_jenkins_receiver
+ slack_default_api_key = var.alertmanager_slack_default_api_key
+ slack_default_channel = var.alertmanager_slack_default_channel
+ slack_default_receiver = var.alertmanager_slack_default_receiver
}
}
diff --git a/terraform-ci-infra/1n_nmd/alertmanager/variables.tf b/terraform-ci-infra/1n_nmd/alertmanager/variables.tf
index 152530b96d..ffedf24f3d 100644
--- a/terraform-ci-infra/1n_nmd/alertmanager/variables.tf
+++ b/terraform-ci-infra/1n_nmd/alertmanager/variables.tf
@@ -65,20 +65,38 @@ variable "alertmanager_port" {
default = 9093
}
-variable "alertmanager_default_receiver" {
- description = "Alertmanager default receiver"
+variable "alertmanager_slack_jenkins_api_key" {
+ description = "Alertmanager jenkins slack API key"
type = string
- default = "default-receiver"
+ default = "XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX"
+}
+
+variable "alertmanager_slack_jenkins_receiver" {
+ description = "Alertmanager jenkins slack receiver"
+ type = string
+ default = "jenkins-slack-receiver"
+}
+
+variable "alertmanager_slack_jenkins_channel" {
+ description = "Alertmanager jenkins slack channel"
+ type = string
+ default = "jenkins-channel"
}
-variable "alertmanager_slack_api_key" {
- description = "Alertmanager slack API key"
+variable "alertmanager_slack_default_api_key" {
+ description = "Alertmanager default slack API key"
type = string
default = "XXXXXXXXX/XXXXXXXXXXX/XXXXXXXXXXXXXXXXXXXXXXXX"
}
-variable "alertmanager_slack_channel" {
- description = "Alertmanager slack channel"
+variable "alertmanager_slack_default_receiver" {
+ description = "Alertmanager default slack receiver"
+ type = string
+ default = "default-slack-receiver"
+}
+
+variable "alertmanager_slack_default_channel" {
+ description = "Alertmanager default slack channel"
type = string
- default = "slack-channel"
+ default = "default-channel"
} \ No newline at end of file
diff --git a/terraform-ci-infra/1n_nmd/main.tf b/terraform-ci-infra/1n_nmd/main.tf
index 985f9ae7c6..b21b76b2ab 100644
--- a/terraform-ci-infra/1n_nmd/main.tf
+++ b/terraform-ci-infra/1n_nmd/main.tf
@@ -5,31 +5,33 @@
# and have them automatically associated with the root provider
# configurations.
module "alertmanager" {
- source = "./alertmanager"
- providers = {
+ source = "./alertmanager"
+ providers = {
nomad = nomad.yul1
}
# nomad
- nomad_datacenters = [ "yul1" ]
+ nomad_datacenters = [ "yul1" ]
# alertmanager
- alertmanager_job_name = "prod-alertmanager"
- alertmanager_use_canary = true
- alertmanager_group_count = 1
- alertmanager_vault_secret = {
- use_vault_provider = false,
- vault_kv_policy_name = "kv-secret",
- vault_kv_path = "secret/data/prometheus",
- vault_kv_field_access_key = "access_key",
- vault_kv_field_secret_key = "secret_key"
+ alertmanager_job_name = "prod-alertmanager"
+ alertmanager_use_canary = true
+ alertmanager_group_count = 1
+ alertmanager_vault_secret = {
+ use_vault_provider = false,
+ vault_kv_policy_name = "kv-secret",
+ vault_kv_path = "secret/data/prometheus",
+ vault_kv_field_access_key = "access_key",
+ vault_kv_field_secret_key = "secret_key"
}
- alertmanager_version = "0.21.0"
- alertmanager_cpu = 1000
- alertmanager_mem = 1024
- alertmanager_port = 9093
- alertmanager_slack_api_key = "TE07RD1V1/B01L7PQK9S8/pbADGhhhj60JSxHRi3K0NoW6"
- alertmanager_slack_channel = "fdio-infra-monitoring"
+ alertmanager_version = "0.21.0"
+ alertmanager_cpu = 1000
+ alertmanager_mem = 1024
+ alertmanager_port = 9093
+ alertmanager_slack_jenkins_api_key = "TE07RD1V1/B01LPL8KM0F/JZQh0gF5U5SoNYnedBknzdHz"
+ alertmanager_slack_jenkins_channel = "fdio-jobs-monitoring"
+ alertmanager_slack_default_api_key = "TE07RD1V1/B01L7PQK9S8/pbADGhhhj60JSxHRi3K0NoW6"
+ alertmanager_slack_default_channel = "fdio-infra-monitoring"
}
module "grafana" {
diff --git a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
index 4918a5f5bd..d851628fcd 100644
--- a/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
+++ b/terraform-ci-infra/1n_nmd/prometheus/conf/nomad/prometheus.hcl
@@ -188,6 +188,24 @@ job "${job_name}" {
data = <<EOH
---
groups:
+- name: "Jenkins Job Health Exporter"
+ rules:
+ - alert: JenkinsJobHealthExporterFailures
+ expr: jenkins_job_failure{id=~".*"} >= 10
+ for: 0m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Jenkins Job Health detected high failure rate on jenkins jobs."
+ description: "Job: {{ $labels.id }}"
+ - alert: JenkinsJobHealthExporterUnstable
+ expr: jenkins_job_unstable{id=~".*"} >= 10
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Jenkins Job Health detected high unstable rate on jenkins jobs."
+ description: "Job: {{ $labels.id }}"
- name: "Consul"
rules:
- alert: ConsulServiceHealthcheckFailed
@@ -523,6 +541,20 @@ scrape_configs:
- targets: [ '10.32.8.16:8080' ]
- targets: [ '10.32.8.17:8080' ]
+ - job_name: 'Jenkins Job Health Exporter'
+ static_configs:
+ - targets: [ '10.30.51.32:9186' ]
+ metric_relabel_configs:
+ - source_labels: [ __name__ ]
+ regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
+ action: replace
+ replacement: '$1'
+ target_label: id
+ - source_labels: [ __name__ ]
+ regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'
+ replacement: 'jenkins_job_$2'
+ target_label: __name__
+
- job_name: 'Node Exporter'
static_configs:
- targets: [ '10.30.51.28:9100' ]
diff --git a/terraform-ci-infra/1n_nmd/terraform.tfstate b/terraform-ci-infra/1n_nmd/terraform.tfstate
index d60f10206f..576e2aac58 100644
--- a/terraform-ci-infra/1n_nmd/terraform.tfstate
+++ b/terraform-ci-infra/1n_nmd/terraform.tfstate
@@ -1,7 +1,7 @@
{
"version": 4,
"terraform_version": "0.14.5",
- "serial": 1042,
+ "serial": 1112,
"lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9",
"outputs": {},
"resources": [
@@ -16,20 +16,23 @@
"schema_version": 0,
"attributes": {
"filename": null,
- "id": "1ea874db11980359d80aada65912759b6acd9a7399f011d823a72eb7c5b4a329",
- "rendered": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}",
- "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/${slack_api_key}'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: '${default_receiver}'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: ${default_receiver}\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: '${default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: '${default_receiver}'\n slack_configs:\n - channel: '#${slack_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
+ "id": "9078b0e92799cb1fa8e6a6a0f7c9950fd4c0b563b3e98fd4168ebd9c770ba033",
+ "rendered": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-slack-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n - match_re:\n alertname: JenkinsJob.*\n receiver: jenkins-slack-receiver\n routes:\n - match:\n severity: critical\n receiver: 'jenkins-slack-receiver'\n\n - match_re:\n service: .*\n receiver: default-slack-receiver\n routes:\n - match:\n severity: critical\n receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/'\n channel: '#fdio-jobs-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\n\n- name: 'default-slack-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/'\n channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: '${slack_default_receiver}'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n - match_re:\n alertname: JenkinsJob.*\n receiver: ${slack_jenkins_receiver}\n routes:\n - match:\n severity: critical\n receiver: '${slack_jenkins_receiver}'\n\n - match_re:\n service: .*\n receiver: ${slack_default_receiver}\n routes:\n - match:\n severity: critical\n receiver: '${slack_default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n equal: ['alertname', 'instance']\n\nreceivers:\n- name: '${slack_jenkins_receiver}'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'\n channel: '#${slack_jenkins_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\n\n- name: '${slack_default_receiver}'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'\n channel: '#${slack_default_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
"vars": {
"cpu": "1000",
"datacenters": "yul1",
- "default_receiver": "default-receiver",
"group_count": "1",
"job_name": "prod-alertmanager",
"mem": "1024",
"port": "9093",
"service_name": "alertmanager",
- "slack_api_key": "XX",
- "slack_channel": "fdio-infra-monitoring",
+ "slack_default_api_key": "TE07RD1V1/B01L7PQK9S8/pbADGhhhj60JSxHRi3K0NoW6",
+ "slack_default_channel": "fdio-infra-monitoring",
+ "slack_default_receiver": "default-slack-receiver",
+ "slack_jenkins_api_key": "TE07RD1V1/B01LPL8KM0F/JZQh0gF5U5SoNYnedBknzdHz",
+ "slack_jenkins_channel": "fdio-jobs-monitoring",
+ "slack_jenkins_receiver": "jenkins-slack-receiver",
"url": "https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz",
"use_canary": "true",
"use_vault_provider": "false",
@@ -51,21 +54,20 @@
"schema_version": 0,
"attributes": {
"allocation_ids": [
- "e0aa0e03-a425-2ec8-06ad-ccf89f0f4ee3",
- "bf013613-1ac7-6155-69e0-51ff57719549"
+ "c5e80ff9-5f53-8022-0179-5b628bba986f"
],
"datacenters": [
"yul1"
],
- "deployment_id": "d132f4bd-ef31-5406-97e6-8831ad4a3db5",
+ "deployment_id": "84b19f76-906d-acdb-a7e9-041a07ad72e1",
"deployment_status": "successful",
"deregister_on_destroy": true,
"deregister_on_id_change": true,
"detach": false,
"id": "prod-alertmanager",
- "jobspec": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}",
+ "jobspec": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-slack-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n - match_re:\n alertname: JenkinsJob.*\n receiver: jenkins-slack-receiver\n routes:\n - match:\n severity: critical\n receiver: 'jenkins-slack-receiver'\n\n - match_re:\n service: .*\n receiver: default-slack-receiver\n routes:\n - match:\n severity: critical\n receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/'\n channel: '#fdio-jobs-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\n\n- name: 'default-slack-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/'\n channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}",
"json": null,
- "modify_index": "7202773",
+ "modify_index": "7224418",
"name": "prod-alertmanager",
"namespace": "default",
"policy_override": null,
@@ -268,7 +270,7 @@
"schema_version": 0,
"attributes": {
"allocation_ids": [
- "7e9cea8d-a5e5-47db-7f9c-653c59c93928"
+ "a1bc1a10-95a5-3c33-a46d-849522abe4fe"
],
"datacenters": [
"yul1"
@@ -281,7 +283,7 @@
"id": "prod-mc",
"jobspec": "job \"prod-mc\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"batch\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-mc\" {\n task \"prod-task1-create-buckets\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n entrypoint = [\n \"/bin/sh\",\n \"-c\",\n \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n ]\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n \n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n \n \n }\n }\n }\n}\n",
"json": null,
- "modify_index": "7202809",
+ "modify_index": "7254549",
"name": "prod-mc",
"namespace": "default",
"policy_override": null,
@@ -477,9 +479,9 @@
"schema_version": 0,
"attributes": {
"filename": null,
- "id": "f5c5b3316512492fa4e1caea185bafeeeb51b619e76b08ab473ca9523c2b8268",
- "rendered": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}",
- "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n %{ if use_host_volume }\n volume \"prod-volume1-${service_name}\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-${service_name}\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-${version}.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=${data_dir}prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
+ "id": "f4fe3d282a60afb0ef6713540a32d855d65713707e68cf8636a1afd38521b604",
+ "rendered": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n rules:\n - alert: JenkinsJobHealthExporterFailures\n expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n description: \"Job: {{ $labels.id }}\"\n - alert: JenkinsJobHealthExporterUnstable\n expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Jenkins Job Health Exporter'\n static_configs:\n - targets: [ '10.30.51.32:9186' ]\n metric_relabel_configs:\n - source_labels: [ __name__ ]\n regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n action: replace\n replacement: '$1'\n target_label: id\n - source_labels: [ __name__ ]\n regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n replacement: 'jenkins_job_$2'\n target_label: __name__\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n %{ if use_host_volume }\n volume \"prod-volume1-${service_name}\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-${service_name}\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-${version}.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=${data_dir}prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n rules:\n - alert: JenkinsJobHealthExporterFailures\n expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n description: \"Job: {{ $labels.id }}\"\n - alert: JenkinsJobHealthExporterUnstable\n expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Jenkins Job Health Exporter'\n static_configs:\n - targets: [ '10.30.51.32:9186' ]\n metric_relabel_configs:\n - source_labels: [ __name__ ]\n regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n action: replace\n replacement: '$1'\n target_label: id\n - source_labels: [ __name__ ]\n regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n replacement: 'jenkins_job_$2'\n target_label: __name__\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
"vars": {
"cpu": "2000",
"data_dir": "/data/",
@@ -512,23 +514,26 @@
"schema_version": 0,
"attributes": {
"allocation_ids": [
- "81af62a3-09d6-5e29-a63a-e13952f9caec",
- "1516d874-0001-78b1-f6ec-727067af2f29",
- "fcf4df05-3083-e96a-9e8b-0856c0fed8f9",
- "6eaada1d-5bea-7b95-6564-7c0ff31b3053"
+ "fc9ab3e3-7854-65d5-0c82-28283383976a",
+ "5e526c5d-f7b5-f275-357c-48a739a96071",
+ "d816ca29-4cf4-f9be-4d48-edbddac472ae",
+ "794f7531-14f5-c1c3-9bf3-88936ee335e3",
+ "b1edacab-e57f-e014-4d4c-50e563c81ab7",
+ "2ee67e9b-5bc1-ca2b-c3c4-0509e02ee954",
+ "922f75f9-39c1-c232-9ef6-81d68a03c719"
],
"datacenters": [
"yul1"
],
- "deployment_id": "4cbf8370-f2d7-16c2-d500-f6495d43537d",
+ "deployment_id": "0a36a612-38de-5e1c-448e-72e29a2fb5f3",
"deployment_status": "successful",
"deregister_on_destroy": true,
"deregister_on_id_change": true,
"detach": false,
"id": "prod-prometheus",
- "jobspec": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}",
+ "jobspec": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n rules:\n - alert: JenkinsJobHealthExporterFailures\n expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n description: \"Job: {{ $labels.id }}\"\n - alert: JenkinsJobHealthExporterUnstable\n expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Jenkins Job Health Exporter'\n static_configs:\n - targets: [ '10.30.51.32:9186' ]\n metric_relabel_configs:\n - source_labels: [ __name__ ]\n regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n action: replace\n replacement: '$1'\n target_label: id\n - source_labels: [ __name__ ]\n regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n replacement: 'jenkins_job_$2'\n target_label: __name__\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}",
"json": null,
- "modify_index": "7201194",
+ "modify_index": "7254539",
"name": "prod-prometheus",
"namespace": "default",
"policy_override": null,
@@ -610,10 +615,9 @@
"schema_version": 0,
"attributes": {
"allocation_ids": [
+ "a51e95c4-8ff0-47d9-6f88-f475942141bc",
"0239788e-a5eb-b54d-0cf2-2404b3ec7c53",
- "ecfcd9ad-b959-0fa4-c1ec-8b82195830f1",
- "190f5699-0d10-7f85-ac68-96927702070b",
- "2fbbef45-f00a-1367-08c4-c2239de9e78d"
+ "190f5699-0d10-7f85-ac68-96927702070b"
],
"datacenters": [
"yul1"
diff --git a/terraform-ci-infra/1n_nmd/terraform.tfstate.backup b/terraform-ci-infra/1n_nmd/terraform.tfstate.backup
index f40e2c2c37..ba758f7298 100644
--- a/terraform-ci-infra/1n_nmd/terraform.tfstate.backup
+++ b/terraform-ci-infra/1n_nmd/terraform.tfstate.backup
@@ -1,7 +1,7 @@
{
"version": 4,
"terraform_version": "0.14.5",
- "serial": 1037,
+ "serial": 1108,
"lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9",
"outputs": {},
"resources": [
@@ -16,20 +16,23 @@
"schema_version": 0,
"attributes": {
"filename": null,
- "id": "f723f0544ca4fcac1162853257d62928794f50523b9186cdf2c24684721bd058",
- "rendered": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}",
- "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/${slack_api_key}'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: '${default_receiver}'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: ${default_receiver}\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: '${default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: '${default_receiver}'\n slack_configs:\n - channel: '#${slack_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
+ "id": "9078b0e92799cb1fa8e6a6a0f7c9950fd4c0b563b3e98fd4168ebd9c770ba033",
+ "rendered": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-slack-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n - match_re:\n alertname: JenkinsJob.*\n receiver: jenkins-slack-receiver\n routes:\n - match:\n severity: critical\n receiver: 'jenkins-slack-receiver'\n\n - match_re:\n service: .*\n receiver: default-slack-receiver\n routes:\n - match:\n severity: critical\n receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/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'\n channel: '#fdio-jobs-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\n\n- name: 'default-slack-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/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'\n channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: '${slack_default_receiver}'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n - match_re:\n alertname: JenkinsJob.*\n receiver: ${slack_jenkins_receiver}\n routes:\n - match:\n severity: critical\n receiver: '${slack_jenkins_receiver}'\n\n - match_re:\n service: .*\n receiver: ${slack_default_receiver}\n routes:\n - match:\n severity: critical\n receiver: '${slack_default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n equal: ['alertname', 'instance']\n\nreceivers:\n- name: '${slack_jenkins_receiver}'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'\n channel: '#${slack_jenkins_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\n\n- name: '${slack_default_receiver}'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'\n channel: '#${slack_default_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
"vars": {
"cpu": "1000",
"datacenters": "yul1",
- "default_receiver": "default-receiver",
"group_count": "1",
"job_name": "prod-alertmanager",
"mem": "1024",
"port": "9093",
"service_name": "alertmanager",
- "slack_api_key": "XX",
- "slack_channel": "fdio-infra-monitoring",
+ "slack_default_api_key": "TE07RD1V1/B01L7PQK9S8/pbADGhhhj60JSxHRi3K0NoW6",
+ "slack_default_channel": "fdio-infra-monitoring",
+ "slack_default_receiver": "default-slack-receiver",
+ "slack_jenkins_api_key": "TE07RD1V1/B01LPL8KM0F/JZQh0gF5U5SoNYnedBknzdHz",
+ "slack_jenkins_channel": "fdio-jobs-monitoring",
+ "slack_jenkins_receiver": "jenkins-slack-receiver",
"url": "https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz",
"use_canary": "true",
"use_vault_provider": "false",
@@ -51,20 +54,20 @@
"schema_version": 0,
"attributes": {
"allocation_ids": [
- "bf013613-1ac7-6155-69e0-51ff57719549"
+ "c5e80ff9-5f53-8022-0179-5b628bba986f"
],
"datacenters": [
"yul1"
],
- "deployment_id": "36a8a7ba-bd0c-bce9-bcc1-284e352dfd32",
+ "deployment_id": "84b19f76-906d-acdb-a7e9-041a07ad72e1",
"deployment_status": "successful",
"deregister_on_destroy": true,
"deregister_on_id_change": true,
"detach": false,
"id": "prod-alertmanager",
- "jobspec": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname', 'cluster', 'service']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}",
+ "jobspec": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-slack-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n - match_re:\n alertname: JenkinsJob.*\n receiver: jenkins-slack-receiver\n routes:\n - match:\n severity: critical\n receiver: 'jenkins-slack-receiver'\n\n - match_re:\n service: .*\n receiver: default-slack-receiver\n routes:\n - match:\n severity: critical\n receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/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'\n channel: '#fdio-jobs-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\n\n- name: 'default-slack-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/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'\n channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}",
"json": null,
- "modify_index": "7150816",
+ "modify_index": "7224418",
"name": "prod-alertmanager",
"namespace": "default",
"policy_override": null,
@@ -97,111 +100,6 @@
]
},
{
- "module": "module.exporter",
- "mode": "data",
- "type": "template_file",
- "name": "nomad_job_exporter",
- "provider": "provider[\"registry.terraform.io/hashicorp/template\"]",
- "instances": [
- {
- "schema_version": 0,
- "attributes": {
- "filename": null,
- "id": "04a453cfc12a27f7b6d323a51d4cde3c76479feb5ccbfdfc2b43f6d4556e4f63",
- "rendered": "job \"prod-exporter\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task3-cadvisorexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"gcr.io/cadvisor/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task3-cadvisorexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n}",
- "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task3-${cadvisor_service_name}-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"${cadvisor_image}\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${cadvisor_service_name}\"\n port = \"${cadvisor_service_name}\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${cadvisor_service_name}\" {\n static = ${cadvisor_port}\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task3-${cadvisor_service_name}-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${cadvisor_service_name}\"\n port = \"${cadvisor_service_name}\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${cadvisor_service_name}\" {\n static = ${cadvisor_port}\n }\n }\n }\n }\n }\n}",
- "vars": {
- "blackbox_port": "9115",
- "blackbox_service_name": "blackboxexporter",
- "blackbox_url_amd64": "https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-amd64.tar.gz",
- "blackbox_url_arm64": "https://github.com/prometheus/blackbox_exporter/releases/download/v0.18.0/blackbox_exporter-0.18.0.linux-arm64.tar.gz",
- "blackbox_version": "0.18.0",
- "cadvisor_image": "gcr.io/cadvisor/cadvisor:latest",
- "cadvisor_port": "8080",
- "cadvisor_service_name": "cadvisorexporter",
- "datacenters": "yul1",
- "job_name": "prod-exporter",
- "node_port": "9100",
- "node_service_name": "nodeexporter",
- "node_url_amd64": "https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-amd64.tar.gz",
- "node_url_arm64": "https://github.com/prometheus/node_exporter/releases/download/v1.0.1/node_exporter-1.0.1.linux-arm64.tar.gz",
- "node_version": "1.0.1",
- "use_canary": "false"
- }
- },
- "sensitive_attributes": []
- }
- ]
- },
- {
- "module": "module.exporter",
- "mode": "managed",
- "type": "nomad_job",
- "name": "nomad_job_exporter",
- "provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
- "instances": [
- {
- "schema_version": 0,
- "attributes": {
- "allocation_ids": null,
- "datacenters": [
- "yul1"
- ],
- "deployment_id": "",
- "deployment_status": "",
- "deregister_on_destroy": true,
- "deregister_on_id_change": true,
- "detach": false,
- "id": "prod-exporter",
- "jobspec": "job \"prod-exporter\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"system\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-exporter-amd64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task3-cadvisorexporter-amd64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"gcr.io/cadvisor/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n\n group \"prod-group1-exporter-arm64\" {\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"==\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task3-cadvisorexporter-arm64\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n # There is currently no official release for arm yet...using community.\n image = \"zcube/cadvisor:latest\"\n volumes = [\n \"/:/rootfs:ro\",\n \"/var/run:/var/run:rw\",\n \"/sys:/sys:ro\",\n \"/var/lib/docker/:/var/lib/docker:ro\",\n \"/cgroup:/cgroup:ro\"\n ]\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"cadvisorexporter\"\n port = \"cadvisorexporter\"\n check {\n name = \"cAdvisor Check Live\"\n type = \"http\"\n path = \"/metrics\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 500\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"cadvisorexporter\" {\n static = 8080\n }\n }\n }\n }\n }\n}",
- "json": null,
- "modify_index": "7201184",
- "name": "prod-exporter",
- "namespace": "default",
- "policy_override": null,
- "purge_on_destroy": null,
- "region": "global",
- "task_groups": [
- {
- "count": 1,
- "meta": {},
- "name": "prod-group1-exporter-amd64",
- "task": [
- {
- "driver": "docker",
- "meta": {},
- "name": "prod-task3-cadvisorexporter-amd64",
- "volume_mounts": []
- }
- ],
- "volumes": []
- },
- {
- "count": 1,
- "meta": {},
- "name": "prod-group1-exporter-arm64",
- "task": [
- {
- "driver": "docker",
- "meta": {},
- "name": "prod-task3-cadvisorexporter-arm64",
- "volume_mounts": []
- }
- ],
- "volumes": []
- }
- ],
- "type": "system"
- },
- "sensitive_attributes": [],
- "private": "bnVsbA==",
- "dependencies": [
- "module.exporter.data.template_file.nomad_job_exporter"
- ]
- }
- ]
- },
- {
"module": "module.grafana",
"mode": "data",
"type": "template_file",
@@ -369,10 +267,11 @@
"provider": "provider[\"registry.terraform.io/hashicorp/nomad\"].yul1",
"instances": [
{
- "status": "tainted",
"schema_version": 0,
"attributes": {
- "allocation_ids": null,
+ "allocation_ids": [
+ "631050b2-0973-35f9-30ab-1f80b2b56b28"
+ ],
"datacenters": [
"yul1"
],
@@ -384,12 +283,12 @@
"id": "prod-mc",
"jobspec": "job \"prod-mc\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"batch\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-mc\" {\n task \"prod-task1-create-buckets\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n entrypoint = [\n \"/bin/sh\",\n \"-c\",\n \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n ]\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n \n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n \n \n }\n }\n }\n}\n",
"json": null,
- "modify_index": "7201202",
+ "modify_index": "7254273",
"name": "prod-mc",
"namespace": "default",
"policy_override": null,
"purge_on_destroy": null,
- "region": null,
+ "region": "global",
"task_groups": [
{
"count": 1,
@@ -580,9 +479,9 @@
"schema_version": 0,
"attributes": {
"filename": null,
- "id": "f5c5b3316512492fa4e1caea185bafeeeb51b619e76b08ab473ca9523c2b8268",
- "rendered": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}",
- "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n %{ if use_host_volume }\n volume \"prod-volume1-${service_name}\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-${service_name}\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-${version}.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=${data_dir}prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
+ "id": "4a4840cd7c13ce07d6bf0743f56a523eabde46a27c583d17865967a03dbf717a",
+ "rendered": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n rules:\n - alert: JenkinsJobHealthExporterFailures\n expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Jenkins Job Health detected high failure rate on VPP jobs.\"\n description: \"Job: {{ $labels.id }}\"\n - alert: JenkinsJobHealthExporterUnstable\n expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Jenkins Job Health detected high unstable rate on VPP jobs.\"\n description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Jenkins Job Health Exporter'\n static_configs:\n - targets: [ '10.30.51.32:9186' ]\n metric_relabel_configs:\n - source_labels: [ __name__ ]\n regex: '^(vpp.*)_(success|failure|total|unstable|reqtime_ms)$'\n action: replace\n replacement: '$1'\n target_label: id\n - source_labels: [ __name__ ]\n regex: '^(vpp.*)_(success|failure|total|unstable|reqtime_ms)$'\n replacement: 'jenkins_job_$2'\n target_label: __name__\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}",
+ "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n %{ if use_host_volume }\n volume \"prod-volume1-${service_name}\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-${service_name}\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-${version}.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=${data_dir}prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n rules:\n - alert: JenkinsJobHealthExporterFailures\n expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Jenkins Job Health detected high failure rate on VPP jobs.\"\n description: \"Job: {{ $labels.id }}\"\n - alert: JenkinsJobHealthExporterUnstable\n expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Jenkins Job Health detected high unstable rate on VPP jobs.\"\n description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Jenkins Job Health Exporter'\n static_configs:\n - targets: [ '10.30.51.32:9186' ]\n metric_relabel_configs:\n - source_labels: [ __name__ ]\n regex: '^(vpp.*)_(success|failure|total|unstable|reqtime_ms)$'\n action: replace\n replacement: '$1'\n target_label: id\n - source_labels: [ __name__ ]\n regex: '^(vpp.*)_(success|failure|total|unstable|reqtime_ms)$'\n replacement: 'jenkins_job_$2'\n target_label: __name__\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}",
"vars": {
"cpu": "2000",
"data_dir": "/data/",
@@ -614,19 +513,29 @@
{
"schema_version": 0,
"attributes": {
- "allocation_ids": null,
+ "allocation_ids": [
+ "7722e831-d956-55a4-c375-f0e33485d234",
+ "b1edacab-e57f-e014-4d4c-50e563c81ab7",
+ "2ee67e9b-5bc1-ca2b-c3c4-0509e02ee954",
+ "c9c55369-2e27-8087-b156-f1923a57e1a0",
+ "922f75f9-39c1-c232-9ef6-81d68a03c719",
+ "6e23349f-9644-a20b-5c0e-db9d79733b3d",
+ "0337957a-0d04-f4b6-4dfc-3b828a731498",
+ "2705a9d9-913a-f2db-2535-abd85e9a5e6b",
+ "57b9c1f1-0dbe-321e-5cce-861f766c5ee3"
+ ],
"datacenters": [
"yul1"
],
- "deployment_id": "4cbf8370-f2d7-16c2-d500-f6495d43537d",
+ "deployment_id": "02342e30-fde4-3402-b201-ef649d404a95",
"deployment_status": "successful",
"deregister_on_destroy": true,
"deregister_on_id_change": true,
"detach": false,
"id": "prod-prometheus",
- "jobspec": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}",
+ "jobspec": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n rules:\n - alert: JenkinsJobHealthExporterFailures\n expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Jenkins Job Health detected high failure rate on VPP jobs.\"\n description: \"Job: {{ $labels.id }}\"\n - alert: JenkinsJobHealthExporterUnstable\n expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Jenkins Job Health detected high unstable rate on VPP jobs.\"\n description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Jenkins Job Health Exporter'\n static_configs:\n - targets: [ '10.30.51.32:9186' ]\n metric_relabel_configs:\n - source_labels: [ __name__ ]\n regex: '^(vpp.*)_(success|failure|total|unstable|reqtime_ms)$'\n action: replace\n replacement: '$1'\n target_label: id\n - source_labels: [ __name__ ]\n regex: '^(vpp.*)_(success|failure|total|unstable|reqtime_ms)$'\n replacement: 'jenkins_job_$2'\n target_label: __name__\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}",
"json": null,
- "modify_index": "7201194",
+ "modify_index": "7254268",
"name": "prod-prometheus",
"namespace": "default",
"policy_override": null,
@@ -708,10 +617,9 @@
"schema_version": 0,
"attributes": {
"allocation_ids": [
+ "a51e95c4-8ff0-47d9-6f88-f475942141bc",
"0239788e-a5eb-b54d-0cf2-2404b3ec7c53",
- "ecfcd9ad-b959-0fa4-c1ec-8b82195830f1",
- "190f5699-0d10-7f85-ac68-96927702070b",
- "2fbbef45-f00a-1367-08c4-c2239de9e78d"
+ "190f5699-0d10-7f85-ac68-96927702070b"
],
"datacenters": [
"yul1"