diff options
author | pmikus <pmikus@cisco.com> | 2021-02-05 14:51:43 +0000 |
---|---|---|
committer | Peter Mikus <pmikus@cisco.com> | 2021-02-10 09:04:46 +0000 |
commit | 0017c9d8372ef306ac73aae22bb0d17631c944d2 (patch) | |
tree | d24d4ff9ee33b4a31cdddfba89d2ae9a4b2e0fdd /terraform-ci-infra/1n_nmd/terraform.tfstate | |
parent | 60b531215d36e2402b1b6c768bd4fd4d4b210fd0 (diff) |
Infra: JenkinsJobHealthExporter
- Integration of Jenkins Job checker
Signed-off-by: pmikus <pmikus@cisco.com>
Change-Id: I822039cb64a3a352b49314ddab7c6099af3fe644
Diffstat (limited to 'terraform-ci-infra/1n_nmd/terraform.tfstate')
-rw-r--r-- | terraform-ci-infra/1n_nmd/terraform.tfstate | 58 |
1 files changed, 31 insertions, 27 deletions
diff --git a/terraform-ci-infra/1n_nmd/terraform.tfstate b/terraform-ci-infra/1n_nmd/terraform.tfstate index d60f10206f..576e2aac58 100644 --- a/terraform-ci-infra/1n_nmd/terraform.tfstate +++ b/terraform-ci-infra/1n_nmd/terraform.tfstate @@ -1,7 +1,7 @@ { "version": 4, "terraform_version": "0.14.5", - "serial": 1042, + "serial": 1112, "lineage": "e4e7f30a-652d-7a31-e31c-5e3a3388c9b9", "outputs": {}, "resources": [ @@ -16,20 +16,23 @@ "schema_version": 0, "attributes": { "filename": null, - "id": "1ea874db11980359d80aada65912759b6acd9a7399f011d823a72eb7c5b4a329", - "rendered": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}", - "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/${slack_api_key}'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: '${default_receiver}'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: ${default_receiver}\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: '${default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: '${default_receiver}'\n slack_configs:\n - channel: '#${slack_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", + "id": "9078b0e92799cb1fa8e6a6a0f7c9950fd4c0b563b3e98fd4168ebd9c770ba033", + "rendered": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-slack-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n - match_re:\n alertname: JenkinsJob.*\n receiver: jenkins-slack-receiver\n routes:\n - match:\n severity: critical\n receiver: 'jenkins-slack-receiver'\n\n - match_re:\n service: .*\n receiver: default-slack-receiver\n routes:\n - match:\n severity: critical\n receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/'\n channel: '#fdio-jobs-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\n\n- name: 'default-slack-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/'\n channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}", + "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-${version}.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: '${slack_default_receiver}'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n - match_re:\n alertname: JenkinsJob.*\n receiver: ${slack_jenkins_receiver}\n routes:\n - match:\n severity: critical\n receiver: '${slack_jenkins_receiver}'\n\n - match_re:\n service: .*\n receiver: ${slack_default_receiver}\n routes:\n - match:\n severity: critical\n receiver: '${slack_default_receiver}'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n equal: ['alertname', 'instance']\n\nreceivers:\n- name: '${slack_jenkins_receiver}'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/${slack_jenkins_api_key}'\n channel: '#${slack_jenkins_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\n\n- name: '${slack_default_receiver}'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/${slack_default_api_key}'\n channel: '#${slack_default_channel}'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", "vars": { "cpu": "1000", "datacenters": "yul1", - "default_receiver": "default-receiver", "group_count": "1", "job_name": "prod-alertmanager", "mem": "1024", "port": "9093", "service_name": "alertmanager", - "slack_api_key": "XX", - "slack_channel": "fdio-infra-monitoring", + "slack_default_api_key": "TE07RD1V1/B01L7PQK9S8/pbADGhhhj60JSxHRi3K0NoW6", + "slack_default_channel": "fdio-infra-monitoring", + "slack_default_receiver": "default-slack-receiver", + "slack_jenkins_api_key": "TE07RD1V1/B01LPL8KM0F/JZQh0gF5U5SoNYnedBknzdHz", + "slack_jenkins_channel": "fdio-jobs-monitoring", + "slack_jenkins_receiver": "jenkins-slack-receiver", "url": "https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz", "use_canary": "true", "use_vault_provider": "false", @@ -51,21 +54,20 @@ "schema_version": 0, "attributes": { "allocation_ids": [ - "e0aa0e03-a425-2ec8-06ad-ccf89f0f4ee3", - "bf013613-1ac7-6155-69e0-51ff57719549" + "c5e80ff9-5f53-8022-0179-5b628bba986f" ], "datacenters": [ "yul1" ], - "deployment_id": "d132f4bd-ef31-5406-97e6-8831ad4a3db5", + "deployment_id": "84b19f76-906d-acdb-a7e9-041a07ad72e1", "deployment_status": "successful", "deregister_on_destroy": true, "deregister_on_id_change": true, "detach": false, "id": "prod-alertmanager", - "jobspec": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\nglobal:\n # The API URL to use for Slack notifications.\n slack_api_url: 'https://hooks.slack.com/services/XX'\n\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n # This routes performs a regular expression match on alert labels to\n # catch alerts that are related to a list of services.\n - match_re:\n service: .*\n receiver: default-receiver\n # The service has a sub-route for critical alerts, any alerts\n # that do not match, i.e. severity != critical, fall-back to the\n # parent node and are sent to 'team-X-mails'\n routes:\n - match:\n severity: critical\n receiver: 'default-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n # Apply inhibition if the alertname is the same.\n # CAUTION:\n # If all label names listed in `equal` are missing\n # from both the source and target alerts,\n # the inhibition rule will apply!\n equal: ['alertname', 'cluster', 'service']\n\nreceivers:\n- name: 'default-receiver'\n slack_configs:\n - channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}", + "jobspec": "job \"prod-alertmanager\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-alertmanager\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 1\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-alertmanager\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/alertmanager-0.21.0.linux-amd64/alertmanager\"\n args = [\n \"--config.file=secrets/alertmanager.yml\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alertmanager.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n# The directory from which notification templates are read.\ntemplates:\n- '/etc/alertmanager/template/*.tmpl'\n\n#tls_config:\n# # CA certificate to validate the server certificate with.\n# ca_file: \u003cfilepath\u003e ]\n#\n# # Certificate and key files for client cert authentication to the server.\n# cert_file: \u003cfilepath\u003e\n# key_file: \u003cfilepath\u003e\n#\n# # ServerName extension to indicate the name of the server.\n# # http://tools.ietf.org/html/rfc4366#section-3.1\n# server_name: \u003cstring\u003e\n#\n# # Disable validation of the server certificate.\n# insecure_skip_verify: true\n\n# The root route on which each incoming alert enters.\nroute:\n receiver: 'default-slack-receiver'\n\n # The labels by which incoming alerts are grouped together. For example,\n # multiple alerts coming in for cluster=A and alertname=LatencyHigh would\n # be batched into a single group.\n #\n # To aggregate by all possible labels use '...' as the sole label name.\n # This effectively disables aggregation entirely, passing through all\n # alerts as-is. This is unlikely to be what you want, unless you have\n # a very low alert volume or your upstream notification system performs\n # its own grouping. Example: group_by: [...]\n group_by: ['alertname']\n\n # When a new group of alerts is created by an incoming alert, wait at\n # least 'group_wait' to send the initial notification.\n # This way ensures that you get multiple alerts for the same group that start\n # firing shortly after another are batched together on the first\n # notification.\n group_wait: 30s\n\n # When the first notification was sent, wait 'group_interval' to send a batch\n # of new alerts that started firing for that group.\n group_interval: 5m\n\n # If an alert has successfully been sent, wait 'repeat_interval' to\n # resend them.\n repeat_interval: 3h\n\n # All the above attributes are inherited by all child routes and can\n # overwritten on each.\n # The child route trees.\n routes:\n - match_re:\n alertname: JenkinsJob.*\n receiver: jenkins-slack-receiver\n routes:\n - match:\n severity: critical\n receiver: 'jenkins-slack-receiver'\n\n - match_re:\n service: .*\n receiver: default-slack-receiver\n routes:\n - match:\n severity: critical\n receiver: 'default-slack-receiver'\n\n# Inhibition rules allow to mute a set of alerts given that another alert is\n# firing.\n# We use this to mute any warning-level notifications if the same alert is\n# already critical.\ninhibit_rules:\n- source_match:\n severity: 'critical'\n target_match:\n severity: 'warning'\n equal: ['alertname', 'instance']\n\nreceivers:\n- name: 'jenkins-slack-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/'\n channel: '#fdio-jobs-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\n\n- name: 'default-slack-receiver'\n slack_configs:\n - api_url: 'https://hooks.slack.com/services/'\n channel: '#fdio-infra-monitoring'\n send_resolved: true\n icon_url: https://avatars3.githubusercontent.com/u/3380462\n title: |-\n [{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.alertname }} for {{ .CommonLabels.job }}\n {{- if gt (len .CommonLabels) (len .GroupLabels) -}}\n {{\" \"}}(\n {{- with .CommonLabels.Remove .GroupLabels.Names }}\n {{- range $index, $label := .SortedPairs -}}\n {{ if $index }}, {{ end }}\n {{- $label.Name }}=\"{{ $label.Value -}}\"\n {{- end }}\n {{- end -}}\n )\n {{- end }}\n text: \u003e-\n {{ range .Alerts -}}\n *Alert:* {{ .Annotations.summary }}{{ if .Labels.severity }} - `{{ .Labels.severity }}`{{ end }}\n\n *Description:* {{ .Annotations.description }}\n\n *Details:*\n {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`\n {{ end }}\n {{ end }}\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"alertmanager\"\n port = \"alertmanager\"\n tags = [ \"alertmanager${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Alertmanager Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 1000\n memory = 1024\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"alertmanager\" {\n static = 9093\n }\n }\n }\n }\n }\n}", "json": null, - "modify_index": "7202773", + "modify_index": "7224418", "name": "prod-alertmanager", "namespace": "default", "policy_override": null, @@ -268,7 +270,7 @@ "schema_version": 0, "attributes": { "allocation_ids": [ - "7e9cea8d-a5e5-47db-7f9c-653c59c93928" + "a1bc1a10-95a5-3c33-a46d-849522abe4fe" ], "datacenters": [ "yul1" @@ -281,7 +283,7 @@ "id": "prod-mc", "jobspec": "job \"prod-mc\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers.html\n #\n type = \"batch\"\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group.html\n #\n group \"prod-group1-mc\" {\n task \"prod-task1-create-buckets\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"docker\"\n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n image = \"minio/mc:RELEASE.2020-12-10T01-26-17Z\"\n entrypoint = [\n \"/bin/sh\",\n \"-c\",\n \"mc config host add LOCALMINIO http://storage.service.consul:9000 $MINIO_ACCESS_KEY $MINIO_SECRET_KEY \u0026\u0026 mc mb -p LOCALMINIO/logs.fd.io LOCALMINIO/docs.fd.io ; mc policy set public LOCALMINIO/logs.fd.io mc policy set public LOCALMINIO/docs.fd.io mc ilm add --expiry-days '180' LOCALMINIO/logs.fd.io mc admin user add LOCALMINIO storage Storage1234 mc admin policy set LOCALMINIO writeonly user=storage\"\n ]\n dns_servers = [ \"${attr.unique.network.ip-address}\" ]\n privileged = false\n }\n\n # The env stanza configures a list of environment variables to populate\n # the task's environment before starting.\n env {\n \n MINIO_ACCESS_KEY = \"minio\"\n MINIO_SECRET_KEY = \"minio123\"\n \n \n }\n }\n }\n}\n", "json": null, - "modify_index": "7202809", + "modify_index": "7254549", "name": "prod-mc", "namespace": "default", "policy_override": null, @@ -477,9 +479,9 @@ "schema_version": 0, "attributes": { "filename": null, - "id": "f5c5b3316512492fa4e1caea185bafeeeb51b619e76b08ab473ca9523c2b8268", - "rendered": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}", - "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n %{ if use_host_volume }\n volume \"prod-volume1-${service_name}\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-${service_name}\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-${version}.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=${data_dir}prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", + "id": "f4fe3d282a60afb0ef6713540a32d855d65713707e68cf8636a1afd38521b604", + "rendered": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n rules:\n - alert: JenkinsJobHealthExporterFailures\n expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n description: \"Job: {{ $labels.id }}\"\n - alert: JenkinsJobHealthExporterUnstable\n expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Jenkins Job Health Exporter'\n static_configs:\n - targets: [ '10.30.51.32:9186' ]\n metric_relabel_configs:\n - source_labels: [ __name__ ]\n regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n action: replace\n replacement: '$1'\n target_label: id\n - source_labels: [ __name__ ]\n regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n replacement: 'jenkins_job_$2'\n target_label: __name__\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}", + "template": "job \"${job_name}\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"${datacenters}\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n%{ if use_canary }\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n%{ endif }\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-${service_name}\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = ${group_count}\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n %{ if use_host_volume }\n volume \"prod-volume1-${service_name}\" {\n type = \"host\"\n read_only = false\n source = \"${host_volume}\"\n }\n %{ endif }\n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"$${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-${service_name}\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n %{ if use_host_volume }\n volume_mount {\n volume = \"prod-volume1-${service_name}\"\n destination = \"${data_dir}\"\n read_only = false\n }\n %{ endif }\n\n %{ if use_vault_provider }\n vault {\n policies = \"${vault_kv_policy_name}\"\n }\n %{ endif }\n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-${version}.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=${data_dir}prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"${url}\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n rules:\n - alert: JenkinsJobHealthExporterFailures\n expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n description: \"Job: {{ $labels.id }}\"\n - alert: JenkinsJobHealthExporterUnstable\n expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Jenkins Job Health Exporter'\n static_configs:\n - targets: [ '10.30.51.32:9186' ]\n metric_relabel_configs:\n - source_labels: [ __name__ ]\n regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n action: replace\n replacement: '$1'\n target_label: id\n - source_labels: [ __name__ ]\n regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n replacement: 'jenkins_job_$2'\n target_label: __name__\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"${service_name}\"\n port = \"${service_name}\"\n tags = [ \"${service_name}$${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = ${cpu}\n memory = ${mem}\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"${service_name}\" {\n static = ${port}\n }\n }\n }\n }\n }\n}", "vars": { "cpu": "2000", "data_dir": "/data/", @@ -512,23 +514,26 @@ "schema_version": 0, "attributes": { "allocation_ids": [ - "81af62a3-09d6-5e29-a63a-e13952f9caec", - "1516d874-0001-78b1-f6ec-727067af2f29", - "fcf4df05-3083-e96a-9e8b-0856c0fed8f9", - "6eaada1d-5bea-7b95-6564-7c0ff31b3053" + "fc9ab3e3-7854-65d5-0c82-28283383976a", + "5e526c5d-f7b5-f275-357c-48a739a96071", + "d816ca29-4cf4-f9be-4d48-edbddac472ae", + "794f7531-14f5-c1c3-9bf3-88936ee335e3", + "b1edacab-e57f-e014-4d4c-50e563c81ab7", + "2ee67e9b-5bc1-ca2b-c3c4-0509e02ee954", + "922f75f9-39c1-c232-9ef6-81d68a03c719" ], "datacenters": [ "yul1" ], - "deployment_id": "4cbf8370-f2d7-16c2-d500-f6495d43537d", + "deployment_id": "0a36a612-38de-5e1c-448e-72e29a2fb5f3", "deployment_status": "successful", "deregister_on_destroy": true, "deregister_on_id_change": true, "detach": false, "id": "prod-prometheus", - "jobspec": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}", + "jobspec": "job \"prod-prometheus\" {\n # The \"region\" parameter specifies the region in which to execute the job.\n # If omitted, this inherits the default region name of \"global\".\n # region = \"global\"\n #\n # The \"datacenters\" parameter specifies the list of datacenters which should\n # be considered when placing this task. This must be provided.\n datacenters = \"yul1\"\n\n # The \"type\" parameter controls the type of job, which impacts the scheduler's\n # decision on placement. This configuration is optional and defaults to\n # \"service\". For a full list of job types and their differences, please see\n # the online documentation.\n #\n # For more information, please see the online documentation at:\n #\n # https://www.nomadproject.io/docs/jobspec/schedulers\n #\n type = \"service\"\n\n update {\n # The \"max_parallel\" parameter specifies the maximum number of updates to\n # perform in parallel. In this case, this specifies to update a single task\n # at a time.\n max_parallel = 1\n\n health_check = \"checks\"\n\n # The \"min_healthy_time\" parameter specifies the minimum time the allocation\n # must be in the healthy state before it is marked as healthy and unblocks\n # further allocations from being updated.\n min_healthy_time = \"10s\"\n\n # The \"healthy_deadline\" parameter specifies the deadline in which the\n # allocation must be marked as healthy after which the allocation is\n # automatically transitioned to unhealthy. Transitioning to unhealthy will\n # fail the deployment and potentially roll back the job if \"auto_revert\" is\n # set to true.\n healthy_deadline = \"3m\"\n\n # The \"progress_deadline\" parameter specifies the deadline in which an\n # allocation must be marked as healthy. The deadline begins when the first\n # allocation for the deployment is created and is reset whenever an allocation\n # as part of the deployment transitions to a healthy state. If no allocation\n # transitions to the healthy state before the progress deadline, the\n # deployment is marked as failed.\n progress_deadline = \"10m\"\n\n\n # The \"canary\" parameter specifies that changes to the job that would result\n # in destructive updates should create the specified number of canaries\n # without stopping any previous allocations. Once the operator determines the\n # canaries are healthy, they can be promoted which unblocks a rolling update\n # of the remaining allocations at a rate of \"max_parallel\".\n #\n # Further, setting \"canary\" equal to the count of the task group allows\n # blue/green deployments. When the job is updated, a full set of the new\n # version is deployed and upon promotion the old version is stopped.\n canary = 1\n\n # Specifies if the job should auto-promote to the canary version when all\n # canaries become healthy during a deployment. Defaults to false which means\n # canaries must be manually updated with the nomad deployment promote\n # command.\n auto_promote = true\n\n # The \"auto_revert\" parameter specifies if the job should auto-revert to the\n # last stable job on deployment failure. A job is marked as stable if all the\n # allocations as part of its deployment were marked healthy.\n auto_revert = true\n\n }\n\n # The \"group\" stanza defines a series of tasks that should be co-located on\n # the same Nomad client. Any task within a group will be placed on the same\n # client.\n #\n # For more information and examples on the \"group\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/group\n #\n group \"prod-group1-prometheus\" {\n # The \"count\" parameter specifies the number of the task groups that should\n # be running under this group. This value must be non-negative and defaults\n # to 1.\n count = 4\n\n # The volume stanza allows the group to specify that it requires a given\n # volume from the cluster.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/volume\n #\n \n volume \"prod-volume1-prometheus\" {\n type = \"host\"\n read_only = false\n source = \"prod-volume-data1-1\"\n }\n \n\n # The constraint allows restricting the set of eligible nodes. Constraints\n # may filter on attributes or client metadata.\n #\n # For more information and examples on the \"volume\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/constraint\n #\n constraint {\n attribute = \"${attr.cpu.arch}\"\n operator = \"!=\"\n value = \"arm64\"\n }\n\n # The \"task\" stanza creates an individual unit of work, such as a Docker\n # container, web application, or batch processing.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/task\n #\n task \"prod-task1-prometheus\" {\n # The \"driver\" parameter specifies the task driver that should be used to\n # run the task.\n driver = \"exec\"\n\n \n volume_mount {\n volume = \"prod-volume1-prometheus\"\n destination = \"/data/\"\n read_only = false\n }\n \n\n \n\n # The \"config\" stanza specifies the driver configuration, which is passed\n # directly to the driver to start the task. The details of configurations\n # are specific to each driver, so please see specific driver\n # documentation for more information.\n config {\n command = \"local/prometheus-2.24.0.linux-amd64/prometheus\"\n args = [\n \"--config.file=secrets/prometheus.yml\",\n \"--storage.tsdb.path=/data/prometheus/\",\n \"--storage.tsdb.retention.time=15d\"\n ]\n }\n\n # The artifact stanza instructs Nomad to fetch and unpack a remote resource,\n # such as a file, tarball, or binary. Nomad downloads artifacts using the\n # popular go-getter library, which permits downloading artifacts from a\n # variety of locations using a URL as the input source.\n #\n # For more information and examples on the \"artifact\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/artifact\n #\n artifact {\n source = \"https://github.com/prometheus/prometheus/releases/download/v2.24.0/prometheus-2.24.0.linux-amd64.tar.gz\"\n }\n\n # The \"template\" stanza instructs Nomad to manage a template, such as\n # a configuration file or script. This template can optionally pull data\n # from Consul or Vault to populate runtime configuration data.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/template\n #\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/alerts.yml\"\n left_delimiter = \"{{{\"\n right_delimiter = \"}}}\"\n data = \u003c\u003cEOH\n---\ngroups:\n- name: \"Jenkins Job Health Exporter\"\n rules:\n - alert: JenkinsJobHealthExporterFailures\n expr: jenkins_job_failure{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Jenkins Job Health detected high failure rate on jenkins jobs.\"\n description: \"Job: {{ $labels.id }}\"\n - alert: JenkinsJobHealthExporterUnstable\n expr: jenkins_job_unstable{id=~\".*\"} \u003e= 10\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Jenkins Job Health detected high unstable rate on jenkins jobs.\"\n description: \"Job: {{ $labels.id }}\"\n- name: \"Consul\"\n rules:\n - alert: ConsulServiceHealthcheckFailed\n expr: consul_catalog_service_node_healthy == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul service healthcheck failed (instance {{ $labels.instance }}).\"\n description: \"Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`.\"\n - alert: ConsulMissingMasterNode\n expr: consul_raft_peers \u003c 3\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul missing master node (instance {{ $labels.instance }}).\"\n description: \"Numbers of consul raft peers should be 3, in order to preserve quorum.\"\n - alert: ConsulAgentUnhealthy\n expr: consul_health_node_status{status=\"critical\"} == 1\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Consul agent unhealthy (instance {{ $labels.instance }}).\"\n description: \"A Consul agent is down.\"\n- name: \"Hosts\"\n rules:\n - alert: NodeDown\n expr: up == 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus target missing (instance {{ $labels.instance }}).\"\n description: \"A Prometheus target has disappeared. An exporter might be crashed.\"\n - alert: HostHighCpuLoad\n expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100) \u003e 95\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host high CPU load (instance {{ $labels.instance }}).\"\n description: \"CPU load is \u003e 95%.\"\n - alert: HostOutOfMemory\n expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of memory (instance {{ $labels.instance }}).\"\n description: \"Node memory is filling up (\u003c 10% left).\"\n - alert: HostOomKillDetected\n expr: increase(node_vmstat_oom_kill[1m]) \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host OOM kill detected (instance {{ $labels.instance }}).\"\n description: \"OOM kill detected.\"\n - alert: HostMemoryUnderMemoryPressure\n expr: rate(node_vmstat_pgmajfault[1m]) \u003e 1000\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host memory under memory pressure (instance {{ $labels.instance }}).\"\n description: \"The node is under heavy memory pressure. High rate of major page faults.\"\n - alert: HostOutOfDiskSpace\n expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes \u003c 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host out of disk space (instance {{ $labels.instance }}).\"\n description: \"Disk is almost full (\u003c 10% left).\"\n - alert: HostRaidDiskFailure\n expr: node_md_disks{state=\"failed\"} \u003e 0\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Host RAID disk failure (instance {{ $labels.instance }}).\"\n description: \"At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap.\"\n - alert: HostConntrackLimit\n expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit \u003e 0.8\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Host conntrack limit (instance {{ $labels.instance }}).\"\n description: \"The number of conntrack is approching limit.\"\n - alert: HostNetworkInterfaceSaturated\n expr: (rate(node_network_receive_bytes_total{device!~\"^tap.*\"}[1m]) + rate(node_network_transmit_bytes_total{device!~\"^tap.*\"}[1m])) / node_network_speed_bytes{device!~\"^tap.*\"} \u003e 0.8\n for: 1m\n labels:\n severity: warning\n annotations:\n summary: \"Host Network Interface Saturated (instance {{ $labels.instance }}).\"\n description: \"The network interface {{ $labels.interface }} on {{ $labels.instance }} is getting overloaded.\"\n - alert: HostSystemdServiceCrashed\n expr: node_systemd_unit_state{state=\"failed\"} == 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host SystemD service crashed (instance {{ $labels.instance }}).\"\n description: \"SystemD service crashed.\"\n - alert: HostEdacCorrectableErrorsDetected\n expr: increase(node_edac_correctable_errors_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: info\n annotations:\n summary: \"Host EDAC Correctable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.'\n - alert: HostEdacUncorrectableErrorsDetected\n expr: node_edac_uncorrectable_errors_total \u003e 0\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}).\"\n description: '{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.'\n- name: \"Min.io\"\n rules:\n - alert: MinioDiskOffline\n expr: minio_offline_disks \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Minio disk offline (instance {{ $labels.instance }})\"\n description: \"Minio disk is offline.\"\n - alert: MinioStorageSpaceExhausted\n expr: minio_disk_storage_free_bytes / 1024 / 1024 / 1024 \u003c 10\n for: 2m\n labels:\n severity: warning\n annotations:\n summary: \"Minio storage space exhausted (instance {{ $labels.instance }}).\"\n description: \"Minio storage space is low (\u003c 10 GB).\"\n- name: \"Prometheus\"\n rules:\n - alert: PrometheusConfigurationReloadFailure\n expr: prometheus_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"Prometheus configuration reload error.\"\n - alert: PrometheusTooManyRestarts\n expr: changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager\"}[15m]) \u003e 2\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus too many restarts (instance {{ $labels.instance }}).\"\n description: \"Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\"\n - alert: PrometheusAlertmanagerConfigurationReloadFailure\n expr: alertmanager_config_last_reload_successful != 1\n for: 0m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}).\"\n description: \"AlertManager configuration reload error.\"\n - alert: PrometheusRuleEvaluationFailures\n expr: increase(prometheus_rule_evaluation_failures_total[3m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus rule evaluation failures (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\"\n - alert: PrometheusTargetScrapingSlow\n expr: prometheus_target_interval_length_seconds{quantile=\"0.9\"} \u003e 60\n for: 5m\n labels:\n severity: warning\n annotations:\n summary: \"Prometheus target scraping slow (instance {{ $labels.instance }}).\"\n description: \"Prometheus is scraping exporters slowly.\"\n - alert: PrometheusTsdbCompactionsFailed\n expr: increase(prometheus_tsdb_compactions_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB compactions failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB compactions failures.\"\n - alert: PrometheusTsdbHeadTruncationsFailed\n expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB head truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB head truncation failures.\"\n - alert: PrometheusTsdbWalCorruptions\n expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL corruptions.\"\n - alert: PrometheusTsdbWalTruncationsFailed\n expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) \u003e 0\n for: 0m\n labels:\n severity: critical\n annotations:\n summary: \"Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}).\"\n description: \"Prometheus encountered {{ $value }} TSDB WAL truncation failures.\"\nEOH\n }\n\n template {\n change_mode = \"noop\"\n change_signal = \"SIGINT\"\n destination = \"secrets/prometheus.yml\"\n data = \u003c\u003cEOH\n---\nglobal:\n scrape_interval: 5s\n scrape_timeout: 5s\n evaluation_interval: 5s\n\nalerting:\n alertmanagers:\n - consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\nrule_files:\n - 'alerts.yml'\n\nscrape_configs:\n\n - job_name: 'Nomad Cluster'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'nomad-client', 'nomad' ]\n relabel_configs:\n - source_labels: [__meta_consul_tags]\n regex: '(.*)http(.*)'\n action: keep\n metrics_path: /v1/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Consul Cluster'\n static_configs:\n - targets: [ '10.30.51.28:8500' ]\n - targets: [ '10.30.51.29:8500' ]\n - targets: [ '10.30.51.30:8500' ]\n - targets: [ '10.30.51.32:8500' ]\n - targets: [ '10.30.51.33:8500' ]\n - targets: [ '10.30.51.34:8500' ]\n - targets: [ '10.30.51.35:8500' ]\n - targets: [ '10.30.51.39:8500' ]\n - targets: [ '10.30.51.40:8500' ]\n - targets: [ '10.30.51.50:8500' ]\n - targets: [ '10.30.51.51:8500' ]\n - targets: [ '10.30.51.65:8500' ]\n - targets: [ '10.30.51.66:8500' ]\n - targets: [ '10.30.51.67:8500' ]\n - targets: [ '10.30.51.68:8500' ]\n - targets: [ '10.30.51.70:8500' ]\n - targets: [ '10.30.51.71:8500' ]\n - targets: [ '10.32.8.14:8500' ]\n - targets: [ '10.32.8.15:8500' ]\n - targets: [ '10.32.8.16:8500' ]\n - targets: [ '10.32.8.17:8500' ]\n metrics_path: /v1/agent/metrics\n params:\n format: [ 'prometheus' ]\n\n - job_name: 'Blackbox Exporter (icmp)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n - targets: [ '10.30.51.32' ]\n params:\n module: [ 'icmp_v4' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'Blackbox Exporter (http)'\n static_configs:\n - targets: [ 'gerrit.fd.io' ]\n - targets: [ 'jenkins.fd.io' ]\n params:\n module: [ 'http_2xx' ]\n relabel_configs:\n - source_labels: [__address__]\n target_label: __param_target\n - source_labels: [__param_target]\n target_label: instance\n - target_label: __address__\n replacement: localhost:9115\n metrics_path: /probe\n\n - job_name: 'cAdvisor Exporter'\n static_configs:\n - targets: [ '10.30.51.28:8080' ]\n - targets: [ '10.30.51.29:8080' ]\n - targets: [ '10.30.51.30:8080' ]\n #- targets: [ '10.30.51.32:8080' ]\n - targets: [ '10.30.51.33:8080' ]\n - targets: [ '10.30.51.34:8080' ]\n - targets: [ '10.30.51.35:8080' ]\n - targets: [ '10.30.51.39:8080' ]\n - targets: [ '10.30.51.40:8080' ]\n - targets: [ '10.30.51.50:8080' ]\n - targets: [ '10.30.51.51:8080' ]\n - targets: [ '10.30.51.65:8080' ]\n - targets: [ '10.30.51.66:8080' ]\n - targets: [ '10.30.51.67:8080' ]\n - targets: [ '10.30.51.68:8080' ]\n - targets: [ '10.30.51.70:8080' ]\n - targets: [ '10.30.51.71:8080' ]\n - targets: [ '10.32.8.14:8080' ]\n - targets: [ '10.32.8.15:8080' ]\n - targets: [ '10.32.8.16:8080' ]\n - targets: [ '10.32.8.17:8080' ]\n\n - job_name: 'Jenkins Job Health Exporter'\n static_configs:\n - targets: [ '10.30.51.32:9186' ]\n metric_relabel_configs:\n - source_labels: [ __name__ ]\n regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n action: replace\n replacement: '$1'\n target_label: id\n - source_labels: [ __name__ ]\n regex: '^(vpp.*|csit.*)_(success|failure|total|unstable|reqtime_ms)$'\n replacement: 'jenkins_job_$2'\n target_label: __name__\n\n - job_name: 'Node Exporter'\n static_configs:\n - targets: [ '10.30.51.28:9100' ]\n - targets: [ '10.30.51.29:9100' ]\n - targets: [ '10.30.51.30:9100' ]\n - targets: [ '10.30.51.32:9100' ]\n - targets: [ '10.30.51.33:9100' ]\n - targets: [ '10.30.51.34:9100' ]\n - targets: [ '10.30.51.35:9100' ]\n - targets: [ '10.30.51.39:9100' ]\n - targets: [ '10.30.51.40:9100' ]\n - targets: [ '10.30.51.50:9100' ]\n - targets: [ '10.30.51.51:9100' ]\n - targets: [ '10.30.51.65:9100' ]\n - targets: [ '10.30.51.66:9100' ]\n - targets: [ '10.30.51.67:9100' ]\n - targets: [ '10.30.51.68:9100' ]\n - targets: [ '10.30.51.70:9100' ]\n - targets: [ '10.30.51.71:9100' ]\n - targets: [ '10.32.8.14:9100' ]\n - targets: [ '10.32.8.15:9100' ]\n - targets: [ '10.32.8.16:9100' ]\n - targets: [ '10.32.8.17:9100' ]\n\n - job_name: 'Alertmanager'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'alertmanager' ]\n\n - job_name: 'Grafana'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'grafana' ]\n\n - job_name: 'Prometheus'\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'prometheus' ]\n\n - job_name: 'Minio'\n bearer_token: eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJleHAiOjQ3NjQ1ODEzMzcsImlzcyI6InByb21ldGhldXMiLCJzdWIiOiJtaW5pbyJ9.oeTw3EIaiFmlDikrHXWiWXMH2vxLfDLkfjEC7G2N3M_keH_xyA_l2ofLLNYtopa_3GCEZnxLQdPuFZrmgpkDWg\n consul_sd_configs:\n - server: '{{ env \"NOMAD_IP_prometheus\" }}:8500'\n services: [ 'storage' ]\n metrics_path: /minio/prometheus/metrics\nEOH\n }\n\n # The service stanza instructs Nomad to register a service with Consul.\n #\n # For more information and examples on the \"task\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/service\n #\n service {\n name = \"prometheus\"\n port = \"prometheus\"\n tags = [ \"prometheus${NOMAD_ALLOC_INDEX}\" ]\n check {\n name = \"Prometheus Check Live\"\n type = \"http\"\n path = \"/-/healthy\"\n interval = \"10s\"\n timeout = \"2s\"\n }\n }\n\n # The \"resources\" stanza describes the requirements a task needs to\n # execute. Resource requirements include memory, network, cpu, and more.\n # This ensures the task will execute on a machine that contains enough\n # resource capacity.\n #\n # For more information and examples on the \"resources\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/resources\n #\n resources {\n cpu = 2000\n memory = 8192\n # The network stanza specifies the networking requirements for the task\n # group, including the network mode and port allocations. When scheduling\n # jobs in Nomad they are provisioned across your fleet of machines along\n # with other jobs and services. Because you don't know in advance what host\n # your job will be provisioned on, Nomad will provide your tasks with\n # network configuration when they start up.\n #\n # For more information and examples on the \"template\" stanza, please see\n # the online documentation at:\n #\n # https://www.nomadproject.io/docs/job-specification/network\n #\n network {\n port \"prometheus\" {\n static = 9090\n }\n }\n }\n }\n }\n}", "json": null, - "modify_index": "7201194", + "modify_index": "7254539", "name": "prod-prometheus", "namespace": "default", "policy_override": null, @@ -610,10 +615,9 @@ "schema_version": 0, "attributes": { "allocation_ids": [ + "a51e95c4-8ff0-47d9-6f88-f475942141bc", "0239788e-a5eb-b54d-0cf2-2404b3ec7c53", - "ecfcd9ad-b959-0fa4-c1ec-8b82195830f1", - "190f5699-0d10-7f85-ac68-96927702070b", - "2fbbef45-f00a-1367-08c4-c2239de9e78d" + "190f5699-0d10-7f85-ac68-96927702070b" ], "datacenters": [ "yul1" |